//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
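
  // For example: a scalar (setcc i32 %a, %b, seteq) produces an i8 that is
  // 0 or 1 in a GPR, while a vector compare such as PCMPEQD produces
  // per-lane masks of all-zeros or all-ones (-1), which is exactly what the
  // two settings above encode.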

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
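
  // Roughly, the bypass inserts a runtime check; e.g. for a 64-bit divide:
  //   if the high halves of both operands are zero (the values fit in 32
  //   bits), use the much cheaper 32-bit DIV;
  //   otherwise, fall through to the full 64-bit DIV.
  // This trades a test+branch for the large latency gap between the two
  // divide widths on the affected subtargets.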

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
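
  // For example, UCOMISS sets ZF for "equal or unordered", so SETOEQ
  // (ordered and equal) must verify both ZF=1 and PF=0, roughly
  // SETE + SETNP + AND; SETUNE is the dual case. Hence the two-condition
  // expansion requested above.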

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
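
  // e.g. (uint_to_fp i16 %x) becomes (sint_to_fp (zext i16 %x to i32)); the
  // zero-extended value is always non-negative, so the signed conversion is
  // exact.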

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.

  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
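
  // For example, IR computing both `sdiv i32 %x, %y` and `srem i32 %x, %y`
  // lowers to two ISD::SDIVREM nodes that CSE into one, which then matches a
  // single IDIV producing the quotient in EAX and the remainder in EDX.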

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
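
  // With these set to Expand, an f16 <-> f32 conversion that has no F16C
  // instruction to match ends up as a runtime-library call (typically the
  // default RTLIB names, e.g. __gnu_h2f_ieee for extension and
  // __gnu_f2h_ieee for truncation on x86).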

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP is NOT intended to support SjLj exception
  // handling; it is a light-weight setjmp/longjmp replacement used to support
  // continuations, user-level threading, etc. As a result, no other SjLj
  // exception interfaces are implemented; please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
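
  // On 32-bit targets an i64 shift such as `shl i64 %x, %n` is split into
  // these *_PARTS nodes and, roughly, lowered to an SHLD/SHL pair plus a
  // test on bit 5 of the shift amount to handle counts of 32 or more.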

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
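
    // The promotion is just a bitcast: e.g. (and v16i8 %a, %b) becomes
    // (bitcast (and (bitcast %a to v2i64), (bitcast %b to v2i64)) to v16i8),
    // and either form selects to a single PAND.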

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }
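
    // e.g. a zero-extending load of four bytes into v4i32 now selects
    // directly to `pmovzxbd (mem), %xmm0` instead of a scalar load plus
    // shuffles.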

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);

    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }

    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These instructions are also available on SSE/AVX2; add the
      // relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }
1330 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331 setOperationAction(ISD::SMAX, VT, Legal);
1332 setOperationAction(ISD::UMAX, VT, Legal);
1333 setOperationAction(ISD::SMIN, VT, Legal);
1334 setOperationAction(ISD::UMIN, VT, Legal);
1335 setOperationAction(ISD::ABS, VT, Legal);
1336 setOperationAction(ISD::SRL, VT, Custom);
1337 setOperationAction(ISD::SHL, VT, Custom);
1338 setOperationAction(ISD::SRA, VT, Custom);
1339 setOperationAction(ISD::CTPOP, VT, Custom);
1340 setOperationAction(ISD::CTTZ, VT, Custom);
1343 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1344 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1346 setOperationAction(ISD::ROTL, VT, Custom);
1347 setOperationAction(ISD::ROTR, VT, Custom);
1350 // Need to promote to 64-bit even though we have 32-bit masked instructions
1351 // because the IR optimizers rearrange bitcasts around logic ops leaving
1352 // too many variations to handle if we don't promote them.
1353 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1354 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1355 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
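// Illustration: after this promotion, a v16i32 AND such as (and x, y) is
// legalized roughly as
//   (v16i32 (bitcast (and (v8i64 (bitcast x)), (v8i64 (bitcast y)))))
// so one 64-bit-element pattern covers whatever bitcast placement the IR
// optimizers happened to produce.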
if (Subtarget.hasCDI()) {
  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
  for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
                  MVT::v4i64, MVT::v8i64}) {
    setOperationAction(ISD::CTLZ, VT, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
  }
} // Subtarget.hasCDI()

if (Subtarget.hasDQI()) {
  // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
  setOperationAction(ISD::MUL, MVT::v2i64, Legal);
  setOperationAction(ISD::MUL, MVT::v4i64, Legal);
  setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}

if (Subtarget.hasVPOPCNTDQ()) {
  // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
  // version of popcntd/q.
  for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
                  MVT::v4i32, MVT::v2i64})
    setOperationAction(ISD::CTPOP, VT, Legal);
}
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
  setOperationAction(ISD::MGATHER, VT, Custom);
  setOperationAction(ISD::MSCATTER, VT, Custom);
}

// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
// 128-bit was made Custom under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                 MVT::v8f32, MVT::v4f64, MVT::v1i1 })
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
                 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
  setOperationAction(ISD::VSELECT, VT, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
  setOperationAction(ISD::MLOAD, VT, Legal);
  setOperationAction(ISD::MSTORE, VT, Legal);
  setOperationAction(ISD::MGATHER, VT, Legal);
  setOperationAction(ISD::MSCATTER, VT, Custom);
}

for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
  setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
  setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
}
}
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
  addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
  addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

  addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
  addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

  setOperationAction(ISD::ADD, MVT::v32i1, Custom);
  setOperationAction(ISD::ADD, MVT::v64i1, Custom);
  setOperationAction(ISD::SUB, MVT::v32i1, Custom);
  setOperationAction(ISD::SUB, MVT::v64i1, Custom);
  setOperationAction(ISD::MUL, MVT::v32i1, Custom);
  setOperationAction(ISD::MUL, MVT::v64i1, Custom);

  setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
  setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
  setOperationAction(ISD::MUL, MVT::v32i16, Legal);
  setOperationAction(ISD::MUL, MVT::v64i8, Custom);
  setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
  setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
  setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
  setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
  setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
  setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
  setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
  setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
  setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
  setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

  setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
  setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
  if (Subtarget.hasVLX()) {
    setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
  }

  LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
  for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
    setOperationAction(ISD::MLOAD, VT, Action);
    setOperationAction(ISD::MSTORE, VT, Action);
  }

  if (Subtarget.hasCDI()) {
    setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
  }

  for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Custom);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SRL, VT, Custom);
    setOperationAction(ISD::SHL, VT, Custom);
    setOperationAction(ISD::SRA, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Legal);
    setOperationAction(ISD::MSTORE, VT, Legal);
    setOperationAction(ISD::CTPOP, VT, Custom);
    setOperationAction(ISD::CTTZ, VT, Custom);
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);

    setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
    setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
    setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
  }

  for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
    setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
    if (Subtarget.hasVLX()) {
      // FIXME: These instructions are available on SSE/AVX2; add the relevant
      // patterns.
      setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
    }
  }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
  addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
  addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

  for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
    setOperationAction(ISD::ADD, VT, Custom);
    setOperationAction(ISD::SUB, VT, Custom);
    setOperationAction(ISD::MUL, VT, Custom);
    setOperationAction(ISD::VSELECT, VT, Expand);

    setOperationAction(ISD::TRUNCATE, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

  for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
    setOperationAction(ISD::SMAX, VT, Legal);
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::SMIN, VT, Legal);
    setOperationAction(ISD::UMIN, VT, Legal);
  }
}
// We want to custom lower some of our intrinsics.
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
if (!Subtarget.is64Bit()) {
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
}
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
// FIXME: We really should do custom legalization for addition and
// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
// than generic legalization for 64-bit multiplication-with-overflow, though.
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
  if (VT == MVT::i64 && !Subtarget.is64Bit())
    continue;
  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, VT, Custom);
  setOperationAction(ISD::UADDO, VT, Custom);
  setOperationAction(ISD::SSUBO, VT, Custom);
  setOperationAction(ISD::USUBO, VT, Custom);
  setOperationAction(ISD::SMULO, VT, Custom);
  setOperationAction(ISD::UMULO, VT, Custom);

  // Support carry in as value rather than glue.
  setOperationAction(ISD::ADDCARRY, VT, Custom);
  setOperationAction(ISD::SUBCARRY, VT, Custom);
  setOperationAction(ISD::SETCCCARRY, VT, Custom);
}
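// Sketch of the effect: a wide addition such as i128 add on x86-64 is split
// by type legalization into UADDO feeding ADDCARRY, which map naturally onto
// ADD (setting CF) followed by ADC, with the carry travelling as an ordinary
// value instead of glue.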
if (!Subtarget.is64Bit()) {
  // These libcalls are not available in 32-bit.
  setLibcallName(RTLIB::SHL_I128, nullptr);
  setLibcallName(RTLIB::SRL_I128, nullptr);
  setLibcallName(RTLIB::SRA_I128, nullptr);
}

// Combine sin / cos into one node or libcall if possible.
if (Subtarget.hasSinCos()) {
  setLibcallName(RTLIB::SINCOS_F32, "sincosf");
  setLibcallName(RTLIB::SINCOS_F64, "sincos");
  if (Subtarget.isTargetDarwin()) {
    // For MacOSX, we don't want the normal expansion of a libcall to sincos.
    // We want to issue a libcall to __sincos_stret to avoid memory traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  }
}
if (Subtarget.isTargetWin64()) {
  setOperationAction(ISD::SDIV, MVT::i128, Custom);
  setOperationAction(ISD::UDIV, MVT::i128, Custom);
  setOperationAction(ISD::SREM, MVT::i128, Custom);
  setOperationAction(ISD::UREM, MVT::i128, Custom);
  setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
  setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
}

// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
// is. We should promote the value to 64-bits to solve this.
// This is what the CRT headers do - `fmodf` is an inline header
// function casting to f64 and calling `fmod`.
if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                            Subtarget.isTargetWindowsItanium()))
  for (ISD::NodeType Op :
       {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
        ISD::FLOG10, ISD::FPOW, ISD::FSIN})
    if (isOperationExpand(Op, MVT::f32))
      setOperationAction(Op, MVT::f32, Promote);
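// In effect, a 32-bit MSVC call like fmodf(x, y) ends up emitted as
// (float)fmod((double)x, (double)y), mirroring the CRT's inline wrapper.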
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MLOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::MSTORE);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
computeRegisterProperties(Subtarget.getRegisterInfo());

MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
MaxStoresPerMemmoveOptSize = 4;

// TODO: These control memcmp expansion in CGP and could be raised higher, but
// that needs to be benchmarked and balanced with the potential use of vector
// load/store types (PR33329, PR33914).
MaxLoadsPerMemcmp = 2;
MaxLoadsPerMemcmpOptSize = 2;

// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

// An out-of-order CPU can speculatively execute past a predictable branch,
// but a conditional move could be stalled by an expensive earlier operation.
PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
EnableExtLdPromotion = true;
setPrefFunctionAlignment(4); // 2^4 bytes.

verifyIntrinsicTables();
}
// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
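// Note: with -x86-experimental-vector-widening-legalization, an illegal type
// such as v3i32 is widened to v4i32 below, instead of having each element
// promoted to a wider integer.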
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
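// The next hook picks the SETCC result type: e.g. a v16f32 compare yields a
// v16i1 mask with AVX512, while targets without mask registers fall back to
// an element-wide integer vector such as v4i32 for a v4f32 compare.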
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (VT.isSimple()) {
    MVT VVT = VT.getSimpleVT();
    const unsigned NumElts = VVT.getVectorNumElements();
    MVT EltVT = VVT.getVectorElementType();
    if (VVT.is512BitVector()) {
      if (Subtarget.hasAVX512())
        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
            EltVT == MVT::f32 || EltVT == MVT::f64)
          switch (NumElts) {
          case 8: return MVT::v8i1;
          case 16: return MVT::v16i1;
          }
      if (Subtarget.hasBWI())
        if (EltVT == MVT::i8 || EltVT == MVT::i16)
          switch (NumElts) {
          case 32: return MVT::v32i1;
          case 64: return MVT::v64i1;
          }
    }

    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return MVT::getVectorVT(MVT::i1, NumElts);

    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
      EVT LegalVT = getTypeToTransformTo(Context, VT);
      EltVT = LegalVT.getVectorElementType().getSimpleVT();
    }

    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
      switch (NumElts) {
      case 2: return MVT::v2i1;
      case 4: return MVT::v4i1;
      case 8: return MVT::v8i1;
      }
  }

  return VT.changeVectorElementTypeToInteger();
}
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}
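// Illustrative example: for struct { int i; __m128 v; } this recursion
// settles on MaxAlign == 16 because of the 128-bit vector member.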
/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}
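// Illustrative result: a 64-byte memcpy on an AVX target with fast unaligned
// accesses would be expanded as two v32i8 load/store pairs rather than eight
// scalar moves.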
bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}
/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}
void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction()->getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as being passed in registers.
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}
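// E.g. when the module was built with -mregparm=2 (which sets the
// NumRegisterParameters module flag read above), the first two small
// pointer/integer arguments of a generated libcall are marked IsInReg so
// they travel in registers rather than on the stack.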
const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}
/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}
std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}
unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}
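// In the X86 backend address space 256 addresses via %gs and 257 via %fs,
// so user-mode x86-64 uses %fs here while the kernel code model (which keeps
// its per-CPU data in %gs) uses %gs.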
static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
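// For example, SegmentOffset(IRB, 0x28, 257) produces the constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. the address of the %fs:0x28 slot.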
Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    }
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }
  return TargetLowering::getIRStackGuard(IRB);
}
void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}
Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}
Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's
    // %gs:0x48. gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}
bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}
//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}
/// Lowers mask values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16 -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}
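// E.g. a v16i1 mask bound to a 32-bit location comes out as
//   (any_extend i32 (bitcast i16 %mask))
// while v32i1 in an i32 location needs only the bitcast.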
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}
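// E.g. on a 32-bit target a v64i1 mask argument travels as
//   Lo = extract_element (bitcast %arg to i64), 0
//   Hi = extract_element (bitcast %arg to i64), 1
// in two consecutive GPRs.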
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));
  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }
  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    //
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe
    // to perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}
EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}
/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
///                        glue purposes. In the case the DAG is already using
///                        physical register instead of virtual, we should glue
///                        our new SDValue to InFlag SDvalue.
/// \return a new SDValue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register
    unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type
  SDValue Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type
  SDValue Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}
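// E.g. the two i32 halves read above are reassembled as
//   (concat_vectors (bitcast %lo to v32i1), (bitcast %hi to v32i1))
// giving back the original v64i1 mask.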
/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine, this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64 bit machine, there is no need to truncate the value, only
    // bitcast it.
    return DAG.getBitcast(ValVT, ValReturned);
  }

  MVT maskLen;
  switch (ValVT.getSimpleVT().SimpleTy) {
  case MVT::v8i1:
    maskLen = MVT::i8;
    break;
  case MVT::v16i1:
    maskLen = MVT::i16;
    break;
  case MVT::v32i1:
    maskLen = MVT::i32;
    break;
  default:
    llvm_unreachable("Expecting a vector of i1 types");
  }

  ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  return DAG.getBitcast(ValVT, ValReturned);
}
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}
//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be the standard for many Windows
//  API routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}
/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in regular call.
    // Offset of last argument needs to be set to -4/-8 bytes.
    // Where offset of the first argument out of two, should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
    if (Subtarget.is64Bit() && Ins.size() == 2) {
      // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust SP offset of interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }
  // This is an argument in memory. We might be able to perform copy elision.
  if (Flags.isCopyElisionCandidate()) {
    EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*Immutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      int FI = MFI.getObjectIndexBegin();
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  // Adjust SP offset of interrupt parameter.
  if (CallConv == CallingConv::X86_INTR) {
    MFI.setObjectOffset(FI, Offset);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());

  if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }

  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}
// FIXME: Get this from tablegen.
static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
                                                CallingConv::ID CallConv,
                                                const X86Subtarget &Subtarget) {
  assert(Subtarget.is64Bit());
  if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR. So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return None;
  }

  const Function *Fn = MF.getFunction();
  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
  bool isSoftFloat = Subtarget.useSoftFloat();
  assert(!(isSoftFloat && NoImplicitFloatOps) &&
         "SSE register cannot be used when SSE is disabled!");
  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;

  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
#ifndef NDEBUG
static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
                          return A.getValNo() < B.getValNo();
                        });
}
#endif
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  const Function *Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget.isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");

  if (CallConv == CallingConv::X86_INTR) {
    bool isLegal = Ins.size() == 1 ||
                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
  }

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
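  // The Win64 ABI reserves a 32-byte shadow area (home slots for the RCX, RDX,
  // R8 and R9 arguments) directly above the return address; accounting for it
  // here keeps the computed offsets of any further stack arguments correct.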
  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order as the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::FR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() &&
      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                   CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location. Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
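      // Layout of the register save area created below (SysV AMD64 ABI):
      // 6 GPRs * 8 bytes = 48 bytes, followed by 8 XMM registers * 16 bytes =
      // 128 bytes; va_list's gp_offset and fp_offset index into this block.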
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }

  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    if (Subtarget.hasAVX512() &&
        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                     CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Conservatively forward AL on x86_64, since it might be used for varargs.
    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &F : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
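    // 0xAAAAAAA appears to be an arbitrary but recognizable sentinel; these
    // indices are never valid on 32-bit targets, so an accidental use should
    // be easy to spot.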
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame). Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  if (CallConv == CallingConv::X86_RegCall ||
      Fn->hasFnAttribute("no_caller_saved_registers")) {
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
  if (Flags.isByVal())
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(
      Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
/// Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy(DAG.getDataLayout());
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  return SDValue(OutRetAddr.getNode(), 1);
}
/// Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff != 0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, const SDLoc &dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int NewReturnAddrFI =
    MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
                                        false);
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), NewReturnAddrFI));
  return Chain;
}
/// Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
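/// For example, for v4f32 this builds the mask <4,1,2,3>: element 0 is taken
/// from V2 and the remaining elements from V1, matching MOVSS semantics.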
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  SDLoc &dl                             = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool &isTailCall                      = CLI.IsTailCall;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
  bool IsSibcall = false;
  X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
  auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
  const CallInst *CI =
      CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
  const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
  bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
                 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

  if (CallConv == CallingConv::X86_INTR)
    report_fatal_error("X86 interrupts may not be called directly");

  if (Attr.getValueAsString() == "true")
    isTailCall = false;

  if (Subtarget.isPICStyleGOT() &&
      !MF.getTarget().Options.GuaranteedTailCallOpt) {
    // If we are using a GOT, disable tail calls to external symbols with
    // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
    // GuaranteedTailCallOpt will override this.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }

  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Outs, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own caller's stack.
    NumBytes = 0;
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
           canGuaranteeTCO(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;
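    // For example, if the caller pops 16 bytes on return but this call needs
    // 24 bytes of arguments, FPDiff is -8: the return address has to move down
    // 8 bytes to make room.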
    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been allocated
  // for us and is right at the top of the stack. We don't support multiple
  // arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // The next loop assumes that the locations are in the same order as the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
    assert(OutIndex < Outs.size() && "Invalid Out index");
    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[I];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[OutIndex];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (Arg.getValueType().isVector() &&
          Arg.getValueType().getVectorElementType() == MVT::i1)
        Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
      else if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getBitcast(MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(
          Chain, dl, Arg, SpillSlot,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Split v64i1 value into two registers
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
                         Subtarget);
    } else if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
        // shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy(DAG.getDataLayout()));
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(
          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                          getPointerTy(DAG.getDataLayout()))));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasLocalLinkage() &&
          G->getGlobal()->hasDefaultVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
    assert((Subtarget.hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, dl,
                                                        MVT::i8)));
  }

  if (isVarArg && IsMustTail) {
    const auto &Forwards = X86Info->getForwardedMustTailRegParms();
    for (const auto &F : Forwards) {
      SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }

  // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
      CCValAssign &VA = ArgLocs[I];

      if (VA.isRegLoc()) {
        if (VA.needsCustom()) {
          assert((CallConv == CallingConv::X86_RegCall) &&
                 "Expecting custom case only in regcall calling convention");
          // This means that we are in special case where one argument was
          // passed through two register locations - skip the next location.
          ++I;
        }

        continue;
      }

      assert(VA.isMemLoc());
      SDValue Arg = OutVals[OutsIndex];
      ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
      // Skip inalloca arguments. They don't require any work.
      if (Flags.isInAlloca())
        continue;
      // Create frame index.
      int32_t Offset = VA.getLocMemOffset() + FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
      FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
      FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

      if (Flags.isByVal()) {
        // Copy relative to framepointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                        getPointerTy(DAG.getDataLayout()));
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                             StackPtr, Source);

        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
        // Store relative to framepointer.
        MemOpChains2.push_back(DAG.getStore(
            ArgChain, dl, Arg, FIN,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.
    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportStorageClass()) {
      unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

      Callee = DAG.getTargetGlobalAddress(
          GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

      if (OpFlags == X86II::MO_GOTPCREL) {
        // Add a wrapper.
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
                             getPointerTy(DAG.getDataLayout()), Callee);
        // Add extra indirection
        Callee = DAG.getLoad(
            getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
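        // The callee's address is loaded from its GOT entry and the call goes
        // through that value, e.g. "call *foo@GOTPCREL(%rip)".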
      }
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
    unsigned char OpFlags =
        Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

    Callee = DAG.getTargetExternalSymbol(
        S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
  } else if (Subtarget.isTarget64BitILP32() &&
             Callee->getValueType(0) == MVT::i32) {
    // Zero-extend the 32-bit Callee address into a 64-bit one according to the
    // x32 ABI.
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
  // set X86_INTR calling convention because it has the same CSR mask
  // (same preserved registers).
  const uint32_t *Mask = RegInfo->getCallPreservedMask(
      MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  // If this is an invoke in a 32-bit function using a funclet-based
  // personality, assume the function clobbers all registers. If an exception
  // is thrown, the runtime will not restore CSRs.
  // FIXME: Model this more precisely so that we can register allocate across
  // the normal edge and spill and fill across the exceptional edge.
  if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
    const Function *CallerFn = MF.getFunction();
    EHPersonality Pers =
        CallerFn->hasPersonalityFn()
            ? classifyEHPersonality(CallerFn->getPersonalityFn())
            : EHPersonality::Unknown;
    if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }

  // Define a new register mask from the existing mask.
  uint32_t *RegMask = nullptr;

  // In some calling conventions we need to remove the used physical registers
  // from the reg mask.
  if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

    // Allocate a new Reg Mask and copy Mask.
    RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
    memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

    // Make sure all sub registers of the argument registers are reset
    // in the RegMask.
    for (auto const &RegPair : RegsToPass)
      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));

    // Create the RegMask Operand according to our updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else
    // Create the RegMask Operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
           !Subtarget.getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.

  if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
    // No need to reset the stack after the call if the call doesn't return. To
    // keep the MI verifier happy, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}
//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like stdcall, the callee cleans up the arguments, but the convention
//  differs in that ECX is reserved for storing the tail-called function
//  address. Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
//  for example.)
//  If a tail-called function has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved framepointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..
/// Make the stack size aligned to, e.g., 16n + 12 for a 16-byte alignment
/// requirement, leaving room for the return-address slot.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  unsigned SlotSize = RegInfo->getSlotSize();
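  // Worked example: StackSize = 20, StackAlignment = 16, SlotSize = 4 gives
  // (20 & 15) = 4 <= 12, so Offset becomes 20 + (12 - 4) = 28 = 16 + 12.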
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Remainder is small enough, so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out the lower bits, then add the stack alignment once plus the
    // (StackAlignment - SlotSize) bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }
  return Offset;
}
/// Return true if the given stack call argument is already available in the
/// same position (relatively) of the caller's incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII, const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;

  for (;;) {
    // Look through nodes that don't alter the bits of the incoming value.
    unsigned Op = Arg.getOpcode();
    if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
      Arg = Arg.getOperand(0);
      continue;
    }
    if (Op == ISD::TRUNCATE) {
      const SDValue &TruncInput = Arg.getOperand(0);
      if (TruncInput.getOpcode() == ISD::AssertZext &&
          cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
              Arg.getValueType()) {
        Arg = TruncInput.getOperand(0);
        continue;
      }
    }
    break;
  }

  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
           Opcode == X86::LEA64_32r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
    FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
    FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;

  // If this is not byval, check that the argument stack object is immutable.
  // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass the
  // mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;

  if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
    // If the argument location is wider than the argument type, check that any
    // extension flags match.
    if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getNextStackOffset();

    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}

//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//
static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

static bool MayFoldIntoZeroExtend(SDValue Op) {
  if (Op.hasOneUse()) {
    unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
  }
  return false;
}
static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  case X86ISD::BLENDI:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::MOVLHPS:
  case X86ISD::MOVLHPD:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLPS:
  case X86ISD::MOVLPD:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::UNPCKL:
  case X86ISD::UNPCKH:
  case X86ISD::VBROADCAST:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMI:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VZEXT_MOVL:
    return true;
  }
}
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERMIL2:
  case X86ISD::VPPERM:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    unsigned SlotSize = RegInfo->getSlotSize();
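    // The slot lives at offset -SlotSize: directly below the incoming argument
    // area, which is where the CALL instruction pushed the return address.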
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model we assume that the latest object is 16MB before
  // the end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets, since
  // they may be just off, but we will accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset >= 0)
    return true;

  return false;
}
/// Determines whether the callee is required to pop its own arguments.
/// Callee pop is necessary to support tail calls.
bool X86::isCalleePop(CallingConv::ID CallingConv,
                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
  // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return !is64Bit;
  }
}
/// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_NE:
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return true;
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
    return false;
  }
}
4379 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4380 switch (SetCCOpcode) {
4381 default: llvm_unreachable("Invalid integer condition!");
4382 case ISD::SETEQ: return X86::COND_E;
4383 case ISD::SETGT: return X86::COND_G;
4384 case ISD::SETGE: return X86::COND_GE;
4385 case ISD::SETLT: return X86::COND_L;
4386 case ISD::SETLE: return X86::COND_LE;
4387 case ISD::SETNE: return X86::COND_NE;
4388 case ISD::SETULT: return X86::COND_B;
4389 case ISD::SETUGT: return X86::COND_A;
4390 case ISD::SETULE: return X86::COND_BE;
4391 case ISD::SETUGE: return X86::COND_AE;
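// For example, (setlt x, y) selects to COND_L (jl/cmovl) while (setult x, y)
// selects to COND_B (jb/cmovb); the CMP instruction sets SF/OF and CF at the
// same time, so only the condition code consumed afterwards differs.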
/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
/// condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
                                    bool isFP, SDValue &LHS, SDValue &RHS,
                                    SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_NS;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      }
      if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, DL, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    return TranslateIntegerX86CC(SetCCOpcode);
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if (ISD::isNON_EXTLoad(LHS.getNode()) &&
      !ISD::isNON_EXTLoad(RHS.getNode())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

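// For example, (setolt x, y) cannot be tested directly: UCOMISS sets CF both
// when x < y and when the operands are unordered. After swapping the operands
// the predicate becomes (setogt y, x), which maps to COND_A (CF == 0 and
// ZF == 0) and is correctly false for unordered inputs.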
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {

  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.readMem = false;
  Info.writeMem = false;
  Info.vol = false;
  Info.offset = 0;

  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.readMem = true;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  default:
    return false;
  }

  return true;
}

/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  // Extracting the low subvector is a subregister copy, and extracting the
  // subvector immediately above it is a single extract instruction.
  return (Index == 0 || Index == ResVT.getVectorNumElements());
}

bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}

bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}

MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}

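// For example, a 16-byte equality compare can be selected as
//   movdqu   (%rdi), %xmm0
//   pcmpeqb  (%rsi), %xmm0
//   pmovmskb %xmm0, %eax
//   cmpl     $0xffff, %eax
// which is why v16i8 is reported as fast whenever it is legal.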
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}

/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size, falls within the specified
/// sequential range [Low, Low+Size), or is undef or is zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, beginning
/// from position Pos and ending in Pos+Size is undef or is zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}

/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zero-ed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // a pair.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }
  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}

/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can
/// always succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}

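// For example, scaling the v2i64 mask <1, -1> by 2 produces the v4i32 mask
// <2, 3, -1, -1>: each defined index expands into Scale consecutive narrow
// indices and each sentinel is simply repeated Scale times.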
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(1);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}

/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(2);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}

bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}

static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");

  uint64_t Index = N->getConstantOperandVal(1);
  MVT VecVT = N->getOperand(0).getSimpleValueType();
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");

  uint64_t Index = N->getConstantOperandVal(2);
  MVT VecVT = N->getSimpleValueType(0);
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

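// For example, extracting the upper half of a v8i32 vector (element index 4)
// with 128-bit chunks gives NumElemsPerChunk == 4 and therefore the
// immediate 1, matching vextracti128 $1.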
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 / VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 / VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 / VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 / VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}

/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}

// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}

static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}

/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}

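// For example, the zero vectors for v8i32, v8f32 and v4f64 are all built as
// the same v8i32 all-zeros node and differ only in the final bitcast, so a
// function that uses several zero types still materializes a single constant.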
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}

/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}

static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting an UNDEF subvector leaves Result unchanged.
  if (Vec.isUndef())
    return Result;

  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}

// Return true if the instruction zeroes the unused upper part of the
// destination and accepts a mask.
static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
  switch (Opcode) {
  default:
    return false;
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
    return true;
  }
}

/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;

  MVT OpVT = Op.getSimpleValueType();
  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  // There are 3 possible cases:
  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
  // 2. Subvector should be inserted in the upper part
  //    (IdxVal + SubVecNumElems == NumElems)
  // 3. Subvector should be inserted in the middle (for example v2i1
  //    to v16i1, index 2)

  // If this node widens - by concatenating zeroes - the type of the result
  // of a node with an instruction that zeroes all upper (irrelevant) bits of
  // the output register, mark this node as legal to enable replacing them
  // with the v8i1 version of the previous instruction during instruction
  // selection. For example, the VPCMPEQDZ128rr instruction stores its v4i1
  // result in a k-reg while zeroing all the remaining upper 60 bits of the
  // register. If the result of such an instruction is inserted into an
  // all-zeros vector, then we can safely remove the insert_vector (in
  // instruction selection) as the cmp instruction already zeroed the rest of
  // the register.
  if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
      (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
       (SubVec.getOpcode() == ISD::AND &&
        (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
         isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
    return Op;

  // extend to natively supported kshift
  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  SDValue Undef = DAG.getUNDEF(WideOpVT);
  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                                   Undef, SubVec, ZeroIdx);

  // Extract sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };

  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
    return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero lower bits of the Vec
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             SubVec, ZeroIdx);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Simple case when we put subvector in the upper part
  if (IdxVal + SubVecNumElems == NumElems) {
    // Zero upper bits of the Vec
    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // Subvector should be inserted in the middle - use shuffle
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, ZeroIdx);
  SmallVector<int, 64> Mask;
  for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i : i + NumElems);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}

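// For example, inserting a v8i1 subvector into the low half of a v16i1 vector
// (IdxVal == 0) stays entirely in mask registers: the low 8 bits of Vec are
// cleared with Vec = KSHIFTL(KSHIFTR(Vec, 8), 8), the subvector is zero
// extended to 16 bits, and the two halves are combined with a single OR.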
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}

static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}

/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}

static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector())
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}

/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();

  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}

/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

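// For example, for v4i32 getUnpackl builds the mask <0, 4, 1, 5> (matching
// punpckldq) and getUnpackh builds <2, 6, 3, 7> (matching punpckhdq); for
// 256-bit types the same pattern repeats within each 128-bit lane.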
/// Return a vector_shuffle of the specified vector of zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}

static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
         V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  SDValue Ptr = Load->getBasePtr();
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
      Ptr->getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}

// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  // Bitcast a source array of element bits to the target size.
  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
           "Constant bit sizes don't match");

    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
      return false;

    // If we're already the right size, don't bother bitcasting.
    if (NumSrcElts == NumElts) {
      UndefElts = UndefSrcElts;
      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
      return true;
    }

    // Extract all the undef/constant element data and pack into single bitsets.
    APInt UndefBits(SizeInBits, 0);
    APInt MaskBits(SizeInBits, 0);

    for (unsigned i = 0; i != NumSrcElts; ++i) {
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (UndefSrcElts[i])
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
      MaskBits.insertBits(SrcEltBits[i], BitOffset);
    }

    // Split the undef/constant single bitset data into the target elements.
    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };

  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned UndefBitIndex) {
    if (!Cst)
      return false;
    if (isa<UndefValue>(Cst)) {
      Undefs.setBit(UndefBitIndex);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask = CInt->getValue();
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask = CFP->getValueAPF().bitcastToAPInt();
      return true;
    }
    return false;
  };

  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      if (Src.isUndef()) {
        UndefSrcElts.setBit(i);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    }
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
    unsigned NumSrcElts = CstTy->getVectorNumElements();

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
    for (unsigned i = 0; i != NumSrcElts; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
                               UndefSrcElts, i))
        return false;

    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= VT.getScalarSizeInBits()) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

      APInt UndefSrcElts(NumSrcElts, 0);
      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
        if (UndefSrcElts[0])
          UndefSrcElts.setBits(0, NumSrcElts);
        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
        return CastBitData(UndefSrcElts, SrcEltBits);
      }
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

    APInt UndefSrcElts(NumSrcElts, 0);
    SmallVector<APInt, 64> SrcEltBits;
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
    return CastBitData(UndefSrcElts, SrcEltBits);
  }

  return false;
}

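// For example, reading the v2i64 build vector <0x0000000100000002, undef>
// with EltSizeInBits == 32 yields EltBits == {2, 1, ?, ?}: the wide source
// elements are packed into one bit string and re-split, and the two elements
// covered by the undef source are reported through UndefElts (when whole
// undefs are allowed).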
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

  // Extract the raw target constant bits.
  // FIXME: We currently don't support UNDEF bits or mask entries.
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}

/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::EXTRQI:
    if (isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(N->getOperand(2))) {
      int BitLen = N->getConstantOperandVal(1);
      int BitIdx = N->getConstantOperandVal(2);
      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
      IsUnary = true;
    }
    break;
  case X86ISD::INSERTQI:
    if (isa<ConstantSDNode>(N->getOperand(2)) &&
        isa<ConstantSDNode>(N->getOperand(3))) {
      int BitLen = N->getConstantOperandVal(2);
      int BitIdx = N->getConstantOperandVal(3);
      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    }
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it onto the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  case X86ISD::VPERMIL2: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      SmallVector<uint64_t, 32> RawMask;
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    SmallVector<uint64_t, 32> RawMask;
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMIV3: {
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(0);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }
  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}

/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        Mask[i] = SM_SentinelUndef;
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        Mask[i] = SM_SentinelUndef;
      else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
    }
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               SelectionDAG &DAG) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
         "Expected byte aligned value types");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a vector,
    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
    SDValue N0 = N.getOperand(0);
    SDValue SrcExtract;

    if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        N0.getOperand(0).getValueType() == VT) {
      SrcExtract = N0;
    } else if (N0.getOpcode() == ISD::AssertZext &&
               N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
               cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
      SrcExtract = N0.getOperand(0);
      assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
    } else if (N0.getOpcode() == ISD::AssertZext &&
               N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
               cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
      SrcExtract = N0.getOperand(0);
      assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
    }

    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    unsigned NumSrcElts = SrcVT.getVectorNumElements();
    unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    if (NumSrcElts <= SrcIdx)
      return false;

    Ops.push_back(SrcVec);
    Mask.push_back(SrcIdx);
    Mask.append(NumZeros, SM_SentinelZero);
    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    uint64_t InIdx = N.getConstantOperandVal(2);
    assert(InIdx < NumElts && "Illegal insertion index");

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ISD::AssertZext ||
        InScl.getOperand(0).getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0).getOperand(0);
    uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
    assert(ExIdx < NumElts && "Illegal extraction index");
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::PACKSS: {
    // If we know input saturation won't happen we can treat this
    // as a truncation shuffle.
    if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
        DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
      return false;

    Ops.push_back(N.getOperand(0));
    Ops.push_back(N.getOperand(1));
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i * 2);
    return true;
  }
6047 case X86ISD::VSRLI: {
6048 uint64_t ShiftVal = N.getConstantOperandVal(1);
6049 // Out of range bit shifts are guaranteed to be zero.
6050 if (NumBitsPerElt <= ShiftVal) {
6051 Mask.append(NumElts, SM_SentinelZero);
6055 // We can only decode 'whole byte' bit shifts as shuffles.
6056 if ((ShiftVal % 8) != 0)
6059 uint64_t ByteShift = ShiftVal / 8;
6060 unsigned NumBytes = NumSizeInBits / 8;
6061 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6062 Ops.push_back(N.getOperand(0));
6064 // Clear mask to all zeros and insert the shifted byte indices.
6065 Mask.append(NumBytes, SM_SentinelZero);
6067 if (X86ISD::VSHLI == Opcode) {
6068 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6069 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6070 Mask[i + j] = i + j - ByteShift;
6072 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6073 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6074 Mask[i + j - ByteShift] = i + j;
6078 case ISD::ZERO_EXTEND_VECTOR_INREG:
6079 case X86ISD::VZEXT: {
6080 // TODO - add support for VPMOVZX with smaller input vector types.
6081 SDValue Src = N.getOperand(0);
6082 MVT SrcVT = Src.getSimpleValueType();
6083 if (NumSizeInBits != SrcVT.getSizeInBits())
6084 break;
6085 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6086 Ops.push_back(Src);
6087 return true;
6088 }
6089 }
6091 return false;
6092 }
6094 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
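/// e.g. with Inputs = {A, B} and MaskWidth = 4, a mask of <4,5,6,7> only
/// references B, so A is removed and the mask is rebased to <0,1,2,3>.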
6095 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6096 SmallVectorImpl<int> &Mask) {
6097 int MaskWidth = Mask.size();
6098 SmallVector<SDValue, 16> UsedInputs;
6099 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6100 int lo = UsedInputs.size() * MaskWidth;
6101 int hi = lo + MaskWidth;
6102 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6103 UsedInputs.push_back(Inputs[i]);
6104 continue;
6105 }
6106 for (int &M : Mask)
6107 if (lo <= M)
6108 M -= MaskWidth;
6109 }
6110 Inputs = UsedInputs;
6111 }
6113 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6114 /// and sets the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6115 /// remaining input indices in case we now have a unary shuffle and adjusts the
6116 /// inputs accordingly.
6117 /// Returns true if the target shuffle mask was decoded.
6118 static bool resolveTargetShuffleInputs(SDValue Op,
6119 SmallVectorImpl<SDValue> &Inputs,
6120 SmallVectorImpl<int> &Mask,
6121 SelectionDAG &DAG) {
6122 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6123 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6124 return false;
6126 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6127 return true;
6128 }
6130 /// Returns the scalar element that will make up the ith
6131 /// element of the result of the vector shuffle.
6132 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6133 unsigned Depth) {
6134 if (Depth == 6)
6135 return SDValue(); // Limit search depth.
6137 SDValue V = SDValue(N, 0);
6138 EVT VT = V.getValueType();
6139 unsigned Opcode = V.getOpcode();
6141 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6142 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6143 int Elt = SV->getMaskElt(Index);
6145 if (Elt < 0)
6146 return DAG.getUNDEF(VT.getVectorElementType());
6148 unsigned NumElems = VT.getVectorNumElements();
6149 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6150 : SV->getOperand(1);
6151 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6152 }
6154 // Recurse into target specific vector shuffles to find scalars.
6155 if (isTargetShuffle(Opcode)) {
6156 MVT ShufVT = V.getSimpleValueType();
6157 MVT ShufSVT = ShufVT.getVectorElementType();
6158 int NumElems = (int)ShufVT.getVectorNumElements();
6159 SmallVector<int, 16> ShuffleMask;
6160 SmallVector<SDValue, 16> ShuffleOps;
6161 bool IsUnary;
6163 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6164 return SDValue();
6166 int Elt = ShuffleMask[Index];
6167 if (Elt == SM_SentinelZero)
6168 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6169 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6170 if (Elt == SM_SentinelUndef)
6171 return DAG.getUNDEF(ShufSVT);
6173 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6174 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6175 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6176 Depth+1);
6177 }
6179 // Actual nodes that may contain scalar elements
6180 if (Opcode == ISD::BITCAST) {
6181 V = V.getOperand(0);
6182 EVT SrcVT = V.getValueType();
6183 unsigned NumElems = VT.getVectorNumElements();
6185 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6186 return SDValue();
6187 }
6189 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6190 return (Index == 0) ? V.getOperand(0)
6191 : DAG.getUNDEF(VT.getVectorElementType());
6193 if (V.getOpcode() == ISD::BUILD_VECTOR)
6194 return V.getOperand(Index);
6196 return SDValue();
6197 }
6199 /// Custom lower build_vector of v16i8.
6200 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6201 unsigned NumNonZero, unsigned NumZero,
6202 SelectionDAG &DAG,
6203 const X86Subtarget &Subtarget) {
6204 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6205 return SDValue();
6207 SDLoc dl(Op);
6208 SDValue V;
6209 bool First = true;
6211 // SSE4.1 - use PINSRB to insert each byte directly.
6212 if (Subtarget.hasSSE41()) {
6213 for (unsigned i = 0; i < 16; ++i) {
6214 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6215 if (IsNonZero) {
6216 // If the build vector contains zeros or our first insertion is not the
6217 // first index, then insert into a zero vector to break any register
6218 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6219 if (First) {
6220 First = false;
6221 if (NumZero || 0 != i)
6222 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6223 else {
6224 assert(0 == i && "Expected insertion into zero-index");
6225 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6226 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6227 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6228 V = DAG.getBitcast(MVT::v16i8, V);
6229 }
6230 }
6232 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6233 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6234 }
6235 }
6237 return V;
6238 }
6240 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6241 for (unsigned i = 0; i < 16; ++i) {
6242 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6243 if (ThisIsNonZero && First) {
6244 if (NumZero)
6245 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6246 else
6247 V = DAG.getUNDEF(MVT::v8i16);
6248 First = false;
6249 }
6251 if ((i & 1) != 0) {
6252 // FIXME: Investigate extending to i32 instead of just i16.
6253 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
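// e.g. for an odd index i, bytes Op[i-1] and Op[i] are merged into the i16
// value (zext(Op[i]) << 8) | zext(Op[i-1]) and inserted as element i / 2.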
6254 SDValue ThisElt, LastElt;
6255 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6256 if (LastIsNonZero) {
6257 LastElt =
6258 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6259 }
6260 if (ThisIsNonZero) {
6261 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6262 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6263 DAG.getConstant(8, dl, MVT::i8));
6264 if (LastIsNonZero)
6265 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6266 } else
6267 ThisElt = LastElt;
6268 if (ThisElt) {
6269 if (First) {
6270 First = false;
6271 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6272 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6273 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6274 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6275 V = DAG.getBitcast(MVT::v8i16, V);
6276 } else
6277 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6278 DAG.getIntPtrConstant(i / 2, dl));
6279 }
6280 }
6281 }
6284 return DAG.getBitcast(MVT::v16i8, V);
6285 }
6287 /// Custom lower build_vector of v8i16.
6288 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6289 unsigned NumNonZero, unsigned NumZero,
6290 SelectionDAG &DAG,
6291 const X86Subtarget &Subtarget) {
6292 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6293 return SDValue();
6295 SDLoc dl(Op);
6296 SDValue V;
6297 bool First = true;
6298 for (unsigned i = 0; i < 8; ++i) {
6299 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6300 if (IsNonZero) {
6301 // If the build vector contains zeros or our first insertion is not the
6302 // first index, then insert into a zero vector to break any register
6303 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6304 if (First) {
6305 First = false;
6306 if (NumZero || 0 != i)
6307 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6308 else {
6309 assert(0 == i && "Expected insertion into zero-index");
6310 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6311 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6312 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6313 V = DAG.getBitcast(MVT::v8i16, V);
6314 }
6315 }
6317 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6318 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6319 }
6320 }
6322 return V;
6323 }
6325 /// Custom lower build_vector of v4i32 or v4f32.
6326 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6327 const X86Subtarget &Subtarget) {
6328 // Find all zeroable elements.
6329 std::bitset<4> Zeroable;
6330 for (int i=0; i < 4; ++i) {
6331 SDValue Elt = Op->getOperand(i);
6332 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6333 }
6334 assert(Zeroable.size() - Zeroable.count() > 1 &&
6335 "We expect at least two non-zero elements!");
6337 // We only know how to deal with build_vector nodes where elements are either
6338 // zeroable or extract_vector_elt with constant index.
6339 SDValue FirstNonZero;
6340 unsigned FirstNonZeroIdx;
6341 for (unsigned i = 0; i < 4; ++i) {
6342 if (Zeroable[i])
6343 continue;
6344 SDValue Elt = Op->getOperand(i);
6345 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6346 !isa<ConstantSDNode>(Elt.getOperand(1)))
6347 return SDValue();
6348 // Make sure that this node is extracting from a 128-bit vector.
6349 MVT VT = Elt.getOperand(0).getSimpleValueType();
6350 if (!VT.is128BitVector())
6351 return SDValue();
6352 if (!FirstNonZero.getNode()) {
6353 FirstNonZero = Elt;
6354 FirstNonZeroIdx = i;
6355 }
6356 }
6358 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6359 SDValue V1 = FirstNonZero.getOperand(0);
6360 MVT VT = V1.getSimpleValueType();
6362 // See if this build_vector can be lowered as a blend with zero.
6363 SDValue Elt;
6364 unsigned EltMaskIdx, EltIdx;
6365 int Mask[4];
6366 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6367 if (Zeroable[EltIdx]) {
6368 // The zero vector will be on the right hand side.
6369 Mask[EltIdx] = EltIdx+4;
6370 continue;
6371 }
6373 Elt = Op->getOperand(EltIdx);
6374 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6375 EltMaskIdx = Elt.getConstantOperandVal(1);
6376 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6377 break;
6378 Mask[EltIdx] = EltIdx;
6379 }
6381 if (EltIdx == 4) {
6382 // Let the shuffle legalizer deal with blend operations.
6383 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6384 if (V1.getSimpleValueType() != VT)
6385 V1 = DAG.getBitcast(VT, V1);
6386 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6387 }
6389 // See if we can lower this build_vector to an INSERTPS.
6390 if (!Subtarget.hasSSE41())
6391 return SDValue();
6393 SDValue V2 = Elt.getOperand(0);
6394 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6395 V1 = SDValue();
6397 bool CanFold = true;
6398 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6399 if (Zeroable[i])
6400 continue;
6402 SDValue Current = Op->getOperand(i);
6403 SDValue SrcVector = Current->getOperand(0);
6404 if (!V1.getNode())
6405 V1 = SrcVector;
6406 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6407 }
6409 if (!CanFold)
6410 return SDValue();
6412 assert(V1.getNode() && "Expected at least two non-zero elements!");
6413 if (V1.getSimpleValueType() != MVT::v4f32)
6414 V1 = DAG.getBitcast(MVT::v4f32, V1);
6415 if (V2.getSimpleValueType() != MVT::v4f32)
6416 V2 = DAG.getBitcast(MVT::v4f32, V2);
6418 // Ok, we can emit an INSERTPS instruction.
6419 unsigned ZMask = Zeroable.to_ulong();
6421 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6422 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
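// e.g. copying element 2 of V2 into lane 1 of the result while zeroing
// lane 3 gives InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98.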
6424 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6425 DAG.getIntPtrConstant(InsertPSMask, DL));
6426 return DAG.getBitcast(VT, Result);
6427 }
6429 /// Return a vector logical shift node.
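/// e.g. getVShift(true, MVT::v2i64, X, 64, ...) emits a whole-register byte
/// shift (VSHLDQ by 8 bytes) on the v16i8 bitcast of X.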
6430 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6431 SelectionDAG &DAG, const TargetLowering &TLI,
6432 const SDLoc &dl) {
6433 assert(VT.is128BitVector() && "Unknown type for VShift");
6434 MVT ShVT = MVT::v16i8;
6435 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6436 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6437 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6438 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6439 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6440 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6441 }
6443 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6444 SelectionDAG &DAG) {
6446 // Check if the scalar load can be widened into a vector load, and if
6447 // the address is "base + cst", see if the cst can be "absorbed" into
6448 // the shuffle mask.
6449 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6450 SDValue Ptr = LD->getBasePtr();
6451 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6452 return SDValue();
6453 EVT PVT = LD->getValueType(0);
6454 if (PVT != MVT::i32 && PVT != MVT::f32)
6455 return SDValue();
6457 int FI = -1;
6458 int64_t Offset = 0;
6459 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6460 FI = FINode->getIndex();
6461 Offset = 0;
6462 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6463 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6464 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6465 Offset = Ptr.getConstantOperandVal(1);
6466 Ptr = Ptr.getOperand(0);
6467 } else {
6468 return SDValue();
6469 }
6471 // FIXME: 256-bit vector instructions don't require strict alignment;
6472 // improve this code to support it better.
6473 unsigned RequiredAlign = VT.getSizeInBits()/8;
6474 SDValue Chain = LD->getChain();
6475 // Make sure the stack object alignment is at least 16 or 32.
6476 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6477 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6478 if (MFI.isFixedObjectIndex(FI)) {
6479 // Can't change the alignment. FIXME: It's possible to compute
6480 // the exact stack offset and reference FI + adjust offset instead,
6481 // if someone *really* cares about this; that's the way to implement it.
6482 return SDValue();
6483 }
6484 MFI.setObjectAlignment(FI, RequiredAlign);
6485 }
6488 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6489 // Ptr + (Offset & ~15).
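// e.g. with RequiredAlign = 16 and Offset = 20, StartOffset = 16 and
// EltNo = 1, so a four-element widened load is splatted with mask <1,1,1,1>.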
6492 if ((Offset % RequiredAlign) & 3)
6493 return SDValue();
6494 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6496 SDLoc DL(Ptr);
6497 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6498 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6501 int EltNo = (Offset - StartOffset) >> 2;
6502 unsigned NumElems = VT.getVectorNumElements();
6504 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6505 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6506 LD->getPointerInfo().getWithOffset(StartOffset));
6508 SmallVector<int, 8> Mask(NumElems, EltNo);
6510 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6511 }
6513 return SDValue();
6514 }
6516 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6517 /// elements can be replaced by a single large load which has the same value as
6518 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6520 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6521 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6522 const SDLoc &DL, SelectionDAG &DAG,
6523 const X86Subtarget &Subtarget,
6524 bool isAfterLegalize) {
6525 unsigned NumElems = Elts.size();
6527 int LastLoadedElt = -1;
6528 SmallBitVector LoadMask(NumElems, false);
6529 SmallBitVector ZeroMask(NumElems, false);
6530 SmallBitVector UndefMask(NumElems, false);
6532 // For each element in the initializer, see if we've found a load, zero or an
6533 // undef.
6534 for (unsigned i = 0; i < NumElems; ++i) {
6535 SDValue Elt = peekThroughBitcasts(Elts[i]);
6536 if (!Elt.getNode())
6537 return SDValue();
6539 if (Elt.isUndef())
6540 UndefMask[i] = true;
6541 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6542 ZeroMask[i] = true;
6543 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6544 LoadMask[i] = true;
6545 LastLoadedElt = i;
6546 // Each loaded element must be the correct fractional portion of the
6547 // requested vector load.
6548 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6549 return SDValue();
6550 } else
6551 return SDValue();
6552 }
6553 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6554 "Incomplete element masks");
6556 // Handle Special Cases - all undef or undef/zero.
6557 if (UndefMask.count() == NumElems)
6558 return DAG.getUNDEF(VT);
6560 // FIXME: Should we return this as a BUILD_VECTOR instead?
6561 if ((ZeroMask | UndefMask).count() == NumElems)
6562 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6563 : DAG.getConstantFP(0.0, DL, VT);
6565 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6566 int FirstLoadedElt = LoadMask.find_first();
6567 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6568 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6569 EVT LDBaseVT = EltBase.getValueType();
6571 // Consecutive loads can contain UNDEFS but not ZERO elements.
6572 // Consecutive loads with UNDEF and ZERO elements require an
6573 // additional shuffle stage to clear the ZERO elements.
6574 bool IsConsecutiveLoad = true;
6575 bool IsConsecutiveLoadWithZeros = true;
6576 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6577 if (LoadMask[i]) {
6578 SDValue Elt = peekThroughBitcasts(Elts[i]);
6579 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6580 if (!DAG.areNonVolatileConsecutiveLoads(
6581 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6582 i - FirstLoadedElt)) {
6583 IsConsecutiveLoad = false;
6584 IsConsecutiveLoadWithZeros = false;
6585 break;
6586 }
6587 } else if (ZeroMask[i]) {
6588 IsConsecutiveLoad = false;
6589 }
6590 }
6592 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6593 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6594 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6595 "Cannot merge volatile loads.");
6596 SDValue NewLd =
6597 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6598 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6599 DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6600 return NewLd;
6601 };
6603 // LOAD - all consecutive load/undefs (must start/end with a load).
6604 // If we have found an entire vector of loads and undefs, then return a large
6605 // load of the entire vector width starting at the base pointer.
6606 // If the vector contains zeros, then attempt to shuffle those elements.
6607 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6608 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6609 assert(LDBase && "Did not find base load for merging consecutive loads");
6610 EVT EltVT = LDBase->getValueType(0);
6611 // Ensure that the input vector size for the merged loads matches the
6612 // cumulative size of the input elements.
6613 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6614 return SDValue();
6616 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6617 return SDValue();
6619 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6620 // will lower to regular temporal loads and use the cache.
6621 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6622 VT.is256BitVector() && !Subtarget.hasInt256())
6623 return SDValue();
6625 if (IsConsecutiveLoad)
6626 return CreateLoad(VT, LDBase);
6628 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6629 // vector and a zero vector to clear out the zero elements.
6630 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6631 SmallVector<int, 4> ClearMask(NumElems, -1);
6632 for (unsigned i = 0; i < NumElems; ++i) {
6633 if (ZeroMask[i])
6634 ClearMask[i] = i + NumElems;
6635 else if (LoadMask[i])
6636 ClearMask[i] = i;
6637 }
6638 SDValue V = CreateLoad(VT, LDBase);
6639 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6640 : DAG.getConstantFP(0.0, DL, VT);
6641 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6642 }
6643 }
6645 int LoadSize =
6646 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6648 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6649 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6650 (LoadSize == 32 || LoadSize == 64) &&
6651 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6652 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6653 : MVT::getIntegerVT(LoadSize);
6654 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6655 if (TLI.isTypeLegal(VecVT)) {
6656 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6657 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6658 SDValue ResNode =
6659 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6660 LDBase->getPointerInfo(),
6661 LDBase->getAlignment(),
6662 false/*isVolatile*/, true/*ReadMem*/,
6663 false/*WriteMem*/);
6664 DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6665 return DAG.getBitcast(VT, ResNode);
6666 }
6667 }
6669 return SDValue();
6670 }
6672 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6673 unsigned SplatBitSize, LLVMContext &C) {
6674 unsigned ScalarSize = VT.getScalarSizeInBits();
6675 unsigned NumElm = SplatBitSize / ScalarSize;
6677 SmallVector<Constant *, 32> ConstantVec;
6678 for (unsigned i = 0; i < NumElm; i++) {
6679 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6680 Constant *Const;
6681 if (VT.isFloatingPoint()) {
6682 if (ScalarSize == 32) {
6683 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6684 } else {
6685 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6686 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6687 }
6688 } else
6689 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6690 ConstantVec.push_back(Const);
6691 }
6692 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6693 }
6695 static bool isUseOfShuffle(SDNode *N) {
6696 for (auto *U : N->uses()) {
6697 if (isTargetShuffle(U->getOpcode()))
6698 return true;
6699 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6700 return isUseOfShuffle(U);
6701 }
6702 return false;
6703 }
6705 /// Attempt to use the vbroadcast instruction to generate a splat value
6706 /// from a splat BUILD_VECTOR which uses:
6707 /// a. A single scalar load, or a constant.
6708 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6710 /// The VBROADCAST node is returned when a pattern is found,
6711 /// or SDValue() otherwise.
6712 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6713 const X86Subtarget &Subtarget,
6714 SelectionDAG &DAG) {
6715 // VBROADCAST requires AVX.
6716 // TODO: Splats could be generated for non-AVX CPUs using SSE
6717 // instructions, but there's less potential gain for only 128-bit vectors.
6718 if (!Subtarget.hasAVX())
6719 return SDValue();
6721 MVT VT = BVOp->getSimpleValueType(0);
6722 SDLoc dl(BVOp);
6724 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6725 "Unsupported vector type for broadcast.");
6727 BitVector UndefElements;
6728 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6730 // We need a splat of a single value to use broadcast, and it doesn't
6731 // make any sense if the value is only in one element of the vector.
6732 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6733 APInt SplatValue, Undef;
6734 unsigned SplatBitSize;
6735 bool HasUndef;
6736 // Check if this is a repeated constant pattern suitable for broadcasting.
6737 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6738 SplatBitSize > VT.getScalarSizeInBits() &&
6739 SplatBitSize < VT.getSizeInBits()) {
6740 // Avoid replacing with broadcast when it's a use of a shuffle
6741 // instruction to preserve the present custom lowering of shuffles.
6742 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6743 return SDValue();
6744 // Replace BUILD_VECTOR with broadcast of the repeated constants.
6745 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6746 LLVMContext *Ctx = DAG.getContext();
6747 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6748 if (Subtarget.hasAVX()) {
6749 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6750 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6751 // Splatted value can fit in one INTEGER constant in constant pool.
6752 // Load the constant and broadcast it.
6753 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6754 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6755 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6756 SDValue CP = DAG.getConstantPool(C, PVT);
6757 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6759 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6760 SDValue Ld = DAG.getLoad(
6761 CVT, dl, DAG.getEntryNode(), CP,
6762 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6763 Alignment);
6764 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6765 MVT::getVectorVT(CVT, Repeat), Ld);
6766 return DAG.getBitcast(VT, Brdcst);
6767 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6768 // Splatted value can fit in one FLOAT constant in constant pool.
6769 // Load the constant and broadcast it.
6770 // AVX have support for 32 and 64 bit broadcast for floats only.
6771 // No 64bit integer in 32bit subtarget.
6772 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6773 // Lower the splat via APFloat directly, to avoid any conversion.
6774 Constant *C =
6775 SplatBitSize == 32
6776 ? ConstantFP::get(*Ctx,
6777 APFloat(APFloat::IEEEsingle(), SplatValue))
6778 : ConstantFP::get(*Ctx,
6779 APFloat(APFloat::IEEEdouble(), SplatValue));
6780 SDValue CP = DAG.getConstantPool(C, PVT);
6781 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6783 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6784 SDValue Ld = DAG.getLoad(
6785 CVT, dl, DAG.getEntryNode(), CP,
6786 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6787 Alignment);
6788 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6789 MVT::getVectorVT(CVT, Repeat), Ld);
6790 return DAG.getBitcast(VT, Brdcst);
6791 } else if (SplatBitSize > 64) {
6792 // Load the vector of constants and broadcast it.
6793 MVT CVT = VT.getScalarType();
6794 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6795 *Ctx);
6796 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6797 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6798 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6799 SDValue Ld = DAG.getLoad(
6800 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6801 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6802 Alignment);
6803 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6804 return DAG.getBitcast(VT, Brdcst);
6805 }
6806 }
6807 }
6808 return SDValue();
6809 }
6811 bool ConstSplatVal =
6812 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6814 // Make sure that all of the users of a non-constant load are from the
6815 // BUILD_VECTOR node.
6816 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6817 return SDValue();
6819 unsigned ScalarSize = Ld.getValueSizeInBits();
6820 bool IsGE256 = (VT.getSizeInBits() >= 256);
6822 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6823 // instruction to save 8 or more bytes of constant pool data.
6824 // TODO: If multiple splats are generated to load the same constant,
6825 // it may be detrimental to overall size. There needs to be a way to detect
6826 // that condition to know if this is truly a size win.
6827 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6829 // Handle broadcasting a single constant scalar from the constant pool
6830 // into a vector.
6831 // On Sandybridge (no AVX2), it is still better to load a constant vector
6832 // from the constant pool and not to broadcast it from a scalar.
6833 // But override that restriction when optimizing for size.
6834 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6835 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6836 EVT CVT = Ld.getValueType();
6837 assert(!CVT.isVector() && "Must not broadcast a vector type");
6839 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6840 // For size optimization, also splat v2f64 and v2i64, and for size opt
6841 // with AVX2, also splat i8 and i16.
6842 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6843 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6844 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6845 const Constant *C = nullptr;
6846 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6847 C = CI->getConstantIntValue();
6848 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6849 C = CF->getConstantFPValue();
6851 assert(C && "Invalid constant type");
6853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6854 SDValue CP =
6855 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6856 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6857 Ld = DAG.getLoad(
6858 CVT, dl, DAG.getEntryNode(), CP,
6859 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6860 Alignment);
6862 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6863 }
6864 }
6866 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6868 // Handle AVX2 in-register broadcasts.
6869 if (!IsLoad && Subtarget.hasInt256() &&
6870 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6871 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6873 // The scalar source must be a normal load.
6874 if (!IsLoad)
6875 return SDValue();
6877 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6878 (Subtarget.hasVLX() && ScalarSize == 64))
6879 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6881 // The integer check is needed for the 64-bit into 128-bit broadcast, so it
6882 // doesn't match double, since there is no vbroadcastsd xmm.
6883 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6884 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6886 }
6888 // Unsupported broadcast.
6889 return SDValue();
6890 }
6892 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6893 /// underlying vector and index.
6895 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6896 /// index.
6897 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6898 SDValue ExtIdx) {
6899 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6900 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6901 return Idx;
6903 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6904 // lowered this:
6905 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6906 // to:
6907 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6908 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6909 // undef)
6910 // Constant<2>)
6911 // In this case the vector is the extract_subvector expression and the index
6912 // is 2, as specified by the shuffle.
6913 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6914 SDValue ShuffleVec = SVOp->getOperand(0);
6915 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6916 assert(ShuffleVecVT.getVectorElementType() ==
6917 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6919 int ShuffleIdx = SVOp->getMaskElt(Idx);
6920 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6921 ExtractedFromVec = ShuffleVec;
6922 return ShuffleIdx;
6923 }
6924 return Idx;
6925 }
6927 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6928 MVT VT = Op.getSimpleValueType();
6930 // Skip if insert_vec_elt is not supported.
6931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6933 return SDValue();
6935 SDLoc DL(Op);
6936 unsigned NumElems = Op.getNumOperands();
6938 SDValue VecIn1;
6939 SDValue VecIn2;
6940 SmallVector<unsigned, 4> InsertIndices;
6941 SmallVector<int, 8> Mask(NumElems, -1);
6943 for (unsigned i = 0; i != NumElems; ++i) {
6944 unsigned Opc = Op.getOperand(i).getOpcode();
6946 if (Opc == ISD::UNDEF)
6947 continue;
6949 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6950 // Quit if more than one element needs inserting.
6951 if (InsertIndices.size() > 1)
6952 return SDValue();
6954 InsertIndices.push_back(i);
6955 continue;
6956 }
6958 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6959 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6961 // Quit if non-constant index.
6962 if (!isa<ConstantSDNode>(ExtIdx))
6963 return SDValue();
6964 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6966 // Quit if extracted from vector of different type.
6967 if (ExtractedFromVec.getValueType() != VT)
6968 return SDValue();
6970 if (!VecIn1.getNode())
6971 VecIn1 = ExtractedFromVec;
6972 else if (VecIn1 != ExtractedFromVec) {
6973 if (!VecIn2.getNode())
6974 VecIn2 = ExtractedFromVec;
6975 else if (VecIn2 != ExtractedFromVec)
6976 // Quit if more than 2 vectors to shuffle.
6977 return SDValue();
6978 }
6980 if (ExtractedFromVec == VecIn1)
6981 Mask[i] = Idx;
6982 else if (ExtractedFromVec == VecIn2)
6983 Mask[i] = Idx + NumElems;
6984 }
6986 if (!VecIn1.getNode())
6987 return SDValue();
6989 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6990 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6992 for (unsigned Idx : InsertIndices)
6993 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6994 DAG.getIntPtrConstant(Idx, DL));
6996 return NV;
6997 }
6999 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7000 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7001 Op.getScalarValueSizeInBits() == 1 &&
7002 "Can not convert non-constant vector");
7003 uint64_t Immediate = 0;
7004 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7005 SDValue In = Op.getOperand(idx);
7006 if (!In.isUndef())
7007 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7008 }
7009 SDLoc dl(Op);
7010 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7011 return DAG.getConstant(Immediate, dl, VT);
7012 }
7013 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7014 SDValue
7015 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7016 SDLoc dl(Op);
7017 MVT VT = Op.getSimpleValueType();
7018 assert((VT.getVectorElementType() == MVT::i1) &&
7019 "Unexpected type in LowerBUILD_VECTORvXi1!");
7022 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7023 return DAG.getTargetConstant(0, dl, VT);
7025 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7026 return DAG.getTargetConstant(1, dl, VT);
7028 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7029 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7030 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7031 return DAG.getBitcast(VT, Imm);
7032 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7033 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7034 DAG.getIntPtrConstant(0, dl));
7037 // Vector has one or more non-const elements
7038 uint64_t Immediate = 0;
7039 SmallVector<unsigned, 16> NonConstIdx;
7040 bool IsSplat = true;
7041 bool HasConstElts = false;
7042 int SplatIdx = -1;
7043 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7044 SDValue In = Op.getOperand(idx);
7045 if (In.isUndef())
7046 continue;
7047 if (!isa<ConstantSDNode>(In))
7048 NonConstIdx.push_back(idx);
7049 else {
7050 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7051 HasConstElts = true;
7052 }
7053 if (SplatIdx < 0)
7054 SplatIdx = idx;
7055 else if (In != Op.getOperand(SplatIdx))
7056 IsSplat = false;
7057 }
7059 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7060 if (IsSplat)
7061 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7062 DAG.getConstant(1, dl, VT),
7063 DAG.getConstant(0, dl, VT));
7065 // Insert elements one by one.
7066 SDValue DstVec;
7067 SDValue Imm;
7068 if (Immediate) {
7069 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7070 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7071 }
7072 else if (HasConstElts)
7073 Imm = DAG.getConstant(0, dl, VT);
7074 else
7075 Imm = DAG.getUNDEF(VT);
7076 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7077 DstVec = DAG.getBitcast(VT, Imm);
7078 else {
7079 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7080 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7081 DAG.getIntPtrConstant(0, dl));
7082 }
7084 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7085 unsigned InsertIdx = NonConstIdx[i];
7086 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7087 Op.getOperand(InsertIdx),
7088 DAG.getIntPtrConstant(InsertIdx, dl));
7089 }
7090 return DstVec;
7091 }
7093 /// \brief Return true if \p N implements a horizontal binop and return the
7094 /// operands for the horizontal binop into V0 and V1.
7096 /// This is a helper function of LowerToHorizontalOp().
7097 /// This function checks that the build_vector \p N in input implements a
7098 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7099 /// operation to match.
7100 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7101 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7102 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7103 /// arithmetic sub.
7105 /// This function only analyzes elements of \p N whose indices are
7106 /// in range [BaseIdx, LastIdx).
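/// e.g. the v4f32 build_vector
/// <fadd(A0,A1), fadd(A2,A3), fadd(B0,B1), fadd(B2,B3)> matches ISD::FADD
/// with V0 = A and V1 = B, which is exactly the behavior of HADDPS.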
7107 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7108 SelectionDAG &DAG,
7109 unsigned BaseIdx, unsigned LastIdx,
7110 SDValue &V0, SDValue &V1) {
7111 EVT VT = N->getValueType(0);
7113 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7114 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7115 "Invalid Vector in input!");
7117 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7118 bool CanFold = true;
7119 unsigned ExpectedVExtractIdx = BaseIdx;
7120 unsigned NumElts = LastIdx - BaseIdx;
7121 V0 = DAG.getUNDEF(VT);
7122 V1 = DAG.getUNDEF(VT);
7124 // Check if N implements a horizontal binop.
7125 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7126 SDValue Op = N->getOperand(i + BaseIdx);
7128 // Skip UNDEFs.
7129 if (Op->isUndef()) {
7130 // Update the expected vector extract index.
7131 if (i * 2 == NumElts)
7132 ExpectedVExtractIdx = BaseIdx;
7133 ExpectedVExtractIdx += 2;
7134 continue;
7135 }
7137 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7139 if (!CanFold)
7140 break;
7142 SDValue Op0 = Op.getOperand(0);
7143 SDValue Op1 = Op.getOperand(1);
7145 // Try to match the following pattern:
7146 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7147 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7148 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7149 Op0.getOperand(0) == Op1.getOperand(0) &&
7150 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7151 isa<ConstantSDNode>(Op1.getOperand(1)));
7155 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7156 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7158 if (i * 2 < NumElts) {
7159 if (V0.isUndef()) {
7160 V0 = Op0.getOperand(0);
7161 if (V0.getValueType() != VT)
7162 return false;
7163 }
7164 } else {
7165 if (V1.isUndef()) {
7166 V1 = Op0.getOperand(0);
7167 if (V1.getValueType() != VT)
7168 return false;
7169 }
7170 if (i * 2 == NumElts)
7171 ExpectedVExtractIdx = BaseIdx;
7172 }
7174 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7175 if (I0 == ExpectedVExtractIdx)
7176 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7177 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7178 // Try to match the following dag sequence:
7179 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7180 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7181 } else
7182 CanFold = false;
7184 ExpectedVExtractIdx += 2;
7185 }
7187 return CanFold;
7188 }
7190 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7191 /// a concat_vector.
7193 /// This is a helper function of LowerToHorizontalOp().
7194 /// This function expects two 256-bit vectors called V0 and V1.
7195 /// At first, each vector is split into two separate 128-bit vectors.
7196 /// Then, the resulting 128-bit vectors are used to implement two
7197 /// horizontal binary operations.
7199 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7201 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7202 /// the two new horizontal binop.
7203 /// When Mode is set, the first horizontal binop dag node would take as input
7204 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7205 /// horizontal binop dag node would take as input the lower 128-bit of V1
7206 /// and the upper 128-bit of V1.
7208 /// HADD V0_LO, V0_HI
7209 /// HADD V1_LO, V1_HI
7211 /// Otherwise, the first horizontal binop dag node takes as input the lower
7212 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7213 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7215 /// HADD V0_LO, V1_LO
7216 /// HADD V0_HI, V1_HI
7218 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7219 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7220 /// the upper 128-bits of the result.
7221 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7222 const SDLoc &DL, SelectionDAG &DAG,
7223 unsigned X86Opcode, bool Mode,
7224 bool isUndefLO, bool isUndefHI) {
7225 MVT VT = V0.getSimpleValueType();
7226 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7227 "Invalid nodes in input!");
7229 unsigned NumElts = VT.getVectorNumElements();
7230 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7231 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7232 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7233 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7234 MVT NewVT = V0_LO.getSimpleValueType();
7236 SDValue LO = DAG.getUNDEF(NewVT);
7237 SDValue HI = DAG.getUNDEF(NewVT);
7239 if (Mode) {
7240 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7241 if (!isUndefLO && !V0->isUndef())
7242 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7243 if (!isUndefHI && !V1->isUndef())
7244 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7245 } else {
7246 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7247 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7248 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7250 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7251 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7252 }
7254 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7255 }
7257 /// Returns true iff \p BV builds a vector with the result equivalent to
7258 /// the result of ADDSUB operation.
7259 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7260 /// are written to the parameters \p Opnd0 and \p Opnd1.
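/// e.g. the v4f32 build_vector
/// <fsub(A0,B0), fadd(A1,B1), fsub(A2,B2), fadd(A3,B3)> matches with
/// Opnd0 = A and Opnd1 = B.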
7261 static bool isAddSub(const BuildVectorSDNode *BV,
7262 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7263 SDValue &Opnd0, SDValue &Opnd1) {
7265 MVT VT = BV->getSimpleValueType(0);
7266 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7267 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7268 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7269 return false;
7271 unsigned NumElts = VT.getVectorNumElements();
7272 SDValue InVec0 = DAG.getUNDEF(VT);
7273 SDValue InVec1 = DAG.getUNDEF(VT);
7275 // Odd-numbered elements in the input build vector are obtained from
7276 // adding two integer/float elements.
7277 // Even-numbered elements in the input build vector are obtained from
7278 // subtracting two integer/float elements.
7279 unsigned ExpectedOpcode = ISD::FSUB;
7280 unsigned NextExpectedOpcode = ISD::FADD;
7281 bool AddFound = false;
7282 bool SubFound = false;
7284 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7285 SDValue Op = BV->getOperand(i);
7287 // Skip 'undef' values.
7288 unsigned Opcode = Op.getOpcode();
7289 if (Opcode == ISD::UNDEF) {
7290 std::swap(ExpectedOpcode, NextExpectedOpcode);
7291 continue;
7292 }
7294 // Early exit if we found an unexpected opcode.
7295 if (Opcode != ExpectedOpcode)
7296 return false;
7298 SDValue Op0 = Op.getOperand(0);
7299 SDValue Op1 = Op.getOperand(1);
7301 // Try to match the following pattern:
7302 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7303 // Early exit if we cannot match that sequence.
7304 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7305 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7306 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7307 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7308 Op0.getOperand(1) != Op1.getOperand(1))
7309 return false;
7311 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7312 if (I0 != i)
7313 return false;
7315 // We found a valid add/sub node. Update the information accordingly.
7316 if (i & 1)
7317 AddFound = true;
7318 else
7319 SubFound = true;
7321 // Update InVec0 and InVec1.
7322 if (InVec0.isUndef()) {
7323 InVec0 = Op0.getOperand(0);
7324 if (InVec0.getSimpleValueType() != VT)
7325 return false;
7326 }
7327 if (InVec1.isUndef()) {
7328 InVec1 = Op1.getOperand(0);
7329 if (InVec1.getSimpleValueType() != VT)
7330 return false;
7331 }
7333 // Make sure that operands in input to each add/sub node always
7334 // come from a same pair of vectors.
7335 if (InVec0 != Op0.getOperand(0)) {
7336 if (ExpectedOpcode == ISD::FSUB)
7337 return false;
7339 // FADD is commutable. Try to commute the operands
7340 // and then test again.
7341 std::swap(Op0, Op1);
7342 if (InVec0 != Op0.getOperand(0))
7343 return false;
7344 }
7346 if (InVec1 != Op1.getOperand(0))
7347 return false;
7349 // Update the pair of expected opcodes.
7350 std::swap(ExpectedOpcode, NextExpectedOpcode);
7351 }
7353 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7354 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7355 return false;
7357 Opnd0 = InVec0;
7358 Opnd1 = InVec1;
7359 return true;
7360 }
7362 /// Returns true if it is possible to fold MUL and an idiom that has already been
7363 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7364 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7365 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7367 /// Prior to calling this function it should be known that there is some
7368 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7369 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7370 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7371 /// of \p Opnd0 uses is expected to be equal to 2.
7372 /// For example, this function may be called for the following IR:
7373 /// %AB = fmul fast <2 x double> %A, %B
7374 /// %Sub = fsub fast <2 x double> %AB, %C
7375 /// %Add = fadd fast <2 x double> %AB, %C
7376 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7377 /// <2 x i32> <i32 0, i32 3>
7378 /// There is a def for %Addsub here, which potentially can be replaced by
7379 /// X86ISD::ADDSUB operation:
7380 /// %Addsub = X86ISD::ADDSUB %AB, %C
7381 /// and such ADDSUB can further be replaced with FMADDSUB:
7382 /// %Addsub = FMADDSUB %A, %B, %C.
7384 /// The main reason why this method is called before the replacement of the
7385 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7386 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7387 /// FMADDSUB is.
7388 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7389 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7390 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7391 !Subtarget.hasAnyFMA())
7392 return false;
7394 // FIXME: These checks must match the similar ones in
7395 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7396 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7397 // or MUL + ADDSUB to FMADDSUB.
7398 const TargetOptions &Options = DAG.getTarget().Options;
7399 bool AllowFusion =
7400 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7401 if (!AllowFusion)
7402 return false;
7404 Opnd2 = Opnd1;
7405 Opnd1 = Opnd0.getOperand(1);
7406 Opnd0 = Opnd0.getOperand(0);
7408 return true;
7409 }
7411 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7412 /// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7413 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7414 const X86Subtarget &Subtarget,
7415 SelectionDAG &DAG) {
7416 SDValue Opnd0, Opnd1;
7417 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7418 return SDValue();
7420 MVT VT = BV->getSimpleValueType(0);
7421 SDLoc DL(BV);
7423 // Try to generate X86ISD::FMADDSUB node here.
7424 SDValue Opnd2;
7425 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7426 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7428 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7429 // the ADDSUB idiom has been successfully recognized. There are no known
7430 // X86 targets with 512-bit ADDSUB instructions!
7431 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7432 // recognition.
7433 if (VT.is512BitVector())
7434 return SDValue();
7436 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7437 }
7439 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7440 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7441 const X86Subtarget &Subtarget,
7442 SelectionDAG &DAG) {
7443 MVT VT = BV->getSimpleValueType(0);
7444 unsigned NumElts = VT.getVectorNumElements();
7445 unsigned NumUndefsLO = 0;
7446 unsigned NumUndefsHI = 0;
7447 unsigned Half = NumElts/2;
7449 // Count the number of UNDEF operands in the input build_vector.
7450 for (unsigned i = 0, e = Half; i != e; ++i)
7451 if (BV->getOperand(i)->isUndef())
7452 NumUndefsLO++;
7454 for (unsigned i = Half, e = NumElts; i != e; ++i)
7455 if (BV->getOperand(i)->isUndef())
7456 NumUndefsHI++;
7458 // Early exit if this is either a build_vector of all UNDEFs or all the
7459 // operands but one are UNDEF.
7460 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7461 return SDValue();
7463 SDLoc DL(BV);
7464 SDValue InVec0, InVec1;
7465 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7466 // Try to match an SSE3 float HADD/HSUB.
7467 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7468 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7470 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7471 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7472 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7473 // Try to match an SSSE3 integer HADD/HSUB.
7474 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7475 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7477 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7478 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7479 }
7481 if (!Subtarget.hasAVX())
7482 return SDValue();
7484 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7485 // Try to match an AVX horizontal add/sub of packed single/double
7486 // precision floating point values from 256-bit vectors.
7487 SDValue InVec2, InVec3;
7488 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7489 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7490 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7491 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7492 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7494 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7495 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7496 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7497 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7498 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7499 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7500 // Try to match an AVX2 horizontal add/sub of signed integers.
7501 SDValue InVec2, InVec3;
7502 unsigned X86Opcode;
7503 bool CanFold = true;
7505 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7506 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7507 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7508 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7509 X86Opcode = X86ISD::HADD;
7510 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7511 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7512 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7513 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7514 X86Opcode = X86ISD::HSUB;
7515 else
7516 CanFold = false;
7518 if (CanFold) {
7519 // Fold this build_vector into a single horizontal add/sub.
7520 // Do this only if the target has AVX2.
7521 if (Subtarget.hasAVX2())
7522 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7524 // Do not try to expand this build_vector into a pair of horizontal
7525 // add/sub if we can emit a pair of scalar add/sub.
7526 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7527 return SDValue();
7529 // Convert this build_vector into a pair of horizontal binop followed by
7530 // a concat vector.
7531 bool isUndefLO = NumUndefsLO == Half;
7532 bool isUndefHI = NumUndefsHI == Half;
7533 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7534 isUndefLO, isUndefHI);
7535 }
7536 }
7538 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7539 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7540 unsigned X86Opcode;
7541 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7542 X86Opcode = X86ISD::HADD;
7543 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7544 X86Opcode = X86ISD::HSUB;
7545 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7546 X86Opcode = X86ISD::FHADD;
7547 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7548 X86Opcode = X86ISD::FHSUB;
7549 else
7550 return SDValue();
7552 // Don't try to expand this build_vector into a pair of horizontal add/sub
7553 // if we can simply emit a pair of scalar add/sub.
7554 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7555 return SDValue();
7557 // Convert this build_vector into two horizontal add/sub followed by
7558 // a concat vector.
7559 bool isUndefLO = NumUndefsLO == Half;
7560 bool isUndefHI = NumUndefsHI == Half;
7561 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7562 isUndefLO, isUndefHI);
7563 }
7565 return SDValue();
7566 }
7568 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7569 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7570 /// just apply the bit to the vectors.
7571 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7572 /// from this, but enough scalar bit operations are created from the later
7573 /// legalization + scalarization stages to need basic support.
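/// e.g. (build_vector (xor a, 1), (xor b, 2)) is lowered to
/// (xor (build_vector a, b), (build_vector 1, 2)).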
7574 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7575 SelectionDAG &DAG) {
7576 SDLoc DL(Op);
7577 MVT VT = Op->getSimpleValueType(0);
7578 unsigned NumElems = VT.getVectorNumElements();
7579 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7581 // Check that all elements have the same opcode.
7582 // TODO: Should we allow UNDEFS and if so how many?
7583 unsigned Opcode = Op->getOperand(0).getOpcode();
7584 for (unsigned i = 1; i < NumElems; ++i)
7585 if (Opcode != Op->getOperand(i).getOpcode())
7586 return SDValue();
7588 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7589 switch (Opcode) {
7590 default:
7591 return SDValue();
7592 case ISD::AND:
7593 case ISD::XOR:
7594 case ISD::OR:
7595 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7596 return SDValue();
7597 break;
7598 }
7600 SmallVector<SDValue, 4> LHSElts, RHSElts;
7601 for (SDValue Elt : Op->ops()) {
7602 SDValue LHS = Elt.getOperand(0);
7603 SDValue RHS = Elt.getOperand(1);
7605 // We expect the canonicalized RHS operand to be the constant.
7606 if (!isa<ConstantSDNode>(RHS))
7607 return SDValue();
7608 LHSElts.push_back(LHS);
7609 RHSElts.push_back(RHS);
7610 }
7612 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7613 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7614 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7615 }
7617 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7618 /// functionality to do this, so it's all zeros, all ones, or some derivation
7619 /// that is cheap to calculate.
7620 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7621 const X86Subtarget &Subtarget) {
7622 SDLoc DL(Op);
7623 MVT VT = Op.getSimpleValueType();
7625 // Vectors containing all zeros can be matched by pxor and xorps.
7626 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7627 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7628 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7629 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7630 return Op;
7632 return getZeroVector(VT, Subtarget, DAG, DL);
7633 }
7635 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7636 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7637 // vpcmpeqd on 256-bit vectors.
7638 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7639 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7640 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7641 return Op;
7643 return getOnesVector(VT, DAG, DL);
7644 }
7646 return SDValue();
7647 }
7649 SDValue
7650 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7651 SDLoc dl(Op);
7653 MVT VT = Op.getSimpleValueType();
7654 MVT ExtVT = VT.getVectorElementType();
7655 unsigned NumElems = Op.getNumOperands();
7657 // Generate vectors for predicate vectors.
7658 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7659 return LowerBUILD_VECTORvXi1(Op, DAG);
7661 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7662 return VectorConstant;
7664 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7665 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7666 return AddSub;
7667 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7668 return HorizontalOp;
7669 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7670 return Broadcast;
7671 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7672 return BitOp;
7674 unsigned EVTBits = ExtVT.getSizeInBits();
7676 unsigned NumZero = 0;
7677 unsigned NumNonZero = 0;
7678 uint64_t NonZeros = 0;
7679 bool IsAllConstants = true;
7680 SmallSet<SDValue, 8> Values;
7681 for (unsigned i = 0; i < NumElems; ++i) {
7682 SDValue Elt = Op.getOperand(i);
7683 if (Elt.isUndef())
7684 continue;
7685 Values.insert(Elt);
7686 if (Elt.getOpcode() != ISD::Constant &&
7687 Elt.getOpcode() != ISD::ConstantFP)
7688 IsAllConstants = false;
7689 if (X86::isZeroNode(Elt))
7690 NumZero++;
7691 else {
7692 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7693 NonZeros |= ((uint64_t)1 << i);
7694 NumNonZero++;
7695 }
7696 }
7698 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7699 if (NumNonZero == 0)
7700 return DAG.getUNDEF(VT);
7702 // Special case for single non-zero, non-undef, element.
7703 if (NumNonZero == 1) {
7704 unsigned Idx = countTrailingZeros(NonZeros);
7705 SDValue Item = Op.getOperand(Idx);
7707 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7708 // the value are obviously zero, truncate the value to i32 and do the
7709 // insertion that way. Only do this if the value is non-constant or if the
7710 // value is a constant being inserted into element 0. It is cheaper to do
7711 // a constant pool load than it is to do a movd + shuffle.
7712 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7713 (!IsAllConstants || Idx == 0)) {
7714 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7715 // Handle SSE only.
7716 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7717 MVT VecVT = MVT::v4i32;
7719 // Truncate the value (which may itself be a constant) to i32, and
7720 // convert it to a vector with movd (S2V+shuffle to zero extend).
7721 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7722 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7723 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7724 Item, Idx * 2, true, Subtarget, DAG));
7725 }
7726 }
7728 // If we have a constant or non-constant insertion into the low element of
7729 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7730 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7731 // depending on what the source datatype is.
7732 if (Idx == 0) {
7733 if (NumZero == 0)
7734 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7736 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7737 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7738 assert((VT.is128BitVector() || VT.is256BitVector() ||
7739 VT.is512BitVector()) &&
7740 "Expected an SSE value type!");
7741 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7742 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7743 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7746 // We can't directly insert an i8 or i16 into a vector, so zero extend
7747 // it to i32 first.
7748 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7749 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7750 if (VT.getSizeInBits() >= 256) {
7751 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7752 if (Subtarget.hasAVX()) {
7753 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7754 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7755 } else {
7756 // Without AVX, we need to extend to a 128-bit vector and then
7757 // insert into the 256-bit vector.
7758 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7759 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7760 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7761 }
7762 } else {
7763 assert(VT.is128BitVector() && "Expected an SSE value type!");
7764 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7765 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7766 }
7767 return DAG.getBitcast(VT, Item);
7768 }
7769 }
7771 // Is it a vector logical left shift?
7772 if (NumElems == 2 && Idx == 1 &&
7773 X86::isZeroNode(Op.getOperand(0)) &&
7774 !X86::isZeroNode(Op.getOperand(1))) {
7775 unsigned NumBits = VT.getSizeInBits();
7776 return getVShift(true, VT,
7777 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7778 VT, Op.getOperand(1)),
7779 NumBits/2, DAG, *this, dl);
7780 }
7782 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7783 return SDValue();
7785 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7786 // is a non-constant being inserted into an element other than the low one,
7787 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7788 // movd/movss) to move this into the low element, then shuffle it into
7790 if (EVTBits == 32) {
7791 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7792 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7796 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7797 if (Values.size() == 1) {
7798 if (EVTBits == 32) {
7799 // Instead of a shuffle like this:
7800 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7801 // Check if it's possible to issue this instead.
7802 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7803 unsigned Idx = countTrailingZeros(NonZeros);
7804 SDValue Item = Op.getOperand(Idx);
7805 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7806 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7807 }
7808 return SDValue();
7809 }
7811 // A vector full of immediates; various special cases are already
7812 // handled, so this is best done with a single constant-pool load.
7813 if (IsAllConstants)
7814 return SDValue();
7816 // See if we can use a vector load to get all of the elements.
7817 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7818 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7819 if (SDValue LD =
7820 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
7821 return LD;
7822 }
7824 // For AVX-length vectors, build the individual 128-bit pieces and use
7825 // shuffles to put them in place.
7826 if (VT.is256BitVector() || VT.is512BitVector()) {
7827 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7829 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7831 // Build both the lower and upper subvector.
7832 SDValue Lower =
7833 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7834 SDValue Upper = DAG.getBuildVector(
7835 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7837 // Recreate the wider vector with the lower and upper part.
7838 if (VT.is256BitVector())
7839 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7840 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7841 }
7843 // Let legalizer expand 2-wide build_vectors.
7844 if (EVTBits == 64) {
7845 if (NumNonZero == 1) {
7846 // One half is zero or undef.
7847 unsigned Idx = countTrailingZeros(NonZeros);
7848 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7849 Op.getOperand(Idx));
7850 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7851 }
7852 return SDValue();
7853 }
7855 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7856 if (EVTBits == 8 && NumElems == 16)
7857 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7858 DAG, Subtarget))
7859 return V;
7861 if (EVTBits == 16 && NumElems == 8)
7862 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7863 DAG, Subtarget))
7864 return V;
7866 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7867 if (EVTBits == 32 && NumElems == 4)
7868 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7869 return V;
7871 // If element VT is == 32 bits, turn it into a number of shuffles.
7872 if (NumElems == 4 && NumZero > 0) {
7873 SmallVector<SDValue, 8> Ops(NumElems);
7874 for (unsigned i = 0; i < 4; ++i) {
7875 bool isZero = !(NonZeros & (1ULL << i));
7876 if (isZero)
7877 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7878 else
7879 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7880 }
7882 for (unsigned i = 0; i < 2; ++i) {
7883 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7884 default: llvm_unreachable("Unexpected NonZero count");
7885 case 0:
7886 Ops[i] = Ops[i*2]; // Must be a zero vector.
7887 break;
7888 case 1:
7889 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7890 break;
7891 case 2:
7892 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7893 break;
7894 case 3:
7895 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7896 break;
7897 }
7898 }
7900 bool Reverse1 = (NonZeros & 0x3) == 2;
7901 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7902 int MaskVec[] = {
7903 Reverse1 ? 1 : 0,
7904 Reverse1 ? 0 : 1,
7905 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7906 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7907 };
7908 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7909 }
7911 if (Values.size() > 1 && VT.is128BitVector()) {
7912 // Check for a build vector from mostly shuffle plus few inserting.
7913 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7914 return Sh;
7916 // For SSE 4.1, use insertps to put the high elements into the low element.
7917 if (Subtarget.hasSSE41()) {
7918 SDValue Result;
7919 if (!Op.getOperand(0).isUndef())
7920 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7921 else
7922 Result = DAG.getUNDEF(VT);
7924 for (unsigned i = 1; i < NumElems; ++i) {
7925 if (Op.getOperand(i).isUndef()) continue;
7926 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7927 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7928 }
7929 return Result;
7930 }
7932 // Otherwise, expand into a number of unpckl*, start by extending each of
7933 // our (non-undef) elements to the full vector width with the element in the
7934 // bottom slot of the vector (which generates no code for SSE).
7935 SmallVector<SDValue, 8> Ops(NumElems);
7936 for (unsigned i = 0; i < NumElems; ++i) {
7937 if (!Op.getOperand(i).isUndef())
7938 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7939 else
7940 Ops[i] = DAG.getUNDEF(VT);
7941 }
7943 // Next, we iteratively mix elements, e.g. for v4f32:
7944 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
7945 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
7946 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
7947 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
7948 // Generate scaled UNPCKL shuffle mask.
7949 SmallVector<int, 16> Mask;
7950 for (unsigned i = 0; i != Scale; ++i)
7951 Mask.push_back(i);
7952 for (unsigned i = 0; i != Scale; ++i)
7953 Mask.push_back(NumElems+i);
7954 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
7956 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
7957 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
7958 }
7959 return Ops[0];
7960 }
7964 // 256-bit AVX can use the vinsertf128 instruction
7965 // to create 256-bit vectors from two other 128-bit ones.
7966 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7967 SDLoc dl(Op);
7968 MVT ResVT = Op.getSimpleValueType();
7970 assert((ResVT.is256BitVector() ||
7971 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7973 SDValue V1 = Op.getOperand(0);
7974 SDValue V2 = Op.getOperand(1);
7975 unsigned NumElems = ResVT.getVectorNumElements();
7976 if (ResVT.is256BitVector())
7977 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7979 if (Op.getNumOperands() == 4) {
7980 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7981 ResVT.getVectorNumElements()/2);
7982 SDValue V3 = Op.getOperand(2);
7983 SDValue V4 = Op.getOperand(3);
7984 return concat256BitVectors(
7985 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7986 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7987 NumElems, DAG, dl);
7988 }
7989 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7990 }
7992 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
7993 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
7994 static bool isExpandWithZeros(const SDValue &Op) {
7995 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
7996 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
7998 for (unsigned i = 1; i < Op.getNumOperands(); i++)
7999 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8000 return false;
8002 return true;
8003 }
8005 // Returns true if the given node is a type promotion (by concatenating i1
8006 // zeros) of the result of a node that already zeros all upper bits of
8007 // k-register.
8008 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8009 unsigned Opc = Op.getOpcode();
8011 assert(Opc == ISD::CONCAT_VECTORS &&
8012 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8013 "Unexpected node to check for type promotion!");
8015 // As long as we are concatenating zeros to the upper part of a previous node
8016 // result, climb up the tree until a node with different opcode is
8017 // encountered.
8018 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8019 if (Opc == ISD::INSERT_SUBVECTOR) {
8020 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8021 Op.getConstantOperandVal(2) == 0)
8022 Op = Op.getOperand(1);
8023 else
8024 return SDValue();
8025 } else { // Opc == ISD::CONCAT_VECTORS
8026 if (isExpandWithZeros(Op))
8027 Op = Op.getOperand(0);
8028 else
8029 return SDValue();
8030 }
8031 Opc = Op.getOpcode();
8032 }
8034 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8035 // of a node that zeros the upper bits (its masked version).
8036 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8037 (Op.getOpcode() == ISD::AND &&
8038 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8039 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8040 return Op;
8041 }
8043 return SDValue();
8044 }
8046 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8047 const X86Subtarget &Subtarget,
8048 SelectionDAG &DAG) {
8049 SDLoc dl(Op);
8050 MVT ResVT = Op.getSimpleValueType();
8051 unsigned NumOfOperands = Op.getNumOperands();
8053 assert(isPowerOf2_32(NumOfOperands) &&
8054 "Unexpected number of operands in CONCAT_VECTORS");
8056 // If this node promotes - by concatenating zeroes - the type of the result
8057 // of a node with instruction that zeroes all upper (irrelevant) bits of the
8058 // output register, mark it as legal and catch the pattern in instruction
8059 // selection to avoid emitting extra instructions (for zeroing upper bits).
8060 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8061 SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
8062 SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
8063 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8064 ZeroC);
8065 }
8067 SDValue Undef = DAG.getUNDEF(ResVT);
8068 if (NumOfOperands > 2) {
8069 // Specialize the cases when all, or all but one, of the operands are undef.
8070 unsigned NumOfDefinedOps = 0;
8071 unsigned OpIdx = 0;
8072 for (unsigned i = 0; i < NumOfOperands; i++)
8073 if (!Op.getOperand(i).isUndef()) {
8074 NumOfDefinedOps++;
8075 OpIdx = i;
8076 }
8077 if (NumOfDefinedOps == 0)
8078 return Undef;
8079 if (NumOfDefinedOps == 1) {
8080 unsigned SubVecNumElts =
8081 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
8082 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
8083 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
8084 Op.getOperand(OpIdx), IdxVal);
8085 }
8087 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8088 ResVT.getVectorNumElements()/2);
8089 SmallVector<SDValue, 2> Ops;
8090 for (unsigned i = 0; i < NumOfOperands/2; i++)
8091 Ops.push_back(Op.getOperand(i));
8092 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8093 Ops.clear();
8094 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8095 Ops.push_back(Op.getOperand(i));
8096 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8097 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8098 }
8101 SDValue V1 = Op.getOperand(0);
8102 SDValue V2 = Op.getOperand(1);
8103 unsigned NumElems = ResVT.getVectorNumElements();
8104 assert(V1.getValueType() == V2.getValueType() &&
8105 V1.getValueType().getVectorNumElements() == NumElems/2 &&
8106 "Unexpected operands in CONCAT_VECTORS");
8108 if (ResVT.getSizeInBits() >= 16)
8109 return Op; // The operation is legal with KUNPCK
8111 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8112 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8113 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
8114 if (IsZeroV1 && IsZeroV2)
8115 return ZeroVec;
8117 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
8118 if (V2.isUndef())
8119 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8120 if (IsZeroV2)
8121 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8123 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8124 if (V1.isUndef())
8125 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8127 if (IsZeroV1)
8128 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8130 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8131 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8132 }
8134 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8135 const X86Subtarget &Subtarget,
8136 SelectionDAG &DAG) {
8137 MVT VT = Op.getSimpleValueType();
8138 if (VT.getVectorElementType() == MVT::i1)
8139 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8141 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8142 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8143 Op.getNumOperands() == 4)));
8145 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8146 // from two other 128-bit ones.
8148 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8149 return LowerAVXCONCAT_VECTORS(Op, DAG);
8150 }
8152 //===----------------------------------------------------------------------===//
8153 // Vector shuffle lowering
8155 // This is an experimental code path for lowering vector shuffles on x86. It is
8156 // designed to handle arbitrary vector shuffles and blends, gracefully
8157 // degrading performance as necessary. It works hard to recognize idiomatic
8158 // shuffles and lower them to optimal instruction patterns without leaving
8159 // a framework that allows reasonably efficient handling of all vector shuffle
8160 // patterns.
8161 //===----------------------------------------------------------------------===//
8163 /// \brief Tiny helper function to identify a no-op mask.
8165 /// This is a somewhat boring predicate function. It checks whether the mask
8166 /// array input, which is assumed to be a single-input shuffle mask of the kind
8167 /// used by the X86 shuffle instructions (not a fully general
8168 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8169 /// in-place shuffle are 'no-op's.
8170 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8171 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8172 assert(Mask[i] >= -1 && "Out of bound mask element!");
8173 if (Mask[i] >= 0 && Mask[i] != i)
8174 return false;
8175 }
8176 return true;
8177 }
8179 /// \brief Test whether there are elements crossing 128-bit lanes in this
8180 /// shuffle mask.
8182 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8183 /// and we routinely test for these.
8184 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8185 int LaneSize = 128 / VT.getScalarSizeInBits();
8186 int Size = Mask.size();
8187 for (int i = 0; i < Size; ++i)
8188 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8189 return true;
8190 return false;
8191 }
8193 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8195 /// This checks a shuffle mask to see if it is performing the same
8196 /// lane-relative shuffle in each sub-lane. This trivially implies
8197 /// that it is also not lane-crossing. It may however involve a blend from the
8198 /// same lane of a second vector.
8200 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8201 /// non-trivial to compute in the face of undef lanes. The representation is
8202 /// suitable for use with existing 128-bit shuffles as entries from the second
8203 /// vector have been remapped to [LaneSize, 2*LaneSize).
8204 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8205 ArrayRef<int> Mask,
8206 SmallVectorImpl<int> &RepeatedMask) {
8207 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8208 RepeatedMask.assign(LaneSize, -1);
8209 int Size = Mask.size();
8210 for (int i = 0; i < Size; ++i) {
8211 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8212 if (Mask[i] < 0)
8213 continue;
8214 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8215 // This entry crosses lanes, so there is no way to model this shuffle.
8216 return false;
8218 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8219 // Adjust second vector indices to start at LaneSize instead of Size.
8220 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8221 : Mask[i] % LaneSize + LaneSize;
8222 if (RepeatedMask[i % LaneSize] < 0)
8223 // This is the first non-undef entry in this slot of a 128-bit lane.
8224 RepeatedMask[i % LaneSize] = LocalM;
8225 else if (RepeatedMask[i % LaneSize] != LocalM)
8226 // Found a mismatch with the repeated mask.
8227 return false;
8228 }
8229 return true;
8230 }
8232 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8233 static bool
8234 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8235 SmallVectorImpl<int> &RepeatedMask) {
8236 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8237 }
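// Example: for v8i32, the mask <0,1,0,1, 4,5,4,5> repeats the same
// lane-relative pattern in both 128-bit lanes, so this returns true with
// RepeatedMask = <0,1,0,1>. A mask like <0,1,0,1, 5,6,5,6> fails, since
// the second lane's local pattern <1,2,1,2> differs from the first.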
8239 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8240 static bool
8241 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8242 SmallVectorImpl<int> &RepeatedMask) {
8243 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8244 }
8246 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8247 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8248 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8249 ArrayRef<int> Mask,
8250 SmallVectorImpl<int> &RepeatedMask) {
8251 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8252 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8253 int Size = Mask.size();
8254 for (int i = 0; i < Size; ++i) {
8255 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8256 if (Mask[i] == SM_SentinelUndef)
8257 continue;
8258 if (Mask[i] == SM_SentinelZero) {
8259 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8260 return false;
8261 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8262 continue;
8263 }
8264 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8265 // This entry crosses lanes, so there is no way to model this shuffle.
8266 return false;
8268 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8269 // Adjust second vector indices to start at LaneSize instead of Size.
8270 int LocalM =
8271 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8272 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8273 // This is the first non-undef entry in this slot of a 128-bit lane.
8274 RepeatedMask[i % LaneSize] = LocalM;
8275 else if (RepeatedMask[i % LaneSize] != LocalM)
8276 // Found a mismatch with the repeated mask.
8277 return false;
8278 }
8280 return true;
8281 }
8282 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8283 /// arguments.
8285 /// This is a fast way to test a shuffle mask against a fixed pattern:
8287 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8289 /// It returns true if the mask is exactly as wide as the argument list, and
8290 /// each element of the mask is either -1 (signifying undef) or the value given
8291 /// in the argument.
8292 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8293 ArrayRef<int> ExpectedMask) {
8294 if (Mask.size() != ExpectedMask.size())
8295 return false;
8297 int Size = Mask.size();
8299 // If the values are build vectors, we can look through them to find
8300 // equivalent inputs that make the shuffles equivalent.
8301 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8302 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8304 for (int i = 0; i < Size; ++i) {
8305 assert(Mask[i] >= -1 && "Out of bound mask element!");
8306 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8307 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8308 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8309 if (!MaskBV || !ExpectedBV ||
8310 MaskBV->getOperand(Mask[i] % Size) !=
8311 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8312 return false;
8313 }
8314 }
8316 return true;
8317 }
8319 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8321 /// The masks must be exactly the same width.
8323 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8324 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8326 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8327 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8328 ArrayRef<int> ExpectedMask) {
8329 int Size = Mask.size();
8330 if (Size != (int)ExpectedMask.size())
8331 return false;
8333 for (int i = 0; i < Size; ++i)
8334 if (Mask[i] == SM_SentinelUndef)
8335 continue;
8336 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8337 return false;
8338 else if (Mask[i] != ExpectedMask[i])
8339 return false;
8341 return true;
8342 }
8344 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8345 // mask.
8346 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8347 const APInt &Zeroable) {
8348 int NumElts = Mask.size();
8349 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8351 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8352 for (int i = 0; i != NumElts; ++i) {
8353 int M = Mask[i];
8354 if (M == SM_SentinelUndef)
8355 continue;
8356 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8357 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8358 }
8360 return TargetMask;
8361 }
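// Example: Mask = <0, 2, 4, 6> with Zeroable = 0b0010 (element 1 can be
// zeroed) produces the target mask <0, SM_SentinelZero, 4, 6>: zeroable
// slots become SM_SentinelZero while the rest keep their shuffle indices.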
8362 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8363 // instructions.
8364 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8365 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8366 return false;
8368 SmallVector<int, 8> Unpcklwd;
8369 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8370 /* Unary = */ false);
8371 SmallVector<int, 8> Unpckhwd;
8372 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8373 /* Unary = */ false);
8374 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8375 isTargetShuffleEquivalent(Mask, Unpckhwd));
8376 return IsUnpackwdMask;
8377 }
8379 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8381 /// This helper function produces an 8-bit shuffle immediate corresponding to
8382 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8383 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8384 /// 128-bit vectors and the VPERMILPS instruction for floating point vectors.
8386 /// NB: We rely heavily on "undef" masks preserving the input lane.
8387 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8388 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8389 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8390 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8391 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8392 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8394 unsigned Imm = 0;
8395 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8396 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8397 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8398 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8399 return Imm;
8400 }
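// Example: the mask <3, 1, 2, 0> yields (3 << 0) | (1 << 2) | (2 << 4) |
// (0 << 6) = 0x27, i.e. the immediate for 'pshufd $0x27'. An all-undef
// mask <-1, -1, -1, -1> falls back to the identity lanes and yields 0xE4.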
8402 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8403 SelectionDAG &DAG) {
8404 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8405 }
8407 /// \brief Compute whether each element of a shuffle is zeroable.
8409 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8410 /// Either it is an undef element in the shuffle mask, the element of the input
8411 /// referenced is undef, or the element of the input referenced is known to be
8412 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8413 /// as many lanes with this technique as possible to simplify the remaining
8414 /// shuffle.
8415 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8416 SDValue V1, SDValue V2) {
8417 APInt Zeroable(Mask.size(), 0);
8418 V1 = peekThroughBitcasts(V1);
8419 V2 = peekThroughBitcasts(V2);
8421 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8422 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8424 int VectorSizeInBits = V1.getValueSizeInBits();
8425 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8426 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8428 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8429 int M = Mask[i];
8430 // Handle the easy cases.
8431 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8432 Zeroable.setBit(i);
8433 continue;
8434 }
8436 // Determine shuffle input and normalize the mask.
8437 SDValue V = M < Size ? V1 : V2;
8438 M %= Size;
8440 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8441 if (V.getOpcode() != ISD::BUILD_VECTOR)
8442 continue;
8444 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8445 // the (larger) source element must be UNDEF/ZERO.
8446 if ((Size % V.getNumOperands()) == 0) {
8447 int Scale = Size / V->getNumOperands();
8448 SDValue Op = V.getOperand(M / Scale);
8449 if (Op.isUndef() || X86::isZeroNode(Op))
8450 Zeroable.setBit(i);
8451 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8452 APInt Val = Cst->getAPIntValue();
8453 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8454 Val = Val.getLoBits(ScalarSizeInBits);
8455 if (Val == 0)
8456 Zeroable.setBit(i);
8457 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8458 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8459 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8460 Val = Val.getLoBits(ScalarSizeInBits);
8461 if (Val == 0)
8462 Zeroable.setBit(i);
8463 }
8464 continue;
8465 }
8467 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8468 // elements must be UNDEF or ZERO.
8469 if ((V.getNumOperands() % Size) == 0) {
8470 int Scale = V->getNumOperands() / Size;
8471 bool AllZeroable = true;
8472 for (int j = 0; j < Scale; ++j) {
8473 SDValue Op = V.getOperand((M * Scale) + j);
8474 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8475 }
8476 if (AllZeroable)
8477 Zeroable.setBit(i);
8478 }
8479 }
8481 return Zeroable;
8482 }
8485 // The shuffle result is as follows:
8486 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where a[] holds the nonzero mask
8487 // elements in ascending order. Each element of Zeroable corresponds to a
8488 // particular element of Mask, as described in the
8489 // computeZeroableShuffleElements function.
8490 // The function looks for a sub-mask whose nonzero elements are in
8491 // increasing order; if such a sub-mask exists, the function returns true.
8492 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8493 ArrayRef<int> Mask, const EVT &VectorType,
8494 bool &IsZeroSideLeft) {
8495 int NextElement = -1;
8496 // Check if the Mask's nonzero elements are in increasing order.
8497 for (int i = 0, e = Mask.size(); i < e; i++) {
8498 // Check that the mask's zero elements come only from zeroable inputs.
8499 assert(Mask[i] >= -1 && "Out of bound mask element!");
8500 if (Mask[i] < 0)
8501 return false;
8502 if (Zeroable[i])
8503 continue;
8504 // Find the lowest non zero element
8505 if (NextElement < 0) {
8506 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8507 IsZeroSideLeft = NextElement != 0;
8508 }
8509 // Exit if the mask's non-zero elements are not in increasing order.
8510 if (NextElement != Mask[i])
8511 return false;
8512 NextElement++;
8513 }
8515 return true;
8516 }
8517 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8518 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8519 ArrayRef<int> Mask, SDValue V1,
8520 SDValue V2,
8521 const APInt &Zeroable,
8522 const X86Subtarget &Subtarget,
8523 SelectionDAG &DAG) {
8524 int Size = Mask.size();
8525 int LaneSize = 128 / VT.getScalarSizeInBits();
8526 const int NumBytes = VT.getSizeInBits() / 8;
8527 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8529 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8530 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8531 (Subtarget.hasBWI() && VT.is512BitVector()));
8533 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8534 // Sign bit set in i8 mask means zero element.
8535 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8537 SDValue V;
8538 for (int i = 0; i < NumBytes; ++i) {
8539 int M = Mask[i / NumEltBytes];
8540 if (M < 0) {
8541 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8542 continue;
8543 }
8544 if (Zeroable[i / NumEltBytes]) {
8545 PSHUFBMask[i] = ZeroMask;
8546 continue;
8547 }
8549 // We can only use a single input of V1 or V2.
8550 SDValue SrcV = (M >= Size ? V2 : V1);
8551 if (V && V != SrcV)
8552 return SDValue();
8553 V = SrcV;
8554 M %= Size;
8556 // PSHUFB can't cross lanes, ensure this doesn't happen.
8557 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8558 return SDValue();
8560 M = M % LaneSize;
8561 M = M * NumEltBytes + (i % NumEltBytes);
8562 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8563 }
8564 assert(V && "Failed to find a source input");
8566 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8567 return DAG.getBitcast(
8568 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8569 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8570 }
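// Example: splatting element 1 of a v4i32 input (Mask = <1,1,1,1>) builds
// the byte-level PSHUFB mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7>, i.e.
// every dword of the result re-reads bytes 4-7 of the source register.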
8572 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8573 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8574 const SDLoc &dl);
8576 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
8577 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8578 const APInt &Zeroable,
8579 ArrayRef<int> Mask, SDValue &V1,
8580 SDValue &V2, SelectionDAG &DAG,
8581 const X86Subtarget &Subtarget) {
8582 bool IsLeftZeroSide = true;
8583 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8584 IsLeftZeroSide))
8585 return SDValue();
8586 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8587 MVT IntegerType =
8588 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8589 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8590 unsigned NumElts = VT.getVectorNumElements();
8591 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8592 "Unexpected number of vector elements");
8593 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8594 Subtarget, DAG, DL);
8595 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8596 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8597 return DAG.getSelect(DL, VT, VMask,
8598 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8599 ZeroVector);
8600 }
8602 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8603 unsigned &UnpackOpcode, bool IsUnary,
8604 ArrayRef<int> TargetMask, SDLoc &DL,
8605 SelectionDAG &DAG,
8606 const X86Subtarget &Subtarget) {
8607 int NumElts = VT.getVectorNumElements();
8609 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8610 for (int i = 0; i != NumElts; i += 2) {
8611 int M1 = TargetMask[i + 0];
8612 int M2 = TargetMask[i + 1];
8613 Undef1 &= (SM_SentinelUndef == M1);
8614 Undef2 &= (SM_SentinelUndef == M2);
8615 Zero1 &= isUndefOrZero(M1);
8616 Zero2 &= isUndefOrZero(M2);
8617 }
8618 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8619 "Zeroable shuffle detected");
8621 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8622 SmallVector<int, 64> Unpckl, Unpckh;
8623 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8624 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8625 UnpackOpcode = X86ISD::UNPCKL;
8626 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8627 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8628 return true;
8629 }
8631 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8632 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8633 UnpackOpcode = X86ISD::UNPCKH;
8634 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8635 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8636 return true;
8637 }
8639 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8640 if (IsUnary && (Zero1 || Zero2)) {
8641 // Don't bother if we can blend instead.
8642 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8643 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8644 return false;
8646 bool MatchLo = true, MatchHi = true;
8647 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8648 int M = TargetMask[i];
8650 // Ignore if the input is known to be zero or the index is undef.
8651 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8652 (M == SM_SentinelUndef))
8653 continue;
8655 MatchLo &= (M == Unpckl[i]);
8656 MatchHi &= (M == Unpckh[i]);
8657 }
8659 if (MatchLo || MatchHi) {
8660 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8661 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8662 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8663 return true;
8664 }
8665 }
8667 // If a binary shuffle, commute and try again.
8668 if (!IsUnary) {
8669 ShuffleVectorSDNode::commuteMask(Unpckl);
8670 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8671 UnpackOpcode = X86ISD::UNPCKL;
8672 std::swap(V1, V2);
8673 return true;
8674 }
8676 ShuffleVectorSDNode::commuteMask(Unpckh);
8677 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8678 UnpackOpcode = X86ISD::UNPCKH;
8679 std::swap(V1, V2);
8680 return true;
8681 }
8682 }
8684 return false;
8685 }
8687 // X86 has dedicated unpack instructions that can handle specific blend
8688 // operations: UNPCKH and UNPCKL.
8689 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8690 ArrayRef<int> Mask, SDValue V1,
8691 SDValue V2, SelectionDAG &DAG) {
8692 SmallVector<int, 8> Unpckl;
8693 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8694 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8695 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8697 SmallVector<int, 8> Unpckh;
8698 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8699 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8700 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8702 // Commute and try again.
8703 ShuffleVectorSDNode::commuteMask(Unpckl);
8704 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8705 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8707 ShuffleVectorSDNode::commuteMask(Unpckh);
8708 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8709 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8711 return SDValue();
8712 }
8714 /// \brief Try to emit a bitmask instruction for a shuffle.
8716 /// This handles cases where we can model a blend exactly as a bitmask due to
8717 /// one of the inputs being zeroable.
8718 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8719 SDValue V2, ArrayRef<int> Mask,
8720 const APInt &Zeroable,
8721 SelectionDAG &DAG) {
8722 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8723 MVT EltVT = VT.getVectorElementType();
8724 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8725 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8726 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8727 SDValue V;
8728 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8729 if (Zeroable[i])
8730 continue;
8731 if (Mask[i] % Size != i)
8732 return SDValue(); // Not a blend.
8733 if (!V)
8734 V = Mask[i] < Size ? V1 : V2;
8735 else if (V != (Mask[i] < Size ? V1 : V2))
8736 return SDValue(); // Can only let one input through the mask.
8738 VMaskOps[i] = AllOnes;
8739 }
8740 if (!V)
8741 return SDValue(); // No non-zeroable elements!
8743 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8744 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8745 }
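// Example: a v4i32 shuffle <0, u, 2, u> whose elements 1 and 3 are
// zeroable selects V = V1 and builds VMask = <-1, 0, -1, 0>, so the
// blend-with-zero collapses to a single AND of V1 with that constant.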
8747 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8749 /// This is used as a fallback approach when first class blend instructions are
8750 /// unavailable. Currently it is only suitable for integer vectors, but could
8751 /// be generalized for floating point vectors if desirable.
8752 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8753 SDValue V2, ArrayRef<int> Mask,
8754 SelectionDAG &DAG) {
8755 assert(VT.isInteger() && "Only supports integer vector types!");
8756 MVT EltVT = VT.getVectorElementType();
8757 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8758 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8759 SmallVector<SDValue, 16> MaskOps;
8760 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8761 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8762 return SDValue(); // Shuffled input!
8763 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8766 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8767 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8768 // We have to cast V2 around.
8769 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8770 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8771 DAG.getBitcast(MaskVT, V1Mask),
8772 DAG.getBitcast(MaskVT, V2)));
8773 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8774 }
8776 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8777 SDValue PreservedSrc,
8778 const X86Subtarget &Subtarget,
8779 SelectionDAG &DAG);
8781 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8782 MutableArrayRef<int> TargetMask,
8783 bool &ForceV1Zero, bool &ForceV2Zero,
8784 uint64_t &BlendMask) {
8785 bool V1IsZeroOrUndef =
8786 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8787 bool V2IsZeroOrUndef =
8788 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8790 BlendMask = 0;
8791 ForceV1Zero = false, ForceV2Zero = false;
8792 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8794 // Attempt to generate the binary blend mask. If an input is zero then
8795 // we can use any lane.
8796 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8797 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8798 int M = TargetMask[i];
8799 if (M == SM_SentinelUndef)
8800 continue;
8801 if (M == i)
8802 continue;
8803 if (M == i + Size) {
8804 BlendMask |= 1ull << i;
8805 continue;
8806 }
8807 if (M == SM_SentinelZero) {
8808 if (V1IsZeroOrUndef) {
8809 ForceV1Zero = true;
8810 TargetMask[i] = i;
8811 continue;
8812 }
8813 if (V2IsZeroOrUndef) {
8814 ForceV2Zero = true;
8815 BlendMask |= 1ull << i;
8816 TargetMask[i] = i + Size;
8817 continue;
8818 }
8819 }
8820 return false;
8821 }
8823 return true;
8824 }
8825 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8826 uint64_t ScaledMask = 0;
8827 for (int i = 0; i != Size; ++i)
8828 if (BlendMask & (1ull << i))
8829 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
8831 return ScaledMask;
8832 }
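// Example: scaling the 4-bit blend mask 0b0101 (elements 0 and 2 taken
// from V2) by Scale = 2 yields 0b00110011: each selected element expands
// to two adjacent sub-elements of the wider mask.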
8833 /// \brief Try to emit a blend instruction for a shuffle.
8835 /// This doesn't do any checks for the availability of instructions for blending
8836 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8837 /// be matched in the backend with the type given. What it does check for is
8838 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8839 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8840 SDValue V2, ArrayRef<int> Original,
8841 const APInt &Zeroable,
8842 const X86Subtarget &Subtarget,
8843 SelectionDAG &DAG) {
8844 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8846 uint64_t BlendMask = 0;
8847 bool ForceV1Zero = false, ForceV2Zero = false;
8848 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8849 BlendMask))
8850 return SDValue();
8852 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8853 if (ForceV1Zero)
8854 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8855 if (ForceV2Zero)
8856 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8858 switch (VT.SimpleTy) {
8859 case MVT::v2f64:
8860 case MVT::v4f32:
8861 case MVT::v4f64:
8862 case MVT::v8f32:
8863 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8864 DAG.getConstant(BlendMask, DL, MVT::i8));
8866 case MVT::v4i64:
8867 case MVT::v8i32:
8868 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8869 LLVM_FALLTHROUGH;
8870 case MVT::v2i64:
8871 case MVT::v4i32:
8872 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8873 // that instruction.
8874 if (Subtarget.hasAVX2()) {
8875 // Scale the blend by the number of 32-bit dwords per element.
8876 int Scale = VT.getScalarSizeInBits() / 32;
8877 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8878 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8879 V1 = DAG.getBitcast(BlendVT, V1);
8880 V2 = DAG.getBitcast(BlendVT, V2);
8881 return DAG.getBitcast(
8882 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8883 DAG.getConstant(BlendMask, DL, MVT::i8)));
8884 }
8885 LLVM_FALLTHROUGH;
8886 case MVT::v8i16: {
8887 // For integer shuffles we need to expand the mask and cast the inputs to
8888 // v8i16s prior to blending.
8889 int Scale = 8 / VT.getVectorNumElements();
8890 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8891 V1 = DAG.getBitcast(MVT::v8i16, V1);
8892 V2 = DAG.getBitcast(MVT::v8i16, V2);
8893 return DAG.getBitcast(VT,
8894 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8895 DAG.getConstant(BlendMask, DL, MVT::i8)));
8896 }
8897 case MVT::v16i16: {
8899 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8900 SmallVector<int, 8> RepeatedMask;
8901 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8902 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8903 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8904 BlendMask = 0;
8905 for (int i = 0; i < 8; ++i)
8906 if (RepeatedMask[i] >= 8)
8907 BlendMask |= 1ull << i;
8908 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8909 DAG.getConstant(BlendMask, DL, MVT::i8));
8910 }
8911 LLVM_FALLTHROUGH;
8912 }
8913 case MVT::v16i8:
8914 case MVT::v32i8: {
8915 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8916 "256-bit byte-blends require AVX2 support!");
8918 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8919 MVT IntegerType =
8920 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8921 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8922 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8923 }
8925 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8926 if (SDValue Masked =
8927 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8928 return Masked;
8930 // Scale the blend by the number of bytes per element.
8931 int Scale = VT.getScalarSizeInBits() / 8;
8933 // This form of blend is always done on bytes. Compute the byte vector
8934 // type.
8935 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8937 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8938 // mix of LLVM's code generator and the x86 backend. We tell the code
8939 // generator that boolean values in the elements of an x86 vector register
8940 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8941 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8942 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8943 // of the element (the remaining are ignored) and 0 in that high bit would
8944 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8945 // the LLVM model for boolean values in vector elements gets the relevant
8946 // bit set, it is set backwards and over constrained relative to x86's
8947 // actual model.
8948 SmallVector<SDValue, 32> VSELECTMask;
8949 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8950 for (int j = 0; j < Scale; ++j)
8951 VSELECTMask.push_back(
8952 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8953 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8954 MVT::i8));
8956 V1 = DAG.getBitcast(BlendVT, V1);
8957 V2 = DAG.getBitcast(BlendVT, V2);
8958 return DAG.getBitcast(
8960 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
8961 V1, V2));
8962 }
8963 case MVT::v16f32:
8964 case MVT::v8f64:
8965 case MVT::v8i64:
8966 case MVT::v16i32:
8967 case MVT::v32i16:
8968 case MVT::v64i8: {
8969 MVT IntegerType =
8970 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8971 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8972 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8973 }
8974 default:
8975 llvm_unreachable("Not a supported integer vector type!");
8976 }
8977 }
8979 /// \brief Try to lower as a blend of elements from two inputs followed by
8980 /// a single-input permutation.
8982 /// This matches the pattern where we can blend elements from two inputs and
8983 /// then reduce the shuffle to a single-input permutation.
8984 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8985 SDValue V1, SDValue V2,
8986 ArrayRef<int> Mask,
8987 SelectionDAG &DAG) {
8988 // We build up the blend mask while checking whether a blend is a viable way
8989 // to reduce the shuffle.
8990 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8991 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8993 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8994 if (Mask[i] < 0)
8995 continue;
8997 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8999 if (BlendMask[Mask[i] % Size] < 0)
9000 BlendMask[Mask[i] % Size] = Mask[i];
9001 else if (BlendMask[Mask[i] % Size] != Mask[i])
9002 return SDValue(); // Can't blend in the needed input!
9004 PermuteMask[i] = Mask[i] % Size;
9005 }
9007 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9008 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9009 }
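// Example: for v4i32, Mask = <5, 0, 2, 2> first blends V1/V2 with
// BlendMask = <0, 5, 2, u> (only slot 1 reads V2) and then applies the
// single-input permute <1, 0, 2, 2> to move the elements into place.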
9011 /// \brief Generic routine to decompose a shuffle and blend into independent
9012 /// blends and permutes.
9014 /// This matches the extremely common pattern for handling combined
9015 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9016 /// operations. It will try to pick the best arrangement of shuffles and
9017 /// blends.
9018 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9019 MVT VT, SDValue V1,
9020 SDValue V2,
9021 ArrayRef<int> Mask,
9022 SelectionDAG &DAG) {
9023 // Shuffle the input elements into the desired positions in V1 and V2 and
9024 // blend them together.
9025 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9026 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9027 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9028 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9029 if (Mask[i] >= 0 && Mask[i] < Size) {
9030 V1Mask[i] = Mask[i];
9031 BlendMask[i] = i;
9032 } else if (Mask[i] >= Size) {
9033 V2Mask[i] = Mask[i] - Size;
9034 BlendMask[i] = i + Size;
9035 }
9037 // Try to lower with the simpler initial blend strategy unless one of the
9038 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9039 // shuffle may be able to fold with a load or other benefit. However, when
9040 // we'll have to do 2x as many shuffles in order to achieve this, blending
9041 // first is a better strategy.
9042 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9043 if (SDValue BlendPerm =
9044 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9045 return BlendPerm;
9047 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9048 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9049 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9050 }
9052 /// \brief Try to lower a vector shuffle as a rotation.
9054 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9055 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9056 ArrayRef<int> Mask) {
9057 int NumElts = Mask.size();
9059 // We need to detect various ways of spelling a rotation:
9060 // [11, 12, 13, 14, 15, 0, 1, 2]
9061 // [-1, 12, 13, 14, -1, -1, 1, -1]
9062 // [-1, -1, -1, -1, -1, -1, 1, 2]
9063 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9064 // [-1, 4, 5, 6, -1, -1, 9, -1]
9065 // [-1, 4, 5, 6, -1, -1, -1, -1]
9066 int Rotation = 0;
9067 SDValue Lo, Hi;
9068 for (int i = 0; i < NumElts; ++i) {
9069 int M = Mask[i];
9070 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9071 "Unexpected mask index.");
9072 if (M < 0)
9073 continue;
9075 // Determine where a rotated vector would have started.
9076 int StartIdx = i - (M % NumElts);
9078 // The identity rotation isn't interesting, stop.
9079 if (StartIdx == 0)
9080 return -1;
9081 // If we found the tail of a vector the rotation must be the missing
9082 // front. If we found the head of a vector, it must be how much of the
9083 // head.
9084 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9086 if (Rotation == 0)
9087 Rotation = CandidateRotation;
9088 else if (Rotation != CandidateRotation)
9089 // The rotations don't match, so we can't match this mask.
9090 return -1;
9092 // Compute which value this mask is pointing at.
9093 SDValue MaskV = M < NumElts ? V1 : V2;
9095 // Compute which of the two target values this index should be assigned
9096 // to. This reflects whether the high elements are remaining or the low
9097 // elements are remaining.
9098 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9100 // Either set up this value if we've not encountered it before, or check
9101 // that it remains consistent.
9102 if (!TargetV)
9103 TargetV = MaskV;
9104 else if (TargetV != MaskV)
9105 // This may be a rotation, but it pulls from the inputs in some
9106 // unsupported interleaving.
9107 return -1;
9108 }
9110 // Check that we successfully analyzed the mask, and normalize the results.
9111 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9112 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9113 if (!Lo)
9114 Lo = Hi;
9115 else if (!Hi)
9116 Hi = Lo;
9118 V1 = Lo;
9119 V2 = Hi;
9121 return Rotation;
9122 }
9124 /// \brief Try to lower a vector shuffle as a byte rotation.
9126 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9127 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9128 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9129 /// try to generically lower a vector shuffle through such a pattern. It
9130 /// does not check for the profitability of lowering either as PALIGNR or
9131 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9132 /// This matches shuffle vectors that look like:
9134 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9136 /// Essentially it concatenates V1 and V2, shifts right by some number of
9137 /// elements, and takes the low elements as the result. Note that while this is
9138 /// specified as a *right shift* because x86 is little-endian, it is a *left
9139 /// rotate* of the vector lanes.
9140 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9141 ArrayRef<int> Mask) {
9142 // Don't accept any shuffles with zero elements.
9143 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9144 return -1;
9146 // PALIGNR works on 128-bit lanes.
9147 SmallVector<int, 16> RepeatedMask;
9148 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9149 return -1;
9151 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9152 if (Rotation <= 0)
9153 return -1;
9155 // PALIGNR rotates bytes, so we need to scale the
9156 // rotation based on how many bytes are in the vector lane.
9157 int NumElts = RepeatedMask.size();
9158 int Scale = 16 / NumElts;
9159 return Rotation * Scale;
9160 }
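// Example: the v8i16 mask <11,12,13,14,15,0,1,2> matches a rotation of 3
// elements; with 2 bytes per v8i16 element this returns 6, the immediate
// for 'palignr $6' on the two 128-bit inputs.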
9162 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9163 SDValue V1, SDValue V2,
9164 ArrayRef<int> Mask,
9165 const X86Subtarget &Subtarget,
9166 SelectionDAG &DAG) {
9167 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9169 SDValue Lo = V1, Hi = V2;
9170 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9171 if (ByteRotation <= 0)
9172 return SDValue();
9174 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9175 // PSLLDQ/PSRLDQ.
9176 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9177 Lo = DAG.getBitcast(ByteVT, Lo);
9178 Hi = DAG.getBitcast(ByteVT, Hi);
9180 // SSSE3 targets can use the palignr instruction.
9181 if (Subtarget.hasSSSE3()) {
9182 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9183 "512-bit PALIGNR requires BWI instructions");
9184 return DAG.getBitcast(
9185 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9186 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9187 }
9189 assert(VT.is128BitVector() &&
9190 "Rotate-based lowering only supports 128-bit lowering!");
9191 assert(Mask.size() <= 16 &&
9192 "Can shuffle at most 16 bytes in a 128-bit vector!");
9193 assert(ByteVT == MVT::v16i8 &&
9194 "SSE2 rotate lowering only needed for v16i8!");
9196 // Default SSE2 implementation
9197 int LoByteShift = 16 - ByteRotation;
9198 int HiByteShift = ByteRotation;
9200 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9201 DAG.getConstant(LoByteShift, DL, MVT::i8));
9202 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9203 DAG.getConstant(HiByteShift, DL, MVT::i8));
9204 return DAG.getBitcast(VT,
9205 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9206 }
9208 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9210 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9211 /// rotation of the concatenation of two vectors; This routine will
9212 /// try to generically lower a vector shuffle through such a pattern.
9214 /// Essentially it concatenates V1 and V2, shifts right by some number of
9215 /// elements, and takes the low elements as the result. Note that while this is
9216 /// specified as a *right shift* because x86 is little-endian, it is a *left
9217 /// rotate* of the vector lanes.
9218 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9219 SDValue V1, SDValue V2,
9220 ArrayRef<int> Mask,
9221 const X86Subtarget &Subtarget,
9222 SelectionDAG &DAG) {
9223 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9224 "Only 32-bit and 64-bit elements are supported!");
9226 // 128/256-bit vectors are only supported with VLX.
9227 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9228 && "VLX required for 128/256-bit vectors");
9230 SDValue Lo = V1, Hi = V2;
9231 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9232 if (Rotation <= 0)
9233 return SDValue();
9235 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9236 DAG.getConstant(Rotation, DL, MVT::i8));
9237 }
9239 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9241 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9242 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9243 /// matches elements from one of the input vectors shuffled to the left or
9244 /// right with zeroable elements 'shifted in'. It handles both the strictly
9245 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9246 /// quad word lane.
9248 /// PSHL : (little-endian) left bit shift.
9249 /// [ zz, 0, zz, 2 ]
9250 /// [ -1, 4, zz, -1 ]
9251 /// PSRL : (little-endian) right bit shift.
9252 /// [ 1, zz, 3, zz]
9253 /// [ -1, -1, 7, zz]
9254 /// PSLLDQ : (little-endian) left byte shift
9255 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9256 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9257 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9258 /// PSRLDQ : (little-endian) right byte shift
9259 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9260 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9261 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
9262 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9263 unsigned ScalarSizeInBits,
9264 ArrayRef<int> Mask, int MaskOffset,
9265 const APInt &Zeroable,
9266 const X86Subtarget &Subtarget) {
9267 int Size = Mask.size();
9268 unsigned SizeInBits = Size * ScalarSizeInBits;
9270 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9271 for (int i = 0; i < Size; i += Scale)
9272 for (int j = 0; j < Shift; ++j)
9273 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9274 return false;
9276 return true;
9277 };
9279 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9280 for (int i = 0; i != Size; i += Scale) {
9281 unsigned Pos = Left ? i + Shift : i;
9282 unsigned Low = Left ? i : i + Shift;
9283 unsigned Len = Scale - Shift;
9284 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9285 return -1;
9286 }
9288 int ShiftEltBits = ScalarSizeInBits * Scale;
9289 bool ByteShift = ShiftEltBits > 64;
9290 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9291 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9292 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9294 // Normalize the scale for byte shifts to still produce an i64 element
9295 // type.
9296 Scale = ByteShift ? Scale / 2 : Scale;
9298 // We need to round trip through the appropriate type for the shift.
9299 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9300 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9301 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9302 return (int)ShiftAmt;
9303 };
9305 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9306 // keep doubling the size of the integer elements up to that. We can
9307 // then shift the elements of the integer vector by whole multiples of
9308 // their width within the elements of the larger integer vector. Test each
9309 // multiple to see if we can find a match with the moved element indices
9310 // and that the shifted in elements are all zeroable.
9311 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9312 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9313 for (int Shift = 1; Shift != Scale; ++Shift)
9314 for (bool Left : {true, false})
9315 if (CheckZeros(Shift, Scale, Left)) {
9316 int ShiftAmt = MatchShift(Shift, Scale, Left);
9317 if (0 < ShiftAmt)
9318 return ShiftAmt;
9319 }
9321 // no match
9322 return -1;
9323 }
9325 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9326 SDValue V2, ArrayRef<int> Mask,
9327 const APInt &Zeroable,
9328 const X86Subtarget &Subtarget,
9329 SelectionDAG &DAG) {
9330 int Size = Mask.size();
9331 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9333 MVT ShiftVT;
9334 unsigned Opcode;
9337 // Try to match shuffle against V1 shift.
9338 int ShiftAmt = matchVectorShuffleAsShift(
9339 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9340 SDValue V = V1;
9341 // If V1 failed, try to match shuffle against V2 shift.
9342 if (ShiftAmt < 0) {
9343 ShiftAmt =
9344 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9345 Mask, Size, Zeroable, Subtarget);
9346 V = V2;
9347 }
9349 if (ShiftAmt < 0)
9350 return SDValue();
9352 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9353 "Illegal integer vector type");
9354 V = DAG.getBitcast(ShiftVT, V);
9355 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9356 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9357 return DAG.getBitcast(VT, V);
9358 }
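// Example: the v4i32 mask <Z, 0, 1, 2>, where Z is a known-zero element,
// matches a left byte-shift of the whole vector: Opcode = X86ISD::VSHLDQ
// with ShiftAmt = 4, which is selected as 'pslldq $4' of V1.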
9360 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9361 // Remainder of lower half result is zero and upper half is all undef.
9362 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9363 ArrayRef<int> Mask, uint64_t &BitLen,
9364 uint64_t &BitIdx, const APInt &Zeroable) {
9365 int Size = Mask.size();
9366 int HalfSize = Size / 2;
9367 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9368 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9370 // Upper half must be undefined.
9371 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9374 // Determine the extraction length from the part of the
9375 // lower half that isn't zeroable.
9376 int Len = HalfSize;
9377 for (; Len > 0; --Len)
9378 if (!Zeroable[Len - 1])
9379 break;
9380 assert(Len > 0 && "Zeroable shuffle mask");
9382 // Attempt to match first Len sequential elements from the lower half.
9383 SDValue Src;
9384 int Idx = -1;
9385 for (int i = 0; i != Len; ++i) {
9386 int M = Mask[i];
9387 if (M == SM_SentinelUndef)
9388 continue;
9389 SDValue &V = (M < Size ? V1 : V2);
9390 M = M % Size;
9392 // The extracted elements must start at a valid index and all mask
9393 // elements must be in the lower half.
9394 if (i > M || M >= HalfSize)
9397 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9405 if (!Src || Idx < 0)
9408 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9409 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9410 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
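// Worked example (illustrative; this mask is an assumption, not from the
// original source): for a v8i16 shuffle with Mask = <1, 2, 3, Z, U, U, U, U>
// (Z zeroable, U undef), the upper half is undef, the non-zeroable prefix of
// the lower half gives Len = 3, and the matched run starts at source element
// 1, so BitLen = 3 * 16 = 48 and BitIdx = 1 * 16 = 16: EXTRQ extracts 48 bits
// starting at bit offset 16.
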
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
                                        uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}

/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}

/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and can
/// start from an offset element index in the input; to avoid excess shuffling
/// the offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must be from the same input vector.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}

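// Worked example (illustrative; an assumption, not from the original source):
// to zero extend v16i8 to v8i16 without SSE4.1, a single PUNPCKLBW of the
// input against a zero vector interleaves <x0, 0, x1, 0, ...>, which is
// exactly <zext x0, zext x1, ...> when reinterpreted as v8i16. Extending
// further to v4i32 repeats the trick with PUNPCKLWD, which is what the
// do/while loop above emits one unpack at a time.
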
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

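// Worked example (illustrative; an assumption, not from the original source):
// a v4i32 shuffle with Mask = <0, Z, 1, Z> (Z = zeroable) matches Scale = 2
// with Offset = 0 against V1: a zero extension of the low two i32 elements to
// i64, lowered as PMOVZXDQ on SSE4.1 or as PUNPCKLDQ against a zero vector
// otherwise.
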
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}

/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}

/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient lowerings for
/// across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}

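// Worked example (illustrative; an assumption, not from the original source):
// a v4f32 shuffle with Mask = <4, 1, 2, 3> where elements 1-3 of V1 are
// zeroable inserts element 0 of V2 into an otherwise-zero vector; the
// VZEXT_MOVL path above produces this directly as a single zero-extending
// move of the low element (the MOVSS/MOVQ-style pattern).
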
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}

/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}

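// Worked example (illustrative; an assumption, not from the original source):
// a v8f32 splat of element 2 (Mask = <2, 2, 2, 2, 2, 2, 2, 2>) whose input is
// a vector load that feeds only this shuffle takes the load path above: the
// wide load is narrowed to a scalar load of element 2 and folded into a
// single VBROADCASTSS from memory.
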
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // will be updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}

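// Worked example (illustrative; an assumption, not from the original source):
// for Mask = <0, 5, 2, Z> with element 3 zeroable, V2 element 1 is inserted
// into lane 1 and lane 3 is zeroed, giving
// InsertPSMask = (1 << 6) | (1 << 4) | 0b1000 = 0x58, i.e. insertps $0x58.
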
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}

/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}

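// Worked example (illustrative; an assumption, not from the original source):
// a v4i32 shuffle with Mask = <0, 4, 1, 5> is exactly PUNPCKLDQ of V1 and V2,
// while Mask = <0, 4, 3, 7> first permutes each input with PSHUFD to move
// elements 0 and 3 into its low half and then feeds a single PUNPCKLDQ.
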
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPD which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}

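// Worked example (illustrative; an assumption, not from the original source):
// for a two-input v2f64 shuffle with Mask = <1, 2>, the immediate selects
// element 1 of V1 (bit 0 = 1) and element 0 of V2 (bit 1 = 0), giving
// SHUFPD $1.
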
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // If we have a blend of two same-type PACKUS operations and the blend aligns
  // with the low and high halves, we can just merge the PACKUS operations.
  // This is particularly important as it lets us merge shuffles that this
  // routine itself creates.
  auto GetPackNode = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2)) {
      EVT PackVT = V1Pack.getValueType();
      if (PackVT == V2Pack.getValueType())
        return DAG.getBitcast(MVT::v2i64,
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
                                                       : V1Pack.getOperand(1),
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
                                                       : V2Pack.getOperand(1)));
    }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}

/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}

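// Worked example (illustrative; an assumption, not from the original source):
// Mask = <0, 1, 6, 7> is a single SHUFPS (low half from V1, high half from
// V2), whereas Mask = <0, 4, 1, 5> mixes both inputs within each half and
// cannot be done with a lone SHUFPS.
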
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it
/// simply uses it.
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      HighV = V1;
      LowV = V2;
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // the blend.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}

/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}

/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2,
                                                   Mask, Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // relevant.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}

/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // results into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;

    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");

          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }

      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

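// Illustrative sketch (standalone; hypothetical name, not called anywhere):
// the mask remapping balanceSides performs above once a PSHUFD has swapped
// dwords ADWord and BDWord. It reproduces the worked example from the comment
// in lowerV8I16GeneralSingleInputVectorShuffle: swapping dwords 1 and 2
// (PSHUFD[0,2,1,3]) turns the mask [0, 1, 2, 7, 4, 5, 6, 3] into
// [0, 1, 4, 7, 2, 3, 6, 5].
LLVM_ATTRIBUTE_UNUSED static void
exampleRemapMaskForDWordSwap(MutableArrayRef<int> Mask, int ADWord,
                             int BDWord) {
  for (int &M : Mask)
    if (M >= 0 && M / 2 == ADWord)
      M = 2 * BDWord + M % 2; // Words of dword A now live in dword B.
    else if (M >= 0 && M / 2 == BDWord)
      M = 2 * ADWord + M % 2; // And vice versa.
}
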
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

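// Illustrative sketch (standalone; hypothetical name): a scalar model of the
// PSHUFB control bytes built above. For the 128-bit form, destination byte i
// takes source byte Ctl[i] & 0xF, or becomes zero when bit 7 (the 0x80
// ZeroMask above) is set. OR-ing the two shuffled vectors implements the
// blend because every lane is zeroed in at least one of them.
LLVM_ATTRIBUTE_UNUSED static uint8_t examplePSHUFBByte(const uint8_t *Src,
                                                       uint8_t Ctl) {
  return (Ctl & 0x80) ? 0 : Src[Ctl & 0xF];
}
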
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///   N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///   N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///   N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///   N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///   N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///   N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

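// Illustrative check (standalone; hypothetical name): the single-input N = 1
// mask from the documentation above, 0, 2, 4, ..., 14 repeated twice,
// satisfies Mask[i] == ((i << N) & (Modulus - 1)) with Modulus == 16 in every
// lane, which is exactly the viability test the routine above applies.
LLVM_ATTRIBUTE_UNUSED static bool exampleEvenDropMaskIsViable() {
  const int Mask[16] = {0, 2, 4, 6, 8, 10, 12, 14,
                        0, 2, 4, 6, 8, 10, 12, 14};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] != ((i << 1) & 15))
      return false;
  return true; // Holds for every i, so N = 1 is viable.
}
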
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // prefer this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf,
                                     HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

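// Illustrative sketch (standalone model; hypothetical name): one even-drop
// step of the compaction lowering above. Masking each i16 lane with 0x00FF
// clears the dropped high bytes, and PACKUS then narrows two v8i16 vectors
// into one v16i8 with unsigned saturation; because every high byte is already
// zero the saturation is a no-op and only the even-indexed bytes survive.
LLVM_ATTRIBUTE_UNUSED static void
examplePackusEvenDrop(const uint16_t *A, const uint16_t *B, uint8_t *Out) {
  for (int i = 0; i < 8; ++i) {
    Out[i] = uint8_t(A[i] & 0xFF);     // Low result half: even bytes of A.
    Out[i + 8] = uint8_t(B[i] & 0xFF); // High result half: even bytes of B.
  }
}
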
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

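// Illustrative sketch (standalone; hypothetical name, not an in-tree helper):
// the per-half mask bookkeeping performed by HalfBlend above. From one
// half-width mask it derives a shuffle mask per source (indices rebased into
// that source's halves) plus a final blend mask that picks, per lane, the
// V1-derived or V2-derived result. E.g. with NumElements == 8, the half mask
// {0, 12, 5, -1} yields V1BlendMask = {0, -1, 5, -1},
// V2BlendMask = {-1, 4, -1, -1} and BlendMask = {0, 5, 2, -1}.
LLVM_ATTRIBUTE_UNUSED static void
exampleSplitHalfMasks(ArrayRef<int> HalfMask, int NumElements,
                      SmallVectorImpl<int> &V1BlendMask,
                      SmallVectorImpl<int> &V2BlendMask,
                      SmallVectorImpl<int> &BlendMask) {
  int SplitNumElements = NumElements / 2;
  V1BlendMask.assign(SplitNumElements, -1);
  V2BlendMask.assign(SplitNumElements, -1);
  BlendMask.assign(SplitNumElements, -1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {
      V2BlendMask[i] = M - NumElements;    // Rebase into V2's halves.
      BlendMask[i] = SplitNumElements + i; // Lane comes from the V2 side.
    } else if (M >= 0) {
      V1BlendMask[i] = M;                  // Already V1-relative.
      BlendMask[i] = i;                    // Lane comes from the V1 side.
    }
  }
}
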
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}

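// Illustrative check (standalone; hypothetical name): the FlippedBlendMask
// computation above for a single-input v4f64 mask {2, 1, 3, 0}. Elements 0
// and 3 cross 128-bit lanes, so they are redirected (offset by Size == 4)
// into the lane-flipped copy, giving {4, 1, 3, 6}.
LLVM_ATTRIBUTE_UNUSED static void exampleLanePermuteFlippedMask() {
  const int Size = 4, LaneSize = 2;
  const int Mask[4] = {2, 1, 3, 0};
  int Flipped[4];
  for (int i = 0; i < Size; ++i)
    Flipped[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);
  assert(Flipped[0] == 4 && Flipped[1] == 1 && Flipped[2] == 3 &&
         Flipped[3] == 6 && "Unexpected flipped blend mask");
  (void)Flipped;
}
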
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                     VT.getVectorNumElements() / 2);
        SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                  DAG.getIntPtrConstant(0, DL));
        SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                  OnlyUsesV1 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.

  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);

  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values <  4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

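// Illustrative sketch (standalone; hypothetical name): the VPERM2X128
// immediate assembled above. Bits [1:0] and [5:4] each select a 128-bit half
// (0-1 from V1, 2-3 from V2) for the low and high destination halves, and
// bits 3 and 7 force the corresponding half to zero. E.g. a widened mask of
// {1, 2} encodes as 1 | (2 << 4) == 0x21: the result's low half is V1's high
// half and its high half is V2's low half.
LLVM_ATTRIBUTE_UNUSED static unsigned
exampleVPerm2X128Imm(int MaskLO, int MaskHI, bool ZeroLO, bool ZeroHI) {
  unsigned Imm = unsigned(MaskLO) | (unsigned(MaskHI) << 4);
  if (ZeroLO)
    Imm = (Imm & 0xf0) | 0x08; // Clear the low selector; set its zero bit.
  if (ZeroHI)
    Imm = (Imm & 0x0f) | 0x80; // Clear the high selector; set its zero bit.
  return Imm;
}
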
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

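// Illustrative check (standalone; hypothetical name): the two-step
// decomposition above applied to a v8i32 mask {6, 7, 4, 5, 10, 11, 8, 9}.
// Each 128-bit lane of the result reads exactly one source lane
// (Lanes = {1, 2}) and both lanes repeat the in-lane pattern {2, 3, 0, 1},
// so the v4i64 lane-fixing shuffle uses mask {2, 3, 4, 5} and the final
// non-crossing shuffle uses {2, 3, 0, 1, 6, 7, 4, 5}.
LLVM_ATTRIBUTE_UNUSED static void exampleMerged128BitLaneDecomposition() {
  const int LaneSize = 4; // 128 bits / 32-bit elements.
  const int Mask[8] = {6, 7, 4, 5, 10, 11, 8, 9};
  for (int i = 0; i < 8; ++i) {
    assert(Mask[i] / LaneSize == (i < 4 ? 1 : 2) &&
           "Each result lane selects a single source lane");
    assert(Mask[i] % LaneSize == (i % LaneSize + 2) % LaneSize &&
           "In-lane pattern repeats across lanes");
  }
  (void)Mask;
  (void)LaneSize;
}
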
12302 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12303 /// This allows for fast cases such as subvector extraction/insertion
12304 /// or shuffling smaller vector types which can lower more efficiently.
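// Illustrative example: with an undef upper half, the v8f32 shuffle
// <0, 8, 1, 9, u, u, u, u> reduces to the v4f32 shuffle <0, 4, 1, 5> (an
// UNPCKLPS) of the two lower halves, inserted back into the low half of a
// 256-bit undef vector.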
12305 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12306 SDValue V1, SDValue V2,
12307 ArrayRef<int> Mask,
12308 const X86Subtarget &Subtarget,
12309 SelectionDAG &DAG) {
12310 assert(VT.is256BitVector() && "Expected 256-bit vector");
12312 unsigned NumElts = VT.getVectorNumElements();
12313 unsigned HalfNumElts = NumElts / 2;
12314 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12316 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12317 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12318 if (!UndefLower && !UndefUpper)
return SDValue();
12321 // Upper half is undef and lower half is whole upper subvector.
12322 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
if (UndefUpper &&
12324 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12325 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12326 DAG.getIntPtrConstant(HalfNumElts, DL));
12327 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12328 DAG.getIntPtrConstant(0, DL));
12331 // Lower half is undef and upper half is whole lower subvector.
12332 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
if (UndefLower &&
12334 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12335 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12336 DAG.getIntPtrConstant(0, DL));
12337 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12338 DAG.getIntPtrConstant(HalfNumElts, DL));
12341 // If the shuffle only uses two of the four halves of the input operands,
12342 // then extract them and perform the 'half' shuffle at half width.
12343 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12344 int HalfIdx1 = -1, HalfIdx2 = -1;
12345 SmallVector<int, 8> HalfMask(HalfNumElts);
12346 unsigned Offset = UndefLower ? HalfNumElts : 0;
12347 for (unsigned i = 0; i != HalfNumElts; ++i) {
12348 int M = Mask[i + Offset];
if (M < 0) {
HalfMask[i] = M;
continue;
}
12354 // Determine which of the 4 half vectors this element is from.
12355 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12356 int HalfIdx = M / HalfNumElts;
12358 // Determine the element index into its half vector source.
12359 int HalfElt = M % HalfNumElts;
12361 // We can shuffle with up to 2 half vectors, set the new 'half'
12362 // shuffle mask accordingly.
12363 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12364 HalfMask[i] = HalfElt;
12365 HalfIdx1 = HalfIdx;
continue;
}
12368 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12369 HalfMask[i] = HalfElt + HalfNumElts;
12370 HalfIdx2 = HalfIdx;
continue;
}
12374 // Too many half vectors referenced.
return SDValue();
}
12377 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12379 // Only shuffle the halves of the inputs when useful.
12380 int NumLowerHalves =
12381 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12382 int NumUpperHalves =
12383 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12385 // uuuuXXXX - don't extract uppers just to insert again.
12386 if (UndefLower && NumUpperHalves != 0)
return SDValue();
12389 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12390 if (UndefUpper && NumUpperHalves == 2)
return SDValue();
12393 // AVX2 - XXXXuuuu - always extract lowers.
12394 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12395 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12396 if (VT == MVT::v4f64 || VT == MVT::v4i64)
return SDValue();
12398 // AVX2 supports variable 32-bit element cross-lane shuffles.
12399 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12400 // XXXXuuuu - don't extract lowers and uppers.
12401 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
return SDValue();
}
}
12406 auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
12408 return DAG.getUNDEF(HalfVT);
12409 SDValue V = (HalfIdx < 2 ? V1 : V2);
12410 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12411 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12412 DAG.getIntPtrConstant(HalfIdx, DL));
};
12415 SDValue Half1 = GetHalfVector(HalfIdx1);
12416 SDValue Half2 = GetHalfVector(HalfIdx2);
12417 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12418 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12419 DAG.getIntPtrConstant(Offset, DL));
12422 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
12425 /// This returns true if the elements from a particular input are already in the
12426 /// slot required by the given mask and require no permutation.
12427 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12428 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12429 int Size = Mask.size();
12430 for (int i = 0; i < Size; ++i)
12431 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
return false;
return true;
}
12437 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12438 /// every lane can be represented as the same repeating mask - allowing us to
12439 /// shuffle the sources with the repeating shuffle and then permute the result
12440 /// to the destination lanes.
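// Illustrative example: on AVX2 the v8i32 mask <6, 7, 6, 7, 2, 3, 2, 3>
// becomes the in-lane repeated shuffle <2, 3, u, u, 6, 7, u, u> followed by
// the 64-bit sub-lane aligned permute <4, 5, 4, 5, 0, 1, 0, 1>.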
12441 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12442 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12443 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12444 int NumElts = VT.getVectorNumElements();
12445 int NumLanes = VT.getSizeInBits() / 128;
12446 int NumLaneElts = NumElts / NumLanes;
12448 // On AVX2 we may be able to just shuffle the lowest elements and then
12449 // broadcast the result.
12450 if (Subtarget.hasAVX2()) {
12451 for (unsigned BroadcastSize : {16, 32, 64}) {
12452 if (BroadcastSize <= VT.getScalarSizeInBits())
continue;
12454 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12456 // Attempt to match a repeating pattern every NumBroadcastElts,
12457 // accounting for UNDEFs, but only referencing the lowest 128-bit
12458 // lane of the inputs.
12459 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12460 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12461 for (int j = 0; j != NumBroadcastElts; ++j) {
12462 int M = Mask[i + j];
if (M < 0)
continue;
12465 int &R = RepeatMask[j];
12466 if (0 != ((M % NumElts) / NumLaneElts))
return false;
12468 if (0 <= R && R != M)
return false;
R = M;
}
return true;
};
12475 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12476 if (!FindRepeatingBroadcastMask(RepeatMask))
continue;
12479 // Shuffle the (lowest) repeated elements in place for broadcast.
12480 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12482 // Shuffle the actual broadcast.
12483 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12484 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12485 for (int j = 0; j != NumBroadcastElts; ++j)
12486 BroadcastMask[i + j] = j;
12487 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
BroadcastMask);
}
}
12492 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12493 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
return SDValue();
12496 // Bail if we already have a repeated lane shuffle mask.
12497 SmallVector<int, 8> RepeatedShuffleMask;
12498 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
return SDValue();
12501 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12502 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12503 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12504 int NumSubLanes = NumLanes * SubLaneScale;
12505 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12507 // Check that all the sources are coming from the same lane and see if we can
12508 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12509 // determine the source sub-lane for each destination sub-lane.
12510 int TopSrcSubLane = -1;
12511 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12512 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12513 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12514 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12516 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12517 // Extract the sub-lane mask, check that it all comes from the same lane
12518 // and normalize the mask entries to come from the first lane.
int SrcLane = -1;
12520 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12521 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12522 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
if (M < 0)
continue;
12525 int Lane = (M % NumElts) / NumLaneElts;
12526 if ((0 <= SrcLane) && (SrcLane != Lane))
return SDValue();
SrcLane = Lane;
12529 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12530 SubLaneMask[Elt] = LocalM;
}
12533 // Whole sub-lane is UNDEF.
if (SrcLane < 0)
continue;
12537 // Attempt to match against the candidate repeated sub-lane masks.
12538 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12539 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12540 for (int i = 0; i != NumSubLaneElts; ++i) {
12541 if (M1[i] < 0 || M2[i] < 0)
continue;
12543 if (M1[i] != M2[i])
return false;
}
return true;
};
12549 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12550 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
continue;
12553 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12554 for (int i = 0; i != NumSubLaneElts; ++i) {
12555 int M = SubLaneMask[i];
if (M < 0)
continue;
12558 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12559 "Unexpected mask element");
12560 RepeatedSubLaneMask[i] = M;
}
12563 // Track the top most source sub-lane - by setting the remaining to UNDEF
12564 // we can greatly simplify shuffle matching.
12565 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12566 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12567 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
break;
}
12571 // Bail if we failed to find a matching repeated sub-lane mask.
12572 if (Dst2SrcSubLanes[DstSubLane] < 0)
return SDValue();
}
12575 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12576 "Unexpected source lane");
12578 // Create a repeating shuffle mask for the entire vector.
12579 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12580 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12581 int Lane = SubLane / SubLaneScale;
12582 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12583 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12584 int M = RepeatedSubLaneMask[Elt];
if (M < 0)
continue;
12587 int Idx = (SubLane * NumSubLaneElts) + Elt;
12588 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
}
}
12591 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12593 // Shuffle each source sub-lane to its destination.
12594 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12595 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12596 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12597 if (SrcSubLane < 0)
continue;
12599 for (int j = 0; j != NumSubLaneElts; ++j)
12600 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
}
12603 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
SubLaneMask);
}
12607 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12608 unsigned &ShuffleImm,
12609 ArrayRef<int> Mask) {
12610 int NumElts = VT.getVectorNumElements();
12611 assert(VT.getScalarSizeInBits() == 64 &&
12612 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12613 "Unexpected data type for VSHUFPD");
12615 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12616 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
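// Sketch of the immediate being built below: bit i selects the low (0) or
// high (1) f64 within the 128-bit source half feeding result element i, with
// even result elements reading V1 and odd result elements reading V2.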
12618 bool ShufpdMask = true;
12619 bool CommutableMask = true;
12620 for (int i = 0; i < NumElts; ++i) {
12621 if (Mask[i] == SM_SentinelUndef)
continue;
12625 int Val = (i & 6) + NumElts * (i & 1);
12626 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12627 if (Mask[i] < Val || Mask[i] > Val + 1)
12628 ShufpdMask = false;
12629 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12630 CommutableMask = false;
12631 ShuffleImm |= (Mask[i] % 2) << i;
}
if (ShufpdMask)
return true;
12636 if (CommutableMask) {
std::swap(V1, V2);
return true;
}
return false;
}
12644 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12645 ArrayRef<int> Mask, SDValue V1,
12646 SDValue V2, SelectionDAG &DAG) {
12647 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12648 "Unexpected data type for VSHUFPD");
12650 unsigned Immediate = 0;
12651 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
return SDValue();
12654 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12655 DAG.getConstant(Immediate, DL, MVT::i8));
12658 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12659 ArrayRef<int> Mask, SDValue V1,
12660 SDValue V2, SelectionDAG &DAG) {
12661 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12662 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12664 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
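// Note: X86ISD::VPERMV shuffles a single source with a variable mask, while
// X86ISD::VPERMV3 indexes into the 2*N-element concatenation of V1 and V2,
// so the constant mask built above may contain indices in [0, 2*N).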
if (V2.isUndef())
12666 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12668 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12671 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12673 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12674 /// isn't available.
12675 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12676 const APInt &Zeroable,
12677 SDValue V1, SDValue V2,
12678 const X86Subtarget &Subtarget,
12679 SelectionDAG &DAG) {
12680 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12681 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12682 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12684 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12685 Zeroable, Subtarget, DAG))
12688 if (V2.isUndef()) {
12689 // Check for being able to broadcast a single element.
12690 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12691 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12694 // Use low duplicate instructions for masks that match their pattern.
12695 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12696 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12698 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12699 // Non-half-crossing single input shuffles can be lowered with an
12700 // interleaved permutation.
12701 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12702 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12703 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12704 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12707 // With AVX2 we have direct support for this permutation.
12708 if (Subtarget.hasAVX2())
12709 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12710 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12712 // Try to create an in-lane repeating shuffle mask and then shuffle the
12713 // results into the target lanes.
12714 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12715 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12718 // Otherwise, fall back.
12719 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
DAG);
}
12723 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
12725 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
return V;
12728 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12729 Zeroable, Subtarget, DAG))
12732 // Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op =
12734 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
return Op;
12737 // Try to create an in-lane repeating shuffle mask and then shuffle the
12738 // results into the target lanes.
12739 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12740 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12743 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12744 // shuffle. However, if we have AVX2 and either input is already in place,
12745 // we will be able to shuffle the other input even across lanes in a single
12746 // instruction, so skip this pattern.
12747 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12748 isShuffleMaskInputInPlace(1, Mask))))
12749 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12750 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12752 // If we have VLX support, we can use VEXPAND.
12753 if (Subtarget.hasVLX())
12754 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12755 V1, V2, DAG, Subtarget))
12758 // If we have AVX2 then we always want to lower with a blend because at v4 we
12759 // can fully permute the elements.
12760 if (Subtarget.hasAVX2())
12761 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
Mask, DAG);
12764 // Otherwise fall back on generic lowering.
12765 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
}
12768 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12770 /// This routine is only called when we have AVX2 and thus a reasonable
12771 /// instruction set for v4i64 shuffling.
12772 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12773 const APInt &Zeroable,
12774 SDValue V1, SDValue V2,
12775 const X86Subtarget &Subtarget,
12776 SelectionDAG &DAG) {
12777 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12778 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12779 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12780 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12782 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12783 Zeroable, Subtarget, DAG))
12786 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12787 Zeroable, Subtarget, DAG))
12790 // Check for being able to broadcast a single element.
12791 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12792 Mask, Subtarget, DAG))
12795 if (V2.isUndef()) {
12796 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12797 // can use lower latency instructions that will operate on both lanes.
12798 SmallVector<int, 2> RepeatedMask;
12799 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12800 SmallVector<int, 4> PSHUFDMask;
12801 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
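// e.g. v4i64 <1, 0, 3, 2> repeats <1, 0> in both lanes and scales to the
// per-lane PSHUFD mask <2, 3, 0, 1> (immediate 0x4E), avoiding the
// higher-latency cross-lane VPERMQ.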
12802 return DAG.getBitcast(
12804 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12805 DAG.getBitcast(MVT::v8i32, V1),
12806 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12809 // AVX2 provides a direct instruction for permuting a single input across
// lanes.
12811 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12812 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12815 // Try to use shift instructions.
12816 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12817 Zeroable, Subtarget, DAG))
12820 // If we have VLX support, we can use VALIGN or VEXPAND.
12821 if (Subtarget.hasVLX()) {
12822 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12823 Mask, Subtarget, DAG))
12826 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12827 V1, V2, DAG, Subtarget))
12831 // Try to use PALIGNR.
12832 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12833 Mask, Subtarget, DAG))
12836 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
12838 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
return V;
12841 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12842 // shuffle. However, if we have AVX2 and either input is already in place,
12843 // we will be able to shuffle the other input even across lanes in a single
12844 // instruction, so skip this pattern.
12845 if (!isShuffleMaskInputInPlace(0, Mask) &&
12846 !isShuffleMaskInputInPlace(1, Mask))
12847 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12848 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12851 // Otherwise fall back on generic blend lowering.
12852 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
Mask, DAG);
}
12856 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12858 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12859 /// isn't available.
12860 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12861 const APInt &Zeroable,
12862 SDValue V1, SDValue V2,
12863 const X86Subtarget &Subtarget,
12864 SelectionDAG &DAG) {
12865 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12866 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12867 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12869 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12870 Zeroable, Subtarget, DAG))
12873 // Check for being able to broadcast a single element.
12874 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12875 Mask, Subtarget, DAG))
12878 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12879 // options to efficiently lower the shuffle.
12880 SmallVector<int, 4> RepeatedMask;
12881 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12882 assert(RepeatedMask.size() == 4 &&
12883 "Repeated masks must be half the mask width!");
12885 // Use even/odd duplicate instructions for masks that match their pattern.
12886 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12887 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12888 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12889 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
if (V2.isUndef())
12892 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12893 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
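// e.g. v8f32 <1, 0, 3, 2, 5, 4, 7, 6> repeats <1, 0, 3, 2> in both lanes and
// maps to a single VPERMILPS with immediate 0xB1.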
12895 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
12897 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
return V;
12900 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12901 // have already handled any direct blends.
12902 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
}
12905 // Try to create an in-lane repeating shuffle mask and then shuffle the
12906 // results into the target lanes.
12907 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12908 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12911 // If we have a single input shuffle with different shuffle patterns in the
12912 // two 128-bit lanes, use the variable mask form of VPERMILPS.
12913 if (V2.isUndef()) {
12914 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12915 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12916 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12918 if (Subtarget.hasAVX2())
12919 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12921 // Otherwise, fall back.
12922 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
DAG);
}
12926 // Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
12928 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12929 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12931 // If we have VLX support, we can use VEXPAND.
12932 if (Subtarget.hasVLX())
12933 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12934 V1, V2, DAG, Subtarget))
12937 // For non-AVX512, if the mask acts like an in-lane unpack of 16-bit elements,
12938 // try to split, since after splitting we get more efficient code using
12939 // vpunpcklwd and vpunpckhwd than with vblend.
12940 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12941 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG))
return V;
12945 // If we have AVX2 then we always want to lower with a blend because at v8 we
12946 // can fully permute the elements.
12947 if (Subtarget.hasAVX2())
12948 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
Mask, DAG);
12951 // Otherwise fall back on generic lowering.
12952 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12955 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12957 /// This routine is only called when we have AVX2 and thus a reasonable
12958 /// instruction set for v8i32 shuffling.
12959 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12960 const APInt &Zeroable,
12961 SDValue V1, SDValue V2,
12962 const X86Subtarget &Subtarget,
12963 SelectionDAG &DAG) {
12964 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12965 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12966 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12967 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12969 // Whenever we can lower this as a zext, that instruction is strictly faster
12970 // than any alternative. It also allows us to fold memory operands into the
12971 // shuffle in many cases.
12972 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12973 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12976 // For non-AVX512, if the mask acts like an in-lane unpack of 16-bit elements,
12977 // try to split, since after splitting we get more efficient code using
12978 // vpunpcklwd and vpunpckhwd than with vblend.
12979 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12980 !Subtarget.hasAVX512())
if (SDValue V =
12982 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
return V;
12985 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12986 Zeroable, Subtarget, DAG))
12989 // Check for being able to broadcast a single element.
12990 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12991 Mask, Subtarget, DAG))
12994 // If the shuffle mask is repeated in each 128-bit lane we can use more
12995 // efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
12997 SmallVector<int, 4> RepeatedMask;
12998 bool Is128BitLaneRepeatedShuffle =
12999 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13000 if (Is128BitLaneRepeatedShuffle) {
13001 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
13003 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13004 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13006 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13008 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
return V;
}
13012 // Try to use shift instructions.
13013 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13014 Zeroable, Subtarget, DAG))
13017 // If we have VLX support, we can use VALIGN or EXPAND.
13018 if (Subtarget.hasVLX()) {
13019 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13020 Mask, Subtarget, DAG))
13023 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13024 V1, V2, DAG, Subtarget))
13028 // Try to use byte rotation instructions.
13029 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13030 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13033 // Try to create an in-lane repeating shuffle mask and then shuffle the
13034 // results into the target lanes.
13035 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13036 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13039 // If the shuffle patterns aren't repeated but it is a single input, directly
13040 // generate a cross-lane VPERMD instruction.
13041 if (V2.isUndef()) {
13042 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13043 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13046 // Assume that a single SHUFPS is faster than an alternative sequence of
13047 // multiple instructions (even if the CPU has a domain penalty).
13048 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13049 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13050 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13051 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13052 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13053 CastV1, CastV2, DAG);
13054 return DAG.getBitcast(MVT::v8i32, ShufPS);
}
13057 // Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
13059 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13060 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13063 // Otherwise fall back on generic blend lowering.
13064 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
Mask, DAG);
}
13068 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13070 /// This routine is only called when we have AVX2 and thus a reasonable
13071 /// instruction set for v16i16 shuffling.
13072 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13073 const APInt &Zeroable,
13074 SDValue V1, SDValue V2,
13075 const X86Subtarget &Subtarget,
13076 SelectionDAG &DAG) {
13077 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13078 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13079 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13080 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13082 // Whenever we can lower this as a zext, that instruction is strictly faster
13083 // than any alternative. It also allows us to fold memory operands into the
13084 // shuffle in many cases.
13085 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13086 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13089 // Check for being able to broadcast a single element.
13090 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13091 Mask, Subtarget, DAG))
13094 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13095 Zeroable, Subtarget, DAG))
13098 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13100 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
13103 // Try to use shift instructions.
13104 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13105 Zeroable, Subtarget, DAG))
13108 // Try to use byte rotation instructions.
13109 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13110 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13113 // Try to create an in-lane repeating shuffle mask and then shuffle the
13114 // results into the target lanes.
13115 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13116 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13119 if (V2.isUndef()) {
13120 // There are no generalized cross-lane shuffle operations available on i16
// element types.
13122 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13123 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
Mask, DAG);
13126 SmallVector<int, 8> RepeatedMask;
13127 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13128 // As this is a single-input shuffle, the repeated mask should be
13129 // a strictly valid v8i16 mask that we can pass through to the v8i16
13130 // lowering to handle even the v16 case.
13131 return lowerV8I16GeneralSingleInputVectorShuffle(
13132 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13136 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13137 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13140 // AVX512BWVL can lower to VPERMW.
13141 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13142 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13144 // Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
13146 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13147 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13150 // Otherwise fall back on generic lowering.
13151 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13154 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13156 /// This routine is only called when we have AVX2 and thus a reasonable
13157 /// instruction set for v32i8 shuffling.
13158 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13159 const APInt &Zeroable,
13160 SDValue V1, SDValue V2,
13161 const X86Subtarget &Subtarget,
13162 SelectionDAG &DAG) {
13163 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13164 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13165 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13166 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13168 // Whenever we can lower this as a zext, that instruction is strictly faster
13169 // than any alternative. It also allows us to fold memory operands into the
13170 // shuffle in many cases.
13171 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13172 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13175 // Check for being able to broadcast a single element.
13176 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13177 Mask, Subtarget, DAG))
13180 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13181 Zeroable, Subtarget, DAG))
13184 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13186 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
13189 // Try to use shift instructions.
13190 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13191 Zeroable, Subtarget, DAG))
13194 // Try to use byte rotation instructions.
13195 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13196 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13199 // Try to create an in-lane repeating shuffle mask and then shuffle the
13200 // results into the target lanes.
13201 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13202 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13205 // There are no generalized cross-lane shuffle operations available on i8
// element types.
13207 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13208 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
DAG);
13211 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13212 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13215 // Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
13217 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13218 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13221 // Otherwise fall back on generic lowering.
13222 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13225 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13227 /// This routine either breaks down the specific type of a 256-bit x86 vector
13228 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13229 /// together based on the available instructions.
13230 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13231 MVT VT, SDValue V1, SDValue V2,
13232 const APInt &Zeroable,
13233 const X86Subtarget &Subtarget,
13234 SelectionDAG &DAG) {
13235 // If we have a single input to the zero element, insert that into V1 if we
13236 // can do so cheaply.
13237 int NumElts = VT.getVectorNumElements();
13238 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13240 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13241 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13242 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13245 // Handle special cases where the lower or upper half is UNDEF.
if (SDValue V =
13247 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
return V;
13250 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13251 // can check for those subtargets here and avoid much of the subtarget
13252 // querying in the per-vector-type lowering routines. With AVX1 we have
13253 // essentially *zero* ability to manipulate a 256-bit vector with integer
13254 // types. Since we'll use floating point types there eventually, just
13255 // immediately cast everything to a float and operate entirely in that domain.
13256 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13257 int ElementBits = VT.getScalarSizeInBits();
13258 if (ElementBits < 32) {
13259 // No floating point type available, if we can't use the bit operations
13260 // for masking/blending then decompose into 128-bit vectors.
if (SDValue V =
13262 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
return V;
13264 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13266 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13269 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13270 VT.getVectorNumElements());
13271 V1 = DAG.getBitcast(FpVT, V1);
13272 V2 = DAG.getBitcast(FpVT, V2);
13273 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13276 switch (VT.SimpleTy) {
case MVT::v4f64:
13278 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i64:
13280 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8f32:
13282 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i32:
13284 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i16:
13286 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i8:
13288 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
13291 llvm_unreachable("Not a valid 256-bit x86 vector type!");
}
13295 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
13296 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13297 ArrayRef<int> Mask, SDValue V1,
13298 SDValue V2, SelectionDAG &DAG) {
13299 assert(VT.getScalarSizeInBits() == 64 &&
13300 "Unexpected element type size for 128bit shuffle.");
13302 // Handling a 256-bit vector would require VLX, and most probably
13303 // lowerV2X128VectorShuffle() is the better solution for that case.
13304 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13306 SmallVector<int, 4> WidenedMask;
13307 if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
13310 // Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
13312 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13313 {0, 1, 2, 3, 0, 1, 2, 3});
13314 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13315 {0, 1, 2, 3, 8, 9, 10, 11})) {
13316 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13317 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13318 DAG.getIntPtrConstant(0, DL));
13319 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13320 OnlyUsesV1 ? V1 : V2,
13321 DAG.getIntPtrConstant(0, DL));
13322 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13325 assert(WidenedMask.size() == 4);
13327 // See if this is an insertion of the lower 128-bits of V2 into V1.
13328 bool IsInsert = true;
int V2Index = -1;
13330 for (int i = 0; i < 4; ++i) {
13331 assert(WidenedMask[i] >= -1);
13332 if (WidenedMask[i] < 0)
continue;
13335 // Make sure all V1 subvectors are in place.
13336 if (WidenedMask[i] < 4) {
13337 if (WidenedMask[i] != i) {
IsInsert = false;
break;
}
} else {
13342 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13343 if (V2Index >= 0 || WidenedMask[i] != 4) {
IsInsert = false;
break;
}
V2Index = i;
}
}
13350 if (IsInsert && V2Index >= 0) {
13351 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13352 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13353 DAG.getIntPtrConstant(0, DL));
13354 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13357 // Try to lower to vshuf64x2/vshuf32x4.
13358 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13359 unsigned PermMask = 0;
13360 // Ensure elements come from the same Op.
13361 for (int i = 0; i < 4; ++i) {
13362 assert(WidenedMask[i] >= -1);
13363 if (WidenedMask[i] < 0)
continue;
13366 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13367 unsigned OpIndex = i / 2;
13368 if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
13370 else if (Ops[OpIndex] != Op)
return SDValue();
13373 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13374 // bits defined by a vshuf64x2 instruction's immediate control byte.
13375 PermMask |= (WidenedMask[i] % 4) << (i * 2);
}
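// Each 2-bit field of the SHUF128 immediate selects one of the four 128-bit
// chunks of the operand feeding that destination position; positions 0-1
// read Ops[0] and positions 2-3 read Ops[1], which is why paired positions
// must agree on their source above.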
13378 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13379 DAG.getConstant(PermMask, DL, MVT::i8));
13382 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13383 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13384 const APInt &Zeroable,
13385 SDValue V1, SDValue V2,
13386 const X86Subtarget &Subtarget,
13387 SelectionDAG &DAG) {
13388 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13389 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13390 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13392 if (V2.isUndef()) {
13393 // Use low duplicate instructions for masks that match their pattern.
13394 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13395 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13397 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13398 // Non-half-crossing single input shuffles can be lowered with an
13399 // interleaved permutation.
13400 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13401 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13402 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13403 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13404 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13405 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13408 SmallVector<int, 4> RepeatedMask;
13409 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13410 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13411 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13414 if (SDValue Shuf128 =
13415 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13418 if (SDValue Unpck =
13419 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13422 // Check if the blend happens to exactly fit that of SHUFPD.
if (SDValue Op =
13424 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Op;
13427 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13428 V2, DAG, Subtarget))
13431 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13432 Zeroable, Subtarget, DAG))
13435 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13438 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13439 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13440 const APInt &Zeroable,
13441 SDValue V1, SDValue V2,
13442 const X86Subtarget &Subtarget,
13443 SelectionDAG &DAG) {
13444 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13445 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13446 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13448 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13449 // options to efficiently lower the shuffle.
13450 SmallVector<int, 4> RepeatedMask;
13451 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13452 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13454 // Use even/odd duplicate instructions for masks that match their pattern.
13455 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13456 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13457 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13458 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
if (V2.isUndef())
13461 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13462 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13464 // Use dedicated unpack instructions for masks that match their pattern.
13465 if (SDValue Unpck =
13466 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13469 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13470 Zeroable, Subtarget, DAG))
13473 // Otherwise, fall back to a SHUFPS sequence.
13474 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
13476 // If we have AVX512F support, we can use VEXPAND.
13477 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13478 V1, V2, DAG, Subtarget))
13481 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13484 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13485 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13486 const APInt &Zeroable,
13487 SDValue V1, SDValue V2,
13488 const X86Subtarget &Subtarget,
13489 SelectionDAG &DAG) {
13490 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13491 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13492 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13494 if (SDValue Shuf128 =
13495 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13498 if (V2.isUndef()) {
13499 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13500 // can use lower latency instructions that will operate on all four
// 128-bit lanes.
13502 SmallVector<int, 2> Repeated128Mask;
13503 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13504 SmallVector<int, 4> PSHUFDMask;
13505 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13506 return DAG.getBitcast(
13508 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13509 DAG.getBitcast(MVT::v16i32, V1),
13510 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13513 SmallVector<int, 4> Repeated256Mask;
13514 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13515 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13516 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13519 // Try to use shift instructions.
13520 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13521 Zeroable, Subtarget, DAG))
13524 // Try to use VALIGN.
13525 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13526 Mask, Subtarget, DAG))
13529 // Try to use PALIGNR.
13530 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13531 Mask, Subtarget, DAG))
13534 if (SDValue Unpck =
13535 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13537 // If we have AVX512F support, we can use VEXPAND.
13538 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13539 V2, DAG, Subtarget))
13542 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13543 Zeroable, Subtarget, DAG))
13546 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13549 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13550 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13551 const APInt &Zeroable,
13552 SDValue V1, SDValue V2,
13553 const X86Subtarget &Subtarget,
13554 SelectionDAG &DAG) {
13555 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13556 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13557 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13559 // Whenever we can lower this as a zext, that instruction is strictly faster
13560 // than any alternative. It also allows us to fold memory operands into the
13561 // shuffle in many cases.
13562 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13563 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13566 // If the shuffle mask is repeated in each 128-bit lane we can use more
13567 // efficient instructions that mirror the shuffles across the four 128-bit
// lanes.
13569 SmallVector<int, 4> RepeatedMask;
13570 bool Is128BitLaneRepeatedShuffle =
13571 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13572 if (Is128BitLaneRepeatedShuffle) {
13573 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
13575 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13576 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13578 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13580 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
return V;
}
13584 // Try to use shift instructions.
13585 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13586 Zeroable, Subtarget, DAG))
13589 // Try to use VALIGN.
13590 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13591 Mask, Subtarget, DAG))
13594 // Try to use byte rotation instructions.
13595 if (Subtarget.hasBWI())
13596 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13597 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13600 // Assume that a single SHUFPS is faster than using a permv shuffle.
13601 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13602 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13603 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13604 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13605 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13606 CastV1, CastV2, DAG);
13607 return DAG.getBitcast(MVT::v16i32, ShufPS);
}
13609 // If we have AVX512F support, we can use VEXPAND.
13610 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13611 V1, V2, DAG, Subtarget))
13614 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13615 Zeroable, Subtarget, DAG))
13617 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13620 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13621 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13622 const APInt &Zeroable,
13623 SDValue V1, SDValue V2,
13624 const X86Subtarget &Subtarget,
13625 SelectionDAG &DAG) {
13626 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13627 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13628 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13629 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13631 // Whenever we can lower this as a zext, that instruction is strictly faster
13632 // than any alternative. It also allows us to fold memory operands into the
13633 // shuffle in many cases.
13634 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13635 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13638 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13640 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
13643 // Try to use shift instructions.
13644 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13645 Zeroable, Subtarget, DAG))
13648 // Try to use byte rotation instructions.
13649 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13650 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13653 if (V2.isUndef()) {
13654 SmallVector<int, 8> RepeatedMask;
13655 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13656 // As this is a single-input shuffle, the repeated mask should be
13657 // a strictly valid v8i16 mask that we can pass through to the v8i16
13658 // lowering to handle even the v32 case.
13659 return lowerV8I16GeneralSingleInputVectorShuffle(
13660 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13664 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13665 Zeroable, Subtarget, DAG))
13668 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13671 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13672 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13673 const APInt &Zeroable,
13674 SDValue V1, SDValue V2,
13675 const X86Subtarget &Subtarget,
13676 SelectionDAG &DAG) {
13677 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13678 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13679 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13680 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13682 // Whenever we can lower this as a zext, that instruction is strictly faster
13683 // than any alternative. It also allows us to fold memory operands into the
13684 // shuffle in many cases.
13685 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13686 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13689 // Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
13691 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
return V;
13694 // Try to use shift instructions.
13695 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13696 Zeroable, Subtarget, DAG))
13699 // Try to use byte rotation instructions.
13700 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13701 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13704 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13705 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13708 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13709 if (Subtarget.hasVBMI())
13710 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13712 // Try to create an in-lane repeating shuffle mask and then shuffle the
13713 // results into the target lanes.
13714 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13715 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13718 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13719 Zeroable, Subtarget, DAG))
13722 // FIXME: Implement direct support for this type!
13723 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13726 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13728 /// This routine either breaks down the specific type of a 512-bit x86 vector
13729 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13730 /// together based on the available instructions.
13731 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13732 MVT VT, SDValue V1, SDValue V2,
13733 const APInt &Zeroable,
13734 const X86Subtarget &Subtarget,
13735 SelectionDAG &DAG) {
13736 assert(Subtarget.hasAVX512() &&
13737 "Cannot lower 512-bit vectors w/ basic ISA!");
13739 // If we have a single input to the zero element, insert that into V1 if we
13740 // can do so cheaply.
13741 int NumElts = Mask.size();
13742 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13744 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13745 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13746 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13749 // Check for being able to broadcast a single element.
13750 if (SDValue Broadcast =
13751 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13754 // Dispatch to each element type for lowering. If we don't have support for
13755 // specific element type shuffles at 512 bits, immediately split them and
13756 // lower them. Each lowering routine of a given type is allowed to assume that
13757 // the requisite ISA extensions for that element type are available.
13758 switch (VT.SimpleTy) {
case MVT::v8f64:
13760 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16f32:
13762 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i64:
13764 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i32:
13766 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v32i16:
13768 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v64i8:
13770 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
13773 llvm_unreachable("Not a valid 512-bit x86 vector type!");
}
13777 // Lower vXi1 vector shuffles.
13778 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13779 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13780 // vector, shuffle, and then truncate it back.
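// Sign-extension (rather than zero-extension) makes each widened lane
// all-ones or all-zeros, so the mask can be rebuilt afterwards either from
// the lanes' sign bits (X86ISD::CVT2MASK) or from their low bits
// (ISD::TRUNCATE).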
13781 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13782 MVT VT, SDValue V1, SDValue V2,
13783 const X86Subtarget &Subtarget,
13784 SelectionDAG &DAG) {
13785 assert(Subtarget.hasAVX512() &&
13786 "Cannot lower 512-bit vectors w/o basic ISA!");
MVT ExtVT;
13788 switch (VT.SimpleTy) {
default:
13790 llvm_unreachable("Expected a vector of i1 elements");
case MVT::v2i1:
13792 ExtVT = MVT::v2i64;
break;
case MVT::v4i1:
13795 ExtVT = MVT::v4i32;
break;
case MVT::v8i1:
13798 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
break;
case MVT::v16i1:
13801 ExtVT = MVT::v16i32;
break;
case MVT::v32i1:
13804 ExtVT = MVT::v32i16;
break;
case MVT::v64i1:
13807 ExtVT = MVT::v64i8;
break;
}
13811 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13812 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13813 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13814 V1 = getOnesVector(ExtVT, DAG, DL);
else
13816 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
if (V2.isUndef())
13819 V2 = DAG.getUNDEF(ExtVT);
13820 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13821 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13822 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13823 V2 = getOnesVector(ExtVT, DAG, DL);
13825 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13827 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
  // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
13829 int NumElems = VT.getVectorNumElements();
13830 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13831 (Subtarget.hasDQI() && (NumElems < 32)))
13832 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
}
13837 /// Helper function that returns true if the shuffle mask should be
13838 /// commuted to improve canonicalization.
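/// For example (hypothetical mask, not from the source tests): with four
/// elements, the mask <4,5,6,3> draws three elements from V2 and one from
/// V1, so commuting it to <0,1,2,7> lets the lowering match only the
/// "mostly V1" form.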
13839 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13840 int NumElements = Mask.size();
  int NumV1Elements = 0, NumV2Elements = 0;
  for (int M : Mask)
    if (M < 0)
      continue;
    else if (M < NumElements)
      ++NumV1Elements;
    else
      ++NumV2Elements;
13851 // Commute the shuffle as needed such that more elements come from V1 than
13852 // V2. This allows us to match the shuffle pattern strictly on how many
13853 // elements come from V1 without handling the symmetric cases.
  if (NumV2Elements > NumV1Elements)
    return true;

  assert(NumV1Elements > 0 && "No V1 indices");

  if (NumV2Elements == 0)
    return false;
  // When the number of V1 and V2 elements are the same, try to minimize the
  // number of uses of V2 in the low half of the vector. When that is tied,
  // ensure that the sum of indices for V1 is equal to or lower than the sum
  // of indices for V2. When those are equal, try to ensure that the number of
  // odd indices for V1 is lower than the number of odd indices for V2.
  if (NumV1Elements == NumV2Elements) {
    int LowV1Elements = 0, LowV2Elements = 0;
    for (int M : Mask.slice(0, NumElements / 2))
      if (M >= NumElements)
        ++LowV2Elements;
      else if (M >= 0)
        ++LowV1Elements;
    if (LowV2Elements > LowV1Elements)
      return true;
    if (LowV2Elements == LowV1Elements) {
      int SumV1Indices = 0, SumV2Indices = 0;
      for (int i = 0, Size = Mask.size(); i < Size; ++i)
        if (Mask[i] >= NumElements)
          SumV2Indices += i;
        else if (Mask[i] >= 0)
          SumV1Indices += i;
      if (SumV2Indices < SumV1Indices)
        return true;
      if (SumV2Indices == SumV1Indices) {
        int NumV1OddIndices = 0, NumV2OddIndices = 0;
        for (int i = 0, Size = Mask.size(); i < Size; ++i)
          if (Mask[i] >= NumElements)
            NumV2OddIndices += i % 2;
          else if (Mask[i] >= 0)
            NumV1OddIndices += i % 2;
        if (NumV2OddIndices < NumV1OddIndices)
          return true;
      }
    }
  }

  return false;
}
13901 /// \brief Top-level lowering for x86 vector shuffles.
13903 /// This handles decomposition, canonicalization, and lowering of all x86
13904 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13905 /// above in helper routines. The canonicalization attempts to widen shuffles
13906 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13907 /// s.t. only one of the two inputs needs to be tested, etc.
13908 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13909 SelectionDAG &DAG) {
13910 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13911 ArrayRef<int> Mask = SVOp->getMask();
13912 SDValue V1 = Op.getOperand(0);
13913 SDValue V2 = Op.getOperand(1);
13914 MVT VT = Op.getSimpleValueType();
  int NumElements = VT.getVectorNumElements();
  SDLoc DL(Op);
13917 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13919 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13920 "Can't lower MMX shuffles");
13922 bool V1IsUndef = V1.isUndef();
13923 bool V2IsUndef = V2.isUndef();
13924 if (V1IsUndef && V2IsUndef)
13925 return DAG.getUNDEF(VT);
  // When we create a shuffle node we put the UNDEF node as the second
  // operand, but in some cases the first operand may be transformed to UNDEF.
  // In this case we should just commute the node.
  if (V1IsUndef)
    return DAG.getCommutedVectorShuffle(*SVOp);
  // Check for non-undef masks pointing at an undef vector and make the masks
  // undef as well. This makes it easier to match the shuffle based solely on
  // the mask.
  if (V2IsUndef)
    for (int M : Mask)
      if (M >= NumElements) {
        SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
        for (int &M : NewMask)
          if (M >= NumElements)
            M = -1;
        return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
      }
13946 // Check for illegal shuffle mask element index values.
13947 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13948 assert(llvm::all_of(Mask,
13949 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13950 "Out of bounds shuffle index");
13952 // We actually see shuffles that are entirely re-arrangements of a set of
13953 // zero inputs. This mostly happens while decomposing complex shuffles into
13954 // simple ones. Directly lower these as a buildvector of zeros.
13955 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13956 if (Zeroable.isAllOnesValue())
13957 return getZeroVector(VT, Subtarget, DAG, DL);
13959 // Try to collapse shuffles into using a vector type with fewer elements but
13960 // wider element types. We cap this to not form integers or floating point
13961 // elements wider than 64 bits, but it might be interesting to form i128
13962 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
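  // Illustrative example (not from the original source): a v4i32 shuffle
  // with mask <0,1,4,5> moves whole adjacent pairs, so it can be widened to
  // the v2i64 shuffle <0,2> over bitcast inputs.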
13963 SmallVector<int, 16> WidenedMask;
13964 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13965 canWidenShuffleElements(Mask, WidenedMask)) {
13966 MVT NewEltVT = VT.isFloatingPoint()
13967 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13968 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13969 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
    // Make sure that the new vector type is legal. For example, v2f64 isn't
    // legal on SSE1.
    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
      V1 = DAG.getBitcast(NewVT, V1);
      V2 = DAG.getBitcast(NewVT, V2);
      return DAG.getBitcast(
          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
    }
  }
13980 // Commute the shuffle if it will improve canonicalization.
13981 if (canonicalizeShuffleMaskWithCommute(Mask))
13982 return DAG.getCommutedVectorShuffle(*SVOp);
  // For each vector width, delegate to a specialized lowering routine.
  if (VT.is128BitVector())
    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is256BitVector())
    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (VT.is512BitVector())
    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
                                    DAG);

  if (Is1BitVector)
    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

  llvm_unreachable("Unimplemented!");
}
14003 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14004 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14005 const X86Subtarget &Subtarget,
14006 SelectionDAG &DAG) {
14007 SDValue Cond = Op.getOperand(0);
14008 SDValue LHS = Op.getOperand(1);
  SDValue RHS = Op.getOperand(2);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
    return SDValue();
  auto *CondBV = cast<BuildVectorSDNode>(Cond);

  // Only non-legal VSELECTs reach this lowering; convert those into generic
  // shuffles and re-use the shuffle lowering path for blends.
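  // Illustrative example (hypothetical values): a v4i32 vselect whose
  // constant condition is <-1,0,-1,0> becomes the shuffle mask <0,5,2,7>;
  // a true lane selects from LHS (indices 0..3), a false lane from RHS
  // (indices 4..7).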
  SmallVector<int, 32> Mask;
  for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
    SDValue CondElt = CondBV->getOperand(i);
    Mask.push_back(
        isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
                                     : -1);
  }
  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
}
14029 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14030 // A vselect where all conditions and data are constants can be optimized into
14031 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // instructions.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
14064 // Build a mask by testing the condition against itself (tests for zero).
14065 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14066 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14067 // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }
  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op, but if we need to expand, return
  // a null SDValue.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8:
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();

  case MVT::v8i16:
  case MVT::v16i16:
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
}
14098 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();
14105 if (VT.getSizeInBits() == 8) {
14106 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14107 Op.getOperand(0), Op.getOperand(1));
14108 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14109 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }
14113 if (VT == MVT::f32) {
14114 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14115 // the result back to FR32 register. It's only worth matching if the
14116 // result has a single use which is a store or a bitcast to i32. And in
14117 // the case of a store, it's not worth it if the index is a constant 0,
14118 // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }
  if (VT == MVT::i32 || VT == MVT::i64) {
    // ExtractPS/pextrq work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}
14142 /// Extract one bit from mask vector, like v16i1 or v8i1.
14143 /// AVX-512 feature.
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VecVT = Vec.getSimpleValueType();
14149 SDValue Idx = Op.getOperand(1);
14150 MVT EltVT = Op.getSimpleValueType();
14152 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14153 "Unexpected vector type in ExtractBitFromMaskVector");
  // A variable index can't be handled in mask registers; extend the vector
  // to a VR512/VR128 register instead.
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
    // than extending to 128/256 bits.
14161 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14162 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14163 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14164 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14165 ExtVT.getVectorElementType(), Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }
14169 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14170 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14171 (VecVT.getVectorNumElements() < 8)) {
14172 // Use kshiftlw/rw instruction.
14173 VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (MaxShift - IdxVal)
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14194 MVT VecVT = Vec.getSimpleValueType();
14195 SDValue Idx = Op.getOperand(1);
14197 if (VecVT.getVectorElementType() == MVT::i1)
14198 return ExtractBitFromMaskVector(Op, DAG);
14200 if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14206 // example : extractelement <16 x i8> %a, i32 %i
14208 // Block Throughput: 3.00 Cycles
14209 // Throughput Bottleneck: Port5
14211 // | Num Of | Ports pressure in cycles | |
14212 // | Uops | 0 - DV | 5 | 6 | 7 | |
14213 // ---------------------------------------------
14214 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14215 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14216 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14217 // Total Num Of Uops: 4
14220 // Block Throughput: 1.00 Cycles
14221 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14223 // | | Ports pressure in cycles | |
14224 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14225 // ---------------------------------------------------------
14226 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14227 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14228 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
    //   Total Num Of Uops: 4

    return SDValue();
  }
14234 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14236 // If this is a 256-bit vector result, first extract the 128-bit vector and
14237 // then extract the element from the 128-bit vector.
14238 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14239 // Get the 128-bit vector.
14240 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14241 MVT EltVT = VecVT.getVectorElementType();
14243 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14244 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14246 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14247 // this can be done with a mask.
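  // Illustrative example (hypothetical values): for a v32i8 source,
  // ElemsPerChunk is 16, so extracting element 19 becomes extracting
  // element 19 & 15 == 3 of the chosen 128-bit chunk.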
14248 IdxVal &= ElemsPerChunk - 1;
14249 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }
14253 assert(VecVT.is128BitVector() && "Unexpected vector length");
14255 MVT VT = Op.getSimpleValueType();
14257 if (VT.getSizeInBits() == 16) {
14258 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14259 // we're going to zero extend the register or fold the store (SSE41 only).
14260 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14261 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14262 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14263 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14264 DAG.getBitcast(MVT::v4i32, Vec), Idx));
    // Transform it so it matches pextrw, which produces a 32-bit result.
14267 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14268 Op.getOperand(0), Op.getOperand(1));
14269 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14270 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }
  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;
  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling
  // to the stack.
14281 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14282 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14283 int DWordIdx = IdxVal / 4;
14284 if (DWordIdx == 0) {
14285 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14286 DAG.getBitcast(MVT::v4i32, Vec),
14287 DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }
14295 int WordIdx = IdxVal / 2;
14296 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14297 DAG.getBitcast(MVT::v8i16, Vec),
14298 DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }
  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

14310 // SHUFPS the element to the lowest double word, then movss.
14311 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14312 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14313 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14314 DAG.getIntPtrConstant(0, dl));
14317 if (VT.getSizeInBits() == 64) {
14318 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14319 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
14325 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14326 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14327 int Mask[2] = { 1, -1 };
14328 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14329 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}
14336 /// Insert one bit to mask vector, like v16i1 or v8i1.
14337 /// AVX-512 feature.
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
14342 SDValue Elt = Op.getOperand(1);
14343 SDValue Idx = Op.getOperand(2);
14344 MVT VecVT = Vec.getSimpleValueType();
14346 if (!isa<ConstantSDNode>(Idx)) {
14347 // Non constant index. Extend source and destination,
14348 // insert element and then truncate the result.
14349 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14350 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14351 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14352 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14353 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }
14357 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14358 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14359 unsigned NumElems = VecVT.getVectorNumElements();
  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }
14368 // Insertion of one bit into first position
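  // Illustrative walkthrough (explanatory note, not from the original
  // source), assuming a v8i1 vector: the KSHIFTL/KSHIFTR pair below computes
  // (EltInVec << 7) >> 7 to isolate bit 0 of the new element, the
  // KSHIFTR/KSHIFTL pair computes (Vec >> 1) << 1 to clear bit 0 of the
  // source, and the final OR merges the two.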
  if (IdxVal == 0) {
    // Clean top bits of vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clean the first bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  // Insertion of one bit into last position
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clean the last bit in the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
14397 // Use shuffle to insert element.
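  // Illustrative example (hypothetical values): inserting into element 2 of
  // a v8i1 vector builds the mask <0,1,8,3,4,5,6,7>, where index 8 selects
  // element 0 of EltInVec.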
14398 SmallVector<int, 64> MaskVec(NumElems);
14399 for (unsigned i = 0; i != NumElems; ++i)
14400 MaskVec[i] = (i == IdxVal) ? NumElems : i;
  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}
14405 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14406 SelectionDAG &DAG) const {
14407 MVT VT = Op.getSimpleValueType();
14408 MVT EltVT = VT.getVectorElementType();
14409 unsigned NumElts = VT.getVectorNumElements();
14411 if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
14416 SDValue N1 = Op.getOperand(1);
14417 SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
14420 auto *N2C = cast<ConstantSDNode>(N2);
14421 unsigned IdxVal = N2C->getZExtValue();
14423 bool IsZeroElt = X86::isZeroNode(N1);
14424 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle with a rematerializable vector than with a costly
  // integer insertion.
14429 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14430 16 <= EltVT.getSizeInBits()) {
14431 SmallVector<int, 8> BlendMask;
14432 for (unsigned i = 0; i != NumElts; ++i)
14433 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14434 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14435 : DAG.getConstant(-1, dl, VT);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }
14439 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14440 // into that, and then insert the subvector back into the result.
14441 if (VT.is256BitVector() || VT.is512BitVector()) {
14442 // With a 256-bit vector, we can insert into the zero element efficiently
14443 // using a blend if we have AVX or AVX2 and the right data type.
14444 if (VT.is256BitVector() && IdxVal == 0) {
14445 // TODO: It is worthwhile to cast integer to floating point and back
14446 // and incur a domain crossing penalty if that's what we'll end up
14447 // doing anyway after extracting to a 128-bit vector.
14448 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14449 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14450 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14451 N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }
14456 // Get the desired 128-bit vector chunk.
14457 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14459 // Insert the element into the desired chunk.
14460 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14461 assert(isPowerOf2_32(NumEltsIn128));
14462 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14463 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14465 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14466 DAG.getConstant(IdxIn128, dl, MVT::i32));
14468 // Insert the changed part back into the bigger vector
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
14471 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14473 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14474 // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }
14486 if (N1.getValueType() != MVT::i32)
14487 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14488 if (N2.getValueType() != MVT::i32)
14489 N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }
14493 if (Subtarget.hasSSE41()) {
14494 if (EltVT == MVT::f32) {
14495 // Bits [7:6] of the constant are the source select. This will always be
14496 // zero here. The DAG Combiner may combine an extract_elt index into
14497 // these bits. For example (insert (extract, 3), 2) could be matched by
14498 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14499 // Bits [5:4] of the constant are the destination select. This is the
14500 // value of the incoming immediate.
14501 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14502 // combine either bitwise AND or insert of float 0.0 to set these bits.
14504 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14505 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14506 // If this is an insertion of 32-bits into the low 32-bits of
14507 // a vector, we prefer to generate a blend with immediate rather
14508 // than an insertps. Blends are simpler operations in hardware and so
14509 // will always have equal or better performance than insertps.
14510 // But if optimizing for size and there's a load folding opportunity,
14511 // generate insertps because blendps does not have a 32-bit memory
14513 N2 = DAG.getIntPtrConstant(1, dl);
14514 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }
14523 // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}
14531 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14532 SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // It's always cheaper to replace a xor+movd with xorps, and it simplifies
  // further combines.
  if (X86::isZeroNode(Op.getOperand(0)))
14539 return getZeroVector(OpVT, Subtarget, DAG, dl);
14541 // If this is a 256-bit vector result, first insert into a 128-bit
14542 // vector and then insert into the 256-bit vector.
14543 if (!OpVT.is128BitVector()) {
14544 // Insert into a 128-bit vector.
14545 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14546 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14547 OpVT.getVectorNumElements() / SizeFactor);
14549 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14551 // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
14554 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14556 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;
14560 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14561 return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
14565 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14566 // a simple subregister reference or explicit instructions to grab
14567 // upper bits of a vector.
14568 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14569 SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
14574 SDValue Idx = Op.getOperand(1);
14575 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14576 MVT ResVT = Op.getSimpleValueType();
14578 // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
14579 // would result with: v1i1 = extract_subvector(vXi1, idx).
14580 // Lower these into extract_vector_elt which is already selectable.
14581 if (ResVT == MVT::v1i1) {
14582 assert(Subtarget.hasAVX512() &&
14583 "Boolean EXTRACT_SUBVECTOR requires AVX512");
14585 MVT EltVT = ResVT.getVectorElementType();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT LegalVT =
        (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
14589 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
  }
14593 assert((In.getSimpleValueType().is256BitVector() ||
14594 In.getSimpleValueType().is512BitVector()) &&
14595 "Can only extract from 256-bit or 512-bit vectors");
14597 // If the input is a buildvector just emit a smaller one.
14598 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14599 if (In.getOpcode() == ISD::BUILD_VECTOR)
14600 return DAG.getBuildVector(
14601 ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
  // Everything else is legal.
  return Op;
}
14607 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14608 // simple superregister reference or explicit instructions to insert
14609 // the upper bits of a vector.
14610 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14611 SelectionDAG &DAG) {
14612 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
  return insert1BitVector(Op, DAG, Subtarget);
}
14617 // Returns the appropriate wrapper opcode for a global reference.
14618 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14619 // References to absolute symbols are never PC-relative.
14620 if (GV && GV->isAbsoluteSymbolRef())
14621 return X86ISD::Wrapper;
14623 CodeModel::Model M = getTargetMachine().getCodeModel();
14624 if (Subtarget.isPICStyleRIPRel() &&
14625 (M == CodeModel::Small || M == CodeModel::Kernel))
14626 return X86ISD::WrapperRIP;
  return X86ISD::Wrapper;
}
14631 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14632 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14633 // one of the above mentioned nodes. It has to be wrapped because otherwise
14634 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14639 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14641 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14642 // global base reg.
14643 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14645 auto PtrVT = getPointerTy(DAG.getDataLayout());
14646 SDValue Result = DAG.getTargetConstantPool(
14647 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}
14660 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14661 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14663 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14664 // global base reg.
14665 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14667 auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}
SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14683 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14685 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14686 // global base reg.
14687 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14688 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14690 auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14696 // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }
  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}
SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14714 // Create the TargetBlockAddressAddress node.
14715 unsigned char OpFlags =
14716 Subtarget.classifyBlockAddressReference();
14717 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14718 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
14721 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14722 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14724 // With PIC, the address is actually $g + Offset.
14725 if (isGlobalRelativeToPICBase(OpFlags)) {
14726 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}
14733 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14734 const SDLoc &dl, int64_t Offset,
14735 SelectionDAG &DAG) const {
14736 // Create the TargetGlobalAddress node, folding in the constant
14737 // offset if it is legal.
14738 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14739 CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }
14751 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14753 // With PIC, the address is actually $g + Offset.
14754 if (isGlobalRelativeToPICBase(OpFlags)) {
14755 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }
  // For globals that require a load from a stub to get the address, emit the
  // load.
14761 if (isGlobalStubReference(OpFlags))
14762 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14763 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}
SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14776 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14777 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14783 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14784 unsigned char OperandFlags, bool LocalDynamic = false) {
14785 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }
14804 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14805 MFI.setAdjustsStack(true);
14806 MFI.setHasCalls(true);
14808 SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
14818 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14819 DAG.getNode(X86ISD::GlobalBaseReg,
14820 SDLoc(), PtrVT), InFlag);
14821 InFlag = Chain.getValue(1);
  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}
// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
14841 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14842 .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }
  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
14863 unsigned WrapperKind = X86ISD::Wrapper;
14864 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14865 GA->getValueType(0),
14866 GA->getOffset(), OperandFlags);
14867 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14869 // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}
14873 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14874 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14875 const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);
14879 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14880 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14881 is64Bit ? 257 : 256));
14883 SDValue ThreadPointer =
14884 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14885 MachinePointerInfo(Ptr));
14887 unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
14891 if (model == TLSModel::LocalExec) {
14892 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }
14904 // emit "addl x@ntpoff,%eax" (local exec)
14905 // or "addl x@indntpoff,%eax" (initial exec)
  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
14910 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14912 if (model == TLSModel::InitialExec) {
14913 if (isPIC && !is64Bit) {
14914 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }
14923 // The address of the thread local variable is the add of the thread
14924 // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}
SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14931 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14933 if (DAG.getTarget().Options.EmulatedTLS)
14934 return LowerToTLSEmulatedModel(GA, DAG);
14936 const GlobalValue *GV = GA->getGlobal();
14937 auto PtrVT = getPointerTy(DAG.getDataLayout());
14938 bool PositionIndependent = isPositionIndependent();
14940 if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
14943 case TLSModel::GeneralDynamic:
14944 if (Subtarget.is64Bit())
14945 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14946 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14947 case TLSModel::LocalDynamic:
14948 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14949 Subtarget.is64Bit());
14950 case TLSModel::InitialExec:
14951 case TLSModel::LocalExec:
14952 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }
14958 if (Subtarget.isTargetDarwin()) {
14959 // Darwin only has one model of TLS. Lower to that.
14960 unsigned char OpFlag = 0;
14961 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14962 X86ISD::WrapperRIP : X86ISD::Wrapper;
14964 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14965 // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14973 GA->getValueType(0),
14974 GA->getOffset(), OpFlag);
14975 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    // Lowering the machine isd will make sure everything is in the right
    // place.
14985 SDValue Chain = DAG.getEntryNode();
14986 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14987 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
14988 SDValue Args[] = { Chain, Offset };
14989 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14990 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14991 DAG.getIntPtrConstant(0, DL, true),
14992 Chain.getValue(1), DL);
14994 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14995 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14996 MFI.setAdjustsStack(true);
    // And our return value (tls address) is in the standard call return value
    // location.
15000 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }
15004 if (Subtarget.isTargetKnownWindowsMSVC() ||
15005 Subtarget.isTargetWindowsItanium() ||
15006 Subtarget.isTargetWindowsGNU()) {
15007 // Just use the implicit TLS architecture
15008 // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                              ; from TEB
15011 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15012 // mov rcx, qword [rdx+rcx*8]
15013 // mov eax, .tls$:tlsvar
15014 // [rax+rcx] contains the address
15015 // Windows 64bit: gs:0x58
15016 // Windows 32bit: fs:__tls_array
    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();
15021 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15022 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15023 // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(
        Subtarget.is64Bit() ? Type::getInt8PtrTy(*DAG.getContext(), 256)
                            : Type::getInt32PtrTy(*DAG.getContext(), 257));
15030 SDValue TlsArray = Subtarget.is64Bit()
15031 ? DAG.getIntPtrConstant(0x58, dl)
15032 : (Subtarget.isTargetWindowsGNU()
15033 ? DAG.getIntPtrConstant(0x2C, dl)
15034 : DAG.getExternalSymbol("_tls_array", PtrVT));
15036 SDValue ThreadPointer =
15037 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable
15044 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15045 if (Subtarget.is64Bit())
15046 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15047 MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }
15059 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15061 // Get the offset of start of .tls section
15062 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15063 GA->getValueType(0),
15064 GA->getOffset(), X86II::MO_SECREL);
15065 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15067 // The address of the thread local variable is the add of the thread
15068 // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}
15075 /// Lower SRA_PARTS and friends, which return two i32 values
15076 /// and take a 2 x i32 value to shift plus a shift amount.
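// Illustrative example (hypothetical values, not from the original source):
// for a 64-bit SRA_PARTS done in 32-bit parts with a shift amount of 40,
// bit 5 of the amount is set, so the lowering below selects
// Lo = sra(ShOpHi, 40 & 31) and Hi = sra(ShOpHi, 31).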
15077 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15078 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15079 MVT VT = Op.getSimpleValueType();
15080 unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15083 SDValue ShOpLo = Op.getOperand(0);
15084 SDValue ShOpHi = Op.getOperand(1);
15085 SDValue ShAmt = Op.getOperand(2);
15086 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // during isel.
15089 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15090 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15091 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15092 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15093 : DAG.getConstant(0, dl, VT);
  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }
15104 // If the shift amount is larger or equal than the width of a part we can't
15105 // rely on the results of shld/shrd. Insert a test and select the appropriate
15106 // values for large shift amounts.
15107 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15108 DAG.getConstant(VTBits, dl, MVT::i8));
15109 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15114 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15115 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }
15125 SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}
15129 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15130 SelectionDAG &DAG) const {
15131 SDValue Src = Op.getOperand(0);
15132 MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
15136 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15137 if (SrcVT.isVector()) {
15138 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15139 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15140 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                         DAG.getUNDEF(SrcVT)));
    }
15143 if (SrcVT.getVectorElementType() == MVT::i1) {
15144 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15145 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15146 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15147 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15148 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }
15154 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15155 "Unknown SINT_TO_FP to lower!");
  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }
15166 SDValue ValueToStore = Op.getOperand(0);
15167 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15168 !Subtarget.is64Bit())
15169 // Bitcasting to f64 here allows us to do a single 64-bit store from
15170 // an SSE register, avoiding the store forwarding penalty that would come
15171 // with two 32-bit stores.
15172 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15174 unsigned Size = SrcVT.getSizeInBits()/8;
15175 MachineFunction &MF = DAG.getMachineFunction();
15176 auto PtrVT = getPointerTy(MF.getDataLayout());
15177 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15178 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15179 SDValue Chain = DAG.getStore(
15180 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15181 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}
SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  SDLoc DL(Op);

  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15197 unsigned ByteSize = SrcVT.getSizeInBits()/8;
  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
15210 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG
                                                  : X86ISD::FILD,
                                           DL, Tys, Ops, SrcVT, MMO);
  if (useSSE) {
    Chain = Result.getValue(1);
15217 SDValue InFlag = Result.getValue(2);
15219 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15220 // shouldn't be necessary except that RFP cannot be live across
15221 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15222 MachineFunction &MF = DAG.getMachineFunction();
15223 unsigned SSFISize = Op.getValueSizeInBits()/8;
15224 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15225 auto PtrVT = getPointerTy(MF.getDataLayout());
15226 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
15231 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15232 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15233 MachineMemOperand::MOStore, SSFISize, SSFISize);
15235 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15236 Ops, Op.getValueType(), MMO);
15237 Result = DAG.getLoad(
15238 Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}
15245 /// 64-bit unsigned integer to double expansion.
15246 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15247 SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq      %rax,  %xmm0
     punpckldq (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd     (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
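  // Why this works (explanatory note, not from the original source):
  // 0x43300000 and 0x45300000 are the high words of the doubles 0x1.0p52 and
  // 0x1.0p84. Splicing the input's low/high 32-bit halves into those
  // mantissas and subtracting the same constants leaves exactly lo and
  // hi * 2^32 as doubles; their sum is the original value.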
  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();
15264 // Build some magic constants.
15265 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15266 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15267 auto PtrVT = getPointerTy(DAG.getDataLayout());
15268 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
  SmallVector<Constant*,2> CV1;
  CV1.push_back(
      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                        APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
      ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                        APInt(64, 0x4530000000000000ULL))));
15277 Constant *C1 = ConstantVector::get(CV1);
15278 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15280 // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15292 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15293 /* Alignment = */ 16);
15294 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15295 // TODO: Are there any fast-math-flags to propagate here?
15296 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
  SDValue Result;
  if (Subtarget.hasSSE3()) {
15300 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15301 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15304 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }
15309 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}
15313 /// 32-bit unsigned integer to float expansion.
15314 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
15317 // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);
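  // Explanatory note (not from the original source): 0x4330000000000000 is
  // the bit pattern of the double 0x1.0p52, so OR'ing a 32-bit value into
  // the low mantissa bits yields 2^52 + value exactly; subtracting the bias
  // below recovers the value as a double.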
15321 // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));
15325 // Zero out the upper parts of the register.
15326 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15328 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15329 DAG.getBitcast(MVT::v2f64, Load),
15330 DAG.getIntPtrConstant(0, dl));
15332 // Or the load with the bias.
15333 SDValue Or = DAG.getNode(
15334 ISD::OR, dl, MVT::v2i64,
15335 DAG.getBitcast(MVT::v2i64,
15336 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15337 DAG.getBitcast(MVT::v2i64,
15338 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15341 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15343 // Subtract the bias.
15344 // TODO: Are there any fast-math-flags to propagate here?
15345 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15347 // Handle final rounding.
15348 MVT DestVT = Op.getSimpleValueType();
15350 if (DestVT.bitsLT(MVT::f64))
15351 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15352 DAG.getIntPtrConstant(0, dl));
15353 if (DestVT.bitsGT(MVT::f64))
15354 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  // Handle final rounding.
  return Sub;
}
15360 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15361 const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();
15365 SDValue N0 = Op.getOperand(0);
15366 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15368 // Legalize to v4i32 type.
15369 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15370 DAG.getUNDEF(MVT::v2i32));
15372 if (Subtarget.hasAVX512())
15373 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15375 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15376 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
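  // Each 32-bit lane is split as v = HI * 2^16 + LO with both halves in
  // [0, 2^16), so each half converts exactly through the *signed* CVTSI2P,
  // and HI * 65536.0 + LO is exact in f64 (53 mantissa bits).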
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear upper part of LO, lower HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;
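  //
  // Worked example for v = 0x12345678: lo gets the float bit pattern
  // 0x4b005678 (value 2^23 + 0x5678) and hi gets 0x53001234
  // (value 2^39 + 0x1234 * 2^16). fhi = hi - (2^39 + 2^23) is computed
  // exactly, so lo + fhi = 0x1234 * 2^16 + 0x5678 = (float)v, up to the
  // single rounding of the final FADD.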
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  //     return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  MVT SrcVT = N0.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
    return Op;
  }

  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo());
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
                               MachinePointerInfo());
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);
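  // 0x5F800000 is the IEEE single with value 2^64. FILD treats the stored
  // i64 as signed, so a negative input comes back exactly 2^64 too small;
  // the fudge factor selected below adds it back.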
  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      /* Alignment = */ 4);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  EVT TheVT = Op.getOperand(0).getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return std::make_pair(SDValue(), SDValue());
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned &&
                       DstTy == MVT::i64 &&
                       (!Subtarget.is64Bit() ||
                        !isScalarFPTypeInSSEReg(TheVT));

  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget.is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
    // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
    // Fist-to-mem64 FistSrc
    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    // to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
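    // (0x5f000000 is the IEEE single with value 2^63, the FP image of
    // 0x8000000000000000ULL.)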
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    SDValue Cmp = DAG.getSetCC(DL,
                               getSetCCResultType(DAG.getDataLayout(),
                                                  *DAG.getContext(), TheVT),
                               Value, ThreshVal, ISD::SETLT);
    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
                           DAG.getConstant(0, DL, MVT::i32),
                           DAG.getConstant(0x80000000, DL, MVT::i32));
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
                                              *DAG.getContext(), TheVT),
                       Value, ThreshVal, ISD::SETLT);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }

  // FIXME: This causes a redundant load/store if the SSE-class value is
  // already in memory, such as if it is on the callstack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(MF, SSFI));
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  }

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, MemSize, MemSize);

  if (UnsignedFixup) {

    // Insert the FIST, load its result as two i32's,
    // and XOR the high i32 with Adjust.

    SDValue FistOps[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           FistOps, DstTy, MMO);

    SDValue Low32 =
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

    SDValue High32 =
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

    if (Subtarget.is64Bit()) {
      // Join High32 and Low32 into a 64-bit result.
      // (High32 << 32) | Low32
      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
                           DAG.getConstant(32, DL, MVT::i8));
      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
      return std::make_pair(Result, SDValue());
    }

    SDValue ResultOps[] = { Low32, High32 };

    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
      : DAG.getMergeValues(ResultOps, DL);
    return std::make_pair(pair, SDValue());
  }

  // Build the FP_TO_INT*_IN_MEM.
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);
  return std::make_pair(FIST, StackSlot);
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getBitcast(HVT, OpLo);
  OpHi = DAG.getBitcast(HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

  if (InVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Extend VT if the target is a 256- or 128-bit vector and VLX is not
  // supported.
  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue One =
      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
  SDValue Zero =
      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);

  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
  if (VT == ExtVT)
    return SelectedVal;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  return SDValue();
}

static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}
/// Helper to recursively truncate vector elements in half with PACKSS.
/// It makes use of the fact that vector comparison results will be all-zeros
/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
/// within each 128-bit lane.
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 128 bits or greater from a
  // 256-bit or greater source.
  if ((DstVT.getSizeInBits() % 128) != 0)
    return SDValue();
  if ((SrcVT.getSizeInBits() % 256) != 0)
    return SDValue();

  unsigned NumElems = SrcVT.getVectorNumElements();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");

  EVT PackedSVT =
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);

  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector()) {
    Lo = DAG.getBitcast(MVT::v8i16, Lo);
    Hi = DAG.getBitcast(MVT::v8i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(MVT::v16i16, Lo);
    Hi = DAG.getBitcast(MVT::v16i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);

    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit, truncate another stage.
    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // Legal; this will be selected as VPMOVB2M or VPMOVW2M.
      // Shifting packed bytes is not supported natively, so bitcast to words.
      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
                                      DAG.getBitcast(ExtVT, In),
                                      DAG.getConstant(ShiftInx, DL, ExtVT));
      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                  DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (VT == MVT::i1) {
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
           "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // Word to byte is only legal under BWI.
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }

  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
      return V;

  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }

  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));

    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }

  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare truncation shuffle mask.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }

    return SDValue();
  }

  assert(!VT.isVector());

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    IsSigned, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = MVT::f64;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }

  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are
  // no scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));
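  // That is, a DAG of the form
  //   (or (or (extractelt v, 0), (extractelt v, 1)), ...) eq/ne 0
  // where every element of each source vector is accounted for, which
  // PTEST can answer with a single instruction.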
  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if it's not an EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }

    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief Return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }

    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
// Emit KTEST instruction for bit vectors on AVX-512.
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  if (Op.getOpcode() == ISD::BITCAST) {
    auto hasKTEST = [&](MVT VT) {
      unsigned SizeInBits = VT.getSizeInBits();
      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR:
          NeedTruncation = true;
          ArithOp = Arith;
      }
  }

  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. SRL/SHL variant should be preferred for masks longer than this
  // number of bits.
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // hassle.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    break;

  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
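      // E.g. (and x, 0xFFFFFFFF00000000) == 0 can be tested as
      // (srl x, 32) == 0, and (and x, 0x00000000FFFFFFFF) == 0 as
      // (shl x, 32) == 0, neither of which needs a 64-bit immediate.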
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;

        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;

        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer TEST instruction.

        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();

        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }

        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();

        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }

        break;
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  // If we found that truncation is beneficial, perform the truncation and
  // update 'Op'.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG);

  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
         "Unexpected comparison operation for MVT::i1 operands");

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Only promote the compare up to I32 if it is a 16 bit operation
    // with an immediate. 16 bit immediates are to be avoided.
    if ((Op0.getValueType() == MVT::i16 &&
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
        !Subtarget.isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, dl, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
    return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
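/// (Each step refines an estimate E of 1/sqrt(x) as
/// E' = E * (1.5 - 0.5 * x * E * E), roughly doubling the correct bits.)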
16848 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16849 SelectionDAG &DAG, int Enabled,
16850 int &RefinementSteps,
16851 bool &UseOneConstNR,
16852 bool Reciprocal) const {
16853 EVT VT = Op.getValueType();
16855 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16856 // TODO: Add support for AVX512 (v16f32).
16857 // It is likely not profitable to do this for f64 because a double-precision
16858 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16859 // instructions: convert to single, rsqrtss, convert back to double, refine
16860 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16861 // along with FMA, this could be a throughput win.
16862 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16863 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16864 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16865 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16866 RefinementSteps = 1;
16868 UseOneConstNR = false;
    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.

  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    // Enable estimate codegen with 1 refinement step for vector division.
    // Scalar division estimates are disabled because they break too much
    // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();

    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a
/// multiplication. This is because we still need one division to calculate
/// the reciprocal and then we need two multiplies by that reciprocal as
/// replacements for the original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}
/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use an anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
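  // BT leaves the tested bit in CF, so equal-to-zero maps to COND_AE
  // (CF == 0) and not-equal maps to COND_B (CF == 1).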
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
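      // AndRHSVal is a power of two here, so Log2_64_Ceil yields exactly the
      // index of its single set bit.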
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
// Convert (truncate (srl X, N) to i1) to (bt X, N).
static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
                                 const SDLoc &dl, SelectionDAG &DAG) {

  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
         "Expected TRUNCATE to i1 node");

  if (Op.getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();

  SDValue ShiftRight = Op.getOperand(0);
  return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
                             CC, dl, DAG);
}
/// Result of 'and' or 'trunc to i1' is compared against zero.
/// Change to a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
                                     const SDLoc &dl, SelectionDAG &DAG) const {
  if (Op.getOpcode() == ISD::AND)
    return LowerAndToBT(Op, CC, dl, DAG);
  if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
    return LowerTruncateToBT(Op, CC, dl, DAG);
  return SDValue();
}
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
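  // 8 is a sentinel value: no single SSE predicate implements SETUEQ/SETONE,
  // so callers that see SSECC == 8 emit two compares and combine them.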
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors.
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors.
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back.
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC = 0;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break;     // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}
/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;
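  // The rewrite relies on the identity (x u< C) == (x u<= C - 1), which only
  // holds when no element of C is zero; the loop below bails out on zero
  // elements.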
  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This
      // allows them to work with an SSE1 target (integer vector types are
      // not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
    // available.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
    if (SSECC == 8) {
      // LLVM predicate is SETUEQ or SETONE.
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (Cond == ISD::SETUEQ) {
        CC0 = 3; // UNORD
        CC1 = 0; // EQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
                                           static_cast<unsigned>(ISD::OR);
      } else {
        assert(Cond == ISD::SETONE);
        CC0 = 7; // ORD
        CC1 = 4; // NEQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
                                           static_cast<unsigned>(ISD::AND);
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    } else {
      // Handle all other FP comparisons here.
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, dl, MVT::i8));
    }

    // If this is SSE/AVX CMPP, bitcast the result back to integer to match
    // the result type of SETCC. The bitcast is expected to be optimized away
    // during combining/isel.
    if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }
  MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
    // legalizer first checks if the first operand in input to the setcc has
    // a legal type. If so, then it promotes the return type to that same
    // type. Otherwise, the return type is promoted to the 'next legal type',
    // which for a vector of MVT::i1 is always a 128-bit integer vector type.
    //
    // We reach this code only if the following two conditions are met:
    // 1. Both return type and operand type have been promoted to wider types
    //    by the type legalizer.
    // 2. The original operand type has been promoted to a 256-bit vector.
    //
    // Note that condition 2. only applies for AVX targets.
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
  }

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntVSETCC(Op, DAG);
  // Operands are boolean (vectors of i1).
  MVT OpVT = Op1.getSimpleValueType();
  if (OpVT.getVectorElementType() == MVT::i1)
    return LowerBoolVSETCC_AVX512(Op, DAG);

  // The result is boolean, but the operands are int/float.
  if (VT.getVectorElementType() == MVT::i1) {
    // In the AVX-512 architecture setcc returns a mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements in KNL.
    // In that case use an SSE compare and truncate the result.
    bool UseAVX512Inst =
      (OpVT.is512BitVector() ||
       OpVT.getScalarSizeInBits() >= 32 ||
       (Subtarget.hasBWI() && Subtarget.hasVLX()));

    if (UseAVX512Inst)
      return LowerIntVSETCC_AVX512(Op, DAG);

    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }
  // Lower using XOP integer comparisons.
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }
  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integers, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  // If both operands are known non-negative, then an unsigned compare is the
  // same as a signed compare and there's no need to flip signbits.
  // TODO: We could check for more general simplifications here since we're
  // computing known bits.
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

  // Special case: use min/max operations for SETULE/SETUGE.
  MVT VET = VT.getVectorElementType();
  bool HasMinMax =
      (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
      (Subtarget.hasSSE2() && (VET == MVT::i8));
  bool MinMax = false;
  if (HasMinMax) {
    switch (Cond) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }

    if (MinMax)
      Swap = Invert = FlipSigns = false;
  }

  bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  bool Subus = false;
  if (!MinMax && HasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (Cond) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // clobbered as the destination, so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget.hasAVX())
        break;
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);
  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the
      // sign bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64-bit
      // integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
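      // After these swizzles, each 32-bit lane of EQHi/GTHi holds the compare
      // result for the high half of its 64-bit element, and GTLo the result
      // for the low half, so the AND/OR below combine them per 64-bit lane.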
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }

    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq
      // with pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }
  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  // Lower (trunc (X >> N) to i1) to BT(X, N).
  if (Op0.hasOneUse() && isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
  // of these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one
    // with the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (isOneConstant(Op1)) {
      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
    }
    if (!isNullConstant(Op1)) {
      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
    }
  }

  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  // Recreate the carry if needed.
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
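  // Adding all-ones to a carry value of 0 or 1 sets the hardware carry flag
  // exactly when the carry was 1, so result number 1 of this ADD is a usable
  // CF input for the SBB below.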
  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
  SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
  if (Op.getSimpleValueType() == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  return SetCC;
}
/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0,
                               APInt::getHighBitsSet(InBits, InBits - Bits));
}
SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool AddTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE
  // ops are available, or into VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence
  // later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (SSECC != 8) {
      if (Subtarget.hasAVX512()) {
        SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                  CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
        return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
                           DL, VT, Cmp, Op1, Op2);
      }

      SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                                DAG.getConstant(SSECC, DL, MVT::i8));

      // If we have AVX, we can use a variable vector select (VBLENDV) instead
      // of 3 logic instructions for size savings and potentially speed.
      // Unfortunately, there is no scalar form of VBLENDV.

      // If either operand is a constant, don't try this. We can expect to
      // optimize away at least one of the logic instructions later in that
      // case, so that sequence would be faster than a variable blend.

      // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
      // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
      if (Subtarget.hasAVX() &&
          !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

        // Convert to vectors, do a VSELECT, and convert back to scalar.
        // All of the conversions should be optimized away.
        MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
        SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
        SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
        SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

        MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
        VCmp = DAG.getBitcast(VCmpVT, VCmp);

        SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                           VSel, DAG.getIntPtrConstant(0, DL));
      }
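      // Classic scalar SSE select idiom: (~Cmp & Op2) | (Cmp & Op1).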
      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
    }
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1,  y) -> (sign_bit (x - 1)) | y
  // (select (x == 0),  y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0),  y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1,  y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Apply further optimizations for special cases:
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
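        // NEG sets CF exactly when CmpOp0 is nonzero, so the SETCC_CARRY
        // below materializes all-ones for x != 0 and zero for x == 0 without
        // a branch.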
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res = // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);
      SDValue Src1, Src2;
      // True if Op2 is an XOR or OR operator and one of its operands
      // is equal to Op1:
      //   ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
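        // With Neg = (x & 1): when the bit is set, Mask = -1 and the result
        // is (z op y), i.e. Op2; when the bit is clear, Mask = 0 and
        // (0 op y) == y, i.e. Op1. This holds for both XOR and OR, so no
        // branch or cmov is needed.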
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or all ones with the same size as the
        // other operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }
  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT)) // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a
  // truncate, widen the cmov and push the truncate through. This avoids
  // introducing a new branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);

  // SKX processor
  if ((InVTElt == MVT::i1) &&
      ((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) ||
       (Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVTElt != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
      return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
  }

  if (InVTElt != MVT::i1)
    return SDValue();

  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
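  // Without VLX, ExtVT widens the i1 source to a 512-bit vector so the
  // select and truncate below run on a legal type; 512/NumElts is the widest
  // element that still fits NumElts lanes in 512 bits.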
  SDValue V;
  if (Subtarget.hasDQI()) {
    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
    assert(!VT.is512BitVector() && "Unexpected vector type");
  } else {
    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
    if (ExtVT == VT)
      return V;
  }

  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getSizeInBits() == InVT.getSizeInBits());

  MVT SVT = VT.getVectorElementType();
  MVT InSVT = InVT.getVectorElementType();
  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);

  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
  // For 512-bit vectors, we need 128-bits or 256-bits.
  if (VT.getSizeInBits() > 128) {
    // The input needs to be at least the same number of elements as the
    // output, and at least 128 bits wide.
    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
    InVT = In.getSimpleValueType();
  }

  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

  // SSE41 targets can use the pmovsx* instructions directly for 128-bit
  // results, so those are legal and shouldn't occur here. AVX2/AVX512
  // pmovsx* instructions still need to be handled here for 256/512-bit
  // results.
  if (Subtarget.hasInt256()) {
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
                      X86ISD::VSEXT : X86ISD::VZEXT;
    return DAG.getNode(ExtOpc, dl, VT, In);
  }

  // We should only get here for sign extend.
  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
         "Unexpected opcode!");

  // Pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }
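  // Each UNPCKL above interleaved undef below the input elements, leaving
  // every source element in the high half of a lane twice as wide; the
  // arithmetic shift below moves the value bits back down while pulling in
  // copies of the sign bit from the top.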
  SDValue SignExt = Curr;
  if (CurrVT != InVT) {
    unsigned SignExtShift =
        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (CurrVT == VT)
    return SignExt;

  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                               DAG.getConstant(31, dl, MVT::i8));
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SDValue();
}
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode:
  // sign extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts. For v4i32 the shuffle masks
  // will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }. Use the vpmovsx
  // instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32, then
  // concatenate the halves back to the original VT.

  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);

  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
// Lower a truncating store. We need special lowering to vXi1 vectors.
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
  SDLoc dl(St);
  EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
         "Expected truncstore of i1 vector");

  SDValue Op = St->getValue();
  MVT OpVT = Op.getValueType().getSimpleVT();
  unsigned NumElts = OpVT.getVectorNumElements();
  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
      NumElts == 16) {
    // Truncate and store - everything is legal.
    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
    if (MemVT.getSizeInBits() < 8)
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
                       DAG.getUNDEF(MVT::v8i1), Op,
                       DAG.getIntPtrConstant(0, dl));
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  // A subset, assume that we have only AVX-512F.
  if (NumElts <= 8) {
    // Extend to an 8-element vector.
    MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
                     DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));

    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
  // Divide the vector into two parts and store each part separately.
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(0, dl));
  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
  SDValue BasePtr = St->getBasePtr();
  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
                              St->getMemOperand());
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(16, dl));
  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));
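  // The upper 16 mask bits live 2 bytes (16 x i1) past the base address.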
  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
                              BasePtrHi, St->getMemOperand());
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
}
static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
         "Expected i1 vector load");
  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
  MVT VT = Op.getValueType().getSimpleVT();
  unsigned NumElts = VT.getVectorNumElements();

  if ((Subtarget.hasBWI() && NumElts >= 32) ||
      (Subtarget.hasDQI() && NumElts < 16) ||
      NumElts == 16) {
    // Load and extend - everything is legal.
    if (NumElts < 8) {
      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
                                 Ld->getBasePtr(),
                                 Ld->getMemOperand());
      // Replace chain users with the new chain.
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
    }
    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
  }

  if (NumElts <= 8) {
    // A subset, assume that we have only AVX-512F.
    unsigned NumBitsToLoad = 8;
    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
    SDValue BitVec = DAG.getBitcast(MaskVT, Load);
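    // The 8 loaded bits reinterpreted as v8i1 form the mask: v8i1 extends
    // directly, while v4i1/v2i1 extend to 8 lanes first and extract below.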
    if (NumElts == 8)
      return DAG.getNode(ExtOpcode, dl, VT, BitVec);

    // We still have to handle the v4i1 and v2i1 cases here.
    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(VT == MVT::v32i8 && "Unexpected extload type");

  SmallVector<SDValue, 2> Chains;

  SDValue BasePtr = Ld->getBasePtr();
  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
  Chains.push_back(LoadLo.getValue(1));

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));

  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               BasePtrHi,
                               Ld->getMemOperand());
  Chains.push_back(LoadHi.getValue(1));
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
}
18326 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18327 // may emit an illegal shuffle but the expansion is still better than scalar
18328 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18329 // we'll emit a shuffle and a arithmetic shift.
18330 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18331 // TODO: It is possible to support ZExt by zeroing the undef values during
18332 // the shuffle phase or after the shuffle.
18333 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18334 SelectionDAG &DAG) {
18335 MVT RegVT = Op.getSimpleValueType();
18336 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18337 assert(RegVT.isInteger() &&
18338 "We only custom lower integer vector sext loads.");
18340 // Nothing useful we can do without SSE2 shuffles.
18341 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18343 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18345 EVT MemVT = Ld->getMemoryVT();
18346 if (MemVT.getScalarType() == MVT::i1)
18347 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18350 unsigned RegSz = RegVT.getSizeInBits();
18352 ISD::LoadExtType Ext = Ld->getExtensionType();
18354 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18355 && "Only anyext and sext are currently implemented.");
18356 assert(MemVT != RegVT && "Cannot extend to the same type");
18357 assert(MemVT.isVector() && "Must load a vector from memory");
18359 unsigned NumElems = RegVT.getVectorNumElements();
18360 unsigned MemSz = MemVT.getSizeInBits();
18361 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18363 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18364 // The only way in which we have a legal 256-bit vector result but not the
18365 // integer 256-bit operations needed to directly lower a sextload is if we
18366 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18367 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18368 // correctly legalized. We do this late to allow the canonical form of
18369 // sextload to persist throughout the rest of the DAG combiner -- it wants
18370 // to fold together any extensions it can, and so will fuse a sign_extend
18371 // of an sextload into a sextload targeting a wider value.
18373 if (MemSz == 128) {
18374 // Just switch this to a normal load.
18375 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18376 "it must be a legal 128-bit vector "
18378 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18379 Ld->getPointerInfo(), Ld->getAlignment(),
18380 Ld->getMemOperand()->getFlags());
18382 assert(MemSz < 128 &&
18383 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18384 // Do an sext load to a 128-bit vector type. We want to use the same
18385 // number of elements, but elements half as wide. This will end up being
18386 // recursively lowered by this routine, but will succeed as we definitely
18387 // have all the necessary features if we're using AVX1.
18389 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18390 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18392 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18393 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18394 Ld->getMemOperand()->getFlags());
18397 // Replace chain users with the new chain.
18398 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18399 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18401 // Finally, do a normal sign-extend to the desired register.
18402 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18405 // All sizes must be a power of two.
18406 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18407 "Non-power-of-two elements are not custom lowered!");
18409 // Attempt to load the original value using scalar loads.
18410 // Find the largest scalar type that divides the total loaded size.
18411 MVT SclrLoadTy = MVT::i8;
18412 for (MVT Tp : MVT::integer_valuetypes()) {
18413 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18414 SclrLoadTy = Tp;
18415 }
18416 }
18418 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18419 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18420 (64 <= MemSz))
18421 SclrLoadTy = MVT::f64;
18423 // Calculate the number of scalar loads that we need to perform
18424 // in order to load our vector from memory.
18425 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18427 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18428 "Can only lower sext loads with a single scalar load!");
18430 unsigned loadRegZize = RegSz;
18431 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18432 loadRegZize = 128;
18434 // Represent our vector as a sequence of elements which are the
18435 // largest scalar that we can load.
18436 EVT LoadUnitVecVT = EVT::getVectorVT(
18437 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18439 // Represent the data using the same element type that is stored in
18440 // memory. In practice, we "widen" MemVT.
18441 EVT WideVecVT =
18442 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18443 loadRegZize / MemVT.getScalarSizeInBits());
18445 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18446 "Invalid vector type");
18448 // We can't shuffle using an illegal type.
18449 assert(TLI.isTypeLegal(WideVecVT) &&
18450 "We only lower types that form legal widened vector types");
18452 SmallVector<SDValue, 8> Chains;
18453 SDValue Ptr = Ld->getBasePtr();
18454 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18455 TLI.getPointerTy(DAG.getDataLayout()));
18456 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18458 for (unsigned i = 0; i < NumLoads; ++i) {
18459 // Perform a single load.
18460 SDValue ScalarLoad =
18461 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18462 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18463 Chains.push_back(ScalarLoad.getValue(1));
18464 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18465 // another round of DAGCombining.
18466 if (i == 0)
18467 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18468 else
18469 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18470 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18472 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18475 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18477 // Bitcast the loaded value to a vector of the original element type, in
18478 // the size of the target vector type.
18479 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18480 unsigned SizeRatio = RegSz / MemSz;
18482 if (Ext == ISD::SEXTLOAD) {
18483 // If we have SSE4.1, we can directly emit a VSEXT node.
18484 if (Subtarget.hasSSE41()) {
18485 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18486 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18487 return Sext;
18488 }
18490 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18491 // lanes.
18492 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18493 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18495 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18496 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18497 return Shuff;
18498 }
18500 // Redistribute the loaded elements into the different locations.
18501 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18502 for (unsigned i = 0; i != NumElems; ++i)
18503 ShuffleVec[i * SizeRatio] = i;
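// Illustrative example: for a v4i32 result loaded from v4i8 memory
// (SizeRatio == 4) the mask built above is
// <0,-1,-1,-1, 1,-1,-1,-1, 2,-1,-1,-1, 3,-1,-1,-1>, which places each loaded
// byte in the low byte of its 32-bit lane.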
18505 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18506 DAG.getUNDEF(WideVecVT), ShuffleVec);
18508 // Bitcast to the requested type.
18509 Shuff = DAG.getBitcast(RegVT, Shuff);
18510 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18511 return Shuff;
18512 }
18514 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18515 /// each of which has no other use apart from the AND / OR.
18516 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18517 Opc = Op.getOpcode();
18518 if (Opc != ISD::OR && Opc != ISD::AND)
18519 return false;
18520 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18521 Op.getOperand(0).hasOneUse() &&
18522 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18523 Op.getOperand(1).hasOneUse());
18526 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18527 /// SETCC node has a single use.
18528 static bool isXor1OfSetCC(SDValue Op) {
18529 if (Op.getOpcode() != ISD::XOR)
18530 return false;
18531 if (isOneConstant(Op.getOperand(1)))
18532 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18533 Op.getOperand(0).hasOneUse();
18534 return false;
18535 }
18537 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18538 bool addTest = true;
18539 SDValue Chain = Op.getOperand(0);
18540 SDValue Cond = Op.getOperand(1);
18541 SDValue Dest = Op.getOperand(2);
18544 bool Inverted = false;
18546 if (Cond.getOpcode() == ISD::SETCC) {
18547 // Check for setcc([su]{add,sub,mul}o == 0).
18548 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18549 isNullConstant(Cond.getOperand(1)) &&
18550 Cond.getOperand(0).getResNo() == 1 &&
18551 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18552 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18553 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18554 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18555 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18556 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18557 Inverted = true;
18558 Cond = Cond.getOperand(0);
18560 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18561 Cond = NewCond;
18565 // FIXME: LowerXALUO doesn't handle these!!
18566 else if (Cond.getOpcode() == X86ISD::ADD ||
18567 Cond.getOpcode() == X86ISD::SUB ||
18568 Cond.getOpcode() == X86ISD::SMUL ||
18569 Cond.getOpcode() == X86ISD::UMUL)
18570 Cond = LowerXALUO(Cond, DAG);
18573 // Look past (and (setcc_carry (cmp ...)), 1).
18574 if (Cond.getOpcode() == ISD::AND &&
18575 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18576 isOneConstant(Cond.getOperand(1)))
18577 Cond = Cond.getOperand(0);
18579 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18580 // setting operand in place of the X86ISD::SETCC.
18581 unsigned CondOpcode = Cond.getOpcode();
18582 if (CondOpcode == X86ISD::SETCC ||
18583 CondOpcode == X86ISD::SETCC_CARRY) {
18584 CC = Cond.getOperand(0);
18586 SDValue Cmp = Cond.getOperand(1);
18587 unsigned Opc = Cmp.getOpcode();
18588 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18589 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18593 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18597 // These can only come from an arithmetic instruction with overflow,
18598 // e.g. SADDO, UADDO.
18599 Cond = Cond.getOperand(1);
18605 CondOpcode = Cond.getOpcode();
18606 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18607 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18608 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18609 Cond.getOperand(0).getValueType() != MVT::i8)) {
18610 SDValue LHS = Cond.getOperand(0);
18611 SDValue RHS = Cond.getOperand(1);
18612 unsigned X86Opcode;
18615 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18616 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18618 switch (CondOpcode) {
18619 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18620 case ISD::SADDO:
18621 if (isOneConstant(RHS)) {
18622 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18623 break;
18624 }
18625 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18626 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18627 case ISD::SSUBO:
18628 if (isOneConstant(RHS)) {
18629 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18630 break;
18631 }
18632 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18633 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18634 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18635 default: llvm_unreachable("unexpected overflowing operator");
18637 if (Inverted)
18638 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18639 if (CondOpcode == ISD::UMULO)
18640 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18641 MVT::i32);
18642 else
18643 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18645 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18647 if (CondOpcode == ISD::UMULO)
18648 Cond = X86Op.getValue(2);
18649 else
18650 Cond = X86Op.getValue(1);
18652 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18655 unsigned CondOpc;
18656 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18657 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18658 if (CondOpc == ISD::OR) {
18659 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18660 // two branches instead of an explicit OR instruction with a
18661 // separate test.
18662 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18663 isX86LogicalCmp(Cmp)) {
18664 CC = Cond.getOperand(0).getOperand(0);
18665 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18666 Chain, Dest, CC, Cmp);
18667 CC = Cond.getOperand(1).getOperand(0);
18671 } else { // ISD::AND
18672 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18673 // two branches instead of an explicit AND instruction with a
18674 // separate test. However, we only do this if this block doesn't
18675 // have a fall-through edge, because this requires an explicit
18676 // jmp when the condition is false.
18677 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18678 isX86LogicalCmp(Cmp) &&
18679 Op.getNode()->hasOneUse()) {
18680 X86::CondCode CCode =
18681 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18682 CCode = X86::GetOppositeBranchCondition(CCode);
18683 CC = DAG.getConstant(CCode, dl, MVT::i8);
18684 SDNode *User = *Op.getNode()->use_begin();
18685 // Look for an unconditional branch following this conditional branch.
18686 // We need this because we need to reverse the successors in order
18687 // to implement FCMP_OEQ.
18688 if (User->getOpcode() == ISD::BR) {
18689 SDValue FalseBB = User->getOperand(1);
18690 SDNode *NewBR =
18691 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18692 assert(NewBR == User); (void)NewBR;
18693 Dest = FalseBB;
18696 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18697 Chain, Dest, CC, Cmp);
18698 X86::CondCode CCode =
18699 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18700 CCode = X86::GetOppositeBranchCondition(CCode);
18701 CC = DAG.getConstant(CCode, dl, MVT::i8);
18707 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18708 // Recognize the "xorb (setcc), 1" pattern. The xor inverts the condition.
18709 // It should be transformed during DAG combining, except when the condition
18710 // is set by an arithmetic-with-overflow node.
18711 X86::CondCode CCode =
18712 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18713 CCode = X86::GetOppositeBranchCondition(CCode);
18714 CC = DAG.getConstant(CCode, dl, MVT::i8);
18715 Cond = Cond.getOperand(0).getOperand(1);
18717 } else if (Cond.getOpcode() == ISD::SETCC &&
18718 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18719 // For FCMP_OEQ, we can emit
18720 // two branches instead of an explicit AND instruction with a
18721 // separate test. However, we only do this if this block doesn't
18722 // have a fall-through edge, because this requires an explicit
18723 // jmp when the condition is false.
18724 if (Op.getNode()->hasOneUse()) {
18725 SDNode *User = *Op.getNode()->use_begin();
18726 // Look for an unconditional branch following this conditional branch.
18727 // We need this because we need to reverse the successors in order
18728 // to implement FCMP_OEQ.
18729 if (User->getOpcode() == ISD::BR) {
18730 SDValue FalseBB = User->getOperand(1);
18731 SDNode *NewBR =
18732 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18733 assert(NewBR == User); (void)NewBR;
18734 Dest = FalseBB;
18737 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18738 Cond.getOperand(0), Cond.getOperand(1));
18739 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18740 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18741 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18742 Chain, Dest, CC, Cmp);
18743 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18748 } else if (Cond.getOpcode() == ISD::SETCC &&
18749 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18750 // For FCMP_UNE, we can emit
18751 // two branches instead of an explicit AND instruction with a
18752 // separate test. However, we only do this if this block doesn't
18753 // have a fall-through edge, because this requires an explicit
18754 // jmp when the condition is false.
18755 if (Op.getNode()->hasOneUse()) {
18756 SDNode *User = *Op.getNode()->use_begin();
18757 // Look for an unconditional branch following this conditional branch.
18758 // We need this because we need to reverse the successors in order
18759 // to implement FCMP_UNE.
18760 if (User->getOpcode() == ISD::BR) {
18761 SDValue FalseBB = User->getOperand(1);
18762 SDNode *NewBR =
18763 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18764 assert(NewBR == User); (void)NewBR;
18765 Dest = FalseBB;
18767 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18768 Cond.getOperand(0), Cond.getOperand(1));
18769 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18770 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18771 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18772 Chain, Dest, CC, Cmp);
18773 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18784 // Look past the truncate if the high bits are known zero.
18784 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18785 Cond = Cond.getOperand(0);
18787 // We know the result is compared against zero. Try to match it to BT.
18788 if (Cond.hasOneUse()) {
18789 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18790 CC = NewSetCC.getOperand(0);
18791 Cond = NewSetCC.getOperand(1);
18792 addTest = false;
18798 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18799 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18800 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18802 Cond = ConvertCmpIfNecessary(Cond, DAG);
18803 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18804 Chain, Dest, CC, Cond);
18807 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18808 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18809 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18810 // that the guard pages used by the OS virtual memory manager are allocated in
18811 // correct sequence.
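// For example, a single 12K allocation must touch the stack at 4K, 8K and
// 12K below the current stack pointer so that each guard page is committed
// in order before the final adjustment of the stack pointer.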
18813 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18814 SelectionDAG &DAG) const {
18815 MachineFunction &MF = DAG.getMachineFunction();
18816 bool SplitStack = MF.shouldSplitStack();
18817 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
18818 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18819 SplitStack || EmitStackProbe;
18823 SDNode *Node = Op.getNode();
18824 SDValue Chain = Op.getOperand(0);
18825 SDValue Size = Op.getOperand(1);
18826 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18827 EVT VT = Node->getValueType(0);
18829 // Chain the dynamic stack allocation so that it doesn't modify the stack
18830 // pointer when other instructions are using the stack.
18831 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18833 bool Is64Bit = Subtarget.is64Bit();
18834 MVT SPTy = getPointerTy(DAG.getDataLayout());
18836 SDValue Result;
18837 if (!Lower) {
18838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18839 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18840 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18841 " not tell us which reg is the stack pointer!");
18843 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18844 Chain = SP.getValue(1);
18845 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18846 unsigned StackAlign = TFI.getStackAlignment();
18847 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18848 if (Align > StackAlign)
18849 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18850 DAG.getConstant(-(uint64_t)Align, dl, VT));
18851 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18852 } else if (SplitStack) {
18853 MachineRegisterInfo &MRI = MF.getRegInfo();
18855 if (Is64Bit) {
18856 // The 64-bit implementation of segmented stacks needs to clobber both r10
18857 // and r11. This makes it impossible to use it along with nested parameters.
18858 const Function *F = MF.getFunction();
18859 for (const auto &A : F->args()) {
18860 if (A.hasNestAttr())
18861 report_fatal_error("Cannot use segmented stacks with functions that "
18862 "have nested arguments.");
18866 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18867 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18868 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18869 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18870 DAG.getRegister(Vreg, SPTy));
18872 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18873 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18874 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18876 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18877 unsigned SPReg = RegInfo->getStackRegister();
18878 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18879 Chain = SP.getValue(1);
18882 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18883 DAG.getConstant(-(uint64_t)Align, dl, VT));
18884 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18890 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18891 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18893 SDValue Ops[2] = {Result, Chain};
18894 return DAG.getMergeValues(Ops, dl);
18897 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18898 MachineFunction &MF = DAG.getMachineFunction();
18899 auto PtrVT = getPointerTy(MF.getDataLayout());
18900 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18902 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18905 if (!Subtarget.is64Bit() ||
18906 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18907 // vastart just stores the address of the VarArgsFrameIndex slot into the
18908 // memory location argument.
18909 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18910 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18911 MachinePointerInfo(SV));
18914 // __va_list_tag:
18915 //   gp_offset         (0 - 6 * 8)
18916 //   fp_offset         (48 - 48 + 8 * 16)
18917 //   overflow_arg_area (point to parameters coming in memory).
18918 //   reg_save_area
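// For reference, this is the SysV x86-64 va_list layout:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };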
18919 SmallVector<SDValue, 8> MemOps;
18920 SDValue FIN = Op.getOperand(1);
18922 SDValue Store = DAG.getStore(
18923 Op.getOperand(0), DL,
18924 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18925 MachinePointerInfo(SV));
18926 MemOps.push_back(Store);
18929 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18930 Store = DAG.getStore(
18931 Op.getOperand(0), DL,
18932 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18933 MachinePointerInfo(SV, 4));
18934 MemOps.push_back(Store);
18936 // Store ptr to overflow_arg_area
18937 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18938 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18939 Store =
18940 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18941 MemOps.push_back(Store);
18943 // Store ptr to reg_save_area.
18944 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18945 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18946 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18947 Store = DAG.getStore(
18948 Op.getOperand(0), DL, RSFIN, FIN,
18949 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18950 MemOps.push_back(Store);
18951 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18954 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18955 assert(Subtarget.is64Bit() &&
18956 "LowerVAARG only handles 64-bit va_arg!");
18957 assert(Op.getNumOperands() == 4);
18959 MachineFunction &MF = DAG.getMachineFunction();
18960 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18961 // The Win64 ABI uses char* instead of a structure.
18962 return DAG.expandVAArg(Op.getNode());
18964 SDValue Chain = Op.getOperand(0);
18965 SDValue SrcPtr = Op.getOperand(1);
18966 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18967 unsigned Align = Op.getConstantOperandVal(3);
18970 EVT ArgVT = Op.getNode()->getValueType(0);
18971 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18972 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18973 uint8_t ArgMode;
18975 // Decide which area this value should be read from.
18976 // TODO: Implement the AMD64 ABI in its entirety. This simple
18977 // selection mechanism works only for the basic types.
18978 if (ArgVT == MVT::f80) {
18979 llvm_unreachable("va_arg for f80 not yet implemented");
18980 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18981 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18982 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18983 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18985 llvm_unreachable("Unhandled argument type in LowerVAARG");
18988 if (ArgMode == 2) {
18989 // Sanity Check: Make sure using fp_offset makes sense.
18990 assert(!Subtarget.useSoftFloat() &&
18991 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18992 Subtarget.hasSSE1());
18995 // Insert VAARG_64 node into the DAG
18996 // VAARG_64 returns two values: Variable Argument Address, Chain
18997 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18998 DAG.getConstant(ArgMode, dl, MVT::i8),
18999 DAG.getConstant(Align, dl, MVT::i32)};
19000 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19001 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19002 VTs, InstOps, MVT::i64,
19003 MachinePointerInfo(SV),
19004 /*Align=*/0,
19005 /*Volatile=*/false,
19006 /*ReadMem=*/true,
19007 /*WriteMem=*/true);
19008 Chain = VAARG.getValue(1);
19010 // Load the next argument and return it
19011 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19014 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19015 SelectionDAG &DAG) {
19016 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19017 // where a va_list is still an i8*.
19018 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19019 if (Subtarget.isCallingConvWin64(
19020 DAG.getMachineFunction().getFunction()->getCallingConv()))
19021 // Probably a Win64 va_copy.
19022 return DAG.expandVACopy(Op.getNode());
19024 SDValue Chain = Op.getOperand(0);
19025 SDValue DstPtr = Op.getOperand(1);
19026 SDValue SrcPtr = Op.getOperand(2);
19027 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19028 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
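// The va_list struct described above is 24 bytes on LP64 (two i32 offsets
// plus two 8-byte pointers), hence the fixed-size 24-byte memcpy below.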
19031 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19032 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19033 false, false,
19034 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19037 /// Handle vector element shifts where the shift amount is a constant.
19038 /// Takes immediate version of shift as input.
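/// For example, an out-of-range logical shift such as (VSRLI v4i32 X, 35)
/// folds to zero below, while the arithmetic (VSRAI v4i32 X, 35) is clamped
/// to a shift by 31.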
19039 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19040 SDValue SrcOp, uint64_t ShiftAmt,
19041 SelectionDAG &DAG) {
19042 MVT ElementType = VT.getVectorElementType();
19044 // Bitcast the source vector to the output type, this is mainly necessary for
19045 // vXi8/vXi64 shifts.
19046 if (VT != SrcOp.getSimpleValueType())
19047 SrcOp = DAG.getBitcast(VT, SrcOp);
19049 // Fold this packed shift into its first operand if ShiftAmt is 0.
19050 if (ShiftAmt == 0)
19051 return SrcOp;
19053 // Check for ShiftAmt >= element width
19054 if (ShiftAmt >= ElementType.getSizeInBits()) {
19055 if (Opc == X86ISD::VSRAI)
19056 ShiftAmt = ElementType.getSizeInBits() - 1;
19057 else
19058 return DAG.getConstant(0, dl, VT);
19061 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19062 && "Unknown target vector shift-by-constant node");
19064 // Fold this packed vector shift into a build vector if SrcOp is a
19065 // vector of Constants or UNDEFs.
19066 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19067 SmallVector<SDValue, 8> Elts;
19068 unsigned NumElts = SrcOp->getNumOperands();
19069 ConstantSDNode *ND;
19072 default: llvm_unreachable("Unknown opcode!");
19073 case X86ISD::VSHLI:
19074 for (unsigned i=0; i!=NumElts; ++i) {
19075 SDValue CurrentOp = SrcOp->getOperand(i);
19076 if (CurrentOp->isUndef()) {
19077 Elts.push_back(CurrentOp);
19080 ND = cast<ConstantSDNode>(CurrentOp);
19081 const APInt &C = ND->getAPIntValue();
19082 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19085 case X86ISD::VSRLI:
19086 for (unsigned i=0; i!=NumElts; ++i) {
19087 SDValue CurrentOp = SrcOp->getOperand(i);
19088 if (CurrentOp->isUndef()) {
19089 Elts.push_back(CurrentOp);
19092 ND = cast<ConstantSDNode>(CurrentOp);
19093 const APInt &C = ND->getAPIntValue();
19094 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19097 case X86ISD::VSRAI:
19098 for (unsigned i=0; i!=NumElts; ++i) {
19099 SDValue CurrentOp = SrcOp->getOperand(i);
19100 if (CurrentOp->isUndef()) {
19101 Elts.push_back(CurrentOp);
19104 ND = cast<ConstantSDNode>(CurrentOp);
19105 const APInt &C = ND->getAPIntValue();
19106 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19111 return DAG.getBuildVector(VT, dl, Elts);
19114 return DAG.getNode(Opc, dl, VT, SrcOp,
19115 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19118 /// Handle vector element shifts where the shift amount may or may not be a
19119 /// constant. Takes immediate version of shift as input.
19120 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19121 SDValue SrcOp, SDValue ShAmt,
19122 const X86Subtarget &Subtarget,
19123 SelectionDAG &DAG) {
19124 MVT SVT = ShAmt.getSimpleValueType();
19125 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19127 // Catch shift-by-constant.
19128 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19129 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19130 CShAmt->getZExtValue(), DAG);
19132 // Change opcode to non-immediate version
19133 switch (Opc) {
19134 default: llvm_unreachable("Unknown target vector shift node");
19135 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19136 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19137 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19140 // Need to build a vector containing shift amount.
19141 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19142 // +=================+============+=======================================+
19143 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19144 // +=================+============+=======================================+
19145 // | i64 | Yes, No | Use ShAmt as lowest elt |
19146 // | i32 | Yes | zero-extend in-reg |
19147 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19148 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19149 // +=================+============+=======================================+
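// For example, an i32 shift amount without SSE4.1 ends up as
// build_vector(ShAmt, 0, undef, undef); the hardware reads only the low
// 64 bits of the count vector, so the upper elements may remain undefined.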
19151 if (SVT == MVT::i64)
19152 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19153 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19154 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19155 ShAmt = ShAmt.getOperand(0);
19156 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19157 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19158 } else if (Subtarget.hasSSE41() &&
19159 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19160 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19161 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19163 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
19164 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19165 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19168 // The return type has to be a 128-bit type with the same element
19169 // type as the input type.
19170 MVT EltVT = VT.getVectorElementType();
19171 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19173 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19174 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19177 /// \brief Return Mask with the necessary casting or extending
19178 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19179 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19180 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19181 const SDLoc &dl) {
19183 if (isAllOnesConstant(Mask))
19184 return DAG.getTargetConstant(1, dl, MaskVT);
19185 if (X86::isZeroNode(Mask))
19186 return DAG.getTargetConstant(0, dl, MaskVT);
19188 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19189 // Mask should be extended
19190 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19191 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19194 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19195 if (MaskVT == MVT::v64i1) {
19196 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19197 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19198 SDValue Lo, Hi;
19199 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19200 DAG.getConstant(0, dl, MVT::i32));
19201 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19202 DAG.getConstant(1, dl, MVT::i32));
19204 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19205 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19207 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19209 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case)
19210 // and bitcast to the expected mask type.
19211 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19212 return DAG.getBitcast(MaskVT,
19213 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19217 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19218 Mask.getSimpleValueType().getSizeInBits());
19219 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19220 // are extracted by EXTRACT_SUBVECTOR.
19221 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19222 DAG.getBitcast(BitcastVT, Mask),
19223 DAG.getIntPtrConstant(0, dl));
19227 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19228 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19229 /// necessary casting or extending for \p Mask when lowering masking intrinsics
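/// For example, a masked arithmetic intrinsic becomes
/// (vselect \p Mask, (op a, b), \p PreservedSrc), whereas compare-into-mask
/// nodes are simply ANDed with the mask.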
19230 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19231 SDValue PreservedSrc,
19232 const X86Subtarget &Subtarget,
19233 SelectionDAG &DAG) {
19234 MVT VT = Op.getSimpleValueType();
19235 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19236 unsigned OpcodeSelect = ISD::VSELECT;
19237 SDLoc dl(Op);
19239 if (isAllOnesConstant(Mask))
19240 return Op;
19242 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19244 switch (Op.getOpcode()) {
19245 default: break;
19246 case X86ISD::PCMPEQM:
19247 case X86ISD::PCMPGTM:
19248 case X86ISD::CMPM:
19249 case X86ISD::CMPMU:
19250 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19251 case X86ISD::VFPCLASS:
19252 case X86ISD::VFPCLASSS:
19253 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19254 case X86ISD::VTRUNC:
19255 case X86ISD::VTRUNCS:
19256 case X86ISD::VTRUNCUS:
19257 case X86ISD::CVTPS2PH:
19258 // We can't use ISD::VSELECT here because it is not always "Legal"
19259 // for the destination type. For example vpmovqb requires only AVX512,
19260 // while a vselect that operates on byte elements requires BWI.
19261 OpcodeSelect = X86ISD::SELECT;
19264 if (PreservedSrc.isUndef())
19265 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19266 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19269 /// \brief Creates an SDNode for a predicated scalar operation.
19270 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19271 /// The mask is coming as MVT::i8 and it should be transformed
19272 /// to MVT::v1i1 while lowering masking intrinsics.
19273 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19274 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19275 /// for a scalar instruction.
19276 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19277 SDValue PreservedSrc,
19278 const X86Subtarget &Subtarget,
19279 SelectionDAG &DAG) {
19281 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19282 if (MaskConst->getZExtValue() & 0x1)
19283 return Op;
19285 MVT VT = Op.getSimpleValueType();
19288 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19289 if (Op.getOpcode() == X86ISD::FSETCCM ||
19290 Op.getOpcode() == X86ISD::FSETCCM_RND)
19291 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19292 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19293 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19295 if (PreservedSrc.isUndef())
19296 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19297 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19300 static int getSEHRegistrationNodeSize(const Function *Fn) {
19301 if (!Fn->hasPersonalityFn())
19302 report_fatal_error(
19303 "querying registration node size for function without personality");
19304 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19305 // WinEHStatePass for the full struct definition.
19306 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19307 case EHPersonality::MSVC_X86SEH: return 24;
19308 case EHPersonality::MSVC_CXX: return 16;
19311 report_fatal_error(
19312 "can only recover FP for 32-bit MSVC EH personality functions");
19315 /// When the MSVC runtime transfers control to us, either to an outlined
19316 /// function or when returning to a parent frame after catching an exception, we
19317 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19318 /// Here's the math:
19319 /// RegNodeBase = EntryEBP - RegNodeSize
19320 /// ParentFP = RegNodeBase - ParentFrameOffset
19321 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19322 /// subtracting the offset (negative on x86) takes us back to the parent FP.
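/// For example, with the 32-bit MSVC C++ personality (RegNodeSize == 16) and
/// a hypothetical ParentFrameOffset of -64:
///   ParentFP = (EntryEBP - 16) - (-64) = EntryEBP + 48.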
19323 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19324 SDValue EntryEBP) {
19325 MachineFunction &MF = DAG.getMachineFunction();
19328 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19329 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19331 // It's possible that the parent function no longer has a personality function
19332 // if the exceptional code was optimized away, in which case we just return
19333 // the incoming EBP.
19334 if (!Fn->hasPersonalityFn())
19335 return EntryEBP;
19337 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19338 // registration, or the .set_setframe offset.
19339 MCSymbol *OffsetSym =
19340 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19341 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19342 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19343 SDValue ParentFrameOffset =
19344 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19346 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19347 // prologue to RBP in the parent function.
19348 const X86Subtarget &Subtarget =
19349 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19350 if (Subtarget.is64Bit())
19351 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19353 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19354 // RegNodeBase = EntryEBP - RegNodeSize
19355 // ParentFP = RegNodeBase - ParentFrameOffset
19356 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19357 DAG.getConstant(RegNodeSize, dl, PtrVT));
19358 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19361 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19362 SelectionDAG &DAG) {
19363 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19364 auto isRoundModeCurDirection = [](SDValue Rnd) {
19365 if (!isa<ConstantSDNode>(Rnd))
19366 return false;
19368 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19369 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19370 };
19372 SDLoc dl(Op);
19373 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19374 MVT VT = Op.getSimpleValueType();
19375 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19376 if (IntrData) {
19377 switch(IntrData->Type) {
19378 case INTR_TYPE_1OP:
19379 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19380 case INTR_TYPE_2OP:
19381 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19383 case INTR_TYPE_3OP:
19384 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19385 Op.getOperand(2), Op.getOperand(3));
19386 case INTR_TYPE_4OP:
19387 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19388 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19389 case INTR_TYPE_1OP_MASK_RM: {
19390 SDValue Src = Op.getOperand(1);
19391 SDValue PassThru = Op.getOperand(2);
19392 SDValue Mask = Op.getOperand(3);
19393 SDValue RoundingMode;
19394 // We always add rounding mode to the Node.
19395 // If the rounding mode is not specified, we add the
19396 // "current direction" mode.
19397 if (Op.getNumOperands() == 4)
19398 RoundingMode =
19399 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19400 else
19401 RoundingMode = Op.getOperand(4);
19402 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19403 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19404 RoundingMode),
19405 Mask, PassThru, Subtarget, DAG);
19407 case INTR_TYPE_1OP_MASK: {
19408 SDValue Src = Op.getOperand(1);
19409 SDValue PassThru = Op.getOperand(2);
19410 SDValue Mask = Op.getOperand(3);
19411 // We add rounding mode to the Node when
19412 // - RM Opcode is specified and
19413 // - RM is not "current direction".
19414 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19415 if (IntrWithRoundingModeOpcode != 0) {
19416 SDValue Rnd = Op.getOperand(4);
19417 if (!isRoundModeCurDirection(Rnd)) {
19418 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19419 dl, Op.getValueType(),
19420 Src, Rnd),
19421 Mask, PassThru, Subtarget, DAG);
19424 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19425 Mask, PassThru, Subtarget, DAG);
19427 case INTR_TYPE_SCALAR_MASK: {
19428 SDValue Src1 = Op.getOperand(1);
19429 SDValue Src2 = Op.getOperand(2);
19430 SDValue passThru = Op.getOperand(3);
19431 SDValue Mask = Op.getOperand(4);
19432 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19433 if (IntrWithRoundingModeOpcode != 0) {
19434 SDValue Rnd = Op.getOperand(5);
19435 if (!isRoundModeCurDirection(Rnd))
19436 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19437 dl, VT, Src1, Src2, Rnd),
19438 Mask, passThru, Subtarget, DAG);
19440 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19441 Mask, passThru, Subtarget, DAG);
19443 case INTR_TYPE_SCALAR_MASK_RM: {
19444 SDValue Src1 = Op.getOperand(1);
19445 SDValue Src2 = Op.getOperand(2);
19446 SDValue Src0 = Op.getOperand(3);
19447 SDValue Mask = Op.getOperand(4);
19448 // There are 2 kinds of intrinsics in this group:
19449 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19450 // (2) With rounding mode and sae - 7 operands.
19451 if (Op.getNumOperands() == 6) {
19452 SDValue Sae = Op.getOperand(5);
19453 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19454 Sae),
19455 Mask, Src0, Subtarget, DAG);
19457 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19458 SDValue RoundingMode = Op.getOperand(5);
19459 SDValue Sae = Op.getOperand(6);
19460 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19461 RoundingMode, Sae),
19462 Mask, Src0, Subtarget, DAG);
19464 case INTR_TYPE_2OP_MASK:
19465 case INTR_TYPE_2OP_IMM8_MASK: {
19466 SDValue Src1 = Op.getOperand(1);
19467 SDValue Src2 = Op.getOperand(2);
19468 SDValue PassThru = Op.getOperand(3);
19469 SDValue Mask = Op.getOperand(4);
19471 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19472 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19474 // We specify 2 possible opcodes for intrinsics with rounding modes.
19475 // First, we check if the intrinsic may have non-default rounding mode,
19476 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19477 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19478 if (IntrWithRoundingModeOpcode != 0) {
19479 SDValue Rnd = Op.getOperand(5);
19480 if (!isRoundModeCurDirection(Rnd)) {
19481 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19482 dl, Op.getValueType(),
19483 Src1, Src2, Rnd),
19484 Mask, PassThru, Subtarget, DAG);
19487 // TODO: Intrinsics should have fast-math-flags to propagate.
19488 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19489 Mask, PassThru, Subtarget, DAG);
19491 case INTR_TYPE_2OP_MASK_RM: {
19492 SDValue Src1 = Op.getOperand(1);
19493 SDValue Src2 = Op.getOperand(2);
19494 SDValue PassThru = Op.getOperand(3);
19495 SDValue Mask = Op.getOperand(4);
19496 // We specify 2 possible modes for intrinsics, with/without rounding
19497 // mode.
19498 // First, we check if the intrinsic has a rounding mode (6 operands);
19499 // if not, we set the rounding mode to "current".
19500 SDValue Rnd;
19501 if (Op.getNumOperands() == 6)
19502 Rnd = Op.getOperand(5);
19503 else
19504 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19505 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19506 Src1, Src2, Rnd),
19507 Mask, PassThru, Subtarget, DAG);
19509 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19510 SDValue Src1 = Op.getOperand(1);
19511 SDValue Src2 = Op.getOperand(2);
19512 SDValue Src3 = Op.getOperand(3);
19513 SDValue PassThru = Op.getOperand(4);
19514 SDValue Mask = Op.getOperand(5);
19515 SDValue Sae = Op.getOperand(6);
19517 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19518 Src2, Src3, Sae),
19519 Mask, PassThru, Subtarget, DAG);
19521 case INTR_TYPE_3OP_MASK_RM: {
19522 SDValue Src1 = Op.getOperand(1);
19523 SDValue Src2 = Op.getOperand(2);
19524 SDValue Imm = Op.getOperand(3);
19525 SDValue PassThru = Op.getOperand(4);
19526 SDValue Mask = Op.getOperand(5);
19527 // We specify 2 possible modes for intrinsics, with/without rounding
19528 // mode.
19529 // First, we check if the intrinsic has a rounding mode (7 operands);
19530 // if not, we set the rounding mode to "current".
19531 SDValue Rnd;
19532 if (Op.getNumOperands() == 7)
19533 Rnd = Op.getOperand(6);
19534 else
19535 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19536 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19537 Src1, Src2, Imm, Rnd),
19538 Mask, PassThru, Subtarget, DAG);
19540 case INTR_TYPE_3OP_IMM8_MASK:
19541 case INTR_TYPE_3OP_MASK: {
19542 SDValue Src1 = Op.getOperand(1);
19543 SDValue Src2 = Op.getOperand(2);
19544 SDValue Src3 = Op.getOperand(3);
19545 SDValue PassThru = Op.getOperand(4);
19546 SDValue Mask = Op.getOperand(5);
19548 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19549 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19551 // We specify 2 possible opcodes for intrinsics with rounding modes.
19552 // First, we check if the intrinsic may have non-default rounding mode,
19553 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19554 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19555 if (IntrWithRoundingModeOpcode != 0) {
19556 SDValue Rnd = Op.getOperand(6);
19557 if (!isRoundModeCurDirection(Rnd)) {
19558 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19559 dl, Op.getValueType(),
19560 Src1, Src2, Src3, Rnd),
19561 Mask, PassThru, Subtarget, DAG);
19564 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19565 Src1, Src2, Src3),
19566 Mask, PassThru, Subtarget, DAG);
19568 case VPERM_2OP_MASK : {
19569 SDValue Src1 = Op.getOperand(1);
19570 SDValue Src2 = Op.getOperand(2);
19571 SDValue PassThru = Op.getOperand(3);
19572 SDValue Mask = Op.getOperand(4);
19574 // Swap Src1 and Src2 in the node creation
19575 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19576 Mask, PassThru, Subtarget, DAG);
19578 case VPERM_3OP_MASKZ:
19579 case VPERM_3OP_MASK:{
19580 MVT VT = Op.getSimpleValueType();
19581 // Src2 is the PassThru
19582 SDValue Src1 = Op.getOperand(1);
19583 // PassThru needs to be the same type as the destination in order
19584 // to pattern match correctly.
19585 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19586 SDValue Src3 = Op.getOperand(3);
19587 SDValue Mask = Op.getOperand(4);
19588 SDValue PassThru = SDValue();
19590 // set PassThru element
19591 if (IntrData->Type == VPERM_3OP_MASKZ)
19592 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19593 else
19594 PassThru = Src2;
19596 // Swap Src1 and Src2 in the node creation
19597 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19598 dl, Op.getValueType(),
19599 Src2, Src1, Src3),
19600 Mask, PassThru, Subtarget, DAG);
19602 case FMA_OP_MASK3:
19603 case FMA_OP_MASKZ:
19604 case FMA_OP_MASK: {
19605 SDValue Src1 = Op.getOperand(1);
19606 SDValue Src2 = Op.getOperand(2);
19607 SDValue Src3 = Op.getOperand(3);
19608 SDValue Mask = Op.getOperand(4);
19609 MVT VT = Op.getSimpleValueType();
19610 SDValue PassThru = SDValue();
19612 // set PassThru element
19613 if (IntrData->Type == FMA_OP_MASKZ)
19614 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19615 else if (IntrData->Type == FMA_OP_MASK3)
19616 PassThru = Src3;
19617 else
19618 PassThru = Src1;
19620 // We specify 2 possible opcodes for intrinsics with rounding modes.
19621 // First, we check if the intrinsic may have non-default rounding mode,
19622 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19623 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19624 if (IntrWithRoundingModeOpcode != 0) {
19625 SDValue Rnd = Op.getOperand(5);
19626 if (!isRoundModeCurDirection(Rnd))
19627 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19628 dl, Op.getValueType(),
19629 Src1, Src2, Src3, Rnd),
19630 Mask, PassThru, Subtarget, DAG);
19632 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19633 dl, Op.getValueType(),
19634 Src1, Src2, Src3),
19635 Mask, PassThru, Subtarget, DAG);
19637 case FMA_OP_SCALAR_MASK:
19638 case FMA_OP_SCALAR_MASK3:
19639 case FMA_OP_SCALAR_MASKZ: {
19640 SDValue Src1 = Op.getOperand(1);
19641 SDValue Src2 = Op.getOperand(2);
19642 SDValue Src3 = Op.getOperand(3);
19643 SDValue Mask = Op.getOperand(4);
19644 MVT VT = Op.getSimpleValueType();
19645 SDValue PassThru = SDValue();
19647 // set PassThru element
19648 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19649 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19650 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19651 PassThru = Src3;
19652 else
19653 PassThru = Src1;
19655 SDValue Rnd = Op.getOperand(5);
19656 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19657 Op.getValueType(), Src1, Src2,
19658 Src3, Rnd),
19659 Mask, PassThru, Subtarget, DAG);
19661 case TERLOG_OP_MASK:
19662 case TERLOG_OP_MASKZ: {
19663 SDValue Src1 = Op.getOperand(1);
19664 SDValue Src2 = Op.getOperand(2);
19665 SDValue Src3 = Op.getOperand(3);
19666 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19667 SDValue Mask = Op.getOperand(5);
19668 MVT VT = Op.getSimpleValueType();
19669 SDValue PassThru = Src1;
19670 // Set PassThru element.
19671 if (IntrData->Type == TERLOG_OP_MASKZ)
19672 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19674 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19675 Src1, Src2, Src3, Src4),
19676 Mask, PassThru, Subtarget, DAG);
19678 case CVTPD2PS:
19679 // ISD::FP_ROUND has a second argument that indicates if the truncation
19680 // does not change the value. Set it to 0 since it can change.
19681 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19682 DAG.getIntPtrConstant(0, dl));
19683 case CVTPD2PS_MASK: {
19684 SDValue Src = Op.getOperand(1);
19685 SDValue PassThru = Op.getOperand(2);
19686 SDValue Mask = Op.getOperand(3);
19687 // We add rounding mode to the Node when
19688 // - RM Opcode is specified and
19689 // - RM is not "current direction".
19690 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19691 if (IntrWithRoundingModeOpcode != 0) {
19692 SDValue Rnd = Op.getOperand(4);
19693 if (!isRoundModeCurDirection(Rnd)) {
19694 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19695 dl, Op.getValueType(),
19697 Mask, PassThru, Subtarget, DAG);
19700 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19701 // ISD::FP_ROUND has a second argument that indicates if the truncation
19702 // does not change the value. Set it to 0 since it can change.
19703 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19704 DAG.getIntPtrConstant(0, dl)),
19705 Mask, PassThru, Subtarget, DAG);
19707 case FPCLASS: {
19708 // FPclass intrinsics with mask
19709 SDValue Src1 = Op.getOperand(1);
19710 MVT VT = Src1.getSimpleValueType();
19711 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19712 SDValue Imm = Op.getOperand(2);
19713 SDValue Mask = Op.getOperand(3);
19714 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19715 Mask.getSimpleValueType().getSizeInBits());
19716 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19717 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19718 DAG.getTargetConstant(0, dl, MaskVT),
19720 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19721 DAG.getUNDEF(BitcastVT), FPclassMask,
19722 DAG.getIntPtrConstant(0, dl));
19723 return DAG.getBitcast(Op.getValueType(), Res);
19725 case FPCLASSS: {
19726 SDValue Src1 = Op.getOperand(1);
19727 SDValue Imm = Op.getOperand(2);
19728 SDValue Mask = Op.getOperand(3);
19729 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19730 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19731 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19732 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
19733 DAG.getIntPtrConstant(0, dl));
19735 case CMP_MASK:
19736 case CMP_MASK_CC: {
19737 // Comparison intrinsics with masks.
19738 // Example of transformation:
19739 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19740 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19742 // (v8i1 (insert_subvector undef,
19743 // (v2i1 (and (PCMPEQM %a, %b),
19744 // (extract_subvector
19745 // (v8i1 (bitcast %mask)), 0))), 0))))
19746 MVT VT = Op.getOperand(1).getSimpleValueType();
19747 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19748 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19749 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19750 Mask.getSimpleValueType().getSizeInBits());
19751 SDValue Cmp;
19752 if (IntrData->Type == CMP_MASK_CC) {
19753 SDValue CC = Op.getOperand(3);
19754 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19755 // We specify 2 possible opcodes for intrinsics with rounding modes.
19756 // First, we check if the intrinsic may have non-default rounding mode,
19757 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19758 if (IntrData->Opc1 != 0) {
19759 SDValue Rnd = Op.getOperand(5);
19760 if (!isRoundModeCurDirection(Rnd))
19761 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19762 Op.getOperand(2), CC, Rnd);
19764 // Default rounding mode.
19765 if (!Cmp.getNode())
19766 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19767 Op.getOperand(2), CC);
19770 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19771 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19774 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19775 DAG.getTargetConstant(0, dl,
19778 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19779 DAG.getUNDEF(BitcastVT), CmpMask,
19780 DAG.getIntPtrConstant(0, dl));
19781 return DAG.getBitcast(Op.getValueType(), Res);
19783 case CMP_MASK_SCALAR_CC: {
19784 SDValue Src1 = Op.getOperand(1);
19785 SDValue Src2 = Op.getOperand(2);
19786 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19787 SDValue Mask = Op.getOperand(4);
19789 SDValue Cmp;
19790 if (IntrData->Opc1 != 0) {
19791 SDValue Rnd = Op.getOperand(5);
19792 if (!isRoundModeCurDirection(Rnd))
19793 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
19795 // Default rounding mode.
19796 if (!Cmp.getNode())
19797 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
19799 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19800 DAG.getTargetConstant(0, dl,
19803 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
19804 DAG.getIntPtrConstant(0, dl));
19806 case COMI: { // Comparison intrinsics
19807 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19808 SDValue LHS = Op.getOperand(1);
19809 SDValue RHS = Op.getOperand(2);
19810 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19811 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19812 SDValue SetCC;
19813 switch (CC) {
19814 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19815 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19816 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19817 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19820 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19821 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19822 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19823 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19826 case ISD::SETGT: // (CF = 0 and ZF = 0)
19827 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19829 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19830 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19833 case ISD::SETGE: // CF = 0
19834 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19836 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19837 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19840 llvm_unreachable("Unexpected illegal condition!");
19842 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19844 case COMI_RM: { // Comparison intrinsics with Sae
19845 SDValue LHS = Op.getOperand(1);
19846 SDValue RHS = Op.getOperand(2);
19847 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19848 SDValue Sae = Op.getOperand(4);
19850 SDValue FCmp;
19851 if (isRoundModeCurDirection(Sae))
19852 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
19853 DAG.getConstant(CondVal, dl, MVT::i8));
19854 else
19855 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19856 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19857 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
19858 DAG.getIntPtrConstant(0, dl));
19860 case VSHIFT:
19861 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19862 Op.getOperand(1), Op.getOperand(2), Subtarget,
19863 DAG);
19864 case COMPRESS_EXPAND_IN_REG: {
19865 SDValue Mask = Op.getOperand(3);
19866 SDValue DataToCompress = Op.getOperand(1);
19867 SDValue PassThru = Op.getOperand(2);
19868 if (isAllOnesConstant(Mask)) // return data as is
19869 return Op.getOperand(1);
19871 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19872 DataToCompress),
19873 Mask, PassThru, Subtarget, DAG);
19876 SDValue Mask = Op.getOperand(1);
19877 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19878 Mask.getSimpleValueType().getSizeInBits());
19879 Mask = DAG.getBitcast(MaskVT, Mask);
19880 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19883 MVT VT = Op.getSimpleValueType();
19884 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19886 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19887 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19888 // Arguments should be swapped.
19889 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19890 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19892 return DAG.getBitcast(VT, Res);
19895 MVT VT = Op.getSimpleValueType();
19896 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19898 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19899 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19900 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19901 return DAG.getBitcast(VT, Res);
19903 case FIXUPIMMS:
19904 case FIXUPIMMS_MASKZ:
19905 case FIXUPIMM:
19906 case FIXUPIMM_MASKZ: {
19907 SDValue Src1 = Op.getOperand(1);
19908 SDValue Src2 = Op.getOperand(2);
19909 SDValue Src3 = Op.getOperand(3);
19910 SDValue Imm = Op.getOperand(4);
19911 SDValue Mask = Op.getOperand(5);
19912 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19913 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19914 // We specify 2 possible modes for intrinsics, with/without rounding
19915 // mode.
19916 // First, we check if the intrinsic has a rounding mode (7 operands);
19917 // if not, we set the rounding mode to "current".
19918 SDValue Rnd;
19919 if (Op.getNumOperands() == 7)
19920 Rnd = Op.getOperand(6);
19921 else
19922 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19923 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19924 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19925 Src1, Src2, Src3, Imm, Rnd),
19926 Mask, Passthru, Subtarget, DAG);
19927 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19928 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19929 Src1, Src2, Src3, Imm, Rnd),
19930 Mask, Passthru, Subtarget, DAG);
19932 case CONVERT_TO_MASK: {
19933 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19934 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19935 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19937 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19939 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19940 DAG.getUNDEF(BitcastVT), CvtMask,
19941 DAG.getIntPtrConstant(0, dl));
19942 return DAG.getBitcast(Op.getValueType(), Res);
19944 case BRCST_SUBVEC_TO_VEC: {
19945 SDValue Src = Op.getOperand(1);
19946 SDValue Passthru = Op.getOperand(2);
19947 SDValue Mask = Op.getOperand(3);
19948 EVT resVT = Passthru.getValueType();
19949 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19950 DAG.getUNDEF(resVT), Src,
19951 DAG.getIntPtrConstant(0, dl));
19953 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19954 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19956 immVal = DAG.getConstant(0, dl, MVT::i8);
19957 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19958 subVec, subVec, immVal),
19959 Mask, Passthru, Subtarget, DAG);
19961 case BRCST32x2_TO_VEC: {
19962 SDValue Src = Op.getOperand(1);
19963 SDValue PassThru = Op.getOperand(2);
19964 SDValue Mask = Op.getOperand(3);
19966 assert((VT.getScalarType() == MVT::i32 ||
19967 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19968 // Bitcast Src to a vector of packed 64-bit elements.
19969 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19970 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19971 Src = DAG.getBitcast(BitcastVT, Src);
19973 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19974 Mask, PassThru, Subtarget, DAG);
19982 default: return SDValue(); // Don't custom lower most intrinsics.
19984 case Intrinsic::x86_avx2_permd:
19985 case Intrinsic::x86_avx2_permps:
19986 // Operands intentionally swapped. Mask is last operand to intrinsic,
19987 // but second operand for node/instruction.
19988 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19989 Op.getOperand(2), Op.getOperand(1));
19991 // ptest and testp intrinsics. The intrinsics these come from are designed to
19992 // return an integer value, not just an instruction, so lower them to the ptest
19993 // or testp pattern and a setcc for the result.
19994 case Intrinsic::x86_sse41_ptestz:
19995 case Intrinsic::x86_sse41_ptestc:
19996 case Intrinsic::x86_sse41_ptestnzc:
19997 case Intrinsic::x86_avx_ptestz_256:
19998 case Intrinsic::x86_avx_ptestc_256:
19999 case Intrinsic::x86_avx_ptestnzc_256:
20000 case Intrinsic::x86_avx_vtestz_ps:
20001 case Intrinsic::x86_avx_vtestc_ps:
20002 case Intrinsic::x86_avx_vtestnzc_ps:
20003 case Intrinsic::x86_avx_vtestz_pd:
20004 case Intrinsic::x86_avx_vtestc_pd:
20005 case Intrinsic::x86_avx_vtestnzc_pd:
20006 case Intrinsic::x86_avx_vtestz_ps_256:
20007 case Intrinsic::x86_avx_vtestc_ps_256:
20008 case Intrinsic::x86_avx_vtestnzc_ps_256:
20009 case Intrinsic::x86_avx_vtestz_pd_256:
20010 case Intrinsic::x86_avx_vtestc_pd_256:
20011 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20012 bool IsTestPacked = false;
20013 X86::CondCode X86CC;
20015 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20016 case Intrinsic::x86_avx_vtestz_ps:
20017 case Intrinsic::x86_avx_vtestz_pd:
20018 case Intrinsic::x86_avx_vtestz_ps_256:
20019 case Intrinsic::x86_avx_vtestz_pd_256:
20020 IsTestPacked = true;
20022 case Intrinsic::x86_sse41_ptestz:
20023 case Intrinsic::x86_avx_ptestz_256:
20025 X86CC = X86::COND_E;
20027 case Intrinsic::x86_avx_vtestc_ps:
20028 case Intrinsic::x86_avx_vtestc_pd:
20029 case Intrinsic::x86_avx_vtestc_ps_256:
20030 case Intrinsic::x86_avx_vtestc_pd_256:
20031 IsTestPacked = true;
20033 case Intrinsic::x86_sse41_ptestc:
20034 case Intrinsic::x86_avx_ptestc_256:
20036 X86CC = X86::COND_B;
20038 case Intrinsic::x86_avx_vtestnzc_ps:
20039 case Intrinsic::x86_avx_vtestnzc_pd:
20040 case Intrinsic::x86_avx_vtestnzc_ps_256:
20041 case Intrinsic::x86_avx_vtestnzc_pd_256:
20042 IsTestPacked = true;
20044 case Intrinsic::x86_sse41_ptestnzc:
20045 case Intrinsic::x86_avx_ptestnzc_256:
20047 X86CC = X86::COND_A;
20051 SDValue LHS = Op.getOperand(1);
20052 SDValue RHS = Op.getOperand(2);
20053 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20054 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20055 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20056 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20058 case Intrinsic::x86_avx512_kortestz_w:
20059 case Intrinsic::x86_avx512_kortestc_w: {
20060 X86::CondCode X86CC =
20061 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20062 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20063 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20064 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20065 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20066 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20069 case Intrinsic::x86_avx512_knot_w: {
20070 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20071 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20072 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20073 return DAG.getBitcast(MVT::i16, Res);
20076 case Intrinsic::x86_avx512_kandn_w: {
20077 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20078 // Invert LHS for the not.
20079 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20080 DAG.getConstant(1, dl, MVT::v16i1));
20081 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20082 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20083 return DAG.getBitcast(MVT::i16, Res);
20086 case Intrinsic::x86_avx512_kxnor_w: {
20087 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20088 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20089 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20090 // Invert result for the not.
20091 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20092 DAG.getConstant(1, dl, MVT::v16i1));
20093 return DAG.getBitcast(MVT::i16, Res);
20096 case Intrinsic::x86_sse42_pcmpistria128:
20097 case Intrinsic::x86_sse42_pcmpestria128:
20098 case Intrinsic::x86_sse42_pcmpistric128:
20099 case Intrinsic::x86_sse42_pcmpestric128:
20100 case Intrinsic::x86_sse42_pcmpistrio128:
20101 case Intrinsic::x86_sse42_pcmpestrio128:
20102 case Intrinsic::x86_sse42_pcmpistris128:
20103 case Intrinsic::x86_sse42_pcmpestris128:
20104 case Intrinsic::x86_sse42_pcmpistriz128:
20105 case Intrinsic::x86_sse42_pcmpestriz128: {
20107 X86::CondCode X86CC;
20109 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20110 case Intrinsic::x86_sse42_pcmpistria128:
20111 Opcode = X86ISD::PCMPISTRI;
20112 X86CC = X86::COND_A;
20114 case Intrinsic::x86_sse42_pcmpestria128:
20115 Opcode = X86ISD::PCMPESTRI;
20116 X86CC = X86::COND_A;
20118 case Intrinsic::x86_sse42_pcmpistric128:
20119 Opcode = X86ISD::PCMPISTRI;
20120 X86CC = X86::COND_B;
20122 case Intrinsic::x86_sse42_pcmpestric128:
20123 Opcode = X86ISD::PCMPESTRI;
20124 X86CC = X86::COND_B;
20126 case Intrinsic::x86_sse42_pcmpistrio128:
20127 Opcode = X86ISD::PCMPISTRI;
20128 X86CC = X86::COND_O;
20130 case Intrinsic::x86_sse42_pcmpestrio128:
20131 Opcode = X86ISD::PCMPESTRI;
20132 X86CC = X86::COND_O;
20134 case Intrinsic::x86_sse42_pcmpistris128:
20135 Opcode = X86ISD::PCMPISTRI;
20136 X86CC = X86::COND_S;
20138 case Intrinsic::x86_sse42_pcmpestris128:
20139 Opcode = X86ISD::PCMPESTRI;
20140 X86CC = X86::COND_S;
20142 case Intrinsic::x86_sse42_pcmpistriz128:
20143 Opcode = X86ISD::PCMPISTRI;
20144 X86CC = X86::COND_E;
20146 case Intrinsic::x86_sse42_pcmpestriz128:
20147 Opcode = X86ISD::PCMPESTRI;
20148 X86CC = X86::COND_E;
20151 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20152 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20153 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20154 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20155 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20158 case Intrinsic::x86_sse42_pcmpistri128:
20159 case Intrinsic::x86_sse42_pcmpestri128: {
20161 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20162 Opcode = X86ISD::PCMPISTRI;
20164 Opcode = X86ISD::PCMPESTRI;
20166 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20167 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20168 return DAG.getNode(Opcode, dl, VTs, NewOps);
20171 case Intrinsic::eh_sjlj_lsda: {
20172 MachineFunction &MF = DAG.getMachineFunction();
20173 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20174 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20175 auto &Context = MF.getMMI().getContext();
20176 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20177 Twine(MF.getFunctionNumber()));
20178 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20181 case Intrinsic::x86_seh_lsda: {
20182 // Compute the symbol for the LSDA. We know it'll get emitted later.
20183 MachineFunction &MF = DAG.getMachineFunction();
20184 SDValue Op1 = Op.getOperand(1);
20185 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20186 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20187 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20189 // Generate a simple absolute symbol reference. This intrinsic is only
20190 // supported on 32-bit Windows, which isn't PIC.
20191 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20192 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20195 case Intrinsic::x86_seh_recoverfp: {
20196 SDValue FnOp = Op.getOperand(1);
20197 SDValue IncomingFPOp = Op.getOperand(2);
20198 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20199 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20201 report_fatal_error(
20202 "llvm.x86.seh.recoverfp must take a function as the first argument");
20203 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20206 case Intrinsic::localaddress: {
20207 // Returns one of the stack, base, or frame pointer registers, depending on
20208 // which is used to reference local variables.
20209 MachineFunction &MF = DAG.getMachineFunction();
20210 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20212 if (RegInfo->hasBasePointer(MF))
20213 Reg = RegInfo->getBaseRegister();
20214 else // This function handles the SP or FP case.
20215 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20216 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
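// Helper for lowering AVX2 gather intrinsics. The scale operand must be a
// constant. An undef (or known-unused) source is replaced by a zero vector to
// break the false output-register dependency; the machine node is created
// directly and { gathered value, chain } is returned.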
20221 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20222 SDValue Src, SDValue Mask, SDValue Base,
20223 SDValue Index, SDValue ScaleOp, SDValue Chain,
20224 const X86Subtarget &Subtarget) {
20226 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20227 // Scale must be constant.
20230 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20231 EVT MaskVT = Mask.getValueType();
20232 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20233 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20234 SDValue Segment = DAG.getRegister(0, MVT::i32);
20235 // If source is undef or we know it won't be used, use a zero vector
20236 // to break register dependency.
20237 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20238 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20239 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20240 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20241 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20242 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20243 return DAG.getMergeValues(RetOps, dl);
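// Helper for lowering AVX-512 gather intrinsics. Like the AVX2 variant above,
// but the integer mask operand is first normalized to a vXi1 mask value.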
20246 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20247 SDValue Src, SDValue Mask, SDValue Base,
20248 SDValue Index, SDValue ScaleOp, SDValue Chain,
20249 const X86Subtarget &Subtarget) {
20251 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20252 // Scale must be constant.
20255 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20256 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20257 Index.getSimpleValueType().getVectorNumElements());
20259 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20260 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20261 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20262 SDValue Segment = DAG.getRegister(0, MVT::i32);
20263 // If source is undef or we know it won't be used, use a zero vector
20264 // to break register dependency.
20265 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20266 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20267 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20268 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20269 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20270 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20271 return DAG.getMergeValues(RetOps, dl);
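// Helper for lowering AVX-512 scatter intrinsics: requires a constant scale,
// converts the mask to vXi1, and returns the chain of the emitted machine node.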
20274 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20275 SDValue Src, SDValue Mask, SDValue Base,
20276 SDValue Index, SDValue ScaleOp, SDValue Chain,
20277 const X86Subtarget &Subtarget) {
20279 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20280 // Scale must be constant.
20283 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20284 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20285 SDValue Segment = DAG.getRegister(0, MVT::i32);
20286 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20287 Index.getSimpleValueType().getVectorNumElements());
20289 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20290 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20291 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20292 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20293 return SDValue(Res, 1);
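// Helper for lowering masked gather/scatter prefetch intrinsics; these produce
// no value, only a chained side effect.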
20296 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20297 SDValue Mask, SDValue Base, SDValue Index,
20298 SDValue ScaleOp, SDValue Chain,
20299 const X86Subtarget &Subtarget) {
20301 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20302 // Scale must be constant.
20305 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20306 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20307 SDValue Segment = DAG.getRegister(0, MVT::i32);
20309 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20310 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20311 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20312 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20313 return SDValue(Res, 0);
20316 /// Handles the lowering of builtin intrinsics that return the value
20317 /// of the extended control register.
20318 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20320 const X86Subtarget &Subtarget,
20321 SmallVectorImpl<SDValue> &Results) {
20322 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20323 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20326 // The ECX register is used to select the index of the XCR register to
20329 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20330 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20331 Chain = SDValue(N1, 0);
20333 // Reads the content of XCR and returns it in registers EDX:EAX.
20334 if (Subtarget.is64Bit()) {
20335 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20336 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20339 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20340 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20343 Chain = HI.getValue(1);
20345 if (Subtarget.is64Bit()) {
20346 // Merge the two 32-bit values into a 64-bit one.
20347 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20348 DAG.getConstant(32, DL, MVT::i8));
20349 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20350 Results.push_back(Chain);
20354 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20355 SDValue Ops[] = { LO, HI };
20356 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20357 Results.push_back(Pair);
20358 Results.push_back(Chain);
20361 /// Handles the lowering of builtin intrinsics that read performance monitor
20362 /// counters (x86_rdpmc).
20363 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20365 const X86Subtarget &Subtarget,
20366 SmallVectorImpl<SDValue> &Results) {
20367 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20368 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20371 // The ECX register is used to select the index of the performance counter
20373 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20375 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20377 // Reads the content of a 64-bit performance counter and returns it in the
20378 // registers EDX:EAX.
20379 if (Subtarget.is64Bit()) {
20380 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20381 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20384 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20385 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20388 Chain = HI.getValue(1);
20390 if (Subtarget.is64Bit()) {
20391 // The EAX register is loaded with the low-order 32 bits. The EDX register
20392 // is loaded with the supported high-order bits of the counter.
20393 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20394 DAG.getConstant(32, DL, MVT::i8));
20395 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20396 Results.push_back(Chain);
20400 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20401 SDValue Ops[] = { LO, HI };
20402 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20403 Results.push_back(Pair);
20404 Results.push_back(Chain);
20407 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20408 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20409 /// READCYCLECOUNTER nodes.
20410 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20412 const X86Subtarget &Subtarget,
20413 SmallVectorImpl<SDValue> &Results) {
20414 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20415 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20418 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20419 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20420 // and the EAX register is loaded with the low-order 32 bits.
20421 if (Subtarget.is64Bit()) {
20422 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20423 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20426 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20427 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20430 SDValue Chain = HI.getValue(1);
20432 if (Opcode == X86ISD::RDTSCP_DAG) {
20433 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20435 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20436 // the ECX register. Add 'ecx' explicitly to the chain.
20437 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20439 // Explicitly store the content of ECX at the location passed as input
20440 // to the 'rdtscp' intrinsic.
20441 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20442 MachinePointerInfo());
20445 if (Subtarget.is64Bit()) {
20446 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20447 // the EAX register is loaded with the low-order 32 bits.
20448 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20449 DAG.getConstant(32, DL, MVT::i8));
20450 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20451 Results.push_back(Chain);
20455 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20456 SDValue Ops[] = { LO, HI };
20457 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20458 Results.push_back(Pair);
20459 Results.push_back(Chain);
20462 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20463 SelectionDAG &DAG) {
20464 SmallVector<SDValue, 2> Results;
20466 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20468 return DAG.getMergeValues(Results, DL);
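// Record the frame index of the (static alloca) EH registration node in
// WinEHFuncInfo; no DAG nodes are created, only the incoming chain is returned.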
20471 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20472 MachineFunction &MF = DAG.getMachineFunction();
20473 SDValue Chain = Op.getOperand(0);
20474 SDValue RegNode = Op.getOperand(2);
20475 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20477 report_fatal_error("EH registrations only live in functions using WinEH");
20479 // Cast the operand to an alloca, and remember the frame index.
20480 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20482 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20483 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20485 // Return the chain operand without making any DAG nodes.
20489 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20490 MachineFunction &MF = DAG.getMachineFunction();
20491 SDValue Chain = Op.getOperand(0);
20492 SDValue EHGuard = Op.getOperand(2);
20493 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20495 report_fatal_error("EHGuard only lives in functions using WinEH");
20497 // Cast the operand to an alloca, and remember the frame index.
20498 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20500 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20501 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20503 // Return the chain operand without making any DAG nodes.
20507 /// Emit Truncating Store with signed or unsigned saturation.
20509 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20510 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20511 SelectionDAG &DAG) {
20513 SDVTList VTs = DAG.getVTList(MVT::Other);
20514 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20515 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20517 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20518 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20521 /// Emit Masked Truncating Store with signed or unsigned saturation.
20523 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20524 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20525 MachineMemOperand *MMO, SelectionDAG &DAG) {
20527 SDVTList VTs = DAG.getVTList(MVT::Other);
20528 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20530 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20531 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
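// Lower target intrinsics that carry a chain (memory-accessing or otherwise
// side-effecting intrinsics). Intrinsics without an IntrinsicData entry are
// handled individually first; the remainder dispatch on IntrData->Type.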
20534 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20535 SelectionDAG &DAG) {
20536 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20538 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20541 case llvm::Intrinsic::x86_seh_ehregnode:
20542 return MarkEHRegistrationNode(Op, DAG);
20543 case llvm::Intrinsic::x86_seh_ehguard:
20544 return MarkEHGuard(Op, DAG);
20545 case llvm::Intrinsic::x86_flags_read_u32:
20546 case llvm::Intrinsic::x86_flags_read_u64:
20547 case llvm::Intrinsic::x86_flags_write_u32:
20548 case llvm::Intrinsic::x86_flags_write_u64: {
20549 // We need a frame pointer because this will get lowered to a PUSH/POP sequence.
20551 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20552 MFI.setHasCopyImplyingStackAdjustment(true);
20553 // Don't do anything here, we will expand these intrinsics out later
20554 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20557 case Intrinsic::x86_lwpins32:
20558 case Intrinsic::x86_lwpins64: {
20560 SDValue Chain = Op->getOperand(0);
20561 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20563 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20564 Op->getOperand(3), Op->getOperand(4));
20565 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20566 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20567 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20568 LwpIns.getValue(1));
20575 switch(IntrData->Type) {
20576 default: llvm_unreachable("Unknown Intrinsic Type");
20579 // Emit the node with the right value type.
20580 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20581 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20583 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20584 // Otherwise return the value from Rand, which is always 0, cast to i32.
20585 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20586 DAG.getConstant(1, dl, Op->getValueType(1)),
20587 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20588 SDValue(Result.getNode(), 1) };
20589 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20590 DAG.getVTList(Op->getValueType(1), MVT::Glue),
20593 // Return { result, isValid, chain }.
20594 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20595 SDValue(Result.getNode(), 2));
20597 case GATHER_AVX2: {
20598 SDValue Chain = Op.getOperand(0);
20599 SDValue Src = Op.getOperand(2);
20600 SDValue Base = Op.getOperand(3);
20601 SDValue Index = Op.getOperand(4);
20602 SDValue Mask = Op.getOperand(5);
20603 SDValue Scale = Op.getOperand(6);
20604 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20605 Scale, Chain, Subtarget);
20608 // gather(v1, mask, index, base, scale);
20609 SDValue Chain = Op.getOperand(0);
20610 SDValue Src = Op.getOperand(2);
20611 SDValue Base = Op.getOperand(3);
20612 SDValue Index = Op.getOperand(4);
20613 SDValue Mask = Op.getOperand(5);
20614 SDValue Scale = Op.getOperand(6);
20615 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20619 // scatter(base, mask, index, v1, scale);
20620 SDValue Chain = Op.getOperand(0);
20621 SDValue Base = Op.getOperand(2);
20622 SDValue Mask = Op.getOperand(3);
20623 SDValue Index = Op.getOperand(4);
20624 SDValue Src = Op.getOperand(5);
20625 SDValue Scale = Op.getOperand(6);
20626 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20627 Scale, Chain, Subtarget);
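// Masked gather/scatter prefetch: the locality hint operand (2 or 3) selects
// which of the two prefetch opcodes is emitted.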
20630 SDValue Hint = Op.getOperand(6);
20631 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20632 assert((HintVal == 2 || HintVal == 3) &&
20633 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20634 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20635 SDValue Chain = Op.getOperand(0);
20636 SDValue Mask = Op.getOperand(2);
20637 SDValue Index = Op.getOperand(3);
20638 SDValue Base = Op.getOperand(4);
20639 SDValue Scale = Op.getOperand(5);
20640 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20643 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20645 SmallVector<SDValue, 2> Results;
20646 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20648 return DAG.getMergeValues(Results, dl);
20650 // Read Performance Monitoring Counters.
20652 SmallVector<SDValue, 2> Results;
20653 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20654 return DAG.getMergeValues(Results, dl);
20656 // Get Extended Control Register.
20658 SmallVector<SDValue, 2> Results;
20659 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20660 return DAG.getMergeValues(Results, dl);
20662 // XTEST intrinsics.
20664 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20665 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20667 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20668 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20669 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20670 Ret, SDValue(InTrans.getNode(), 1));
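// Carry-flag (addcarry/subborrow style) intrinsics: regenerate CF from the
// incoming carry byte by adding -1 to it, feed that flag into the
// add/sub-with-carry node, store the arithmetic result through the pointer
// operand, and return the outgoing carry as a setcc.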
20674 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
20675 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
20676 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20677 DAG.getConstant(-1, dl, MVT::i8));
20678 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20679 Op.getOperand(4), GenCF.getValue(1));
20680 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20681 Op.getOperand(5), MachinePointerInfo());
20682 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20683 SDValue Results[] = { SetCC, Store };
20684 return DAG.getMergeValues(Results, dl);
20686 case COMPRESS_TO_MEM: {
20687 SDValue Mask = Op.getOperand(4);
20688 SDValue DataToCompress = Op.getOperand(3);
20689 SDValue Addr = Op.getOperand(2);
20690 SDValue Chain = Op.getOperand(0);
20691 MVT VT = DataToCompress.getSimpleValueType();
20693 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20694 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20696 if (isAllOnesConstant(Mask)) // return just a store
20697 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20698 MemIntr->getMemOperand());
20700 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20701 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20703 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20704 MemIntr->getMemOperand(),
20705 false /* truncating */, true /* compressing */);
20707 case TRUNCATE_TO_MEM_VI8:
20708 case TRUNCATE_TO_MEM_VI16:
20709 case TRUNCATE_TO_MEM_VI32: {
20710 SDValue Mask = Op.getOperand(4);
20711 SDValue DataToTruncate = Op.getOperand(3);
20712 SDValue Addr = Op.getOperand(2);
20713 SDValue Chain = Op.getOperand(0);
20715 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20716 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20718 EVT MemVT = MemIntr->getMemoryVT();
20720 uint16_t TruncationOp = IntrData->Opc0;
20721 switch (TruncationOp) {
20722 case X86ISD::VTRUNC: {
20723 if (isAllOnesConstant(Mask)) // return just a truncate store
20724 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20725 MemIntr->getMemOperand());
20727 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20728 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20730 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20731 MemIntr->getMemOperand(), true /* truncating */);
20733 case X86ISD::VTRUNCUS:
20734 case X86ISD::VTRUNCS: {
20735 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20736 if (isAllOnesConstant(Mask))
20737 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20738 MemIntr->getMemOperand(), DAG);
20740 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20741 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20743 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20744 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20747 llvm_unreachable("Unsupported truncstore intrinsic");
20751 case EXPAND_FROM_MEM: {
20752 SDValue Mask = Op.getOperand(4);
20753 SDValue PassThru = Op.getOperand(3);
20754 SDValue Addr = Op.getOperand(2);
20755 SDValue Chain = Op.getOperand(0);
20756 MVT VT = Op.getSimpleValueType();
20758 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20759 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20761 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20762 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20763 if (X86::isZeroNode(Mask))
20764 return DAG.getUNDEF(VT);
20766 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20767 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20768 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20769 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20770 true /* expanding */);
20775 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20776 SelectionDAG &DAG) const {
20777 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20778 MFI.setReturnAddressIsTaken(true);
20780 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20783 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20785 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20788 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20789 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20790 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20791 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20792 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20793 MachinePointerInfo());
20796 // Just load the return address.
20797 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20798 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20799 MachinePointerInfo());
20802 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20803 SelectionDAG &DAG) const {
20804 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20805 return getReturnAddressFrameIndex(DAG);
20808 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20809 MachineFunction &MF = DAG.getMachineFunction();
20810 MachineFrameInfo &MFI = MF.getFrameInfo();
20811 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20812 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20813 EVT VT = Op.getValueType();
20815 MFI.setFrameAddressIsTaken(true);
20817 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20818 // Depth > 0 makes no sense on targets that use Windows unwind codes. It
20819 // is not possible to crawl up the stack without looking at the unwind codes simultaneously.
20821 int FrameAddrIndex = FuncInfo->getFAIndex();
20822 if (!FrameAddrIndex) {
20823 // Set up a frame object for the return address.
20824 unsigned SlotSize = RegInfo->getSlotSize();
20825 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20826 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20827 FuncInfo->setFAIndex(FrameAddrIndex);
20829 return DAG.getFrameIndex(FrameAddrIndex, VT);
20832 unsigned FrameReg =
20833 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20834 SDLoc dl(Op); // FIXME probably not meaningful
20835 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20836 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20837 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20838 "Invalid Frame Register!");
20839 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20841 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20842 MachinePointerInfo());
20846 // FIXME? Maybe this could be a TableGen attribute on some registers and
20847 // this table could be generated automatically from RegInfo.
20848 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20849 SelectionDAG &DAG) const {
20850 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20851 const MachineFunction &MF = DAG.getMachineFunction();
20853 unsigned Reg = StringSwitch<unsigned>(RegName)
20854 .Case("esp", X86::ESP)
20855 .Case("rsp", X86::RSP)
20856 .Case("ebp", X86::EBP)
20857 .Case("rbp", X86::RBP)
20860 if (Reg == X86::EBP || Reg == X86::RBP) {
20861 if (!TFI.hasFP(MF))
20862 report_fatal_error("register " + StringRef(RegName) +
20863 " is allocatable: function has no frame pointer");
20866 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20867 unsigned FrameReg =
20868 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20869 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20870 "Invalid Frame Register!");
20878 report_fatal_error("Invalid register name global variable");
20881 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20882 SelectionDAG &DAG) const {
20883 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20884 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20887 unsigned X86TargetLowering::getExceptionPointerRegister(
20888 const Constant *PersonalityFn) const {
20889 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20890 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20892 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20895 unsigned X86TargetLowering::getExceptionSelectorRegister(
20896 const Constant *PersonalityFn) const {
20897 // Funclet personalities don't use selectors (the runtime does the selection).
20898 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20899 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20902 bool X86TargetLowering::needsFixedCatchObjects() const {
20903 return Subtarget.isTargetWin64();
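// Lower EH_RETURN: store the handler address into the parent frame's
// return-address slot (frame pointer + slot size, adjusted by the offset
// operand) and emit the X86 EH_RETURN node with that slot address in RCX/ECX.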
20906 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20907 SDValue Chain = Op.getOperand(0);
20908 SDValue Offset = Op.getOperand(1);
20909 SDValue Handler = Op.getOperand(2);
20912 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20913 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20914 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20915 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20916 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20917 "Invalid Frame Register!");
20918 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20919 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20921 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20922 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20924 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20925 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20926 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20928 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20929 DAG.getRegister(StoreAddrReg, PtrVT));
20932 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20933 SelectionDAG &DAG) const {
20935 // If the subtarget is not 64-bit, we may need the global base reg
20936 // after isel expands pseudos, i.e., after the CGBR pass has run.
20937 // Therefore, ask for the GlobalBaseReg now, so that the pass
20938 // inserts the code for us in case we need it.
20939 // Otherwise, we will end up in a situation where we will
20940 // reference a virtual register that is not defined!
20941 if (!Subtarget.is64Bit()) {
20942 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20943 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20945 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20946 DAG.getVTList(MVT::i32, MVT::Other),
20947 Op.getOperand(0), Op.getOperand(1));
20950 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20951 SelectionDAG &DAG) const {
20953 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20954 Op.getOperand(0), Op.getOperand(1));
20957 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20958 SelectionDAG &DAG) const {
20960 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20964 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20965 return Op.getOperand(0);
20968 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20969 SelectionDAG &DAG) const {
20970 SDValue Root = Op.getOperand(0);
20971 SDValue Trmp = Op.getOperand(1); // trampoline
20972 SDValue FPtr = Op.getOperand(2); // nested function
20973 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20976 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20977 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20979 if (Subtarget.is64Bit()) {
20980 SDValue OutChains[6];
20982 // Large code-model.
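// The emitted 64-bit trampoline is:
//   movabsq $<nested function>, %r11    (offsets 0-9)
//   movabsq $<nest value>, %r10         (offsets 10-19)
//   jmpq    *%r11                       (offsets 20-22)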
20983 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20984 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20986 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20987 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20989 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20991 // Load the pointer to the nested function into R11.
20992 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20993 SDValue Addr = Trmp;
20994 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20995 Addr, MachinePointerInfo(TrmpAddr));
20997 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20998 DAG.getConstant(2, dl, MVT::i64));
21000 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21001 /* Alignment = */ 2);
21003 // Load the 'nest' parameter value into R10.
21004 // R10 is specified in X86CallingConv.td
21005 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21006 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21007 DAG.getConstant(10, dl, MVT::i64));
21008 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21009 Addr, MachinePointerInfo(TrmpAddr, 10));
21011 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21012 DAG.getConstant(12, dl, MVT::i64));
21014 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21015 /* Alignment = */ 2);
21017 // Jump to the nested function.
21018 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21019 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21020 DAG.getConstant(20, dl, MVT::i64));
21021 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21022 Addr, MachinePointerInfo(TrmpAddr, 20));
21024 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21025 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21026 DAG.getConstant(22, dl, MVT::i64));
21027 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21028 Addr, MachinePointerInfo(TrmpAddr, 22));
21030 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21032 const Function *Func =
21033 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21034 CallingConv::ID CC = Func->getCallingConv();
21039 llvm_unreachable("Unsupported calling convention");
21040 case CallingConv::C:
21041 case CallingConv::X86_StdCall: {
21042 // Pass 'nest' parameter in ECX.
21043 // Must be kept in sync with X86CallingConv.td
21044 NestReg = X86::ECX;
21046 // Check that ECX wasn't needed by an 'inreg' parameter.
21047 FunctionType *FTy = Func->getFunctionType();
21048 const AttributeList &Attrs = Func->getAttributes();
21050 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21051 unsigned InRegCount = 0;
21054 for (FunctionType::param_iterator I = FTy->param_begin(),
21055 E = FTy->param_end(); I != E; ++I, ++Idx)
21056 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21057 auto &DL = DAG.getDataLayout();
21058 // FIXME: should only count parameters that are lowered to integers.
21059 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21062 if (InRegCount > 2) {
21063 report_fatal_error("Nest register in use - reduce number of inreg parameters!");
21069 case CallingConv::X86_FastCall:
21070 case CallingConv::X86_ThisCall:
21071 case CallingConv::Fast:
21072 // Pass 'nest' parameter in EAX.
21073 // Must be kept in sync with X86CallingConv.td
21074 NestReg = X86::EAX;
21078 SDValue OutChains[4];
21079 SDValue Addr, Disp;
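// The emitted 32-bit trampoline is 10 bytes:
//   movl $<nest value>, %<NestReg>   (B8+reg, imm32 at offsets 1-4)
//   jmp  <nested function>           (E9, rel32 at offsets 6-9, relative to the
//                                     end of the trampoline)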
21081 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21082 DAG.getConstant(10, dl, MVT::i32));
21083 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21085 // This is storing the opcode for MOV32ri.
21086 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21087 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21089 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21090 Trmp, MachinePointerInfo(TrmpAddr));
21092 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21093 DAG.getConstant(1, dl, MVT::i32));
21095 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21096 /* Alignment = */ 1);
21098 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21099 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21100 DAG.getConstant(5, dl, MVT::i32));
21101 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21102 Addr, MachinePointerInfo(TrmpAddr, 5),
21103 /* Alignment = */ 1);
21105 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21106 DAG.getConstant(6, dl, MVT::i32));
21108 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21109 /* Alignment = */ 1);
21111 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21115 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21116 SelectionDAG &DAG) const {
21118 // The rounding mode is in bits 11:10 of FPSR and has the following
21120 // settings: 00 = round to nearest, 01 = round to -inf, 10 = round to +inf, 11 = round to 0.
21125 // FLT_ROUNDS, on the other hand, expects: -1 = undefined, 0 = round to 0, 1 = round to nearest, 2 = round to +inf, 3 = round to -inf.
21132 // To perform the conversion, we do:
21133 //   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
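// For example, with RC = 01 (x87 round toward -inf): FPSR & 0x800 is 0 and
// (FPSR & 0x400) >> 9 is 2, so ((0 | 2) + 1) & 3 == 3, which is FLT_ROUNDS'
// encoding for round toward -inf.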
21136 MachineFunction &MF = DAG.getMachineFunction();
21137 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21138 unsigned StackAlignment = TFI.getStackAlignment();
21139 MVT VT = Op.getSimpleValueType();
21142 // Save FP Control Word to stack slot
21143 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21144 SDValue StackSlot =
21145 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21147 MachineMemOperand *MMO =
21148 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21149 MachineMemOperand::MOStore, 2, 2);
21151 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21152 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21153 DAG.getVTList(MVT::Other),
21154 Ops, MVT::i16, MMO);
21156 // Load FP Control Word from stack slot
21158 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21160 // Transform as necessary
21162 DAG.getNode(ISD::SRL, DL, MVT::i16,
21163 DAG.getNode(ISD::AND, DL, MVT::i16,
21164 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21165 DAG.getConstant(11, DL, MVT::i8));
21167 DAG.getNode(ISD::SRL, DL, MVT::i16,
21168 DAG.getNode(ISD::AND, DL, MVT::i16,
21169 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21170 DAG.getConstant(9, DL, MVT::i8));
21173 DAG.getNode(ISD::AND, DL, MVT::i16,
21174 DAG.getNode(ISD::ADD, DL, MVT::i16,
21175 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21176 DAG.getConstant(1, DL, MVT::i16)),
21177 DAG.getConstant(3, DL, MVT::i16));
21179 return DAG.getNode((VT.getSizeInBits() < 16 ?
21180 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21183 // Split a unary integer op into two half-sized ops.
21184 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21185 MVT VT = Op.getSimpleValueType();
21186 unsigned NumElems = VT.getVectorNumElements();
21187 unsigned SizeInBits = VT.getSizeInBits();
21189 // Extract the Lo/Hi vectors
21191 SDValue Src = Op.getOperand(0);
21192 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21193 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21195 MVT EltVT = VT.getVectorElementType();
21196 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21197 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21198 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21199 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21202 // Decompose 256-bit ops into smaller 128-bit ops.
21203 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21204 assert(Op.getSimpleValueType().is256BitVector() &&
21205 Op.getSimpleValueType().isInteger() &&
21206 "Only handle AVX 256-bit vector integer operation");
21207 return LowerVectorIntUnary(Op, DAG);
21210 // Decompose 512-bit ops into smaller 256-bit ops.
21211 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21212 assert(Op.getSimpleValueType().is512BitVector() &&
21213 Op.getSimpleValueType().isInteger() &&
21214 "Only handle AVX 512-bit vector integer operation");
21215 return LowerVectorIntUnary(Op, DAG);
21218 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
21220 // An i8/i16 vector is implemented using the dword LZCNT vector instruction
21221 // ( sub(trunc(lzcnt(zext32(x)))) ). If zext32(x) is illegal,
21222 // split the vector, perform the operation on its Lo and Hi parts, and
21223 // concatenate the results.
21224 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21225 assert(Op.getOpcode() == ISD::CTLZ);
21227 MVT VT = Op.getSimpleValueType();
21228 MVT EltVT = VT.getVectorElementType();
21229 unsigned NumElems = VT.getVectorNumElements();
21231 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21232 "Unsupported element type");
21234 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21236 return LowerVectorIntUnary(Op, DAG);
21238 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21239 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21240 "Unsupported value type for operation");
21242 // Use the natively supported vector instruction vplzcntd.
21243 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21244 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21245 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21246 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21248 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21251 // Lower CTLZ using a PSHUFB lookup table implementation.
21252 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21253 const X86Subtarget &Subtarget,
21254 SelectionDAG &DAG) {
21255 MVT VT = Op.getSimpleValueType();
21256 int NumElts = VT.getVectorNumElements();
21257 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21258 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21260 // Per-nibble leading zero PSHUFB lookup table.
21261 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21262 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21263 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21264 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
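// For example, the byte 0x1A has hi nibble 0x1, so LUT[1] == 3 is the answer
// (the lo-nibble count is masked away); the byte 0x05 has a zero hi nibble, so
// the two counts are added: LUT[0] + LUT[5] == 4 + 1 == 5.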
21266 SmallVector<SDValue, 64> LUTVec;
21267 for (int i = 0; i < NumBytes; ++i)
21268 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21269 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21271 // Begin by bitcasting the input to a byte vector, then split those bytes
21272 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21273 // If the hi input nibble is zero then we add both results together, otherwise
21274 // we just take the hi result (by masking the lo result to zero before the addition).
21276 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21277 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21279 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21280 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21281 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21282 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21283 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21285 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21286 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21287 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21288 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21290 // Merge the result from vXi8 back to VT, working on the lo/hi halves
21291 // of the current vector width in the same way we did for the nibbles.
21292 // If the upper half of the input element is zero then add the halves'
21293 // leading zero counts together, otherwise just use the upper half's.
21294 // Double the width of the result until we are at target width.
21295 while (CurrVT != VT) {
21296 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21297 int CurrNumElts = CurrVT.getVectorNumElements();
21298 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21299 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21300 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21302 // Check if the upper half of the input element is zero.
21303 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21304 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21305 HiZ = DAG.getBitcast(NextVT, HiZ);
21307 // Move the upper/lower halves to the lower bits as we'll be extending to
21308 // NextVT. Mask the lower result to zero if HiZ is true and add the results together.
21310 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21311 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21312 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21313 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21314 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21321 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21322 const X86Subtarget &Subtarget,
21323 SelectionDAG &DAG) {
21324 MVT VT = Op.getSimpleValueType();
21326 if (Subtarget.hasCDI())
21327 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21329 // Decompose 256-bit ops into smaller 128-bit ops.
21330 if (VT.is256BitVector() && !Subtarget.hasInt256())
21331 return Lower256IntUnary(Op, DAG);
21333 // Decompose 512-bit ops into smaller 256-bit ops.
21334 if (VT.is512BitVector() && !Subtarget.hasBWI())
21335 return Lower512IntUnary(Op, DAG);
21337 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21338 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21341 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21342 SelectionDAG &DAG) {
21343 MVT VT = Op.getSimpleValueType();
21345 unsigned NumBits = VT.getSizeInBits();
21347 unsigned Opc = Op.getOpcode();
21350 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21352 Op = Op.getOperand(0);
21353 if (VT == MVT::i8) {
21354 // Zero extend to i32 since there is no i8 bsr.
21356 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21359 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21360 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21361 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21363 if (Opc == ISD::CTLZ) {
21364 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21367 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21368 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21371 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21374 // Finally xor with NumBits-1.
21375 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21376 DAG.getConstant(NumBits - 1, dl, OpVT));
21379 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21383 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21384 MVT VT = Op.getSimpleValueType();
21385 unsigned NumBits = VT.getScalarSizeInBits();
21388 if (VT.isVector()) {
21389 SDValue N0 = Op.getOperand(0);
21390 SDValue Zero = DAG.getConstant(0, dl, VT);
21392 // lsb(x) = (x & -x)
21393 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21394 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21396 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21397 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21398 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21399 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21400 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21403 // cttz(x) = ctpop(lsb - 1)
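// E.g. for x = 0b01101000: lsb = 0b00001000, lsb - 1 = 0b00000111, and
// ctpop(0b00000111) == 3 == cttz(x).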
21404 SDValue One = DAG.getConstant(1, dl, VT);
21405 return DAG.getNode(ISD::CTPOP, dl, VT,
21406 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21409 assert(Op.getOpcode() == ISD::CTTZ &&
21410 "Only scalar CTTZ requires custom lowering");
21412 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21413 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21414 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21416 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21419 DAG.getConstant(NumBits, dl, VT),
21420 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21423 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21426 /// Break a 256-bit integer operation into two new 128-bit ones and then
21427 /// concatenate the result back.
21428 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21429 MVT VT = Op.getSimpleValueType();
21431 assert(VT.is256BitVector() && VT.isInteger() &&
21432 "Unsupported value type for operation");
21434 unsigned NumElems = VT.getVectorNumElements();
21437 // Extract the LHS vectors
21438 SDValue LHS = Op.getOperand(0);
21439 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21440 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21442 // Extract the RHS vectors
21443 SDValue RHS = Op.getOperand(1);
21444 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21445 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21447 MVT EltVT = VT.getVectorElementType();
21448 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21450 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21451 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21452 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21455 /// Break a 512-bit integer operation into two new 256-bit ones and then
21456 /// concatenate the result back.
21457 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21458 MVT VT = Op.getSimpleValueType();
21460 assert(VT.is512BitVector() && VT.isInteger() &&
21461 "Unsupported value type for operation");
21463 unsigned NumElems = VT.getVectorNumElements();
21464 SDLoc dl(Op);
21466 // Extract the LHS vectors
21467 SDValue LHS = Op.getOperand(0);
21468 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21469 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21471 // Extract the RHS vectors
21472 SDValue RHS = Op.getOperand(1);
21473 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21474 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21476 MVT EltVT = VT.getVectorElementType();
21477 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21480 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21481 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21484 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21485 MVT VT = Op.getSimpleValueType();
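// For i1 elements both add and sub are arithmetic modulo 2, which is exactly
// XOR, so no subtarget checks are needed for this case.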
21486 if (VT.getScalarType() == MVT::i1)
21487 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21488 Op.getOperand(0), Op.getOperand(1));
21489 assert(Op.getSimpleValueType().is256BitVector() &&
21490 Op.getSimpleValueType().isInteger() &&
21491 "Only handle AVX 256-bit vector integer operation");
21492 return Lower256IntArith(Op, DAG);
21495 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21496 assert(Op.getSimpleValueType().is256BitVector() &&
21497 Op.getSimpleValueType().isInteger() &&
21498 "Only handle AVX 256-bit vector integer operation");
21499 return Lower256IntUnary(Op, DAG);
21502 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21503 assert(Op.getSimpleValueType().is256BitVector() &&
21504 Op.getSimpleValueType().isInteger() &&
21505 "Only handle AVX 256-bit vector integer operation");
21506 return Lower256IntArith(Op, DAG);
21509 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21510 SelectionDAG &DAG) {
21512 MVT VT = Op.getSimpleValueType();
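// For i1 elements the product is 1 only when both operands are 1, so an i1
// multiply is simply a logical AND.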
21514 if (VT.getScalarType() == MVT::i1)
21515 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21517 // Decompose 256-bit ops into smaller 128-bit ops.
21518 if (VT.is256BitVector() && !Subtarget.hasInt256())
21519 return Lower256IntArith(Op, DAG);
21521 SDValue A = Op.getOperand(0);
21522 SDValue B = Op.getOperand(1);
21524 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21525 // vector pairs, multiply and truncate.
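// The final truncate keeps only the low 8 bits of each 16-bit product, and
// those bits do not depend on whether the inputs were sign- or zero-extended,
// so sign-extension is a valid choice of widening here.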
21526 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21527 if (Subtarget.hasInt256()) {
21528 // For 512-bit vectors, split into 256-bit vectors to allow the
21529 // sign-extension to occur.
21530 if (VT == MVT::v64i8)
21531 return Lower512IntArith(Op, DAG);
21533 // For 256-bit vectors, split into 128-bit vectors to allow the
21534 // sign-extension to occur. We don't need this on AVX512BW as we can
21535 // safely sign-extend to v32i16.
21536 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21537 return Lower256IntArith(Op, DAG);
21539 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21540 return DAG.getNode(
21541 ISD::TRUNCATE, dl, VT,
21542 DAG.getNode(ISD::MUL, dl, ExVT,
21543 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21544 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21547 assert(VT == MVT::v16i8 &&
21548 "Pre-AVX2 support only supports v16i8 multiplication");
21549 MVT ExVT = MVT::v8i16;
21551 // Extract the lo parts and sign extend to i16
21553 if (Subtarget.hasSSE41()) {
21554 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21555 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21557 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21558 -1, 4, -1, 5, -1, 6, -1, 7};
21559 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21560 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21561 ALo = DAG.getBitcast(ExVT, ALo);
21562 BLo = DAG.getBitcast(ExVT, BLo);
21563 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21564 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21567 // Extract the hi parts and sign extend to i16
21569 if (Subtarget.hasSSE41()) {
21570 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21571 -1, -1, -1, -1, -1, -1, -1, -1};
21572 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21573 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21574 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21575 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21577 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21578 -1, 12, -1, 13, -1, 14, -1, 15};
21579 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21580 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21581 AHi = DAG.getBitcast(ExVT, AHi);
21582 BHi = DAG.getBitcast(ExVT, BHi);
21583 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21584 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21587 // Multiply, mask the lower 8bits of the lo/hi results and pack
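// Masking each 16-bit product down to 0..255 first guarantees that PACKUS's
// unsigned saturation never clamps, so the pack acts as a plain truncation
// that recombines the lo and hi halves into a single v16i8 result.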
21588 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21589 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21590 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21591 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21592 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21595 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21596 if (VT == MVT::v4i32) {
21597 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21598 "Should not custom lower when pmuldq is available!");
21600 // Extract the odd parts.
21601 static const int UnpackMask[] = { 1, -1, 3, -1 };
21602 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21603 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21605 // Multiply the even parts.
21606 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21607 // Now multiply odd parts.
21608 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21610 Evens = DAG.getBitcast(VT, Evens);
21611 Odds = DAG.getBitcast(VT, Odds);
21613 // Merge the two vectors back together with a shuffle. This expands into 2
21615 static const int ShufMask[] = { 0, 4, 2, 6 };
21616 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21619 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21620 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21622 // 32-bit vector types used for MULDQ/MULUDQ.
21623 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21625 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21626 // 32-bits. We can lower with this if the sign bits stretch that far.
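// More than 32 sign bits means every 64-bit lane is just a sign-extended
// 32-bit value, so the signed 32x32->64 multiply of the low halves already
// yields the full 64-bit product.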
21627 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21628 DAG.ComputeNumSignBits(B) > 32) {
21629 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21630 DAG.getBitcast(MulVT, B));
21633 // Ahi = psrlqi(a, 32);
21634 // Bhi = psrlqi(b, 32);
21636 // AloBlo = pmuludq(a, b);
21637 // AloBhi = pmuludq(a, Bhi);
21638 // AhiBlo = pmuludq(Ahi, b);
21640 // Hi = psllqi(AloBhi + AhiBlo, 32);
21641 // return AloBlo + Hi;
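// This is schoolbook multiplication on 32-bit halves:
//   A * B = AloBlo + 2^32 * (AloBhi + AhiBlo) + 2^64 * AhiBhi
// where the 2^64 term falls outside the 64-bit lane and is dropped.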
21642 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21643 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21644 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21646 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21647 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21648 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21650 // Bit cast to 32-bit vectors for MULUDQ.
21651 SDValue Alo = DAG.getBitcast(MulVT, A);
21652 SDValue Blo = DAG.getBitcast(MulVT, B);
21654 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21656 // Only multiply lo/hi halves that aren't known to be zero.
21657 SDValue AloBlo = Zero;
21658 if (!ALoIsZero && !BLoIsZero)
21659 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21661 SDValue AloBhi = Zero;
21662 if (!ALoIsZero && !BHiIsZero) {
21663 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21664 Bhi = DAG.getBitcast(MulVT, Bhi);
21665 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21668 SDValue AhiBlo = Zero;
21669 if (!AHiIsZero && !BLoIsZero) {
21670 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21671 Ahi = DAG.getBitcast(MulVT, Ahi);
21672 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21675 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21676 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21678 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21681 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21682 SelectionDAG &DAG) {
21684 MVT VT = Op.getSimpleValueType();
21686 // Decompose 256-bit ops into smaller 128-bit ops.
21687 if (VT.is256BitVector() && !Subtarget.hasInt256())
21688 return Lower256IntArith(Op, DAG);
21690 // Only i8 vectors should need custom lowering after this.
21691 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21692 "Unsupported vector type");
21694 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21695 // logical shift down the upper half and pack back to i8.
21696 SDValue A = Op.getOperand(0);
21697 SDValue B = Op.getOperand(1);
21699 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21700 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21701 unsigned Opcode = Op.getOpcode();
21702 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21703 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
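// MULHU needs the operands zero-extended (the logical high half) while MULHS
// needs them sign-extended (the arithmetic high half); the matching shift is
// used below when unpacked bytes must be moved down from the high half of
// each i16 lane.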
21705 // AVX2 implementations - extend xmm subvectors to ymm.
21706 if (Subtarget.hasInt256()) {
21707 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21708 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21710 if (VT == MVT::v32i8) {
21711 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21712 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21713 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21714 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21715 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21716 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21717 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21718 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21719 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21720 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21721 DAG.getConstant(8, dl, MVT::v16i16));
21722 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21723 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21724 DAG.getConstant(8, dl, MVT::v16i16));
21725 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21726 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21727 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21728 16, 17, 18, 19, 20, 21, 22, 23};
21729 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21730 24, 25, 26, 27, 28, 29, 30, 31};
21731 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21732 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21733 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21736 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21737 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21738 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21739 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21740 DAG.getConstant(8, dl, MVT::v16i16));
21741 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21742 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21743 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21746 assert(VT == MVT::v16i8 &&
21747 "Pre-AVX2 support only supports v16i8 multiplication");
21748 MVT ExVT = MVT::v8i16;
21750 // Extract the lo parts and zero/sign extend to i16.
21752 if (Subtarget.hasSSE41()) {
21753 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21754 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21756 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21757 -1, 4, -1, 5, -1, 6, -1, 7};
21758 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21759 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21760 ALo = DAG.getBitcast(ExVT, ALo);
21761 BLo = DAG.getBitcast(ExVT, BLo);
21762 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21763 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21766 // Extract the hi parts and zero/sign extend to i16.
21768 if (Subtarget.hasSSE41()) {
21769 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21770 -1, -1, -1, -1, -1, -1, -1, -1};
21771 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21772 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21773 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21774 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21776 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21777 -1, 12, -1, 13, -1, 14, -1, 15};
21778 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21779 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21780 AHi = DAG.getBitcast(ExVT, AHi);
21781 BHi = DAG.getBitcast(ExVT, BHi);
21782 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21783 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21786 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21787 // pack back to v16i8.
21788 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21789 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21790 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21791 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21792 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21795 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21796 assert(Subtarget.isTargetWin64() && "Unexpected target");
21797 EVT VT = Op.getValueType();
21798 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21799 "Unexpected return type for lowering");
21801 RTLIB::Libcall LC;
21802 bool isSigned;
21803 switch (Op->getOpcode()) {
21804 default: llvm_unreachable("Unexpected request for libcall!");
21805 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21806 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21807 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21808 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21809 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21810 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21811 }
21813 SDLoc dl(Op);
21814 SDValue InChain = DAG.getEntryNode();
21816 TargetLowering::ArgListTy Args;
21817 TargetLowering::ArgListEntry Entry;
21818 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21819 EVT ArgVT = Op->getOperand(i).getValueType();
21820 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21821 "Unexpected argument type for lowering");
21822 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21823 Entry.Node = StackPtr;
21824 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21825 MachinePointerInfo(), /* Alignment = */ 16);
21826 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21827 Entry.Ty = PointerType::get(ArgTy,0);
21828 Entry.IsSExt = false;
21829 Entry.IsZExt = false;
21830 Args.push_back(Entry);
21833 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21834 getPointerTy(DAG.getDataLayout()));
21836 TargetLowering::CallLoweringInfo CLI(DAG);
21837 CLI.setDebugLoc(dl)
21840 getLibcallCallingConv(LC),
21841 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21844 .setSExtResult(isSigned)
21845 .setZExtResult(!isSigned);
21847 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21848 return DAG.getBitcast(VT, CallInfo.first);
21851 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21852 SelectionDAG &DAG) {
21853 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21854 MVT VT = Op0.getSimpleValueType();
21857 // Decompose 256-bit ops into smaller 128-bit ops.
21858 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21859 unsigned Opcode = Op.getOpcode();
21860 unsigned NumElems = VT.getVectorNumElements();
21861 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21862 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21863 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21864 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21865 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21866 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21867 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21869 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21870 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21872 return DAG.getMergeValues(Ops, dl);
21875 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21876 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21878 // PMULxD operations multiply each even value (starting at 0) of LHS with
21879 // the related value of RHS and produce a widened result.
21880 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21881 // => <2 x i64> <ae|cg>
21883 // In other words, to have all the results, we need to perform two PMULxD:
21884 // 1. one with the even values.
21885 // 2. one with the odd values.
21886 // To achieve #2, we need to place the odd values at an even position.
21888 // Place the odd value at an even position (basically, shift all values 1
21889 // step to the left):
21890 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21891 // <a|b|c|d> => <b|undef|d|undef>
21892 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21893 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21894 // <e|f|g|h> => <f|undef|h|undef>
21895 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21896 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21898 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21900 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21901 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21903 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21904 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21905 // => <2 x i64> <ae|cg>
21906 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21907 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21908 // => <2 x i64> <bf|dh>
21909 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21911 // Shuffle it back into the right order.
21912 SDValue Highs, Lows;
21913 if (VT == MVT::v8i32) {
21914 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21915 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21916 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21917 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21919 const int HighMask[] = {1, 5, 3, 7};
21920 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21921 const int LowMask[] = {0, 4, 2, 6};
21922 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21925 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21926 // unsigned multiply.
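// This uses the identity (modulo 2^32):
//   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
// where (a >> 31) & b and (b >> 31) & a compute the two correction terms.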
21927 if (IsSigned && !Subtarget.hasSSE41()) {
21928 SDValue ShAmt = DAG.getConstant(
21929 31, dl,
21930 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21931 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21932 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21933 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21934 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21936 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21937 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21940 // The first result of MUL_LOHI is actually the low value, followed by the
21942 SDValue Ops[] = {Lows, Highs};
21943 return DAG.getMergeValues(Ops, dl);
21946 // Return true if the required (according to Opcode) shift-imm form is natively
21947 // supported by the Subtarget
21948 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21950 if (VT.getScalarSizeInBits() < 16)
21953 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21954 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21957 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21958 (VT.is256BitVector() && Subtarget.hasInt256());
21960 bool AShift = LShift && (Subtarget.hasAVX512() ||
21961 (VT != MVT::v2i64 && VT != MVT::v4i64));
21962 return (Opcode == ISD::SRA) ? AShift : LShift;
21965 // The shift amount is a variable, but it is the same for all vector lanes.
21966 // These instructions are defined together with shift-immediate.
21968 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21970 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21973 // Return true if the required (according to Opcode) variable-shift form is
21974 // natively supported by the Subtarget
21975 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21978 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21981 // vXi16 supported only on AVX-512, BWI
21982 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21985 if (Subtarget.hasAVX512())
21988 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21989 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21990 return (Opcode == ISD::SRA) ? AShift : LShift;
21993 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21994 const X86Subtarget &Subtarget) {
21995 MVT VT = Op.getSimpleValueType();
21997 SDValue R = Op.getOperand(0);
21998 SDValue Amt = Op.getOperand(1);
22000 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22001 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22003 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22004 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22005 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22006 SDValue Ex = DAG.getBitcast(ExVT, R);
22008 // ashr(R, 63) === cmp_slt(R, 0)
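// Shifting right arithmetically by 63 replicates the sign bit into every bit,
// producing all-ones for negative lanes and zero otherwise, which is exactly
// what PCMPGT(0, R) computes; PCMPGTQ on i64 lanes requires SSE4.2.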
22009 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22010 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22011 "Unsupported PCMPGT op");
22012 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22013 getZeroVector(VT, Subtarget, DAG, dl), R);
22016 if (ShiftAmt >= 32) {
22017 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22019 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22020 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22021 ShiftAmt - 32, DAG);
22022 if (VT == MVT::v2i64)
22023 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22024 if (VT == MVT::v4i64)
22025 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22026 {9, 1, 11, 3, 13, 5, 15, 7});
22028 // SRA upper i32, SHL whole i64 and select lower i32.
22029 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22032 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22033 Lower = DAG.getBitcast(ExVT, Lower);
22034 if (VT == MVT::v2i64)
22035 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22036 if (VT == MVT::v4i64)
22037 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22038 {8, 1, 10, 3, 12, 5, 14, 7});
22040 return DAG.getBitcast(VT, Ex);
22043 // Optimize shl/srl/sra with constant shift amount.
22044 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22045 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22046 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22048 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22049 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22051 // i64 SRA needs to be performed as partial shifts.
22052 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22053 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22054 Op.getOpcode() == ISD::SRA)
22055 return ArithmeticShiftRight64(ShiftAmt);
22057 if (VT == MVT::v16i8 ||
22058 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22059 VT == MVT::v64i8) {
22060 unsigned NumElts = VT.getVectorNumElements();
22061 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22063 // Simple i8 add case
22064 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22065 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22067 // ashr(R, 7) === cmp_slt(R, 0)
22068 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22069 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22070 if (VT.is512BitVector()) {
22071 assert(VT == MVT::v64i8 && "Unexpected element type!");
22072 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22073 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22075 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22078 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22079 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22082 if (Op.getOpcode() == ISD::SHL) {
22083 // Make a large shift.
22084 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22086 SHL = DAG.getBitcast(VT, SHL);
22087 // Zero out the rightmost bits.
22088 return DAG.getNode(ISD::AND, dl, VT, SHL,
22089 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22091 if (Op.getOpcode() == ISD::SRL) {
22092 // Make a large shift.
22093 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22095 SRL = DAG.getBitcast(VT, SRL);
22096 // Zero out the leftmost bits.
22097 return DAG.getNode(ISD::AND, dl, VT, SRL,
22098 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22100 if (Op.getOpcode() == ISD::SRA) {
22101 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
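// Mask is the sign bit after the logical shift, and xor-then-subtract
// sign-extends the remaining (8 - Amt)-bit value. E.g. Amt = 2, R = 0xF0:
// lshr gives 0x3C, Mask = 0x20, and (0x3C ^ 0x20) - 0x20 = 0xFC = ashr(0xF0, 2).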
22102 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22104 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22105 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22106 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22109 llvm_unreachable("Unknown shift opcode.");
22114 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22115 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22116 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22117 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22118 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22120 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22121 unsigned SubVectorScale = 1;
22122 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22124 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22125 Amt = Amt.getOperand(0);
22128 // Peek through any splat that was introduced for i64 shift vectorization.
22129 int SplatIndex = -1;
22130 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22131 if (SVN->isSplat()) {
22132 SplatIndex = SVN->getSplatIndex();
22133 Amt = Amt.getOperand(0);
22134 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22135 "Splat shuffle referencing second operand");
22138 if (Amt.getOpcode() != ISD::BITCAST ||
22139 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22142 Amt = Amt.getOperand(0);
22143 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22144 (SubVectorScale * VT.getVectorNumElements());
22145 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22146 uint64_t ShiftAmt = 0;
22147 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22148 for (unsigned i = 0; i != Ratio; ++i) {
22149 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22153 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22156 // Check remaining shift amounts (if not a splat).
22157 if (SplatIndex < 0) {
22158 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22159 uint64_t ShAmt = 0;
22160 for (unsigned j = 0; j != Ratio; ++j) {
22161 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22165 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22167 if (ShAmt != ShiftAmt)
22172 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22173 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22175 if (Op.getOpcode() == ISD::SRA)
22176 return ArithmeticShiftRight64(ShiftAmt);
22182 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22183 const X86Subtarget &Subtarget) {
22184 MVT VT = Op.getSimpleValueType();
22186 SDValue R = Op.getOperand(0);
22187 SDValue Amt = Op.getOperand(1);
22189 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22190 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22192 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22193 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22195 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22197 MVT EltVT = VT.getVectorElementType();
22199 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22200 // Check if this build_vector node is doing a splat.
22201 // If so, then set BaseShAmt equal to the splat value.
22202 BaseShAmt = BV->getSplatValue();
22203 if (BaseShAmt && BaseShAmt.isUndef())
22204 BaseShAmt = SDValue();
22206 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22207 Amt = Amt.getOperand(0);
22209 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22210 if (SVN && SVN->isSplat()) {
22211 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22212 SDValue InVec = Amt.getOperand(0);
22213 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22214 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22215 "Unexpected shuffle index found!");
22216 BaseShAmt = InVec.getOperand(SplatIdx);
22217 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22218 if (ConstantSDNode *C =
22219 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22220 if (C->getZExtValue() == SplatIdx)
22221 BaseShAmt = InVec.getOperand(1);
22226 // Avoid introducing an extract element from a shuffle.
22227 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22228 DAG.getIntPtrConstant(SplatIdx, dl));
22232 if (BaseShAmt.getNode()) {
22233 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22234 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22235 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22236 else if (EltVT.bitsLT(MVT::i32))
22237 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22239 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22243 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22244 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22245 Amt.getOpcode() == ISD::BITCAST &&
22246 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22247 Amt = Amt.getOperand(0);
22248 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22249 VT.getVectorNumElements();
22250 std::vector<SDValue> Vals(Ratio);
22251 for (unsigned i = 0; i != Ratio; ++i)
22252 Vals[i] = Amt.getOperand(i);
22253 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22254 for (unsigned j = 0; j != Ratio; ++j)
22255 if (Vals[j] != Amt.getOperand(i + j))
22259 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22260 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22265 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22266 SelectionDAG &DAG) {
22267 MVT VT = Op.getSimpleValueType();
22269 SDValue R = Op.getOperand(0);
22270 SDValue Amt = Op.getOperand(1);
22271 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22273 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22274 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22276 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22279 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22282 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22285 // XOP has 128-bit variable logical/arithmetic shifts.
22286 // +ve/-ve Amt = shift left/right.
22287 if (Subtarget.hasXOP() &&
22288 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22289 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22290 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22291 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22292 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22294 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22295 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22296 if (Op.getOpcode() == ISD::SRA)
22297 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22300 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22301 // shifts per-lane and then shuffle the partial results back together.
22302 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22303 // Splat the shift amounts so the scalar shifts above will catch it.
22304 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22305 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22306 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22307 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22308 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22311 // i64 vector arithmetic shift can be emulated with the transform:
22312 // M = lshr(SIGN_MASK, Amt)
22313 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22314 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22315 Op.getOpcode() == ISD::SRA) {
22316 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22317 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22318 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22319 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22320 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22324 // If possible, lower this packed shift into a vector multiply instead of
22325 // expanding it into a sequence of scalar shifts.
22326 // Do this only if the vector shift count is a constant build_vector.
22327 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22328 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22329 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22330 SmallVector<SDValue, 8> Elts;
22331 MVT SVT = VT.getVectorElementType();
22332 unsigned SVTBits = SVT.getSizeInBits();
22333 APInt One(SVTBits, 1);
22334 unsigned NumElems = VT.getVectorNumElements();
22336 for (unsigned i=0; i !=NumElems; ++i) {
22337 SDValue Op = Amt->getOperand(i);
22338 if (Op->isUndef()) {
22339 Elts.push_back(Op);
22343 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22344 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22345 uint64_t ShAmt = C.getZExtValue();
22346 if (ShAmt >= SVTBits) {
22347 Elts.push_back(DAG.getUNDEF(SVT));
22350 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22352 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22353 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22356 // Lower SHL with variable shift amount.
22357 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
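// Build 2^Amt as a float: shifting Amt up to bit 23 (the exponent field) and
// adding 0x3f800000 (the encoding of 1.0f, i.e. the exponent bias) produces
// the IEEE-754 pattern of 2.0^Amt. Converting back to integer yields 1 << Amt,
// so the final MUL performs the variable left shift.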
22358 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22360 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22361 DAG.getConstant(0x3f800000U, dl, VT));
22362 Op = DAG.getBitcast(MVT::v4f32, Op);
22363 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22364 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22367 // If possible, lower this shift as a sequence of two shifts by
22368 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22370 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22372 // Could be rewritten as:
22373 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22375 // The advantage is that the two shifts from the example would be
22376 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22377 // the vector shift into four scalar shifts plus four pairs of vector
22379 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22380 unsigned TargetOpcode = X86ISD::MOVSS;
22381 bool CanBeSimplified;
22382 // The splat value for the first packed shift (the 'X' from the example).
22383 SDValue Amt1 = Amt->getOperand(0);
22384 // The splat value for the second packed shift (the 'Y' from the example).
22385 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22387 // See if it is possible to replace this node with a sequence of
22388 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22389 if (VT == MVT::v4i32) {
22390 // Check if it is legal to use a MOVSS.
22391 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22392 Amt2 == Amt->getOperand(3);
22393 if (!CanBeSimplified) {
22394 // Otherwise, check if we can still simplify this node using a MOVSD.
22395 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22396 Amt->getOperand(2) == Amt->getOperand(3);
22397 TargetOpcode = X86ISD::MOVSD;
22398 Amt2 = Amt->getOperand(2);
22401 // Do similar checks for the case where the machine value type
22402 // is MVT::v8i16.
22403 CanBeSimplified = Amt1 == Amt->getOperand(1);
22404 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22405 CanBeSimplified = Amt2 == Amt->getOperand(i);
22407 if (!CanBeSimplified) {
22408 TargetOpcode = X86ISD::MOVSD;
22409 CanBeSimplified = true;
22410 Amt2 = Amt->getOperand(4);
22411 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22412 CanBeSimplified = Amt1 == Amt->getOperand(i);
22413 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22414 CanBeSimplified = Amt2 == Amt->getOperand(j);
22418 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22419 isa<ConstantSDNode>(Amt2)) {
22420 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22421 MVT CastVT = MVT::v4i32;
22423 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22424 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22426 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22427 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22428 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22429 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22430 if (TargetOpcode == X86ISD::MOVSD)
22431 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22432 BitCast2, {0, 1, 6, 7}));
22433 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22434 BitCast2, {0, 5, 6, 7}));
22438 // v4i32 Non Uniform Shifts.
22439 // If the shift amount is constant we can shift each lane using the SSE2
22440 // immediate shifts, else we need to zero-extend each lane to the lower i64
22441 // and shift using the SSE2 variable shifts.
22442 // The separate results can then be blended together.
22443 if (VT == MVT::v4i32) {
22444 unsigned Opc = Op.getOpcode();
22445 SDValue Amt0, Amt1, Amt2, Amt3;
22447 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22448 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22449 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22450 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22452 // ISD::SHL is handled above but we include it here for completeness.
22455 llvm_unreachable("Unknown target vector shift node");
22457 Opc = X86ISD::VSHL;
22460 Opc = X86ISD::VSRL;
22463 Opc = X86ISD::VSRA;
22466 // The SSE2 shifts use the lower i64 as the same shift amount for
22467 // all lanes and the upper i64 is ignored. These shuffle masks
22468 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22469 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22470 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22471 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22472 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22473 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22476 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22477 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22478 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22479 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22480 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22481 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22482 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22485 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22486 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22487 // make the existing SSE solution better.
22488 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22489 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22490 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22491 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22492 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22493 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22495 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22496 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22497 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22498 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22499 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22502 if (VT == MVT::v16i8 ||
22503 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22504 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22505 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22506 unsigned ShiftOpcode = Op->getOpcode();
22508 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22509 if (VT.is512BitVector()) {
22510 // On AVX512BW targets we make use of the fact that VSELECT lowers
22511 // to a masked blend which selects bytes based just on the sign bit
22512 // extracted to a mask.
22513 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22514 V0 = DAG.getBitcast(VT, V0);
22515 V1 = DAG.getBitcast(VT, V1);
22516 Sel = DAG.getBitcast(VT, Sel);
22517 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22518 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22519 } else if (Subtarget.hasSSE41()) {
22520 // On SSE41 targets we make use of the fact that VSELECT lowers
22521 // to PBLENDVB which selects bytes based just on the sign bit.
22522 V0 = DAG.getBitcast(VT, V0);
22523 V1 = DAG.getBitcast(VT, V1);
22524 Sel = DAG.getBitcast(VT, Sel);
22525 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22527 // On pre-SSE41 targets we test for the sign bit by comparing to
22528 // zero - a negative value will set all bits of the lanes to true
22529 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22530 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22531 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22532 return DAG.getSelect(dl, SelVT, C, V0, V1);
22535 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22536 // We can safely do this using i16 shifts as we're only interested in
22537 // the 3 lower bits of each byte.
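// The rounds below consume one bit of the shift amount each: after the << 5,
// bit 2 of the original amount sits in each byte's sign bit, selecting the
// shift-by-4 step; each Amt + Amt then moves the next lower amount bit into
// the sign bit for the shift-by-2 and shift-by-1 steps.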
22538 Amt = DAG.getBitcast(ExtVT, Amt);
22539 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22540 Amt = DAG.getBitcast(VT, Amt);
22542 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22543 // r = VSELECT(r, shift(r, 4), a);
22545 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22546 R = SignBitSelect(VT, Amt, M, R);
22549 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22551 // r = VSELECT(r, shift(r, 2), a);
22552 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22553 R = SignBitSelect(VT, Amt, M, R);
22556 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22558 // return VSELECT(r, shift(r, 1), a);
22559 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22560 R = SignBitSelect(VT, Amt, M, R);
22564 if (Op->getOpcode() == ISD::SRA) {
22565 // For SRA we need to unpack each byte to the higher byte of a i16 vector
22566 // so we can correctly sign extend. We don't care what happens to the
22568 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22569 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22570 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22571 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22572 ALo = DAG.getBitcast(ExtVT, ALo);
22573 AHi = DAG.getBitcast(ExtVT, AHi);
22574 RLo = DAG.getBitcast(ExtVT, RLo);
22575 RHi = DAG.getBitcast(ExtVT, RHi);
22577 // r = VSELECT(r, shift(r, 4), a);
22578 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22579 DAG.getConstant(4, dl, ExtVT));
22580 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22581 DAG.getConstant(4, dl, ExtVT));
22582 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22583 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22586 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22587 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22589 // r = VSELECT(r, shift(r, 2), a);
22590 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22591 DAG.getConstant(2, dl, ExtVT));
22592 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22593 DAG.getConstant(2, dl, ExtVT));
22594 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22595 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22598 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22599 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22601 // r = VSELECT(r, shift(r, 1), a);
22602 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22603 DAG.getConstant(1, dl, ExtVT));
22604 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22605 DAG.getConstant(1, dl, ExtVT));
22606 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22607 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22609 // Logical shift the result back to the lower byte, leaving a zero upper
22610 // byte, meaning that we can safely pack with PACKUSWB.
22612 RLo =
22613 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22614 RHi =
22615 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22616 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22620 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22621 MVT ExtVT = MVT::v8i32;
22622 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22623 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22624 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22625 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22626 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22627 ALo = DAG.getBitcast(ExtVT, ALo);
22628 AHi = DAG.getBitcast(ExtVT, AHi);
22629 RLo = DAG.getBitcast(ExtVT, RLo);
22630 RHi = DAG.getBitcast(ExtVT, RHi);
22631 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22632 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22633 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22634 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22635 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22638 if (VT == MVT::v8i16) {
22639 unsigned ShiftOpcode = Op->getOpcode();
22641 // If we have a constant shift amount, the non-SSE41 path is best as
22642 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
22643 bool UseSSE41 = Subtarget.hasSSE41() &&
22644 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22646 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22647 // On SSE41 targets we make use of the fact that VSELECT lowers
22648 // to PBLENDVB which selects bytes based just on the sign bit.
22650 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22651 V0 = DAG.getBitcast(ExtVT, V0);
22652 V1 = DAG.getBitcast(ExtVT, V1);
22653 Sel = DAG.getBitcast(ExtVT, Sel);
22654 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22656 // On pre-SSE41 targets we splat the sign bit - a negative value will
22657 // set all bits of the lanes to true and VSELECT uses that in
22658 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22659 SDValue C =
22660 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22661 return DAG.getSelect(dl, VT, C, V0, V1);
22664 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22666 // On SSE41 targets we need to replicate the shift mask in both
22667 // bytes for PBLENDVB.
22670 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22671 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22673 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22676 // r = VSELECT(r, shift(r, 8), a);
22677 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22678 R = SignBitSelect(Amt, M, R);
22681 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22683 // r = VSELECT(r, shift(r, 4), a);
22684 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22685 R = SignBitSelect(Amt, M, R);
22688 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22690 // r = VSELECT(r, shift(r, 2), a);
22691 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22692 R = SignBitSelect(Amt, M, R);
22695 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22697 // return VSELECT(r, shift(r, 1), a);
22698 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22699 R = SignBitSelect(Amt, M, R);
22703 // Decompose 256-bit shifts into smaller 128-bit shifts.
22704 if (VT.is256BitVector())
22705 return Lower256IntArith(Op, DAG);
22710 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22711 SelectionDAG &DAG) {
22712 MVT VT = Op.getSimpleValueType();
22714 SDValue R = Op.getOperand(0);
22715 SDValue Amt = Op.getOperand(1);
22716 unsigned Opcode = Op.getOpcode();
22717 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22719 if (Subtarget.hasAVX512()) {
22720 // Attempt to rotate by immediate.
22722 SmallVector<APInt, 16> EltBits;
22723 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22724 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22725 return EltBits[0] == V;
22727 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22728 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22729 return DAG.getNode(Op, DL, VT, R,
22730 DAG.getConstant(RotateAmt, DL, MVT::i8));
22734 // Else, fall back on VPROLV/VPRORV.
22738 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22739 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22740 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22742 // XOP has 128-bit vector variable + immediate rotates.
22743 // +ve/-ve Amt = rotate left/right.
22745 // Split 256-bit integers.
22746 if (VT.is256BitVector())
22747 return Lower256IntArith(Op, DAG);
22749 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22751 // Attempt to rotate by immediate.
22752 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22753 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22754 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22755 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22756 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22757 DAG.getConstant(RotateAmt, DL, MVT::i8));
22761 // Use general rotate by variable (per-element).
22762 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22765 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22766 // Lower the "add/sub/mul with overflow" instruction into a regular operation plus
22767 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22768 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22769 // has only one use.
22770 SDNode *N = Op.getNode();
22771 SDValue LHS = N->getOperand(0);
22772 SDValue RHS = N->getOperand(1);
22773 unsigned BaseOp = 0;
22774 X86::CondCode Cond;
22775 SDLoc DL(Op);
22776 switch (Op.getOpcode()) {
22777 default: llvm_unreachable("Unknown ovf instruction!");
22778 case ISD::SADDO:
22779 // An add of one will be selected as an INC. Note that INC doesn't
22780 // set CF, so we can't do this for UADDO.
22781 if (isOneConstant(RHS)) {
22782 BaseOp = X86ISD::INC;
22783 Cond = X86::COND_O;
22786 BaseOp = X86ISD::ADD;
22787 Cond = X86::COND_O;
22790 BaseOp = X86ISD::ADD;
22791 Cond = X86::COND_B;
22794 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22795 // set CF, so we can't do this for USUBO.
22796 if (isOneConstant(RHS)) {
22797 BaseOp = X86ISD::DEC;
22798 Cond = X86::COND_O;
22801 BaseOp = X86ISD::SUB;
22802 Cond = X86::COND_O;
22805 BaseOp = X86ISD::SUB;
22806 Cond = X86::COND_B;
22809 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22810 Cond = X86::COND_O;
22812 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22813 if (N->getValueType(0) == MVT::i8) {
22814 BaseOp = X86ISD::UMUL8;
22815 Cond = X86::COND_O;
22818 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22819 MVT::i32);
22820 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22822 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22824 if (N->getValueType(1) == MVT::i1)
22825 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22827 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22831 // Also sets EFLAGS.
22832 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22833 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22835 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22837 if (N->getValueType(1) == MVT::i1)
22838 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22840 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22843 /// Returns true if the operand type is exactly twice the native width, and
22844 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22845 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22846 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22847 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22848 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22850 if (OpWidth == 64)
22851 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22852 else if (OpWidth == 128)
22853 return Subtarget.hasCmpxchg16b();
22858 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22859 return needsCmpXchgNb(SI->getValueOperand()->getType());
22862 // Note: this turns large loads into lock cmpxchg8b/16b.
22863 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22864 TargetLowering::AtomicExpansionKind
22865 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22866 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22867 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22868 : AtomicExpansionKind::None;
22871 TargetLowering::AtomicExpansionKind
22872 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22873 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22874 Type *MemType = AI->getType();
22876 // If the operand is too big, we must see if cmpxchg8/16b is available
22877 // and default to library calls otherwise.
22878 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22879 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22880 : AtomicExpansionKind::None;
22883 AtomicRMWInst::BinOp Op = AI->getOperation();
22886 llvm_unreachable("Unknown atomic operation");
22887 case AtomicRMWInst::Xchg:
22888 case AtomicRMWInst::Add:
22889 case AtomicRMWInst::Sub:
22890 // It's better to use xadd, xsub or xchg for these in all cases.
22891 return AtomicExpansionKind::None;
22892 case AtomicRMWInst::Or:
22893 case AtomicRMWInst::And:
22894 case AtomicRMWInst::Xor:
22895 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22896 // prefix to a normal instruction for these operations.
22897 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22898 : AtomicExpansionKind::None;
22899 case AtomicRMWInst::Nand:
22900 case AtomicRMWInst::Max:
22901 case AtomicRMWInst::Min:
22902 case AtomicRMWInst::UMax:
22903 case AtomicRMWInst::UMin:
22904 // These always require a non-trivial set of data operations on x86. We must
22905 // use a cmpxchg loop.
22906 return AtomicExpansionKind::CmpXChg;
22911 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22912 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22913 Type *MemType = AI->getType();
22914 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22915 // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces an mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;
22920 auto Builder = IRBuilder<>(AI);
22921 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22922 auto SSID = AI->getSyncScopeID();
22923 // We must restrict the ordering to avoid generating loads with Release or
22924 // ReleaseAcquire orderings.
22925 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22926 auto Ptr = AI->getPointerOperand();
22928 // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
22937 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22938 // lowered to just a load without a fence. A mfence flushes the store buffer,
22939 // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
  // whether it is required otherwise; we might be able to be more aggressive
  // about relaxed idempotent rmw. In practice, they do not look useful, so we
  // don't try to be especially clever.
22944 if (SSID == SyncScope::SingleThread)
22945 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
22950 // FIXME: it might make sense to use a locked operation here but on a
22951 // different cache-line to prevent cache-line bouncing. In practice it
22952 // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22958 Builder.CreateCall(MFence, {});
22960 // Finally we can emit the atomic load.
22961 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22962 AI->getType()->getPrimitiveSizeInBits());
22963 Loaded->setAtomic(Order, SSID);
22964 AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
22969 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
22972 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22973 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22974 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
22975 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22977 // The only fence that needs an instruction is a sequentially-consistent
22978 // cross-thread fence.
22979 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22980 FenceSSID == SyncScope::System) {
22981 if (Subtarget.hasMFence())
22982 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
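    // Without MFENCE (pre-SSE2), fall back to an idempotent LOCKed
    // read-modify-write on the stack, which also serializes the store
    // buffer. Illustratively, the node built below assembles to roughly:
    //   lock orl %reg, (%esp)    ; %reg holds 0, so memory is unchanged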
22984 SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),     // Base
      DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),            // Index
      DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),            // Segment.
      Zero,
      Chain
    };

    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
23003 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23004 SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
23009 switch(T.SimpleTy) {
23010 default: llvm_unreachable("Invalid value type!");
23011 case MVT::i8: Reg = X86::AL; size = 1; break;
23012 case MVT::i16: Reg = X86::AX; size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
23019 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23020 Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
23026 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23027 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);
  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23033 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23034 MVT::i32, cpOut.getValue(2));
23035 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23037 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23038 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
23043 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23044 SelectionDAG &DAG) {
23045 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23046 MVT DstVT = Op.getSimpleValueType();
23048 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23049 SrcVT == MVT::i64) {
23050 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23051 if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
23060 if (SrcVT.isVector()) {
23061 NumElts = SrcVT.getVectorNumElements();
23062 SVT = SrcVT.getVectorElementType();
      // Widen the input vector in the case of MVT::v2i32.
23065 // Example: from MVT::v2i32 to MVT::v4i32.
23066 for (unsigned i = 0, e = NumElts; i != e; ++i)
23067 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
23070 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23071 "Unexpected source type in LowerBITCAST");
23072 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23073 DAG.getIntPtrConstant(0, dl)));
23074 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
    }
23079 // Explicitly mark the extra elements as Undef.
23080 Elts.append(NumElts, DAG.getUNDEF(SVT));
23082 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23083 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23084 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23085 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }
23089 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23090 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23091 assert((DstVT == MVT::i64 ||
23092 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23093 "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
23106 /// Compute the horizontal sum of bytes in V for the elements of VT.
23108 /// Requires V to be a byte vector and VT to be an integer vector type with
23109 /// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
23112 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23113 const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
23116 MVT ByteVecVT = V.getSimpleValueType();
23117 MVT EltVT = VT.getVectorElementType();
23118 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23119 "Expected value to have byte element type.");
23120 assert(EltVT != MVT::i8 &&
23121 "Horizontal byte sum only makes sense for wider elements!");
23122 unsigned VecSize = VT.getSizeInBits();
23123 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
23127 if (EltVT == MVT::i64) {
23128 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23129 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23130 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }
23134 if (EltVT == MVT::i32) {
23135 // We unpack the low half and high half into i32s interleaved with zeros so
23136 // that we can use PSADBW to horizontally sum them. The most useful part of
23137 // this is that it lines up the results of two PSADBW instructions to be
23138 // two v2i64 vectors which concatenated are the 4 population counts. We can
23139 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23140 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23141 SDValue V32 = DAG.getBitcast(VT, V);
23142 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23143 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23145 // Do the horizontal sums into two v2i64s.
23146 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23147 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23148 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23149 DAG.getBitcast(ByteVecVT, Low), Zeros);
23150 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23151 DAG.getBitcast(ByteVecVT, High), Zeros);
23153 // Merge them together.
23154 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23155 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23156 DAG.getBitcast(ShortVecVT, Low),
23157 DAG.getBitcast(ShortVecVT, High));
    return DAG.getBitcast(VT, V);
  }
23162 // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Don't know how to handle this element type");
23165 // To obtain pop count for each i16 element starting from the pop count for
23166 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s because an i8 vector shift
  // isn't directly supported.
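  //
  // Worked example for one i16 lane (illustrative): if the byte-wise counts
  // are 0x0203 (high byte has 2 bits set, low byte has 3), then
  //   shl i16 by 8:  0x0203 -> 0x0300
  //   add as i8s:    0x0203 + 0x0300 -> 0x0503 (high byte now holds 2 + 3)
  //   srl i16 by 8:  0x0503 -> 0x0005, the total of 5 for the lane.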
23169 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23170 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23171 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23172 DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
23176 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23177 const X86Subtarget &Subtarget,
23178 SelectionDAG &DAG) {
23179 MVT VT = Op.getSimpleValueType();
23180 MVT EltVT = VT.getVectorElementType();
23181 unsigned VecSize = VT.getSizeInBits();
23183 // Implement a lookup table in register by using an algorithm based on:
23184 // http://wm.ite.pl/articles/sse-popcount.html
  // The general idea is that every nibble of every byte in the input vector
  // is an index into an in-register pre-computed pop count table. We then
  // split up the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the
  // lower nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
23194 // To obtain the pop count for elements != i8, we follow up with the same
23195 // approach and use additional tricks as described below.
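  //
  // Worked example for a single byte (illustrative): for 0xE5 = 0b11100101,
  // the high nibble 0xE indexes LUT[0xE] = 3 and the low nibble 0x5 indexes
  // LUT[0x5] = 2, so the summed pop count is 3 + 2 = 5.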
23197 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23198 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23199 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23200 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23202 int NumByteElts = VecSize / 8;
23203 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23204 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23205 SmallVector<SDValue, 64> LUTVec;
23206 for (int i = 0; i < NumByteElts; ++i)
23207 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23208 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain
  // the final pop count per i8 element.
23221 SDValue HighPopCnt =
23222 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23223 SDValue LowPopCnt =
23224 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23225 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
23233 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23234 const X86Subtarget &Subtarget,
23235 SelectionDAG &DAG) {
23236 MVT VT = Op.getSimpleValueType();
23237 assert(VT.is128BitVector() &&
23238 "Only 128-bit vector bitmath lowering supported.");
23240 int VecSize = VT.getSizeInBits();
23241 MVT EltVT = VT.getVectorElementType();
23242 int Len = EltVT.getSizeInBits();
23244 // This is the vectorized version of the "best" algorithm from
23245 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23246 // with a minor tweak to use a series of adds + shifts instead of vector
23247 // multiplications. Implemented for all integer vector types. We only use
23248 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23249 // much faster, even faster than using native popcnt instructions.
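  //
  // Worked example for one byte (illustrative), v = 0xFF:
  //   v = v - ((v >> 1) & 0x55):           0xFF - 0x55 = 0xAA (each pair = 2)
  //   v = (v & 0x33) + ((v >> 2) & 0x33):  0x22 + 0x22 = 0x44 (each nibble = 4)
  //   v = (v + (v >> 4)) & 0x0F:           0x48 & 0x0F = 0x08, i.e. 8 bits set.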
23251 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23252 MVT VT = V.getSimpleValueType();
23253 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
23257 MVT VT = V.getSimpleValueType();
23258 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };
23262 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23263 // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyway
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23273 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23274 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23276 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23277 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23278 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23279 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23280 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23282 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23283 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23284 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23285 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23287 // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // types.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
23298 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23299 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23300 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23301 SelectionDAG &DAG) {
23302 MVT VT = Op.getSimpleValueType();
23303 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23304 "Unknown CTPOP type to handle");
23305 SDLoc DL(Op.getNode());
23306 SDValue Op0 = Op.getOperand(0);
23308 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23309 if (Subtarget.hasVPOPCNTDQ()) {
23310 if (VT == MVT::v8i16) {
23311 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23312 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
    if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23316 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23317 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
  }
23322 if (!Subtarget.hasSSSE3()) {
23323 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23324 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }
23328 // Decompose 256-bit ops into smaller 128-bit ops.
23329 if (VT.is256BitVector() && !Subtarget.hasInt256())
23330 return Lower256IntUnary(Op, DAG);
23332 // Decompose 512-bit ops into smaller 256-bit ops.
23333 if (VT.is512BitVector() && !Subtarget.hasBWI())
23334 return Lower512IntUnary(Op, DAG);
  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
23339 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23340 SelectionDAG &DAG) {
23341 assert(Op.getSimpleValueType().isVector() &&
23342 "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
23346 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23347 MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
23353 if (!VT.isVector()) {
23354 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23355 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23356 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23357 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
23361 int NumElts = VT.getVectorNumElements();
23362 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23364 // Decompose 256-bit ops into smaller 128-bit ops.
23365 if (VT.is256BitVector())
23366 return Lower256IntUnary(Op, DAG);
23368 assert(VT.is128BitVector() &&
23369 "Only 128-bit vector bitreverse lowering supported.");
23371 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23372 // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
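  //
  // Illustrative control-byte encoding: each mask byte is
  // (SourceByte | (2 << 5)); bits 4:0 select one of the 32 input bytes
  // (16 + n picks byte n of the second operand) and op 2 in bits 7:5
  // bit-reverses the selected byte. For a v4i32 input, element 0 thus uses
  // source bytes 19, 18, 17, 16, folding the BSWAP into the shuffle.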
23375 SmallVector<SDValue, 16> MaskElts;
23376 for (int i = 0; i != NumElts; ++i) {
23377 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23378 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23379 int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }
23384 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23385 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}
23391 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23392 SelectionDAG &DAG) {
23393 if (Subtarget.hasXOP())
23394 return LowerBITREVERSE_XOP(Op, DAG);
23396 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23398 MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);
23402 unsigned NumElts = VT.getVectorNumElements();
23403 assert(VT.getScalarType() == MVT::i8 &&
23404 "Only byte vector BITREVERSE supported");
23406 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23407 if (VT.is256BitVector() && !Subtarget.hasInt256())
23408 return Lower256IntUnary(Op, DAG);
23410 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23411 // two nibbles and a PSHUFB lookup to find the bitreverse of each
23412 // 0-15 value (moved to the other nibble).
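  //
  // Worked example (illustrative) for input byte 0x6C = 0b01101100:
  //   low nibble  0xC -> LoLUT[0xC] = 0x30 (1100 reversed, placed high)
  //   high nibble 0x6 -> HiLUT[0x6] = 0x06 (0110 reversed, placed low)
  //   OR gives 0x36 = 0b00110110, the bit-reversed byte.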
23413 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23414 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23415 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23417 const int LoLUT[16] = {
23418 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23419 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23420 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23421 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23422 const int HiLUT[16] = {
23423 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23424 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23425 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23426 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23428 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23429 for (unsigned i = 0; i < NumElts; ++i) {
23430 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }
23434 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23435 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23436 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23437 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
23441 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23442 unsigned NewOpc = 0;
23443 switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }
23463 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23464 return DAG.getMemIntrinsicNode(
23465 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23466 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}
23470 /// Lower atomic_load_ops into LOCK-prefixed operations.
23471 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23472 const X86Subtarget &Subtarget) {
23473 SDValue Chain = N->getOperand(0);
23474 SDValue LHS = N->getOperand(1);
23475 SDValue RHS = N->getOperand(2);
23476 unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);
23480 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23481 // can only be lowered when the result is unused. They should have already
23482 // been transformed into a cmpxchg loop in AtomicExpand.
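  //
  // Illustrative lowerings (assumed-typical selections): with an unused
  // result,
  //   atomicrmw or i32* %p, i32 8  ->  lock orl $8, (%rdi)
  // whereas a used atomicrmw add selects LXADD, i.e. lock xadd.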
23483 if (N->hasAnyUseOfValue(0)) {
23484 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23485 // select LXADD if LOCK_SUB can't be selected.
23486 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23487 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23488 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23489 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23490 RHS, AN->getMemOperand());
23492 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23493 "Used AtomicRMW ops other than Add should have been expanded!");
23497 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23498 // RAUW the chain, but don't worry about the result, as it's unused.
23499 assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}
23504 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
23507 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23509 // Convert seq_cst store -> xchg
23510 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23511 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23512 // (The only way to get a 16-byte store is cmpxchg16b)
23513 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
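  //
  // For example (illustrative): a seq_cst "store atomic i32 %v, i32* %p"
  // becomes an exchange such as "xchgl %eax, (%rdi)"; xchg with memory is
  // implicitly LOCKed, which provides the required store-load barrier.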
23514 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23515 AtomicOrdering::SequentiallyConsistent ||
23516 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23517 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23518 cast<AtomicSDNode>(Node)->getMemoryVT(),
23519 Node->getOperand(0),
23520 Node->getOperand(1), Node->getOperand(2),
23521 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}
23528 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23529 SDNode *N = Op.getNode();
23530 MVT VT = N->getSimpleValueType(0);
23532 // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);
23539 // Set the carry flag.
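  // (Adding all-ones to the incoming carry materializes it into EFLAGS:
  // 1 + 0xFF...FF wraps to 0 and sets CF, while 0 + 0xFF...FF does not,
  // so CF ends up equal to the boolean carry consumed by ADC/SBB below.)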
23540 SDValue Carry = Op.getOperand(2);
23541 EVT CarryVT = Carry.getValueType();
23542 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23543 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23544 Carry, DAG.getConstant(NegOne, DL, CarryVT));
23546 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23547 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23548 Op.getOperand(1), Carry.getValue(1));
23550 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23551 if (N->getValueType(1) == MVT::i1)
23552 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
23557 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23558 SelectionDAG &DAG) {
23559 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23561 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23562 // which returns the values as { float, float } (in XMM0) or
23563 // { double, double } (which is returned in XMM0, XMM1).
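  //
  // Illustrative prototypes (an assumption about Darwin's entry points,
  // consistent with the return behavior described above):
  //   struct { double sinval, cosval; } __sincos_stret(double);
  //   struct { float  sinval, cosval; } __sincosf_stret(float);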
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
23566 EVT ArgVT = Arg.getValueType();
23567 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23569 TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
23575 Entry.IsZExt = false;
23576 Args.push_back(Entry);
23578 bool isF64 = ArgVT == MVT::f64;
23579 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23580 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23581 // the results are returned via SRet in memory.
23582 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23583 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23587 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23588 : (Type *)VectorType::get(ArgTy, 4);
23590 TargetLowering::CallLoweringInfo CLI(DAG);
23591 CLI.setDebugLoc(dl)
23592 .setChain(DAG.getEntryNode())
23593 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23595 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  if (isF64)
    // Returned in xmm0 and xmm1.
23599 return CallResult.first;
  // Returned in bits 0:31 and 32:63 of xmm0.
23602 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23603 CallResult.first, DAG.getIntPtrConstant(0, dl));
23604 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23605 CallResult.first, DAG.getIntPtrConstant(1, dl));
23606 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
23610 /// Widen a vector input to a vector of NVT. The
23611 /// input vector must have the same element type as NVT.
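/// For example (illustrative): widening a v2i32 value to v4i32 appends two
/// trailing elements, which are zeroes when FillWithZeroes is set and undef
/// otherwise.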
23612 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23613 bool FillWithZeroes = false) {
23614 // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;
23619 if (InOp.isUndef())
23620 return DAG.getUNDEF(NVT);
23622 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23623 "input and widen element type must match");
23625 unsigned InNumElts = InVT.getVectorNumElements();
23626 unsigned WidenNumElts = NVT.getVectorNumElements();
23627 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23628 "Unexpected request for vector widening");
23631 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23632 InOp.getNumOperands() == 2) {
23633 SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
23641 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23642 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23643 SmallVector<SDValue, 16> Ops;
23644 for (unsigned i = 0; i < InNumElts; ++i)
23645 Ops.push_back(InOp.getOperand(i));
23647 EVT EltVT = InOp.getOperand(0).getValueType();
23649 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23650 DAG.getUNDEF(EltVT);
23651 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23652 Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
                                     DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
23661 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23662 SelectionDAG &DAG) {
23663 assert(Subtarget.hasAVX512() &&
23664 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23666 // X86 scatter kills mask register, so its type should be added to
23667 // the list of return values.
23668 // If the "scatter" has 2 return values, it is already handled.
  if (Op.getNode()->getNumValues() == 2)
    return Op;
23672 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23673 SDValue Src = N->getValue();
23674 MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);
23678 SDValue NewScatter;
23679 SDValue Index = N->getIndex();
23680 SDValue Mask = N->getMask();
23681 SDValue Chain = N->getChain();
23682 SDValue BasePtr = N->getBasePtr();
23683 MVT MemVT = N->getMemoryVT().getSimpleVT();
23684 MVT IndexVT = Index.getSimpleValueType();
23685 MVT MaskVT = Mask.getSimpleValueType();
23687 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23688 // The v2i32 value was promoted to v2i64.
23689 // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with a shuffle that keeps the low 32 bits of each 64-bit lane.
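    // Illustratively: on little-endian x86, bitcasting <2 x i64> to
    // <4 x i32> gives lanes [lo0, hi0, lo1, hi1]; the {0, 2, -1, -1}
    // shuffle below keeps lo0 and lo1, recovering the original v2i32
    // (padded with undef).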
23692 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23693 "Unexpected memory type");
23694 int ShuffleMask[] = {0, 2, -1, -1};
23695 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23696 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23697 // Now we have 4 elements instead of 2.
23698 // Expand the index.
23699 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23700 Index = ExtendToType(Index, NewIndexVT, DAG);
23702 // Expand the mask with zeroes
23703 // Mask may be <2 x i64> or <2 x i1> at this moment
23704 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23705 "Unexpected mask type");
23706 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
  }
23711 unsigned NumElts = VT.getVectorNumElements();
23712 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23713 !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // must be 512 bits wide. If both the index and data are currently
    // 256-bit, but the vector contains 8 elements, we just sign-extend the
    // index.
23717 if (IndexVT == MVT::v8i32)
23718 // Just extend index
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elts in scatter is 8
      NumElts = 8;
      // Index
23724 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23725 // Use original index here, do not modify the index twice
23726 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23727 if (IndexVT.getScalarType() == MVT::i32)
23728 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      // At this point we have the promoted mask operand
23732 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23733 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23734 // Use the original mask here, do not modify the mask twice
23735 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23737 // The value that should be stored
23738 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
23742 // If the mask is "wide" at this point - truncate it to i1 vector
23743 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23744 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23746 // The mask is killed by scatter, add it to the values
23747 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23748 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23749 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23750 N->getMemOperand());
23751 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}
23755 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23756 SelectionDAG &DAG) {
23758 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23759 MVT VT = Op.getSimpleValueType();
23760 MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);
23764 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23765 "Expanding masked load is supported on AVX-512 target only!");
23767 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23768 "Expanding masked load is supported for 32 and 64-bit types only!");
23770 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. Expanding loads of these types are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;
23775 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23776 "Cannot lower masked load op.");
23778 assert((ScalarVT.getSizeInBits() >= 32 ||
23779 (Subtarget.hasBWI() &&
23780 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23781 "Unsupported masked load op.");
23783 // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
23785 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23786 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23787 SDValue Src0 = N->getSrc0();
23788 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23790 // Mask element has to be i1.
23791 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23792 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23793 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23795 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23797 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23798 if (MaskEltTy != MVT::i1)
23799 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23800 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23801 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23802 N->getBasePtr(), Mask, Src0,
23803 N->getMemoryVT(), N->getMemOperand(),
23804 N->getExtensionType(),
23805 N->isExpandingLoad());
  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}
23814 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23815 SelectionDAG &DAG) {
23816 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23817 SDValue DataToStore = N->getValue();
23818 MVT VT = DataToStore.getSimpleValueType();
23819 MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);
23823 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23824 "Expanding masked load is supported on AVX-512 target only!");
23826 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23827 "Expanding masked load is supported for 32 and 64-bit types only!");
  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;
23833 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23834 "Cannot lower masked store op.");
23836 assert((ScalarVT.getSizeInBits() >= 32 ||
23837 (Subtarget.hasBWI() &&
23838 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23839 "Unsupported masked store op.");
23841 // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
23843 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23844 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23846 // Mask element has to be i1.
23847 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23848 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23849 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23851 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23853 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23854 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23855 if (MaskEltTy != MVT::i1)
23856 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23857 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23858 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23859 Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}
23863 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23864 SelectionDAG &DAG) {
23865 assert(Subtarget.hasAVX512() &&
23866 "MGATHER/MSCATTER are supported on AVX-512 arch only");
  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
23870 MVT VT = Op.getSimpleValueType();
23871 SDValue Index = N->getIndex();
23872 SDValue Mask = N->getMask();
23873 SDValue Src0 = N->getValue();
23874 MVT IndexVT = Index.getSimpleValueType();
23875 MVT MaskVT = Mask.getSimpleValueType();
23877 unsigned NumElts = VT.getVectorNumElements();
23878 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23880 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23881 !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors. Either the data or the index
    // must be 512 bits wide. If both the index and data are currently
    // 256-bit, but the vector contains 8 elements, we just sign-extend the
    // index.
23885 if (NumElts == 8) {
23886 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23887 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23888 N->getOperand(3), Index };
      DAG.UpdateNodeOperands(N, Ops);
      return Op;
    }
    // Minimal number of elements in Gather
    NumElts = 8;
    // Index
23896 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23897 Index = ExtendToType(Index, NewIndexVT, DAG);
23898 if (IndexVT.getScalarType() == MVT::i32)
23899 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23902 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point we have the promoted mask operand
23904 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23905 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23906 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23907 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23909 // The pass-through value
23910 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23911 Src0 = ExtendToType(Src0, NewVT, DAG);
23913 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23914 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23915 N->getMemoryVT(), dl, Ops,
23916 N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
23923 if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
    // There is a special case when the return type v2i32 is illegal and
    // the type legalizer extended it to v2i64. Without this conversion we end up
23926 // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
23927 // In order to avoid this situation, we'll build an X86 specific Gather node
23928 // with index v2i64 and value type v4i32.
23929 assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23930 "Unexpected type in masked gather");
23931 Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
23932 DAG.getBitcast(MVT::v4i32, Src0),
23933 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
    // The mask should match the destination type. Extending the mask with
    // zeroes is not necessary since the instruction itself reads only two
    // values from memory.
23937 Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
23938 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23939 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23940 DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
23941 N->getMemOperand());
23943 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
23944 NewGather.getValue(0), DAG);
23945 SDValue RetOps[] = { Sext, NewGather.getValue(1) };
    return DAG.getMergeValues(RetOps, dl);
  }
23948 if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
23949 // This transformation is for optimization only.
    // The type legalizer extended the mask and index to a 4-element vector
    // in order to match the requirements of the common gather node: the
    // vector widths of index and value must be the same. The X86 gather node
    // allows mismatched vector widths in order to select the more optimal
    // instruction at the instruction selection stage.
23955 assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23956 "Unexpected type in masked gather");
23957 if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
23958 ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
23959 Index.getOpcode() == ISD::CONCAT_VECTORS &&
23960 Index.getOperand(1).isUndef()) {
23961 Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
      Index = Index.getOperand(0);
    }
23965 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23966 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23967 DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
23968 N->getMemOperand());
23970 SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
    return DAG.getMergeValues(RetOps, dl);
  }
  return Op;
}
23977 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23978 SelectionDAG &DAG) const {
23979 // TODO: Eventually, the lowering of these nodes should be informed by or
23980 // deferred to the GC strategy for the function in which they appear. For
23981 // now, however, they must be lowered to something. Since they are logically
23982 // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
23985 SmallVector<SDValue, 2> Ops;
23987 Ops.push_back(Op.getOperand(0));
23988 if (Op->getGluedNode())
23989 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23992 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
  return NOOP;
}
23998 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23999 SelectionDAG &DAG) const {
24000 // TODO: Eventually, the lowering of these nodes should be informed by or
24001 // deferred to the GC strategy for the function in which they appear. For
24002 // now, however, they must be lowered to something. Since they are logically
24003 // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
24006 SmallVector<SDValue, 2> Ops;
24008 Ops.push_back(Op.getOperand(0));
24009 if (Op->getGluedNode())
24010 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24013 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
  return NOOP;
}
24019 /// Provide custom lowering hooks for some operations.
24020 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24021 switch (Op.getOpcode()) {
24022 default: llvm_unreachable("Should not custom lower this!");
24023 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24024 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24025 return LowerCMP_SWAP(Op, Subtarget, DAG);
24026 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24027 case ISD::ATOMIC_LOAD_ADD:
24028 case ISD::ATOMIC_LOAD_SUB:
24029 case ISD::ATOMIC_LOAD_OR:
24030 case ISD::ATOMIC_LOAD_XOR:
24031 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24032 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24033 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24034 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24035 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24036 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24037 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24038 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24039 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24040 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24041 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24042 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24043 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24044 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24045 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24046 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24047 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24048 case ISD::SHL_PARTS:
24049 case ISD::SRA_PARTS:
24050 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24051 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24052 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24053 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24054 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24055 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24056 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24057 case ISD::ZERO_EXTEND_VECTOR_INREG:
24058 case ISD::SIGN_EXTEND_VECTOR_INREG:
24059 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24060 case ISD::FP_TO_SINT:
24061 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24062 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24063 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
24066 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24067 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24068 case ISD::SETCC: return LowerSETCC(Op, DAG);
24069 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24070 case ISD::SELECT: return LowerSELECT(Op, DAG);
24071 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24072 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24073 case ISD::VASTART: return LowerVASTART(Op, DAG);
24074 case ISD::VAARG: return LowerVAARG(Op, DAG);
24075 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24076 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
24077 case ISD::INTRINSIC_VOID:
24078 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24079 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24080 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24081 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24082 case ISD::FRAME_TO_ARGS_OFFSET:
24083 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24084 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24085 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24086 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24087 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24088 case ISD::EH_SJLJ_SETUP_DISPATCH:
24089 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24090 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24091 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24092 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
24097 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
24100 case ISD::UMUL_LOHI:
24101 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
24113 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24114 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24115 case ISD::ADDCARRY:
24116 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:                return LowerADD_SUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
24123 case ISD::ABS: return LowerABS(Op, DAG);
24124 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24125 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24126 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24127 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24128 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24129 case ISD::GC_TRANSITION_START:
24130 return LowerGC_TRANSITION_START(Op, DAG);
24131 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
  }
}
24136 /// Places new result values for the node in Results (their number
24137 /// and types must exactly match those of the original return values of
24138 /// the node), or leaves Results empty, which indicates that the node is not
24139 /// to be custom lowered after all.
24140 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24141 SmallVectorImpl<SDValue> &Results,
24142 SelectionDAG &DAG) const {
24143 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
  if (!Res.getNode())
    return;
24148 assert((N->getNumValues() <= Res->getNumValues()) &&
24149 "Lowering returned the wrong number of results!");
  // Place new result values based on N's result number.
  // In some cases (LowerSINT_TO_FP for example) Res has more result values
  // than the original node, and the chain (the last value) should be dropped.
24154 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
24160 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24161 SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
24164 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24165 switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
24168 case X86ISD::AVG: {
24169 // Legalize types for X86ISD::AVG by expanding vectors.
24170 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24172 auto InVT = N->getValueType(0);
24173 auto InVTSize = InVT.getSizeInBits();
24174 const unsigned RegSize =
24175 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24176 assert((Subtarget.hasBWI() || RegSize < 512) &&
24177 "512-bit vector requires AVX512BW");
24178 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24179 "256-bit vector requires AVX2");
24181 auto ElemVT = InVT.getVectorElementType();
24182 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24183 RegSize / ElemVT.getSizeInBits());
24184 assert(RegSize % InVT.getSizeInBits() == 0);
24185 unsigned NumConcat = RegSize / InVT.getSizeInBits();
24187 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24188 Ops[0] = N->getOperand(0);
24189 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24190 Ops[0] = N->getOperand(1);
24191 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24193 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24194 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
    return;
  }
24198 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
24202 case X86ISD::FMAX: {
24203 EVT VT = N->getValueType(0);
24204 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24205 SDValue UNDEF = DAG.getUNDEF(VT);
24206 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24207 N->getOperand(0), UNDEF);
24208 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24209 N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
24219 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
    Results.push_back(V);
    return;
  }
24223 case ISD::FP_TO_SINT:
24224 case ISD::FP_TO_UINT: {
24225 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24227 if (N->getValueType(0) == MVT::v2i32) {
24228 assert((IsSigned || Subtarget.hasAVX512()) &&
24229 "Can only handle signed conversion without AVX512");
24230 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24231 SDValue Src = N->getOperand(0);
24232 if (Src.getValueType() == MVT::v2f64) {
24233 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24234 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24235 : X86ISD::CVTTP2UI,
24236 dl, MVT::v4i32, Src);
24237 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }
24241 if (Src.getValueType() == MVT::v2f32) {
24242 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24243 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24244 DAG.getUNDEF(MVT::v2f32));
24245 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24246 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24247 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }
24252 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
      // so early out here.
      return;
    }
24257 std::pair<SDValue,SDValue> Vals =
24258 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24259 SDValue FIST = Vals.first, StackSlot = Vals.second;
24260 if (FIST.getNode()) {
24261 EVT VT = N->getValueType(0);
24262 // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(
            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
      else
        Results.push_back(FIST);
    }
    return;
  }
24271 case ISD::SINT_TO_FP: {
24272 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24273 SDValue Src = N->getOperand(0);
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
      return;
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
    return;
  }
24279 case ISD::UINT_TO_FP: {
24280 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24281 EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
      return;
24284 SDValue Src = N->getOperand(0);
24285 EVT SrcVT = Src.getValueType();
24286 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
      return;
    }
    if (SrcVT != MVT::v2i32)
      return;
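    // The code below uses the standard bias trick (a sketch of the known
    // technique): OR the zero-extended 32-bit value into the mantissa of
    // 2^52 = 0x4330000000000000, producing 2^52 + x exactly; subtracting
    // 2^52 then yields x converted to double.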
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
    SDValue VBias =
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24295 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24296 DAG.getBitcast(MVT::v2i64, VBias));
24297 Or = DAG.getBitcast(MVT::v2f64, Or);
24298 // TODO: Are there any fast-math-flags to propagate here?
24299 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
    return;
  }
24303 case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
24306 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
24310 case ISD::FP_EXTEND: {
24311 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24312 // No other ValueType for FP_EXTEND should reach this point.
24313 assert(N->getValueType(0) == MVT::v2f32 &&
24314 "Do not know how to legalize this Node");
24317 case ISD::INTRINSIC_W_CHAIN: {
24318 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24320 default : llvm_unreachable("Do not know how to custom type "
24321 "legalize this intrinsic operation!");
24322 case Intrinsic::x86_rdtsc:
24323 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24325 case Intrinsic::x86_rdtscp:
24326 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24328 case Intrinsic::x86_rdpmc:
24329 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24331 case Intrinsic::x86_xgetbv:
24332 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24335 case ISD::INTRINSIC_WO_CHAIN: {
24336 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24337 Results.push_back(V);
24340 case ISD::READCYCLECOUNTER: {
24341 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24344 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24345 EVT T = N->getValueType(0);
24346 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24347 bool Regs64bit = T == MVT::i128;
24348 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
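// CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) against memory and, on a
// match, store ECX:EBX (RCX:RBX), so split the expected and new values
// into halves and route them through those register pairs.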
24349 SDValue cpInL, cpInH;
24350 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24351 DAG.getConstant(0, dl, HalfT));
24352 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24353 DAG.getConstant(1, dl, HalfT));
24354 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24355 Regs64bit ? X86::RAX : X86::EAX,
24356 cpInL, SDValue());
24357 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24358 Regs64bit ? X86::RDX : X86::EDX,
24359 cpInH, cpInL.getValue(1));
24360 SDValue swapInL, swapInH;
24361 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24362 DAG.getConstant(0, dl, HalfT));
24363 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24364 DAG.getConstant(1, dl, HalfT));
24365 swapInH =
24366 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24367 swapInH, cpInH.getValue(1));
24368 // If the current function needs the base pointer, RBX,
24369 // we shouldn't use cmpxchg directly.
24370 // The lowering of that instruction will clobber that register,
24371 // and since RBX is then a reserved register, the register
24372 // allocator will not make sure its value is properly saved
24373 // and restored around this live range.
24374 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24375 SDValue Result;
24376 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24377 unsigned BasePtr = TRI->getBaseRegister();
24378 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24379 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24380 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24381 // ISel prefers the LCMPXCHG64 variant.
24382 // If that assert breaks, that means it is not the case anymore,
24383 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24384 // not just EBX. This is a matter of accepting i64 input for that
24385 // pseudo, and restoring into the register of the right width
24386 // in the expand pseudo. Everything else should just work.
24387 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24388 "Saving only half of the RBX");
24389 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24390 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24391 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24392 Regs64bit ? X86::RBX : X86::EBX,
24393 HalfT, swapInH.getValue(1));
24394 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24396 /*Glue*/ RBXSave.getValue(2)};
24397 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24398 } else {
24399 unsigned Opcode =
24400 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24401 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24402 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24403 swapInH.getValue(1));
24404 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24405 swapInL.getValue(1)};
24406 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24408 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24409 Regs64bit ? X86::RAX : X86::EAX,
24410 HalfT, Result.getValue(1));
24411 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24412 Regs64bit ? X86::RDX : X86::EDX,
24413 HalfT, cpOutL.getValue(2));
24414 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24416 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24417 MVT::i32, cpOutH.getValue(2));
24418 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24419 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24421 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24422 Results.push_back(Success);
24423 Results.push_back(EFLAGS.getValue(1));
24426 case ISD::ATOMIC_SWAP:
24427 case ISD::ATOMIC_LOAD_ADD:
24428 case ISD::ATOMIC_LOAD_SUB:
24429 case ISD::ATOMIC_LOAD_AND:
24430 case ISD::ATOMIC_LOAD_OR:
24431 case ISD::ATOMIC_LOAD_XOR:
24432 case ISD::ATOMIC_LOAD_NAND:
24433 case ISD::ATOMIC_LOAD_MIN:
24434 case ISD::ATOMIC_LOAD_MAX:
24435 case ISD::ATOMIC_LOAD_UMIN:
24436 case ISD::ATOMIC_LOAD_UMAX:
24437 case ISD::ATOMIC_LOAD: {
24438 // Delegate to generic TypeLegalization. Situations we can really handle
24439 // should have already been dealt with by AtomicExpandPass.cpp.
24442 case ISD::BITCAST: {
24443 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24444 EVT DstVT = N->getValueType(0);
24445 EVT SrcVT = N->getOperand(0)->getValueType(0);
24447 if (SrcVT != MVT::f64 ||
24448 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24451 unsigned NumElts = DstVT.getVectorNumElements();
24452 EVT SVT = DstVT.getVectorElementType();
24453 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
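// Move the f64 into the low element of a v2f64, reinterpret that as a
// vector of twice as many integer elements, and then either keep the wide
// vector (widening legalization) or extract the low NumElts elements.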
24454 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24455 MVT::v2f64, N->getOperand(0));
24456 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24458 if (ExperimentalVectorWideningLegalization) {
24459 // If we are legalizing vectors by widening, we already have the desired
24460 // legal vector type, just return it.
24461 Results.push_back(ToVecInt);
24465 SmallVector<SDValue, 8> Elts;
24466 for (unsigned i = 0, e = NumElts; i != e; ++i)
24467 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24468 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24470 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24475 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24476 switch ((X86ISD::NodeType)Opcode) {
24477 case X86ISD::FIRST_NUMBER: break;
24478 case X86ISD::BSF: return "X86ISD::BSF";
24479 case X86ISD::BSR: return "X86ISD::BSR";
24480 case X86ISD::SHLD: return "X86ISD::SHLD";
24481 case X86ISD::SHRD: return "X86ISD::SHRD";
24482 case X86ISD::FAND: return "X86ISD::FAND";
24483 case X86ISD::FANDN: return "X86ISD::FANDN";
24484 case X86ISD::FOR: return "X86ISD::FOR";
24485 case X86ISD::FXOR: return "X86ISD::FXOR";
24486 case X86ISD::FILD: return "X86ISD::FILD";
24487 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24488 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24489 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24490 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24491 case X86ISD::FLD: return "X86ISD::FLD";
24492 case X86ISD::FST: return "X86ISD::FST";
24493 case X86ISD::CALL: return "X86ISD::CALL";
24494 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24495 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24496 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24497 case X86ISD::BT: return "X86ISD::BT";
24498 case X86ISD::CMP: return "X86ISD::CMP";
24499 case X86ISD::COMI: return "X86ISD::COMI";
24500 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24501 case X86ISD::CMPM: return "X86ISD::CMPM";
24502 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24503 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24504 case X86ISD::SETCC: return "X86ISD::SETCC";
24505 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24506 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24507 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24508 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24509 case X86ISD::CMOV: return "X86ISD::CMOV";
24510 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24511 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24512 case X86ISD::IRET: return "X86ISD::IRET";
24513 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24514 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24515 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24516 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24517 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24518 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24519 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24520 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24521 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24522 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24523 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24524 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24525 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24526 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24527 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24528 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24529 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24530 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24531 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24532 case X86ISD::HADD: return "X86ISD::HADD";
24533 case X86ISD::HSUB: return "X86ISD::HSUB";
24534 case X86ISD::FHADD: return "X86ISD::FHADD";
24535 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24536 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24537 case X86ISD::FMAX: return "X86ISD::FMAX";
24538 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24539 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24540 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24541 case X86ISD::FMIN: return "X86ISD::FMIN";
24542 case X86ISD::FMINS: return "X86ISD::FMINS";
24543 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24544 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24545 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24546 case X86ISD::FMINC: return "X86ISD::FMINC";
24547 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24548 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24549 case X86ISD::FRCP: return "X86ISD::FRCP";
24550 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24551 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24552 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24553 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24554 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24555 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24556 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24557 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24558 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24559 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24560 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24561 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24562 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24563 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24564 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24565 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24566 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24567 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24568 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24569 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24570 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24571 case X86ISD::LADD: return "X86ISD::LADD";
24572 case X86ISD::LSUB: return "X86ISD::LSUB";
24573 case X86ISD::LOR: return "X86ISD::LOR";
24574 case X86ISD::LXOR: return "X86ISD::LXOR";
24575 case X86ISD::LAND: return "X86ISD::LAND";
24576 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24577 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24578 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24579 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24580 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24581 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24582 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24583 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24584 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24585 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24586 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24587 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24588 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24589 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24590 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24591 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24592 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24593 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24594 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24595 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24596 case X86ISD::VSHL: return "X86ISD::VSHL";
24597 case X86ISD::VSRL: return "X86ISD::VSRL";
24598 case X86ISD::VSRA: return "X86ISD::VSRA";
24599 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24600 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24601 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24602 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24603 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24604 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24605 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24606 case X86ISD::CMPP: return "X86ISD::CMPP";
24607 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24608 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24609 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24610 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24611 case X86ISD::ADD: return "X86ISD::ADD";
24612 case X86ISD::SUB: return "X86ISD::SUB";
24613 case X86ISD::ADC: return "X86ISD::ADC";
24614 case X86ISD::SBB: return "X86ISD::SBB";
24615 case X86ISD::SMUL: return "X86ISD::SMUL";
24616 case X86ISD::UMUL: return "X86ISD::UMUL";
24617 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24618 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24619 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24620 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24621 case X86ISD::INC: return "X86ISD::INC";
24622 case X86ISD::DEC: return "X86ISD::DEC";
24623 case X86ISD::OR: return "X86ISD::OR";
24624 case X86ISD::XOR: return "X86ISD::XOR";
24625 case X86ISD::AND: return "X86ISD::AND";
24626 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24627 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24628 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24629 case X86ISD::PTEST: return "X86ISD::PTEST";
24630 case X86ISD::TESTP: return "X86ISD::TESTP";
24631 case X86ISD::TESTM: return "X86ISD::TESTM";
24632 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24633 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24634 case X86ISD::KTEST: return "X86ISD::KTEST";
24635 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24636 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24637 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24638 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24639 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24640 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24641 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24642 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24643 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24644 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24645 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24646 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24647 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24648 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24649 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24650 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24651 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24652 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24653 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24654 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24655 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24656 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24657 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24658 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24659 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24660 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24661 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24662 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24663 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24664 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24665 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24666 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24667 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24668 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24669 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24670 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24671 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24672 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24673 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24674 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24675 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24676 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24677 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24678 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24679 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24680 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24681 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24682 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24683 case X86ISD::SAHF: return "X86ISD::SAHF";
24684 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24685 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24686 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24687 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24688 case X86ISD::VPROT: return "X86ISD::VPROT";
24689 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24690 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24691 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24692 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24693 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24694 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24695 case X86ISD::FMADD: return "X86ISD::FMADD";
24696 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24697 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24698 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24699 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24700 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24701 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24702 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24703 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24704 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24705 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24706 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24707 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24708 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24709 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24710 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24711 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24712 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24713 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24714 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24715 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24716 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24717 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24718 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24719 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24720 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24721 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24722 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24723 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24724 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24725 case X86ISD::XTEST: return "X86ISD::XTEST";
24726 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24727 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24728 case X86ISD::SELECT: return "X86ISD::SELECT";
24729 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24730 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24731 case X86ISD::RCP28: return "X86ISD::RCP28";
24732 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24733 case X86ISD::EXP2: return "X86ISD::EXP2";
24734 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24735 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24736 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24737 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24738 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24739 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24740 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24741 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24742 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24743 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24744 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24745 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24746 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24747 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24748 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24749 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24750 case X86ISD::ADDS: return "X86ISD::ADDS";
24751 case X86ISD::SUBS: return "X86ISD::SUBS";
24752 case X86ISD::AVG: return "X86ISD::AVG";
24753 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24754 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24755 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24756 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24757 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24758 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24759 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24760 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24761 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24762 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24763 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24764 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24765 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24766 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24767 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24768 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24769 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24770 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24771 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24772 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24773 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24774 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24775 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24776 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24777 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24778 case X86ISD::MGATHER: return "X86ISD::MGATHER";
24783 /// Return true if the addressing mode represented by AM is legal for this
24784 /// target, for a load/store of the specified type.
24785 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24786 const AddrMode &AM, Type *Ty,
24787 unsigned AS) const {
24788 // X86 supports extremely general addressing modes.
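// The general form is [BaseReg + Scale*IndexReg + Disp], where Scale is
// 1, 2, 4, or 8.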
24789 CodeModel::Model M = getTargetMachine().getCodeModel();
24791 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24792 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24796 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24798 // If a reference to this global requires an extra load, we can't fold it.
24799 if (isGlobalStubReference(GVFlags))
24802 // If BaseGV requires a register for the PIC base, we cannot also have a
24803 // BaseReg specified.
24804 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24807 // If lower 4G is not available, then we must use rip-relative addressing.
24808 if ((M != CodeModel::Small || isPositionIndependent()) &&
24809 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24813 switch (AM.Scale) {
24819 // These scales always work.
24824 // These scales are formed with basereg+scalereg. Only accept if there is
24829 default: // Other stuff never works.
24836 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24837 unsigned Bits = Ty->getScalarSizeInBits();
24839 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24840 // particularly cheaper than those without.
24844 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24845 // variable shifts just as cheap as scalar ones.
24846 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24849 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24850 // fully general vector.
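// Truncating a wider integer type to a narrower one is free on x86; it is
// simply a use of the value's low subregister.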
24854 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24855 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24857 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24858 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24859 return NumBits1 > NumBits2;
24862 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24863 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24866 if (!isTypeLegal(EVT::getEVT(Ty1)))
24869 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24871 // Assuming the caller doesn't have a zeroext or signext return parameter,
24872 // truncation all the way down to i1 is valid.
24876 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24877 return isInt<32>(Imm);
24880 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24881 // Can also use sub to handle negated immediates.
24882 return isInt<32>(Imm);
24885 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24886 if (!VT1.isInteger() || !VT2.isInteger())
24888 unsigned NumBits1 = VT1.getSizeInBits();
24889 unsigned NumBits2 = VT2.getSizeInBits();
24890 return NumBits1 > NumBits2;
24893 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24894 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24895 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24898 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24899 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24900 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24903 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24904 EVT VT1 = Val.getValueType();
24905 if (isZExtFree(VT1, VT2))
24908 if (Val.getOpcode() != ISD::LOAD)
24911 if (!VT1.isSimple() || !VT1.isInteger() ||
24912 !VT2.isSimple() || !VT2.isInteger())
24915 switch (VT1.getSimpleVT().SimpleTy) {
24920 // X86 has 8, 16, and 32-bit zero-extending loads.
24927 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24930 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24931 if (!Subtarget.hasAnyFMA())
24934 VT = VT.getScalarType();
24936 if (!VT.isSimple())
24939 switch (VT.getSimpleVT().SimpleTy) {
24950 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24951 // i16 instructions are longer (0x66 prefix) and potentially slower.
24952 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24955 /// Targets can use this to indicate that they only support *some*
24956 /// VECTOR_SHUFFLE operations, those with specific masks.
24957 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24958 /// are assumed to be legal.
24960 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24962 if (!VT.isSimple())
24965 // Not for i1 vectors
24966 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24969 // Very little shuffling can be done for 64-bit vectors right now.
24970 if (VT.getSimpleVT().getSizeInBits() == 64)
24973 // We only care that the types being shuffled are legal. The lowering can
24974 // handle any possible shuffle mask that results.
24975 return isTypeLegal(VT.getSimpleVT());
24979 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24981 // Just delegate to the generic legality, clear masks aren't special.
24982 return isShuffleMaskLegal(Mask, VT);
24985 //===----------------------------------------------------------------------===//
24986 // X86 Scheduler Hooks
24987 //===----------------------------------------------------------------------===//
24989 /// Utility function to emit xbegin specifying the start of an RTM region.
24990 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24991 const TargetInstrInfo *TII) {
24992 DebugLoc DL = MI.getDebugLoc();
24994 const BasicBlock *BB = MBB->getBasicBlock();
24995 MachineFunction::iterator I = ++MBB->getIterator();
24997 // For the v = xbegin(), we generate
24998 //
24999 // thisMBB:
25000 //  xbegin fallMBB
25001 //
25002 // mainMBB:
25003 //  s0 = -1
25004 //
25005 // fallMBB:
25006 //  eax = # XABORT_DEF
25007 //  s1 = eax
25008 //
25009 // sinkMBB:
25010 //  v = phi(s0/mainBB, s1/fallBB)
25012 MachineBasicBlock *thisMBB = MBB;
25013 MachineFunction *MF = MBB->getParent();
25014 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25015 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25016 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25017 MF->insert(I, mainMBB);
25018 MF->insert(I, fallMBB);
25019 MF->insert(I, sinkMBB);
25021 // Transfer the remainder of BB and its successor edges to sinkMBB.
25022 sinkMBB->splice(sinkMBB->begin(), MBB,
25023 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25024 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25026 MachineRegisterInfo &MRI = MF->getRegInfo();
25027 unsigned DstReg = MI.getOperand(0).getReg();
25028 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25029 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25030 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25034 // # fallthrough to mainMBB
25035 // # on abort, jump to fallMBB
25036 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25037 thisMBB->addSuccessor(mainMBB);
25038 thisMBB->addSuccessor(fallMBB);
25041 // mainDstReg := -1
25042 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25043 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25044 mainMBB->addSuccessor(sinkMBB);
25047 // ; pseudo instruction to model hardware's definition from XABORT
25048 // EAX := XABORT_DEF
25049 // fallDstReg := EAX
25050 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25051 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25053 fallMBB->addSuccessor(sinkMBB);
25056 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25057 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25058 .addReg(mainDstReg).addMBB(mainMBB)
25059 .addReg(fallDstReg).addMBB(fallMBB);
25061 MI.eraseFromParent();
25065 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25066 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25067 // in the .td file.
25068 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25069 const TargetInstrInfo *TII) {
25071 switch (MI.getOpcode()) {
25072 default: llvm_unreachable("illegal opcode!");
25073 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25074 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25075 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25076 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25077 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25078 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25079 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25080 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25083 DebugLoc dl = MI.getDebugLoc();
25084 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25086 unsigned NumArgs = MI.getNumOperands();
25087 for (unsigned i = 1; i < NumArgs; ++i) {
25088 MachineOperand &Op = MI.getOperand(i);
25089 if (!(Op.isReg() && Op.isImplicit()))
25092 if (MI.hasOneMemOperand())
25093 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25095 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25096 .addReg(X86::XMM0);
25098 MI.eraseFromParent();
25102 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25103 // defs in an instruction pattern
25104 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25105 const TargetInstrInfo *TII) {
25107 switch (MI.getOpcode()) {
25108 default: llvm_unreachable("illegal opcode!");
25109 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25110 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25111 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25112 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25113 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25114 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25115 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25116 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25119 DebugLoc dl = MI.getDebugLoc();
25120 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25122 unsigned NumArgs = MI.getNumOperands(); // remove the results
25123 for (unsigned i = 1; i < NumArgs; ++i) {
25124 MachineOperand &Op = MI.getOperand(i);
25125 if (!(Op.isReg() && Op.isImplicit()))
25128 if (MI.hasOneMemOperand())
25129 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25131 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25134 MI.eraseFromParent();
25138 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25139 const X86Subtarget &Subtarget) {
25140 DebugLoc dl = MI.getDebugLoc();
25141 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
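// WRPKRU writes EAX into the PKRU register and requires ECX and EDX to be
// zero, so set up those registers before emitting it.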
25143 // insert input VAL into EAX
25144 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25145 .addReg(MI.getOperand(0).getReg());
25146 // insert zero to ECX
25147 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25149 // insert zero to EDX
25150 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25152 // insert WRPKRU instruction
25153 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25155 MI.eraseFromParent(); // The pseudo is gone now.
25159 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25160 const X86Subtarget &Subtarget) {
25161 DebugLoc dl = MI.getDebugLoc();
25162 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25164 // insert zero to ECX
25165 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25167 // insert RDPKRU instruction
25168 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25169 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25172 MI.eraseFromParent(); // The pseudo is gone now.
25176 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25177 const X86Subtarget &Subtarget,
25179 DebugLoc dl = MI.getDebugLoc();
25180 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25181 // Address into RAX/EAX, other two args into ECX, EDX.
25182 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25183 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25184 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25185 for (int i = 0; i < X86::AddrNumOperands; ++i)
25186 MIB.add(MI.getOperand(i));
25188 unsigned ValOps = X86::AddrNumOperands;
25189 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25190 .addReg(MI.getOperand(ValOps).getReg());
25191 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25192 .addReg(MI.getOperand(ValOps + 1).getReg());
25194 // The instruction doesn't actually take any operands though.
25195 BuildMI(*BB, MI, dl, TII->get(Opc));
25197 MI.eraseFromParent(); // The pseudo is gone now.
25201 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25202 const X86Subtarget &Subtarget) {
25203 DebugLoc dl = MI->getDebugLoc();
25204 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25205 // Address into RAX/EAX
25206 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25207 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25208 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25209 for (int i = 0; i < X86::AddrNumOperands; ++i)
25210 MIB.add(MI->getOperand(i));
25212 // The instruction doesn't actually take any operands though.
25213 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25215 MI->eraseFromParent(); // The pseudo is gone now.
25221 MachineBasicBlock *
25222 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25223 MachineBasicBlock *MBB) const {
25224 // Emit va_arg instruction on X86-64.
25226 // Operands to this pseudo-instruction:
25227 // 0 ) Output : destination address (reg)
25228 // 1-5) Input : va_list address (addr, i64mem)
25229 // 6 ) ArgSize : Size (in bytes) of vararg type
25230 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25231 // 8 ) Align : Alignment of type
25232 // 9 ) EFLAGS (implicit-def)
25234 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25235 static_assert(X86::AddrNumOperands == 5,
25236 "VAARG_64 assumes 5 address operands");
25238 unsigned DestReg = MI.getOperand(0).getReg();
25239 MachineOperand &Base = MI.getOperand(1);
25240 MachineOperand &Scale = MI.getOperand(2);
25241 MachineOperand &Index = MI.getOperand(3);
25242 MachineOperand &Disp = MI.getOperand(4);
25243 MachineOperand &Segment = MI.getOperand(5);
25244 unsigned ArgSize = MI.getOperand(6).getImm();
25245 unsigned ArgMode = MI.getOperand(7).getImm();
25246 unsigned Align = MI.getOperand(8).getImm();
25248 // Memory Reference
25249 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25250 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25251 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25253 // Machine Information
25254 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25255 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25256 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25257 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25258 DebugLoc DL = MI.getDebugLoc();
25260 // struct va_list {
25261 //   i32   gp_offset
25262 //   i32   fp_offset
25263 //   i64   overflow_area (address)
25264 //   i64   reg_save_area (address)
25265 // }
25266 // sizeof(va_list) = 24
25267 // alignment(va_list) = 8
25269 unsigned TotalNumIntRegs = 6;
25270 unsigned TotalNumXMMRegs = 8;
25271 bool UseGPOffset = (ArgMode == 1);
25272 bool UseFPOffset = (ArgMode == 2);
25273 unsigned MaxOffset = TotalNumIntRegs * 8 +
25274 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
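// The register save area holds 6 GP registers (8 bytes each) followed by
// 8 XMM registers (16 bytes each), so the offset bound is 48 or 176 bytes.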
25276 /* Align ArgSize to a multiple of 8 */
25277 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25278 bool NeedsAlign = (Align > 8);
25280 MachineBasicBlock *thisMBB = MBB;
25281 MachineBasicBlock *overflowMBB;
25282 MachineBasicBlock *offsetMBB;
25283 MachineBasicBlock *endMBB;
25285 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25286 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25287 unsigned OffsetReg = 0;
25289 if (!UseGPOffset && !UseFPOffset) {
25290 // If we only pull from the overflow region, we don't create a branch.
25291 // We don't need to alter control flow.
25292 OffsetDestReg = 0; // unused
25293 OverflowDestReg = DestReg;
25295 offsetMBB = nullptr;
25296 overflowMBB = thisMBB;
25299 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25300 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25301 // If not, pull from overflow_area. (branch to overflowMBB)
25302 //
25303 //        thisMBB
25304 //       /       \
25305 //      /         \
25306 //  offsetMBB   overflowMBB
25307 //      \         /
25308 //       \       /
25309 //        endMBB
25311 // Registers for the PHI in endMBB
25312 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25313 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25315 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25316 MachineFunction *MF = MBB->getParent();
25317 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25318 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25319 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25321 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25323 // Insert the new basic blocks
25324 MF->insert(MBBIter, offsetMBB);
25325 MF->insert(MBBIter, overflowMBB);
25326 MF->insert(MBBIter, endMBB);
25328 // Transfer the remainder of MBB and its successor edges to endMBB.
25329 endMBB->splice(endMBB->begin(), thisMBB,
25330 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25331 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25333 // Make offsetMBB and overflowMBB successors of thisMBB
25334 thisMBB->addSuccessor(offsetMBB);
25335 thisMBB->addSuccessor(overflowMBB);
25337 // endMBB is a successor of both offsetMBB and overflowMBB
25338 offsetMBB->addSuccessor(endMBB);
25339 overflowMBB->addSuccessor(endMBB);
25341 // Load the offset value into a register
25342 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25343 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25347 .addDisp(Disp, UseFPOffset ? 4 : 0)
25349 .setMemRefs(MMOBegin, MMOEnd);
25351 // Check if there is enough room left to pull this argument.
25352 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25354 .addImm(MaxOffset + 8 - ArgSizeA8);
25356 // Branch to "overflowMBB" if offset >= max
25357 // Fall through to "offsetMBB" otherwise
25358 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25359 .addMBB(overflowMBB);
25362 // In offsetMBB, emit code to use the reg_save_area.
25364 assert(OffsetReg != 0);
25366 // Read the reg_save_area address.
25367 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25368 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25374 .setMemRefs(MMOBegin, MMOEnd);
25376 // Zero-extend the offset
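// (SUBREG_TO_REG with a zero immediate models the implicit zeroing of the
// upper 32 bits when a 32-bit register is written on x86-64.)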
25377 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25378 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25381 .addImm(X86::sub_32bit);
25383 // Add the offset to the reg_save_area to get the final address.
25384 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25385 .addReg(OffsetReg64)
25386 .addReg(RegSaveReg);
25388 // Compute the offset for the next argument
25389 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25390 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25392 .addImm(UseFPOffset ? 16 : 8);
25394 // Store it back into the va_list.
25395 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25399 .addDisp(Disp, UseFPOffset ? 4 : 0)
25401 .addReg(NextOffsetReg)
25402 .setMemRefs(MMOBegin, MMOEnd);
25405 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25410 // Emit code to use overflow area
25413 // Load the overflow_area address into a register.
25414 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25415 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25421 .setMemRefs(MMOBegin, MMOEnd);
25423 // If we need to align it, do so. Otherwise, just copy the address
25424 // to OverflowDestReg.
25426 // Align the overflow address
25427 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25428 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25430 // aligned_addr = (addr + (align-1)) & ~(align-1)
25431 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25432 .addReg(OverflowAddrReg)
25435 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25437 .addImm(~(uint64_t)(Align-1));
25439 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25440 .addReg(OverflowAddrReg);
25443 // Compute the next overflow address after this argument.
25444 // (the overflow address should be kept 8-byte aligned)
25445 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25446 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25447 .addReg(OverflowDestReg)
25448 .addImm(ArgSizeA8);
25450 // Store the new overflow address.
25451 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25457 .addReg(NextAddrReg)
25458 .setMemRefs(MMOBegin, MMOEnd);
25460 // If we branched, emit the PHI to the front of endMBB.
25462 BuildMI(*endMBB, endMBB->begin(), DL,
25463 TII->get(X86::PHI), DestReg)
25464 .addReg(OffsetDestReg).addMBB(offsetMBB)
25465 .addReg(OverflowDestReg).addMBB(overflowMBB);
25468 // Erase the pseudo instruction
25469 MI.eraseFromParent();
25474 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25475 MachineInstr &MI, MachineBasicBlock *MBB) const {
25476 // Emit code to save XMM registers to the stack. The ABI says that the
25477 // number of registers to save is given in %al, so it's theoretically
25478 // possible to do an indirect jump trick to avoid saving all of them;
25479 // however, this code takes a simpler approach and just executes all
25480 // of the stores if %al is non-zero. It's less code, and it's probably
25481 // easier on the hardware branch predictor, and stores aren't all that
25482 // expensive anyway.
25484 // Create the new basic blocks. One block contains all the XMM stores,
25485 // and one block is the final destination regardless of whether any
25486 // stores were performed.
25487 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25488 MachineFunction *F = MBB->getParent();
25489 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25490 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25491 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25492 F->insert(MBBIter, XMMSaveMBB);
25493 F->insert(MBBIter, EndMBB);
25495 // Transfer the remainder of MBB and its successor edges to EndMBB.
25496 EndMBB->splice(EndMBB->begin(), MBB,
25497 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25498 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25500 // The original block will now fall through to the XMM save block.
25501 MBB->addSuccessor(XMMSaveMBB);
25502 // The XMMSaveMBB will fall through to the end block.
25503 XMMSaveMBB->addSuccessor(EndMBB);
25505 // Now add the instructions.
25506 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25507 DebugLoc DL = MI.getDebugLoc();
25509 unsigned CountReg = MI.getOperand(0).getReg();
25510 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25511 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25513 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25514 // If %al is 0, branch around the XMM save block.
25515 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25516 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25517 MBB->addSuccessor(EndMBB);
25520 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25521 // that was just emitted, but clearly shouldn't be "saved".
25522 assert((MI.getNumOperands() <= 3 ||
25523 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25524 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25525 "Expected last argument to be EFLAGS");
25526 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25527 // In the XMM save block, save all the XMM argument registers.
25528 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25529 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25530 MachineMemOperand *MMO = F->getMachineMemOperand(
25531 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25532 MachineMemOperand::MOStore,
25533 /*Size=*/16, /*Align=*/16);
25534 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25535 .addFrameIndex(RegSaveFrameIndex)
25536 .addImm(/*Scale=*/1)
25537 .addReg(/*IndexReg=*/0)
25538 .addImm(/*Disp=*/Offset)
25539 .addReg(/*Segment=*/0)
25540 .addReg(MI.getOperand(i).getReg())
25541 .addMemOperand(MMO);
25544 MI.eraseFromParent(); // The pseudo instruction is gone now.
25549 // The EFLAGS operand of SelectItr might be missing a kill marker
25550 // because there were multiple uses of EFLAGS, and ISel didn't know
25551 // which to mark. Figure out whether SelectItr should have had a
25552 // kill marker, and set it if it should. Returns the correct kill
25553 // marker value.
25554 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25555 MachineBasicBlock* BB,
25556 const TargetRegisterInfo* TRI) {
25557 // Scan forward through BB for a use/def of EFLAGS.
25558 MachineBasicBlock::iterator miI(std::next(SelectItr));
25559 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25560 const MachineInstr& mi = *miI;
25561 if (mi.readsRegister(X86::EFLAGS))
25563 if (mi.definesRegister(X86::EFLAGS))
25564 break; // Should have kill-flag - update below.
25567 // If we hit the end of the block, check whether EFLAGS is live into a
25569 if (miI == BB->end()) {
25570 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25571 sEnd = BB->succ_end();
25572 sItr != sEnd; ++sItr) {
25573 MachineBasicBlock* succ = *sItr;
25574 if (succ->isLiveIn(X86::EFLAGS))
25579 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25580 // out. SelectMI should have a kill flag on EFLAGS.
25581 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25585 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25586 // together with other CMOV pseudo-opcodes into a single basic-block with
25587 // conditional jump around it.
25588 static bool isCMOVPseudo(MachineInstr &MI) {
25589 switch (MI.getOpcode()) {
25590 case X86::CMOV_FR32:
25591 case X86::CMOV_FR64:
25592 case X86::CMOV_GR8:
25593 case X86::CMOV_GR16:
25594 case X86::CMOV_GR32:
25595 case X86::CMOV_RFP32:
25596 case X86::CMOV_RFP64:
25597 case X86::CMOV_RFP80:
25598 case X86::CMOV_V2F64:
25599 case X86::CMOV_V2I64:
25600 case X86::CMOV_V4F32:
25601 case X86::CMOV_V4F64:
25602 case X86::CMOV_V4I64:
25603 case X86::CMOV_V16F32:
25604 case X86::CMOV_V8F32:
25605 case X86::CMOV_V8F64:
25606 case X86::CMOV_V8I64:
25607 case X86::CMOV_V8I1:
25608 case X86::CMOV_V16I1:
25609 case X86::CMOV_V32I1:
25610 case X86::CMOV_V64I1:
25618 MachineBasicBlock *
25619 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25620 MachineBasicBlock *BB) const {
25621 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25622 DebugLoc DL = MI.getDebugLoc();
25624 // To "insert" a SELECT_CC instruction, we actually have to insert the
25625 // diamond control-flow pattern. The incoming instruction knows the
25626 // destination vreg to set, the condition code register to branch on, the
25627 // true/false values to select between, and a branch opcode to use.
25628 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25629 MachineFunction::iterator It = ++BB->getIterator();
25634 // cmpTY ccX, r1, r2
25636 // fallthrough --> copy0MBB
25637 MachineBasicBlock *thisMBB = BB;
25638 MachineFunction *F = BB->getParent();
25640 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25641 // as described above, by inserting a BB, and then making a PHI at the join
25642 // point to select the true and false operands of the CMOV in the PHI.
25644 // The code also handles two different cases of multiple CMOV opcodes
25648 // In this case, there are multiple CMOVs in a row, all which are based on
25649 // the same condition setting (or the exact opposite condition setting).
25650 // In this case we can lower all the CMOVs using a single inserted BB, and
25651 // then make a number of PHIs at the join point to model the CMOVs. The only
25652 // trickiness here, is that in a case like:
25654 // t2 = CMOV cond1 t1, f1
25655 // t3 = CMOV cond1 t2, f2
25657 // when rewriting this into PHIs, we have to perform some renaming on the
25658 // temps since you cannot have a PHI operand refer to a PHI result earlier
25659 // in the same block. The "simple" but wrong lowering would be:
25661 // t2 = PHI t1(BB1), f1(BB2)
25662 // t3 = PHI t2(BB1), f2(BB2)
25664 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25665 // renaming is to note that on the path through BB1, t2 is really just a
25666 // copy of t1, and do that renaming, properly generating:
25668 // t2 = PHI t1(BB1), f1(BB2)
25669 // t3 = PHI t1(BB1), f2(BB2)
25671 // Case 2, we lower cascaded CMOVs such as
25673 // (CMOV (CMOV F, T, cc1), T, cc2)
25675 // to two successive branches. For that, we look for another CMOV as the
25676 // following instruction.
25678 // Without this, we would add a PHI between the two jumps, which ends up
25679 // creating a few copies all around. For instance, for
25681 // (sitofp (zext (fcmp une)))
25683 // we would generate:
25685 // ucomiss %xmm1, %xmm0
25686 // movss <1.0f>, %xmm0
25687 // movaps %xmm0, %xmm1
25689 // xorps %xmm1, %xmm1
25692 // movaps %xmm1, %xmm0
25696 // because this custom-inserter would have generated:
25708 // A: X = ...; Y = ...
25710 // C: Z = PHI [X, A], [Y, B]
25712 // E: PHI [X, C], [Z, D]
25714 // If we lower both CMOVs in a single step, we can instead generate:
25726 // A: X = ...; Y = ...
25728 // E: PHI [X, A], [X, C], [Y, D]
25730 // Which, in our sitofp/fcmp example, gives us something like:
25732 // ucomiss %xmm1, %xmm0
25733 // movss <1.0f>, %xmm0
25736 // xorps %xmm0, %xmm0
25740 MachineInstr *CascadedCMOV = nullptr;
25741 MachineInstr *LastCMOV = &MI;
25742 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25743 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25744 MachineBasicBlock::iterator NextMIIt =
25745 std::next(MachineBasicBlock::iterator(MI));
25747 // Check for case 1, where there are multiple CMOVs with the same condition
25748 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25749 // number of jumps the most.
25751 if (isCMOVPseudo(MI)) {
25752 // See if we have a string of CMOVS with the same condition.
25753 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25754 (NextMIIt->getOperand(3).getImm() == CC ||
25755 NextMIIt->getOperand(3).getImm() == OppCC)) {
25756 LastCMOV = &*NextMIIt;
25761 // This checks for case 2, but only do this if we didn't already find
25762 // case 1, as indicated by LastCMOV == MI.
25763 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25764 NextMIIt->getOpcode() == MI.getOpcode() &&
25765 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25766 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25767 NextMIIt->getOperand(1).isKill()) {
25768 CascadedCMOV = &*NextMIIt;
25771 MachineBasicBlock *jcc1MBB = nullptr;
25773 // If we have a cascaded CMOV, we lower it to two successive branches to
25774 // the same block. EFLAGS is used by both, so mark it as live in the second.
25775 if (CascadedCMOV) {
25776 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25777 F->insert(It, jcc1MBB);
25778 jcc1MBB->addLiveIn(X86::EFLAGS);
25781 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25782 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25783 F->insert(It, copy0MBB);
25784 F->insert(It, sinkMBB);
25786 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25787 // live into the sink and copy blocks.
25788 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25790 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25791 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25792 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25793 copy0MBB->addLiveIn(X86::EFLAGS);
25794 sinkMBB->addLiveIn(X86::EFLAGS);
25797 // Transfer the remainder of BB and its successor edges to sinkMBB.
25798 sinkMBB->splice(sinkMBB->begin(), BB,
25799 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25800 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25802 // Add the true and fallthrough blocks as its successors.
25803 if (CascadedCMOV) {
25804 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25805 BB->addSuccessor(jcc1MBB);
25807 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25808 // jump to the sinkMBB.
25809 jcc1MBB->addSuccessor(copy0MBB);
25810 jcc1MBB->addSuccessor(sinkMBB);
25812 BB->addSuccessor(copy0MBB);
25815 // The true block target of the first (or only) branch is always sinkMBB.
25816 BB->addSuccessor(sinkMBB);
25818 // Create the conditional branch instruction.
25819 unsigned Opc = X86::GetCondBranchFromCond(CC);
25820 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25822 if (CascadedCMOV) {
25823 unsigned Opc2 = X86::GetCondBranchFromCond(
25824 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25825 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25829 // %FalseValue = ...
25830 // # fallthrough to sinkMBB
25831 copy0MBB->addSuccessor(sinkMBB);
// sinkMBB:
25834 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25836 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25837 MachineBasicBlock::iterator MIItEnd =
25838 std::next(MachineBasicBlock::iterator(LastCMOV));
25839 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25840 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25841 MachineInstrBuilder MIB;
25843 // As we are creating the PHIs, we have to be careful if there is more than
25844 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25845 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25846 // That also means that PHI construction must work forward from earlier to
25847 // later, and that the code must maintain a mapping from each earlier PHI's
25848 // destination register to the registers that went into that PHI.
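// Rough example (register names invented): for the chain
//   %t0 = CMOV %a, %b, cc
//   %t1 = CMOV %t0, %c, cc
// %t0 itself becomes a PHI in sinkMBB, so the PHI for %t1 must not use %t0
// as an incoming value; it instead uses the value %t0 would carry on that
// edge (e.g. %a on the copy0MBB edge), which is what the rewrite table
// records.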
25850 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25851 unsigned DestReg = MIIt->getOperand(0).getReg();
25852 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25853 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25855 // If this CMOV we are generating is the opposite condition from
25856 // the jump we generated, then we have to swap the operands for the
25857 // PHI that is going to be generated.
25858 if (MIIt->getOperand(3).getImm() == OppCC)
25859 std::swap(Op1Reg, Op2Reg);
25861 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25862 Op1Reg = RegRewriteTable[Op1Reg].first;
25864 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25865 Op2Reg = RegRewriteTable[Op2Reg].second;
25867 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25868 TII->get(X86::PHI), DestReg)
25869 .addReg(Op1Reg).addMBB(copy0MBB)
25870 .addReg(Op2Reg).addMBB(thisMBB);
25872 // Add this PHI to the rewrite table.
25873 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25876 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25877 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25878 if (CascadedCMOV) {
25879 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25880 // Copy the PHI result to the register defined by the second CMOV.
25881 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25882 DL, TII->get(TargetOpcode::COPY),
25883 CascadedCMOV->getOperand(0).getReg())
25884 .addReg(MI.getOperand(0).getReg());
25885 CascadedCMOV->eraseFromParent();
25888 // Now remove the CMOV(s).
25889 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25890 (MIIt++)->eraseFromParent();
25895 MachineBasicBlock *
25896 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25897 MachineBasicBlock *BB) const {
25898 // Lower the following atomic floating-point modification pattern:
25899 //   a.store(reg OP a.load(acquire), release)
25900 // into:
25901 //   OPss (%gpr), %xmm
25902 //   movss %xmm, (%gpr)
25903 // or the 'sd' equivalent for 64-bit operations.
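// As a sketch (not taken from a specific test), IR along the lines of:
//   %old = load atomic float, float* %p acquire, align 4
//   %new = fadd float %old, %val
//   store atomic float %new, float* %p release, align 4
// becomes roughly:
//   addss (%reg), %xmm0
//   movss %xmm0, (%reg)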
25905 switch (MI.getOpcode()) {
25906 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25907 case X86::RELEASE_FADD32mr:
25908 FOp = X86::ADDSSrm;
25909 MOp = X86::MOVSSmr;
25911 case X86::RELEASE_FADD64mr:
25912 FOp = X86::ADDSDrm;
25913 MOp = X86::MOVSDmr;
25916 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25917 DebugLoc DL = MI.getDebugLoc();
25918 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25919 unsigned ValOpIdx = X86::AddrNumOperands;
25920 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25921 MachineInstrBuilder MIB =
25922 BuildMI(*BB, MI, DL, TII->get(FOp),
25923 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25925 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25926 MachineOperand &Operand = MI.getOperand(i);
25927 // Clear any kill flags on register operands as we'll create a second
25928 // instruction using the same address operands.
25929 if (Operand.isReg())
25930 Operand.setIsKill(false);
25933 MachineInstr *FOpMI = MIB;
25934 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25935 for (int i = 0; i < X86::AddrNumOperands; ++i)
25936 MIB.add(MI.getOperand(i));
25937 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25938 MI.eraseFromParent(); // The pseudo instruction is gone now.
25942 MachineBasicBlock *
25943 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25944 MachineBasicBlock *BB) const {
25945 MachineFunction *MF = BB->getParent();
25946 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25947 DebugLoc DL = MI.getDebugLoc();
25948 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25950 assert(MF->shouldSplitStack());
25952 const bool Is64Bit = Subtarget.is64Bit();
25953 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25955 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25956 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// The control flow built below is:
//
// BB:
25959 //  ... [Till the alloca]
25960 //  If stacklet is not large enough, jump to mallocMBB
// bumpMBB:
25963 //  Allocate by subtracting from RSP
25964 //  Jump to continueMBB
// mallocMBB:
25967 //  Allocate by call to runtime
// continueMBB:
25971 //  [rest of original BB]
25974 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25975 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25976 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25978 MachineRegisterInfo &MRI = MF->getRegInfo();
25979 const TargetRegisterClass *AddrRegClass =
25980 getRegClassFor(getPointerTy(MF->getDataLayout()));
25982 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25983 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25984 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25985 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25986 sizeVReg = MI.getOperand(1).getReg(),
25988 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25990 MachineFunction::iterator MBBIter = ++BB->getIterator();
25992 MF->insert(MBBIter, bumpMBB);
25993 MF->insert(MBBIter, mallocMBB);
25994 MF->insert(MBBIter, continueMBB);
25996 continueMBB->splice(continueMBB->begin(), BB,
25997 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25998 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26000 // Add code to the main basic block to check if the stack limit has been hit,
26001 // and if so, jump to mallocMBB otherwise to bumpMBB.
26002 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26003 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26004 .addReg(tmpSPVReg).addReg(sizeVReg);
26005 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26006 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26007 .addReg(SPLimitVReg);
26008 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
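// On an LP64 target the check above comes out as something like (sketch):
//   mov %rsp, %tmp
//   sub %size, %tmp
//   cmp %tmp, %fs:0x70     # compare the proposed SP against the stack limit
//   jg  mallocMBB          # limit above the proposed SP -> call the runtime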
26010 // bumpMBB simply decreases the stack pointer, since we know the current
26011 // stacklet has enough space.
26012 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26013 .addReg(SPLimitVReg);
26014 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26015 .addReg(SPLimitVReg);
26016 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26018 // Calls into a routine in libgcc to allocate more space from the heap.
26019 const uint32_t *RegMask =
26020 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26022 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26024 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26025 .addExternalSymbol("__morestack_allocate_stack_space")
26026 .addRegMask(RegMask)
26027 .addReg(X86::RDI, RegState::Implicit)
26028 .addReg(X86::RAX, RegState::ImplicitDefine);
26029 } else if (Is64Bit) {
26030 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26032 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26033 .addExternalSymbol("__morestack_allocate_stack_space")
26034 .addRegMask(RegMask)
26035 .addReg(X86::EDI, RegState::Implicit)
26036 .addReg(X86::EAX, RegState::ImplicitDefine);
26038 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26040 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26041 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26042 .addExternalSymbol("__morestack_allocate_stack_space")
26043 .addRegMask(RegMask)
26044 .addReg(X86::EAX, RegState::ImplicitDefine);
26048 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26051 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26052 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26053 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26055 // Set up the CFG correctly.
26056 BB->addSuccessor(bumpMBB);
26057 BB->addSuccessor(mallocMBB);
26058 mallocMBB->addSuccessor(continueMBB);
26059 bumpMBB->addSuccessor(continueMBB);
26061 // Take care of the PHI nodes.
26062 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26063 MI.getOperand(0).getReg())
26064 .addReg(mallocPtrVReg)
26066 .addReg(bumpSPPtrVReg)
26069 // Delete the original pseudo instruction.
26070 MI.eraseFromParent();
26073 return continueMBB;
26076 MachineBasicBlock *
26077 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26078 MachineBasicBlock *BB) const {
26079 MachineFunction *MF = BB->getParent();
26080 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26081 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26082 DebugLoc DL = MI.getDebugLoc();
26084 assert(!isAsynchronousEHPersonality(
26085 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26086 "SEH does not use catchret!");
26088 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26089 if (!Subtarget.is32Bit())
26092 // C++ EH creates a new target block to hold the restore code, and wires up
26093 // the new block to the return destination with a normal JMP_4.
26094 MachineBasicBlock *RestoreMBB =
26095 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26096 assert(BB->succ_size() == 1);
26097 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26098 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26099 BB->addSuccessor(RestoreMBB);
26100 MI.getOperand(0).setMBB(RestoreMBB);
26102 auto RestoreMBBI = RestoreMBB->begin();
26103 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26104 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26108 MachineBasicBlock *
26109 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26110 MachineBasicBlock *BB) const {
26111 MachineFunction *MF = BB->getParent();
26112 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26113 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26114 // Only 32-bit SEH requires special handling for catchpad.
26115 if (IsSEH && Subtarget.is32Bit()) {
26116 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26117 DebugLoc DL = MI.getDebugLoc();
26118 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26120 MI.eraseFromParent();
26124 MachineBasicBlock *
26125 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26126 MachineBasicBlock *BB) const {
26127 // Here we replace TLSADDR with the sequence:
26128 //   adjust_stackdown -> TLSADDR -> adjust_stackup.
26129 // We need this because TLSADDR is lowered into a call
26130 // inside MC; without the two markers, shrink-wrapping
26131 // may push the prologue/epilogue past them.
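// Schematically, the block then contains:
//   CALLSEQ_START
//   TLSADDR ...          <- becomes a call when lowered in MC
//   CALLSEQ_END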
26132 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26133 DebugLoc DL = MI.getDebugLoc();
26134 MachineFunction &MF = *BB->getParent();
26136 // Emit CALLSEQ_START right before the instruction.
26137 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26138 MachineInstrBuilder CallseqStart =
26139 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26140 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26142 // Emit CALLSEQ_END right after the instruction.
26143 // We don't call erase from parent because we want to keep the
26144 // original instruction around.
26145 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26146 MachineInstrBuilder CallseqEnd =
26147 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26148 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26153 MachineBasicBlock *
26154 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26155 MachineBasicBlock *BB) const {
26156 // This is pretty easy. We're taking the value that we received from
26157 // our load from the relocation, sticking it in either RDI (x86-64)
26158 // or EAX and doing an indirect call. The return value will then
26159 // be in the normal return register.
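// On x86-64 Darwin the emitted sequence is, roughly (relocation details
// aside):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// after which the address of the TLS variable is in %rax (%eax for the
// 32-bit variants below).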
26160 MachineFunction *F = BB->getParent();
26161 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26162 DebugLoc DL = MI.getDebugLoc();
26164 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26165 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26167 // Get a register mask for the lowered call.
26168 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26169 // proper register mask.
26170 const uint32_t *RegMask =
26171 Subtarget.is64Bit() ?
26172 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26173 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26174 if (Subtarget.is64Bit()) {
26175 MachineInstrBuilder MIB =
26176 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26180 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26181 MI.getOperand(3).getTargetFlags())
26183 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26184 addDirectMem(MIB, X86::RDI);
26185 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26186 } else if (!isPositionIndependent()) {
26187 MachineInstrBuilder MIB =
26188 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26192 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26193 MI.getOperand(3).getTargetFlags())
26195 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26196 addDirectMem(MIB, X86::EAX);
26197 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26199 MachineInstrBuilder MIB =
26200 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26201 .addReg(TII->getGlobalBaseReg(F))
26204 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26205 MI.getOperand(3).getTargetFlags())
26207 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26208 addDirectMem(MIB, X86::EAX);
26209 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26212 MI.eraseFromParent(); // The pseudo instruction is gone now.
26216 MachineBasicBlock *
26217 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26218 MachineBasicBlock *MBB) const {
26219 DebugLoc DL = MI.getDebugLoc();
26220 MachineFunction *MF = MBB->getParent();
26221 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26222 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26223 MachineRegisterInfo &MRI = MF->getRegInfo();
26225 const BasicBlock *BB = MBB->getBasicBlock();
26226 MachineFunction::iterator I = ++MBB->getIterator();
26228 // Memory Reference
26229 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26230 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26233 unsigned MemOpndSlot = 0;
26235 unsigned CurOp = 0;
26237 DstReg = MI.getOperand(CurOp++).getReg();
26238 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26239 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26241 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26242 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26244 MemOpndSlot = CurOp;
26246 MVT PVT = getPointerTy(MF->getDataLayout());
26247 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26248 "Invalid Pointer Size!");
26250 // For v = setjmp(buf), we generate
// thisMBB:
26253 //   buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26254 //   SjLjSetup restoreMBB
// mainMBB:
//   v_main = 0
// sinkMBB:
26260 //   v = phi(v_main from mainMBB, v_restore from restoreMBB)
// restoreMBB:
26263 //   if base pointer being used, load it from frame; v_restore = 1
26266 MachineBasicBlock *thisMBB = MBB;
26267 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26268 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26269 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26270 MF->insert(I, mainMBB);
26271 MF->insert(I, sinkMBB);
26272 MF->push_back(restoreMBB);
26273 restoreMBB->setHasAddressTaken();
26275 MachineInstrBuilder MIB;
26277 // Transfer the remainder of BB and its successor edges to sinkMBB.
26278 sinkMBB->splice(sinkMBB->begin(), MBB,
26279 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26280 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26283 unsigned PtrStoreOpc = 0;
26284 unsigned LabelReg = 0;
26285 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26286 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26287 !isPositionIndependent();
26289 // Prepare IP either in reg or imm.
26290 if (!UseImmLabel) {
26291 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26292 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26293 LabelReg = MRI.createVirtualRegister(PtrRC);
26294 if (Subtarget.is64Bit()) {
26295 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26299 .addMBB(restoreMBB)
26302 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26303 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26304 .addReg(XII->getGlobalBaseReg(MF))
26307 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26311 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26313 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26314 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26315 if (i == X86::AddrDisp)
26316 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26318 MIB.add(MI.getOperand(MemOpndSlot + i));
26321 MIB.addReg(LabelReg);
26323 MIB.addMBB(restoreMBB);
26324 MIB.setMemRefs(MMOBegin, MMOEnd);
26326 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26327 .addMBB(restoreMBB);
26329 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26330 MIB.addRegMask(RegInfo->getNoPreservedMask());
26331 thisMBB->addSuccessor(mainMBB);
26332 thisMBB->addSuccessor(restoreMBB);
26336 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26337 mainMBB->addSuccessor(sinkMBB);
26340 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26341 TII->get(X86::PHI), DstReg)
26342 .addReg(mainDstReg).addMBB(mainMBB)
26343 .addReg(restoreDstReg).addMBB(restoreMBB);
26346 if (RegInfo->hasBasePointer(*MF)) {
26347 const bool Uses64BitFramePtr =
26348 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26349 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26350 X86FI->setRestoreBasePointer(MF);
26351 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26352 unsigned BasePtr = RegInfo->getBaseRegister();
26353 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26354 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26355 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26356 .setMIFlag(MachineInstr::FrameSetup);
26358 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26359 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26360 restoreMBB->addSuccessor(sinkMBB);
26362 MI.eraseFromParent();
26366 MachineBasicBlock *
26367 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26368 MachineBasicBlock *MBB) const {
26369 DebugLoc DL = MI.getDebugLoc();
26370 MachineFunction *MF = MBB->getParent();
26371 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26372 MachineRegisterInfo &MRI = MF->getRegInfo();
26374 // Memory Reference
26375 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26376 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26378 MVT PVT = getPointerTy(MF->getDataLayout());
26379 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26380 "Invalid Pointer Size!");
26382 const TargetRegisterClass *RC =
26383 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26384 unsigned Tmp = MRI.createVirtualRegister(RC);
26385 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26386 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26387 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26388 unsigned SP = RegInfo->getStackRegister();
26390 MachineInstrBuilder MIB;
26392 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26393 const int64_t SPOffset = 2 * PVT.getStoreSize();
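// Jump-buffer layout assumed by this lowering, in pointer-sized slots:
//   buf[0] = frame pointer, buf[1] = resume label, buf[2] = stack pointer.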
26395 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26396 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26399 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26400 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26401 MIB.add(MI.getOperand(i));
26402 MIB.setMemRefs(MMOBegin, MMOEnd);
26404 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26405 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26406 if (i == X86::AddrDisp)
26407 MIB.addDisp(MI.getOperand(i), LabelOffset);
26409 MIB.add(MI.getOperand(i));
26411 MIB.setMemRefs(MMOBegin, MMOEnd);
26413 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26414 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26415 if (i == X86::AddrDisp)
26416 MIB.addDisp(MI.getOperand(i), SPOffset);
26418 MIB.add(MI.getOperand(i));
26420 MIB.setMemRefs(MMOBegin, MMOEnd);
26422 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26424 MI.eraseFromParent();
26428 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26429 MachineBasicBlock *MBB,
26430 MachineBasicBlock *DispatchBB,
26432 DebugLoc DL = MI.getDebugLoc();
26433 MachineFunction *MF = MBB->getParent();
26434 MachineRegisterInfo *MRI = &MF->getRegInfo();
26435 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26437 MVT PVT = getPointerTy(MF->getDataLayout());
26438 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26443 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26444 !isPositionIndependent();
26447 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26449 const TargetRegisterClass *TRC =
26450 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26451 VR = MRI->createVirtualRegister(TRC);
26452 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26454 if (Subtarget.is64Bit())
26455 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26459 .addMBB(DispatchBB)
26462 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26463 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26466 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26470 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26471 addFrameReference(MIB, FI, 36);
26473 MIB.addMBB(DispatchBB);
26478 MachineBasicBlock *
26479 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26480 MachineBasicBlock *BB) const {
26481 DebugLoc DL = MI.getDebugLoc();
26482 MachineFunction *MF = BB->getParent();
26483 MachineFrameInfo &MFI = MF->getFrameInfo();
26484 MachineRegisterInfo *MRI = &MF->getRegInfo();
26485 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26486 int FI = MFI.getFunctionContextIndex();
26488 // Get a mapping of the call site numbers to all of the landing pads they're
26489 // associated with.
26490 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26491 unsigned MaxCSNum = 0;
26492 for (auto &MBB : *MF) {
26493 if (!MBB.isEHPad())
26496 MCSymbol *Sym = nullptr;
26497 for (const auto &MI : MBB) {
26498 if (MI.isDebugValue())
26501 assert(MI.isEHLabel() && "expected EH_LABEL");
26502 Sym = MI.getOperand(0).getMCSymbol();
26506 if (!MF->hasCallSiteLandingPad(Sym))
26509 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26510 CallSiteNumToLPad[CSI].push_back(&MBB);
26511 MaxCSNum = std::max(MaxCSNum, CSI);
26515 // Get an ordered list of the machine basic blocks for the jump table.
26516 std::vector<MachineBasicBlock *> LPadList;
26517 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26518 LPadList.reserve(CallSiteNumToLPad.size());
26520 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26521 for (auto &LP : CallSiteNumToLPad[CSI]) {
26522 LPadList.push_back(LP);
26523 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26527 assert(!LPadList.empty() &&
26528 "No landing pad destinations for the dispatch jump table!");
26530 // Create the MBBs for the dispatch code.
26532 // Shove the dispatch's address into the return slot in the function context.
26533 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26534 DispatchBB->setIsEHPad(true);
26536 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26537 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26538 DispatchBB->addSuccessor(TrapBB);
26540 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26541 DispatchBB->addSuccessor(DispContBB);
26544 MF->push_back(DispatchBB);
26545 MF->push_back(DispContBB);
26546 MF->push_back(TrapBB);
26548 // Insert code into the entry block that creates and registers the function
// context.
26550 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26552 // Create the jump table and associated information
26553 MachineJumpTableInfo *JTI =
26554 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26555 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26557 const X86RegisterInfo &RI = TII->getRegisterInfo();
26558 // Add a register mask with no preserved registers. This results in all
26559 // registers being marked as clobbered.
26560 if (RI.hasBasePointer(*MF)) {
26561 const bool FPIs64Bit =
26562 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26563 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26564 MFI->setRestoreBasePointer(MF);
26566 unsigned FP = RI.getFrameRegister(*MF);
26567 unsigned BP = RI.getBaseRegister();
26568 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26569 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26570 MFI->getRestoreBasePointerOffset())
26571 .addRegMask(RI.getNoPreservedMask());
26573 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26574 .addRegMask(RI.getNoPreservedMask());
26577 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26578 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26580 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26582 .addImm(LPadList.size());
26583 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26585 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26586 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26589 BuildMI(DispContBB, DL,
26590 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26592 .addImm(Subtarget.is64Bit() ? 8 : 4)
26594 .addJumpTableIndex(MJTI)
26597 // Add the jump table entries as successors to the MBB.
26598 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26599 for (auto &LP : LPadList)
26600 if (SeenMBBs.insert(LP).second)
26601 DispContBB->addSuccessor(LP);
26603 // N.B. the order the invoke BBs are processed in doesn't matter here.
26604 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26605 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26606 for (MachineBasicBlock *MBB : InvokeBBs) {
26607 // Remove the landing pad successor from the invoke block and replace it
26608 // with the new dispatch block.
26609 // Keep a copy of Successors since it's modified inside the loop.
26610 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26612 // FIXME: Avoid quadratic complexity.
26613 for (auto MBBS : Successors) {
26614 if (MBBS->isEHPad()) {
26615 MBB->removeSuccessor(MBBS);
26616 MBBLPads.push_back(MBBS);
26620 MBB->addSuccessor(DispatchBB);
26622 // Find the invoke call and mark all of the callee-saved registers as
26623 // 'implicit defined' so that they're spilled. This prevents code from
26624 // moving instructions to before the EH block, where they will never be
// executed.
26626 for (auto &II : reverse(*MBB)) {
26630 DenseMap<unsigned, bool> DefRegs;
26631 for (auto &MOp : II.operands())
26633 DefRegs[MOp.getReg()] = true;
26635 MachineInstrBuilder MIB(*MF, &II);
26636 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26637 unsigned Reg = SavedRegs[RI];
26639 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26646 // Mark all former landing pads as non-landing pads. The dispatch is the only
26647 // landing pad now.
26648 for (auto &LP : MBBLPads)
26649 LP->setIsEHPad(false);
26651 // The instruction is gone now.
26652 MI.eraseFromParent();
26656 MachineBasicBlock *
26657 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26658 MachineBasicBlock *BB) const {
26659 MachineFunction *MF = BB->getParent();
26660 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26661 DebugLoc DL = MI.getDebugLoc();
26663 switch (MI.getOpcode()) {
26664 default: llvm_unreachable("Unexpected instr type to insert");
26665 case X86::TAILJMPd64:
26666 case X86::TAILJMPr64:
26667 case X86::TAILJMPm64:
26668 case X86::TAILJMPr64_REX:
26669 case X86::TAILJMPm64_REX:
26670 llvm_unreachable("TAILJMP64 would not be touched here.");
26671 case X86::TCRETURNdi64:
26672 case X86::TCRETURNri64:
26673 case X86::TCRETURNmi64:
26675 case X86::TLS_addr32:
26676 case X86::TLS_addr64:
26677 case X86::TLS_base_addr32:
26678 case X86::TLS_base_addr64:
26679 return EmitLoweredTLSAddr(MI, BB);
26680 case X86::CATCHRET:
26681 return EmitLoweredCatchRet(MI, BB);
26682 case X86::CATCHPAD:
26683 return EmitLoweredCatchPad(MI, BB);
26684 case X86::SEG_ALLOCA_32:
26685 case X86::SEG_ALLOCA_64:
26686 return EmitLoweredSegAlloca(MI, BB);
26687 case X86::TLSCall_32:
26688 case X86::TLSCall_64:
26689 return EmitLoweredTLSCall(MI, BB);
26690 case X86::CMOV_FR32:
26691 case X86::CMOV_FR64:
26692 case X86::CMOV_FR128:
26693 case X86::CMOV_GR8:
26694 case X86::CMOV_GR16:
26695 case X86::CMOV_GR32:
26696 case X86::CMOV_RFP32:
26697 case X86::CMOV_RFP64:
26698 case X86::CMOV_RFP80:
26699 case X86::CMOV_V2F64:
26700 case X86::CMOV_V2I64:
26701 case X86::CMOV_V4F32:
26702 case X86::CMOV_V4F64:
26703 case X86::CMOV_V4I64:
26704 case X86::CMOV_V16F32:
26705 case X86::CMOV_V8F32:
26706 case X86::CMOV_V8F64:
26707 case X86::CMOV_V8I64:
26708 case X86::CMOV_V8I1:
26709 case X86::CMOV_V16I1:
26710 case X86::CMOV_V32I1:
26711 case X86::CMOV_V64I1:
26712 return EmitLoweredSelect(MI, BB);
26714 case X86::RDFLAGS32:
26715 case X86::RDFLAGS64: {
26717 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26718 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26719 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26720 // Permit reads of the FLAGS register without it being defined.
26721 // This intrinsic exists to read external processor state in flags, such as
26722 // the trap flag, interrupt flag, and direction flag, none of which are
26723 // modeled by the backend.
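// For example, RDFLAGS64 expands to roughly:
//   pushfq
//   popq %dst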
26724 Push->getOperand(2).setIsUndef();
26725 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26727 MI.eraseFromParent(); // The pseudo is gone now.
26731 case X86::WRFLAGS32:
26732 case X86::WRFLAGS64: {
26734 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26736 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26737 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26738 BuildMI(*BB, MI, DL, TII->get(PopF));
26740 MI.eraseFromParent(); // The pseudo is gone now.
26744 case X86::RELEASE_FADD32mr:
26745 case X86::RELEASE_FADD64mr:
26746 return EmitLoweredAtomicFP(MI, BB);
26748 case X86::FP32_TO_INT16_IN_MEM:
26749 case X86::FP32_TO_INT32_IN_MEM:
26750 case X86::FP32_TO_INT64_IN_MEM:
26751 case X86::FP64_TO_INT16_IN_MEM:
26752 case X86::FP64_TO_INT32_IN_MEM:
26753 case X86::FP64_TO_INT64_IN_MEM:
26754 case X86::FP80_TO_INT16_IN_MEM:
26755 case X86::FP80_TO_INT32_IN_MEM:
26756 case X86::FP80_TO_INT64_IN_MEM: {
26757 // Change the floating point control register to use "round towards zero"
26758 // mode when truncating to an integer value.
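// (The x87 rounding-control field is bits 11:10 of the control word; the
// sequence below saves the current word, switches RC to round-toward-zero,
// performs the store, and then restores the saved word.)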
26759 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26760 addFrameReference(BuildMI(*BB, MI, DL,
26761 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26763 // Load the old value of the control word...
26765 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26766 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26769 // Set the high part to be round to zero...
26770 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26773 // Reload the modified control word now...
26774 addFrameReference(BuildMI(*BB, MI, DL,
26775 TII->get(X86::FLDCW16m)), CWFrameIdx);
26777 // Restore the memory image of control word to original value
26778 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26781 // Get the X86 opcode to use.
26783 switch (MI.getOpcode()) {
26784 default: llvm_unreachable("illegal opcode!");
26785 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26786 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26787 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26788 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26789 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26790 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26791 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26792 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26793 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26796 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26797 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26798 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26800 // Reload the original control word now.
26801 addFrameReference(BuildMI(*BB, MI, DL,
26802 TII->get(X86::FLDCW16m)), CWFrameIdx);
26804 MI.eraseFromParent(); // The pseudo instruction is gone now.
26807 // String/text processing lowering.
26808 case X86::PCMPISTRM128REG:
26809 case X86::VPCMPISTRM128REG:
26810 case X86::PCMPISTRM128MEM:
26811 case X86::VPCMPISTRM128MEM:
26812 case X86::PCMPESTRM128REG:
26813 case X86::VPCMPESTRM128REG:
26814 case X86::PCMPESTRM128MEM:
26815 case X86::VPCMPESTRM128MEM:
26816 assert(Subtarget.hasSSE42() &&
26817 "Target must have SSE4.2 or AVX features enabled");
26818 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26820 // String/text processing lowering.
26821 case X86::PCMPISTRIREG:
26822 case X86::VPCMPISTRIREG:
26823 case X86::PCMPISTRIMEM:
26824 case X86::VPCMPISTRIMEM:
26825 case X86::PCMPESTRIREG:
26826 case X86::VPCMPESTRIREG:
26827 case X86::PCMPESTRIMEM:
26828 case X86::VPCMPESTRIMEM:
26829 assert(Subtarget.hasSSE42() &&
26830 "Target must have SSE4.2 or AVX features enabled");
26831 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26833 // Thread synchronization.
26835 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26836 case X86::MONITORX:
26837 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26841 return emitClzero(&MI, BB, Subtarget);
26845 return emitWRPKRU(MI, BB, Subtarget);
26847 return emitRDPKRU(MI, BB, Subtarget);
26850 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26852 case X86::VASTART_SAVE_XMM_REGS:
26853 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26855 case X86::VAARG_64:
26856 return EmitVAARG64WithCustomInserter(MI, BB);
26858 case X86::EH_SjLj_SetJmp32:
26859 case X86::EH_SjLj_SetJmp64:
26860 return emitEHSjLjSetJmp(MI, BB);
26862 case X86::EH_SjLj_LongJmp32:
26863 case X86::EH_SjLj_LongJmp64:
26864 return emitEHSjLjLongJmp(MI, BB);
26866 case X86::Int_eh_sjlj_setup_dispatch:
26867 return EmitSjLjDispatchBlock(MI, BB);
26869 case TargetOpcode::STATEPOINT:
26870 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26871 // this point in the process. We diverge later.
26872 return emitPatchPoint(MI, BB);
26874 case TargetOpcode::STACKMAP:
26875 case TargetOpcode::PATCHPOINT:
26876 return emitPatchPoint(MI, BB);
26878 case TargetOpcode::PATCHABLE_EVENT_CALL:
26879 // Do nothing here; this is handled in the XRay instrumentation pass.
26882 case X86::LCMPXCHG8B: {
26883 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26884 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
26885 // requires a memory operand. If the current target is i686 and the current
26886 // function needs a base pointer
26887 // - which is ESI on i686 - the register allocator would not be able to
26888 // satisfy an address of the form X(%reg, %reg, Y):
26889 // there would never be enough unreserved registers during regalloc
26890 // (without the base pointer the only option would be X(%edi, %esi, Y)).
26891 // We help the register allocator by precomputing the address in
26892 // a new vreg using LEA.
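// For illustration: an operand such as 8(%esi,%edi,4) would first be
// materialized with
//   leal 8(%esi,%edi,4), %vreg
// so the CMPXCHG8B itself only needs the single base register %vreg.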
26894 // If this is not i686 or there is no base pointer, there is nothing to do here.
26895 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26898 // Even though this code does not strictly need the base pointer to
26899 // be ESI, we check for that. The reason: if this assert fails, the
26900 // compiler's base pointer handling has changed in a way that most
26901 // likely has to be addressed here as well.
26902 assert(TRI->getBaseRegister() == X86::ESI &&
26903 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26904 "base pointer in mind");
26906 MachineRegisterInfo &MRI = MF->getRegInfo();
26907 MVT SPTy = getPointerTy(MF->getDataLayout());
26908 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26909 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26911 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26912 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26913 // does not use an index register.
26914 if (AM.IndexReg == X86::NoRegister)
26917 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26918 // four operand definitions that are E[ABCD] registers. We skip them and
26919 // then insert the LEA.
26920 MachineBasicBlock::iterator MBBI(MI);
26921 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26922 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26925 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26927 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26931 case X86::LCMPXCHG16B:
26933 case X86::LCMPXCHG8B_SAVE_EBX:
26934 case X86::LCMPXCHG16B_SAVE_RBX: {
26936 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26937 if (!BB->isLiveIn(BasePtr))
26938 BB->addLiveIn(BasePtr);
26944 //===----------------------------------------------------------------------===//
26945 // X86 Optimization Hooks
26946 //===----------------------------------------------------------------------===//
26948 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26950 const APInt &DemandedElts,
26951 const SelectionDAG &DAG,
26952 unsigned Depth) const {
26953 unsigned BitWidth = Known.getBitWidth();
26954 unsigned Opc = Op.getOpcode();
26955 EVT VT = Op.getValueType();
26956 assert((Opc >= ISD::BUILTIN_OP_END ||
26957 Opc == ISD::INTRINSIC_WO_CHAIN ||
26958 Opc == ISD::INTRINSIC_W_CHAIN ||
26959 Opc == ISD::INTRINSIC_VOID) &&
26960 "Should use MaskedValueIsZero if you don't know whether Op"
26961 " is a target node!");
26977 // These nodes' second result is a boolean.
26978 if (Op.getResNo() == 0)
26981 case X86ISD::SETCC:
26982 Known.Zero.setBitsFrom(1);
26984 case X86ISD::MOVMSK: {
26985 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26986 Known.Zero.setBitsFrom(NumLoBits);
26989 case X86ISD::VSHLI:
26990 case X86ISD::VSRLI: {
26991 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26992 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26993 Known.setAllZero();
26997 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26998 unsigned ShAmt = ShiftImm->getZExtValue();
26999 if (Opc == X86ISD::VSHLI) {
27000 Known.Zero <<= ShAmt;
27001 Known.One <<= ShAmt;
27002 // Low bits are known zero.
27003 Known.Zero.setLowBits(ShAmt);
27005 Known.Zero.lshrInPlace(ShAmt);
27006 Known.One.lshrInPlace(ShAmt);
27007 // High bits are known zero.
27008 Known.Zero.setHighBits(ShAmt);
27013 case X86ISD::VZEXT: {
27014 SDValue N0 = Op.getOperand(0);
27015 unsigned NumElts = VT.getVectorNumElements();
27017 EVT SrcVT = N0.getValueType();
27018 unsigned InNumElts = SrcVT.getVectorNumElements();
27019 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27020 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27022 Known = KnownBits(InBitWidth);
27023 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27024 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27025 Known = Known.zext(BitWidth);
27026 Known.Zero.setBitsFrom(InBitWidth);
27032 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27033 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27034 unsigned Depth) const {
27035 unsigned VTBits = Op.getScalarValueSizeInBits();
27036 unsigned Opcode = Op.getOpcode();
27038 case X86ISD::SETCC_CARRY:
27039 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27042 case X86ISD::VSEXT: {
27043 SDValue Src = Op.getOperand(0);
27044 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27045 Tmp += VTBits - Src.getScalarValueSizeInBits();
27049 case X86ISD::VSHLI: {
27050 SDValue Src = Op.getOperand(0);
27051 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27052 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27053 if (ShiftVal.uge(VTBits))
27054 return VTBits; // Shifted all bits out --> zero.
27055 if (ShiftVal.uge(Tmp))
27056 return 1; // Shifted all sign bits out --> unknown.
27057 return Tmp - ShiftVal.getZExtValue();
27060 case X86ISD::VSRAI: {
27061 SDValue Src = Op.getOperand(0);
27062 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27063 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27065 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27068 case X86ISD::PCMPGT:
27069 case X86ISD::PCMPEQ:
27071 case X86ISD::VPCOM:
27072 case X86ISD::VPCOMU:
27073 // Vector compares return zero/all-bits result values.
27081 /// Returns true (and the GlobalValue and the offset) if the node is a
27082 /// GlobalAddress + offset.
27083 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27084 const GlobalValue* &GA,
27085 int64_t &Offset) const {
27086 if (N->getOpcode() == X86ISD::Wrapper) {
27087 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27088 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27089 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27093 return TargetLowering::isGAPlusOffset(N, GA, Offset);
27096 // Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
27098 // TODO: Investigate sharing more of this with shuffle lowering.
27099 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27100 bool AllowFloatDomain, bool AllowIntDomain,
27101 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27102 const X86Subtarget &Subtarget,
27103 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27104 unsigned NumMaskElts = Mask.size();
27105 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27107 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27108 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27109 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27110 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27111 unsigned MaxScale = 64 / MaskEltSize;
27112 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27114 unsigned NumDstElts = NumMaskElts / Scale;
27115 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27116 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27117 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27120 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27121 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
27122 if (SrcVT != MaskVT)
27123 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27124 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27125 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27126 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
27127 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27133 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27134 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27135 isUndefOrEqual(Mask[0], 0) &&
27136 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27137 Shuffle = X86ISD::VZEXT_MOVL;
27138 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27142 // Check if we have SSE3, which will let us use MOVDDUP etc. These
27143 // instructions are no slower than UNPCKLPD but have the option to
27144 // fold the input operand into even an unaligned memory load.
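// e.g. a v2f64 shuffle mask <0,0> can become
//   movddup (%rdi), %xmm0
// even when the loaded operand is unaligned.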
27145 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27146 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27147 Shuffle = X86ISD::MOVDDUP;
27148 SrcVT = DstVT = MVT::v2f64;
27151 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27152 Shuffle = X86ISD::MOVSLDUP;
27153 SrcVT = DstVT = MVT::v4f32;
27156 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27157 Shuffle = X86ISD::MOVSHDUP;
27158 SrcVT = DstVT = MVT::v4f32;
27163 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27164 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27165 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27166 Shuffle = X86ISD::MOVDDUP;
27167 SrcVT = DstVT = MVT::v4f64;
27170 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27171 Shuffle = X86ISD::MOVSLDUP;
27172 SrcVT = DstVT = MVT::v8f32;
27175 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27176 Shuffle = X86ISD::MOVSHDUP;
27177 SrcVT = DstVT = MVT::v8f32;
27182 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27183 assert(Subtarget.hasAVX512() &&
27184 "AVX512 required for 512-bit vector shuffles");
27185 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27186 Shuffle = X86ISD::MOVDDUP;
27187 SrcVT = DstVT = MVT::v8f64;
27190 if (isTargetShuffleEquivalent(
27191 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27192 Shuffle = X86ISD::MOVSLDUP;
27193 SrcVT = DstVT = MVT::v16f32;
27196 if (isTargetShuffleEquivalent(
27197 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27198 Shuffle = X86ISD::MOVSHDUP;
27199 SrcVT = DstVT = MVT::v16f32;
27204 // Attempt to match against broadcast-from-vector.
27205 if (Subtarget.hasAVX2()) {
27206 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27207 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27208 SrcVT = DstVT = MaskVT;
27209 Shuffle = X86ISD::VBROADCAST;
27217 // Attempt to match a combined shuffle mask against supported unary immediate
27218 // permute instructions.
27219 // TODO: Investigate sharing more of this with shuffle lowering.
27220 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27221 const APInt &Zeroable,
27222 bool AllowFloatDomain,
27223 bool AllowIntDomain,
27224 const X86Subtarget &Subtarget,
27225 unsigned &Shuffle, MVT &ShuffleVT,
27226 unsigned &PermuteImm) {
27227 unsigned NumMaskElts = Mask.size();
27228 unsigned InputSizeInBits = MaskVT.getSizeInBits();
27229 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27230 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27232 bool ContainsZeros =
27233 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27235 // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
27236 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27237 // Check for lane crossing permutes.
27238 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27239 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27240 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27241 Shuffle = X86ISD::VPERMI;
27242 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27243 PermuteImm = getV4X86ShuffleImm(Mask);
27246 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27247 SmallVector<int, 4> RepeatedMask;
27248 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27249 Shuffle = X86ISD::VPERMI;
27250 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27251 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27255 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27256 // VPERMILPD can permute with a non-repeating shuffle.
27257 Shuffle = X86ISD::VPERMILPI;
27258 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27260 for (int i = 0, e = Mask.size(); i != e; ++i) {
27262 if (M == SM_SentinelUndef)
27264 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27265 PermuteImm |= (M & 1) << i;
27271 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27272 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
27273 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
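// For reference, a repeated v4i32 mask <1,0,3,2> would be matched here and
// encoded as PSHUFD/VPERMILPS with immediate 0xB1 (fields 10'11'00'01).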
27274 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27275 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27276 SmallVector<int, 4> RepeatedMask;
27277 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27278 // Narrow the repeated mask to create 32-bit element permutes.
27279 SmallVector<int, 4> WordMask = RepeatedMask;
27280 if (MaskScalarSizeInBits == 64)
27281 scaleShuffleMask(2, RepeatedMask, WordMask);
27283 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27284 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27285 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27286 PermuteImm = getV4X86ShuffleImm(WordMask);
27291 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27292 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27293 SmallVector<int, 4> RepeatedMask;
27294 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27295 ArrayRef<int> LoMask(Mask.data() + 0, 4);
27296 ArrayRef<int> HiMask(Mask.data() + 4, 4);
27298 // PSHUFLW: permute lower 4 elements only.
27299 if (isUndefOrInRange(LoMask, 0, 4) &&
27300 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27301 Shuffle = X86ISD::PSHUFLW;
27302 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27303 PermuteImm = getV4X86ShuffleImm(LoMask);
27307 // PSHUFHW: permute upper 4 elements only.
27308 if (isUndefOrInRange(HiMask, 4, 8) &&
27309 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27310 // Offset the HiMask so that we can create the shuffle immediate.
27311 int OffsetHiMask[4];
27312 for (int i = 0; i != 4; ++i)
27313 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27315 Shuffle = X86ISD::PSHUFHW;
27316 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27317 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27323 // Attempt to match against byte/bit shifts.
27324 // FIXME: Add 512-bit support.
27325 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27326 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27327 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27328 MaskScalarSizeInBits, Mask,
27329 0, Zeroable, Subtarget);
27330 if (0 < ShiftAmt) {
27331 PermuteImm = (unsigned)ShiftAmt;
27339 // Attempt to match a combined unary shuffle mask against supported binary
27340 // shuffle instructions.
27341 // TODO: Investigate sharing more of this with shuffle lowering.
27342 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27343 bool AllowFloatDomain, bool AllowIntDomain,
27344 SDValue &V1, SDValue &V2, SDLoc &DL,
27346 const X86Subtarget &Subtarget,
27347 unsigned &Shuffle, MVT &ShuffleVT,
27349 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27351 if (MaskVT.is128BitVector()) {
27352 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27354 Shuffle = X86ISD::MOVLHPS;
27355 ShuffleVT = MVT::v4f32;
27358 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27360 Shuffle = X86ISD::MOVHLPS;
27361 ShuffleVT = MVT::v4f32;
27364 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27365 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27367 Shuffle = X86ISD::MOVSD;
27368 ShuffleVT = MaskVT;
27371 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27372 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27373 Shuffle = X86ISD::MOVSS;
27374 ShuffleVT = MaskVT;
27379 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27380 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27381 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27382 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27383 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27384 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27385 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27387 ShuffleVT = MaskVT;
27388 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27389 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27397 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27398 const APInt &Zeroable,
27399 bool AllowFloatDomain,
27400 bool AllowIntDomain,
27401 SDValue &V1, SDValue &V2, SDLoc &DL,
27403 const X86Subtarget &Subtarget,
27404 unsigned &Shuffle, MVT &ShuffleVT,
27405 unsigned &PermuteImm) {
27406 unsigned NumMaskElts = Mask.size();
27407 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27409 // Attempt to match against PALIGNR byte rotate.
27410 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27411 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27412 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27413 if (0 < ByteRotation) {
27414 Shuffle = X86ISD::PALIGNR;
27415 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27416 PermuteImm = ByteRotation;
27421 // Attempt to combine to X86ISD::BLENDI.
27422 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27423 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27424 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27425 uint64_t BlendMask = 0;
27426 bool ForceV1Zero = false, ForceV2Zero = false;
27427 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27428 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27430 if (MaskVT == MVT::v16i16) {
27431 // We can only use v16i16 PBLENDW if the lanes are repeated.
27432 SmallVector<int, 8> RepeatedMask;
27433 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27435 assert(RepeatedMask.size() == 8 &&
27436 "Repeated mask size doesn't match!");
27438 for (int i = 0; i < 8; ++i)
27439 if (RepeatedMask[i] >= 8)
27440 PermuteImm |= 1 << i;
27441 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27442 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27443 Shuffle = X86ISD::BLENDI;
27444 ShuffleVT = MaskVT;
27448 // Determine a type compatible with X86ISD::BLENDI.
27449 ShuffleVT = MaskVT;
27450 if (Subtarget.hasAVX2()) {
27451 if (ShuffleVT == MVT::v4i64)
27452 ShuffleVT = MVT::v8i32;
27453 else if (ShuffleVT == MVT::v2i64)
27454 ShuffleVT = MVT::v4i32;
27456 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27457 ShuffleVT = MVT::v8i16;
27458 else if (ShuffleVT == MVT::v4i64)
27459 ShuffleVT = MVT::v4f64;
27460 else if (ShuffleVT == MVT::v8i32)
27461 ShuffleVT = MVT::v8f32;
27464 if (!ShuffleVT.isFloatingPoint()) {
27465 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27467 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27468 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27469 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27472 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27473 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27474 PermuteImm = (unsigned)BlendMask;
27475 Shuffle = X86ISD::BLENDI;
27481 // Attempt to combine to INSERTPS.
27482 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27483 MaskVT.is128BitVector()) {
27484 if (Zeroable.getBoolValue() &&
27485 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27486 Shuffle = X86ISD::INSERTPS;
27487 ShuffleVT = MVT::v4f32;
27492 // Attempt to combine to SHUFPD.
27493 if (AllowFloatDomain && EltSizeInBits == 64 &&
27494 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27495 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27496 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27497 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27498 Shuffle = X86ISD::SHUFP;
27499 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27504 // Attempt to combine to SHUFPS.
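// SHUFPS selects the low two result elements from the first source and the
// high two from the second, two immediate bits per element. E.g. a repeated
// mask <0,1,6,7> would resolve to ShufMask {0,1,2,3} below with
// PermuteImm == 0xE4 (an illustrative walk-through of the MatchHalf logic).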
27505 if (AllowFloatDomain && EltSizeInBits == 32 &&
27506 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27507 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27508 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27509 SmallVector<int, 4> RepeatedMask;
27510 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27511 // Match each half of the repeated mask to determine if it's just
27512 // referencing one of the vectors, is zeroable, or is entirely undef.
27513 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27514 int M0 = RepeatedMask[Offset];
27515 int M1 = RepeatedMask[Offset + 1];
27517 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27518 return DAG.getUNDEF(MaskVT);
27519 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27520 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27521 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27522 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27523 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27524 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27525 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27527 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27528 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27529 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27536 int ShufMask[4] = {-1, -1, -1, -1};
27537 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27538 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27543 Shuffle = X86ISD::SHUFP;
27544 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27545 PermuteImm = getV4X86ShuffleImm(ShufMask);
27554 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27557 /// This is the leaf of the recursive combine below. When we have found some
27558 /// chain of single-use x86 shuffle instructions and accumulated the combined
27559 /// shuffle mask represented by them, this will try to pattern match that mask
27560 /// into either a single instruction if there is a special purpose instruction
27561 /// for this operation, or into a PSHUFB instruction which is a fully general
27562 /// instruction but should only be used to replace chains over a certain depth.
27563 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27564 ArrayRef<int> BaseMask, int Depth,
27565 bool HasVariableMask, SelectionDAG &DAG,
27566 TargetLowering::DAGCombinerInfo &DCI,
27567 const X86Subtarget &Subtarget) {
27568 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27569 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27570 "Unexpected number of shuffle inputs!");
27572 // Find the inputs that enter the chain. Note that multiple uses are OK
27573 // here, we're not going to remove the operands we find.
27574 bool UnaryShuffle = (Inputs.size() == 1);
27575 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27576 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27577 : peekThroughBitcasts(Inputs[1]));
27579 MVT VT1 = V1.getSimpleValueType();
27580 MVT VT2 = V2.getSimpleValueType();
27581 MVT RootVT = Root.getSimpleValueType();
27582 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27583 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27584 "Vector size mismatch");
27589 unsigned NumBaseMaskElts = BaseMask.size();
27590 if (NumBaseMaskElts == 1) {
27591 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27592 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27597 unsigned RootSizeInBits = RootVT.getSizeInBits();
27598 unsigned NumRootElts = RootVT.getVectorNumElements();
27599 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27600 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27601 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27603 // Don't combine if we are an AVX512/EVEX target and the mask element size
27604 // is different from the root element size - this would prevent writemasks
27605 // from being reused.
27606 // TODO - this currently prevents all lane shuffles from occurring.
27607 // TODO - check for writemasks usage instead of always preventing combining.
27608 // TODO - attempt to narrow Mask back to writemask size.
27609 bool IsEVEXShuffle =
27610 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27611 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27614 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27616 // Handle 128-bit lane shuffles of 256-bit vectors.
27617 // TODO - this should support binary shuffles.
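// VPERM2X128's immediate selects a 128-bit half per destination lane (two bits
// per lane, with bit 3/7 zeroing that lane). E.g. swapping the two halves of a
// 256-bit vector is BaseMask <1,0>, which gives PermMask == 0x01 below
// (sketch; only the unary case is handled here).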
27618 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27619 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27620 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27621 return false; // Nothing to do!
27622 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27623 unsigned PermMask = 0;
27624 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27625 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27627 Res = DAG.getBitcast(ShuffleVT, V1);
27628 DCI.AddToWorklist(Res.getNode());
27629 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27630 DAG.getUNDEF(ShuffleVT),
27631 DAG.getConstant(PermMask, DL, MVT::i8));
27632 DCI.AddToWorklist(Res.getNode());
27633 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27638 // For masks that have been widened to 128-bit elements or more,
27639 // narrow back down to 64-bit elements.
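// E.g. a widened 2-element mask <1,0> over 128-bit elements is rescaled here
// to the 64-bit-element mask <2,3,0,1> before we try to match it against
// specific instructions (illustrative example of scaleShuffleMask).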
27640 SmallVector<int, 64> Mask;
27641 if (BaseMaskEltSizeInBits > 64) {
27642 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27643 int MaskScale = BaseMaskEltSizeInBits / 64;
27644 scaleShuffleMask(MaskScale, BaseMask, Mask);
27646 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27649 unsigned NumMaskElts = Mask.size();
27650 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27652 // Determine the effective mask value type.
27653 FloatDomain &= (32 <= MaskEltSizeInBits);
27654 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27655 : MVT::getIntegerVT(MaskEltSizeInBits);
27656 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27658 // Only allow legal mask types.
27659 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27662 // Attempt to match the mask against known shuffle patterns.
27663 MVT ShuffleSrcVT, ShuffleVT;
27664 unsigned Shuffle, PermuteImm;
27666 // Which shuffle domains are permitted?
27667 // Permit domain crossing at higher combine depths.
27668 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27669 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27670 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27672 // Determine zeroable mask elements.
27673 APInt Zeroable(NumMaskElts, 0);
27674 for (unsigned i = 0; i != NumMaskElts; ++i)
27675 if (isUndefOrZero(Mask[i]))
27676 Zeroable.setBit(i);
27678 if (UnaryShuffle) {
27679 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27680 // directly if we don't shuffle the lower element and we shuffle the upper
27681 // (zero) elements within themselves.
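// E.g. with a 64-bit VZEXT_LOAD viewed as v4i32 (Scale == 2), a mask such as
// <0,1,3,2> keeps the loaded low half in place and only permutes the known
// zero upper elements, so the load itself can be reused (assumed example).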
27682 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27683 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27684 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27685 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27686 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27687 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27688 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27694 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27695 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27697 if (Depth == 1 && Root.getOpcode() == Shuffle)
27698 return false; // Nothing to do!
27699 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27700 return false; // AVX512 Writemask clash.
27701 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27702 DCI.AddToWorklist(Res.getNode());
27703 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27704 DCI.AddToWorklist(Res.getNode());
27705 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27710 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27711 AllowIntDomain, Subtarget, Shuffle,
27712 ShuffleVT, PermuteImm)) {
27713 if (Depth == 1 && Root.getOpcode() == Shuffle)
27714 return false; // Nothing to do!
27715 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27716 return false; // AVX512 Writemask clash.
27717 Res = DAG.getBitcast(ShuffleVT, V1);
27718 DCI.AddToWorklist(Res.getNode());
27719 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27720 DAG.getConstant(PermuteImm, DL, MVT::i8));
27721 DCI.AddToWorklist(Res.getNode());
27722 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27728 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27729 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27731 if (Depth == 1 && Root.getOpcode() == Shuffle)
27732 return false; // Nothing to do!
27733 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27734 return false; // AVX512 Writemask clash.
27735 V1 = DAG.getBitcast(ShuffleVT, V1);
27736 DCI.AddToWorklist(V1.getNode());
27737 V2 = DAG.getBitcast(ShuffleVT, V2);
27738 DCI.AddToWorklist(V2.getNode());
27739 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27740 DCI.AddToWorklist(Res.getNode());
27741 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27746 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27747 AllowIntDomain, V1, V2, DL, DAG,
27748 Subtarget, Shuffle, ShuffleVT,
27750 if (Depth == 1 && Root.getOpcode() == Shuffle)
27751 return false; // Nothing to do!
27752 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27753 return false; // AVX512 Writemask clash.
27754 V1 = DAG.getBitcast(ShuffleVT, V1);
27755 DCI.AddToWorklist(V1.getNode());
27756 V2 = DAG.getBitcast(ShuffleVT, V2);
27757 DCI.AddToWorklist(V2.getNode());
27758 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27759 DAG.getConstant(PermuteImm, DL, MVT::i8));
27760 DCI.AddToWorklist(Res.getNode());
27761 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27766 // Typically from here on, we need an integer version of MaskVT.
27767 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27768 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27770 // Annoyingly, SSE4A instructions don't map into the above match helpers.
27771 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27772 uint64_t BitLen, BitIdx;
27773 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27775 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27776 return false; // Nothing to do!
27777 V1 = DAG.getBitcast(IntMaskVT, V1);
27778 DCI.AddToWorklist(V1.getNode());
27779 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27780 DAG.getConstant(BitLen, DL, MVT::i8),
27781 DAG.getConstant(BitIdx, DL, MVT::i8));
27782 DCI.AddToWorklist(Res.getNode());
27783 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27788 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27789 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27790 return false; // Nothing to do!
27791 V1 = DAG.getBitcast(IntMaskVT, V1);
27792 DCI.AddToWorklist(V1.getNode());
27793 V2 = DAG.getBitcast(IntMaskVT, V2);
27794 DCI.AddToWorklist(V2.getNode());
27795 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27796 DAG.getConstant(BitLen, DL, MVT::i8),
27797 DAG.getConstant(BitIdx, DL, MVT::i8));
27798 DCI.AddToWorklist(Res.getNode());
27799 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27805 // Don't try to re-form single instruction chains under any circumstances now
27806 // that we've done encoding canonicalization for them.
27810 bool MaskContainsZeros =
27811 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27813 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27814 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27815 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27816 ((Subtarget.hasAVX2() &&
27817 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27818 (Subtarget.hasAVX512() &&
27819 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27820 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27821 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27822 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27823 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27824 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27825 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27826 DCI.AddToWorklist(VPermMask.getNode());
27827 Res = DAG.getBitcast(MaskVT, V1);
27828 DCI.AddToWorklist(Res.getNode());
27829 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27830 DCI.AddToWorklist(Res.getNode());
27831 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27836 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27837 // vector as the second source.
27838 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27839 ((Subtarget.hasAVX512() &&
27840 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27841 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27842 (Subtarget.hasVLX() &&
27843 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27844 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27845 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27846 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27847 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27848 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27849 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27850 for (unsigned i = 0; i != NumMaskElts; ++i)
27851 if (Mask[i] == SM_SentinelZero)
27852 Mask[i] = NumMaskElts + i;
27854 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27855 DCI.AddToWorklist(VPermMask.getNode());
27856 Res = DAG.getBitcast(MaskVT, V1);
27857 DCI.AddToWorklist(Res.getNode());
27858 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27859 DCI.AddToWorklist(Zero.getNode());
27860 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27861 DCI.AddToWorklist(Res.getNode());
27862 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27867 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27868 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27869 ((Subtarget.hasAVX512() &&
27870 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27871 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27872 (Subtarget.hasVLX() &&
27873 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27874 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27875 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27876 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27877 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27878 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27879 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27880 DCI.AddToWorklist(VPermMask.getNode());
27881 V1 = DAG.getBitcast(MaskVT, V1);
27882 DCI.AddToWorklist(V1.getNode());
27883 V2 = DAG.getBitcast(MaskVT, V2);
27884 DCI.AddToWorklist(V2.getNode());
27885 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27886 DCI.AddToWorklist(Res.getNode());
27887 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27894 // See if we can combine a single input shuffle with zeros to a bit-mask,
27895 // which is much simpler than any shuffle.
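// E.g. a unary mask <0,z,2,z> becomes an AND with the constant vector
// <-1,0,-1,0>, which is much cheaper than any shuffle (illustrative example).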
27896 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27897 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27898 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27899 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27900 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27901 APInt UndefElts(NumMaskElts, 0);
27902 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27903 for (unsigned i = 0; i != NumMaskElts; ++i) {
27905 if (M == SM_SentinelUndef) {
27906 UndefElts.setBit(i);
27909 if (M == SM_SentinelZero)
27911 EltBits[i] = AllOnes;
27913 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27914 DCI.AddToWorklist(BitMask.getNode());
27915 Res = DAG.getBitcast(MaskVT, V1);
27916 DCI.AddToWorklist(Res.getNode());
27917 unsigned AndOpcode =
27918 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27919 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27920 DCI.AddToWorklist(Res.getNode());
27921 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27926 // If we have a single input shuffle with different shuffle patterns in the
27927 // 128-bit lanes, use the variable mask to VPERMILPS.
27928 // TODO: Combine other mask types at higher depths.
27929 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27930 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27931 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27932 SmallVector<SDValue, 16> VPermIdx;
27933 for (int M : Mask) {
27935 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27936 VPermIdx.push_back(Idx);
27938 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27939 DCI.AddToWorklist(VPermMask.getNode());
27940 Res = DAG.getBitcast(MaskVT, V1);
27941 DCI.AddToWorklist(Res.getNode());
27942 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27943 DCI.AddToWorklist(Res.getNode());
27944 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27949 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27950 // to VPERMIL2PD/VPERMIL2PS.
27951 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27952 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27953 MaskVT == MVT::v8f32)) {
27954 // VPERMIL2 Operation.
27955 // Bits[3] - Match Bit.
27956 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27957 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
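// E.g. for v8f32 (two lanes, four elements per lane) a mask value of 9 maps
// to selector 5 below: element 1 within its lane, taken from the second
// source; a selector of 8 yields zero when the match bit is used (illustrative).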
27958 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27959 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27960 SmallVector<int, 8> VPerm2Idx;
27961 unsigned M2ZImm = 0;
27962 for (int M : Mask) {
27963 if (M == SM_SentinelUndef) {
27964 VPerm2Idx.push_back(-1);
27967 if (M == SM_SentinelZero) {
27969 VPerm2Idx.push_back(8);
27972 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27973 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27974 VPerm2Idx.push_back(Index);
27976 V1 = DAG.getBitcast(MaskVT, V1);
27977 DCI.AddToWorklist(V1.getNode());
27978 V2 = DAG.getBitcast(MaskVT, V2);
27979 DCI.AddToWorklist(V2.getNode());
27980 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27981 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27982 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27983 DAG.getConstant(M2ZImm, DL, MVT::i8));
27984 DCI.AddToWorklist(Res.getNode());
27985 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27990 // If we have 3 or more shuffle instructions or a chain involving a variable
27991 // mask, we can replace them with a single PSHUFB instruction profitably.
27992 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27993 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
27994 // more aggressive.
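// Each PSHUFB control byte selects a source byte within its own 128-bit lane
// via its low four bits; a control byte with the top bit set (255 below)
// forces that output byte to zero. The loop simply rescales the element mask
// to byte granularity (descriptive note).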
27995 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27996 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27997 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27998 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27999 SmallVector<SDValue, 16> PSHUFBMask;
28000 int NumBytes = RootVT.getSizeInBits() / 8;
28001 int Ratio = NumBytes / NumMaskElts;
28002 for (int i = 0; i < NumBytes; ++i) {
28003 int M = Mask[i / Ratio];
28004 if (M == SM_SentinelUndef) {
28005 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28008 if (M == SM_SentinelZero) {
28009 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28012 M = Ratio * M + i % Ratio;
28013 assert((M / 16) == (i / 16) && "Lane crossing detected");
28014 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28016 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28017 Res = DAG.getBitcast(ByteVT, V1);
28018 DCI.AddToWorklist(Res.getNode());
28019 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28020 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28021 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28022 DCI.AddToWorklist(Res.getNode());
28023 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28028 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28029 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28030 // slower than PSHUFB on targets that support both.
28031 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
28032 Subtarget.hasXOP()) {
28033 // VPPERM Mask Operation
28034 // Bits[4:0] - Byte Index (0 - 31)
28035 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
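// E.g. a control byte of 128 (0b10000000) has op field 4, i.e. a zeroed
// output byte, while byte indices 0-15 select from V1 and 16-31 from V2
// (illustrative reading of the encoding used below).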
28036 SmallVector<SDValue, 16> VPPERMMask;
28038 int Ratio = NumBytes / NumMaskElts;
28039 for (int i = 0; i < NumBytes; ++i) {
28040 int M = Mask[i / Ratio];
28041 if (M == SM_SentinelUndef) {
28042 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28045 if (M == SM_SentinelZero) {
28046 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28049 M = Ratio * M + i % Ratio;
28050 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28052 MVT ByteVT = MVT::v16i8;
28053 V1 = DAG.getBitcast(ByteVT, V1);
28054 DCI.AddToWorklist(V1.getNode());
28055 V2 = DAG.getBitcast(ByteVT, V2);
28056 DCI.AddToWorklist(V2.getNode());
28057 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28058 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28059 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28060 DCI.AddToWorklist(Res.getNode());
28061 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28066 // Failed to find any combines.
28070 // Attempt to constant fold all of the constant source ops.
28071 // Returns true if the entire shuffle is folded to a constant.
28072 // TODO: Extend this to merge multiple constant Ops and update the mask.
28073 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28074 ArrayRef<int> Mask, SDValue Root,
28075 bool HasVariableMask, SelectionDAG &DAG,
28076 TargetLowering::DAGCombinerInfo &DCI,
28077 const X86Subtarget &Subtarget) {
28078 MVT VT = Root.getSimpleValueType();
28080 unsigned SizeInBits = VT.getSizeInBits();
28081 unsigned NumMaskElts = Mask.size();
28082 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28083 unsigned NumOps = Ops.size();
28085 // Extract constant bits from each source op.
28086 bool OneUseConstantOp = false;
28087 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28088 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28089 for (unsigned i = 0; i != NumOps; ++i) {
28090 SDValue SrcOp = Ops[i];
28091 OneUseConstantOp |= SrcOp.hasOneUse();
28092 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28097 // Only fold if at least one of the constants is only used once or
28098 // the combined shuffle has included a variable mask shuffle; this
28099 // is to avoid constant pool bloat.
28100 if (!OneUseConstantOp && !HasVariableMask)
28103 // Shuffle the constant bits according to the mask.
28104 APInt UndefElts(NumMaskElts, 0);
28105 APInt ZeroElts(NumMaskElts, 0);
28106 APInt ConstantElts(NumMaskElts, 0);
28107 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28108 APInt::getNullValue(MaskSizeInBits));
28109 for (unsigned i = 0; i != NumMaskElts; ++i) {
28111 if (M == SM_SentinelUndef) {
28112 UndefElts.setBit(i);
28114 } else if (M == SM_SentinelZero) {
28115 ZeroElts.setBit(i);
28118 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28120 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28121 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28123 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28124 if (SrcUndefElts[SrcMaskIdx]) {
28125 UndefElts.setBit(i);
28129 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28130 APInt &Bits = SrcEltBits[SrcMaskIdx];
28132 ZeroElts.setBit(i);
28136 ConstantElts.setBit(i);
28137 ConstantBitData[i] = Bits;
28139 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28141 // Create the constant data.
28143 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28144 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28146 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28148 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28151 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28152 DCI.AddToWorklist(CstOp.getNode());
28153 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
28157 /// \brief Fully generic combining of x86 shuffle instructions.
28159 /// This should be the last combine run over the x86 shuffle instructions. Once
28160 /// they have been fully optimized, this will recursively consider all chains
28161 /// of single-use shuffle instructions, build a generic model of the cumulative
28162 /// shuffle operation, and check for simpler instructions which implement this
28163 /// operation. We use this primarily for two purposes:
28165 /// 1) Collapse generic shuffles to specialized single instructions when
28166 /// equivalent. In most cases, this is just an encoding size win, but
28167 /// sometimes we will collapse multiple generic shuffles into a single
28168 /// special-purpose shuffle.
28169 /// 2) Look for sequences of shuffle instructions with 3 or more total
28170 /// instructions, and replace them with the slightly more expensive SSSE3
28171 /// PSHUFB instruction if available. We do this as the last combining step
28172 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
28173 /// a suitable short sequence of other instructions. The PSHUFB will either
28174 /// use a register or have to read from memory and so is slightly (but only
28175 /// slightly) more expensive than the other shuffle instructions.
28177 /// Because this is inherently a quadratic operation (for each shuffle in
28178 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28179 /// This should never be an issue in practice as the shuffle lowering doesn't
28180 /// produce sequences of more than 8 instructions.
28182 /// FIXME: We will currently miss some cases where the redundant shuffling
28183 /// would simplify under the threshold for PSHUFB formation because of
28184 /// combine-ordering. To fix this, we should do the redundant instruction
28185 /// combining in this recursive walk.
28186 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28187 int SrcOpIndex, SDValue Root,
28188 ArrayRef<int> RootMask,
28189 ArrayRef<const SDNode*> SrcNodes,
28190 int Depth, bool HasVariableMask,
28192 TargetLowering::DAGCombinerInfo &DCI,
28193 const X86Subtarget &Subtarget) {
28194 // Bound the depth of our recursive combine because this is ultimately
28195 // quadratic in nature.
28199 // Directly rip through bitcasts to find the underlying operand.
28200 SDValue Op = SrcOps[SrcOpIndex];
28201 Op = peekThroughOneUseBitcasts(Op);
28203 MVT VT = Op.getSimpleValueType();
28204 if (!VT.isVector())
28205 return false; // Bail if we hit a non-vector.
28207 assert(Root.getSimpleValueType().isVector() &&
28208 "Shuffles operate on vector types!");
28209 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28210 "Can only combine shuffles of the same vector register size.");
28212 // Extract target shuffle mask and resolve sentinels and inputs.
28213 SmallVector<int, 64> OpMask;
28214 SmallVector<SDValue, 2> OpInputs;
28215 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28218 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28219 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28220 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28222 // Add the inputs to the Ops list, avoiding duplicates.
28223 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28225 int InputIdx0 = -1, InputIdx1 = -1;
28226 for (int i = 0, e = Ops.size(); i < e; ++i) {
28227 SDValue BC = peekThroughBitcasts(Ops[i]);
28228 if (Input0 && BC == peekThroughBitcasts(Input0))
28230 if (Input1 && BC == peekThroughBitcasts(Input1))
28234 if (Input0 && InputIdx0 < 0) {
28235 InputIdx0 = SrcOpIndex;
28236 Ops[SrcOpIndex] = Input0;
28238 if (Input1 && InputIdx1 < 0) {
28239 InputIdx1 = Ops.size();
28240 Ops.push_back(Input1);
28243 assert(((RootMask.size() > OpMask.size() &&
28244 RootMask.size() % OpMask.size() == 0) ||
28245 (OpMask.size() > RootMask.size() &&
28246 OpMask.size() % RootMask.size() == 0) ||
28247 OpMask.size() == RootMask.size()) &&
28248 "The smaller number of elements must divide the larger.");
28250 // This function can be performance-critical, so we rely on the power-of-2
28251 // knowledge that we have about the mask sizes to replace div/rem ops with
28252 // bit-masks and shifts.
28253 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28254 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28255 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28256 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28258 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28259 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28260 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28261 assert((RootRatio == 1 || OpRatio == 1) &&
28262 "Must not have a ratio for both incoming and op masks!");
28264 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28265 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28266 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28267 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28268 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28270 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28272 // Merge this shuffle operation's mask into our accumulated mask. Note that
28273 // this shuffle's mask will be the first applied to the input, followed by the
28274 // root mask to get us all the way to the root value arrangement. The reason
28275 // for this order is that we are recursing up the operation chain.
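// E.g. if the accumulated root mask is <1,0> (swap the two halves) and this
// op's mask is <2,3,0,1>, the merged mask becomes the identity <0,1,2,3>,
// i.e. the two shuffles cancel (worked example under the unary assumption).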
28276 for (unsigned i = 0; i < MaskWidth; ++i) {
28277 unsigned RootIdx = i >> RootRatioLog2;
28278 if (RootMask[RootIdx] < 0) {
28279 // This is a zero or undef lane, we're done.
28280 Mask[i] = RootMask[RootIdx];
28284 unsigned RootMaskedIdx =
28286 ? RootMask[RootIdx]
28287 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28289 // Just insert the scaled root mask value if it references an input other
28290 // than the SrcOp we're currently inserting.
28291 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28292 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28293 Mask[i] = RootMaskedIdx;
28297 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28298 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28299 if (OpMask[OpIdx] < 0) {
28300 // The incoming lanes are zero or undef; it doesn't matter which ones we use.
28302 Mask[i] = OpMask[OpIdx];
28306 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28307 unsigned OpMaskedIdx =
28310 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
28312 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28313 if (OpMask[OpIdx] < (int)OpMask.size()) {
28314 assert(0 <= InputIdx0 && "Unknown target shuffle input");
28315 OpMaskedIdx += InputIdx0 * MaskWidth;
28317 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28318 OpMaskedIdx += InputIdx1 * MaskWidth;
28321 Mask[i] = OpMaskedIdx;
28324 // Handle the all undef/zero cases early.
28325 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28326 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28329 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28330 // TODO - should we handle the mixed zero/undef case as well? Just returning
28331 // a zero mask will lose information on undef elements, possibly reducing
28332 // future combine possibilities.
28333 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28334 Subtarget, DAG, SDLoc(Root)));
28338 // Remove unused shuffle source ops.
28339 resolveTargetShuffleInputsAndMask(Ops, Mask);
28340 assert(!Ops.empty() && "Shuffle with no inputs detected");
28342 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28344 // Update the list of shuffle nodes that have been combined so far.
28345 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28347 CombinedNodes.push_back(Op.getNode());
28349 // See if we can recurse into each shuffle source op (if it's a target
28350 // shuffle). The source op should only be combined if it either has a
28351 // single use (i.e. current Op) or all its users have already been combined.
28352 for (int i = 0, e = Ops.size(); i < e; ++i)
28353 if (Ops[i].getNode()->hasOneUse() ||
28354 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28355 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28356 Depth + 1, HasVariableMask, DAG, DCI,
28360 // Attempt to constant fold all of the constant source ops.
28361 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28365 // We can only combine unary and binary shuffle mask cases.
28366 if (Ops.size() > 2)
28369 // Minor canonicalization of the accumulated shuffle mask to make it easier
28370 // to match below. All this does is detect masks with sequential pairs of
28371 // elements, and shrink them to the half-width mask. It does this in a loop
28372 // so it will reduce the size of the mask to the minimal width mask which
28373 // performs an equivalent shuffle.
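// E.g. <0,1,4,5,2,3,6,7> widens to <0,2,1,3>, at which point the pairs are no
// longer sequential and the loop stops (illustrative example).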
28374 SmallVector<int, 64> WidenedMask;
28375 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28376 Mask = std::move(WidenedMask);
28379 // Canonicalization of binary shuffle masks to improve pattern matching by
28380 // commuting the inputs.
28381 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28382 ShuffleVectorSDNode::commuteMask(Mask);
28383 std::swap(Ops[0], Ops[1]);
28386 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28390 /// \brief Get the PSHUF-style mask from PSHUF node.
28392 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28393 /// PSHUF-style masks that can be reused with such instructions.
28394 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28395 MVT VT = N.getSimpleValueType();
28396 SmallVector<int, 4> Mask;
28397 SmallVector<SDValue, 2> Ops;
28400 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28404 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28405 // matter. Check that the upper masks are repeats and remove them.
28406 if (VT.getSizeInBits() > 128) {
28407 int LaneElts = 128 / VT.getScalarSizeInBits();
28409 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28410 for (int j = 0; j < LaneElts; ++j)
28411 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28412 "Mask doesn't repeat in high 128-bit lanes!");
28414 Mask.resize(LaneElts);
28417 switch (N.getOpcode()) {
28418 case X86ISD::PSHUFD:
28420 case X86ISD::PSHUFLW:
28423 case X86ISD::PSHUFHW:
28424 Mask.erase(Mask.begin(), Mask.begin() + 4);
28425 for (int &M : Mask)
28429 llvm_unreachable("No valid shuffle instruction found!");
28433 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28435 /// We walk up the chain and look for a combinable shuffle, skipping over
28436 /// shuffles that we could hoist this shuffle's transformation past without
28437 /// altering anything.
28439 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28440 SelectionDAG &DAG) {
28441 assert(N.getOpcode() == X86ISD::PSHUFD &&
28442 "Called with something other than an x86 128-bit half shuffle!");
28445 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28446 // of the shuffles in the chain so that we can form a fresh chain to replace
28448 SmallVector<SDValue, 8> Chain;
28449 SDValue V = N.getOperand(0);
28450 for (; V.hasOneUse(); V = V.getOperand(0)) {
28451 switch (V.getOpcode()) {
28453 return SDValue(); // Nothing combined!
28456 // Skip bitcasts as we always know the type for the target specific shuffles.
28460 case X86ISD::PSHUFD:
28461 // Found another dword shuffle.
28464 case X86ISD::PSHUFLW:
28465 // Check that the low words (being shuffled) are the identity in the
28466 // dword shuffle, and the high words are self-contained.
28467 if (Mask[0] != 0 || Mask[1] != 1 ||
28468 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28471 Chain.push_back(V);
28474 case X86ISD::PSHUFHW:
28475 // Check that the high words (being shuffled) are the identity in the
28476 // dword shuffle, and the low words are self-contained.
28477 if (Mask[2] != 2 || Mask[3] != 3 ||
28478 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28481 Chain.push_back(V);
28484 case X86ISD::UNPCKL:
28485 case X86ISD::UNPCKH:
28486 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28487 // shuffle into a preceding word shuffle.
28488 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28489 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28492 // Search for a half-shuffle which we can combine with.
28493 unsigned CombineOp =
28494 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28495 if (V.getOperand(0) != V.getOperand(1) ||
28496 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28498 Chain.push_back(V);
28499 V = V.getOperand(0);
28501 switch (V.getOpcode()) {
28503 return SDValue(); // Nothing to combine.
28505 case X86ISD::PSHUFLW:
28506 case X86ISD::PSHUFHW:
28507 if (V.getOpcode() == CombineOp)
28510 Chain.push_back(V);
28514 V = V.getOperand(0);
28518 } while (V.hasOneUse());
28521 // Break out of the loop if we break out of the switch.
28525 if (!V.hasOneUse())
28526 // We fell out of the loop without finding a viable combining instruction.
28529 // Merge this node's mask and our incoming mask.
28530 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28531 for (int &M : Mask)
28533 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28534 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28536 // Rebuild the chain around this new shuffle.
28537 while (!Chain.empty()) {
28538 SDValue W = Chain.pop_back_val();
28540 if (V.getValueType() != W.getOperand(0).getValueType())
28541 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28543 switch (W.getOpcode()) {
28545 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28547 case X86ISD::UNPCKL:
28548 case X86ISD::UNPCKH:
28549 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28552 case X86ISD::PSHUFD:
28553 case X86ISD::PSHUFLW:
28554 case X86ISD::PSHUFHW:
28555 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28559 if (V.getValueType() != N.getValueType())
28560 V = DAG.getBitcast(N.getValueType(), V);
28562 // Return the new chain to replace N.
28566 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28569 /// We walk up the chain, skipping shuffles of the other half and looking
28570 /// through shuffles which switch halves trying to find a shuffle of the same
28571 /// pair of dwords.
28572 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28574 TargetLowering::DAGCombinerInfo &DCI) {
28576 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28577 "Called with something other than an x86 128-bit half shuffle!");
28579 unsigned CombineOpcode = N.getOpcode();
28581 // Walk up a single-use chain looking for a combinable shuffle.
28582 SDValue V = N.getOperand(0);
28583 for (; V.hasOneUse(); V = V.getOperand(0)) {
28584 switch (V.getOpcode()) {
28586 return false; // Nothing combined!
28589 // Skip bitcasts as we always know the type for the target specific shuffles.
28593 case X86ISD::PSHUFLW:
28594 case X86ISD::PSHUFHW:
28595 if (V.getOpcode() == CombineOpcode)
28598 // Other-half shuffles are no-ops.
28601 // Break out of the loop if we break out of the switch.
28605 if (!V.hasOneUse())
28606 // We fell out of the loop without finding a viable combining instruction.
28609 // Combine away the bottom node as its shuffle will be accumulated into
28610 // a preceding shuffle.
28611 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28613 // Record the old value.
28616 // Merge this node's mask and our incoming mask (adjusted to account for all
28617 // the pshufd instructions encountered).
28618 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28619 for (int &M : Mask)
28621 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28622 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28624 // Check that the shuffles didn't cancel each other out. If not, we need to
28625 // combine to the new one.
28627 // Replace the combinable shuffle with the combined one, updating all users
28628 // so that we re-evaluate the chain here.
28629 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28634 /// \brief Try to combine x86 target specific shuffles.
28635 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28636 TargetLowering::DAGCombinerInfo &DCI,
28637 const X86Subtarget &Subtarget) {
28639 MVT VT = N.getSimpleValueType();
28640 SmallVector<int, 4> Mask;
28642 unsigned Opcode = N.getOpcode();
28644 case X86ISD::PSHUFD:
28645 case X86ISD::PSHUFLW:
28646 case X86ISD::PSHUFHW:
28647 Mask = getPSHUFShuffleMask(N);
28648 assert(Mask.size() == 4);
28650 case X86ISD::UNPCKL: {
28651 auto Op0 = N.getOperand(0);
28652 auto Op1 = N.getOperand(1);
28653 unsigned Opcode0 = Op0.getOpcode();
28654 unsigned Opcode1 = Op1.getOpcode();
28656 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28657 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28658 // TODO: Add other horizontal operations as required.
28659 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28660 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28662 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28663 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28664 // moves the upper half elements into the lower half. For example:
28666 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28668 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28670 // will be combined to:
28672 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28674 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28675 // happen due to advanced instructions.
28676 if (!VT.is128BitVector())
28679 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28680 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28682 unsigned NumElts = VT.getVectorNumElements();
28683 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28684 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28687 auto ShufOp = Op1.getOperand(0);
28688 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28689 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28693 case X86ISD::BLENDI: {
28694 SDValue V0 = N->getOperand(0);
28695 SDValue V1 = N->getOperand(1);
28696 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28697 "Unexpected input vector types");
28699 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28700 // operands and changing the mask to 1. This saves us a bunch of
28701 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28702 // x86InstrInfo knows how to commute this back after instruction selection
28703 // if it would help register allocation.
28705 // TODO: If optimizing for size or a processor that doesn't suffer from
28706 // partial register update stalls, this should be transformed into a MOVSD
28707 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
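// E.g. a blend with mask 2 takes element 0 from V0 and element 1 from V1;
// swapping the operands and using mask 1 produces the same result, which is
// the form the scalar math-op patterns expect (sketch of why the swap is safe).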
28709 if (VT == MVT::v2f64)
28710 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28711 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28712 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28713 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28718 case X86ISD::MOVSD:
28719 case X86ISD::MOVSS: {
28720 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28721 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28722 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28723 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28724 if (isZero0 && isZero1)
28727 // We often lower to MOVSD/MOVSS from integer as well as native float
28728 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28729 // easier to combine shuffles later on. We've already accounted for the
28730 // domain switching cost when we decided to lower with it.
28731 bool isFloat = VT.isFloatingPoint();
28732 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28733 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28734 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28735 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28736 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28737 V0 = DAG.getBitcast(NewVT, V0);
28738 V1 = DAG.getBitcast(NewVT, V1);
28739 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28744 case X86ISD::INSERTPS: {
28745 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28746 SDValue Op0 = N.getOperand(0);
28747 SDValue Op1 = N.getOperand(1);
28748 SDValue Op2 = N.getOperand(2);
28749 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28750 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28751 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28752 unsigned ZeroMask = InsertPSMask & 0xF;
28754 // If we zero out all elements from Op0 then we don't need to reference it.
28755 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28756 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28757 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28759 // If we zero out the element from Op1 then we don't need to reference it.
28760 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28761 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28762 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28764 // Attempt to merge insertps Op1 with an inner target shuffle node.
28765 SmallVector<int, 8> TargetMask1;
28766 SmallVector<SDValue, 2> Ops1;
28767 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28768 int M = TargetMask1[SrcIdx];
28769 if (isUndefOrZero(M)) {
28770 // Zero/UNDEF insertion - zero out element and remove dependency.
28771 InsertPSMask |= (1u << DstIdx);
28772 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28773 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28775 // Update insertps mask srcidx and reference the source input directly.
28776 assert(0 <= M && M < 8 && "Shuffle index out of range");
28777 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28778 Op1 = Ops1[M < 4 ? 0 : 1];
28779 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28780 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28783 // Attempt to merge insertps Op0 with an inner target shuffle node.
28784 SmallVector<int, 8> TargetMask0;
28785 SmallVector<SDValue, 2> Ops0;
28786 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28789 bool Updated = false;
28790 bool UseInput00 = false;
28791 bool UseInput01 = false;
28792 for (int i = 0; i != 4; ++i) {
28793 int M = TargetMask0[i];
28794 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28795 // No change if element is already zero or the inserted element.
28797 } else if (isUndefOrZero(M)) {
28798 // If the target mask is undef/zero then we must zero the element.
28799 InsertPSMask |= (1u << i);
28804 // The input vector element must be inline.
28805 if (M != i && M != (i + 4))
28808 // Determine which inputs of the target shuffle we're using.
28809 UseInput00 |= (0 <= M && M < 4);
28810 UseInput01 |= (4 <= M);
28813 // If we're not using both inputs of the target shuffle then use the
28814 // referenced input directly.
28815 if (UseInput00 && !UseInput01) {
28818 } else if (!UseInput00 && UseInput01) {
28824 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28825 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28833 // Nuke no-op shuffles that show up after combining.
28834 if (isNoopShuffleMask(Mask))
28835 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28837 // Look for simplifications involving one or two shuffle instructions.
28838 SDValue V = N.getOperand(0);
28839 switch (N.getOpcode()) {
28842 case X86ISD::PSHUFLW:
28843 case X86ISD::PSHUFHW:
28844 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28846 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28847 return SDValue(); // We combined away this shuffle, so we're done.
28849 // See if this reduces to a PSHUFD which is no more expensive and can
28850 // combine with more operations. Note that it has to at least flip the
28851 // dwords as otherwise it would have been removed as a no-op.
28852 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28853 int DMask[] = {0, 1, 2, 3};
28854 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28855 DMask[DOffset + 0] = DOffset + 1;
28856 DMask[DOffset + 1] = DOffset + 0;
28857 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28858 V = DAG.getBitcast(DVT, V);
28859 DCI.AddToWorklist(V.getNode());
28860 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28861 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28862 DCI.AddToWorklist(V.getNode());
28863 return DAG.getBitcast(VT, V);
28866 // Look for shuffle patterns which can be implemented as a single unpack.
28867 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28868 // only works when we have a PSHUFD followed by two half-shuffles.
28869 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28870 (V.getOpcode() == X86ISD::PSHUFLW ||
28871 V.getOpcode() == X86ISD::PSHUFHW) &&
28872 V.getOpcode() != N.getOpcode() &&
28874 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28875 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28876 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28877 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28878 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28879 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28881 for (int i = 0; i < 4; ++i) {
28882 WordMask[i + NOffset] = Mask[i] + NOffset;
28883 WordMask[i + VOffset] = VMask[i] + VOffset;
28885 // Map the word mask through the DWord mask.
28887 for (int i = 0; i < 8; ++i)
28888 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28889 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28890 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28891 // We can replace all three shuffles with an unpack.
28892 V = DAG.getBitcast(VT, D.getOperand(0));
28893 DCI.AddToWorklist(V.getNode());
28894 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28903 case X86ISD::PSHUFD:
28904 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28913 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28914 /// operation. If true is returned then the operands of ADDSUB operation
28915 /// are written to the parameters \p Opnd0 and \p Opnd1.
28917 /// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
28918 /// so it is easier to generically match. We also insert dummy vector shuffle
28919 /// nodes for the operands which explicitly discard the lanes which are unused
28920 /// by this operation, so that the fact that they're unused can flow through
28921 /// the rest of the combiner.
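/// For instance, for v4f32 a shuffle <0,5,2,7> of (fsub A, B) and (fadd A, B)
/// computes {A0-B0, A1+B1, A2-B2, A3+B3}, which is exactly ADDSUBPS
/// (illustrative example of the patterns accepted below).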
28922 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28923 SDValue &Opnd0, SDValue &Opnd1) {
28925 EVT VT = N->getValueType(0);
28926 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28927 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28928 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28931 // We only handle target-independent shuffles.
28932 // FIXME: It would be easy and harmless to use the target shuffle mask
28933 // extraction tool to support more.
28934 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28937 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28938 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28940 SDValue V1 = N->getOperand(0);
28941 SDValue V2 = N->getOperand(1);
28943 // We require the first shuffle operand to be the FSUB node, and the second to
28944 // be the FADD node.
28945 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28946 ShuffleVectorSDNode::commuteMask(Mask);
28948 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28951 // If there are other uses of these operations we can't fold them.
28952 if (!V1->hasOneUse() || !V2->hasOneUse())
28955 // Ensure that both operations have the same operands. Note that we can
28956 // commute the FADD operands.
28957 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28958 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28959 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28962 // We're looking for blends between FADD and FSUB nodes. We insist on these
28963 // nodes being lined up in a specific expected pattern.
28964 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28965 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28966 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28967 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28968 8, 25, 10, 27, 12, 29, 14, 31})))
28976 /// \brief Try to combine a shuffle into a target-specific add-sub or
28977 /// mul-add-sub node.
28978 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28979 const X86Subtarget &Subtarget,
28980 SelectionDAG &DAG) {
28981 SDValue Opnd0, Opnd1;
28982 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28985 EVT VT = N->getValueType(0);
28988 // Try to generate X86ISD::FMADDSUB node here.
28990 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28991 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28993 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28994 // the ADDSUB idiom has been successfully recognized. There are no known
28995 // X86 targets with 512-bit ADDSUB instructions!
28996 if (VT.is512BitVector())
28999 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29002 // We are looking for a shuffle where both sources are concatenations of a
29003 // half-width vector with undef. AVX2 has VPERMD/Q, so if we can express this
29004 // as a single-source shuffle, that's preferable.
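// E.g. with a v4i32 output and v2i32 inputs t1/t2, a mask <0,1,4,5> over
// concat(t1, undef) and concat(t2, undef) is rewritten as the single-source
// shuffle <0,1,2,3> of concat(t1, t2) (illustrative example).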
29005 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29006 const X86Subtarget &Subtarget) {
29007 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29010 EVT VT = N->getValueType(0);
29012 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29013 if (!VT.is128BitVector() && !VT.is256BitVector())
29016 if (VT.getVectorElementType() != MVT::i32 &&
29017 VT.getVectorElementType() != MVT::i64 &&
29018 VT.getVectorElementType() != MVT::f32 &&
29019 VT.getVectorElementType() != MVT::f64)
29022 SDValue N0 = N->getOperand(0);
29023 SDValue N1 = N->getOperand(1);
29025 // Check that both sources are concats with undef.
29026 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29027 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29028 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29029 !N1.getOperand(1).isUndef())
29032 // Construct the new shuffle mask. Elements from the first source retain their
29033 // index, but elements from the second source no longer need to skip an undef.
  SmallVector<int, 8> Mask;
  int NumElts = VT.getVectorNumElements();

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  for (int Elt : SVOp->getMask())
    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));

  SDLoc DL(N);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
                               N1.getOperand(0));
  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // If we have legalized the vector types, look for blends of FADD and FSUB
  // nodes that we can fuse into an ADDSUB node.
  if (TLI.isTypeLegal(VT))
    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
      return AddSub;

  // During Type Legalization, when promoting illegal vector types,
  // the backend might introduce new shuffle dag nodes and bitcasts.
  //
  // This code performs the following transformation:
  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
  //
  // We do this only if both the bitcast and the BINOP dag nodes have
  // one use. Also, perform this transformation only if the new binary
  // operation is legal. This is to avoid introducing dag nodes that
  // potentially need to be further expanded (or custom lowered) into a
  // less optimal sequence of dag nodes.
  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
      N->getOperand(0).getOpcode() == ISD::BITCAST &&
      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);

    SDValue BC0 = N0.getOperand(0);
    EVT SVT = BC0.getValueType();
    unsigned Opcode = BC0.getOpcode();
    unsigned NumElts = VT.getVectorNumElements();

    if (BC0.hasOneUse() && SVT.isVector() &&
        SVT.getVectorNumElements() * 2 == NumElts &&
        TLI.isOperationLegal(Opcode, VT)) {
      bool CanFold = false;
      switch (Opcode) {
      default: break;
      case ISD::ADD:
      case ISD::SUB:
      case ISD::MUL:
        // isOperationLegal lies for integer ops on floating point types.
        CanFold = VT.isInteger();
        break;
      case ISD::FADD:
      case ISD::FSUB:
      case ISD::FMUL:
        // isOperationLegal lies for floating point ops on integer types.
        CanFold = VT.isFloatingPoint();
        break;
      }

      unsigned SVTNumElts = SVT.getVectorNumElements();
      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
        CanFold = SVOp->getMaskElt(i) < 0;

      if (CanFold) {
        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
      }
    }
  }

  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
  // consecutive, non-overlapping, and in the right order.
  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
      Elts.push_back(Elt);
      continue;
    }
    Elts.clear();
    break;
  }

  if (Elts.size() == VT.getVectorNumElements())
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
      return LD;

  // For AVX2, we sometimes want to combine
  // (vector_shuffle <mask> (concat_vectors t1, undef)
  //                        (concat_vectors t2, undef))
  // Into:
  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
    return ShufConcat;

  if (isTargetShuffle(N->getOpcode())) {
    SDValue Op(N, 0);
    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
      return Shuffle;

    // Try recursively combining arbitrary sequences of x86 shuffle
    // instructions into higher-order shuffles. We do this after combining
    // specific PSHUF instruction sequences into their minimal form so that we
    // can evaluate how many specialized shuffle instructions are involved in
    // a particular chain.
    SmallVector<int, 1> NonceMask; // Just a placeholder.
    NonceMask.push_back(0);
    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }

  return SDValue();
}
/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here.
static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector() ||
      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = CurrentVT.getVectorNumElements();
  int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
                                         : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job.
  SDLoc dl(N);

  // Create shuffle node taking into account the case that it's a unary shuffle
  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
                                 ShuffleMask);
  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}
// Try to match patterns such as
// (i16 bitcast (v16i1 x))
// ->
// (i16 movmsk (16i8 sext (v16i1 x)))
// before the illegal vector is scalarized on subtargets that don't have legal
// vxi1 types.
static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
                                  const X86Subtarget &Subtarget) {
  EVT VT = BitCast.getValueType();
  SDValue N0 = BitCast.getOperand(0);
  EVT VecVT = N0->getValueType(0);

  if (!VT.isScalarInteger() || !VecVT.isSimple())
    return SDValue();

  // With AVX512 vxi1 types are legal and we prefer using k-regs.
  // MOVMSK is supported in SSE2 or later.
  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
    return SDValue();

  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
  // v8i16 and v16i16.
  // For these two cases, we can shuffle the upper element bytes to a
  // consecutive sequence at the start of the vector and treat the results as
  // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
  // for v16i16 this is not the case, because the shuffle is expensive, so we
  // avoid sign-extending to this type entirely.
  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
  MVT SExtVT;
  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
  switch (VecVT.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v2i1:
    SExtVT = MVT::v2i64;
    FPCastVT = MVT::v2f64;
    break;
  case MVT::v4i1:
    SExtVT = MVT::v4i32;
    FPCastVT = MVT::v4f32;
    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
    // sign-extend to a 256-bit operation to avoid truncation.
    if (N0->getOpcode() == ISD::SETCC &&
        N0->getOperand(0)->getValueType(0).is256BitVector() &&
        Subtarget.hasInt256()) {
      SExtVT = MVT::v4i64;
      FPCastVT = MVT::v4f64;
    }
    break;
  case MVT::v8i1:
    SExtVT = MVT::v8i16;
    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
    // sign-extend to a 256-bit operation to match the compare.
    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
    // 256-bit because the shuffle is cheaper than sign extending the result of
    // the compare.
    if (N0->getOpcode() == ISD::SETCC &&
        N0->getOperand(0)->getValueType(0).is256BitVector() &&
        Subtarget.hasInt256()) {
      SExtVT = MVT::v8i32;
      FPCastVT = MVT::v8f32;
    }
    break;
  case MVT::v16i1:
    SExtVT = MVT::v16i8;
    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
    // it is not profitable to sign-extend to 256-bit because this will
    // require an extra cross-lane shuffle which is more expensive than
    // truncating the result of the compare to 128-bits.
    break;
  case MVT::v32i1:
    // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
    if (!Subtarget.hasInt256())
      return SDValue();
    SExtVT = MVT::v32i8;
    break;
  }

  SDLoc DL(BitCast);
  SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
  if (SExtVT == MVT::v8i16) {
    V = DAG.getBitcast(MVT::v16i8, V);
    V = DAG.getVectorShuffle(
        MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
  } else
    assert(SExtVT.getScalarType() != MVT::i16 &&
           "Vectors of i16 must be shuffled");
  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
    V = DAG.getBitcast(FPCastVT, V);
  V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
  return DAG.getZExtOrTrunc(V, DL, VT);
}
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();

  // Try to match patterns such as
  // (i16 bitcast (v16i1 x))
  // ->
  // (i16 movmsk (16i8 sext (v16i1 x)))
  // before the setcc result is scalarized on subtargets that don't have legal
  // vxi1 types.
  if (DCI.isBeforeLegalize())
    if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
      return V;

  // Since MMX types are special and don't usually play with other vector types,
  // it's better to handle them early to be sure we emit efficient code by
  // avoiding store-load conversions.

  // Detect bitcasts between i32 to x86mmx low word.
  if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType() == MVT::i32)
      return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
  }

  // Detect bitcasts between element or subvector extraction to x86mmx.
  if (VT == MVT::x86mmx &&
      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
      isNullConstant(N0.getOperand(1))) {
    SDValue N00 = N0->getOperand(0);
    if (N00.getValueType().is128BitVector())
      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
                         DAG.getBitcast(MVT::v2i64, N00));
  }

  // Detect bitcasts from FP_TO_SINT to x86mmx.
  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
      N0.getOpcode() == ISD::FP_TO_SINT) {
    SDLoc DL(N0);
    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                              DAG.getUNDEF(MVT::v2i32));
    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
                       DAG.getBitcast(MVT::v2i64, Res));
  }

  // Convert a bitcasted integer logic operation that has one bitcasted
  // floating-point operand into a floating-point logic operation. This may
  // create a load of a constant, but that is cheaper than materializing the
  // constant in an integer register and transferring it to an SSE register or
  // transferring the SSE operand to integer register and back.
  unsigned FPOpcode;
  switch (N0.getOpcode()) {
  case ISD::AND: FPOpcode = X86ISD::FAND; break;
  case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
  case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
  default: return SDValue();
  }

  if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
        (Subtarget.hasSSE2() && VT == MVT::f64)))
    return SDValue();

  SDValue LogicOp0 = N0.getOperand(0);
  SDValue LogicOp1 = N0.getOperand(1);
  SDLoc DL0(N0);

  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
  if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
    SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
  }
  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
  if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
      LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
      !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
    SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
    return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
  }

  return SDValue();
}
// Match a binop + shuffle pyramid that represents a horizontal reduction over
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
  // The pattern must end in an extract from index 0.
  if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
      !isNullConstant(Extract->getOperand(1)))
    return SDValue();

  unsigned Stages =
      Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());

  SDValue Op = Extract->getOperand(0);
  // At each stage, we're looking for something that looks like:
  // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
  //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
  //                               i32 undef, i32 undef, i32 undef, i32 undef>
  // %a = binop <8 x i32> %op, %s
  // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
  // we expect something like:
  // <4,5,6,7,u,u,u,u>
  // <2,3,u,u,u,u,u,u>
  // <1,u,u,u,u,u,u,u>
  for (unsigned i = 0; i < Stages; ++i) {
    if (Op.getOpcode() != BinOp)
      return SDValue();

    ShuffleVectorSDNode *Shuffle =
        dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
    if (Shuffle) {
      Op = Op.getOperand(1);
    } else {
      Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
      Op = Op.getOperand(0);
    }

    // The first operand of the shuffle should be the same as the other operand
    // of the binop.
    if (!Shuffle || (Shuffle->getOperand(0) != Op))
      return SDValue();

    // Verify the shuffle has the expected (at this stage of the pyramid) mask.
    for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
      if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
        return SDValue();
  }

  return Op;
}
// Given a select, detect the following pattern:
// 1: %2 = zext <N x i8> %0 to <N x i32>
// 2: %3 = zext <N x i8> %1 to <N x i32>
// 3: %4 = sub nsw <N x i32> %2, %3
// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
// This is useful as it is the input into a SAD pattern.
static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
                              SDValue &Op1) {
  // Check the condition of the select instruction is greater-than.
  SDValue SetCC = Select->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC)
    return false;
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  if (CC != ISD::SETGT && CC != ISD::SETLT)
    return false;

  SDValue SelectOp1 = Select->getOperand(1);
  SDValue SelectOp2 = Select->getOperand(2);

  // The following instructions assume SelectOp1 is the subtraction operand
  // and SelectOp2 is the negation operand.
  // In the case of SETLT this is the other way around.
  if (CC == ISD::SETLT)
    std::swap(SelectOp1, SelectOp2);

  // The second operand of the select should be the negation of the first
  // operand, which is implemented as 0 - SelectOp1.
  if (!(SelectOp2.getOpcode() == ISD::SUB &&
        ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
        SelectOp2.getOperand(1) == SelectOp1))
    return false;

  // The first operand of SetCC is the first operand of the select, which is the
  // difference between the two input vectors.
  if (SetCC.getOperand(0) != SelectOp1)
    return false;

  // In the SETLT case, the second operand of the comparison can be either 1 or 0.
  APInt SplatVal;
  if ((CC == ISD::SETLT) &&
      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
         SplatVal == 1) ||
        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
    return false;

  // In the SETGT case, the second operand of the comparison can be either -1 or 0.
  if ((CC == ISD::SETGT) &&
      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
        ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
    return false;

  // The first operand of the select is the difference between the two input
  // vectors.
  if (SelectOp1.getOpcode() != ISD::SUB)
    return false;

  Op0 = SelectOp1.getOperand(0);
  Op1 = SelectOp1.getOperand(1);

  // Check if the operands of the sub are zero-extended from vectors of i8.
  if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
      Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
      Op1.getOpcode() != ISD::ZERO_EXTEND ||
      Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
    return false;

  return true;
}
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
                            const SDValue &Zext1, const SDLoc &DL) {
  // Find the appropriate width for the PSADBW.
  EVT InVT = Zext0.getOperand(0).getValueType();
  unsigned RegSize = std::max(128u, InVT.getSizeInBits());

  // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
  // fill in the missing vector elements with 0.
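  // E.g. a v4i8 input lands in the low four bytes of a v16i8 operand with the
  // remaining bytes zeroed; the zero bytes contribute nothing to the SAD sums.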
  unsigned NumConcat = RegSize / InVT.getSizeInBits();
  SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
  Ops[0] = Zext0.getOperand(0);
  MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
  SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
  Ops[0] = Zext1.getOperand(0);
  SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);

  // Actually build the SAD.
  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
                                                SelectionDAG &DAG,
                                                const X86Subtarget &Subtarget) {
  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
    return SDValue();

  EVT ExtractVT = Extract->getValueType(0);
  unsigned BitWidth = ExtractVT.getSizeInBits();
  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
      ExtractVT != MVT::i8)
    return SDValue();

  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
    SDValue Match = matchBinOpReduction(Extract, Op);
    if (!Match)
      continue;

    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
    // which we can't support here for now.
    if (Match.getScalarValueSizeInBits() != BitWidth)
      continue;

    // We require AVX2 for PMOVMSKB for v16i16/v32i8.
    unsigned MatchSizeInBits = Match.getValueSizeInBits();
    if (!(MatchSizeInBits == 128 ||
          (MatchSizeInBits == 256 &&
           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
      return SDValue();

    // Don't bother performing this for 2-element vectors.
    if (Match.getValueType().getVectorNumElements() <= 2)
      return SDValue();

    // Check that we are extracting a reduction of all sign bits.
    if (DAG.ComputeNumSignBits(Match) != BitWidth)
      return SDValue();

    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
    MVT MaskVT;
    if (64 == BitWidth || 32 == BitWidth)
      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
                                MatchSizeInBits / BitWidth);
    else
      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

    APInt CompareBits;
    ISD::CondCode CondCode;
    if (Op == ISD::OR) {
      // any_of -> MOVMSK != 0
      CompareBits = APInt::getNullValue(32);
      CondCode = ISD::CondCode::SETNE;
    } else {
      // all_of -> MOVMSK == ((1 << NumElts) - 1)
      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
      CondCode = ISD::CondCode::SETEQ;
    }

    // Perform the select as i32/i64 and then truncate to avoid partial register
    // stalls.
    unsigned ResWidth = std::max(BitWidth, 32u);
    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
    SDLoc DL(Extract);
    SDValue Zero = DAG.getConstant(0, DL, ResVT);
    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
    SDValue Res = DAG.getBitcast(MaskVT, Match);
    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
                          Ones, Zero, CondCode);
    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
  }

  return SDValue();
}
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // PSADBW is only supported on SSE2 and up.
  if (!Subtarget.hasSSE2())
    return SDValue();

  // Verify the type we're extracting from is any integer type above i16.
  EVT VT = Extract->getOperand(0).getValueType();
  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (RegSize / VT.getVectorNumElements() < 8)
    return SDValue();

  // Match shuffle + add pyramid.
  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

  // The operand is expected to be zero extended from i8
  // (verified in detectZextAbsDiff).
  // In order to convert to i64 and above, additional any/zero/sign
  // extend is expected.
  // The zero extend from 32 bit has no mathematical effect on the result.
  // Also the sign extend is basically zero extend
  // (extends the sign bit which is zero).
  // So it is correct to skip the sign/zero extend instruction.
  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
               Root.getOpcode() == ISD::ZERO_EXTEND ||
               Root.getOpcode() == ISD::ANY_EXTEND))
    Root = Root.getOperand(0);

  // If there was a match, we want Root to be a select that is the root of an
  // abs-diff pattern.
  if (!Root || (Root.getOpcode() != ISD::VSELECT))
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  SDValue Zext0, Zext1;
  if (!detectZextAbsDiff(Root, Zext0, Zext1))
    return SDValue();

  // Create the SAD instruction.
  SDLoc DL(Extract);
  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

  // If the original vector was wider than 8 elements, sum over the results
  // in the SAD vector.
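  // Each shuffle+add step below folds the upper half of the live lanes into
  // the lower half, e.g. for a v4i64 SAD: lanes {2,3} into {0,1}, then lane 1
  // into lane 0, so the total ends up in lane 0.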
  unsigned Stages = Log2_32(VT.getVectorNumElements());
  MVT SadVT = SAD.getSimpleValueType();
  if (Stages > 3) {
    unsigned SadElems = SadVT.getVectorNumElements();

    for (unsigned i = Stages - 3; i > 0; --i) {
      SmallVector<int, 16> Mask(SadElems, -1);
      for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
        Mask[j] = MaskEnd + j;

      SDValue Shuffle =
          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
    }
  }

  MVT Type = Extract->getSimpleValueType(0);
  unsigned TypeSizeInBits = Type.getSizeInBits();
  // Return the lowest TypeSizeInBits bits.
  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                     Extract->getOperand(1));
}
// Attempt to peek through a target shuffle and extract the scalar from the
// source.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue Src = N->getOperand(0);
  SDValue Idx = N->getOperand(1);

  EVT VT = N->getValueType(0);
  EVT SrcVT = Src.getValueType();
  EVT SrcSVT = SrcVT.getVectorElementType();
  unsigned NumSrcElts = SrcVT.getVectorNumElements();

  // Don't attempt this for boolean mask vectors or unknown extraction indices.
  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
    return SDValue();

  // Resolve the target shuffle inputs and mask.
  SmallVector<int, 16> Mask;
  SmallVector<SDValue, 2> Ops;
  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
    return SDValue();

  // Attempt to narrow/widen the shuffle mask to the correct size.
  if (Mask.size() != NumSrcElts) {
    if ((NumSrcElts % Mask.size()) == 0) {
      SmallVector<int, 16> ScaledMask;
      int Scale = NumSrcElts / Mask.size();
      scaleShuffleMask(Scale, Mask, ScaledMask);
      Mask = std::move(ScaledMask);
    } else if ((Mask.size() % NumSrcElts) == 0) {
      SmallVector<int, 16> WidenedMask;
      while (Mask.size() > NumSrcElts &&
             canWidenShuffleElements(Mask, WidenedMask))
        Mask = std::move(WidenedMask);
      // TODO - investigate support for wider shuffle masks with known upper
      // undef/zero elements for implicit zero-extension.
    }
  }

  // Check if narrowing/widening failed.
  if (Mask.size() != NumSrcElts)
    return SDValue();

  int SrcIdx = Mask[N->getConstantOperandVal(1)];
  SDLoc dl(N);

  // If the shuffle source element is undef/zero then we can just accept it.
  if (SrcIdx == SM_SentinelUndef)
    return DAG.getUNDEF(VT);

  if (SrcIdx == SM_SentinelZero)
    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
                                : DAG.getConstant(0, dl, VT);

  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
  SrcIdx = SrcIdx % Mask.size();

  // We can only extract other elements from 128-bit vectors and in certain
  // circumstances, depending on SSE-level.
  // TODO: Investigate using extract_subvector for larger vectors.
  // TODO: Investigate float/double extraction if it will be just stored.
  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
    assert(SrcSVT == VT && "Unexpected extraction type");
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
                       DAG.getIntPtrConstant(SrcIdx, dl));
  }

  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
           "Unexpected extraction type");
    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
                                DAG.getIntPtrConstant(SrcIdx, dl));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
                                 DAG.getValueType(SrcSVT));
    return DAG.getZExtOrTrunc(Assert, dl, VT);
  }

  return SDValue();
}
/// Detect vector gather/scatter index generation and convert it from being a
/// bunch of shuffles and extracts into a somewhat faster sequence.
/// For i686, the best sequence is apparently storing the value and loading
/// scalars back, while for x64 we should use 64-bit extracts and shifts.
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
    return NewOp;

  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
    return NewOp;

  SDValue InputVector = N->getOperand(0);
  SDValue EltIdx = N->getOperand(1);

  EVT SrcVT = InputVector.getValueType();
  EVT VT = N->getValueType(0);
  SDLoc dl(InputVector);

  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getBitcast(VT, InputVector);
  }

  // Detect mmx to i32 conversion through a v2i32 elt extract.
  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
    SDValue MMXSrc = InputVector.getOperand(0);

    // The bitcast source is a direct mmx result.
    if (MMXSrc.getValueType() == MVT::x86mmx)
      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
  }

  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
      isa<ConstantSDNode>(EltIdx) &&
      isa<ConstantSDNode>(InputVector.getOperand(0))) {
    uint64_t ExtractedElt = N->getConstantOperandVal(1);
    uint64_t InputValue = InputVector.getConstantOperandVal(0);
    uint64_t Res = (InputValue >> ExtractedElt) & 1;
    return DAG.getConstant(Res, dl, MVT::i1);
  }

  // Check whether this extract is the root of a sum of absolute differences
  // pattern. This has to be done here because we really want it to happen
  // pre-legalization.
  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
    return SAD;

  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
    return Cmp;

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (SrcVT != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
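  // (15 == 0b1111, i.e. one bit per extracted lane of the v4i32 source.)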
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  // If 64-bit shifts are legal, use the extract-shift sequence,
  // otherwise bounce the vector off the cache.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Vals[4];

  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
    auto &DL = DAG.getDataLayout();
    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                     DAG.getConstant(0, dl, VecIdxTy));
    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
                                  DAG.getConstant(1, dl, VecIdxTy));

    SDValue ShAmt = DAG.getConstant(
        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
  } else {
    // Store the value to a temporary stack slot.
    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                              MachinePointerInfo());

    EVT ElementType = SrcVT.getVectorElementType();
    unsigned EltSize = ElementType.getSizeInBits() / 8;

    // Replace each use (extract) with a load of the appropriate element.
    for (unsigned i = 0; i < 4; ++i) {
      uint64_t Offset = EltSize * i;
      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

      SDValue ScalarAddr =
          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

      // Load the scalar.
      Vals[i] =
          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
    }
  }

  // Replace the extracts
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    uint64_t IdxVal = Extract->getConstantOperandVal(1);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}

// TODO - merge with combineExtractVectorElt once it can handle the implicit
// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
// combineBasicSADPattern.
static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const X86Subtarget &Subtarget) {
  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  SDLoc DL(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  if (N->getOpcode() != ISD::VSELECT)
    return SDValue();

  assert(CondVT.isVector() && "Vector select expects a vector selector!");

  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
  // Check if the first operand is all zeros and Cond type is vXi1.
  // This situation only applies to AVX512.
  if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
      CondVT.getVectorElementType() == MVT::i1) {
    // Invert the cond to not(cond) : xor(op,allones)=not(op)
    SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
                                  DAG.getAllOnesConstant(DL, CondVT));
    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
  }

  // To use the condition operand as a bitwise mask, it must have elements that
  // are the same size as the select elements. Ie, the condition operand must
  // have already been promoted from the IR select condition type <N x i1>.
  // Don't check if the types themselves are equal because that excludes
  // vector floating-point selects.
  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
  FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

  // Try to invert the condition if true value is not all 1s and false value is
  // not all 0s.
  if (!TValIsAllOnes && !FValIsAllZeros &&
      // Check if the selector will be produced by CMPP*/PCMP*.
      Cond.getOpcode() == ISD::SETCC &&
      // Check if SETCC has already been promoted.
      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
          CondVT) {
    bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());

    if (TValIsAllZeros || FValIsAllOnes) {
      SDValue CC = Cond.getOperand(2);
      ISD::CondCode NewCC =
          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                               Cond.getOperand(0).getValueType().isInteger());
      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
                          NewCC);
      std::swap(LHS, RHS);
      TValIsAllOnes = FValIsAllOnes;
      FValIsAllZeros = TValIsAllZeros;
    }
  }

  // vselect Cond, 111..., 000... -> Cond
  if (TValIsAllOnes && FValIsAllZeros)
    return DAG.getBitcast(VT, Cond);

  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
    return SDValue();

  // vselect Cond, 111..., X -> or Cond, X
  if (TValIsAllOnes) {
    SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
    SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
    return DAG.getBitcast(VT, Or);
  }

  // vselect Cond, X, 000... -> and Cond, X
  if (FValIsAllZeros) {
    SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
    SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
    return DAG.getBitcast(VT, And);
  }

  return SDValue();
}
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  SDLoc DL(N);

  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
  if (!TrueC || !FalseC)
    return SDValue();

  // Don't do this for crazy integer types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
    return SDValue();

  // If this is efficiently invertible, canonicalize the LHSC/RHSC values
  // so that TrueC (the true value) is larger than FalseC.
  bool NeedsCondInvert = false;
  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
      // Efficiently invertible.
      (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
       (Cond.getOpcode() == ISD::XOR &&  // xor(X, C) -> invertible.
        isa<ConstantSDNode>(Cond.getOperand(1))))) {
    NeedsCondInvert = true;
    std::swap(TrueC, FalseC);
  }

  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
    if (NeedsCondInvert) // Invert the condition if needed.
      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                         DAG.getConstant(1, DL, Cond.getValueType()));

    // Zero extend the condition if needed.
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

    unsigned ShAmt = TrueC->getAPIntValue().logBase2();
    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                       DAG.getConstant(ShAmt, DL, MVT::i8));
  }

  // Optimize cases that will turn into an LEA instruction. This requires
  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
  if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
    uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
    if (N->getValueType(0) == MVT::i32)
      Diff = (unsigned)Diff;

    bool IsFastMultiplier = false;
    if (Diff < 10) {
      switch ((unsigned char)Diff) {
      default:
        break;
      case 1: // result = add base, cond
      case 2: // result = lea base(    , cond*2)
      case 3: // result = lea base(cond, cond*2)
      case 4: // result = lea base(    , cond*4)
      case 5: // result = lea base(cond, cond*4)
      case 8: // result = lea base(    , cond*8)
      case 9: // result = lea base(cond, cond*8)
        IsFastMultiplier = true;
        break;
      }
    }

    if (IsFastMultiplier) {
      APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
      if (NeedsCondInvert) // Invert the condition if needed.
        Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(1, DL, Cond.getValueType()));

      // Zero extend the condition if needed.
      Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
      // Scale the condition by the difference.
      if (Diff != 1)
        Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(Diff, DL, Cond.getValueType()));

      // Add the base if non-zero.
      if (FalseC->getAPIntValue() != 0)
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));
      return Cond;
    }
  }

  return SDValue();
}
// If this is a bitcasted op that can be represented as another type, push the
// bitcast to the inputs. This allows more opportunities for pattern matching
// masked instructions. This is called when we know that the operation is used
// as one of the inputs of a vselect.
static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI) {
  // Make sure we have a bitcast.
  if (OrigOp.getOpcode() != ISD::BITCAST)
    return false;

  SDValue Op = OrigOp.getOperand(0);

  // If the operation is used by anything other than the bitcast, we shouldn't
  // do this combine as that would replicate the operation.
  if (!Op.hasOneUse())
    return false;

  MVT VT = OrigOp.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  SDLoc DL(Op.getNode());

  auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
                                      SDValue Op2) {
    Op0 = DAG.getBitcast(VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    Op1 = DAG.getBitcast(VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
    return true;
  };

  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::PALIGNR:
    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
    if (!VT.is128BitVector())
      return false;
    Opcode = X86ISD::VALIGN;
    LLVM_FALLTHROUGH;
  case X86ISD::VALIGN: {
    if (EltVT != MVT::i32 && EltVT != MVT::i64)
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
    unsigned EltSize = EltVT.getSizeInBits();
    // Make sure we can represent the same shift with the new VT.
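    // E.g. a 128-bit PALIGNR by 8 bytes maps to a VALIGNQ by one element,
    // since 8 * 8 = 64 bits is exactly one i64 lane.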
    if ((ShiftAmt % EltSize) != 0)
      return false;
    Imm = ShiftAmt / EltSize;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    DAG.getConstant(Imm, DL, MVT::i8));
  }
  case X86ISD::SHUF128: {
    if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
                                    Op.getOperand(2));
  }
  case ISD::INSERT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    // Op1 needs to be bitcasted to a smaller vector with the same element type.
    SDValue Op1 = Op.getOperand(1);
    MVT Op1VT = MVT::getVectorVT(EltVT,
                                 Op1.getSimpleValueType().getSizeInBits() / EltSize);
    Op1 = DAG.getBitcast(Op1VT, Op1);
    DCI.AddToWorklist(Op1.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0, Op1,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case ISD::EXTRACT_SUBVECTOR: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
    // Only change element size, not type.
    if (EltVT.isInteger() != OpEltVT.isInteger())
      return false;
    uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
    // Op0 needs to be bitcasted to a larger vector with the same element type.
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op0);
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0,
                              DAG.getIntPtrConstant(Imm, DL)));
    return true;
  }
  case X86ISD::SUBV_BROADCAST: {
    unsigned EltSize = EltVT.getSizeInBits();
    if (EltSize != 32 && EltSize != 64)
      return false;
    // Only change element size, not type.
    if (VT.isInteger() != Op.getSimpleValueType().isInteger())
      return false;
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = MVT::getVectorVT(EltVT,
                                 Op0.getSimpleValueType().getSizeInBits() / EltSize);
    Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
    DCI.AddToWorklist(Op0.getNode());
    DCI.CombineTo(OrigOp.getNode(),
                  DAG.getNode(Opcode, DL, VT, Op0));
    return true;
  }
  }

  return false;
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  EVT VT = LHS.getValueType();
  EVT CondVT = Cond.getValueType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
  if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
      VT != MVT::f80 && VT != MVT::f128 &&
      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
      (Subtarget.hasSSE2() ||
       (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!DAG.getTarget().Options.UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!DAG.getTarget().Options.UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        LLVM_FALLTHROUGH;
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }
  // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
  // lowering on KNL. In this case we convert it to
  // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
  // The same situation holds for all 128 and 256-bit vectors of i8 and i16.
  // Since SKX these selects have a proper lowering.
  if (Subtarget.hasAVX512() && CondVT.isVector() &&
      CondVT.getVectorElementType() == MVT::i1 &&
      (VT.is128BitVector() || VT.is256BitVector()) &&
      (VT.getVectorElementType() == MVT::i8 ||
       VT.getVectorElementType() == MVT::i16) &&
      !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
    Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
    DCI.AddToWorklist(Cond.getNode());
    return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
  }

  if (SDValue V = combineSelectOfTwoConstants(N, DAG))
    return V;
  // Canonicalize max and min:
  // (x > y) ? x : y -> (x >= y) ? x : y
  // (x < y) ? x : y -> (x <= y) ? x : y
  // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
  // the need for an extra compare against zero. e.g.
  // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
  //
  // testl  %edi, %edi
  // movl   $0, %eax
  // cmovgl %edi, %eax
  // =>
  // xorl   %eax, %eax
  // testl  %edi, %edi
  // cmovsl %eax, %edi
  if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
      DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
      DAG.isEqualTo(RHS, Cond.getOperand(1))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
    switch (CC) {
    default: break;
    case ISD::SETLT:
    case ISD::SETGT: {
      ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                          Cond.getOperand(0), Cond.getOperand(1), NewCC);
      return DAG.getSelect(DL, VT, Cond, LHS, RHS);
    }
    }
  }
  // Early exit check
  if (!TLI.isTypeLegal(VT))
    return SDValue();

  // Match VSELECTs into subs with unsigned saturation.
  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
      // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
      ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
       (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Check if one of the arms of the VSELECT is a zero vector. If it's on the
    // left side invert the predicate to simplify logic below.
    SDValue Other;
    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
      Other = RHS;
      CC = ISD::getSetCCInverse(CC, true);
    } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
      Other = LHS;
    }

    if (Other.getNode() && Other->getNumOperands() == 2 &&
        DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
      SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
      SDValue CondRHS = Cond->getOperand(1);

      // Look for a general sub with unsigned saturation first.
      // x >= y ? x-y : 0 --> subus x, y
      // x >  y ? x-y : 0 --> subus x, y
      if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
          Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
        return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

      if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
        if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
          if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
            if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
              // If the RHS is a constant we have to reverse the const
              // canonicalization.
              // x > C-1 ? x+-C : 0 --> subus x, C
              if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
                  CondRHSConst->getAPIntValue() ==
                      (-OpRHSConst->getAPIntValue() - 1))
                return DAG.getNode(
                    X86ISD::SUBUS, DL, VT, OpLHS,
                    DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

          // Another special case: If C was a sign bit, the sub has been
          // canonicalized into a xor.
          // FIXME: Would it be better to use computeKnownBits to determine
          //        whether it's safe to decanonicalize the xor?
          // x s< 0 ? x^C : 0 --> subus x, C
          if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
              ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
              OpRHSConst->getAPIntValue().isSignMask())
            // Note that we have to rebuild the RHS constant here to ensure we
            // don't rely on particular values of undef lanes.
            return DAG.getNode(
                X86ISD::SUBUS, DL, VT, OpLHS,
                DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
        }
    }
  }
30592 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30595 // If this is a *dynamic* select (non-constant condition) and we can match
30596 // this node with one of the variable blend instructions, restructure the
30597 // condition so that blends can use the high (sign) bit of each element and
30598 // use SimplifyDemandedBits to simplify the condition operand.
30599 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30600 !DCI.isBeforeLegalize() &&
30601 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30602 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30604 // Don't optimize vector selects that map to mask-registers.
30608 // We can only handle the cases where VSELECT is directly legal on the
30609 // subtarget. We custom lower VSELECT nodes with constant conditions and
30610 // this makes it hard to see whether a dynamic VSELECT will correctly
30611 // lower, so we both check the operation's status and explicitly handle the
30612 // cases where a *dynamic* blend will fail even though a constant-condition
30613 // blend could be custom lowered.
30614 // FIXME: We should find a better way to handle this class of problems.
30615 // Potentially, we should combine constant-condition vselect nodes
30616 // pre-legalization into shuffles and not mark as many types as custom lowered.
30618 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30620 // FIXME: We don't support i16-element blends currently. We could and
30621 // should support them by making *all* the bits in the condition be set
30622 // rather than just the high bit and using an i8-element blend.
30623 if (VT.getVectorElementType() == MVT::i16)
30625 // Dynamic blending was only available from SSE4.1 onward.
30626 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30628 // Byte blends are only available in AVX2
30629 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30632 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30633 APInt DemandedMask(APInt::getSignMask(BitWidth));
30635 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30636 !DCI.isBeforeLegalizeOps());
30637 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30638 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30639 // If we changed the computation somewhere in the DAG, this change will
30640 // affect all users of Cond. Make sure it is fine and update all the nodes
30641 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30642 // perform wrong optimizations as we messed with the actual expectation
30643 // for the vector boolean values.
30644 if (Cond != TLO.Old) {
30645 // Check all uses of the condition operand to check whether it will be
30646 // consumed by non-BLEND instructions. Those may require that all bits
30647 // are set properly.
30648 for (SDNode *U : Cond->uses()) {
30649 // TODO: Add other opcodes eventually lowered into BLEND.
30650 if (U->getOpcode() != ISD::VSELECT)
30654 // Update all users of the condition before committing the change, so
30655 // that the VSELECT optimizations that expect the correct vector boolean
30656 // value will not be triggered.
30657 for (SDNode *U : Cond->uses()) {
30658 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30659 U->getValueType(0), Cond, U->getOperand(1),
30661 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30663 DCI.CommitTargetLoweringOpt(TLO);
30666 // Only Cond (rather than other nodes in the computation chain) was
30667 // changed. Change the condition just for N to keep the opportunity to
30668 // optimize all other users in their own way.
30669 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30670 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30675 // Look for vselects with LHS/RHS being bitcasted from an operation that
30676 // can be executed on another type. Push the bitcast to the inputs of
30677 // the operation. This exposes opportunities for using masking instructions.
30678 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30679 CondVT.getVectorElementType() == MVT::i1) {
30680 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30681 return SDValue(N, 0);
30682 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30683 return SDValue(N, 0);
30686 // Custom action for SELECT MMX
30687 if (VT == MVT::x86mmx) {
30688 LHS = DAG.getBitcast(MVT::i64, LHS);
30689 RHS = DAG.getBitcast(MVT::i64, RHS);
30690 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
30691 return DAG.getBitcast(VT, newSelect);
30698 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30700 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30701 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30702 /// Note that this is only legal for some op/cc combinations.
30703 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30704 SelectionDAG &DAG) {
30705 // This combine only operates on CMP-like nodes.
30706 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30707 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30710 // Can't replace the cmp if it has more uses than the one we're looking at.
30711 // FIXME: We would like to be able to handle this, but would need to make sure
30712 // all uses were updated.
30713 if (!Cmp.hasOneUse())
30716 // This only applies to variations of the common case:
30717 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30718 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30719 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30720 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30721 // Using the proper condcodes (see below), overflow is checked for.
30723 // FIXME: We can generalize both constraints:
30724 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30726 // if the result is compared.
30728 SDValue CmpLHS = Cmp.getOperand(0);
30729 SDValue CmpRHS = Cmp.getOperand(1);
30731 if (!CmpLHS.hasOneUse())
30734 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30735 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30738 const unsigned Opc = CmpLHS.getOpcode();
30740 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30743 SDValue OpRHS = CmpLHS.getOperand(2);
30744 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30748 APInt Addend = OpRHSC->getAPIntValue();
30749 if (Opc == ISD::ATOMIC_LOAD_SUB)
30752 if (CC == X86::COND_S && Addend == 1)
30754 else if (CC == X86::COND_NS && Addend == 1)
30756 else if (CC == X86::COND_G && Addend == -1)
30758 else if (CC == X86::COND_LE && Addend == -1)
30763 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30764 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30765 DAG.getUNDEF(CmpLHS.getValueType()));
30766 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30770 // Check whether a boolean test is testing a boolean value generated by
30771 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition code.
30774 // Simplify the following patterns:
30775 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30776 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30777 // to (Op EFLAGS Cond)
30779 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30780 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30781 // to (Op EFLAGS !Cond)
30783 // where Op could be BRCOND or CMOV.
30785 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30786 // This combine only operates on CMP-like nodes.
30787 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30788 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30791 // Quit if not used as a boolean value.
30792 if (CC != X86::COND_E && CC != X86::COND_NE)
30795 // Check CMP operands. One of them should be 0 or 1 and the other should be
30796 // an SetCC or extended from it.
30797 SDValue Op1 = Cmp.getOperand(0);
30798 SDValue Op2 = Cmp.getOperand(1);
30801 const ConstantSDNode* C = nullptr;
30802 bool needOppositeCond = (CC == X86::COND_E);
30803 bool checkAgainstTrue = false; // Is it a comparison against 1?
30805 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30807 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30809 else // Quit if all operands are not constants.
30812 if (C->getZExtValue() == 1) {
30813 needOppositeCond = !needOppositeCond;
30814 checkAgainstTrue = true;
30815 } else if (C->getZExtValue() != 0)
30816 // Quit if the constant is neither 0 nor 1.
30819 bool truncatedToBoolWithAnd = false;
30820 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30821 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30822 SetCC.getOpcode() == ISD::TRUNCATE ||
30823 SetCC.getOpcode() == ISD::AND) {
30824 if (SetCC.getOpcode() == ISD::AND) {
30826 if (isOneConstant(SetCC.getOperand(0)))
30828 if (isOneConstant(SetCC.getOperand(1)))
30832 SetCC = SetCC.getOperand(OpIdx);
30833 truncatedToBoolWithAnd = true;
30835 SetCC = SetCC.getOperand(0);
30838 switch (SetCC.getOpcode()) {
30839 case X86ISD::SETCC_CARRY:
30840 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30841 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30842 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30843 // truncated to i1 using 'and'.
30844 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30846 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30847 "Invalid use of SETCC_CARRY!");
30849 case X86ISD::SETCC:
30850 // Set the condition code or opposite one if necessary.
30851 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30852 if (needOppositeCond)
30853 CC = X86::GetOppositeBranchCondition(CC);
30854 return SetCC.getOperand(1);
30855 case X86ISD::CMOV: {
30856 // Check whether the false/true values are canonical, i.e. 0 or 1.
30857 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30858 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30859 // Quit if true value is not a constant.
30862 // Quit if false value is not a constant.
30864 SDValue Op = SetCC.getOperand(0);
30865 // Skip 'zext' or 'trunc' node.
30866 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30867 Op.getOpcode() == ISD::TRUNCATE)
30868 Op = Op.getOperand(0);
30869 // A special case for rdrand/rdseed, where 0 is set if the false cond is met.
30871 if ((Op.getOpcode() != X86ISD::RDRAND &&
30872 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30875 // Quit if false value is not the constant 0 or 1.
30876 bool FValIsFalse = true;
30877 if (FVal && FVal->getZExtValue() != 0) {
30878 if (FVal->getZExtValue() != 1)
30880 // If FVal is 1, opposite cond is needed.
30881 needOppositeCond = !needOppositeCond;
30882 FValIsFalse = false;
30884 // Quit if TVal is not the constant opposite of FVal.
30885 if (FValIsFalse && TVal->getZExtValue() != 1)
30887 if (!FValIsFalse && TVal->getZExtValue() != 0)
30889 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30890 if (needOppositeCond)
30891 CC = X86::GetOppositeBranchCondition(CC);
30892 return SetCC.getOperand(3);
30899 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30901 /// (X86or (X86setcc) (X86setcc))
30902 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30903 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30904 X86::CondCode &CC1, SDValue &Flags,
30906 if (Cond->getOpcode() == X86ISD::CMP) {
30907 if (!isNullConstant(Cond->getOperand(1)))
30910 Cond = Cond->getOperand(0);
30915 SDValue SetCC0, SetCC1;
30916 switch (Cond->getOpcode()) {
30917 default: return false;
30924 SetCC0 = Cond->getOperand(0);
30925 SetCC1 = Cond->getOperand(1);
30929 // Make sure we have SETCC nodes, using the same flags value.
30930 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30931 SetCC1.getOpcode() != X86ISD::SETCC ||
30932 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30935 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30936 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30937 Flags = SetCC0->getOperand(1);
30941 /// Optimize an EFLAGS definition used according to the condition code \p CC
30942 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30943 /// uses of chain values.
30944 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30945 SelectionDAG &DAG) {
30946 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30948 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30951 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30952 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30953 TargetLowering::DAGCombinerInfo &DCI,
30954 const X86Subtarget &Subtarget) {
30957 // If the flag operand isn't dead, don't touch this CMOV.
30958 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30961 SDValue FalseOp = N->getOperand(0);
30962 SDValue TrueOp = N->getOperand(1);
30963 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30964 SDValue Cond = N->getOperand(3);
30966 if (CC == X86::COND_E || CC == X86::COND_NE) {
30967 switch (Cond.getOpcode()) {
30971 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30972 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30973 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30977 // Try to simplify the EFLAGS and condition code operands.
30978 // We can't always do this as FCMOV only supports a subset of X86 cond.
30979 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30980 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30981 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30983 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30987 // If this is a select between two integer constants, try to do some
30988 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
30990 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30991 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30992 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30993 // larger than FalseC (the false value).
30994 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30995 CC = X86::GetOppositeBranchCondition(CC);
30996 std::swap(TrueC, FalseC);
30997 std::swap(TrueOp, FalseOp);
31000 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
31001 // This is efficient for any integer data type (including i8/i16) and shift amount.
31003 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31004 Cond = getSETCC(CC, Cond, DL, DAG);
31006 // Zero extend the condition if needed.
31007 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31009 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31010 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31011 DAG.getConstant(ShAmt, DL, MVT::i8));
31012 if (N->getNumValues() == 2) // Dead flag value?
31013 return DCI.CombineTo(N, Cond, SDValue());
31017 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
31018 // for any integer data type, including i8/i16.
31019 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31020 Cond = getSETCC(CC, Cond, DL, DAG);
31022 // Zero extend the condition if needed.
31023 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31024 FalseC->getValueType(0), Cond);
31025 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31026 SDValue(FalseC, 0));
31028 if (N->getNumValues() == 2) // Dead flag value?
31029 return DCI.CombineTo(N, Cond, SDValue());
31033 // Optimize cases that will turn into an LEA instruction. This requires
31034 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31035 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31036 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31037 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31039 bool isFastMultiplier = false;
31041 switch ((unsigned char)Diff) {
31043 case 1: // result = add base, cond
31044 case 2: // result = lea base( , cond*2)
31045 case 3: // result = lea base(cond, cond*2)
31046 case 4: // result = lea base( , cond*4)
31047 case 5: // result = lea base(cond, cond*4)
31048 case 8: // result = lea base( , cond*8)
31049 case 9: // result = lea base(cond, cond*8)
31050 isFastMultiplier = true;
31055 if (isFastMultiplier) {
31056 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31057 Cond = getSETCC(CC, Cond, DL ,DAG);
31058 // Zero extend the condition if needed.
31059 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31061 // Scale the condition by the difference.
31063 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31064 DAG.getConstant(Diff, DL, Cond.getValueType()));
31066 // Add the base if non-zero.
31067 if (FalseC->getAPIntValue() != 0)
31068 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31069 SDValue(FalseC, 0));
31070 if (N->getNumValues() == 2) // Dead flag value?
31071 return DCI.CombineTo(N, Cond, SDValue());
31078 // Handle these cases:
31079 // (select (x != c), e, c) -> select (x != c), e, x),
31080 // (select (x == c), c, e) -> select (x == c), x, e)
31081 // where the c is an integer constant, and the "select" is the combination
31082 // of CMOV and CMP.
31084 // The rationale for this change is that the conditional-move from a constant
31085 // needs two instructions; a conditional-move from a register, however, needs
31086 // only one instruction.
31088 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31089 // some instruction-combining opportunities. This opt needs to be
31090 // postponed as late as possible.
31092 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31093 // the DCI.xxxx conditions are provided to postpone the optimization as
31094 // late as possible.
31096 ConstantSDNode *CmpAgainst = nullptr;
31097 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31098 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31099 !isa<ConstantSDNode>(Cond.getOperand(0))) {
31101 if (CC == X86::COND_NE &&
31102 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31103 CC = X86::GetOppositeBranchCondition(CC);
31104 std::swap(TrueOp, FalseOp);
31107 if (CC == X86::COND_E &&
31108 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31109 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31110 DAG.getConstant(CC, DL, MVT::i8), Cond };
31111 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
31116 // Fold and/or of setcc's to double CMOV:
31117 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31118 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31120 // This combine lets us generate:
31121 // cmovcc1 (jcc1 if we don't have CMOV)
31127 // cmovne (jne if we don't have CMOV)
31128 // When we can't use the CMOV instruction, it might increase branch mispredicts.
31130 // When we can use CMOV, or when there is no mispredict, this improves
31131 // throughput and reduces register pressure.
31133 if (CC == X86::COND_NE) {
31135 X86::CondCode CC0, CC1;
31137 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31139 std::swap(FalseOp, TrueOp);
31140 CC0 = X86::GetOppositeBranchCondition(CC0);
31141 CC1 = X86::GetOppositeBranchCondition(CC1);
31144 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31146 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31147 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31148 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31149 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
31157 /// Different mul shrinking modes.
31158 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
31160 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31161 EVT VT = N->getOperand(0).getValueType();
31162 if (VT.getScalarSizeInBits() != 32)
31165 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31166 unsigned SignBits[2] = {1, 1};
31167 bool IsPositive[2] = {false, false};
31168 for (unsigned i = 0; i < 2; i++) {
31169 SDValue Opd = N->getOperand(i);
31171 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
31172 // compute signbits for it separately.
31173 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
31174 // For anyextend, it is safe to assume an appropriate number of leading
31176 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
31178 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
31183 IsPositive[i] = true;
31184 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31185 // All the operands of BUILD_VECTOR need to be integer constants.
31186 // Find the smallest value range which all the operands belong to.
31188 IsPositive[i] = true;
31189 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
31190 if (SubOp.isUndef())
31192 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
31195 APInt IntVal = CN->getAPIntValue();
31196 if (IntVal.isNegative())
31197 IsPositive[i] = false;
31198 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31201 SignBits[i] = DAG.ComputeNumSignBits(Opd);
31202 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31203 IsPositive[i] = true;
31207 bool AllPositive = IsPositive[0] && IsPositive[1];
31208 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
31209 // When ranges are from -128 ~ 127, use MULS8 mode.
31210 if (MinSignBits >= 25)
31212 // When ranges are from 0 ~ 255, use MULU8 mode.
31213 else if (AllPositive && MinSignBits >= 24)
31215 // When ranges are from -32768 ~ 32767, use MULS16 mode.
31216 else if (MinSignBits >= 17)
31218 // When ranges are from 0 ~ 65535, use MULU16 mode.
31219 else if (AllPositive && MinSignBits >= 16)
31226 /// When the operands of vector mul are extended from smaller size values,
31227 /// like i8 and i16, the type of mul may be shrunk to generate more
31228 /// efficient code. Two typical patterns are handled:
31230 /// %2 = sext/zext <N x i8> %1 to <N x i32>
31231 /// %4 = sext/zext <N x i8> %3 to <N x i32>
31232 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31233 /// %5 = mul <N x i32> %2, %4
31236 /// %2 = zext/sext <N x i16> %1 to <N x i32>
31237 /// %4 = zext/sext <N x i16> %3 to <N x i32>
31238 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31239 /// %5 = mul <N x i32> %2, %4
31241 /// There are four mul shrinking modes:
31242 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31243 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
31244 /// generate pmullw+sext32 for it (MULS8 mode).
31245 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31246 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31247 /// generate pmullw+zext32 for it (MULU8 mode).
31248 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31249 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31250 /// generate pmullw+pmulhw for it (MULS16 mode).
31251 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31252 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31253 /// generate pmullw+pmulhuw for it (MULU16 mode).
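/// As an illustrative sketch of the first pattern:
///   %2 = sext <8 x i8> %1 to <8 x i32>
///   %4 = sext <8 x i8> %3 to <8 x i32>
///   %5 = mul <8 x i32> %2, %4
/// Both operands have at least 25 sign bits, so MULS8 applies: the operands
/// are truncated to <8 x i16>, multiplied with pmullw, and the product is
/// sign-extended back to <8 x i32>.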
31254 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31255 const X86Subtarget &Subtarget) {
31256 // Check for legality
31257 // pmullw/pmulhw require SSE2; they are not available with SSE1 alone.
31258 if (!Subtarget.hasSSE2())
31261 // Check for profitability
31262 // pmulld is supported since SSE41. It is better to use pmulld
31263 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
31265 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
31266 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
31270 if (!canReduceVMulWidth(N, DAG, Mode))
31274 SDValue N0 = N->getOperand(0);
31275 SDValue N1 = N->getOperand(1);
31276 EVT VT = N->getOperand(0).getValueType();
31277 unsigned RegSize = 128;
31278 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
31280 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31281 // Shrink the operands of mul.
31282 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31283 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31285 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31286 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31287 // lower part is needed.
31288 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31289 if (Mode == MULU8 || Mode == MULS8) {
31290 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
31293 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31294 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31295 // the higher part is also needed.
31296 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31297 ReducedVT, NewN0, NewN1);
31299 // Repack the lower part and higher part result of mul into a wider result.
31301 // Generate shuffle functioning as punpcklwd.
31302 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31303 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31304 ShuffleMask[2 * i] = i;
31305 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
31308 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31309 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31310 // Generate shuffle functioning as punpckhwd.
31311 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31312 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31313 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
31316 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31317 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31318 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31321 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31322 // to legalize the mul explicitly because implicit legalization for type
31323 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31324 // instructions which will not exist when we explicitly legalize it by
31325 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31326 // <4 x i16> undef).
31328 // Legalize the operands of mul.
31329 // FIXME: We may be able to handle non-concatenated vectors by insertion.
31330 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
31331 if ((RegSize % ReducedSizeInBits) != 0)
31334 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31335 DAG.getUNDEF(ReducedVT));
31337 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31339 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31341 if (Mode == MULU8 || Mode == MULS8) {
31342 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
31344 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31346 // convert the type of mul result to VT.
31347 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31348 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
31349 : ISD::SIGN_EXTEND_VECTOR_INREG,
31351 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31352 DAG.getIntPtrConstant(0, DL));
31354 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31355 // MULU16/MULS16, both parts are needed.
31356 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31357 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31358 OpsVT, NewN0, NewN1);
31360 // Repack the lower part and higher part result of mul into a wider
31361 // result. Make sure the type of mul result is VT.
31362 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31363 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31364 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31365 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31366 DAG.getIntPtrConstant(0, DL));
31371 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31372 EVT VT, SDLoc DL) {
31374 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31375 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31376 DAG.getConstant(Mult, DL, VT));
31377 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31378 DAG.getConstant(Shift, DL, MVT::i8));
31379 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31384 auto combineMulMulAddOrSub = [&](bool isAdd) {
31385 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31386 DAG.getConstant(9, DL, VT));
31387 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
31388 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31397 // mul x, 11 => add ((shl (mul x, 5), 1), x)
31398 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
31400 // mul x, 21 => add ((shl (mul x, 5), 2), x)
31401 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
31403 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
31404 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31405 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
31407 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
31408 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
31410 // mul x, 13 => add ((shl (mul x, 3), 2), x)
31411 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
31413 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
31414 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
31416 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
31417 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31418 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
31420 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
31421 return combineMulMulAddOrSub(/*isAdd*/ false);
31423 // mul x, 28 => add ((mul (mul x, 9), 3), x)
31424 return combineMulMulAddOrSub(/*isAdd*/ true);
31426 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
31427 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31428 combineMulMulAddOrSub(/*isAdd*/ true));
31430 // mul x, 30 => sub (sub ((shl x, 5), x), x)
31431 return DAG.getNode(
31433 DAG.getNode(ISD::SUB, DL, VT,
31434 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31435 DAG.getConstant(5, DL, MVT::i8)),
31442 /// Optimize a single multiply with constant into two operations in order to
31443 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
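/// For example, a multiply by 45 is decomposed below as (x * 9) * 5 via the
/// MulAmt1/MulAmt2 split, and each factor maps to a single LEA (a sketch of
/// one case this routine handles).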
31444 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31445 TargetLowering::DAGCombinerInfo &DCI,
31446 const X86Subtarget &Subtarget) {
31447 EVT VT = N->getValueType(0);
31448 if (DCI.isBeforeLegalize() && VT.isVector())
31449 return reduceVMULWidth(N, DAG, Subtarget);
31451 if (!MulConstantOptimization)
31453 // An imul is usually smaller than the alternative sequence.
31454 if (DAG.getMachineFunction().getFunction()->optForMinSize())
31457 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31460 if (VT != MVT::i64 && VT != MVT::i32)
31463 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31466 uint64_t MulAmt = C->getZExtValue();
31467 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31470 uint64_t MulAmt1 = 0;
31471 uint64_t MulAmt2 = 0;
31472 if ((MulAmt % 9) == 0) {
31474 MulAmt2 = MulAmt / 9;
31475 } else if ((MulAmt % 5) == 0) {
31477 MulAmt2 = MulAmt / 5;
31478 } else if ((MulAmt % 3) == 0) {
31480 MulAmt2 = MulAmt / 3;
31486 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31488 if (isPowerOf2_64(MulAmt2) &&
31489 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
31490 // If the second multiplier is a power of 2, issue it first. We want the multiply by
31491 // 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
31493 std::swap(MulAmt1, MulAmt2);
31495 if (isPowerOf2_64(MulAmt1))
31496 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31497 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31499 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31500 DAG.getConstant(MulAmt1, DL, VT));
31502 if (isPowerOf2_64(MulAmt2))
31503 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31504 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31506 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31507 DAG.getConstant(MulAmt2, DL, VT));
31508 } else if (!Subtarget.slowLEA())
31509 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
31512 assert(MulAmt != 0 &&
31513 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31514 "Both cases that could cause potential overflows should have "
31515 "already been handled.");
31516 int64_t SignMulAmt = C->getSExtValue();
31517 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31518 (SignMulAmt != -INT64_MAX)) {
31519 int NumSign = SignMulAmt > 0 ? 1 : -1;
31520 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31521 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31522 if (IsPowerOf2_64PlusOne) {
31523 // (mul x, 2^N + 1) => (add (shl x, N), x)
31524 NewMul = DAG.getNode(
31525 ISD::ADD, DL, VT, N->getOperand(0),
31526 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31527 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31529 } else if (IsPowerOf2_64MinusOne) {
31530 // (mul x, 2^N - 1) => (sub (shl x, N), x)
31531 NewMul = DAG.getNode(
31533 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31534 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31538 // To negate, subtract the number from zero
31539 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31541 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31546 // Do not add new nodes to DAG combiner worklist.
31547 DCI.CombineTo(N, NewMul, false);
31552 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31553 SDValue N0 = N->getOperand(0);
31554 SDValue N1 = N->getOperand(1);
31555 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31556 EVT VT = N0.getValueType();
31558 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31559 // since the result of setcc_c is all zeros or all ones.
31560 if (VT.isInteger() && !VT.isVector() &&
31561 N1C && N0.getOpcode() == ISD::AND &&
31562 N0.getOperand(1).getOpcode() == ISD::Constant) {
31563 SDValue N00 = N0.getOperand(0);
31564 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31565 Mask <<= N1C->getAPIntValue();
31566 bool MaskOK = false;
31567 // We can handle cases concerning bit-widening nodes containing setcc_c if
31568 // we carefully interrogate the mask to make sure we are semantics preserving.
31570 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31571 // of the underlying setcc_c operation if the setcc_c was zero extended.
31572 // Consider the following example:
31573 // zext(setcc_c) -> i32 0x0000FFFF
31574 // c1 -> i32 0x0000FFFF
31575 // c2 -> i32 0x00000001
31576 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31577 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
31578 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31580 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31581 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31583 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31584 N00.getOpcode() == ISD::ANY_EXTEND) &&
31585 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31586 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31588 if (MaskOK && Mask != 0) {
31590 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31594 // Hardware support for vector shifts is sparse which makes us scalarize the
31595 // vector operations in many cases. Also, on Sandy Bridge, ADD is faster than SHL:
31597 // (shl V, 1) -> add V,V
31598 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31599 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31600 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31601 // We shift all of the values by one. In many cases we do not have
31602 // hardware support for this operation. This is better expressed as an ADD
31604 if (N1SplatC->getAPIntValue() == 1)
31605 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31611 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31612 SDValue N0 = N->getOperand(0);
31613 SDValue N1 = N->getOperand(1);
31614 EVT VT = N0.getValueType();
31615 unsigned Size = VT.getSizeInBits();
31617 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31618 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31619 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31620 // depending on sign of (SarConst - [56,48,32,24,16])
31622 // Sign extends on X86 are MOVs (movsx), which have the same code size
31623 // as the SHIFTs above (only a shift by 1 has smaller code size).
31624 // However, the MOVs have two advantages over a SHIFT:
31625 // 1. MOVs can write to a register that differs from the source
31626 // 2. MOVs accept memory operands
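// For example, on i32 (a sketch):
//   (ashr (shl x, 24), 25) -> (ashr (sext_inreg x, i8), 1)
//   (ashr (shl x, 24), 22) -> (shl  (sext_inreg x, i8), 2)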
31628 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31629 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31630 N0.getOperand(1).getOpcode() != ISD::Constant)
31633 SDValue N00 = N0.getOperand(0);
31634 SDValue N01 = N0.getOperand(1);
31635 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31636 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31637 EVT CVT = N1.getValueType();
31639 if (SarConst.isNegative())
31642 for (MVT SVT : MVT::integer_valuetypes()) {
31643 unsigned ShiftSize = SVT.getSizeInBits();
31644 // Skip types without a corresponding sext/zext and any ShlConst that is
31645 // not one of [56,48,32,24,16].
31646 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31650 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31651 SarConst = SarConst - (Size - ShiftSize);
31654 else if (SarConst.isNegative())
31655 return DAG.getNode(ISD::SHL, DL, VT, NN,
31656 DAG.getConstant(-SarConst, DL, CVT));
31658 return DAG.getNode(ISD::SRA, DL, VT, NN,
31659 DAG.getConstant(SarConst, DL, CVT));
31664 /// \brief Returns a vector of 0s if the input node is a vector logical
31665 /// shift by a constant amount which is known to be bigger than or equal
31666 /// to the vector element size in bits.
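/// For example (a sketch), (srl <4 x i32> %v, <i32 32, i32 32, i32 32, i32 32>)
/// is known to be all zeros, since every lane is shifted by the full element
/// width.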
31667 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31668 const X86Subtarget &Subtarget) {
31669 EVT VT = N->getValueType(0);
31671 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31672 (!Subtarget.hasInt256() ||
31673 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31676 SDValue Amt = N->getOperand(1);
31678 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31679 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31680 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31681 unsigned MaxAmount =
31682 VT.getSimpleVT().getScalarSizeInBits();
31684 // SSE2/AVX2 logical shifts always return a vector of 0s
31685 // if the shift amount is bigger than or equal to
31686 // the element size. The constant shift amount will be
31687 // encoded as an 8-bit immediate.
31688 if (ShiftAmt.trunc(8).uge(MaxAmount))
31689 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31695 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31696 TargetLowering::DAGCombinerInfo &DCI,
31697 const X86Subtarget &Subtarget) {
31698 if (N->getOpcode() == ISD::SHL)
31699 if (SDValue V = combineShiftLeft(N, DAG))
31702 if (N->getOpcode() == ISD::SRA)
31703 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31706 // Try to fold this logical shift into a zero vector.
31707 if (N->getOpcode() != ISD::SRA)
31708 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31714 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31715 TargetLowering::DAGCombinerInfo &DCI,
31716 const X86Subtarget &Subtarget) {
31717 unsigned Opcode = N->getOpcode();
31718 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31719 X86ISD::VSRLI == Opcode) &&
31720 "Unexpected shift opcode");
31721 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31722 EVT VT = N->getValueType(0);
31723 SDValue N0 = N->getOperand(0);
31724 SDValue N1 = N->getOperand(1);
31725 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31726 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31727 "Unexpected value type");
31729 // Out of range logical bit shifts are guaranteed to be zero.
31730 // Out of range arithmetic bit shifts splat the sign bit.
31731 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31732 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31734 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31736 ShiftVal = NumBitsPerElt - 1;
31739 // Shift N0 by zero -> N0.
31743 // Shift zero -> zero.
31744 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31745 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31747 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31748 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31749 // TODO - support other sra opcodes as needed.
31750 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31751 N0.getOpcode() == X86ISD::VSRAI)
31752 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31754 // We can decode 'whole byte' logical bit shifts as shuffles.
31755 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31757 SmallVector<int, 1> NonceMask; // Just a placeholder.
31758 NonceMask.push_back(0);
31759 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31760 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31762 return SDValue(); // This routine will use CombineTo to replace N.
31765 // Constant Folding.
31767 SmallVector<APInt, 32> EltBits;
31768 if (N->isOnlyUserOf(N0.getNode()) &&
31769 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31770 assert(EltBits.size() == VT.getVectorNumElements() &&
31771 "Unexpected shift value type");
31772 unsigned ShiftImm = ShiftVal.getZExtValue();
31773 for (APInt &Elt : EltBits) {
31774 if (X86ISD::VSHLI == Opcode)
31776 else if (X86ISD::VSRAI == Opcode)
31777 Elt.ashrInPlace(ShiftImm);
31779 Elt.lshrInPlace(ShiftImm);
31781 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31787 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31788 TargetLowering::DAGCombinerInfo &DCI,
31789 const X86Subtarget &Subtarget) {
31791 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31792 (N->getOpcode() == X86ISD::PINSRW &&
31793 N->getValueType(0) == MVT::v8i16)) &&
31794 "Unexpected vector insertion");
31796 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31798 SmallVector<int, 1> NonceMask; // Just a placeholder.
31799 NonceMask.push_back(0);
31800 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31801 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31806 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31807 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31808 /// OR -> CMPNEQSS.
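/// For example, the two-setcc expansion of an ordered-equal FP compare is
///   (and (X86setcc COND_E, (X86cmp x, y)), (X86setcc COND_NP, (X86cmp x, y)))
/// which this routine rewrites as a CMPEQSS of x and y with the low bit of the
/// resulting all-ones/zero mask extracted (an illustrative sketch).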
31809 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31810 TargetLowering::DAGCombinerInfo &DCI,
31811 const X86Subtarget &Subtarget) {
31814 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31815 // we're requiring SSE2 for both.
31816 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31817 SDValue N0 = N->getOperand(0);
31818 SDValue N1 = N->getOperand(1);
31819 SDValue CMP0 = N0->getOperand(1);
31820 SDValue CMP1 = N1->getOperand(1);
31823 // The SETCCs should both refer to the same CMP.
31824 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31827 SDValue CMP00 = CMP0->getOperand(0);
31828 SDValue CMP01 = CMP0->getOperand(1);
31829 EVT VT = CMP00.getValueType();
31831 if (VT == MVT::f32 || VT == MVT::f64) {
31832 bool ExpectingFlags = false;
31833 // Check for any users that want flags:
31834 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31835 !ExpectingFlags && UI != UE; ++UI)
31836 switch (UI->getOpcode()) {
31841 ExpectingFlags = true;
31843 case ISD::CopyToReg:
31844 case ISD::SIGN_EXTEND:
31845 case ISD::ZERO_EXTEND:
31846 case ISD::ANY_EXTEND:
31850 if (!ExpectingFlags) {
31851 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31852 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31854 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31855 X86::CondCode tmp = cc0;
31860 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31861 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31862 // FIXME: need symbolic constants for these magic numbers.
31863 // See X86ATTInstPrinter.cpp:printSSECC().
31864 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31865 if (Subtarget.hasAVX512()) {
31867 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31868 DAG.getConstant(x86cc, DL, MVT::i8));
31869 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31870 FSetCC, DAG.getIntPtrConstant(0, DL));
31872 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31873 CMP00.getValueType(), CMP00, CMP01,
31874 DAG.getConstant(x86cc, DL,
31877 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31878 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31880 if (is64BitFP && !Subtarget.is64Bit()) {
31881 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31882 // 64-bit integer, since that's not a legal type. Since
31883 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31884 // bits, but can do this little dance to extract the lowest 32 bits
31885 // and work with those going forward.
31886 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31888 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31889 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31890 Vector32, DAG.getIntPtrConstant(0, DL));
31894 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31895 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31896 DAG.getConstant(1, DL, IntVT));
31897 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31899 return OneBitOfTruth;
31907 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31908 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31909 assert(N->getOpcode() == ISD::AND);
31911 EVT VT = N->getValueType(0);
31912 SDValue N0 = N->getOperand(0);
31913 SDValue N1 = N->getOperand(1);
31916 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31919 if (N0.getOpcode() == ISD::XOR &&
31920 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31921 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31923 if (N1.getOpcode() == ISD::XOR &&
31924 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31925 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31930 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31931 // register. In most cases we actually compare or select YMM-sized registers
31932 // and mixing the two types creates horrible code. This method optimizes
31933 // some of the transition sequences.
31934 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31935 TargetLowering::DAGCombinerInfo &DCI,
31936 const X86Subtarget &Subtarget) {
31937 EVT VT = N->getValueType(0);
31938 if (!VT.is256BitVector())
31941 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31942 N->getOpcode() == ISD::ZERO_EXTEND ||
31943 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31945 SDValue Narrow = N->getOperand(0);
31946 EVT NarrowVT = Narrow->getValueType(0);
31947 if (!NarrowVT.is128BitVector())
31950 if (Narrow->getOpcode() != ISD::XOR &&
31951 Narrow->getOpcode() != ISD::AND &&
31952 Narrow->getOpcode() != ISD::OR)
31955 SDValue N0 = Narrow->getOperand(0);
31956 SDValue N1 = Narrow->getOperand(1);
31959 // The Left side has to be a trunc.
31960 if (N0.getOpcode() != ISD::TRUNCATE)
31963 // The type of the truncated inputs.
31964 EVT WideVT = N0->getOperand(0)->getValueType(0);
31968 // The right side has to be a 'trunc' or a constant vector.
31969 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31970 ConstantSDNode *RHSConstSplat = nullptr;
31971 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31972 RHSConstSplat = RHSBV->getConstantSplatNode();
31973 if (!RHSTrunc && !RHSConstSplat)
31976 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31978 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31981 // Set N0 and N1 to hold the inputs to the new wide operation.
31982 N0 = N0->getOperand(0);
31983 if (RHSConstSplat) {
31984 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31985 SDValue(RHSConstSplat, 0));
31986 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31987 } else if (RHSTrunc) {
31988 N1 = N1->getOperand(0);
31991 // Generate the wide operation.
31992 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31993 unsigned Opcode = N->getOpcode();
31995 case ISD::ANY_EXTEND:
31997 case ISD::ZERO_EXTEND: {
31998 unsigned InBits = NarrowVT.getScalarSizeInBits();
31999 APInt Mask = APInt::getAllOnesValue(InBits);
32000 Mask = Mask.zext(VT.getScalarSizeInBits());
32001 return DAG.getNode(ISD::AND, DL, VT,
32002 Op, DAG.getConstant(Mask, DL, VT));
32004 case ISD::SIGN_EXTEND:
32005 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32006 Op, DAG.getValueType(NarrowVT));
32008 llvm_unreachable("Unexpected opcode");
32012 /// If both input operands of a logic op are being cast from floating point
32013 /// types, try to convert this into a floating point logic node to avoid
32014 /// unnecessary moves from SSE to integer registers.
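/// For example (a sketch):
///   (i32 (and (i32 (bitcast f32 %a)), (i32 (bitcast f32 %b))))
/// becomes
///   (i32 (bitcast (f32 (X86ISD::FAND %a, %b))))
/// so the value stays in an SSE register instead of round-tripping through a
/// GPR.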
32015 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32016 const X86Subtarget &Subtarget) {
32017 unsigned FPOpcode = ISD::DELETED_NODE;
32018 if (N->getOpcode() == ISD::AND)
32019 FPOpcode = X86ISD::FAND;
32020 else if (N->getOpcode() == ISD::OR)
32021 FPOpcode = X86ISD::FOR;
32022 else if (N->getOpcode() == ISD::XOR)
32023 FPOpcode = X86ISD::FXOR;
32025 assert(FPOpcode != ISD::DELETED_NODE &&
32026 "Unexpected input node for FP logic conversion");
32028 EVT VT = N->getValueType(0);
32029 SDValue N0 = N->getOperand(0);
32030 SDValue N1 = N->getOperand(1);
32032 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32033 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32034 (Subtarget.hasSSE2() && VT == MVT::i64))) {
32035 SDValue N00 = N0.getOperand(0);
32036 SDValue N10 = N1.getOperand(0);
32037 EVT N00Type = N00.getValueType();
32038 EVT N10Type = N10.getValueType();
32039 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32040 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32041 return DAG.getBitcast(VT, FPLogic);
32047 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
32048 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32049 /// with a shift-right to eliminate loading the vector constant mask value.
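/// For example (a sketch): if %m is a per-element all-ones/zero value,
///   (and <4 x i32> %m, <splat i32 1>) -> (X86ISD::VSRLI %m, 31)
/// which avoids materializing the <splat i32 1> mask from the constant pool.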
32050 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32051 const X86Subtarget &Subtarget) {
32052 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32053 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32054 EVT VT0 = Op0.getValueType();
32055 EVT VT1 = Op1.getValueType();
32057 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
32061 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
32062 !SplatVal.isMask())
32065 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
32068 unsigned EltBitWidth = VT0.getScalarSizeInBits();
32069 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
32073 unsigned ShiftVal = SplatVal.countTrailingOnes();
32074 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32075 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32076 return DAG.getBitcast(N->getValueType(0), Shift);
32079 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32080 TargetLowering::DAGCombinerInfo &DCI,
32081 const X86Subtarget &Subtarget) {
32082 if (DCI.isBeforeLegalizeOps())
32085 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32088 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32091 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
32094 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
32097 EVT VT = N->getValueType(0);
32098 SDValue N0 = N->getOperand(0);
32099 SDValue N1 = N->getOperand(1);
32102 // Attempt to recursively combine a bitmask AND with shuffles.
32103 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
32105 SmallVector<int, 1> NonceMask; // Just a placeholder.
32106 NonceMask.push_back(0);
32107 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
32108 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
32110 return SDValue(); // This routine will use CombineTo to replace N.
32113 // Create BEXTR instructions
32114 // BEXTR is ((X >> imm) & (2**size-1))
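// For example (a sketch): (and (srl x, 4), 0xFFF) extracts 12 bits starting
// at bit 4, so it becomes BEXTR with control 0xC04 (length 12 in bits 15:8,
// start 4 in bits 7:0), i.e. the Shift | (MaskSize << 8) constant below.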
32115 if (VT != MVT::i32 && VT != MVT::i64)
32118 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
32120 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
32123 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
32124 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32125 if (MaskNode && ShiftNode) {
32126 uint64_t Mask = MaskNode->getZExtValue();
32127 uint64_t Shift = ShiftNode->getZExtValue();
32128 if (isMask_64(Mask)) {
32129 uint64_t MaskSize = countPopulation(Mask);
32130 if (Shift + MaskSize <= VT.getSizeInBits())
32131 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
32132 DAG.getConstant(Shift | (MaskSize << 8), DL,
32140 // Try to fold (or (and (m, y), (pandn m, x)))
32142 // into (vselect m, x, y).
32143 // As a special case, try to fold:
32144 // (or (and (m, (sub 0, x)), (pandn m, x)))
32146 // into (sub (xor X, M), M).
32147 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32148 const X86Subtarget &Subtarget) {
32149 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32151 SDValue N0 = N->getOperand(0);
32152 SDValue N1 = N->getOperand(1);
32153 EVT VT = N->getValueType(0);
32155 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
32156 (VT.is256BitVector() && Subtarget.hasInt256())))
32159 // Canonicalize AND to LHS.
32160 if (N1.getOpcode() == ISD::AND)
32163 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
32164 // ANDNP combine allows other combines to happen that prevent matching.
32165 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
32168 SDValue Mask = N1.getOperand(0);
32169 SDValue X = N1.getOperand(1);
32171 if (N0.getOperand(0) == Mask)
32172 Y = N0.getOperand(1);
32173 if (N0.getOperand(1) == Mask)
32174 Y = N0.getOperand(0);
32176 // Check to see if the mask appeared in both the AND and ANDNP.
32180 // Validate that X, Y, and Mask are bitcasts, and see through them.
32181 Mask = peekThroughBitcasts(Mask);
32182 X = peekThroughBitcasts(X);
32183 Y = peekThroughBitcasts(Y);
32185 EVT MaskVT = Mask.getValueType();
32186 unsigned EltBits = MaskVT.getScalarSizeInBits();
32188 // TODO: Attempt to handle floating point cases as well?
32189 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
32195 // (or (and (M, (sub 0, X)), (pandn M, X)))
32196 // which is a special case of vselect:
32197 // (vselect M, (sub 0, X), X)
32199 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32200 // We know that, if fNegate is 0 or 1:
32201 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32203 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32204 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32205 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
32206 // This lets us transform our vselect to:
32207 // (add (xor X, M), (and M, 1))
32209 // (sub (xor X, M), M)
32210 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32211 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32212 auto IsNegV = [](SDNode *N, SDValue V) {
32213 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
32214 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
32217 if (IsNegV(Y.getNode(), X))
32219 else if (IsNegV(X.getNode(), Y))
32223 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32224 SDValue SubOp2 = Mask;
32226 // If the negate was on the false side of the select, then
32227 // the operands of the SUB need to be swapped. PR 27251.
32228 // This is because the pattern being matched above is
32229 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
32230 // but if the pattern matched was
32231 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32232 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32233 // pattern also needs to be a negation of the replacement pattern above.
32234 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32235 // sub accomplishes the negation of the replacement pattern.
32237 std::swap(SubOp1, SubOp2);
32239 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32240 return DAG.getBitcast(VT, Res);
32244 // PBLENDVB is only available on SSE 4.1.
32245 if (!Subtarget.hasSSE41())
32248 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32250 X = DAG.getBitcast(BlendVT, X);
32251 Y = DAG.getBitcast(BlendVT, Y);
32252 Mask = DAG.getBitcast(BlendVT, Mask);
32253 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32254 return DAG.getBitcast(VT, Mask);
32257 // Helper function for combineOrCmpEqZeroToCtlzSrl
32258 // Transforms:
32259 // seteq(cmp x, 0)
32260 // into:
32261 // srl(ctlz x), log2(bitsize(x))
32262 // Input pattern is checked by caller.
32263 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32264 SelectionDAG &DAG) {
32265 SDValue Cmp = Op.getOperand(1);
32266 EVT VT = Cmp.getOperand(0).getValueType();
32267 unsigned Log2b = Log2_32(VT.getSizeInBits());
32269 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32270 // The result of the shift is true or false, and on X86, the 32-bit
32271 // encoding of shr and lzcnt is more desirable.
32272 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32273 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32274 DAG.getConstant(Log2b, dl, VT));
32275 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32278 // Try to transform:
32279 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
32280 // into:
32281 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
32282 // Will also attempt to match more generic cases, e.g.:
32283 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32284 // Only applies if the target supports the FastLZCNT feature.
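//
// For example, with 32-bit x: ctlz(x) == 32 if and only if x == 0, and
// 32 >> log2(32) == 1, while any nonzero x has ctlz(x) <= 31, so
// srl(ctlz(x), 5) yields exactly the value of setcc(eq, (cmp x, 0)).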
32285 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32286 TargetLowering::DAGCombinerInfo &DCI,
32287 const X86Subtarget &Subtarget) {
32288 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32291 auto isORCandidate = [](SDValue N) {
32292 return (N->getOpcode() == ISD::OR && N->hasOneUse());
32295 // Check that the zero extend is extending to 32 bits or more. The code generated by
32296 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
32297 // instructions to clear the upper bits.
32298 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32299 !isORCandidate(N->getOperand(0)))
32302 // Check the node matches: setcc(eq, cmp 0)
32303 auto isSetCCCandidate = [](SDValue N) {
32304 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32305 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32306 N->getOperand(1).getOpcode() == X86ISD::CMP &&
32307 isNullConstant(N->getOperand(1).getOperand(1)) &&
32308 N->getOperand(1).getValueType().bitsGE(MVT::i32);
32311 SDNode *OR = N->getOperand(0).getNode();
32312 SDValue LHS = OR->getOperand(0);
32313 SDValue RHS = OR->getOperand(1);
32315 // Save nodes matching or(or, setcc(eq, cmp 0)).
32316 SmallVector<SDNode *, 2> ORNodes;
32317 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32318 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32319 ORNodes.push_back(OR);
32320 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32321 LHS = OR->getOperand(0);
32322 RHS = OR->getOperand(1);
32325 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32326 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32327 !isORCandidate(SDValue(OR, 0)))
32330 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
32332 // or(srl(ctlz),srl(ctlz)).
32333 // The dag combiner can then fold it into:
32334 // srl(or(ctlz, ctlz)).
32335 EVT VT = OR->getValueType(0);
32336 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32337 SDValue Ret, NewRHS;
32338 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32339 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32344 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32345 while (ORNodes.size() > 0) {
32346 OR = ORNodes.pop_back_val();
32347 LHS = OR->getOperand(0);
32348 RHS = OR->getOperand(1);
32349 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
32350 if (RHS->getOpcode() == ISD::OR)
32351 std::swap(LHS, RHS);
32352 EVT VT = OR->getValueType(0);
32353 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32356 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32360 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32365 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32366 TargetLowering::DAGCombinerInfo &DCI,
32367 const X86Subtarget &Subtarget) {
32368 if (DCI.isBeforeLegalizeOps())
32371 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32374 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32377 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32380 SDValue N0 = N->getOperand(0);
32381 SDValue N1 = N->getOperand(1);
32382 EVT VT = N->getValueType(0);
32384 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32387 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32388 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32390 // SHLD/SHRD instructions have lower register pressure, but on some
32391 // platforms they have higher latency than the equivalent
32392 // series of shifts/or that would otherwise be generated.
32393 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32394 // have higher latencies and we are not optimizing for size.
32395 if (!OptForSize && Subtarget.isSHLDSlow())
32398 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32400 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32402 if (!N0.hasOneUse() || !N1.hasOneUse())
32405 SDValue ShAmt0 = N0.getOperand(1);
32406 if (ShAmt0.getValueType() != MVT::i8)
32408 SDValue ShAmt1 = N1.getOperand(1);
32409 if (ShAmt1.getValueType() != MVT::i8)
32411 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32412 ShAmt0 = ShAmt0.getOperand(0);
32413 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32414 ShAmt1 = ShAmt1.getOperand(0);
32417 unsigned Opc = X86ISD::SHLD;
32418 SDValue Op0 = N0.getOperand(0);
32419 SDValue Op1 = N1.getOperand(0);
32420 if (ShAmt0.getOpcode() == ISD::SUB ||
32421 ShAmt0.getOpcode() == ISD::XOR) {
32422 Opc = X86ISD::SHRD;
32423 std::swap(Op0, Op1);
32424 std::swap(ShAmt0, ShAmt1);
32427 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32428 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32429 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32430 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
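// For reference, SHLD( X, Y, C ) computes (X << C) | (Y >> (Bits - C)) and
// SHRD( X, Y, C ) computes (X >> C) | (Y << (Bits - C)), which is why the
// operand shifted by (Bits - C) above becomes the second SHLD/SHRD operand.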
32431 unsigned Bits = VT.getSizeInBits();
32432 if (ShAmt1.getOpcode() == ISD::SUB) {
32433 SDValue Sum = ShAmt1.getOperand(0);
32434 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32435 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32436 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32437 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32438 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32439 return DAG.getNode(Opc, DL, VT,
32441 DAG.getNode(ISD::TRUNCATE, DL,
32444 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32445 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32446 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32447 return DAG.getNode(Opc, DL, VT,
32448 N0.getOperand(0), N1.getOperand(0),
32449 DAG.getNode(ISD::TRUNCATE, DL,
32451 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32452 SDValue Mask = ShAmt1.getOperand(1);
32453 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32454 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32455 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32456 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32457 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32458 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32459 if (Op1.getOpcode() == InnerShift &&
32460 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32461 Op1.getConstantOperandVal(1) == 1) {
32462 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32463 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32465 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32466 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32467 Op1.getOperand(0) == Op1.getOperand(1)) {
32468 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32469 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32478 /// Generate NEG and CMOV for integer abs.
32479 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32480 EVT VT = N->getValueType(0);
32482 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32483 // 8-bit integer abs to NEG and CMOV.
32484 if (VT.isInteger() && VT.getSizeInBits() == 8)
32487 SDValue N0 = N->getOperand(0);
32488 SDValue N1 = N->getOperand(1);
32491 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32492 // and change it to SUB and CMOV.
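// (This is the classic branchless abs: with Y = SRA(X, 31) for i32, Y is 0
// for non-negative X and -1 for negative X, so (X + Y) ^ Y equals X when
// X >= 0 and -X when X < 0.)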
32493 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32494 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32495 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32496 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32497 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32498 // Generate SUB & CMOV.
32499 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32500 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32501 SDValue Ops[] = {N0.getOperand(0), Neg,
32502 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32503 SDValue(Neg.getNode(), 1)};
32504 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32510 /// Try to turn tests against the signbit in the form of:
32511 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
32512 /// into:
32513 /// SETGT(X, -1)
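///
/// For example, for an i32 X the shift isolates the sign bit, so the xor'd
/// result is 1 exactly when X is non-negative, which is the same predicate as
/// the signed comparison X > -1.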
32514 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32515 // This is only worth doing if the output type is i8 or i1.
32516 EVT ResultType = N->getValueType(0);
32517 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32520 SDValue N0 = N->getOperand(0);
32521 SDValue N1 = N->getOperand(1);
32523 // We should be performing an xor against a truncated shift.
32524 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32527 // Make sure we are performing an xor against one.
32528 if (!isOneConstant(N1))
32531 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32532 SDValue Shift = N0.getOperand(0);
32533 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32536 // Make sure we are truncating from one of i16, i32 or i64.
32537 EVT ShiftTy = Shift.getValueType();
32538 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32541 // Make sure the shift amount extracts the sign bit.
32542 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32543 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32546 // Create a greater-than comparison against -1.
32547 // N.B. Using SETGE against 0 works, but we want a canonical-looking
32548 // comparison; using SETGT matches up with what TranslateX86CC does.
32550 SDValue ShiftOp = Shift.getOperand(0);
32551 EVT ShiftOpTy = ShiftOp.getValueType();
32552 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32553 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32554 *DAG.getContext(), ResultType);
32555 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32556 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32557 if (SetCCResultType != ResultType)
32558 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32562 /// Turn vector tests of the signbit in the form of:
32563 /// xor (sra X, elt_size(X)-1), -1
32564 /// into:
32565 /// pcmpgt X, -1
32567 /// This should be called before type legalization because the pattern may not
32568 /// persist after that.
32569 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32570 const X86Subtarget &Subtarget) {
32571 EVT VT = N->getValueType(0);
32572 if (!VT.isSimple())
32575 switch (VT.getSimpleVT().SimpleTy) {
32576 default: return SDValue();
32579 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32580 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32584 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32587 // There must be an arithmetic shift right before the xor, and the xor must be
32588 // a 'not' operation.
32589 SDValue Shift = N->getOperand(0);
32590 SDValue Ones = N->getOperand(1);
32591 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32592 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32595 // The shift should be smearing the sign bit across each vector element.
32596 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32600 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32601 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32602 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32605 // Create a greater-than comparison against -1. We don't use the more obvious
32606 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32607 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32610 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
32611 /// is valid for the given \p Subtarget.
32612 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32613 const X86Subtarget &Subtarget) {
32614 if (!Subtarget.hasAVX512())
32617 // FIXME: Scalar type may be supported if we move it to vector register.
32618 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32621 EVT SrcElVT = SrcVT.getScalarType();
32622 EVT DstElVT = DstVT.getScalarType();
32623 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32625 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32627 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32628 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32632 /// Detect a pattern of truncation with saturation:
32633 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32634 /// Return the source value to be truncated or SDValue() if the pattern was not
32635 /// detected.
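///
/// For example, a truncation to vXi8 matches as (truncate (umin (x, 255))):
/// 255 is the unsigned maximum of the destination element type (an 8-bit
/// mask), so the umin clamps x into range before the now-lossless truncate.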
32636 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32637 if (In.getOpcode() != ISD::UMIN)
32640 // Saturation with truncation. We truncate from InVT to VT.
32641 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32642 "Unexpected types for truncate operation");
32645 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32646 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
32647 // the element size of the destination type.
32648 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32654 /// Detect a pattern of truncation with saturation:
32655 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32656 /// The types should allow using the VPMOVUS* instructions on AVX512.
32657 /// Return the source value to be truncated or SDValue() if the pattern was not
32658 /// detected.
32659 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32660 const X86Subtarget &Subtarget) {
32661 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32663 return detectUSatPattern(In, VT);
32667 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32668 const X86Subtarget &Subtarget) {
32669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32670 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32672 if (auto USatVal = detectUSatPattern(In, VT))
32673 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32674 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32678 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32679 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32680 /// X86ISD::AVG instruction.
32681 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32682 const X86Subtarget &Subtarget,
32684 if (!VT.isVector() || !VT.isSimple())
32686 EVT InVT = In.getValueType();
32687 unsigned NumElems = VT.getVectorNumElements();
32689 EVT ScalarVT = VT.getVectorElementType();
32690 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32691 isPowerOf2_32(NumElems)))
32694 // InScalarVT is the intermediate type in the AVG pattern and it should be
32695 // greater than the original input type (i8/i16).
32696 EVT InScalarVT = InVT.getVectorElementType();
32697 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32700 if (!Subtarget.hasSSE2())
32702 if (Subtarget.hasBWI()) {
32703 if (VT.getSizeInBits() > 512)
32705 } else if (Subtarget.hasAVX2()) {
32706 if (VT.getSizeInBits() > 256)
32709 if (VT.getSizeInBits() > 128)
32713 // Detect the following pattern:
32715 // %1 = zext <N x i8> %a to <N x i32>
32716 // %2 = zext <N x i8> %b to <N x i32>
32717 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32718 // %4 = add nuw nsw <N x i32> %3, %2
32719 // %5 = lshr <N x i32> %4, <i32 1 x N>
32720 // %6 = trunc <N x i32> %5 to <N x i8>
32722 // In AVX512, the last instruction can also be a trunc store.
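//
// For example, with a == 254 and b == 255 the pattern computes
// (254 + 255 + 1) / 2 == 255, which matches the rounding average performed
// by the PAVGB/PAVGW instructions that X86ISD::AVG is lowered to.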
32724 if (In.getOpcode() != ISD::SRL)
32727 // A lambda checking the given SDValue is a constant vector and each element
32728 // is in the range [Min, Max].
32729 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32730 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32731 if (!BV || !BV->isConstant())
32733 for (SDValue Op : V->ops()) {
32734 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32737 uint64_t Val = C->getZExtValue();
32738 if (Val < Min || Val > Max)
32744 // Check if each element of the vector is right-shifted by one.
32745 auto LHS = In.getOperand(0);
32746 auto RHS = In.getOperand(1);
32747 if (!IsConstVectorInRange(RHS, 1, 1))
32749 if (LHS.getOpcode() != ISD::ADD)
32752 // Detect a pattern of a + b + 1 where the order doesn't matter.
32753 SDValue Operands[3];
32754 Operands[0] = LHS.getOperand(0);
32755 Operands[1] = LHS.getOperand(1);
32757 // Take care of the case when one of the operands is a constant vector whose
32758 // element is in the range [1, 256] (for i8) or [1, 65536] (for i16).
32759 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32760 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32761 Operands[0].getOperand(0).getValueType() == VT) {
32762 // The pattern is detected. Subtract one from the constant vector, then
32763 // demote it and emit X86ISD::AVG instruction.
32764 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32765 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32766 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32767 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32771 if (Operands[0].getOpcode() == ISD::ADD)
32772 std::swap(Operands[0], Operands[1]);
32773 else if (Operands[1].getOpcode() != ISD::ADD)
32775 Operands[2] = Operands[1].getOperand(0);
32776 Operands[1] = Operands[1].getOperand(1);
32778 // Now we have three operands of two additions. Check that one of them is a
32779 // constant vector with ones, and the other two are promoted from i8/i16.
32780 for (int i = 0; i < 3; ++i) {
32781 if (!IsConstVectorInRange(Operands[i], 1, 1))
32783 std::swap(Operands[i], Operands[2]);
32785 // Check if Operands[0] and Operands[1] are results of type promotion.
32786 for (int j = 0; j < 2; ++j)
32787 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32788 Operands[j].getOperand(0).getValueType() != VT)
32791 // The pattern is detected, emit X86ISD::AVG instruction.
32792 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32793 Operands[1].getOperand(0));
32799 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32800 TargetLowering::DAGCombinerInfo &DCI,
32801 const X86Subtarget &Subtarget) {
32802 LoadSDNode *Ld = cast<LoadSDNode>(N);
32803 EVT RegVT = Ld->getValueType(0);
32804 EVT MemVT = Ld->getMemoryVT();
32806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32808 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32809 // into two 16-byte operations. Also split non-temporal aligned loads on
32810 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
32811 ISD::LoadExtType Ext = Ld->getExtensionType();
32813 unsigned AddressSpace = Ld->getAddressSpace();
32814 unsigned Alignment = Ld->getAlignment();
32815 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32816 Ext == ISD::NON_EXTLOAD &&
32817 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32818 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32819 AddressSpace, Alignment, &Fast) && !Fast))) {
32820 unsigned NumElems = RegVT.getVectorNumElements();
32824 SDValue Ptr = Ld->getBasePtr();
32826 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32829 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32830 Alignment, Ld->getMemOperand()->getFlags());
32832 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32834 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32835 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32836 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32838 Load2.getValue(1));
32840 SDValue NewVec = DAG.getUNDEF(RegVT);
32841 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32842 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32843 return DCI.CombineTo(N, NewVec, TF, true);
32849 /// If V is a build vector of boolean constants and exactly one of those
32850 /// constants is true, return the operand index of that true element.
32851 /// Otherwise, return -1.
32852 static int getOneTrueElt(SDValue V) {
32853 // This needs to be a build vector of booleans.
32854 // TODO: Checking for the i1 type matches the IR definition for the mask,
32855 // but the mask check could be loosened to i8 or other types. That might
32856 // also require checking more than 'allOnesValue'; e.g., the x86 HW
32857 // instructions only require that the MSB is set for each mask element.
32858 // The ISD::MSTORE comments/definition do not specify how the mask operand
32860 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32861 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32864 int TrueIndex = -1;
32865 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32866 for (unsigned i = 0; i < NumElts; ++i) {
32867 const SDValue &Op = BV->getOperand(i);
32870 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32873 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32874 // If we already found a one, this is too many.
32875 if (TrueIndex >= 0)
32883 /// Given a masked memory load/store operation, return true if it has one mask
32884 /// bit set. If it has one mask bit set, then also return the memory address of
32885 /// the scalar element to load/store, the vector index to insert/extract that
32886 /// scalar element, and the alignment for the scalar memory access.
32887 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32888 SelectionDAG &DAG, SDValue &Addr,
32889 SDValue &Index, unsigned &Alignment) {
32890 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32891 if (TrueMaskElt < 0)
32894 // Get the address of the one scalar element that is specified by the mask
32895 // using the appropriate offset from the base pointer.
32896 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32897 Addr = MaskedOp->getBasePtr();
32898 if (TrueMaskElt != 0) {
32899 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32900 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32903 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32904 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32908 /// If exactly one element of the mask is set for a non-extending masked load,
32909 /// it is a scalar load and vector insert.
32910 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32911 /// mask have already been optimized in IR, so we don't bother with those here.
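///
/// For example, a masked load of <4 x i32> with mask <0, 0, 1, 0> becomes a
/// scalar i32 load of element 2 (offset 8 from the base pointer) followed by
/// an INSERT_VECTOR_ELT of that value into the pass-through vector at index 2.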
32913 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32914 TargetLowering::DAGCombinerInfo &DCI) {
32915 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32916 // However, some target hooks may need to be added to know when the transform
32917 // is profitable. Endianness would also have to be considered.
32919 SDValue Addr, VecIndex;
32920 unsigned Alignment;
32921 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32924 // Load the one scalar element that is specified by the mask using the
32925 // appropriate offset from the base pointer.
32927 EVT VT = ML->getValueType(0);
32928 EVT EltVT = VT.getVectorElementType();
32930 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32931 Alignment, ML->getMemOperand()->getFlags());
32933 // Insert the loaded element into the appropriate place in the vector.
32934 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32936 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32940 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32941 TargetLowering::DAGCombinerInfo &DCI) {
32942 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32946 EVT VT = ML->getValueType(0);
32948 // If we are loading the first and last elements of a vector, it is safe and
32949 // always faster to load the whole vector. Replace the masked load with a
32950 // vector load and select.
32951 unsigned NumElts = VT.getVectorNumElements();
32952 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32953 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32954 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32955 if (LoadFirstElt && LoadLastElt) {
32956 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32957 ML->getMemOperand());
32958 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32959 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32962 // Convert a masked load with a constant mask into a masked load and a select.
32963 // This allows the select operation to use a faster kind of select instruction
32964 // (for example, vblendvps -> vblendps).
32966 // Don't try this if the pass-through operand is already undefined. That would
32967 // cause an infinite loop because that's what we're about to create.
32968 if (ML->getSrc0().isUndef())
32971 // The new masked load has an undef pass-through operand. The select uses the
32972 // original pass-through operand.
32973 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32974 ML->getMask(), DAG.getUNDEF(VT),
32975 ML->getMemoryVT(), ML->getMemOperand(),
32976 ML->getExtensionType());
32977 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32979 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
32982 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32983 TargetLowering::DAGCombinerInfo &DCI,
32984 const X86Subtarget &Subtarget) {
32985 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32987 // TODO: Expanding load with constant mask may be optimized as well.
32988 if (Mld->isExpandingLoad())
32991 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32992 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32994 // TODO: Do some AVX512 subsets benefit from this transform?
32995 if (!Subtarget.hasAVX512())
32996 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
33000 if (Mld->getExtensionType() != ISD::SEXTLOAD)
33003 // Resolve extending loads.
33004 EVT VT = Mld->getValueType(0);
33005 unsigned NumElems = VT.getVectorNumElements();
33006 EVT LdVT = Mld->getMemoryVT();
33009 assert(LdVT != VT && "Cannot extend to the same type");
33010 unsigned ToSz = VT.getScalarSizeInBits();
33011 unsigned FromSz = LdVT.getScalarSizeInBits();
33012 // From/To sizes and ElemCount must be pow of two.
33013 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33014 "Unexpected size for extending masked load");
33016 unsigned SizeRatio = ToSz / FromSz;
33017 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
33019 // Create a type on which we perform the shuffle.
33020 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33021 LdVT.getScalarType(), NumElems*SizeRatio);
33022 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33024 // Convert Src0 value.
33025 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
33026 if (!Mld->getSrc0().isUndef()) {
33027 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33028 for (unsigned i = 0; i != NumElems; ++i)
33029 ShuffleVec[i] = i * SizeRatio;
33031 // Can't shuffle using an illegal type.
33032 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33033 "WideVecVT should be legal");
33034 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33035 DAG.getUNDEF(WideVecVT), ShuffleVec);
33037 // Prepare the new mask.
33039 SDValue Mask = Mld->getMask();
33040 if (Mask.getValueType() == VT) {
33041 // Mask and original value have the same type.
33042 NewMask = DAG.getBitcast(WideVecVT, Mask);
33043 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33044 for (unsigned i = 0; i != NumElems; ++i)
33045 ShuffleVec[i] = i * SizeRatio;
33046 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33047 ShuffleVec[i] = NumElems * SizeRatio;
33048 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33049 DAG.getConstant(0, dl, WideVecVT),
33052 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33053 unsigned WidenNumElts = NumElems*SizeRatio;
33054 unsigned MaskNumElts = VT.getVectorNumElements();
33055 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33058 unsigned NumConcat = WidenNumElts / MaskNumElts;
33059 SmallVector<SDValue, 16> Ops(NumConcat);
33060 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33062 for (unsigned i = 1; i != NumConcat; ++i)
33065 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33068 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33069 Mld->getBasePtr(), NewMask, WideSrc0,
33070 Mld->getMemoryVT(), Mld->getMemOperand(),
33072 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33073 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33076 /// If exactly one element of the mask is set for a non-truncating masked store,
33077 /// it is a vector extract and scalar store.
33078 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33079 /// mask have already been optimized in IR, so we don't bother with those here.
33080 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33081 SelectionDAG &DAG) {
33082 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33083 // However, some target hooks may need to be added to know when the transform
33084 // is profitable. Endianness would also have to be considered.
33086 SDValue Addr, VecIndex;
33087 unsigned Alignment;
33088 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33091 // Extract the one scalar element that is actually being stored.
33093 EVT VT = MS->getValue().getValueType();
33094 EVT EltVT = VT.getVectorElementType();
33095 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33096 MS->getValue(), VecIndex);
33098 // Store that element at the appropriate offset from the base pointer.
33099 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33100 Alignment, MS->getMemOperand()->getFlags());
33103 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33104 const X86Subtarget &Subtarget) {
33105 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
33107 if (Mst->isCompressingStore())
33110 if (!Mst->isTruncatingStore())
33111 return reduceMaskedStoreToScalarStore(Mst, DAG);
33113 // Resolve truncating stores.
33114 EVT VT = Mst->getValue().getValueType();
33115 unsigned NumElems = VT.getVectorNumElements();
33116 EVT StVT = Mst->getMemoryVT();
33119 assert(StVT != VT && "Cannot truncate to the same type");
33120 unsigned FromSz = VT.getScalarSizeInBits();
33121 unsigned ToSz = StVT.getScalarSizeInBits();
33123 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33125 // The truncating store is legal in some cases. For example
33126 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33127 // are designated for truncate store.
33128 // In this case we don't need any further transformations.
33129 if (TLI.isTruncStoreLegal(VT, StVT))
33132 // From/To sizes and ElemCount must be pow of two.
33133 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33134 "Unexpected size for truncating masked store");
33135 // We are going to use the original vector elt for storing.
33136 // Accumulated smaller vector elements must be a multiple of the store size.
33137 assert (((NumElems * FromSz) % ToSz) == 0 &&
33138 "Unexpected ratio for truncating masked store");
33140 unsigned SizeRatio = FromSz / ToSz;
33141 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33143 // Create a type on which we perform the shuffle.
33144 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33145 StVT.getScalarType(), NumElems*SizeRatio);
33147 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33149 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
33150 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33151 for (unsigned i = 0; i != NumElems; ++i)
33152 ShuffleVec[i] = i * SizeRatio;
33154 // Can't shuffle using an illegal type.
33155 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33156 "WideVecVT should be legal");
33158 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33159 DAG.getUNDEF(WideVecVT),
33163 SDValue Mask = Mst->getMask();
33164 if (Mask.getValueType() == VT) {
33165 // Mask and original value have the same type.
33166 NewMask = DAG.getBitcast(WideVecVT, Mask);
33167 for (unsigned i = 0; i != NumElems; ++i)
33168 ShuffleVec[i] = i * SizeRatio;
33169 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
33170 ShuffleVec[i] = NumElems*SizeRatio;
33171 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33172 DAG.getConstant(0, dl, WideVecVT),
33175 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33176 unsigned WidenNumElts = NumElems*SizeRatio;
33177 unsigned MaskNumElts = VT.getVectorNumElements();
33178 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33181 unsigned NumConcat = WidenNumElts / MaskNumElts;
33182 SmallVector<SDValue, 16> Ops(NumConcat);
33183 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33185 for (unsigned i = 1; i != NumConcat; ++i)
33188 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33191 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
33192 Mst->getBasePtr(), NewMask, StVT,
33193 Mst->getMemOperand(), false);
33196 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33197 const X86Subtarget &Subtarget) {
33198 StoreSDNode *St = cast<StoreSDNode>(N);
33199 EVT VT = St->getValue().getValueType();
33200 EVT StVT = St->getMemoryVT();
33202 SDValue StoredVal = St->getOperand(1);
33203 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33205 // If we are saving a concatenation of two XMM registers and 32-byte stores
33206 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
33208 unsigned AddressSpace = St->getAddressSpace();
33209 unsigned Alignment = St->getAlignment();
33210 if (VT.is256BitVector() && StVT == VT &&
33211 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
33212 AddressSpace, Alignment, &Fast) &&
33214 unsigned NumElems = VT.getVectorNumElements();
33218 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33219 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33221 SDValue Ptr0 = St->getBasePtr();
33222 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33225 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33226 Alignment, St->getMemOperand()->getFlags());
33228 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33229 std::min(16U, Alignment), St->getMemOperand()->getFlags());
33230 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33233 // Optimize trunc store (of multiple scalars) to shuffle and store.
33234 // First, pack all of the elements in one place. Next, store to memory
33235 // in fewer chunks.
33236 if (St->isTruncatingStore() && VT.isVector()) {
33237 // Check if we can detect an AVG pattern from the truncation. If yes,
33238 // replace the trunc store by a normal store with the result of X86ISD::AVG
33240 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33242 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33243 St->getPointerInfo(), St->getAlignment(),
33244 St->getMemOperand()->getFlags());
33247 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33248 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33249 dl, Val, St->getBasePtr(),
33250 St->getMemoryVT(), St->getMemOperand(), DAG);
33252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33253 unsigned NumElems = VT.getVectorNumElements();
33254 assert(StVT != VT && "Cannot truncate to the same type");
33255 unsigned FromSz = VT.getScalarSizeInBits();
33256 unsigned ToSz = StVT.getScalarSizeInBits();
33258 // The truncating store is legal in some cases. For example
33259 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33260 // are designated for truncate store.
33261 // In this case we don't need any further transformations.
33262 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33265 // From, To sizes and ElemCount must be pow of two
33266 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33267 // We are going to use the original vector elt for storing.
33268 // Accumulated smaller vector elements must be a multiple of the store size.
33269 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33271 unsigned SizeRatio = FromSz / ToSz;
33273 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33275 // Create a type on which we perform the shuffle
33276 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33277 StVT.getScalarType(), NumElems*SizeRatio);
33279 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33281 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33282 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33283 for (unsigned i = 0; i != NumElems; ++i)
33284 ShuffleVec[i] = i * SizeRatio;
33286 // Can't shuffle using an illegal type.
33287 if (!TLI.isTypeLegal(WideVecVT))
33290 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33291 DAG.getUNDEF(WideVecVT),
33293 // At this point all of the data is stored at the bottom of the
33294 // register. We now need to save it to memory.
33296 // Find the largest store unit
33297 MVT StoreType = MVT::i8;
33298 for (MVT Tp : MVT::integer_valuetypes()) {
33299 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33303 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
33304 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33305 (64 <= NumElems * ToSz))
33306 StoreType = MVT::f64;
33308 // Bitcast the original vector into a vector of store-size units
33309 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33310 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33311 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33312 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33313 SmallVector<SDValue, 8> Chains;
33314 SDValue Ptr = St->getBasePtr();
33316 // Perform one or more big stores into memory.
33317 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33318 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33319 StoreType, ShuffWide,
33320 DAG.getIntPtrConstant(i, dl));
33322 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33323 St->getAlignment(), St->getMemOperand()->getFlags());
33324 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33325 Chains.push_back(Ch);
33328 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33331 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
33332 // the FP state in cases where an emms may be missing.
33333 // A preferable solution to the general problem is to figure out the right
33334 // places to insert EMMS. This qualifies as a quick hack.
33336 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33337 if (VT.getSizeInBits() != 64)
33340 const Function *F = DAG.getMachineFunction().getFunction();
33341 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33343 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33344 if ((VT.isVector() ||
33345 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33346 isa<LoadSDNode>(St->getValue()) &&
33347 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33348 St->getChain().hasOneUse() && !St->isVolatile()) {
33349 SDNode* LdVal = St->getValue().getNode();
33350 LoadSDNode *Ld = nullptr;
33351 int TokenFactorIndex = -1;
33352 SmallVector<SDValue, 8> Ops;
33353 SDNode* ChainVal = St->getChain().getNode();
33354 // Must be a store of a load. We currently handle two cases: the load
33355 // is a direct child, and it's under an intervening TokenFactor. It is
33356 // possible to dig deeper under nested TokenFactors.
33357 if (ChainVal == LdVal)
33358 Ld = cast<LoadSDNode>(St->getChain());
33359 else if (St->getValue().hasOneUse() &&
33360 ChainVal->getOpcode() == ISD::TokenFactor) {
33361 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33362 if (ChainVal->getOperand(i).getNode() == LdVal) {
33363 TokenFactorIndex = i;
33364 Ld = cast<LoadSDNode>(St->getValue());
33366 Ops.push_back(ChainVal->getOperand(i));
33370 if (!Ld || !ISD::isNormalLoad(Ld))
33373 // If this is not the MMX case, i.e. we are just turning i64 load/store
33374 // into f64 load/store, avoid the transformation if there are multiple
33375 // uses of the loaded value.
33376 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33381 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33382 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33384 if (Subtarget.is64Bit() || F64IsLegal) {
33385 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33386 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33387 Ld->getPointerInfo(), Ld->getAlignment(),
33388 Ld->getMemOperand()->getFlags());
33389 // Make sure new load is placed in same chain order.
33390 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
33391 if (TokenFactorIndex >= 0) {
33392 Ops.push_back(NewChain);
33393 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33395 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33396 St->getPointerInfo(), St->getAlignment(),
33397 St->getMemOperand()->getFlags());
33400 // Otherwise, lower to two pairs of 32-bit loads / stores.
33401 SDValue LoAddr = Ld->getBasePtr();
33402 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33404 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33405 Ld->getPointerInfo(), Ld->getAlignment(),
33406 Ld->getMemOperand()->getFlags());
33407 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33408 Ld->getPointerInfo().getWithOffset(4),
33409 MinAlign(Ld->getAlignment(), 4),
33410 Ld->getMemOperand()->getFlags());
33411 // Make sure new loads are placed in same chain order.
33412 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
33413 NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
33415 if (TokenFactorIndex >= 0) {
33416 Ops.push_back(NewChain);
33417 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33420 LoAddr = St->getBasePtr();
33421 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33424 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33425 St->getAlignment(), St->getMemOperand()->getFlags());
33426 SDValue HiSt = DAG.getStore(
33427 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33428 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33429 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33432 // This is similar to the above case, but here we handle a scalar 64-bit
33433 // integer store that is extracted from a vector on a 32-bit target.
33434 // If we have SSE2, then we can treat it like a floating-point double
33435 // to get past legalization. The execution dependencies fixup pass will
33436 // choose the optimal machine instruction for the store if this really is
33437 // an integer or v2f32 rather than an f64.
33438 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
33439 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
33440 SDValue OldExtract = St->getOperand(1);
33441 SDValue ExtOp0 = OldExtract.getOperand(0);
33442 unsigned VecSize = ExtOp0.getValueSizeInBits();
33443 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33444 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33445 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33446 BitCast, OldExtract.getOperand(1));
33447 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33448 St->getPointerInfo(), St->getAlignment(),
33449 St->getMemOperand()->getFlags());
33455 /// Return 'true' if this vector operation is "horizontal"
33456 /// and return the operands for the horizontal operation in LHS and RHS. A
33457 /// horizontal operation performs the binary operation on successive elements
33458 /// of its first operand, then on successive elements of its second operand,
33459 /// returning the resulting values in a vector. For example, if
33460 /// A = < float a0, float a1, float a2, float a3 >
33462 /// B = < float b0, float b1, float b2, float b3 >
33463 /// then the result of doing a horizontal operation on A and B is
33464 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33465 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33466 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33467 /// set to A, RHS to B, and the routine returns 'true'.
33468 /// Note that the binary operation should have the property that if one of the
33469 /// operands is UNDEF then the result is UNDEF.
33470 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33471 // Look for the following pattern: if
33472 // A = < float a0, float a1, float a2, float a3 >
33473 // B = < float b0, float b1, float b2, float b3 >
33475 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33476 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33477 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33478 // which is A horizontal-op B.
33480 // At least one of the operands should be a vector shuffle.
33481 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33482 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33485 MVT VT = LHS.getSimpleValueType();
33487 assert((VT.is128BitVector() || VT.is256BitVector()) &&
33488 "Unsupported vector type for horizontal add/sub");
33490 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33491 // operate independently on 128-bit lanes.
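// For example, for v8f32 operands A and B the horizontal add produces
// < a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 >, i.e. the
// 128-bit pattern repeated within each lane.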
33492 unsigned NumElts = VT.getVectorNumElements();
33493 unsigned NumLanes = VT.getSizeInBits()/128;
33494 unsigned NumLaneElts = NumElts / NumLanes;
33495 assert((NumLaneElts % 2 == 0) &&
33496 "Vector type should have an even number of elements in each lane");
33497 unsigned HalfLaneElts = NumLaneElts/2;
33499 // View LHS in the form
33500 // LHS = VECTOR_SHUFFLE A, B, LMask
33501 // If LHS is not a shuffle then pretend it is the shuffle
33502 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33503 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
33504 // type VT.
33506 SmallVector<int, 16> LMask(NumElts);
33507 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33508 if (!LHS.getOperand(0).isUndef())
33509 A = LHS.getOperand(0);
33510 if (!LHS.getOperand(1).isUndef())
33511 B = LHS.getOperand(1);
33512 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33513 std::copy(Mask.begin(), Mask.end(), LMask.begin());
33515 if (!LHS.isUndef())
33517 for (unsigned i = 0; i != NumElts; ++i)
33521 // Likewise, view RHS in the form
33522 // RHS = VECTOR_SHUFFLE C, D, RMask
33524 SmallVector<int, 16> RMask(NumElts);
33525 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33526 if (!RHS.getOperand(0).isUndef())
33527 C = RHS.getOperand(0);
33528 if (!RHS.getOperand(1).isUndef())
33529 D = RHS.getOperand(1);
33530 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33531 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33533 if (!RHS.isUndef())
33535 for (unsigned i = 0; i != NumElts; ++i)
33539 // Check that the shuffles are both shuffling the same vectors.
33540 if (!(A == C && B == D) && !(A == D && B == C))
33543 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33544 if (!A.getNode() && !B.getNode())
33547 // If A and B occur in reverse order in RHS, then "swap" them (which means
33548 // rewriting the mask).
33550 ShuffleVectorSDNode::commuteMask(RMask);
33552 // At this point LHS and RHS are equivalent to
33553 // LHS = VECTOR_SHUFFLE A, B, LMask
33554 // RHS = VECTOR_SHUFFLE A, B, RMask
33555 // Check that the masks correspond to performing a horizontal operation.
33556 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33557 for (unsigned i = 0; i != NumLaneElts; ++i) {
33558 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33560 // Ignore any UNDEF components.
33561 if (LIdx < 0 || RIdx < 0 ||
33562 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33563 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33566 // Check that successive elements are being operated on. If not, this is
33567 // not a horizontal operation.
33568 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33569 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33570 if (!(LIdx == Index && RIdx == Index + 1) &&
33571 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33576 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33577 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33581 /// Do target-specific dag combines on floating-point adds/subs.
33582 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33583 const X86Subtarget &Subtarget) {
33584 EVT VT = N->getValueType(0);
33585 SDValue LHS = N->getOperand(0);
33586 SDValue RHS = N->getOperand(1);
33587 bool IsFadd = N->getOpcode() == ISD::FADD;
33588 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33590 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33591 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33592 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33593 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33594 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33595 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33600 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33601 /// the codegen.
33602 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
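///
/// For example, TRUNC( MUL( X, Y ) ) from i64 to i32 can safely become
/// MUL( TRUNC( X ), TRUNC( Y ) ), since the low 32 bits of a multiply (or an
/// add/sub) depend only on the low 32 bits of its operands.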
33603 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33604 const X86Subtarget &Subtarget,
33606 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33607 SDValue Src = N->getOperand(0);
33608 unsigned Opcode = Src.getOpcode();
33609 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33611 EVT VT = N->getValueType(0);
33612 EVT SrcVT = Src.getValueType();
33614 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33615 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33617 // Repeated operand, so we are only trading one output truncation for
33618 // one input truncation.
33622 // See if either operand has been extended from a smaller/equal size to
33623 // the truncation size, allowing a truncation to combine with the extend.
33624 unsigned Opcode0 = Op0.getOpcode();
33625 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33626 Opcode0 == ISD::ZERO_EXTEND) &&
33627 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33630 unsigned Opcode1 = Op1.getOpcode();
33631 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33632 Opcode1 == ISD::ZERO_EXTEND) &&
33633 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33636 // See if either operand is a single use constant which can be constant
33637 // folded.
33638 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33639 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33640 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33641 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33644 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33645 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33646 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33647 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33650 // Don't combine if the operation has other uses.
33651 if (!N->isOnlyUserOf(Src.getNode()))
33654 // Only support vector truncation for now.
33655 // TODO: i64 scalar math would benefit as well.
33656 if (!VT.isVector())
33659 // In most cases it's only worth pre-truncating if we're only facing the cost
33660 // of one truncation.
33661 // i.e. if one of the inputs will constant fold or the input is repeated.
33666 SDValue Op0 = Src.getOperand(0);
33667 SDValue Op1 = Src.getOperand(1);
33668 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33669 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33670 return TruncateArithmetic(Op0, Op1);
33675 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ); it's
33676 // better to truncate if we have the chance.
33677 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33678 !TLI.isOperationLegal(Opcode, SrcVT))
33679 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33682 SDValue Op0 = Src.getOperand(0);
33683 SDValue Op1 = Src.getOperand(1);
33684 if (TLI.isOperationLegal(Opcode, VT) &&
33685 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33686 return TruncateArithmetic(Op0, Op1);
33694 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33696 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33697 SmallVector<SDValue, 8> &Regs) {
33698 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33699 Regs[0].getValueType() == MVT::v2i64));
33700 EVT OutVT = N->getValueType(0);
33701 EVT OutSVT = OutVT.getVectorElementType();
33702 EVT InVT = Regs[0].getValueType();
33703 EVT InSVT = InVT.getVectorElementType();
33706 // First, use mask to unset all bits that won't appear in the result.
33707 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33708 "OutSVT can only be either i8 or i16.");
33710 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33711 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33712 for (auto &Reg : Regs)
33713 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
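// PACKUS packs each wide element into a half-width element using unsigned
// saturation. The AND above clears every bit that cannot appear in the
// narrow result, so no element exceeds the narrow type's unsigned maximum
// and the saturating pack acts as an exact truncation.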
33715 MVT UnpackedVT, PackedVT;
33716 if (OutSVT == MVT::i8) {
33717 UnpackedVT = MVT::v8i16;
33718 PackedVT = MVT::v16i8;
33720 UnpackedVT = MVT::v4i32;
33721 PackedVT = MVT::v8i16;
33724 // In each iteration, truncate the element type to half its size.
33725 auto RegNum = Regs.size();
33726 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33727 j < e; j *= 2, RegNum /= 2) {
33728 for (unsigned i = 0; i < RegNum; i++)
33729 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33730 for (unsigned i = 0; i < RegNum / 2; i++)
33731 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33735 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
33736 // then extract a subvector as the result since v8i8 is not a legal type.
33737 if (OutVT == MVT::v8i8) {
33738 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33739 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33740 DAG.getIntPtrConstant(0, DL));
33742 } else if (RegNum > 1) {
33743 Regs.resize(RegNum);
33744 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33749 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33751 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33753 SmallVector<SDValue, 8> &Regs) {
33754 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33755 EVT OutVT = N->getValueType(0);
33758 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
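// Together these shifts sign-extend bit 15 of each 32-bit element across its
// upper half, so every element is within i16 range and the signed saturation
// performed by PACKSS reproduces the original low 16 bits exactly.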
33759 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33760 for (auto &Reg : Regs) {
33761 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33763 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33767 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33768 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33771 if (Regs.size() > 2) {
33772 Regs.resize(Regs.size() / 2);
33773 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33778 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33779 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33780 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33781 /// element that is extracted from a vector and then truncated, and it is
33782 /// difficult to perform this optimization on that form.
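/// For example (illustrative), a v8i32 -> v8i16 truncation is split into two
/// v4i32 halves that are packed back into a single v8i16 result with
/// PACKUS/PACKSS rather than being scalarized by type legalization.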
33783 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33784 const X86Subtarget &Subtarget) {
33785 EVT OutVT = N->getValueType(0);
33786 if (!OutVT.isVector())
33789 SDValue In = N->getOperand(0);
33790 if (!In.getValueType().isSimple())
33793 EVT InVT = In.getValueType();
33794 unsigned NumElems = OutVT.getVectorNumElements();
33796 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33797 // SSE2, and we need to take care of it specially.
33798 // AVX512 provides vpmovdb.
33799 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33802 EVT OutSVT = OutVT.getVectorElementType();
33803 EVT InSVT = InVT.getVectorElementType();
33804 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33805 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33809 // SSSE3's pshufb results in fewer instructions in the cases below.
33810 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33811 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33812 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33817 // Split a long vector into vectors of legal type.
33818 unsigned RegNum = InVT.getSizeInBits() / 128;
33819 SmallVector<SDValue, 8> SubVec(RegNum);
33820 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33821 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33823 for (unsigned i = 0; i < RegNum; i++)
33824 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33825 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33827 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33828 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33829 // truncate 2 x v4i32 to v8i16.
33830 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33831 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33832 else if (InSVT == MVT::i32)
33833 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33838 /// This function transforms vector truncation of 'all ones' or 'all zeros'
33839 /// values from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 using X86ISD::PACKSS.
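/// For example (illustrative), each lane of a vector comparison result is
/// either all ones or all zeros, so truncating it with PACKSS preserves the
/// value exactly and avoids a more expensive general truncation sequence.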
33840 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33842 const X86Subtarget &Subtarget) {
33843 // Requires SSE2 but AVX512 has fast truncate.
33844 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33847 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33850 SDValue In = N->getOperand(0);
33851 if (!In.getValueType().isSimple())
33854 MVT VT = N->getValueType(0).getSimpleVT();
33855 MVT SVT = VT.getScalarType();
33857 MVT InVT = In.getValueType().getSimpleVT();
33858 MVT InSVT = InVT.getScalarType();
33860 // Use PACKSS if the input is a splatted sign bit.
33861 // e.g. Comparison result, sext_in_reg, etc.
33862 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33863 if (NumSignBits != InSVT.getSizeInBits())
33866 // Check we have a truncation suited for PACKSS.
33867 if (!VT.is128BitVector() && !VT.is256BitVector())
33869 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33871 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33874 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33877 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33878 const X86Subtarget &Subtarget) {
33879 EVT VT = N->getValueType(0);
33880 SDValue Src = N->getOperand(0);
33883 // Attempt to pre-truncate inputs to arithmetic ops instead.
33884 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33887 // Try to detect AVG pattern first.
33888 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33891 // Try to combine truncation with unsigned saturation.
33892 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33895 // The bitcast source is a direct mmx result.
33896 // Detect a truncation to i32 of a bitcast from x86mmx.
33897 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33898 SDValue BCSrc = Src.getOperand(0);
33899 if (BCSrc.getValueType() == MVT::x86mmx)
33900 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33903 // Try to truncate extended sign bits with PACKSS.
33904 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33907 return combineVectorTruncation(N, DAG, Subtarget);
33910 /// Returns the negated value if the node \p N flips the sign of an FP value.
33912 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
33913 /// AVX512F does not have FXOR, so FNEG is lowered as
33914 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33915 /// In this case we go through all bitcasts.
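/// For example (illustrative), a v4f32 negation may appear as
///   (v4f32 (bitcast (xor (bitcast x), splat 0x80000000)))
/// and this helper peels off the bitcasts, matches the sign-mask constant in
/// its various encodings, and returns x.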
33916 static SDValue isFNEG(SDNode *N) {
33917 if (N->getOpcode() == ISD::FNEG)
33918 return N->getOperand(0);
33920 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33921 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33924 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33925 if (!Op1.getValueType().isFloatingPoint())
33928 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33930 unsigned EltBits = Op1.getScalarValueSizeInBits();
33931 auto isSignMask = [&](const ConstantFP *C) {
33932 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33935 // There is more than one way to represent the same constant on
33936 // different X86 targets. The type of the node may also depend on its size.
33937 // - load scalar value and broadcast
33938 // - BUILD_VECTOR node
33939 // - load from a constant pool.
33940 // We check all variants here.
33941 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33942 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33943 if (isSignMask(cast<ConstantFP>(C)))
33946 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33947 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33948 if (isSignMask(CN->getConstantFPValue()))
33951 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33952 if (C->getType()->isVectorTy()) {
33953 if (auto *SplatV = C->getSplatValue())
33954 if (isSignMask(cast<ConstantFP>(SplatV)))
33956 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33957 if (isSignMask(FPConst))
33963 /// Do target-specific dag combines on floating point negations.
33964 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33965 const X86Subtarget &Subtarget) {
33966 EVT OrigVT = N->getValueType(0);
33967 SDValue Arg = isFNEG(N);
33968 assert(Arg.getNode() && "N is expected to be an FNEG node");
33970 EVT VT = Arg.getValueType();
33971 EVT SVT = VT.getScalarType();
33974 // Let legalize expand this if it isn't a legal type yet.
33975 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33978 // If we're negating an FMUL node on a target with FMA, then we can avoid the
33979 // use of a constant by performing (-0 - A*B) instead.
33980 // FIXME: Check rounding control flags as well once it becomes available.
33981 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33982 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33983 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33984 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33985 Arg.getOperand(1), Zero);
33986 return DAG.getBitcast(OrigVT, NewNode);
33989 // If we're negating an FMA node, then we can adjust the
33990 // instruction to include the extra negation.
33991 unsigned NewOpcode = 0;
33992 if (Arg.hasOneUse()) {
33993 switch (Arg.getOpcode()) {
33994 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33995 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33996 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33997 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33998 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33999 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
34000 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
34001 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
34002 // We can't handle a scalar intrinsic node here because it would only
34003 // invert one element and not the whole vector. But we could try to handle
34004 // a negation of the lower element only.
34008 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
34009 Arg.getNode()->ops()));
34014 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
34015 const X86Subtarget &Subtarget) {
34016 MVT VT = N->getSimpleValueType(0);
34017 // If we have integer vector types available, use the integer opcodes.
34018 if (VT.isVector() && Subtarget.hasSSE2()) {
34021 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
34023 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
34024 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
34025 unsigned IntOpcode;
34026 switch (N->getOpcode()) {
34027 default: llvm_unreachable("Unexpected FP logic op");
34028 case X86ISD::FOR: IntOpcode = ISD::OR; break;
34029 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34030 case X86ISD::FAND: IntOpcode = ISD::AND; break;
34031 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34033 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34034 return DAG.getBitcast(VT, IntOp);
34039 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34040 TargetLowering::DAGCombinerInfo &DCI,
34041 const X86Subtarget &Subtarget) {
34042 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
34045 if (DCI.isBeforeLegalizeOps())
34048 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
34051 if (Subtarget.hasCMov())
34052 if (SDValue RV = combineIntegerAbs(N, DAG))
34055 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34059 return combineFneg(N, DAG, Subtarget);
34064 static bool isNullFPScalarOrVectorConst(SDValue V) {
34065 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
34068 /// If a value is a scalar FP zero or a vector FP zero (potentially including
34069 /// undefined elements), return a zero constant that may be used to fold away
34070 /// that value. In the case of a vector, the returned constant will not contain
34071 /// undefined elements even if the input parameter does. This makes it suitable
34072 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
34073 /// an undef should not propagate.
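/// For example (illustrative), a v4f32 zero vector that contains undef lanes
/// is replaced by a fully defined +0.0 splat, so folding it into a bitwise
/// operation cannot let the undef lanes leak into the result.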
34074 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
34075 const X86Subtarget &Subtarget) {
34076 if (!isNullFPScalarOrVectorConst(V))
34079 if (V.getValueType().isVector())
34080 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
34085 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
34086 const X86Subtarget &Subtarget) {
34087 SDValue N0 = N->getOperand(0);
34088 SDValue N1 = N->getOperand(1);
34089 EVT VT = N->getValueType(0);
34092 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
34093 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
34094 (VT == MVT::f64 && Subtarget.hasSSE2())))
34097 auto isAllOnesConstantFP = [](SDValue V) {
34098 auto *C = dyn_cast<ConstantFPSDNode>(V);
34099 return C && C->getConstantFPValue()->isAllOnesValue();
34102 // fand (fxor X, -1), Y --> fandn X, Y
34103 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
34104 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
34106 // fand X, (fxor Y, -1) --> fandn Y, X
34107 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
34108 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
34113 /// Do target-specific dag combines on X86ISD::FAND nodes.
34114 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
34115 const X86Subtarget &Subtarget) {
34116 // FAND(0.0, x) -> 0.0
34117 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
34120 // FAND(x, 0.0) -> 0.0
34121 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34124 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
34127 return lowerX86FPLogicOp(N, DAG, Subtarget);
34130 /// Do target-specific dag combines on X86ISD::FANDN nodes.
34131 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
34132 const X86Subtarget &Subtarget) {
34133 // FANDN(0.0, x) -> x
34134 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34135 return N->getOperand(1);
34137 // FANDN(x, 0.0) -> 0.0
34138 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34141 return lowerX86FPLogicOp(N, DAG, Subtarget);
34144 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
34145 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
34146 const X86Subtarget &Subtarget) {
34147 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34149 // F[X]OR(0.0, x) -> x
34150 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34151 return N->getOperand(1);
34153 // F[X]OR(x, 0.0) -> x
34154 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
34155 return N->getOperand(0);
34158 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
34161 return lowerX86FPLogicOp(N, DAG, Subtarget);
34164 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
34165 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
34166 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34168 // Only perform optimizations if UnsafeMath is used.
34169 if (!DAG.getTarget().Options.UnsafeFPMath)
34172 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34173 // into FMINC and FMAXC, which are commutative operations.
34174 unsigned NewOp = 0;
34175 switch (N->getOpcode()) {
34176 default: llvm_unreachable("unknown opcode");
34177 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
34178 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
34181 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
34182 N->getOperand(0), N->getOperand(1));
34185 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
34186 const X86Subtarget &Subtarget) {
34187 if (Subtarget.useSoftFloat())
34190 // TODO: Check for global or instruction-level "nnan". In that case, we
34191 // should be able to lower to FMAX/FMIN alone.
34192 // TODO: If an operand is already known to be a NaN or not a NaN, this
34193 // should be an optional swap and FMAX/FMIN.
34195 EVT VT = N->getValueType(0);
34196 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
34197 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
34198 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
34201 // This takes at least 3 instructions, so favor a library call when operating
34202 // on a scalar and minimizing code size.
34203 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
34206 SDValue Op0 = N->getOperand(0);
34207 SDValue Op1 = N->getOperand(1);
34209 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
34210 DAG.getDataLayout(), *DAG.getContext(), VT);
34212 // There are 4 possibilities involving NaN inputs, and these are the required
34213 // outputs:
34214 //                    Op1
34215 //                Num     NaN
34216 //             ----------------
34217 //      Num    |  Max  |  Op0 |
34218 //  Op0        ----------------
34219 //      NaN    |  Op1  |  NaN |
34220 //             ----------------
34221 //
34222 // The SSE FP max/min instructions were not designed for this case, but rather
34223 // to implement:
34224 //   Min = Op1 < Op0 ? Op1 : Op0
34225 //   Max = Op1 > Op0 ? Op1 : Op0
34227 // So they always return Op0 if either input is a NaN. However, we can still
34228 // use those instructions for fmaxnum by selecting away a NaN input.
34230 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
34231 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
34232 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
34233 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
34235 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
34236 // are NaN, the NaN value of Op1 is the result.
34237 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
34240 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
34241 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
34242 TargetLowering::DAGCombinerInfo &DCI,
34243 const X86Subtarget &Subtarget) {
34244 // ANDNP(0, x) -> x
34245 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
34246 return N->getOperand(1);
34248 // ANDNP(x, 0) -> 0
34249 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
34250 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
34252 EVT VT = N->getValueType(0);
34254 // Attempt to recursively combine a bitmask ANDNP with shuffles.
34255 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34257 SmallVector<int, 1> NonceMask; // Just a placeholder.
34258 NonceMask.push_back(0);
34259 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
34260 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
34262 return SDValue(); // This routine will use CombineTo to replace N.
34268 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
34269 TargetLowering::DAGCombinerInfo &DCI) {
34270 // BT ignores high bits in the bit index operand.
34271 SDValue Op1 = N->getOperand(1);
34272 if (Op1.hasOneUse()) {
34273 unsigned BitWidth = Op1.getValueSizeInBits();
34274 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
34275 KnownBits Known;
34276 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34277 !DCI.isBeforeLegalizeOps());
34278 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34279 if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
34280 TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
34281 DCI.CommitTargetLoweringOpt(TLO);
34286 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
34287 const X86Subtarget &Subtarget) {
34288 EVT VT = N->getValueType(0);
34289 if (!VT.isVector())
34292 SDValue N0 = N->getOperand(0);
34293 SDValue N1 = N->getOperand(1);
34294 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
34297 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
34298 // SSE and AVX2 since there is no sign-extended shift right
34299 // operation on a vector with 64-bit elements.
34300 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
34301 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
34302 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
34303 N0.getOpcode() == ISD::SIGN_EXTEND)) {
34304 SDValue N00 = N0.getOperand(0);
34306 // EXTLOAD has a better solution on AVX2: it may be replaced with an
34307 // X86ISD::VSEXT node.
34308 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
34309 if (!ISD::isNormalLoad(N00.getNode()))
34312 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
34313 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
34315 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
34321 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
34322 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
34323 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
34324 /// opportunities to combine math ops, use an LEA, or use a complex addressing
34325 /// mode. This can eliminate extend, add, and shift instructions.
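/// For example (illustrative):
///   (i64 sext (add nsw (i32 %x, 42))) --> (i64 add (sext %x), 42)
/// where the resulting add can later fold into an LEA or a complex addressing
/// mode.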
34326 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
34327 const X86Subtarget &Subtarget) {
34328 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
34329 Ext->getOpcode() != ISD::ZERO_EXTEND)
34332 // TODO: This should be valid for other integer types.
34333 EVT VT = Ext->getValueType(0);
34334 if (VT != MVT::i64)
34337 SDValue Add = Ext->getOperand(0);
34338 if (Add.getOpcode() != ISD::ADD)
34341 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
34342 bool NSW = Add->getFlags().hasNoSignedWrap();
34343 bool NUW = Add->getFlags().hasNoUnsignedWrap();
34345 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
34346 // into the 'zext'.
34347 if ((Sext && !NSW) || (!Sext && !NUW))
34350 // Having a constant operand to the 'add' ensures that we are not increasing
34351 // the instruction count because the constant is extended for free below.
34352 // A constant operand can also become the displacement field of an LEA.
34353 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
34357 // Don't make the 'add' bigger if there's no hope of combining it with some
34358 // other 'add' or 'shl' instruction.
34359 // TODO: It may be profitable to generate simpler LEA instructions in place
34360 // of single 'add' instructions, but the cost model for selecting an LEA
34361 // currently has a high threshold.
34362 bool HasLEAPotential = false;
34363 for (auto *User : Ext->uses()) {
34364 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
34365 HasLEAPotential = true;
34369 if (!HasLEAPotential)
34372 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
34373 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
34374 SDValue AddOp0 = Add.getOperand(0);
34375 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
34376 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
34378 // The wider add is guaranteed to not wrap because both operands are
34379 // sign/zero extended.
34380 SDNodeFlags Flags;
34381 Flags.setNoSignedWrap(NSW);
34382 Flags.setNoUnsignedWrap(NUW);
34383 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
34386 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
34387 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
34388 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
34389 /// extends from AH (which we otherwise need to do contortions to access).
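/// For example (illustrative), sign-extending the i8 remainder result of an
/// SDIVREM to i32 becomes the second result of (sdivrem_sext_hreg x, y), which
/// yields the remainder already sign-extended out of AH.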
34390 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
34391 SDValue N0 = N->getOperand(0);
34392 auto OpcodeN = N->getOpcode();
34393 auto OpcodeN0 = N0.getOpcode();
34394 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
34395 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
34398 EVT VT = N->getValueType(0);
34399 EVT InVT = N0.getValueType();
34400 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
34403 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
34404 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
34405 : X86ISD::UDIVREM8_ZEXT_HREG;
34406 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
34408 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
34409 return R.getValue(1);
34412 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34413 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
34414 /// with UNDEFs) the input into vectors of the same size as the target type,
34415 /// which then extend the lowest elements.
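/// For example (illustrative), on a pre-AVX2 target a (v4i64 sext (v4i32 x))
/// is split into two 128-bit SIGN_EXTEND_VECTOR_INREG nodes whose results are
/// concatenated back into the v4i64 result.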
34416 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34417 TargetLowering::DAGCombinerInfo &DCI,
34418 const X86Subtarget &Subtarget) {
34419 unsigned Opcode = N->getOpcode();
34420 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34422 if (!DCI.isBeforeLegalizeOps())
34424 if (!Subtarget.hasSSE2())
34427 SDValue N0 = N->getOperand(0);
34428 EVT VT = N->getValueType(0);
34429 EVT SVT = VT.getScalarType();
34430 EVT InVT = N0.getValueType();
34431 EVT InSVT = InVT.getScalarType();
34433 // Input type must be a vector and we must be extending legal integer types.
34434 if (!VT.isVector())
34436 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34438 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34441 // On AVX2+ targets, if the input/output types are both legal then we will be
34442 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34443 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34444 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34449 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34450 EVT InVT = N.getValueType();
34451 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34452 Size / InVT.getScalarSizeInBits());
34453 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34454 DAG.getUNDEF(InVT));
34456 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34459 // If the target size is less than 128 bits, widen the input so its extension
34460 // is 128 bits wide, extend that, and extract the original target vector.
34461 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34462 unsigned Scale = 128 / VT.getSizeInBits();
34464 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34465 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34466 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34467 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34468 DAG.getIntPtrConstant(0, DL));
34471 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34472 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34473 // Also use this if we don't have SSE41 to allow the legalizer do its job.
34474 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34475 (VT.is256BitVector() && Subtarget.hasInt256()) ||
34476 (VT.is512BitVector() && Subtarget.hasAVX512())) {
34477 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34478 return Opcode == ISD::SIGN_EXTEND
34479 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34480 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34483 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34484 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34485 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34486 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34487 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34489 SmallVector<SDValue, 8> Opnds;
34490 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34491 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34492 DAG.getIntPtrConstant(Offset, DL));
34493 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34494 SrcVec = Opcode == ISD::SIGN_EXTEND
34495 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34496 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34497 Opnds.push_back(SrcVec);
34499 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34502 // On pre-AVX2 targets, split into 128-bit nodes of
34503 // ISD::*_EXTEND_VECTOR_INREG.
34504 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34505 return SplitAndExtendInReg(128);
34507 // On pre-AVX512 targets, split into 256-bit nodes of
34508 // ISD::*_EXTEND_VECTOR_INREG.
34509 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34510 return SplitAndExtendInReg(256);
34515 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34516 TargetLowering::DAGCombinerInfo &DCI,
34517 const X86Subtarget &Subtarget) {
34518 SDValue N0 = N->getOperand(0);
34519 EVT VT = N->getValueType(0);
34520 EVT InVT = N0.getValueType();
34523 if (SDValue DivRem8 = getDivRem8(N, DAG))
34526 if (!DCI.isBeforeLegalizeOps()) {
34527 if (InVT == MVT::i1) {
34528 SDValue Zero = DAG.getConstant(0, DL, VT);
34529 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34530 return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34535 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34536 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34537 // Inverting and sign-extending a boolean is the same as zero-extending and
34538 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
34539 // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34540 // sext (xor Bool, -1) --> sub (zext Bool), 1
34541 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34542 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34545 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34548 if (Subtarget.hasAVX() && VT.is256BitVector())
34549 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34552 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34558 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34559 const X86Subtarget &Subtarget) {
34561 EVT VT = N->getValueType(0);
34563 // Let legalize expand this if it isn't a legal type yet.
34564 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34567 EVT ScalarVT = VT.getScalarType();
34568 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34571 SDValue A = N->getOperand(0);
34572 SDValue B = N->getOperand(1);
34573 SDValue C = N->getOperand(2);
34575 auto invertIfNegative = [](SDValue &V) {
34576 if (SDValue NegVal = isFNEG(V.getNode())) {
34583 // Do not convert the passthru input of scalar intrinsics.
34584 // FIXME: We could allow negations of the lower element only.
34585 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34586 bool NegB = invertIfNegative(B);
34587 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34589 // The multiplication is negated when exactly one of NegA and NegB is set.
34590 bool NegMul = (NegA != NegB);
34592 unsigned NewOpcode;
34594 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34596 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34599 if (N->getOpcode() == X86ISD::FMADD_RND) {
34600 switch (NewOpcode) {
34601 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34602 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34603 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34604 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34606 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34607 switch (NewOpcode) {
34608 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34609 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34610 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34611 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34613 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34614 switch (NewOpcode) {
34615 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34616 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34617 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34618 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34621 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34622 "Unexpected opcode!");
34623 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34626 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34629 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34630 TargetLowering::DAGCombinerInfo &DCI,
34631 const X86Subtarget &Subtarget) {
34632 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34633 // (and (i32 x86isd::setcc_carry), 1)
34634 // This eliminates the zext. This transformation is necessary because
34635 // ISD::SETCC is always legalized to i8.
34637 SDValue N0 = N->getOperand(0);
34638 EVT VT = N->getValueType(0);
34640 if (N0.getOpcode() == ISD::AND &&
34642 N0.getOperand(0).hasOneUse()) {
34643 SDValue N00 = N0.getOperand(0);
34644 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34645 if (!isOneConstant(N0.getOperand(1)))
34647 return DAG.getNode(ISD::AND, dl, VT,
34648 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34649 N00.getOperand(0), N00.getOperand(1)),
34650 DAG.getConstant(1, dl, VT));
34654 if (N0.getOpcode() == ISD::TRUNCATE &&
34656 N0.getOperand(0).hasOneUse()) {
34657 SDValue N00 = N0.getOperand(0);
34658 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34659 return DAG.getNode(ISD::AND, dl, VT,
34660 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34661 N00.getOperand(0), N00.getOperand(1)),
34662 DAG.getConstant(1, dl, VT));
34666 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34669 if (VT.is256BitVector())
34670 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34673 if (SDValue DivRem8 = getDivRem8(N, DAG))
34676 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34679 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34685 /// Try to map a 128-bit or larger integer comparison to vector instructions
34686 /// before type legalization splits it up into chunks.
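/// For example (illustrative), an i128 equality compare of two values becomes
/// a PCMPEQB of their v16i8 bitcasts followed by PMOVMSKB and a scalar compare
/// of the resulting mask against 0xFFFF.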
34687 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34688 const X86Subtarget &Subtarget) {
34689 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34690 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34692 // We're looking for an oversized integer equality comparison, but ignore a
34693 // comparison with zero because that gets special treatment in EmitTest().
34694 SDValue X = SetCC->getOperand(0);
34695 SDValue Y = SetCC->getOperand(1);
34696 EVT OpVT = X.getValueType();
34697 unsigned OpSize = OpVT.getSizeInBits();
34698 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34701 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34702 // TODO: Add support for AVX-512.
34703 EVT VT = SetCC->getValueType(0);
34705 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34706 (OpSize == 256 && Subtarget.hasAVX2())) {
34707 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34708 SDValue VecX = DAG.getBitcast(VecVT, X);
34709 SDValue VecY = DAG.getBitcast(VecVT, Y);
34711 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34712 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34713 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34714 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34715 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34716 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34717 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34718 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34720 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34726 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34727 const X86Subtarget &Subtarget) {
34728 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34729 SDValue LHS = N->getOperand(0);
34730 SDValue RHS = N->getOperand(1);
34731 EVT VT = N->getValueType(0);
34734 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34735 EVT OpVT = LHS.getValueType();
34736 // 0-x == y --> x+y == 0
34737 // 0-x != y --> x+y != 0
34738 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34740 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34741 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34743 // x == 0-y --> x+y == 0
34744 // x != 0-y --> x+y != 0
34745 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34747 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34748 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34751 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34755 if (VT.getScalarType() == MVT::i1 &&
34756 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34758 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34759 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34760 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34762 if (!IsSEXT0 || !IsVZero1) {
34763 // Swap the operands and update the condition code.
34764 std::swap(LHS, RHS);
34765 CC = ISD::getSetCCSwappedOperands(CC);
34767 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34768 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34769 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34772 if (IsSEXT0 && IsVZero1) {
34773 assert(VT == LHS.getOperand(0).getValueType() &&
34774 "Unexpected operand type");
34775 if (CC == ISD::SETGT)
34776 return DAG.getConstant(0, DL, VT);
34777 if (CC == ISD::SETLE)
34778 return DAG.getConstant(1, DL, VT);
34779 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34780 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34782 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34783 "Unexpected condition code!");
34784 return LHS.getOperand(0);
34788 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34789 // to avoid scalarization via legalization because v4i32 is not a legal type.
34790 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34791 LHS.getValueType() == MVT::v4f32)
34792 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34797 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34799 // Gather and Scatter instructions use k-registers for masks. The type of
34800 // the masks is v*i1. So the mask will be truncated anyway.
34801 // The SIGN_EXTEND_INREG may be dropped.
34802 SDValue Mask = N->getOperand(2);
34803 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34804 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34805 NewOps[2] = Mask.getOperand(0);
34806 DAG.UpdateNodeOperands(N, NewOps);
34811 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34812 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34813 const X86Subtarget &Subtarget) {
34815 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34816 SDValue EFLAGS = N->getOperand(1);
34818 // Try to simplify the EFLAGS and condition code operands.
34819 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34820 return getSETCC(CC, Flags, DL, DAG);
34825 /// Optimize branch condition evaluation.
34826 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34827 const X86Subtarget &Subtarget) {
34829 SDValue EFLAGS = N->getOperand(3);
34830 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34832 // Try to simplify the EFLAGS and condition code operands.
34833 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34834 // RAUW them under us.
34835 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34836 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34837 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34838 N->getOperand(1), Cond, Flags);
34844 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34845 SelectionDAG &DAG) {
34846 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34847 // optimize away the operation when it is fed by a constant.
34849 // The general transformation is:
34850 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34851 // AND(VECTOR_CMP(x,y), constant2)
34852 // constant2 = UNARYOP(constant)
34854 // Early exit if this isn't a vector operation, the operand of the
34855 // unary operation isn't a bitwise AND, or if the sizes of the operations
34856 // aren't the same.
34857 EVT VT = N->getValueType(0);
34858 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34859 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34860 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34863 // Now check that the other operand of the AND is a constant. We could
34864 // make the transformation for non-constant splats as well, but it's unclear
34865 // that would be a benefit as it would not eliminate any operations, just
34866 // perform one more step in scalar code before moving to the vector unit.
34867 if (BuildVectorSDNode *BV =
34868 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34869 // Bail out if the vector isn't a constant.
34870 if (!BV->isConstant())
34873 // Everything checks out. Build up the new and improved node.
34875 EVT IntVT = BV->getValueType(0);
34876 // Create a new constant of the appropriate type for the transformed
34877 // DAG node.
34878 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34879 // The AND node needs bitcasts to/from an integer vector type around it.
34880 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34881 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34882 N->getOperand(0)->getOperand(0), MaskConst);
34883 SDValue Res = DAG.getBitcast(VT, NewAnd);
34890 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34891 const X86Subtarget &Subtarget) {
34892 SDValue Op0 = N->getOperand(0);
34893 EVT VT = N->getValueType(0);
34894 EVT InVT = Op0.getValueType();
34895 EVT InSVT = InVT.getScalarType();
34896 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34898 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34899 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34900 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34902 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34903 InVT.getVectorNumElements());
34904 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34906 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34907 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34909 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34912 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34913 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34914 // the optimization here.
34915 if (DAG.SignBitIsZero(Op0))
34916 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34921 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34922 const X86Subtarget &Subtarget) {
34923 // First try to optimize away the conversion entirely when it's
34924 // conditionally from a constant. Vectors only.
34925 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34928 // Now move on to more general possibilities.
34929 SDValue Op0 = N->getOperand(0);
34930 EVT VT = N->getValueType(0);
34931 EVT InVT = Op0.getValueType();
34932 EVT InSVT = InVT.getScalarType();
34934 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34935 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34936 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34937 if (InVT.isVector() &&
34938 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34939 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34941 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34942 InVT.getVectorNumElements());
34943 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34944 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34947 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34948 // vectors and scalars, see if we know that the upper bits are all the sign
34949 // bit, in which case we can truncate the input to i32 and convert from that.
34950 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34951 unsigned BitWidth = InVT.getScalarSizeInBits();
34952 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34953 if (NumSignBits >= (BitWidth - 31)) {
34954 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34955 if (InVT.isVector())
34956 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34957 InVT.getVectorNumElements());
34959 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34960 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34964 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34965 // a 32-bit target where SSE doesn't support i64->FP operations.
34966 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34967 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34968 EVT LdVT = Ld->getValueType(0);
34970 // This transformation is not supported if the result type is f16 or f128.
34971 if (VT == MVT::f16 || VT == MVT::f128)
34974 if (!Ld->isVolatile() && !VT.isVector() &&
34975 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34976 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34977 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34978 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34979 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34986 // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
34987 static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
34988 X86TargetLowering::DAGCombinerInfo &DCI) {
34989 // When legalizing carry, we create carries via add X, -1
34990 // If that comes from an actual carry, via setcc, we use the
34991 // carry directly.
34992 if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
34993 SDValue Carry = N->getOperand(0);
34994 while (Carry.getOpcode() == ISD::TRUNCATE ||
34995 Carry.getOpcode() == ISD::ZERO_EXTEND ||
34996 Carry.getOpcode() == ISD::SIGN_EXTEND ||
34997 Carry.getOpcode() == ISD::ANY_EXTEND ||
34998 (Carry.getOpcode() == ISD::AND &&
34999 isOneConstant(Carry.getOperand(1))))
35000 Carry = Carry.getOperand(0);
35002 if (Carry.getOpcode() == X86ISD::SETCC ||
35003 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
35004 if (Carry.getConstantOperandVal(0) == X86::COND_B)
35005 return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
35012 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
35013 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
35014 X86TargetLowering::DAGCombinerInfo &DCI) {
35015 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
35016 // the result is either zero or one (depending on the input carry bit).
35017 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
35018 if (X86::isZeroNode(N->getOperand(0)) &&
35019 X86::isZeroNode(N->getOperand(1)) &&
35020 // We don't have a good way to replace an EFLAGS use, so only do this when
35021 // the EFLAGS result is dead.
35022 SDValue(N, 1).use_empty()) {
35024 EVT VT = N->getValueType(0);
35025 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
35026 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
35027 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35028 DAG.getConstant(X86::COND_B, DL,
35031 DAG.getConstant(1, DL, VT));
35032 return DCI.CombineTo(N, Res1, CarryOut);
35038 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35039 /// which is more useful than 0/1 in some cases.
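/// For example (illustrative), when the carry flag is set this produces an
/// all-ones value via SETCC_CARRY; masking it with 1 afterwards recovers a
/// plain 0/1 result when that is what the caller needs.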
35040 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35042 // "Condition code B" is also known as "the carry flag" (CF).
35043 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
35044 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35045 MVT VT = N->getSimpleValueType(0);
35047 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35049 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35050 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35053 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
35054 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35055 /// with CMP+{ADC, SBB}.
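/// For example (illustrative):
///   X - (Z != 0) --> adc X, -1, (cmp Z, 1)
///   X + (Z == 0) --> adc X,  0, (cmp Z, 1)
/// so the boolean never needs to be materialized in a general purpose
/// register.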
35056 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35057 bool IsSub = N->getOpcode() == ISD::SUB;
35058 SDValue X = N->getOperand(0);
35059 SDValue Y = N->getOperand(1);
35061 // If this is an add, canonicalize a zext operand to the RHS.
35062 // TODO: Incomplete? What if both sides are zexts?
35063 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35064 Y.getOpcode() != ISD::ZERO_EXTEND)
35067 // Look through a one-use zext.
35068 bool PeekedThroughZext = false;
35069 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35070 Y = Y.getOperand(0);
35071 PeekedThroughZext = true;
35074 // If this is an add, canonicalize a setcc operand to the RHS.
35075 // TODO: Incomplete? What if both sides are setcc?
35076 // TODO: Should we allow peeking through a zext of the other operand?
35077 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35078 Y.getOpcode() != X86ISD::SETCC)
35081 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35085 EVT VT = N->getValueType(0);
35086 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35088 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35089 // the general case below.
35090 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35092 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35093 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35094 // This is a complicated way to get -1 or 0 from the carry flag:
35095 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35096 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35097 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35098 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35102 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35103 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35104 SDValue EFLAGS = Y->getOperand(1);
35105 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35106 EFLAGS.getValueType().isInteger() &&
35107 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35108 // Swap the operands of a SUB, and we have the same pattern as above.
35109 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35110 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
35111 SDValue NewSub = DAG.getNode(
35112 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35113 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35114 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35115 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35116 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35122 if (CC == X86::COND_B) {
35123 // X + SETB Z --> X + (mask SBB Z, Z)
35124 // X - SETB Z --> X - (mask SBB Z, Z)
35125 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35126 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
35127 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35128 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35129 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35132 if (CC == X86::COND_A) {
35133 SDValue EFLAGS = Y->getOperand(1);
35134 // Try to convert COND_A into COND_B in an attempt to facilitate
35135 // materializing "setb reg".
35137 // Do not flip "e > c", where "c" is a constant, because the CMP instruction
35138 // cannot take an immediate as its first operand.
35140 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35141 EFLAGS.getValueType().isInteger() &&
35142 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35143 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35144 EFLAGS.getNode()->getVTList(),
35145 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35146 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35147 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35148 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35149 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35150 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35154 if (CC != X86::COND_E && CC != X86::COND_NE)
35157 SDValue Cmp = Y.getOperand(1);
35158 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35159 !X86::isZeroNode(Cmp.getOperand(1)) ||
35160 !Cmp.getOperand(0).getValueType().isInteger())
35163 SDValue Z = Cmp.getOperand(0);
35164 EVT ZVT = Z.getValueType();
35166 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35167 // the general case below.
35169 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35170 // fake operands:
35171 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35172 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35173 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35174 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35175 SDValue Zero = DAG.getConstant(0, DL, ZVT);
35176 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
35177 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35178 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35179 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35180 SDValue(Neg.getNode(), 1));
35183 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35184 // with fake operands:
35185 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35186 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35187 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35188 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35189 SDValue One = DAG.getConstant(1, DL, ZVT);
35190 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35191 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35192 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35196 // (cmp Z, 1) sets the carry flag if Z is 0.
35197 SDValue One = DAG.getConstant(1, DL, ZVT);
35198 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35200 // Add the flags type for ADC/SBB nodes.
35201 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35203 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35204 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35205 if (CC == X86::COND_NE)
35206 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35207 DAG.getConstant(-1ULL, DL, VT), Cmp1);
35209 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
35210 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
35211 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35212 DAG.getConstant(0, DL, VT), Cmp1);
35215 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35216 const X86Subtarget &Subtarget) {
35217 SDValue MulOp = N->getOperand(0);
35218 SDValue Phi = N->getOperand(1);
35220 if (MulOp.getOpcode() != ISD::MUL)
35221 std::swap(MulOp, Phi);
35222 if (MulOp.getOpcode() != ISD::MUL)
35226 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35229 EVT VT = N->getValueType(0);
35231 unsigned RegSize = 128;
35232 if (Subtarget.hasBWI())
35234 else if (Subtarget.hasAVX2())
35236 unsigned VectorSize = VT.getVectorNumElements() * 16;
35237 // If the vector size is less than 128, or greater than the supported RegSize,
35238 // do not use PMADD.
35239 if (VectorSize < 128 || VectorSize > RegSize)
35243 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35244 VT.getVectorNumElements());
35245 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35246 VT.getVectorNumElements() / 2);
35248 // Shrink the operands of mul.
35249 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35250 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35252 // Madd vector size is half of the original vector size
35253 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35254 // Fill the rest of the output with 0
35255 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35256 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35257 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35260 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35261 const X86Subtarget &Subtarget) {
35263 EVT VT = N->getValueType(0);
35264 SDValue Op0 = N->getOperand(0);
35265 SDValue Op1 = N->getOperand(1);
35267 // TODO: There's nothing special about i32, any integer type above i16 should
35268 // work just as well.
35269 if (!VT.isVector() || !VT.isSimple() ||
35270 !(VT.getVectorElementType() == MVT::i32))
35273 unsigned RegSize = 128;
35274 if (Subtarget.hasBWI())
35276 else if (Subtarget.hasAVX2())
35279 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35280 // TODO: We should be able to handle larger vectors by splitting them before
35281 // feeding them into several SADs, and then reducing over those.
35282 if (VT.getSizeInBits() / 4 > RegSize)
35285 // We know N is a reduction add, which means one of its operands is a phi.
35286 // To match SAD, we need the other operand to be a vector select.
35287 SDValue SelectOp, Phi;
35288 if (Op0.getOpcode() == ISD::VSELECT) {
35291 } else if (Op1.getOpcode() == ISD::VSELECT) {
35297 // Check whether we have an abs-diff pattern feeding into the select.
35298 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
35301 // SAD pattern detected. Now build a SAD instruction and an addition for
35302 // reduction. Note that the number of elements of the result of SAD is less
35303 // than the number of elements of its input. Therefore, we can only update
35304 // part of the elements in the reduction vector.
35305 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35307 // The output of PSADBW is a vector of i64.
35308 // We need to turn the vector of i64 into a vector of i32.
35309 // If the reduction vector is at least as wide as the psadbw result, just
35310 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35311 // anyway.
35312 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35313 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35314 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35316 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35318 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35319 // Update part of the elements of the reduction vector. This is done by first
35320 // extracting a sub-vector from it, updating this sub-vector, and inserting
35321 // it back.
35322 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
35323 DAG.getIntPtrConstant(0, DL));
35324 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
35325 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
35326 DAG.getIntPtrConstant(0, DL));
35328 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
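// Illustrative shape of the pattern handled here (exposition only, not part
// of the upstream source): the abs-diff operand typically looks like
//   d = vselect (setcc %a, %b, ugt), (sub (zext %a), (zext %b)),
//                                    (sub (zext %b), (zext %a))
//   r = add d, %phi                        ; vector-reduction add
// and is rewritten to use PSADBW, which sums the absolute byte differences
// of each 8-byte group into a single i64 lane; the narrower result is then
// bitcast or truncated to i32 lanes and added into the reduction vector.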
35331 /// Convert vector increment or decrement to sub/add with an all-ones constant:
35332 /// add X, <1, 1...> --> sub X, <-1, -1...>
35333 /// sub X, <1, 1...> --> add X, <-1, -1...>
35334 /// The all-ones vector constant can be materialized using a pcmpeq instruction
35335 /// that is commonly recognized as an idiom (has no register dependency), so
35336 /// that's better/smaller than loading a splat 1 constant.
35337 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35338 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35339 "Unexpected opcode for increment/decrement transform");
35341 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35342 // out and wait for legalization if we have an unsupported vector length.
35343 EVT VT = N->getValueType(0);
35344 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35347 SDNode *N1 = N->getOperand(1).getNode();
35349 if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
35352 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35353 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35354 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
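// For illustration (not from the original source): on SSE2 the rewritten
// form avoids a constant-pool load, e.g. for a v4i32 increment:
//   pcmpeqd %xmm1, %xmm1        ; xmm1 = <-1, -1, -1, -1>
//   psubd   %xmm1, %xmm0        ; x - (-1) == x + 1
// whereas "add x, <1,1,1,1>" would normally have to load the splat-1
// constant from memory.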
35357 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
35358 const X86Subtarget &Subtarget) {
35359 const SDNodeFlags Flags = N->getFlags();
35360 if (Flags.hasVectorReduction()) {
35361 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
35363 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
35366 EVT VT = N->getValueType(0);
35367 SDValue Op0 = N->getOperand(0);
35368 SDValue Op1 = N->getOperand(1);
35370 // Try to synthesize horizontal adds from adds of shuffles.
35371 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35372 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35373 isHorizontalBinOp(Op0, Op1, true))
35374 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
35376 if (SDValue V = combineIncDecVector(N, DAG))
35379 return combineAddOrSubToADCOrSBB(N, DAG);
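// Roughly, one shape accepted by the horizontal-add check above (sketch for
// exposition only): for v4i32,
//   lhs = shuffle %x, %y, <0, 2, 4, 6>
//   rhs = shuffle %x, %y, <1, 3, 5, 7>
//   add lhs, rhs
// becomes X86ISD::HADD %x, %y (PHADDD), which adds adjacent element pairs
// of its two inputs: <x0+x1, x2+x3, y0+y1, y2+y3>.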
35382 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
35383 const X86Subtarget &Subtarget) {
35384 SDValue Op0 = N->getOperand(0);
35385 SDValue Op1 = N->getOperand(1);
35387 // X86 can't encode an immediate LHS of a sub. See if we can push the
35388 // negation into a preceding instruction.
35389 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
35390 // If the RHS of the sub is an XOR with one use and a constant, invert the
35391 // immediate. Then add one to the LHS of the sub so we can turn
35392 // X-Y -> X+~Y+1, saving one register.
35393 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
35394 isa<ConstantSDNode>(Op1.getOperand(1))) {
35395 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
35396 EVT VT = Op0.getValueType();
35397 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
35399 DAG.getConstant(~XorC, SDLoc(Op1), VT));
35400 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
35401 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
35405 // Try to synthesize horizontal subs from subs of shuffles.
35406 EVT VT = N->getValueType(0);
35407 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35408 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35409 isHorizontalBinOp(Op0, Op1, false))
35410 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
35412 if (SDValue V = combineIncDecVector(N, DAG))
35415 return combineAddOrSubToADCOrSBB(N, DAG);
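// Worked example for the constant-LHS rewrite above (illustrative only):
//   sub 5, (xor %y, 3)
// becomes
//   add (xor %y, ~3), 6
// since C - (Y ^ K) == ~(Y ^ K) + C + 1 == (Y ^ ~K) + (C + 1), and the new
// constant folds into the add instead of forcing the immediate LHS of the
// sub into a register.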
35418 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
35419 TargetLowering::DAGCombinerInfo &DCI,
35420 const X86Subtarget &Subtarget) {
35421 if (DCI.isBeforeLegalize())
35425 unsigned Opcode = N->getOpcode();
35426 MVT VT = N->getSimpleValueType(0);
35427 MVT SVT = VT.getVectorElementType();
35428 unsigned NumElts = VT.getVectorNumElements();
35429 unsigned EltSizeInBits = SVT.getSizeInBits();
35431 SDValue Op = N->getOperand(0);
35432 MVT OpVT = Op.getSimpleValueType();
35433 MVT OpEltVT = OpVT.getVectorElementType();
35434 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
35435 unsigned InputBits = OpEltSizeInBits * NumElts;
35437 // Perform any constant folding.
35438 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
35440 SmallVector<APInt, 64> EltBits;
35441 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
35442 APInt Undefs(NumElts, 0);
35443 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
35445 bool IsZEXT = (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
35446 for (unsigned i = 0; i != NumElts; ++i) {
35447 if (UndefElts[i]) {
35451 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
35452 : EltBits[i].sextOrTrunc(EltSizeInBits);
35454 return getConstVector(Vals, Undefs, VT, DAG, DL);
35457 // (vzext (bitcast (vzext x))) -> (vzext x)
35458 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
35459 SDValue V = peekThroughBitcasts(Op);
35460 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
35461 MVT InnerVT = V.getSimpleValueType();
35462 MVT InnerEltVT = InnerVT.getVectorElementType();
35464 // If the element sizes match exactly, we can just do one larger vzext. This
35465 // is always an exact type match as vzext operates on integer types.
35466 if (OpEltVT == InnerEltVT) {
35467 assert(OpVT == InnerVT && "Types must match for vzext!");
35468 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
35471 // The only other way we can combine them is if only a single element of the
35472 // inner vzext is used in the input to the outer vzext.
35473 if (InnerEltVT.getSizeInBits() < InputBits)
35476 // In this case, the inner vzext is completely dead because we're going to
35477 // only look at bits inside of the low element. Just do the outer vzext on
35478 // a bitcast of the input to the inner.
35479 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
35482 // Check if we can bypass extracting and re-inserting an element of an input
35483 // vector. Essentially:
35484 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
35485 // TODO: Add X86ISD::VSEXT support
35486 if (Opcode == X86ISD::VZEXT &&
35487 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35488 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35489 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
35490 SDValue ExtractedV = V.getOperand(0);
35491 SDValue OrigV = ExtractedV.getOperand(0);
35492 if (isNullConstant(ExtractedV.getOperand(1))) {
35493 MVT OrigVT = OrigV.getSimpleValueType();
35494 // Extract a subvector if necessary...
35495 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
35496 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
35497 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
35498 OrigVT.getVectorNumElements() / Ratio);
35499 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
35500 DAG.getIntPtrConstant(0, DL));
35502 Op = DAG.getBitcast(OpVT, OrigV);
35503 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
35510 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
35511 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
35512 const X86Subtarget &Subtarget) {
35513 SDValue Chain = N->getOperand(0);
35514 SDValue LHS = N->getOperand(1);
35515 SDValue RHS = N->getOperand(2);
35516 MVT VT = RHS.getSimpleValueType();
35519 auto *C = dyn_cast<ConstantSDNode>(RHS);
35520 if (!C || C->getZExtValue() != 1)
35523 RHS = DAG.getConstant(-1, DL, VT);
35524 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
35525 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
35526 DAG.getVTList(MVT::i32, MVT::Other),
35527 {Chain, LHS, RHS}, VT, MMO);
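// Illustrative effect of this canonicalization (exposition only):
//   lock subl $1, (%rdi)    is rewritten as    lock addl $-1, (%rdi)
// Both forms update memory identically, and funnelling the subtract into
// the LADD node means later matching only has to recognize a single shape.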
35530 // TEST (AND a, b), (AND a, b) -> TEST a, b
35531 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
35532 SDValue Op0 = N->getOperand(0);
35533 SDValue Op1 = N->getOperand(1);
35535 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
35538 EVT VT = N->getValueType(0);
35541 return DAG.getNode(X86ISD::TESTM, DL, VT,
35542 Op0->getOperand(0), Op0->getOperand(1));
35545 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35546 const X86Subtarget &Subtarget) {
35547 MVT VT = N->getSimpleValueType(0);
35550 if (N->getOperand(0) == N->getOperand(1)) {
35551 if (N->getOpcode() == X86ISD::PCMPEQ)
35552 return getOnesVector(VT, DAG, DL);
35553 if (N->getOpcode() == X86ISD::PCMPGT)
35554 return getZeroVector(VT, Subtarget, DAG, DL);
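// Quick example (illustrative only): "pcmpeqd %xmm0, %xmm0" always produces
// all-ones and "pcmpgtd %xmm0, %xmm0" always produces all-zeros, so a
// self-compare can be folded to the corresponding constant vector without
// emitting the compare at all.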
35560 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35561 TargetLowering::DAGCombinerInfo &DCI,
35562 const X86Subtarget &Subtarget) {
35563 if (DCI.isBeforeLegalizeOps())
35567 SDValue Vec = N->getOperand(0);
35568 SDValue SubVec = N->getOperand(1);
35569 SDValue Idx = N->getOperand(2);
35571 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35572 MVT OpVT = N->getSimpleValueType(0);
35573 MVT SubVecVT = SubVec.getSimpleValueType();
35575 // If this is an insert of an extract, combine to a shuffle. Don't do this
35576 // if the insert or extract can be represented with a subvector operation.
35577 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35578 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35579 (IdxVal != 0 || !Vec.isUndef())) {
35580 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
35581 if (ExtIdxVal != 0) {
35582 int VecNumElts = OpVT.getVectorNumElements();
35583 int SubVecNumElts = SubVecVT.getVectorNumElements();
35584 SmallVector<int, 64> Mask(VecNumElts);
35585 // First create an identity shuffle mask.
35586 for (int i = 0; i != VecNumElts; ++i)
35588 // Now insert the extracted portion.
35589 for (int i = 0; i != SubVecNumElts; ++i)
35590 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35592 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35596 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte load:
35598 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35599 // (load16 addr + 16), Elts/2)
35602 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35603 // (load32 addr + 32), Elts/2)
35605 // or a 16-byte or 32-byte broadcast:
35606 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35607 // (load16 addr), Elts/2)
35608 // --> X86SubVBroadcast(load16 addr)
35610 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35611 // (load32 addr), Elts/2)
35612 // --> X86SubVBroadcast(load32 addr)
35613 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35614 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35615 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35616 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35617 if (Idx2 && Idx2->getZExtValue() == 0) {
35618 SDValue SubVec2 = Vec.getOperand(1);
35619 // If needed, look through bitcasts to get to the load.
35620 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35622 unsigned Alignment = FirstLd->getAlignment();
35623 unsigned AS = FirstLd->getAddressSpace();
35624 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35625 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35626 OpVT, AS, Alignment, &Fast) && Fast) {
35627 SDValue Ops[] = {SubVec2, SubVec};
35628 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35633 // If lower/upper loads are the same and the only users of the load, then
35634 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35635 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35636 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35637 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35638 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35641 // If this is subv_broadcast inserted into both halves, use a larger subv_broadcast.
35643 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35644 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35645 SubVec.getOperand(0));
35654 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35655 DAGCombinerInfo &DCI) const {
35656 SelectionDAG &DAG = DCI.DAG;
35657 switch (N->getOpcode()) {
35659 case ISD::EXTRACT_VECTOR_ELT:
35660 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35661 case X86ISD::PEXTRW:
35662 case X86ISD::PEXTRB:
35663 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35664 case ISD::INSERT_SUBVECTOR:
35665 return combineInsertSubvector(N, DAG, DCI, Subtarget);
35668 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35669 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
35670 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
35671 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
35672 case ISD::SUB: return combineSub(N, DAG, Subtarget);
35673 case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
35674 case X86ISD::ADC: return combineADC(N, DAG, DCI);
35675 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35678 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35679 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35680 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35681 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35682 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35683 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35684 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35685 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35686 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35687 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35689 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35690 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35691 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35692 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35693 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35694 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35696 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35698 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35700 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35701 case X86ISD::BT: return combineBT(N, DAG, DCI);
35702 case ISD::ANY_EXTEND:
35703 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35704 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35705 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35706 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35707 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35708 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35709 case X86ISD::VSHLI:
35710 case X86ISD::VSRAI:
35711 case X86ISD::VSRLI:
35712 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35713 case ISD::SIGN_EXTEND_VECTOR_INREG:
35714 case ISD::ZERO_EXTEND_VECTOR_INREG:
35715 case X86ISD::VSEXT:
35716 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35717 case X86ISD::PINSRB:
35718 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35719 case X86ISD::SHUFP: // Handle all target specific shuffles
35720 case X86ISD::INSERTPS:
35721 case X86ISD::EXTRQI:
35722 case X86ISD::INSERTQI:
35723 case X86ISD::PALIGNR:
35724 case X86ISD::VSHLDQ:
35725 case X86ISD::VSRLDQ:
35726 case X86ISD::BLENDI:
35727 case X86ISD::UNPCKH:
35728 case X86ISD::UNPCKL:
35729 case X86ISD::MOVHLPS:
35730 case X86ISD::MOVLHPS:
35731 case X86ISD::PSHUFB:
35732 case X86ISD::PSHUFD:
35733 case X86ISD::PSHUFHW:
35734 case X86ISD::PSHUFLW:
35735 case X86ISD::MOVSHDUP:
35736 case X86ISD::MOVSLDUP:
35737 case X86ISD::MOVDDUP:
35738 case X86ISD::MOVSS:
35739 case X86ISD::MOVSD:
35740 case X86ISD::VPPERM:
35741 case X86ISD::VPERMI:
35742 case X86ISD::VPERMV:
35743 case X86ISD::VPERMV3:
35744 case X86ISD::VPERMIV3:
35745 case X86ISD::VPERMIL2:
35746 case X86ISD::VPERMILPI:
35747 case X86ISD::VPERMILPV:
35748 case X86ISD::VPERM2X128:
35749 case X86ISD::VZEXT_MOVL:
35750 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35751 case X86ISD::FMADD:
35752 case X86ISD::FMADD_RND:
35753 case X86ISD::FMADDS1_RND:
35754 case X86ISD::FMADDS3_RND:
35755 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35757 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35758 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35759 case X86ISD::TESTM: return combineTestM(N, DAG);
35760 case X86ISD::PCMPEQ:
35761 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35767 /// Return true if the target has native support for the specified value type
35768 /// and it is 'desirable' to use the type for the given node type. E.g., on x86,
35769 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35770 /// some i16 instructions are slow.
35771 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35772 if (!isTypeLegal(VT))
35774 if (VT != MVT::i16)
35781 case ISD::SIGN_EXTEND:
35782 case ISD::ZERO_EXTEND:
35783 case ISD::ANY_EXTEND:
35796 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35797 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35798 /// we don't adjust the stack we clobber the first frame index.
35799 /// See X86InstrInfo::copyPhysReg.
35800 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35801 const MachineRegisterInfo &MRI = MF.getRegInfo();
35802 return any_of(MRI.reg_instructions(X86::EFLAGS),
35803 [](const MachineInstr &RI) { return RI.isCopy(); });
35806 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35807 if (hasCopyImplyingStackAdjustment(MF)) {
35808 MachineFrameInfo &MFI = MF.getFrameInfo();
35809 MFI.setHasCopyImplyingStackAdjustment(true);
35812 TargetLoweringBase::finalizeLowering(MF);
35815 /// This method queries the target whether it is beneficial for the dag combiner to
35816 /// promote the specified node. If true, it should return the desired promotion
35817 /// type by reference.
35818 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35819 EVT VT = Op.getValueType();
35820 if (VT != MVT::i16)
35823 bool Promote = false;
35824 bool Commute = false;
35825 switch (Op.getOpcode()) {
35827 case ISD::SIGN_EXTEND:
35828 case ISD::ZERO_EXTEND:
35829 case ISD::ANY_EXTEND:
35834 SDValue N0 = Op.getOperand(0);
35835 // Look out for (store (shl (load), x)).
35836 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35849 SDValue N0 = Op.getOperand(0);
35850 SDValue N1 = Op.getOperand(1);
35851 if (!Commute && MayFoldLoad(N1))
35853 // Avoid disabling potential load folding opportunities.
35854 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35856 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35866 //===----------------------------------------------------------------------===//
35867 // X86 Inline Assembly Support
35868 //===----------------------------------------------------------------------===//
35870 // Helper to match a string against a sequence of whitespace-separated pieces.
35871 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35872 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35874 for (StringRef Piece : Pieces) {
35875 if (!S.startswith(Piece)) // Check if the piece matches.
35878 S = S.substr(Piece.size());
35879 StringRef::size_type Pos = S.find_first_not_of(" \t");
35880 if (Pos == 0) // We matched a prefix.
35889 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35891 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35892 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35893 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35894 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35896 if (AsmPieces.size() == 3)
35898 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35905 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35906 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35908 const std::string &AsmStr = IA->getAsmString();
35910 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35911 if (!Ty || Ty->getBitWidth() % 16 != 0)
35914 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35915 SmallVector<StringRef, 4> AsmPieces;
35916 SplitString(AsmStr, AsmPieces, ";\n");
35918 switch (AsmPieces.size()) {
35919 default: return false;
35921 // FIXME: this should verify that we are targeting a 486 or better. If not,
35922 // we will turn this bswap into something that will be lowered to logical
35923 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35924 // lower so don't worry about this.
35926 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35927 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35928 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35929 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35930 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35931 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35932 // No need to check constraints, nothing other than the equivalent of
35933 // "=r,0" would be valid here.
35934 return IntrinsicLowering::LowerToByteSwap(CI);
35937 // rorw $$8, ${0:w} --> llvm.bswap.i16
35938 if (CI->getType()->isIntegerTy(16) &&
35939 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35940 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35941 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35943 StringRef ConstraintsStr = IA->getConstraintString();
35944 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35945 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35946 if (clobbersFlagRegisters(AsmPieces))
35947 return IntrinsicLowering::LowerToByteSwap(CI);
35951 if (CI->getType()->isIntegerTy(32) &&
35952 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35953 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
35954 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
35955 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
35957 StringRef ConstraintsStr = IA->getConstraintString();
35958 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35959 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35960 if (clobbersFlagRegisters(AsmPieces))
35961 return IntrinsicLowering::LowerToByteSwap(CI);
35964 if (CI->getType()->isIntegerTy(64)) {
35965 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
35966 if (Constraints.size() >= 2 &&
35967 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
35968 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
35969 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
35970 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
35971 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
35972 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
35973 return IntrinsicLowering::LowerToByteSwap(CI);
35981 /// Given a constraint letter, return the type of constraint for this target.
35982 X86TargetLowering::ConstraintType
35983 X86TargetLowering::getConstraintType(StringRef Constraint) const {
35984 if (Constraint.size() == 1) {
35985 switch (Constraint[0]) {
35997 return C_RegisterClass;
35998 case 'k': // AVX512 masking registers.
36022 else if (Constraint.size() == 2) {
36023 switch (Constraint[0]) {
36027 switch (Constraint[1]) {
36035 return TargetLowering::getConstraintType(Constraint);
36038 /// Examine constraint type and operand type and determine a weight value.
36039 /// This object must already have been set up with the operand type
36040 /// and the current alternative constraint selected.
36041 TargetLowering::ConstraintWeight
36042 X86TargetLowering::getSingleConstraintMatchWeight(
36043 AsmOperandInfo &info, const char *constraint) const {
36044 ConstraintWeight weight = CW_Invalid;
36045 Value *CallOperandVal = info.CallOperandVal;
36046 // If we don't have a value, we can't do a match,
36047 // but allow it at the lowest weight.
36048 if (!CallOperandVal)
36050 Type *type = CallOperandVal->getType();
36051 // Look at the constraint type.
36052 switch (*constraint) {
36054 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
36066 if (CallOperandVal->getType()->isIntegerTy())
36067 weight = CW_SpecificReg;
36072 if (type->isFloatingPointTy())
36073 weight = CW_SpecificReg;
36076 if (type->isX86_MMXTy() && Subtarget.hasMMX())
36077 weight = CW_SpecificReg;
36080 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
36081 if (constraint[1] == 'k') {
36082 // Support for 'Yk' (similarly to the 'k' variant below).
36083 weight = CW_SpecificReg;
36086 // Else fall through (handle "Y" constraint).
36089 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
36090 weight = CW_Register;
36093 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
36094 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
36095 weight = CW_Register;
36098 // Enable conditional vector operations using %k<#> registers.
36099 weight = CW_SpecificReg;
36102 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
36103 if (C->getZExtValue() <= 31)
36104 weight = CW_Constant;
36108 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36109 if (C->getZExtValue() <= 63)
36110 weight = CW_Constant;
36114 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36115 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
36116 weight = CW_Constant;
36120 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36121 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
36122 weight = CW_Constant;
36126 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36127 if (C->getZExtValue() <= 3)
36128 weight = CW_Constant;
36132 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36133 if (C->getZExtValue() <= 0xff)
36134 weight = CW_Constant;
36139 if (isa<ConstantFP>(CallOperandVal)) {
36140 weight = CW_Constant;
36144 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36145 if ((C->getSExtValue() >= -0x80000000LL) &&
36146 (C->getSExtValue() <= 0x7fffffffLL))
36147 weight = CW_Constant;
36151 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36152 if (C->getZExtValue() <= 0xffffffff)
36153 weight = CW_Constant;
36160 /// Try to replace an X constraint, which matches anything, with another that
36161 /// has more specific requirements based on the type of the corresponding
36163 const char *X86TargetLowering::
36164 LowerXConstraint(EVT ConstraintVT) const {
36165 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
36166 // 'f' like normal targets.
36167 if (ConstraintVT.isFloatingPoint()) {
36168 if (Subtarget.hasSSE2())
36170 if (Subtarget.hasSSE1())
36174 return TargetLowering::LowerXConstraint(ConstraintVT);
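// Usage example (illustrative only): for inline assembly such as
//   float f; asm("" : "=X"(f));
// the generic "X" constraint on a floating-point operand is narrowed to an
// SSE register-class constraint when SSE is available, so the operand lives
// in an XMM register instead of defaulting to the x87 'f' class.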
36177 /// Lower the specified operand into the Ops vector.
36178 /// If it is invalid, don't add anything to Ops.
36179 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
36180 std::string &Constraint,
36181 std::vector<SDValue>&Ops,
36182 SelectionDAG &DAG) const {
36185 // Only support length 1 constraints for now.
36186 if (Constraint.length() > 1) return;
36188 char ConstraintLetter = Constraint[0];
36189 switch (ConstraintLetter) {
36192 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36193 if (C->getZExtValue() <= 31) {
36194 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36195 Op.getValueType());
36201 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36202 if (C->getZExtValue() <= 63) {
36203 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36204 Op.getValueType());
36210 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36211 if (isInt<8>(C->getSExtValue())) {
36212 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36213 Op.getValueType());
36219 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36220 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
36221 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
36222 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
36223 Op.getValueType());
36229 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36230 if (C->getZExtValue() <= 3) {
36231 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36232 Op.getValueType());
36238 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36239 if (C->getZExtValue() <= 255) {
36240 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36241 Op.getValueType());
36247 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36248 if (C->getZExtValue() <= 127) {
36249 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36250 Op.getValueType());
36256 // 32-bit signed value
36257 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36258 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36259 C->getSExtValue())) {
36260 // Widen to 64 bits here to get it sign extended.
36261 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
36264 // FIXME gcc accepts some relocatable values here too, but only in certain
36265 // memory models; it's complicated.
36270 // 32-bit unsigned value
36271 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36272 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36273 C->getZExtValue())) {
36274 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36275 Op.getValueType());
36279 // FIXME gcc accepts some relocatable values here too, but only in certain
36280 // memory models; it's complicated.
36284 // Literal immediates are always ok.
36285 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
36286 // Widen to 64 bits here to get it sign extended.
36287 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
36291 // In any sort of PIC mode addresses need to be computed at runtime by
36292 // adding in a register or some sort of table lookup. These can't
36293 // be used as immediates.
36294 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
36297 // If we are in non-pic codegen mode, we allow the address of a global (with
36298 // an optional displacement) to be used with 'i'.
36299 GlobalAddressSDNode *GA = nullptr;
36300 int64_t Offset = 0;
36302 // Match either (GA), (GA+C), (GA+C1+C2), etc.
36304 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
36305 Offset += GA->getOffset();
36307 } else if (Op.getOpcode() == ISD::ADD) {
36308 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36309 Offset += C->getZExtValue();
36310 Op = Op.getOperand(0);
36313 } else if (Op.getOpcode() == ISD::SUB) {
36314 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36315 Offset += -C->getZExtValue();
36316 Op = Op.getOperand(0);
36321 // Otherwise, this isn't something we can handle, reject it.
36325 const GlobalValue *GV = GA->getGlobal();
36326 // If we require an extra load to get this address, as in PIC mode, we
36327 // can't accept it.
36328 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
36331 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
36332 GA->getValueType(0), Offset);
36337 if (Result.getNode()) {
36338 Ops.push_back(Result);
36341 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
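// Usage example (illustrative only):
//   int x; asm("shll %1, %0" : "+r"(x) : "I"(3));
// The 'I' case above accepts the constant because it lies in [0, 31] and
// rewrites it as a target constant; an out-of-range value would not be
// added to Ops and would ultimately be rejected as an invalid asm operand.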
36344 /// Check if \p RC is a general purpose register class.
36345 /// I.e., GR* or one of their variants.
36346 static bool isGRClass(const TargetRegisterClass &RC) {
36347 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
36348 RC.hasSuperClassEq(&X86::GR16RegClass) ||
36349 RC.hasSuperClassEq(&X86::GR32RegClass) ||
36350 RC.hasSuperClassEq(&X86::GR64RegClass) ||
36351 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
36354 /// Check if \p RC is a vector register class.
36355 /// I.e., FR* / VR* or one of their variants.
36356 static bool isFRClass(const TargetRegisterClass &RC) {
36357 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
36358 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
36359 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
36360 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
36361 RC.hasSuperClassEq(&X86::VR512RegClass);
36364 std::pair<unsigned, const TargetRegisterClass *>
36365 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
36366 StringRef Constraint,
36368 // First, see if this is a constraint that directly corresponds to an LLVM
36370 if (Constraint.size() == 1) {
36371 // GCC Constraint Letters
36372 switch (Constraint[0]) {
36374 // TODO: Slight differences here in allocation order and leaving
36375 // RIP in the class. Do they matter any more here than they do
36376 // in the normal allocation?
36378 if (Subtarget.hasAVX512()) {
36379 // Only supported in AVX512 or later.
36380 switch (VT.SimpleTy) {
36383 return std::make_pair(0U, &X86::VK32RegClass);
36385 return std::make_pair(0U, &X86::VK16RegClass);
36387 return std::make_pair(0U, &X86::VK8RegClass);
36389 return std::make_pair(0U, &X86::VK1RegClass);
36391 return std::make_pair(0U, &X86::VK64RegClass);
36395 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
36396 if (Subtarget.is64Bit()) {
36397 if (VT == MVT::i32 || VT == MVT::f32)
36398 return std::make_pair(0U, &X86::GR32RegClass);
36399 if (VT == MVT::i16)
36400 return std::make_pair(0U, &X86::GR16RegClass);
36401 if (VT == MVT::i8 || VT == MVT::i1)
36402 return std::make_pair(0U, &X86::GR8RegClass);
36403 if (VT == MVT::i64 || VT == MVT::f64)
36404 return std::make_pair(0U, &X86::GR64RegClass);
36408 // 32-bit fallthrough
36409 case 'Q': // Q_REGS
36410 if (VT == MVT::i32 || VT == MVT::f32)
36411 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
36412 if (VT == MVT::i16)
36413 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
36414 if (VT == MVT::i8 || VT == MVT::i1)
36415 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
36416 if (VT == MVT::i64)
36417 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
36419 case 'r': // GENERAL_REGS
36420 case 'l': // INDEX_REGS
36421 if (VT == MVT::i8 || VT == MVT::i1)
36422 return std::make_pair(0U, &X86::GR8RegClass);
36423 if (VT == MVT::i16)
36424 return std::make_pair(0U, &X86::GR16RegClass);
36425 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
36426 return std::make_pair(0U, &X86::GR32RegClass);
36427 return std::make_pair(0U, &X86::GR64RegClass);
36428 case 'R': // LEGACY_REGS
36429 if (VT == MVT::i8 || VT == MVT::i1)
36430 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
36431 if (VT == MVT::i16)
36432 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
36433 if (VT == MVT::i32 || !Subtarget.is64Bit())
36434 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
36435 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
36436 case 'f': // FP Stack registers.
36437 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
36438 // value to the correct fpstack register class.
36439 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
36440 return std::make_pair(0U, &X86::RFP32RegClass);
36441 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
36442 return std::make_pair(0U, &X86::RFP64RegClass);
36443 return std::make_pair(0U, &X86::RFP80RegClass);
36444 case 'y': // MMX_REGS if MMX allowed.
36445 if (!Subtarget.hasMMX()) break;
36446 return std::make_pair(0U, &X86::VR64RegClass);
36447 case 'Y': // SSE_REGS if SSE2 allowed
36448 if (!Subtarget.hasSSE2()) break;
36451 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
36452 if (!Subtarget.hasSSE1()) break;
36453 bool VConstraint = (Constraint[0] == 'v');
36455 switch (VT.SimpleTy) {
36457 // Scalar SSE types.
36460 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
36461 return std::make_pair(0U, &X86::FR32XRegClass);
36462 return std::make_pair(0U, &X86::FR32RegClass);
36465 if (VConstraint && Subtarget.hasVLX())
36466 return std::make_pair(0U, &X86::FR64XRegClass);
36467 return std::make_pair(0U, &X86::FR64RegClass);
36468 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36476 if (VConstraint && Subtarget.hasVLX())
36477 return std::make_pair(0U, &X86::VR128XRegClass);
36478 return std::make_pair(0U, &X86::VR128RegClass);
36486 if (VConstraint && Subtarget.hasVLX())
36487 return std::make_pair(0U, &X86::VR256XRegClass);
36488 return std::make_pair(0U, &X86::VR256RegClass);
36493 return std::make_pair(0U, &X86::VR512RegClass);
36497 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
36498 switch (Constraint[1]) {
36502 // This register class doesn't allocate k0 for masked vector operations.
36503 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
36504 switch (VT.SimpleTy) {
36507 return std::make_pair(0U, &X86::VK32WMRegClass);
36509 return std::make_pair(0U, &X86::VK16WMRegClass);
36511 return std::make_pair(0U, &X86::VK8WMRegClass);
36513 return std::make_pair(0U, &X86::VK1WMRegClass);
36515 return std::make_pair(0U, &X86::VK64WMRegClass);
36522 // Use the default implementation in TargetLowering to convert the register
36523 // constraint into a member of a register class.
36524 std::pair<unsigned, const TargetRegisterClass*> Res;
36525 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
36527 // Not found as a standard register?
36529 // Map "st(0)" .. "st(7)" to the FP0 .. FP7 registers.
36530 if (Constraint.size() == 7 && Constraint[0] == '{' &&
36531 tolower(Constraint[1]) == 's' &&
36532 tolower(Constraint[2]) == 't' &&
36533 Constraint[3] == '(' &&
36534 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
36535 Constraint[5] == ')' &&
36536 Constraint[6] == '}') {
36538 Res.first = X86::FP0+Constraint[4]-'0';
36539 Res.second = &X86::RFP80RegClass;
36543 // GCC allows "st(0)" to be called just plain "st".
36544 if (StringRef("{st}").equals_lower(Constraint)) {
36545 Res.first = X86::FP0;
36546 Res.second = &X86::RFP80RegClass;
36551 if (StringRef("{flags}").equals_lower(Constraint)) {
36552 Res.first = X86::EFLAGS;
36553 Res.second = &X86::CCRRegClass;
36557 // 'A' means [ER]AX + [ER]DX.
36558 if (Constraint == "A") {
36559 if (Subtarget.is64Bit()) {
36560 Res.first = X86::RAX;
36561 Res.second = &X86::GR64_ADRegClass;
36563 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
36564 "Expecting 64, 32 or 16 bit subtarget");
36565 Res.first = X86::EAX;
36566 Res.second = &X86::GR32_ADRegClass;
36573 // Otherwise, check to see if this is a register class of the wrong value
36574 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
36575 // turn into {ax},{dx}.
36576 // MVT::Other is used to specify clobber names.
36577 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
36578 return Res; // Correct type already, nothing to do.
36580 // Get a matching integer register of the correct size, i.e. "ax" with MVT::i32
36581 // should return "eax". This should even work for things like getting 64-bit
36582 // integer registers when given an f64 type.
36583 const TargetRegisterClass *Class = Res.second;
36584 // The generic code will match the first register class that contains the
36585 // given register. Thus, based on the ordering of the tablegened file,
36586 // the "plain" GR classes might not come first.
36587 // Therefore, use a helper method.
36588 if (isGRClass(*Class)) {
36589 unsigned Size = VT.getSizeInBits();
36590 if (Size == 1) Size = 8;
36591 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
36593 Res.first = DestReg;
36594 Res.second = Size == 8 ? &X86::GR8RegClass
36595 : Size == 16 ? &X86::GR16RegClass
36596 : Size == 32 ? &X86::GR32RegClass
36597 : &X86::GR64RegClass;
36598 assert(Res.second->contains(Res.first) && "Register in register class");
36600 // No register found/type mismatch.
36602 Res.second = nullptr;
36604 } else if (isFRClass(*Class)) {
36605 // Handle references to XMM physical registers that got mapped into the
36606 // wrong class. This can happen with constraints like {xmm0} where the
36607 // target independent register mapper will just pick the first match it can
36608 // find, ignoring the required type.
36610 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36611 if (VT == MVT::f32 || VT == MVT::i32)
36612 Res.second = &X86::FR32RegClass;
36613 else if (VT == MVT::f64 || VT == MVT::i64)
36614 Res.second = &X86::FR64RegClass;
36615 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
36616 Res.second = &X86::VR128RegClass;
36617 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
36618 Res.second = &X86::VR256RegClass;
36619 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
36620 Res.second = &X86::VR512RegClass;
36622 // Type mismatch and not a clobber: return an error.
36624 Res.second = nullptr;
36631 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
36632 const AddrMode &AM, Type *Ty,
36633 unsigned AS) const {
36634 // Scaling factors are not free at all.
36635 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
36636 // will take 2 allocations in the out-of-order engine instead of 1
36637 // for plain addressing mode, i.e. inst (reg1).
36639 // vaddps (%rsi,%rdx), %ymm0, %ymm1
36640 // Requires two allocations (one for the load, one for the computation)
36642 // vaddps (%rsi), %ymm0, %ymm1
36643 // Requires just 1 allocation, i.e., freeing allocations for other operations
36644 // and having less micro operations to execute.
36646 // For some X86 architectures, this is even worse because for instance for
36647 // stores, the complex addressing mode forces the instruction to use the
36648 // "load" ports instead of the dedicated "store" port.
36649 // E.g., on Haswell:
36650 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
36651 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
36652 if (isLegalAddressingMode(DL, AM, Ty, AS))
36653 // Scale represents reg2 * scale, thus account for 1
36654 // as soon as we use a second register.
36655 return AM.Scale != 0;
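// Example (illustrative only): an address such as (%rsi,%rdx,4) has a
// non-zero Scale, so this hook reports a cost of 1 extra allocation, while
// a plain (%rsi) address reports 0.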
36659 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
36660 // Integer division on x86 is expensive. However, when aggressively optimizing
36661 // for code size, we prefer to use a div instruction, as it is usually smaller
36662 // than the alternative sequence.
36663 // The exception to this is vector division. Since x86 doesn't have vector
36664 // integer division, leaving the division as-is is a loss even in terms of
36665 // size, because it will have to be scalarized, while the alternative code
36666 // sequence can be performed in vector form.
36668 bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
36669 return OptSize && !VT.isVector();
36672 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
36673 if (!Subtarget.is64Bit())
36676 // Update IsSplitCSR in X86MachineFunctionInfo.
36677 X86MachineFunctionInfo *AFI =
36678 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
36679 AFI->setIsSplitCSR(true);
36682 void X86TargetLowering::insertCopiesSplitCSR(
36683 MachineBasicBlock *Entry,
36684 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36685 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36686 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36690 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36691 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36692 MachineBasicBlock::iterator MBBI = Entry->begin();
36693 for (const MCPhysReg *I = IStart; *I; ++I) {
36694 const TargetRegisterClass *RC = nullptr;
36695 if (X86::GR64RegClass.contains(*I))
36696 RC = &X86::GR64RegClass;
36698 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36700 unsigned NewVR = MRI->createVirtualRegister(RC);
36701 // Create copy from CSR to a virtual register.
36702 // FIXME: this currently does not emit CFI pseudo-instructions, it works
36703 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36704 // nounwind. If we want to generalize this later, we may need to emit
36705 // CFI pseudo-instructions.
36706 assert(Entry->getParent()->getFunction()->hasFnAttribute(
36707 Attribute::NoUnwind) &&
36708 "Function should be nounwind in insertCopiesSplitCSR!");
36709 Entry->addLiveIn(*I);
36710 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36713 // Insert the copy-back instructions right before the terminator.
36714 for (auto *Exit : Exits)
36715 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36716 TII->get(TargetOpcode::COPY), *I)
36721 bool X86TargetLowering::supportSwiftError() const {
36722 return Subtarget.is64Bit();
36725 /// Returns the name of the symbol used to emit stack probes or the empty
36726 /// string if not applicable.
36727 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
36728 // If the function specifically requests stack probes, emit them.
36729 if (MF.getFunction()->hasFnAttribute("probe-stack"))
36730 return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
36732 // Generally, if we aren't on Windows, the platform ABI does not include
36733 // support for stack probes, so don't emit them.
36734 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
36737 // We need a stack probe to conform to the Windows ABI. Choose the right symbol.
36739 if (Subtarget.is64Bit())
36740 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
36741 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";