//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPC.h"
#include "PPCCCState.h"
#include "PPCCallingConv.h"
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCRegisterInfo.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <list>
#include <optional>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "ppc-lowering"

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
    cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
    cl::desc("disable setting the node scheduling preference to ILP on PPC"),
    cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
    cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableSCO("disable-ppc-sco",
    cl::desc("disable sibling call optimization on ppc"), cl::Hidden);

static cl::opt<bool> DisableInnermostLoopAlign32(
    "disable-ppc-innermost-loop-align32",
    cl::desc("don't always align innermost loop to 32 bytes on ppc"),
    cl::Hidden);

static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
    cl::desc("use absolute jump tables on ppc"), cl::Hidden);

static cl::opt<bool>
    DisablePerfectShuffle("ppc-disable-perfect-shuffle",
                          cl::desc("disable vector permute decomposition"),
                          cl::init(true), cl::Hidden);

cl::opt<bool> DisableAutoPairedVecSt(
    "disable-auto-paired-vec-st",
    cl::desc("disable automatically generated 32-byte paired vector stores"),
    cl::init(true), cl::Hidden);

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
STATISTIC(ShufflesHandledWithVPERM,
          "Number of shuffles lowered to a VPERM or XXPERM");
STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocations probed");

static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl);

static const char AIXSSPCanaryWordName[] = "__ssp_canary_word";

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
                                     const PPCSubtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  // Initialize map that relates the PPC addressing modes to the computed flags
  // of a load/store instruction. The map is used to determine the optimal
  // addressing mode when selecting load and stores.
  initializeAddrModeMap();
  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4));

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  if (!useSoftFloat()) {
    if (Subtarget.hasSPE()) {
      addRegisterClass(MVT::f32, &PPC::GPRCRegClass);
      // EFPU2 APU only supports f32
      if (!Subtarget.hasEFPU2())
        addRegisterClass(MVT::f64, &PPC::SPERCRegClass);
    } else {
      addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
      addRegisterClass(MVT::f64, &PPC::F8RCRegClass);
    }
  }

  // Match BITREVERSE to customized fast code sequence in the td file.
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  // Sub-word ATOMIC_CMP_SWAP needs to ensure that the input is zero-extended.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
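  // Illustration: a sub-word cmpxchg is implemented with a load-reserve/
  // store-conditional loop that compares a full register, so the expected
  // value must arrive zero-extended for the comparison to be meaningful.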

  // Custom lower inline assembly to check for special registers.
  setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
  setOperationAction(ISD::INLINEASM_BR, MVT::Other, Custom);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
  }
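  // For example (illustrative): with no sign-extending byte load, an i8
  // SEXTLOAD is selected as lbz followed by extsb.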

  if (Subtarget.isISA3_0()) {
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
    setTruncStoreAction(MVT::f64, MVT::f16, Legal);
    setTruncStoreAction(MVT::f32, MVT::f16, Legal);
  } else {
    // No extending loads from f16 or HW conversions back and forth.
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
    setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
    setTruncStoreAction(MVT::f64, MVT::f16, Expand);
    setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  }

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
  if (!Subtarget.hasSPE()) {
    setIndexedLoadAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedLoadAction(ISD::PRE_INC, MVT::f64, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f32, Legal);
    setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal);
  }

  // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry.
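  // For example (illustrative): a 64-bit add on 32-bit PPC becomes an addc of
  // the low halves (setting CA) followed by an adde of the high halves
  // (consuming CA).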
  const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
  for (MVT VT : ScalarIntVTs) {
    setOperationAction(ISD::ADDC, VT, Legal);
    setOperationAction(ISD::ADDE, VT, Legal);
    setOperationAction(ISD::SUBC, VT, Legal);
    setOperationAction(ISD::SUBE, VT, Legal);
  }

  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
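
    // Note: i1 has no direct FP conversions; the actions below route them
    // through a full-width GPR (Promote where the wide conversion exists,
    // Custom otherwise).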
    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);

      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::SINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType(ISD::UINT_TO_FP, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);

      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote);
      AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);

      setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
      AddPromotedToType(ISD::FP_TO_SINT, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
      AddPromotedToType(ISD::FP_TO_UINT, MVT::i1,
                        isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load/store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    for (MVT VT : MVT::integer_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
      setTruncStoreAction(VT, MVT::i1, Expand);
    }

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  setOperationAction(ISD::FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::ppcf128, Custom);
  setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions unless we are on P9.
  // On P9 we may use a hardware instruction to compute the remainder.
  // When the result of both the remainder and the division is required it is
  // more efficient to compute the remainder from the result of the division
  // rather than use the remainder instruction. The instructions are legalized
  // directly because the DivRemPairsPass performs the transformation at the IR
  // level.
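  // That is, when both q = a / b and r = a % b are needed, r is computed as
  // a - q * b instead of with a second hardware remainder.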
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::SREM, MVT::i32, Legal);
    setOperationAction(ISD::UREM, MVT::i32, Legal);
    setOperationAction(ISD::SREM, MVT::i64, Legal);
    setOperationAction(ISD::UREM, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::SREM, MVT::i32, Expand);
    setOperationAction(ISD::UREM, MVT::i32, Expand);
    setOperationAction(ISD::SREM, MVT::i64, Expand);
    setOperationAction(ISD::UREM, MVT::i64, Expand);
  }

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // Handle constrained floating-point operations for scalar types.
  // TODO: Handle SPE-specific operations.
  setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);

  setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
  setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);

  if (!Subtarget.hasSPE()) {
    setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
  }

  if (Subtarget.hasVSX()) {
    setOperationAction(ISD::STRICT_FRINT, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FRINT, MVT::f64, Legal);
  }

  if (Subtarget.hasFSQRT()) {
    setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);

    setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
  }

  // We don't support sin/cos/sqrt/fmod/pow.
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);

  // MASS transformation for LLVM intrinsics with replicating fast-math flags,
  // kept consistent with the PPCGenScalarMASSEntries pass.
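  // For example (illustrative): with the right fast-math flags at -O3, a call
  // to sin(double) can be redirected to its MASS library entry (__xl_sin).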
  if (TM.getOptLevel() == CodeGenOpt::Aggressive) {
    setOperationAction(ISD::FSIN, MVT::f64, Custom);
    setOperationAction(ISD::FCOS, MVT::f64, Custom);
    setOperationAction(ISD::FPOW, MVT::f64, Custom);
    setOperationAction(ISD::FLOG, MVT::f64, Custom);
    setOperationAction(ISD::FLOG10, MVT::f64, Custom);
    setOperationAction(ISD::FEXP, MVT::f64, Custom);
    setOperationAction(ISD::FSIN, MVT::f32, Custom);
    setOperationAction(ISD::FCOS, MVT::f32, Custom);
    setOperationAction(ISD::FPOW, MVT::f32, Custom);
    setOperationAction(ISD::FLOG, MVT::f32, Custom);
    setOperationAction(ISD::FLOG10, MVT::f32, Custom);
    setOperationAction(ISD::FEXP, MVT::f32, Custom);
  }

  if (Subtarget.hasSPE()) {
    setOperationAction(ISD::FMA, MVT::f64, Expand);
    setOperationAction(ISD::FMA, MVT::f32, Expand);
  } else {
    setOperationAction(ISD::FMA, MVT::f64, Legal);
    setOperationAction(ISD::FMA, MVT::f32, Legal);
  }

  if (Subtarget.hasSPE())
    setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);

  setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root.
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTE() &&
        Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath && Subtarget.hasFRSQRTES() &&
        Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // Prior to P10, PowerPC does not have BSWAP, but we can use the vector
  // BSWAP instruction xxbrd to speed up scalar BSWAP64.
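  // The custom i64 path (illustrative) direct-moves the GPR value to a vector
  // register, applies xxbrd, and moves the result back.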
  if (Subtarget.isISA3_1()) {
    setOperationAction(ISD::BSWAP, MVT::i32, Legal);
    setOperationAction(ISD::BSWAP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
    setOperationAction(
        ISD::BSWAP, MVT::i64,
        (Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? Custom : Expand);
  }

  // CTPOP and CTTZ were introduced in P8/P9 respectively.
  if (Subtarget.isISA3_0()) {
    setOperationAction(ISD::CTTZ, MVT::i32, Legal);
    setOperationAction(ISD::CTTZ, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i32, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Expand);
  }

  if (Subtarget.hasPOPCNTD() == PPCSubtarget::POPCNTD_Fast) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  // PowerPC does not have ROTR.
  setOperationAction(ISD::ROTR, MVT::i32, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have SELECT.
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);

  // PowerPC wants to optimize integer setcc a bit.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);

  if (Subtarget.hasFPU()) {
    setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Legal);

    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Legal);
  }

  // PowerPC does not have BRCOND, which requires SetCC.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  if (Subtarget.hasSPE()) {
    // SPE has built-in conversions.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);

    // SPE supports signaling compare of f32/f64.
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
    setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
  } else {
    // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);

    // PowerPC does not have [U|S]INT_TO_FP.
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
  }

  if (Subtarget.hasDirectMove() && isPPC64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i32, Legal);
    setOperationAction(ISD::BITCAST, MVT::i64, Legal);
    setOperationAction(ISD::BITCAST, MVT::f64, Legal);
    if (TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::LRINT, MVT::f64, Legal);
      setOperationAction(ISD::LRINT, MVT::f32, Legal);
      setOperationAction(ISD::LLRINT, MVT::f64, Legal);
      setOperationAction(ISD::LLRINT, MVT::f32, Legal);
      setOperationAction(ISD::LROUND, MVT::f64, Legal);
      setOperationAction(ISD::LROUND, MVT::f32, Legal);
      setOperationAction(ISD::LLROUND, MVT::f64, Legal);
      setOperationAction(ISD::LLROUND, MVT::f32, Legal);
    }
  } else {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
  }

  // We cannot sextinreg(i1). Expand to shifts.
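  // That is (illustrative), sext_inreg(x, i1) on i32 becomes
  // (x << 31) >>s 31, i.e. a shift left then an arithmetic shift right.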
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);

  if (Subtarget.is64BitELFABI()) {
    // VAARG always uses double-word chunks, so promote anything smaller.
    setOperationAction(ISD::VAARG, MVT::i1, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i1, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i8, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i8, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i16, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i16, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::i32, Promote);
    AddPromotedToType(ISD::VAARG, MVT::i32, MVT::i64);
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
  } else if (Subtarget.is32BitELFABI()) {
    // VAARG is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VAARG, MVT::i64, Custom);
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  // VACOPY is custom lowered with the 32-bit SVR4 ABI.
  if (Subtarget.is32BitELFABI())
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i32, Custom);
  setOperationAction(ISD::EH_DWARF_CFA, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f64, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::i32, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);

  // Comparisons that require checking two conditions.
  if (Subtarget.hasSPE()) {
    setCondCodeAction(ISD::SETO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::f64, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
    setCondCodeAction(ISD::SETUO, MVT::f64, Expand);
  }
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }
  if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) {
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
  }

  // PowerPC does not have FP_TO_UINT on 32-bit implementations.
  if (Subtarget.hasSPE()) {
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
  } else {
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly.
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or.
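    // That is (illustrative), build_pair(lo, hi) becomes
    // (zext(hi) << 32) | zext(lo).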
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  // PowerPC has better expansions for funnel shifts than the generic
  // TargetLowering::expandFunnelShift.
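  // Recall fshl(x, y, z): the double-width concatenation x:y shifted left by
  // z, keeping the high half; fshr keeps the low half of a right shift.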
  if (Subtarget.has64BitSupport()) {
    setOperationAction(ISD::FSHL, MVT::i64, Custom);
    setOperationAction(ISD::FSHR, MVT::i64, Custom);
  }
  setOperationAction(ISD::FSHL, MVT::i32, Custom);
  setOperationAction(ISD::FSHR, MVT::i32, Custom);

  if (Subtarget.hasVSX()) {
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
    setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
  }

  if (Subtarget.hasAltivec()) {
    for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
      setOperationAction(ISD::SADDSAT, VT, Legal);
      setOperationAction(ISD::SSUBSAT, VT, Legal);
      setOperationAction(ISD::UADDSAT, VT, Legal);
      setOperationAction(ISD::USUBSAT, VT, Legal);
    }

    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // For v2i64, these are only valid with P8Vector. This is corrected
      // after the loop.
      if (VT.getSizeInBits() <= 128 && VT.getScalarSizeInBits() <= 64) {
        setOperationAction(ISD::SMAX, VT, Legal);
        setOperationAction(ISD::SMIN, VT, Legal);
        setOperationAction(ISD::UMAX, VT, Legal);
        setOperationAction(ISD::UMIN, VT, Legal);
      } else {
        setOperationAction(ISD::SMAX, VT, Expand);
        setOperationAction(ISD::SMIN, VT, Expand);
        setOperationAction(ISD::UMAX, VT, Expand);
        setOperationAction(ISD::UMIN, VT, Expand);
      }

      if (Subtarget.hasVSX()) {
        setOperationAction(ISD::FMAXNUM, VT, Legal);
        setOperationAction(ISD::FMINNUM, VT, Legal);
      }

      // Vector instructions introduced in P8
      if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
        setOperationAction(ISD::CTPOP, VT, Legal);
        setOperationAction(ISD::CTLZ, VT, Legal);
      } else {
        setOperationAction(ISD::CTPOP, VT, Expand);
        setOperationAction(ISD::CTLZ, VT, Expand);
      }

      // Vector instructions introduced in P9
      if (Subtarget.hasP9Altivec() && (VT.SimpleTy != MVT::v1i128))
        setOperationAction(ISD::CTTZ, VT, Legal);
      else
        setOperationAction(ISD::CTTZ, VT, Expand);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType(ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i32);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i32);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i32);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::VSELECT, VT, Legal);
      setOperationAction(ISD::SELECT_CC, VT, Promote);
      AddPromotedToType(ISD::SELECT_CC, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType(ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL, VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FLDEXP, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
      setOperationAction(ISD::ROTL, VT, Expand);
      setOperationAction(ISD::ROTR, VT, Expand);

      for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }
    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
    if (!Subtarget.hasP8Vector()) {
      setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
      setOperationAction(ISD::UMAX, MVT::v2i64, Expand);
      setOperationAction(ISD::UMIN, MVT::v2i64, Expand);
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM; others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    // Vector truncates to sub-word integers that fit in an Altivec/VSX
    // register are cheap, so handle them before they get expanded to scalar.
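    // For example (illustrative): truncating v8i16 to v8i8 is essentially a
    // byte permute of the source register rather than eight scalar truncates.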
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);

    setOperationAction(ISD::AND, MVT::v4i32, Legal);
    setOperationAction(ISD::OR, MVT::v4i32, Legal);
    setOperationAction(ISD::XOR, MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
    setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
    // With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
    if (Subtarget.hasAltivec())
      for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
        setOperationAction(ISD::ROTL, VT, Legal);
    // With hasP8Altivec set, we can lower ISD::ROTL to vrld.
    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::ROTL, MVT::v2i64, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    }

    if (Subtarget.hasP8Altivec())
      setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    else
      setOperationAction(ISD::MUL, MVT::v4i32, Custom);

    if (Subtarget.isISA3_1()) {
      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHS, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHU, MVT::v2i64, Legal);
      setOperationAction(ISD::MULHS, MVT::v4i32, Legal);
      setOperationAction(ISD::MULHU, MVT::v4i32, Legal);
      setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
      setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
      setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
      setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
      setOperationAction(ISD::UREM, MVT::v2i64, Legal);
      setOperationAction(ISD::SREM, MVT::v2i64, Legal);
      setOperationAction(ISD::UREM, MVT::v4i32, Legal);
      setOperationAction(ISD::SREM, MVT::v4i32, Legal);
      setOperationAction(ISD::UREM, MVT::v1i128, Legal);
      setOperationAction(ISD::SREM, MVT::v1i128, Legal);
      setOperationAction(ISD::UDIV, MVT::v1i128, Legal);
      setOperationAction(ISD::SDIV, MVT::v1i128, Legal);
      setOperationAction(ISD::ROTL, MVT::v1i128, Legal);
    }

    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions.
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);

    if (Subtarget.hasVSX()) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
      if (Subtarget.hasP8Vector()) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
      }
      if (Subtarget.hasDirectMove() && isPPC64) {
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal);
        setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
        setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
      }
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);

      // The nearbyint variants are not allowed to raise the inexact exception
      // so we can only code-gen them with unsafe math.
      if (TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
        setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
      }

      setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
      setOperationAction(ISD::FROUND, MVT::f64, Legal);
      setOperationAction(ISD::FRINT, MVT::f64, Legal);

      setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
      setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
      setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
      setOperationAction(ISD::FROUND, MVT::f32, Legal);
      setOperationAction(ISD::FRINT, MVT::f32, Legal);

      setOperationAction(ISD::MUL, MVT::v2f64, Legal);
      setOperationAction(ISD::FMA, MVT::v2f64, Legal);

      setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);

      // Share the Altivec comparison restrictions.
      setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
      setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);

      setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
      setOperationAction(ISD::STORE, MVT::v2f64, Legal);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);

      if (Subtarget.hasP8Vector())
        addRegisterClass(MVT::f32, &PPC::VSSRCRegClass);

      addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);

      addRegisterClass(MVT::v4i32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
      addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);

      if (Subtarget.hasP8Altivec()) {
        setOperationAction(ISD::SHL, MVT::v2i64, Legal);
        setOperationAction(ISD::SRA, MVT::v2i64, Legal);
        setOperationAction(ISD::SRL, MVT::v2i64, Legal);

        // 128-bit shifts can be accomplished via 3 instructions for SHL and
        // SRL, but not for SRA because of the instructions available:
        // VS{RL} and VS{RL}O. However, due to direct move costs, it's not
        // worth doing so here.
        setOperationAction(ISD::SHL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRL, MVT::v1i128, Expand);
        setOperationAction(ISD::SRA, MVT::v1i128, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Legal);
      } else {
        setOperationAction(ISD::SHL, MVT::v2i64, Expand);
        setOperationAction(ISD::SRA, MVT::v2i64, Expand);
        setOperationAction(ISD::SRL, MVT::v2i64, Expand);

        setOperationAction(ISD::SETCC, MVT::v2i64, Custom);

        // VSX v2i64 only supports non-arithmetic operations.
        setOperationAction(ISD::ADD, MVT::v2i64, Expand);
        setOperationAction(ISD::SUB, MVT::v2i64, Expand);
      }

      if (Subtarget.isISA3_1())
        setOperationAction(ISD::SETCC, MVT::v1i128, Legal);
      else
        setOperationAction(ISD::SETCC, MVT::v1i128, Expand);

      setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
      AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v2f64);
      setOperationAction(ISD::STORE, MVT::v2i64, Promote);
      AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v2f64);

      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);

      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      // Custom handling for partial vectors of integers converted to
      // floating point. We already have optimal handling for v2i32 through
      // the DAG combine, so those aren't necessary.
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);

      setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
      setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
      setOperationAction(ISD::FABS, MVT::v4f32, Legal);
      setOperationAction(ISD::FABS, MVT::v2f64, Legal);
      setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Legal);
      setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Legal);

      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);

      // Handle constrained floating-point operations for vectors.
      // The predicate is `hasVSX` because Altivec instructions do not raise
      // exceptions, but VSX vector instructions do.
      setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
      setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);

      setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
      setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);

      addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
      addRegisterClass(MVT::f128, &PPC::VRRCRegClass);

      for (MVT FPT : MVT::fp_valuetypes())
        setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);

      // Expand the SELECT to SELECT_CC.
      setOperationAction(ISD::SELECT, MVT::f128, Expand);

      setTruncStoreAction(MVT::f128, MVT::f64, Expand);
      setTruncStoreAction(MVT::f128, MVT::f32, Expand);

      // No implementation for these ops for PowerPC.
      setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
      setOperationAction(ISD::FSIN, MVT::f128, Expand);
      setOperationAction(ISD::FCOS, MVT::f128, Expand);
      setOperationAction(ISD::FPOW, MVT::f128, Expand);
      setOperationAction(ISD::FPOWI, MVT::f128, Expand);
      setOperationAction(ISD::FREM, MVT::f128, Expand);
    }

    if (Subtarget.hasP8Altivec()) {
      addRegisterClass(MVT::v2i64, &PPC::VRRCRegClass);
      addRegisterClass(MVT::v1i128, &PPC::VRRCRegClass);
    }

    if (Subtarget.hasP9Vector()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

      // Test data class instructions store results in CR bits.
      if (Subtarget.useCRBits()) {
        setOperationAction(ISD::IS_FPCLASS, MVT::f32, Custom);
        setOperationAction(ISD::IS_FPCLASS, MVT::f64, Custom);
        setOperationAction(ISD::IS_FPCLASS, MVT::f128, Custom);
      }

      // 128-bit shifts can be accomplished via 3 instructions for SHL and
      // SRL, but not for SRA because of the instructions available:
      // VS{RL} and VS{RL}O.
      setOperationAction(ISD::SHL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRL, MVT::v1i128, Legal);
      setOperationAction(ISD::SRA, MVT::v1i128, Expand);

      setOperationAction(ISD::FADD, MVT::f128, Legal);
      setOperationAction(ISD::FSUB, MVT::f128, Legal);
      setOperationAction(ISD::FDIV, MVT::f128, Legal);
      setOperationAction(ISD::FMUL, MVT::f128, Legal);
      setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);

      setOperationAction(ISD::FMA, MVT::f128, Legal);
      setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
      setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
      setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
      setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
      setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
      setCondCodeAction(ISD::SETONE, MVT::f128, Expand);

      setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
      setOperationAction(ISD::FRINT, MVT::f128, Legal);
      setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
      setOperationAction(ISD::FCEIL, MVT::f128, Legal);
      setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
      setOperationAction(ISD::FROUND, MVT::f128, Legal);

      setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
      setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
      setOperationAction(ISD::BITCAST, MVT::i128, Custom);

      // Handle constrained floating-point operations for fp128.
      setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
      setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
      setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
      setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
      setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
      setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
      setOperationAction(ISD::BSWAP, MVT::v2i64, Legal);
      setOperationAction(ISD::BSWAP, MVT::v1i128, Legal);
    } else if (Subtarget.hasVSX()) {
      setOperationAction(ISD::LOAD, MVT::f128, Promote);
      setOperationAction(ISD::STORE, MVT::f128, Promote);

      AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32);
      AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32);

      // Set FADD/FSUB as libcall to avoid having the legalizer expand the
      // fp_to_uint and int_to_fp.
      setOperationAction(ISD::FADD, MVT::f128, LibCall);
      setOperationAction(ISD::FSUB, MVT::f128, LibCall);

      setOperationAction(ISD::FMUL, MVT::f128, Expand);
      setOperationAction(ISD::FDIV, MVT::f128, Expand);
      setOperationAction(ISD::FNEG, MVT::f128, Expand);
      setOperationAction(ISD::FABS, MVT::f128, Expand);
      setOperationAction(ISD::FSQRT, MVT::f128, Expand);
      setOperationAction(ISD::FMA, MVT::f128, Expand);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);

      // Expand the fp_extend if the target type is fp128.
      setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);

      // Expand the fp_round if the source type is fp128.
      for (MVT VT : {MVT::f32, MVT::f64}) {
        setOperationAction(ISD::FP_ROUND, VT, Custom);
        setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
      }

      setOperationAction(ISD::SETCC, MVT::f128, Custom);
      setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
      setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
      setOperationAction(ISD::BR_CC, MVT::f128, Expand);

      // Lower following f128 select_cc pattern:
      // select_cc x, y, tv, fv, cc -> select_cc (setcc x, y, cc), 0, tv, fv, NE
      setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);

      // We need to handle f128 SELECT_CC with integer result type.
      setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
      setOperationAction(ISD::SELECT_CC, MVT::i64, isPPC64 ? Custom : Expand);
    }

    if (Subtarget.hasP9Altivec()) {
      if (Subtarget.isISA3_1()) {
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Legal);
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Legal);
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
      } else {
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
        setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
      }
1300 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Legal);
1301 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Legal);
1302 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Legal);
1303 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Legal);
1304 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Legal);
1305 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
1306 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
1308 setOperationAction(ISD::ABDU, MVT::v16i8, Legal);
1309 setOperationAction(ISD::ABDU, MVT::v8i16, Legal);
1310 setOperationAction(ISD::ABDU, MVT::v4i32, Legal);
setOperationAction(ISD::ABDS, MVT::v4i32, Legal);
}
1314 if (Subtarget.hasP10Vector()) {
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
}
1319 if (Subtarget.pairedVectorMemops()) {
1320 addRegisterClass(MVT::v256i1, &PPC::VSRpRCRegClass);
1321 setOperationAction(ISD::LOAD, MVT::v256i1, Custom);
setOperationAction(ISD::STORE, MVT::v256i1, Custom);
}
1324 if (Subtarget.hasMMA()) {
1325 if (Subtarget.isISAFuture())
1326 addRegisterClass(MVT::v512i1, &PPC::WACCRCRegClass);
else
addRegisterClass(MVT::v512i1, &PPC::UACCRCRegClass);
1329 setOperationAction(ISD::LOAD, MVT::v512i1, Custom);
1330 setOperationAction(ISD::STORE, MVT::v512i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v512i1, Custom);
}
1334 if (Subtarget.has64BitSupport())
1335 setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
1337 if (Subtarget.isISA3_1())
1338 setOperationAction(ISD::SRA, MVT::v1i128, Legal);
1340 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, isPPC64 ? Legal : Custom);
if (!isPPC64) {
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
}
1347 if (shouldInlineQuadwordAtomics()) {
1348 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1349 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
setOperationAction(ISD::INTRINSIC_VOID, MVT::i128, Custom);
}
1353 setBooleanContents(ZeroOrOneBooleanContent);
1355 if (Subtarget.hasAltivec()) {
1356 // Altivec instructions set fields to all zeros or all ones.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
}
1360 setLibcallName(RTLIB::MULO_I128, nullptr);
if (!isPPC64) {
// These libcalls are not available in 32-bit.
1363 setLibcallName(RTLIB::SHL_I128, nullptr);
1364 setLibcallName(RTLIB::SRL_I128, nullptr);
1365 setLibcallName(RTLIB::SRA_I128, nullptr);
1366 setLibcallName(RTLIB::MUL_I128, nullptr);
setLibcallName(RTLIB::MULO_I64, nullptr);
}
1370 if (shouldInlineQuadwordAtomics())
1371 setMaxAtomicSizeInBitsSupported(128);
else if (isPPC64)
setMaxAtomicSizeInBitsSupported(64);
else
setMaxAtomicSizeInBitsSupported(32);
1377 setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1);
1379 // We have target-specific dag combine patterns for the following nodes:
1380 setTargetDAGCombine({ISD::AND, ISD::ADD, ISD::SHL, ISD::SRA, ISD::SRL,
1381 ISD::MUL, ISD::FMA, ISD::SINT_TO_FP, ISD::BUILD_VECTOR});
1382 if (Subtarget.hasFPCVT())
1383 setTargetDAGCombine(ISD::UINT_TO_FP);
1384 setTargetDAGCombine({ISD::LOAD, ISD::STORE, ISD::BR_CC});
1385 if (Subtarget.useCRBits())
1386 setTargetDAGCombine(ISD::BRCOND);
1387 setTargetDAGCombine({ISD::BSWAP, ISD::INTRINSIC_WO_CHAIN,
1388 ISD::INTRINSIC_W_CHAIN, ISD::INTRINSIC_VOID});
1390 setTargetDAGCombine({ISD::SIGN_EXTEND, ISD::ZERO_EXTEND, ISD::ANY_EXTEND});
1392 setTargetDAGCombine({ISD::TRUNCATE, ISD::VECTOR_SHUFFLE});
1394 if (Subtarget.useCRBits()) {
setTargetDAGCombine({ISD::TRUNCATE, ISD::SETCC, ISD::SELECT_CC});
}
1398 setLibcallName(RTLIB::LOG_F128, "logf128");
1399 setLibcallName(RTLIB::LOG2_F128, "log2f128");
1400 setLibcallName(RTLIB::LOG10_F128, "log10f128");
1401 setLibcallName(RTLIB::EXP_F128, "expf128");
1402 setLibcallName(RTLIB::EXP2_F128, "exp2f128");
1403 setLibcallName(RTLIB::SIN_F128, "sinf128");
1404 setLibcallName(RTLIB::COS_F128, "cosf128");
1405 setLibcallName(RTLIB::SINCOS_F128, "sincosf128");
1406 setLibcallName(RTLIB::POW_F128, "powf128");
1407 setLibcallName(RTLIB::FMIN_F128, "fminf128");
1408 setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
1409 setLibcallName(RTLIB::REM_F128, "fmodf128");
1410 setLibcallName(RTLIB::SQRT_F128, "sqrtf128");
1411 setLibcallName(RTLIB::CEIL_F128, "ceilf128");
1412 setLibcallName(RTLIB::FLOOR_F128, "floorf128");
1413 setLibcallName(RTLIB::TRUNC_F128, "truncf128");
1414 setLibcallName(RTLIB::ROUND_F128, "roundf128");
1415 setLibcallName(RTLIB::LROUND_F128, "lroundf128");
1416 setLibcallName(RTLIB::LLROUND_F128, "llroundf128");
1417 setLibcallName(RTLIB::RINT_F128, "rintf128");
1418 setLibcallName(RTLIB::LRINT_F128, "lrintf128");
1419 setLibcallName(RTLIB::LLRINT_F128, "llrintf128");
1420 setLibcallName(RTLIB::NEARBYINT_F128, "nearbyintf128");
1421 setLibcallName(RTLIB::FMA_F128, "fmaf128");
1423 if (Subtarget.isAIXABI()) {
1424 setLibcallName(RTLIB::MEMCPY, isPPC64 ? "___memmove64" : "___memmove");
1425 setLibcallName(RTLIB::MEMMOVE, isPPC64 ? "___memmove64" : "___memmove");
1426 setLibcallName(RTLIB::MEMSET, isPPC64 ? "___memset64" : "___memset");
setLibcallName(RTLIB::BZERO, isPPC64 ? "___bzero64" : "___bzero");
}
1430 // With 32 condition bits, we don't need to sink (and duplicate) compares
1431 // aggressively in CodeGenPrep.
1432 if (Subtarget.useCRBits()) {
1433 setHasMultipleConditionRegisters();
setJumpIsExpensive();
}
1437 setMinFunctionAlignment(Align(4));
switch (Subtarget.getCPUDirective()) {
default: break;
case PPC::DIR_970:
case PPC::DIR_A2:
case PPC::DIR_E500:
case PPC::DIR_E500mc:
case PPC::DIR_E5500:
case PPC::DIR_PWR4:
case PPC::DIR_PWR5:
case PPC::DIR_PWR5X:
case PPC::DIR_PWR6:
case PPC::DIR_PWR6X:
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
setPrefLoopAlignment(Align(16));
setPrefFunctionAlignment(Align(16));
break;
}
1461 if (Subtarget.enableMachineScheduler())
1462 setSchedulingPreference(Sched::Source);
else
setSchedulingPreference(Sched::Hybrid);
1466 computeRegisterProperties(STI.getRegisterInfo());
// The Freescale cores do better with aggressive inlining of memcpy and
// friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
1470 if (Subtarget.getCPUDirective() == PPC::DIR_E500mc ||
1471 Subtarget.getCPUDirective() == PPC::DIR_E5500) {
1472 MaxStoresPerMemset = 32;
1473 MaxStoresPerMemsetOptSize = 16;
1474 MaxStoresPerMemcpy = 32;
1475 MaxStoresPerMemcpyOptSize = 8;
1476 MaxStoresPerMemmove = 32;
1477 MaxStoresPerMemmoveOptSize = 8;
1478 } else if (Subtarget.getCPUDirective() == PPC::DIR_A2) {
// The A2 also benefits from (very) aggressive inlining of memcpy and
// friends. The overhead of the function call, even when warm, can be
// over one hundred cycles.
1482 MaxStoresPerMemset = 128;
1483 MaxStoresPerMemcpy = 128;
1484 MaxStoresPerMemmove = 128;
1485 MaxLoadsPerMemcmp = 128;
} else {
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
1491 IsStrictFPEnabled = true;
1493 // Let the subtarget (CPU) decide if a predictable select is more expensive
1494 // than the corresponding branch. This information is used in CGP to decide
1495 // when to convert selects into branches.
PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
}
1499 // *********************************** NOTE ************************************
1500 // For selecting load and store instructions, the addressing modes are defined
1501 // as ComplexPatterns in PPCInstrInfo.td, which are then utilized in the TD
// patterns to match the load and store instructions.
1504 // The TD definitions for the addressing modes correspond to their respective
1505 // Select<AddrMode>Form() function in PPCISelDAGToDAG.cpp. These functions rely
1506 // on SelectOptimalAddrMode(), which calls computeMOFlags() to compute the
1507 // address mode flags of a particular node. Afterwards, the computed address
1508 // flags are passed into getAddrModeForFlags() in order to retrieve the optimal
1509 // addressing mode. SelectOptimalAddrMode() then sets the Base and Displacement
1510 // accordingly, based on the preferred addressing mode.
1512 // Within PPCISelLowering.h, there are two enums: MemOpFlags and AddrMode.
1513 // MemOpFlags contains all the possible flags that can be used to compute the
1514 // optimal addressing mode for load and store instructions.
1515 // AddrMode contains all the possible load and store addressing modes available
1516 // on Power (such as DForm, DSForm, DQForm, XForm, etc.)
1518 // When adding new load and store instructions, it is possible that new address
1519 // flags may need to be added into MemOpFlags, and a new addressing mode will
1520 // need to be added to AddrMode. An entry of the new addressing mode (consisting
1521 // of the minimal and main distinguishing address flags for the new load/store
1522 // instructions) will need to be added into initializeAddrModeMap() below.
// Finally, when adding new addressing modes, getAddrModeForFlags() will
// need to be updated to account for the new mode when selecting the
// optimal addressing mode.
1525 // *****************************************************************************
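// As a rough illustration only (the authoritative logic lives in
// computeMOFlags() and getAddrModeForFlags() further below), the
// flag-to-mode lookup conceptually has this shape, where the flag sets are
// the per-mode entries registered in initializeAddrModeMap():
//
//   for (const auto &Mode : AddrModesMap)           // AddrMode -> flag sets
//     for (unsigned FlagSet : Mode.second)
//       if ((ComputedFlags & FlagSet) == FlagSet)   // all required flags set
//         return Mode.first;                        // e.g. PPC::AM_DForm
//
// Names such as ComputedFlags above are placeholders for exposition, not
// actual variables in this file.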
1526 /// Initialize the map that relates the different addressing modes of the load
1527 /// and store instructions to a set of flags. This ensures the load/store
1528 /// instruction is correctly matched during instruction selection.
1529 void PPCTargetLowering::initializeAddrModeMap() {
1530 AddrModesMap[PPC::AM_DForm] = {
// LWZ, STW
PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_WordInt,
1533 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_WordInt,
1534 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1535 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
1536 // LBZ, LHZ, STB, STH
1537 PPC::MOF_ZExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1538 PPC::MOF_ZExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1539 PPC::MOF_ZExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1540 PPC::MOF_ZExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
// LHA
PPC::MOF_SExt | PPC::MOF_RPlusSImm16 | PPC::MOF_SubWordInt,
1543 PPC::MOF_SExt | PPC::MOF_RPlusLo | PPC::MOF_SubWordInt,
1544 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_SubWordInt,
1545 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubWordInt,
1546 // LFS, LFD, STFS, STFD
1547 PPC::MOF_RPlusSImm16 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1548 PPC::MOF_RPlusLo | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1549 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
1550 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetBeforeP9,
};
AddrModesMap[PPC::AM_DSForm] = {
// LWA
PPC::MOF_SExt | PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_WordInt,
1555 PPC::MOF_SExt | PPC::MOF_NotAddNorCst | PPC::MOF_WordInt,
1556 PPC::MOF_SExt | PPC::MOF_AddrIsSImm32 | PPC::MOF_WordInt,
// LD, STD
PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_DoubleWordInt,
1559 PPC::MOF_NotAddNorCst | PPC::MOF_DoubleWordInt,
1560 PPC::MOF_AddrIsSImm32 | PPC::MOF_DoubleWordInt,
1561 // DFLOADf32, DFLOADf64, DSTOREf32, DSTOREf64
1562 PPC::MOF_RPlusSImm16Mult4 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1563 PPC::MOF_NotAddNorCst | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
1564 PPC::MOF_AddrIsSImm32 | PPC::MOF_ScalarFloat | PPC::MOF_SubtargetP9,
};
AddrModesMap[PPC::AM_DQForm] = {
// LXV, STXV
PPC::MOF_RPlusSImm16Mult16 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1569 PPC::MOF_NotAddNorCst | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
1570 PPC::MOF_AddrIsSImm32 | PPC::MOF_Vector | PPC::MOF_SubtargetP9,
};
AddrModesMap[PPC::AM_PrefixDForm] = {PPC::MOF_RPlusSImm34 |
1573 PPC::MOF_SubtargetP10};
// TODO: Add mapping for quadword load/store.
}
1577 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
1578 /// the desired ByVal argument alignment.
1579 static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
if (MaxAlign == MaxMaxAlign)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1583 if (MaxMaxAlign >= 32 &&
1584 VTy->getPrimitiveSizeInBits().getFixedValue() >= 256)
1585 MaxAlign = Align(32);
else if (VTy->getPrimitiveSizeInBits().getFixedValue() >= 128 &&
MaxAlign < 16)
MaxAlign = Align(16);
1589 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
1592 if (EltAlign > MaxAlign)
1593 MaxAlign = EltAlign;
1594 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1595 for (auto *EltTy : STy->elements()) {
Align EltAlign;
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
1598 if (EltAlign > MaxAlign)
1599 MaxAlign = EltAlign;
if (MaxAlign == MaxMaxAlign)
break;
}
}
}
1606 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
1607 /// function arguments in the caller parameter area.
1608 uint64_t PPCTargetLowering::getByValTypeAlignment(Type *Ty,
1609 const DataLayout &DL) const {
// 16-byte and wider vectors are passed on a 16-byte boundary.
// The rest are passed on an 8-byte boundary on PPC64 and a 4-byte
// boundary on PPC32.
1612 Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
1613 if (Subtarget.hasAltivec())
1614 getMaxByValAlign(Ty, Alignment, Align(16));
return Alignment.value();
}
1618 bool PPCTargetLowering::useSoftFloat() const {
return Subtarget.useSoftFloat();
}
1622 bool PPCTargetLowering::hasSPE() const {
return Subtarget.hasSPE();
}
1626 bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
1630 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
1631 switch ((PPCISD::NodeType)Opcode) {
1632 case PPCISD::FIRST_NUMBER: break;
1633 case PPCISD::FSEL: return "PPCISD::FSEL";
1634 case PPCISD::XSMAXC: return "PPCISD::XSMAXC";
1635 case PPCISD::XSMINC: return "PPCISD::XSMINC";
1636 case PPCISD::FCFID: return "PPCISD::FCFID";
1637 case PPCISD::FCFIDU: return "PPCISD::FCFIDU";
1638 case PPCISD::FCFIDS: return "PPCISD::FCFIDS";
1639 case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS";
1640 case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
1641 case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
1642 case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ";
1643 case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ";
1644 case PPCISD::FRE: return "PPCISD::FRE";
1645 case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
1646 case PPCISD::FTSQRT:
1647 return "PPCISD::FTSQRT";
case PPCISD::FSQRT:
return "PPCISD::FSQRT";
1650 case PPCISD::STFIWX: return "PPCISD::STFIWX";
1651 case PPCISD::VPERM: return "PPCISD::VPERM";
1652 case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
1653 case PPCISD::XXSPLTI_SP_TO_DP:
1654 return "PPCISD::XXSPLTI_SP_TO_DP";
1655 case PPCISD::XXSPLTI32DX:
1656 return "PPCISD::XXSPLTI32DX";
1657 case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
1658 case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
1659 case PPCISD::XXPERM:
1660 return "PPCISD::XXPERM";
1661 case PPCISD::VECSHL: return "PPCISD::VECSHL";
1662 case PPCISD::CMPB: return "PPCISD::CMPB";
1663 case PPCISD::Hi: return "PPCISD::Hi";
1664 case PPCISD::Lo: return "PPCISD::Lo";
1665 case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
1666 case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8";
1667 case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
1668 case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
1669 case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
1670 case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
1671 case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
1672 case PPCISD::SRL: return "PPCISD::SRL";
1673 case PPCISD::SRA: return "PPCISD::SRA";
1674 case PPCISD::SHL: return "PPCISD::SHL";
1675 case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
1676 case PPCISD::CALL: return "PPCISD::CALL";
1677 case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
1678 case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
1679 case PPCISD::CALL_RM:
1680 return "PPCISD::CALL_RM";
1681 case PPCISD::CALL_NOP_RM:
1682 return "PPCISD::CALL_NOP_RM";
1683 case PPCISD::CALL_NOTOC_RM:
1684 return "PPCISD::CALL_NOTOC_RM";
1685 case PPCISD::MTCTR: return "PPCISD::MTCTR";
1686 case PPCISD::BCTRL: return "PPCISD::BCTRL";
1687 case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
1688 case PPCISD::BCTRL_RM:
1689 return "PPCISD::BCTRL_RM";
1690 case PPCISD::BCTRL_LOAD_TOC_RM:
1691 return "PPCISD::BCTRL_LOAD_TOC_RM";
1692 case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE";
1693 case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE";
1694 case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
1695 case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
1696 case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
1697 case PPCISD::MFVSR: return "PPCISD::MFVSR";
1698 case PPCISD::MTVSRA: return "PPCISD::MTVSRA";
1699 case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
1700 case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
1701 case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
1702 case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
1703 return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
1704 case PPCISD::ANDI_rec_1_EQ_BIT:
1705 return "PPCISD::ANDI_rec_1_EQ_BIT";
1706 case PPCISD::ANDI_rec_1_GT_BIT:
1707 return "PPCISD::ANDI_rec_1_GT_BIT";
1708 case PPCISD::VCMP: return "PPCISD::VCMP";
1709 case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec";
1710 case PPCISD::LBRX: return "PPCISD::LBRX";
1711 case PPCISD::STBRX: return "PPCISD::STBRX";
1712 case PPCISD::LFIWAX: return "PPCISD::LFIWAX";
1713 case PPCISD::LFIWZX: return "PPCISD::LFIWZX";
1714 case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
1715 case PPCISD::STXSIX: return "PPCISD::STXSIX";
1716 case PPCISD::VEXTS: return "PPCISD::VEXTS";
1717 case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
1718 case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
1719 case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
1720 case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE";
1721 case PPCISD::ST_VSR_SCAL_INT:
1722 return "PPCISD::ST_VSR_SCAL_INT";
1723 case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
1724 case PPCISD::BDNZ: return "PPCISD::BDNZ";
1725 case PPCISD::BDZ: return "PPCISD::BDZ";
1726 case PPCISD::MFFS: return "PPCISD::MFFS";
1727 case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
1728 case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
1729 case PPCISD::CR6SET: return "PPCISD::CR6SET";
1730 case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
1731 case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
1732 case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT";
1733 case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
1734 case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
1735 case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
1736 case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
1737 case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
1738 case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
1739 case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER";
1740 case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR";
1741 case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX";
1742 case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
1743 case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
1744 case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
1745 case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR";
1746 case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
1747 case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
1748 case PPCISD::PADDI_DTPREL:
1749 return "PPCISD::PADDI_DTPREL";
1750 case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
1751 case PPCISD::SC: return "PPCISD::SC";
1752 case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB";
1753 case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE";
1754 case PPCISD::RFEBB: return "PPCISD::RFEBB";
1755 case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
1756 case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
1757 case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128";
1758 case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64";
1759 case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE";
1760 case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
1761 case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
1762 case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
1763 case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
1764 case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR:
1765 return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR";
1766 case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR:
1767 return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR";
1768 case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD";
1769 case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD";
1770 case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
1771 case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
1772 case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
1773 case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
1774 case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
1775 case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
1776 case PPCISD::STRICT_FADDRTZ:
1777 return "PPCISD::STRICT_FADDRTZ";
1778 case PPCISD::STRICT_FCTIDZ:
1779 return "PPCISD::STRICT_FCTIDZ";
1780 case PPCISD::STRICT_FCTIWZ:
1781 return "PPCISD::STRICT_FCTIWZ";
1782 case PPCISD::STRICT_FCTIDUZ:
1783 return "PPCISD::STRICT_FCTIDUZ";
1784 case PPCISD::STRICT_FCTIWUZ:
1785 return "PPCISD::STRICT_FCTIWUZ";
1786 case PPCISD::STRICT_FCFID:
1787 return "PPCISD::STRICT_FCFID";
1788 case PPCISD::STRICT_FCFIDU:
1789 return "PPCISD::STRICT_FCFIDU";
1790 case PPCISD::STRICT_FCFIDS:
1791 return "PPCISD::STRICT_FCFIDS";
1792 case PPCISD::STRICT_FCFIDUS:
1793 return "PPCISD::STRICT_FCFIDUS";
1794 case PPCISD::LXVRZX: return "PPCISD::LXVRZX";
1795 case PPCISD::STORE_COND:
return "PPCISD::STORE_COND";
}
return nullptr;
}
1801 EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C,
EVT VT) const {
if (!VT.isVector())
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
1809 bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
return true;
}
1814 //===----------------------------------------------------------------------===//
1815 // Node matching predicates, for use by the tblgen matching code.
1816 //===----------------------------------------------------------------------===//
1818 /// isFloatingPointZero - Return true if this is 0.0 or -0.0.
1819 static bool isFloatingPointZero(SDValue Op) {
1820 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
1821 return CFP->getValueAPF().isZero();
1822 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
1823 // Maybe this has already been legalized into the constant pool?
1824 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
1825 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
return CFP->getValueAPF().isZero();
}
return false;
}
1831 /// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
1832 /// true if Op is undef or if it matches the specified value.
1833 static bool isConstantOrUndef(int Op, int Val) {
return Op < 0 || Op == Val;
}
1837 /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
1838 /// VPKUHUM instruction.
1839 /// The ShuffleKind distinguishes between big-endian operations with
1840 /// two different inputs (0), either-endian operations with two identical
1841 /// inputs (1), and little-endian operations with two different inputs (2).
1842 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
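/// For example, with ShuffleKind 0 (big-endian, two distinct inputs) the only
/// accepted mask is <1,3,5,...,29,31>: the odd-numbered bytes of the 32-byte
/// concatenation of the inputs, which is what a big-endian vpkuhum produces.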
1843 bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1844 SelectionDAG &DAG) {
1845 bool IsLE = DAG.getDataLayout().isLittleEndian();
1846 if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2+1))
return false;
1852 } else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; ++i)
if (!isConstantOrUndef(N->getMaskElt(i), i*2))
return false;
1858 } else if (ShuffleKind == 1) {
1859 unsigned j = IsLE ? 0 : 1;
1860 for (unsigned i = 0; i != 8; ++i)
1861 if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
return false;
}
return true;
}
1868 /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
1869 /// VPKUWUM instruction.
1870 /// The ShuffleKind distinguishes between big-endian operations with
1871 /// two different inputs (0), either-endian operations with two identical
1872 /// inputs (1), and little-endian operations with two different inputs (2).
1873 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1874 bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1875 SelectionDAG &DAG) {
1876 bool IsLE = DAG.getDataLayout().isLittleEndian();
1877 if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
1881 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+3))
return false;
1884 } else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 2)
1888 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
!isConstantOrUndef(N->getMaskElt(i+1), i*2+1))
return false;
1891 } else if (ShuffleKind == 1) {
1892 unsigned j = IsLE ? 0 : 2;
1893 for (unsigned i = 0; i != 8; i += 2)
1894 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
1895 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
1896 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
!isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1))
return false;
}
return true;
}
1903 /// isVPKUDUMShuffleMask - Return true if this is the shuffle mask for a
1904 /// VPKUDUM instruction, AND the VPKUDUM instruction exists for the
1905 /// current subtarget.
1907 /// The ShuffleKind distinguishes between big-endian operations with
1908 /// two different inputs (0), either-endian operations with two identical
1909 /// inputs (1), and little-endian operations with two different inputs (2).
1910 /// For the latter, the input operands are swapped (see PPCInstrAltivec.td).
1911 bool PPC::isVPKUDUMShuffleMask(ShuffleVectorSDNode *N, unsigned ShuffleKind,
1912 SelectionDAG &DAG) {
1913 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
1914 if (!Subtarget.hasP8Vector())
return false;

bool IsLE = DAG.getDataLayout().isLittleEndian();
1918 if (ShuffleKind == 0) {
if (IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
1922 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+4) ||
1923 !isConstantOrUndef(N->getMaskElt(i+1), i*2+5) ||
1924 !isConstantOrUndef(N->getMaskElt(i+2), i*2+6) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+7))
return false;
1927 } else if (ShuffleKind == 2) {
if (!IsLE)
return false;
for (unsigned i = 0; i != 16; i += 4)
1931 if (!isConstantOrUndef(N->getMaskElt(i ), i*2) ||
1932 !isConstantOrUndef(N->getMaskElt(i+1), i*2+1) ||
1933 !isConstantOrUndef(N->getMaskElt(i+2), i*2+2) ||
!isConstantOrUndef(N->getMaskElt(i+3), i*2+3))
return false;
1936 } else if (ShuffleKind == 1) {
1937 unsigned j = IsLE ? 0 : 4;
1938 for (unsigned i = 0; i != 8; i += 4)
1939 if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
1940 !isConstantOrUndef(N->getMaskElt(i+1), i*2+j+1) ||
1941 !isConstantOrUndef(N->getMaskElt(i+2), i*2+j+2) ||
1942 !isConstantOrUndef(N->getMaskElt(i+3), i*2+j+3) ||
1943 !isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
1944 !isConstantOrUndef(N->getMaskElt(i+9), i*2+j+1) ||
1945 !isConstantOrUndef(N->getMaskElt(i+10), i*2+j+2) ||
!isConstantOrUndef(N->getMaskElt(i+11), i*2+j+3))
return false;
}
return true;
}
1952 /// isVMerge - Common function, used to match vmrg* shuffles.
1954 static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
1955 unsigned LHSStart, unsigned RHSStart) {
if (N->getValueType(0) != MVT::v16i8)
return false;
1958 assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
1959 "Unsupported merge size!");
1961 for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
1962 for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
1963 if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
1964 LHSStart+j+i*UnitSize) ||
1965 !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
RHSStart+j+i*UnitSize))
return false;
}
return true;
}
1972 /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
1973 /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
1974 /// The ShuffleKind distinguishes between big-endian merges with two
1975 /// different inputs (0), either-endian merges with two identical inputs (1),
1976 /// and little-endian merges with two different inputs (2). For the latter,
1977 /// the input operands are swapped (see PPCInstrAltivec.td).
1978 bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
1979 unsigned ShuffleKind, SelectionDAG &DAG) {
1980 if (DAG.getDataLayout().isLittleEndian()) {
1981 if (ShuffleKind == 1) // unary
1982 return isVMerge(N, UnitSize, 0, 0);
1983 else if (ShuffleKind == 2) // swapped
1984 return isVMerge(N, UnitSize, 0, 16);
else
return false;
} else {
if (ShuffleKind == 1) // unary
1989 return isVMerge(N, UnitSize, 8, 8);
1990 else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 8, 24);
else
return false;
}
}
1997 /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
1998 /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
1999 /// The ShuffleKind distinguishes between big-endian merges with two
2000 /// different inputs (0), either-endian merges with two identical inputs (1),
2001 /// and little-endian merges with two different inputs (2). For the latter,
2002 /// the input operands are swapped (see PPCInstrAltivec.td).
2003 bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
2004 unsigned ShuffleKind, SelectionDAG &DAG) {
2005 if (DAG.getDataLayout().isLittleEndian()) {
2006 if (ShuffleKind == 1) // unary
2007 return isVMerge(N, UnitSize, 8, 8);
2008 else if (ShuffleKind == 2) // swapped
2009 return isVMerge(N, UnitSize, 8, 24);
else
return false;
} else {
if (ShuffleKind == 1) // unary
2014 return isVMerge(N, UnitSize, 0, 0);
2015 else if (ShuffleKind == 0) // normal
return isVMerge(N, UnitSize, 0, 16);
else
return false;
}
}

/**
2023 * Common function used to match vmrgew and vmrgow shuffles
2025 * The indexOffset determines whether to look for even or odd words in
* the shuffle mask. This is based on the endianness of the target
* machine.
*   - Little Endian:
*     - Use offset of 0 to check for odd elements
*     - Use offset of 4 to check for even elements
*   - Big Endian:
*     - Use offset of 0 to check for even elements
*     - Use offset of 4 to check for odd elements
2034 * A detailed description of the vector element ordering for little endian and
2035 * big endian can be found at
2036 * http://www.ibm.com/developerworks/library/l-ibm-xl-c-cpp-compiler/index.html
2037 * Targeting your applications - what little endian and big endian IBM XL C/C++
2038 * compiler differences mean to you
2040 * The mask to the shuffle vector instruction specifies the indices of the
2041 * elements from the two input vectors to place in the result. The elements are
2042 * numbered in array-access order, starting with the first vector. These vectors
* are always of type v16i8, thus each vector will contain 16 elements of
* 8 bits each. More info on the shuffle vector can be found in the
2045 * http://llvm.org/docs/LangRef.html#shufflevector-instruction
2046 * Language Reference.
2048 * The RHSStartValue indicates whether the same input vectors are used (unary)
2049 * or two different input vectors are used, based on the following:
2050 * - If the instruction uses the same vector for both inputs, the range of the
* indices will be 0 to 15. In this case, the RHSStart value passed should
* be 0.
2053 * - If the instruction has two different vectors then the range of the
2054 * indices will be 0 to 31. In this case, the RHSStart value passed should
2055 * be 16 (indices 0-15 specify elements in the first vector while indices 16
2056 * to 31 specify elements in the second vector).
2058 * \param[in] N The shuffle vector SD Node to analyze
2059 * \param[in] IndexOffset Specifies whether to look for even or odd elements
2060 * \param[in] RHSStartValue Specifies the starting index for the righthand input
2061 * vector to the shuffle_vector instruction
* \return true iff this shuffle vector represents an even or odd word merge
*/
2064 static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset,
2065 unsigned RHSStartValue) {
if (N->getValueType(0) != MVT::v16i8)
return false;
2069 for (unsigned i = 0; i < 2; ++i)
2070 for (unsigned j = 0; j < 4; ++j)
2071 if (!isConstantOrUndef(N->getMaskElt(i*4+j),
2072 i*RHSStartValue+j+IndexOffset) ||
2073 !isConstantOrUndef(N->getMaskElt(i*4+j+8),
i*RHSStartValue+j+IndexOffset+8))
return false;
return true;
}

/**
2080 * Determine if the specified shuffle mask is suitable for the vmrgew or
2081 * vmrgow instructions.
2083 * \param[in] N The shuffle vector SD Node to analyze
2084 * \param[in] CheckEven Check for an even merge (true) or an odd merge (false)
2085 * \param[in] ShuffleKind Identify the type of merge:
2086 * - 0 = big-endian merge with two different inputs;
2087 * - 1 = either-endian merge with two identical inputs;
2088 * - 2 = little-endian merge with two different inputs (inputs are swapped for
2089 * little-endian merges).
2090 * \param[in] DAG The current SelectionDAG
* \return true iff this shuffle mask is suitable for the vmrgew or vmrgow
* instructions
*/
2093 bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven,
2094 unsigned ShuffleKind, SelectionDAG &DAG) {
2095 if (DAG.getDataLayout().isLittleEndian()) {
2096 unsigned indexOffset = CheckEven ? 4 : 0;
2097 if (ShuffleKind == 1) // Unary
2098 return isVMerge(N, indexOffset, 0);
2099 else if (ShuffleKind == 2) // swapped
2100 return isVMerge(N, indexOffset, 16);
else
return false;
} else {
unsigned indexOffset = CheckEven ? 0 : 4;
2106 if (ShuffleKind == 1) // Unary
2107 return isVMerge(N, indexOffset, 0);
2108 else if (ShuffleKind == 0) // Normal
return isVMerge(N, indexOffset, 16);
else
return false;
}
}
2116 /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
2117 /// amount, otherwise return -1.
2118 /// The ShuffleKind distinguishes between big-endian operations with two
2119 /// different inputs (0), either-endian operations with two identical inputs
2120 /// (1), and little-endian operations with two different inputs (2). For the
2121 /// latter, the input operands are swapped (see PPCInstrAltivec.td).
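/// For example, the v16i8 mask <3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18>
/// selects 16 consecutive bytes starting at byte 3 of the concatenated
/// inputs, so for ShuffleKind 0 on a big-endian target this returns a shift
/// amount of 3.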
2122 int PPC::isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind,
2123 SelectionDAG &DAG) {
if (N->getValueType(0) != MVT::v16i8)
return -1;

ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2129 // Find the first non-undef value in the shuffle mask.
unsigned i;
for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
/*search*/;
2134 if (i == 16) return -1; // all undef.
2136 // Otherwise, check to see if the rest of the elements are consecutively
2137 // numbered from this value.
2138 unsigned ShiftAmt = SVOp->getMaskElt(i);
2139 if (ShiftAmt < i) return -1;
ShiftAmt -= i;

bool isLE = DAG.getDataLayout().isLittleEndian();
2144 if ((ShuffleKind == 0 && !isLE) || (ShuffleKind == 2 && isLE)) {
2145 // Check the rest of the elements to see if they are consecutive.
2146 for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
return -1;
2149 } else if (ShuffleKind == 1) {
2150 // Check the rest of the elements to see if they are consecutive.
2151 for (++i; i != 16; ++i)
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
return -1;
} else
return -1;

if (isLE)
ShiftAmt = 16 - ShiftAmt;

return ShiftAmt;
}
2163 /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
2164 /// specifies a splat of a single element that is suitable for input to
2165 /// one of the splat operations (VSPLTB/VSPLTH/VSPLTW/XXSPLTW/LXVDSX/etc.).
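/// For example, splatting word 1 of a v4i32 vector corresponds to the v16i8
/// mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7> with EltSize == 4: the first
/// EltSize indices are consecutive and every later group repeats them.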
2166 bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
2167 EVT VT = N->getValueType(0);
2168 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2169 return EltSize == 8 && N->getMaskElt(0) == N->getMaskElt(1);
2171 assert(VT == MVT::v16i8 && isPowerOf2_32(EltSize) &&
2172 EltSize <= 8 && "Can only handle 1,2,4,8 byte element sizes");
2174 // The consecutive indices need to specify an element, not part of two
2175 // different elements. So abandon ship early if this isn't the case.
if (N->getMaskElt(0) % EltSize != 0)
return false;
2179 // This is a splat operation if each element of the permute is the same, and
2180 // if the value doesn't reference the second vector.
2181 unsigned ElementBase = N->getMaskElt(0);
2183 // FIXME: Handle UNDEF elements too!
if (ElementBase >= 16)
return false;
2187 // Check that the indices are consecutive, in the case of a multi-byte element
2188 // splatted with a v16i8 mask.
2189 for (unsigned i = 1; i != EltSize; ++i)
if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
return false;
2193 for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
2194 if (N->getMaskElt(i) < 0) continue;
2195 for (unsigned j = 0; j != EltSize; ++j)
if (N->getMaskElt(i+j) != N->getMaskElt(j))
return false;
}

return true;
}
2202 /// Check that the mask is shuffling N byte elements. Within each N byte
2203 /// element of the mask, the indices could be either in increasing or
2204 /// decreasing order as long as they are consecutive.
2205 /// \param[in] N the shuffle vector SD Node to analyze
2206 /// \param[in] Width the element width in bytes, could be 2/4/8/16 (HalfWord/
2207 /// Word/DoubleWord/QuadWord).
2208 /// \param[in] StepLen the delta indices number among the N byte element, if
2209 /// the mask is in increasing/decreasing order then it is 1/-1.
2210 /// \return true iff the mask is shuffling N byte elements.
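/// For example, with Width == 4 and StepLen == 1 the mask
/// <8,9,10,11, 0,1,2,3, 24,25,26,27, 4,5,6,7> qualifies: each group of four
/// indices is consecutive and starts at a multiple of four, although the
/// groups themselves may come from anywhere in the two inputs.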
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *N, unsigned Width,
int StepLen) {
assert((Width == 2 || Width == 4 || Width == 8 || Width == 16) &&
"Unexpected element width.");
assert((StepLen == 1 || StepLen == -1) && "Unexpected step length.");
2217 unsigned NumOfElem = 16 / Width;
2218 unsigned MaskVal[16]; // Width is never greater than 16
2219 for (unsigned i = 0; i < NumOfElem; ++i) {
2220 MaskVal[0] = N->getMaskElt(i * Width);
if ((StepLen == 1) && (MaskVal[0] % Width)) {
return false;
} else if ((StepLen == -1) && ((MaskVal[0] + 1) % Width)) {
return false;
}
2227 for (unsigned int j = 1; j < Width; ++j) {
2228 MaskVal[j] = N->getMaskElt(i * Width + j);
if (MaskVal[j] != MaskVal[j-1] + StepLen) {
return false;
}
}
}

return true;
}
2238 bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2239 unsigned &InsertAtByte, bool &Swap, bool IsLE) {
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
2243 // Now we look at mask elements 0,4,8,12
2244 unsigned M0 = N->getMaskElt(0) / 4;
2245 unsigned M1 = N->getMaskElt(4) / 4;
2246 unsigned M2 = N->getMaskElt(8) / 4;
2247 unsigned M3 = N->getMaskElt(12) / 4;
2248 unsigned LittleEndianShifts[] = { 2, 1, 0, 3 };
2249 unsigned BigEndianShifts[] = { 3, 0, 1, 2 };
2251 // Below, let H and L be arbitrary elements of the shuffle mask
2252 // where H is in the range [4,7] and L is in the range [0,3].
2253 // H, 1, 2, 3 or L, 5, 6, 7
2254 if ((M0 > 3 && M1 == 1 && M2 == 2 && M3 == 3) ||
2255 (M0 < 4 && M1 == 5 && M2 == 6 && M3 == 7)) {
2256 ShiftElts = IsLE ? LittleEndianShifts[M0 & 0x3] : BigEndianShifts[M0 & 0x3];
InsertAtByte = IsLE ? 12 : 0;
Swap = M0 < 4;
return true;
}
2261 // 0, H, 2, 3 or 4, L, 6, 7
2262 if ((M1 > 3 && M0 == 0 && M2 == 2 && M3 == 3) ||
2263 (M1 < 4 && M0 == 4 && M2 == 6 && M3 == 7)) {
2264 ShiftElts = IsLE ? LittleEndianShifts[M1 & 0x3] : BigEndianShifts[M1 & 0x3];
InsertAtByte = IsLE ? 8 : 4;
Swap = M1 < 4;
return true;
}
2269 // 0, 1, H, 3 or 4, 5, L, 7
2270 if ((M2 > 3 && M0 == 0 && M1 == 1 && M3 == 3) ||
2271 (M2 < 4 && M0 == 4 && M1 == 5 && M3 == 7)) {
2272 ShiftElts = IsLE ? LittleEndianShifts[M2 & 0x3] : BigEndianShifts[M2 & 0x3];
InsertAtByte = IsLE ? 4 : 8;
Swap = M2 < 4;
return true;
}
2277 // 0, 1, 2, H or 4, 5, 6, L
2278 if ((M3 > 3 && M0 == 0 && M1 == 1 && M2 == 2) ||
2279 (M3 < 4 && M0 == 4 && M1 == 5 && M2 == 6)) {
2280 ShiftElts = IsLE ? LittleEndianShifts[M3 & 0x3] : BigEndianShifts[M3 & 0x3];
InsertAtByte = IsLE ? 0 : 12;
Swap = M3 < 4;
return true;
}
2286 // If both vector operands for the shuffle are the same vector, the mask will
2287 // contain only elements from the first one and the second one will be undef.
2288 if (N->getOperand(1).isUndef()) {
ShiftElts = 0;
Swap = false;
unsigned XXINSERTWSrcElem = IsLE ? 2 : 1;
2292 if (M0 == XXINSERTWSrcElem && M1 == 1 && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 12 : 0;
return true;
}
2296 if (M0 == 0 && M1 == XXINSERTWSrcElem && M2 == 2 && M3 == 3) {
InsertAtByte = IsLE ? 8 : 4;
return true;
}
2300 if (M0 == 0 && M1 == 1 && M2 == XXINSERTWSrcElem && M3 == 3) {
InsertAtByte = IsLE ? 4 : 8;
return true;
}
2304 if (M0 == 0 && M1 == 1 && M2 == 2 && M3 == XXINSERTWSrcElem) {
InsertAtByte = IsLE ? 0 : 12;
return true;
}
}

return false;
}
2313 bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts,
2314 bool &Swap, bool IsLE) {
2315 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2316 // Ensure each byte index of the word is consecutive.
if (!isNByteElemShuffleMask(N, 4, 1))
return false;
2320 // Now we look at mask elements 0,4,8,12, which are the beginning of words.
2321 unsigned M0 = N->getMaskElt(0) / 4;
2322 unsigned M1 = N->getMaskElt(4) / 4;
2323 unsigned M2 = N->getMaskElt(8) / 4;
2324 unsigned M3 = N->getMaskElt(12) / 4;
2326 // If both vector operands for the shuffle are the same vector, the mask will
2327 // contain only elements from the first one and the second one will be undef.
2328 if (N->getOperand(1).isUndef()) {
2329 assert(M0 < 4 && "Indexing into an undef vector?");
2330 if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4)
return false;

ShiftElts = IsLE ? (4 - M0) % 4 : M0;
Swap = false;
return true;
}
2338 // Ensure each word index of the ShuffleVector Mask is consecutive.
2339 if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8)
return false;

if (IsLE) {
if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) {
2344 // Input vectors don't need to be swapped if the leading element
2345 // of the result is one of the 3 left elements of the second vector
2346 // (or if there is no shift to be done at all).
Swap = false;
ShiftElts = (8 - M0) % 8;
2349 } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) {
2350 // Input vectors need to be swapped if the leading element
2351 // of the result is one of the 3 left elements of the first vector
2352 // (or if we're shifting by 4 - thereby simply swapping the vectors).
Swap = true;
ShiftElts = (4 - M0) % 4;
}
return true;
} else {
2359 if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) {
2360 // Input vectors don't need to be swapped if the leading element
// of the result is one of the 4 elements of the first vector.
Swap = false;
2364 } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) {
2365 // Input vectors need to be swapped if the leading element
// of the result is one of the 4 elements of the right vector.
Swap = true;
}
ShiftElts = M0;
return true;
}
}
2375 bool static isXXBRShuffleMaskHelper(ShuffleVectorSDNode *N, int Width) {
2376 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
if (!isNByteElemShuffleMask(N, Width, -1))
return false;
2381 for (int i = 0; i < 16; i += Width)
if (N->getMaskElt(i) != i + Width - 1)
return false;

return true;
}
2388 bool PPC::isXXBRHShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 2);
}
2392 bool PPC::isXXBRWShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 4);
}
2396 bool PPC::isXXBRDShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 8);
}
2400 bool PPC::isXXBRQShuffleMask(ShuffleVectorSDNode *N) {
return isXXBRShuffleMaskHelper(N, 16);
}
2404 /// Can node \p N be lowered to an XXPERMDI instruction? If so, set \p Swap
2405 /// if the inputs to the instruction should be swapped and set \p DM to the
2406 /// value for the immediate.
2407 /// Specifically, set \p Swap to true only if \p N can be lowered to XXPERMDI
2408 /// AND element 0 of the result comes from the first input (LE) or second input
2409 /// (BE). Set \p DM to the calculated result (0-3) only if \p N can be lowered.
/// \return true iff the given mask of shuffle node \p N is a XXPERMDI shuffle
/// mask.
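/// As a worked example on big-endian: the v16i8 mask
/// <0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31> gives M0 = 0 and M1 = 3, so the
/// inputs are swapped and DM is computed as (0 << 1) + (3 & 1) = 1.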
2412 bool PPC::isXXPERMDIShuffleMask(ShuffleVectorSDNode *N, unsigned &DM,
2413 bool &Swap, bool IsLE) {
2414 assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8");
2416 // Ensure each byte index of the double word is consecutive.
if (!isNByteElemShuffleMask(N, 8, 1))
return false;
2420 unsigned M0 = N->getMaskElt(0) / 8;
2421 unsigned M1 = N->getMaskElt(8) / 8;
2422 assert(((M0 | M1) < 4) && "A mask element out of bounds?");
2424 // If both vector operands for the shuffle are the same vector, the mask will
2425 // contain only elements from the first one and the second one will be undef.
2426 if (N->getOperand(1).isUndef()) {
2427 if ((M0 | M1) < 2) {
DM = IsLE ? (((~M1) & 1) << 1) + ((~M0) & 1) : (M0 << 1) + (M1 & 1);
Swap = false;
return true;
} else
return false;
}

if (IsLE) {
if (M0 > 1 && M1 < 2) {
Swap = true;
} else if (M0 < 2 && M1 > 1) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = false;
} else
return false;
2445 // Note: if control flow comes here that means Swap is already set above
DM = (((~M1) & 1) << 1) + ((~M0) & 1);
return true;
} else {
if (M0 < 2 && M1 > 1) {
Swap = true;
} else if (M0 > 1 && M1 < 2) {
M0 = (M0 + 2) % 4;
M1 = (M1 + 2) % 4;
Swap = false;
} else
return false;
2458 // Note: if control flow comes here that means Swap is already set above
DM = (M0 << 1) + (M1 & 1);
return true;
}
}
2465 /// getSplatIdxForPPCMnemonics - Return the splat index as a value that is
2466 /// appropriate for PPC mnemonics (which have a big endian bias - namely
2467 /// elements are counted from the left of the vector register).
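/// For example, a little-endian v4i32 splat of element 0 (mask element 0,
/// EltSize == 4) is reported as splat index (16/4) - 1 - 0 = 3, because the
/// mnemonics count elements from the left (big-endian end) of the register.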
2468 unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
2469 SelectionDAG &DAG) {
2470 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
2471 assert(isSplatShuffleMask(SVOp, EltSize));
2472 EVT VT = SVOp->getValueType(0);
2474 if (VT == MVT::v2i64 || VT == MVT::v2f64)
2475 return DAG.getDataLayout().isLittleEndian() ? 1 - SVOp->getMaskElt(0)
2476 : SVOp->getMaskElt(0);
2478 if (DAG.getDataLayout().isLittleEndian())
2479 return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
else
return SVOp->getMaskElt(0) / EltSize;
}
2484 /// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
2485 /// by using a vspltis[bhw] instruction of the specified element size, return
2486 /// the constant being splatted. The ByteSize field indicates the number of
2487 /// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
SDValue OpVal;
2491 // If ByteSize of the splat is bigger than the element size of the
2492 // build_vector, then we have a case where we are checking for a splat where
2493 // multiple elements of the buildvector are folded together into a single
// logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
2495 unsigned EltSize = 16/N->getNumOperands();
2496 if (EltSize < ByteSize) {
2497 unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
2498 SDValue UniquedVals[4];
2499 assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
// See if all of the elements in the buildvector agree across each chunk.
2502 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2503 if (N->getOperand(i).isUndef()) continue;
2504 // If the element isn't a constant, bail fully out.
2505 if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
2507 if (!UniquedVals[i&(Multiple-1)].getNode())
2508 UniquedVals[i&(Multiple-1)] = N->getOperand(i);
2509 else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
2510 return SDValue(); // no match.
2513 // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
2514 // either constant or undef values that are identical for each chunk. See
2515 // if these chunks can form into a larger vspltis*.
2517 // Check to see if all of the leading entries are either 0 or -1. If
2518 // neither, then this won't fit into the immediate field.
2519 bool LeadingZero = true;
2520 bool LeadingOnes = true;
2521 for (unsigned i = 0; i != Multiple-1; ++i) {
2522 if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
2524 LeadingZero &= isNullConstant(UniquedVals[i]);
2525 LeadingOnes &= isAllOnesConstant(UniquedVals[i]);
2527 // Finally, check the least significant entry.
if (LeadingZero) {
if (!UniquedVals[Multiple-1].getNode())
2530 return DAG.getTargetConstant(0, SDLoc(N), MVT::i32); // 0,0,0,undef
2531 int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
2532 if (Val < 16) // 0,0,0,4 -> vspltisw(4)
2533 return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}
if (LeadingOnes) {
if (!UniquedVals[Multiple-1].getNode())
2537 return DAG.getTargetConstant(~0U, SDLoc(N), MVT::i32); // -1,-1,-1,undef
int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
2539 if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
return DAG.getTargetConstant(Val, SDLoc(N), MVT::i32);
}

return SDValue();
}
2546 // Check to see if this buildvec has a single non-undef value in its elements.
2547 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
2548 if (N->getOperand(i).isUndef()) continue;
2549 if (!OpVal.getNode())
2550 OpVal = N->getOperand(i);
else if (OpVal != N->getOperand(i))
return SDValue();
}
2555 if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
2557 unsigned ValSizeInBytes = EltSize;
uint64_t Value = 0;
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
2560 Value = CN->getZExtValue();
2561 } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
2562 assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
Value = llvm::bit_cast<uint32_t>(CN->getValueAPF().convertToFloat());
}
2566 // If the splat value is larger than the element value, then we can never do
2567 // this splat. The only case that we could fit the replicated bits into our
2568 // immediate field for would be zero, and we prefer to use vxor for it.
2569 if (ValSizeInBytes < ByteSize) return SDValue();
2571 // If the element value is larger than the splat value, check if it consists
2572 // of a repeated bit pattern of size ByteSize.
if (!APInt(ValSizeInBytes * 8, Value).isSplat(ByteSize * 8))
return SDValue();
2576 // Properly sign extend the value.
2577 int MaskVal = SignExtend32(Value, ByteSize * 8);
2579 // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
2580 if (MaskVal == 0) return SDValue();
2582 // Finally, if this value fits in a 5 bit sext field, return it
2583 if (SignExtend32<5>(MaskVal) == MaskVal)
return DAG.getTargetConstant(MaskVal, SDLoc(N), MVT::i32);

return SDValue();
}
2588 //===----------------------------------------------------------------------===//
2589 // Addressing Mode Selection
2590 //===----------------------------------------------------------------------===//
2592 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit
2593 /// or 64-bit immediate, and if the value can be accurately represented as a
/// sign extension from a 16-bit value. If so, this returns true and the
/// immediate.
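/// For example, on an i32 node the constant -32768 (0xFFFF8000) is a valid
/// signed 16-bit immediate, while 32768 (0x00008000) is not.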
2596 bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) {
2597 if (!isa<ConstantSDNode>(N))
return false;
Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue();
2601 if (N->getValueType(0) == MVT::i32)
2602 return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
else
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
}
2606 bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) {
return isIntS16Immediate(Op.getNode(), Imm);
}
2610 /// Used when computing address flags for selecting loads and stores.
2611 /// If we have an OR, check if the LHS and RHS are provably disjoint.
2612 /// An OR of two provably disjoint values is equivalent to an ADD.
2613 /// Most PPC load/store instructions compute the effective address as a sum,
2614 /// so doing this conversion is useful.
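/// For example, if the low four bits of the LHS are known to be zero, then
/// (LHS | 12) and (LHS + 12) compute the same value, so the OR can be
/// treated as an ADD when forming addresses.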
2615 static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) {
2616 if (N.getOpcode() != ISD::OR)
return false;
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2619 if (!LHSKnown.Zero.getBoolValue())
return false;
KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
return (~(LHSKnown.Zero | RHSKnown.Zero) == 0);
}
2625 /// SelectAddressEVXRegReg - Given the specified address, check to see if it can
2626 /// be represented as an indexed [r+r] operation.
2627 bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
SDValue &Index,
SelectionDAG &DAG) const {
2630 for (SDNode *U : N->uses()) {
2631 if (MemSDNode *Memop = dyn_cast<MemSDNode>(U)) {
2632 if (Memop->getMemoryVT() == MVT::f64) {
2633 Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}
return false;
}
/// isIntS34Immediate - This method tests whether the value of the given node
/// can be accurately represented as a sign extension from a 34-bit value. If
/// so, this returns true and the immediate.
2645 bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
2646 if (!isa<ConstantSDNode>(N))
return false;
Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
return isInt<34>(Imm);
}
2652 bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
return isIntS34Immediate(Op.getNode(), Imm);
}
/// SelectAddressRegReg - Given the specified address, check to see if it
2657 /// can be represented as an indexed [r+r] operation. Returns false if it
2658 /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
2659 /// non-zero and N can be represented by a base register plus a signed 16-bit
2660 /// displacement, make a more precise judgement by checking (displacement % \p
2661 /// EncodingAlignment).
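/// For example, DS-form instructions such as LD require the displacement to
/// be a multiple of 4, so a caller passes an EncodingAlignment of 4; an
/// address like (r3 + 6) then fails the [r+imm] check and is selected as
/// [r+r] instead.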
2662 bool PPCTargetLowering::SelectAddressRegReg(
2663 SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
2664 MaybeAlign EncodingAlignment) const {
// If we have a PC Relative target flag don't select as [reg+reg]. It will be
// selected as a PC-relative access instead.
if (SelectAddressPCRel(N, Base))
return false;

int16_t Imm = 0;
if (N.getOpcode() == ISD::ADD) {
// Is there an SPE (f64) load/store that can't handle a 16-bit offset?
// SPE loads/stores can only handle 8-bit offsets.
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
return true;
2676 if (isIntS16Immediate(N.getOperand(1), Imm) &&
2677 (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
2678 return false; // r+i
2679 if (N.getOperand(1).getOpcode() == PPCISD::Lo)
2680 return false; // r+i
2682 Base = N.getOperand(0);
Index = N.getOperand(1);
return true; // [r+r]
2685 } else if (N.getOpcode() == ISD::OR) {
2686 if (isIntS16Immediate(N.getOperand(1), Imm) &&
2687 (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // Prefer r+i when we can fold it.
2690 // If this is an or of disjoint bitfields, we can codegen this as an add
2691 // (for better address arithmetic) if the LHS and RHS of the OR are provably
// disjoint.
KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2695 if (LHSKnown.Zero.getBoolValue()) {
2696 KnownBits RHSKnown = DAG.computeKnownBits(N.getOperand(1));
2697 // If all of the bits are known zero on the LHS or RHS, the add won't
// change the result.
if (~(LHSKnown.Zero | RHSKnown.Zero) == 0) {
2700 Base = N.getOperand(0);
Index = N.getOperand(1);
return true;
}
}
}

return false;
}
2710 // If we happen to be doing an i64 load or store into a stack slot that has
2711 // less than a 4-byte alignment, then the frame-index elimination may need to
2712 // use an indexed load or store instruction (because the offset may not be a
2713 // multiple of 4). The extra register needed to hold the offset comes from the
2714 // register scavenger, and it is possible that the scavenger will need to use
2715 // an emergency spill slot. As a result, we need to make sure that a spill slot
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
// stack slot.
2718 static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
2719 // FIXME: This does not handle the LWA case.
if (VT != MVT::i64)
return;

// NOTE: We'll exclude negative FIs here, which come from argument
2724 // lowering, because there are no known test cases triggering this problem
2725 // using packed structures (or similar). We can remove this exclusion if
2726 // we find such a test case. The reason why this is so test-case driven is
2727 // because this entire 'fixup' is only to prevent crashes (from the
2728 // register scavenger) on not-really-valid inputs. For example, if we have:
// %a = alloca i1
// %b = bitcast i1* %a to i64*
2731 // store i64* a, i64 b
2732 // then the store should really be marked as 'align 1', but is not. If it
2733 // were marked as 'align 1' then the indexed form would have been
2734 // instruction-selected initially, and the problem this 'fixup' is preventing
// won't happen regardless.
if (FrameIdx < 0)
return;
2739 MachineFunction &MF = DAG.getMachineFunction();
2740 MachineFrameInfo &MFI = MF.getFrameInfo();
2742 if (MFI.getObjectAlign(FrameIdx) >= Align(4))
return;

PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasNonRISpills();
}
2749 /// Returns true if the address N can be represented by a base register plus
2750 /// a signed 16-bit displacement [r+imm], and if it is not better
2751 /// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
2752 /// displacements that are multiples of that value.
2753 bool PPCTargetLowering::SelectAddressRegImm(
2754 SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
2755 MaybeAlign EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);

// If we have a PC Relative target flag don't select as [reg+imm]. It will be
// selected as a PC-relative access instead.
if (SelectAddressPCRel(N, Base))
return false;
2764 // If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
2771 (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
2772 Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
2773 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
2774 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2775 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
2777 Base = N.getOperand(0);
2779 return true; // [r+i]
2780 } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
2781 // Match LOAD (ADD (X, Lo(G))).
2782 assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
2783 && "Cannot handle constant offsets yet!");
2784 Disp = N.getOperand(1).getOperand(0); // The global address.
2785 assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
2786 Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
2787 Disp.getOpcode() == ISD::TargetConstantPool ||
2788 Disp.getOpcode() == ISD::TargetJumpTable);
2789 Base = N.getOperand(0);
2790 return true; // [&g+r]
2792 } else if (N.getOpcode() == ISD::OR) {
2794 if (isIntS16Immediate(N.getOperand(1), imm) &&
2795 (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
2796 // If this is an or of disjoint bitfields, we can codegen this as an add
2797 // (for better address arithmetic) if the LHS and RHS of the OR are
2798 // provably disjoint.
2799 KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
2801 if ((LHSKnown.Zero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
2802 // If all of the bits are known zero on the LHS or RHS, the add won't
2804 if (FrameIndexSDNode *FI =
2805 dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
2806 Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
2807 fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
2809 Base = N.getOperand(0);
2811 Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
2815 } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2816 // Loading from a constant address.
2818 // If this address fits entirely in a 16-bit sext immediate field, codegen
2821 if (isIntS16Immediate(CN, Imm) &&
2822 (!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
2823 Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
2824 Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
2825 CN->getValueType(0));
2829 // Handle 32-bit sext immediates with LIS + addr mode.
2830 if ((CN->getValueType(0) == MVT::i32 ||
2831 (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
2832 (!EncodingAlignment ||
2833 isAligned(*EncodingAlignment, CN->getZExtValue()))) {
2834 int Addr = (int)CN->getZExtValue();
2836 // Otherwise, break this down into an LIS + disp.
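      // For example, Addr = 0x12348000: the low 16 bits read as a signed
      // value are -0x8000, so Disp = -0x8000 and Base = (0x12348000 -
      // (-0x8000)) >> 16 = 0x1235; LIS materializes 0x12350000, and adding
      // the displacement reproduces 0x12350000 - 0x8000 = 0x12348000.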
      Disp = DAG.getTargetConstant((short)Addr, dl, MVT::i32);

      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, dl,
                                   MVT::i32);
      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
      return true;
    }
  }

  Disp = DAG.getTargetConstant(0, dl, getPointerTy(DAG.getDataLayout()));
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
  } else
    Base = N;
  return true; // [r+0]
}

/// Similar to the 16-bit case but for instructions that take a 34-bit
/// displacement field (prefixed loads/stores).
bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
                                              SDValue &Base,
                                              SelectionDAG &DAG) const {
  // Only on 64-bit targets.
  if (N.getValueType() != MVT::i64)
    return false;

  SDLoc dl(N);
  int64_t Imm = 0;

  if (N.getOpcode() == ISD::ADD) {
    if (!isIntS34Immediate(N.getOperand(1), Imm))
      return false;
    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
    else
      Base = N.getOperand(0);
    return true;
  }

  if (N.getOpcode() == ISD::OR) {
    if (!isIntS34Immediate(N.getOperand(1), Imm))
      return false;
    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are
    // provably disjoint.
    KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
    if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
      return false;
    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
    else
      Base = N.getOperand(0);
    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
    return true;
  }

  if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
    Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
    return true;
  }

  return false;
}

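// Note: the 34-bit displacement corresponds to the prefixed load/store
// encodings introduced in Power ISA 3.1 (e.g. pld, pstd, plxsd), which carry
// the additional displacement bits in a prefix word.
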
/// SelectAddressRegRegOnly - Given the specified address, force it to be
/// represented as an indexed [r+r] operation.
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
                                                SDValue &Index,
                                                SelectionDAG &DAG) const {
  // Check to see if we can easily represent this as an [r+r] address. This
  // will fail if it thinks that the address is more profitably represented as
  // reg+imm, e.g. where imm = 0.
  if (SelectAddressRegReg(N, Base, Index, DAG))
    return true;

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add. However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register. We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  int16_t imm = 0;
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(N.getOperand(1), imm) ||
       !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  }

  // Otherwise, do it the hard way, using R0 as the base register.
  Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         N.getValueType());
  Index = N;
  return true;
}

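// Note: in the indexed load/store forms, encoding r0 in the RA (base)
// position reads as the constant zero rather than the contents of r0, so
// ZERO/ZERO8 acts as a "no base" placeholder and the whole address comes
// from the index register.
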
template <typename Ty> static bool isValidPCRelNode(SDValue N) {
  Ty *PCRelCand = dyn_cast<Ty>(N);
  return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
}

/// Returns true if this address is a PC Relative address.
/// PC Relative addresses are marked with the flag PPCII::MO_PCREL_FLAG
/// or if the node opcode is PPCISD::MAT_PCREL_ADDR.
bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
  // This is a materialize PC Relative node. Always select this as PC Relative.
  Base = N;
  if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
    return true;
  if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
      isValidPCRelNode<GlobalAddressSDNode>(N) ||
      isValidPCRelNode<JumpTableSDNode>(N) ||
      isValidPCRelNode<BlockAddressSDNode>(N))
    return true;
  return false;
}

/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd) instead of a load into GPR + direct move sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
  // If there are any uses other than scalar to vector, then we should keep it
  // as a scalar load -> direct move pattern to prevent multiple loads.
  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
  if (!LD)
    return false;

  EVT MemVT = LD->getMemoryVT();
  if (!MemVT.isSimple())
    return false;
  switch(MemVT.getSimpleVT().SimpleTy) {
  case MVT::i64:
    break;
  case MVT::i32:
    if (!ST.hasP8Vector())
      return false;
    break;
  case MVT::i16:
  case MVT::i8:
    if (!ST.hasP9Vector())
      return false;
    break;
  default:
    return false;
  }

  SDValue LoadedVal(N, 0);
  if (!LoadedVal.hasOneUse())
    return false;

  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
       UI != UE; ++UI)
    if (UI.getUse().get().getResNo() == 0 &&
        UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
        UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
      return false;

  return true;
}

/// getPreIndexedAddressParts - returns true by value, and sets the base
/// pointer, offset pointer, and addressing mode by reference, if the node's
/// address can be legally represented as a pre-indexed load / store address.
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
                                                  SelectionDAG &DAG) const {
  if (DisablePPCPreinc) return false;

  bool isLoad = true;
  SDValue Ptr;
  EVT VT;
  Align Alignment;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    Ptr = LD->getBasePtr();
    VT = LD->getMemoryVT();
    Alignment = LD->getAlign();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    Ptr = ST->getBasePtr();
    VT = ST->getMemoryVT();
    Alignment = ST->getAlign();
    isLoad = false;
  } else
    return false;

  // Do not generate pre-inc forms for specific loads that feed scalar_to_vector
  // instructions because we can fold these into a more efficient instruction
  // instead (such as LXSD).
  if (isLoad && usePartialVectorLoads(N, Subtarget)) {
    return false;
  }

  // PowerPC doesn't have preinc load/store instructions for vectors.
  if (VT.isVector())
    return false;

  if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
    // Common code will reject creating a pre-inc form if the base pointer
    // is a frame index, or if N is a store and the base pointer is either
    // the same as or a predecessor of the value being stored. Check for
    // those situations here, and try with swapped Base/Offset instead.
    bool Swap = false;

    if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
      Swap = true;
    else if (!isLoad) {
      SDValue Val = cast<StoreSDNode>(N)->getValue();
      if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
        Swap = true;
    }

    if (Swap)
      std::swap(Base, Offset);

    AM = ISD::PRE_INC;
    return true;
  }

  // LDU/STU can only handle immediates that are a multiple of 4.
  if (VT != MVT::i64) {
    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, std::nullopt))
      return false;
  } else {
    // LDU/STU need an address with at least 4-byte alignment.
    if (Alignment < Align(4))
      return false;

    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
      return false;
  }

  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    // PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
    // sext i32 to i64 when addr mode is r+i.
    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
        LD->getExtensionType() == ISD::SEXTLOAD &&
        isa<ConstantSDNode>(Offset))
      return false;
  }

  AM = ISD::PRE_INC;
  return true;
}

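// Illustration: a pre-increment load such as "ldu r5, 8(r4)" loads from
// r4 + 8 and writes that effective address back into r4, which is why the
// base must be an updatable register rather than a frame index.
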
//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Set HiOpFlags and LoOpFlags to the target MO flags for the high and low
/// parts of a label reference, adding the PIC flag when labels are referenced
/// via a PIC base.
static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
                               unsigned &HiOpFlags, unsigned &LoOpFlags,
                               const GlobalValue *GV = nullptr) {
  HiOpFlags = PPCII::MO_HA;
  LoOpFlags = PPCII::MO_LO;

  // Don't use the pic base if not in PIC relocation model.
  if (IsPIC) {
    HiOpFlags |= PPCII::MO_PIC_FLAG;
    LoOpFlags |= PPCII::MO_PIC_FLAG;
  }
}

static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                             SelectionDAG &DAG) {
  SDLoc DL(HiPart);
  EVT PtrVT = HiPart.getValueType();
  SDValue Zero = DAG.getConstant(0, DL, PtrVT);

  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);

  // With PIC, the first instruction is actually "GR+hi(&G)".
  if (isPIC)
    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);

  // Generate non-PIC code that has direct accesses to the constant pool.
  // The address of the global is just (hi(&g)+lo(&g)).
  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
}

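// Note: MO_HA/MO_LO correspond to the @ha/@l relocations. @ha is the "high
// adjusted" half, ha(x) = (x + 0x8000) >> 16, which pre-compensates for the
// sign of the low 16 bits so that an addis + addi pair reassembles the full
// 32-bit offset exactly.
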
static void setUsesTOCBasePtr(MachineFunction &MF) {
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setUsesTOCBasePtr();
}

static void setUsesTOCBasePtr(SelectionDAG &DAG) {
  setUsesTOCBasePtr(DAG.getMachineFunction());
}

SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
                                       SDValue GA) const {
  const bool Is64Bit = Subtarget.isPPC64();
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT)
                        : Subtarget.isAIXABI()
                              ? DAG.getRegister(PPC::R2, VT)
                              : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT);
  SDValue Ops[] = { GA, Reg };
  return DAG.getMemIntrinsicNode(
      PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
      MachinePointerInfo::getGOT(DAG.getMachineFunction()), std::nullopt,
      MachineMemOperand::MOLoad);
}

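// Note: TOC_ENTRY is later matched to a TOC-relative load (on 64-bit ELF,
// something like "ld rD, sym@toc(r2)" with r2 holding the TOC base); the
// MachineMemOperand above describes it as a load from the GOT/TOC.
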
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  const Constant *C = CP->getConstVal();

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDLoc DL(CP);
      EVT Ty = getPointerTy(DAG.getDataLayout());
      SDValue ConstPool = DAG.getTargetConstantPool(
          C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
      return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
    }
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA =
        DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(CP), GA);
  }

  SDValue CPIHi =
      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
  SDValue CPILo =
      DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
  return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}

// For 64-bit PowerPC, prefer the more compact relative encodings.
// This trades 32 bits per jump table entry for one or two instructions
// at the jump site.
unsigned PPCTargetLowering::getJumpTableEncoding() const {
  if (isJumpTableRelative())
    return MachineJumpTableInfo::EK_LabelDifference32;

  return TargetLowering::getJumpTableEncoding();
}

bool PPCTargetLowering::isJumpTableRelative() const {
  if (UseAbsoluteJumpTables)
    return false;
  if (Subtarget.isPPC64() || Subtarget.isAIXABI())
    return true;
  return TargetLowering::isJumpTableRelative();
}

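// Note: EK_LabelDifference32 stores each entry as a 32-bit "label minus
// jump-table base" difference rather than a full 64-bit pointer; the base is
// re-added at the jump site, which is where the extra instructions go.
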
SDValue PPCTargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Small:
  case CodeModel::Medium:
    return TargetLowering::getPICJumpTableRelocBase(Table, DAG);
  default:
    return DAG.getNode(PPCISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  }
}

const MCExpr *
PPCTargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                                unsigned JTI,
                                                MCContext &Ctx) const {
  if (!Subtarget.isPPC64() || Subtarget.isAIXABI())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  switch (getTargetMachine().getCodeModel()) {
  case CodeModel::Small:
  case CodeModel::Medium:
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
  default:
    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
  }
}

SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // isUsingPCRelativeCalls() returns true when PCRelative is enabled.
  if (Subtarget.isUsingPCRelativeCalls()) {
    SDLoc DL(JT);
    EVT Ty = getPointerTy(DAG.getDataLayout());
    SDValue GA =
        DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
    SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
    return MatAddr;
  }

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
    return getTOCEntry(DAG, SDLoc(JT), GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
                                        PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, SDLoc(GA), GA);
  }

  SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
  SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
  return LowerLabelRef(JTIHi, JTILo, IsPIC, DAG);
}

SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
                                             SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
  const BlockAddress *BA = BASDN->getBlockAddress();

  // isUsingPCRelativeCalls() returns true when PCRelative is enabled.
  if (Subtarget.isUsingPCRelativeCalls()) {
    SDLoc DL(BASDN);
    EVT Ty = getPointerTy(DAG.getDataLayout());
    SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
                                           PPCII::MO_PCREL_FLAG);
    SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
    return MatAddr;
  }

  // 64-bit SVR4 ABI and AIX ABI code are always position-independent.
  // The actual BlockAddress is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset());
    return getTOCEntry(DAG, SDLoc(BASDN), GA);
  }

  // 32-bit position-independent ELF stores the BlockAddress in the .got.
  if (Subtarget.is32BitELFABI() && isPositionIndependent())
    return getTOCEntry(
        DAG, SDLoc(BASDN),
        DAG.getTargetBlockAddress(BA, PtrVT, BASDN->getOffset()));

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
  SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
  SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
  return LowerLabelRef(TgtBAHi, TgtBALo, IsPIC, DAG);
}

SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    return LowerGlobalTLSAddressAIX(Op, DAG);

  return LowerGlobalTLSAddressLinux(Op, DAG);
}

SDValue PPCTargetLowering::LowerGlobalTLSAddressAIX(SDValue Op,
                                                    SelectionDAG &DAG) const {
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS())
    report_fatal_error("Emulated TLS is not yet supported on AIX");

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool Is64Bit = Subtarget.isPPC64();
  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);

  if (Model == TLSModel::LocalExec) {
    SDValue VariableOffsetTGA =
        DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TPREL_FLAG);
    SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
    SDValue TLSReg;
    if (Is64Bit)
      // For local-exec on AIX (64-bit), the generated sequence loads the
      // variable offset from the TOC, then adds it to R13 (the thread
      // pointer):
      //    ld reg1,var[TC](2)
      //    add reg2, reg1, r13     // r13 contains the thread pointer
      TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
    else
      // For local-exec on AIX (32-bit), the generated sequence loads the
      // variable offset from the TOC, calls .__get_tpointer to get the
      // thread pointer (which will be in R3), and adds the two together:
      //    lwz reg1,var[TC](2)
      //    bla .__get_tpointer
      //    add reg2, reg1, r3
      TLSReg = DAG.getNode(PPCISD::GET_TPOINTER, dl, PtrVT);
    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, VariableOffset);
  }

  // The Local-Exec and General-Dynamic TLS models are currently the only
  // supported access models. If Local-exec is not possible or specified, all
  // GlobalTLSAddress nodes are lowered using the general-dynamic model.
  // We need to generate two TOC entries, one for the variable offset, one for
  // the region handle. The global address for the TOC entry of the region
  // handle is created with the MO_TLSGDM_FLAG flag and the global address
  // for the TOC entry of the variable offset is created with MO_TLSGD_FLAG.
  SDValue VariableOffsetTGA =
      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGD_FLAG);
  SDValue RegionHandleTGA =
      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, PPCII::MO_TLSGDM_FLAG);
  SDValue VariableOffset = getTOCEntry(DAG, dl, VariableOffsetTGA);
  SDValue RegionHandle = getTOCEntry(DAG, dl, RegionHandleTGA);
  return DAG.getNode(PPCISD::TLSGD_AIX, dl, PtrVT, VariableOffset,
                     RegionHandle);
}

SDValue PPCTargetLowering::LowerGlobalTLSAddressLinux(SDValue Op,
                                                      SelectionDAG &DAG) const {
  // FIXME: TLS addresses currently use medium model code sequences,
  // which is the most useful form. Eventually support for small and
  // large models could be added if users need it, at the cost of
  // additional complexity.
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  SDLoc dl(GA);
  const GlobalValue *GV = GA->getGlobal();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool is64bit = Subtarget.isPPC64();
  const Module *M = DAG.getMachineFunction().getFunction().getParent();
  PICLevel::Level picLevel = M->getPICLevel();

  const TargetMachine &TM = getTargetMachine();
  TLSModel::Model Model = TM.getTLSModel(GV);

  if (Model == TLSModel::LocalExec) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TLSReg = DAG.getRegister(PPC::X13, MVT::i64);
      SDValue TGA = DAG.getTargetGlobalAddress(
          GV, dl, PtrVT, 0, (PPCII::MO_PCREL_FLAG | PPCII::MO_TPREL_FLAG));
      SDValue MatAddr =
          DAG.getNode(PPCISD::TLS_LOCAL_EXEC_MAT_ADDR, dl, PtrVT, TGA);
      return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TLSReg, MatAddr);
    }

    SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_TPREL_HA);
    SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_TPREL_LO);
    SDValue TLSReg = is64bit ? DAG.getRegister(PPC::X13, MVT::i64)
                             : DAG.getRegister(PPC::R2, MVT::i32);

    SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
    return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
  }

  if (Model == TLSModel::InitialExec) {
    bool IsPCRel = Subtarget.isUsingPCRelativeCalls();
    SDValue TGA = DAG.getTargetGlobalAddress(
        GV, dl, PtrVT, 0, IsPCRel ? PPCII::MO_GOT_TPREL_PCREL_FLAG : 0);
    SDValue TGATLS = DAG.getTargetGlobalAddress(
        GV, dl, PtrVT, 0,
        IsPCRel ? (PPCII::MO_TLS | PPCII::MO_PCREL_FLAG) : PPCII::MO_TLS);
    SDValue TPOffset;
    if (IsPCRel) {
      SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, dl, PtrVT, TGA);
      TPOffset = DAG.getLoad(MVT::i64, dl, DAG.getEntryNode(), MatPCRel,
                             MachinePointerInfo());
    } else {
      SDValue GOTPtr;
      if (is64bit) {
        setUsesTOCBasePtr(DAG);
        SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
        GOTPtr =
            DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, PtrVT, GOTReg, TGA);
      } else {
        if (!TM.isPositionIndependent())
          GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
        else if (picLevel == PICLevel::SmallPIC)
          GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
        else
          GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
      }
      TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, PtrVT, TGA, GOTPtr);
    }
    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
  }

  if (Model == TLSModel::GeneralDynamic) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_GOT_TLSGD_PCREL_FLAG);
      return DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);
    }

    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
                           GOTReg, TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
      else
        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
    }
    return DAG.getNode(PPCISD::ADDI_TLSGD_L_ADDR, dl, PtrVT,
                       GOTPtr, TGA, TGA);
  }

  if (Model == TLSModel::LocalDynamic) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                               PPCII::MO_GOT_TLSLD_PCREL_FLAG);
      SDValue MatPCRel =
          DAG.getNode(PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR, dl, PtrVT, TGA);
      return DAG.getNode(PPCISD::PADDI_DTPREL, dl, PtrVT, MatPCRel, TGA);
    }

    SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
    SDValue GOTPtr;
    if (is64bit) {
      setUsesTOCBasePtr(DAG);
      SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
      GOTPtr = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
                           GOTReg, TGA);
    } else {
      if (picLevel == PICLevel::SmallPIC)
        GOTPtr = DAG.getNode(PPCISD::GlobalBaseReg, dl, PtrVT);
      else
        GOTPtr = DAG.getNode(PPCISD::PPC32_PICGOT, dl, PtrVT);
    }
    SDValue TLSAddr = DAG.getNode(PPCISD::ADDI_TLSLD_L_ADDR, dl,
                                  PtrVT, GOTPtr, TGA, TGA);
    SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl,
                                      PtrVT, TLSAddr, TGA);
    return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
  }

  llvm_unreachable("Unknown TLS model!");
}

SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
  EVT PtrVT = Op.getValueType();
  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
  SDLoc DL(GSDN);
  const GlobalValue *GV = GSDN->getGlobal();

  // 64-bit SVR4 ABI & AIX ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
    if (Subtarget.isUsingPCRelativeCalls()) {
      EVT Ty = getPointerTy(DAG.getDataLayout());
      if (isAccessedAsGotIndirect(Op)) {
        SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
                                                PPCII::MO_PCREL_FLAG |
                                                    PPCII::MO_GOT_FLAG);
        SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
        SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
                                   MachinePointerInfo());
        return Load;
      } else {
        SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
                                                PPCII::MO_PCREL_FLAG);
        return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
      }
    }
    setUsesTOCBasePtr(DAG);
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
    return getTOCEntry(DAG, DL, GA);
  }

  unsigned MOHiFlag, MOLoFlag;
  bool IsPIC = isPositionIndependent();
  getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag, GV);

  if (IsPIC && Subtarget.isSVR4ABI()) {
    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
                                            GSDN->getOffset(),
                                            PPCII::MO_PIC_FLAG);
    return getTOCEntry(DAG, DL, GA);
  }

  SDValue GAHi =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
  SDValue GALo =
      DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);

  return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
}

SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  ISD::CondCode CC =
      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
  SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);
  SDValue RHS = Op.getOperand(IsStrict ? 2 : 1);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  EVT LHSVT = LHS.getValueType();
  SDLoc dl(Op);

  // Soften the setcc with a libcall if it is fp128.
  if (LHSVT == MVT::f128) {
    assert(!Subtarget.hasP9Vector() &&
           "SETCC for f128 is already legal under Power9!");
    softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
                        Op->getOpcode() == ISD::STRICT_FSETCCS);
    if (RHS.getNode())
      LHS = DAG.getNode(ISD::SETCC, dl, Op.getValueType(), LHS, RHS,
                        DAG.getCondCode(CC));
    if (IsStrict)
      return DAG.getMergeValues({LHS, Chain}, dl);
    return LHS;
  }

  assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");

  if (Op.getValueType() == MVT::v2i64) {
    // When the operands themselves are v2i64 values, we need to do something
    // special because VSX has no underlying comparison operations for these.
    if (LHS.getValueType() == MVT::v2i64) {
      // Equality can be handled by casting to the legal type for Altivec
      // comparisons; everything else needs to be expanded.
      if (CC != ISD::SETEQ && CC != ISD::SETNE)
        return SDValue();
      SDValue SetCC32 = DAG.getSetCC(
          dl, MVT::v4i32, DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS),
          DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC);
      int ShuffV[] = {1, 0, 3, 2};
      SDValue Shuff =
          DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV);
      return DAG.getBitcast(MVT::v2i64,
                            DAG.getNode(CC == ISD::SETEQ ? ISD::AND : ISD::OR,
                                        dl, MVT::v4i32, Shuff, SetCC32));
    }

    // We handle most of these in the usual way.
    return Op;
  }

  // If we're comparing for equality to zero, expose the fact that this is
  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
  // fold the new nodes.
  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
    return V;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
    // Leave comparisons against 0 and -1 alone for now, since they're usually
    // optimized. FIXME: revisit this when we can custom lower all setcc
    // optimizations.
    if (C->isAllOnes() || C->isZero())
      return SDValue();
  }

  // If we have an integer seteq/setne, turn it into a compare against zero
  // by xor'ing the rhs with the lhs, which is faster than setting a
  // condition register, reading it back out, and masking the correct bit. The
  // normal approach here uses sub to do this instead of xor. Using xor exposes
  // the result to other bit-twiddling opportunities.
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, LHS, RHS);
    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
  }
  return SDValue();
}

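// Illustration: for an integer a == b, (a ^ b) is zero exactly when the two
// are equal, so the comparison becomes setcc(xor(a, b), 0); a later combine
// can then turn the eq-to-zero test into the ctlz/srl idiom produced by
// lowerCmpEqZeroToCtlzSrl.
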
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  EVT VT = Node->getValueType(0);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue InChain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  SDLoc dl(Node);

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");

  // gpr_index
  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    VAListPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = GprIndex.getValue(1);

  if (VT == MVT::i64) {
    // Check if GprIndex is even.
    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
                                 DAG.getConstant(1, dl, MVT::i32));
    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
                                DAG.getConstant(0, dl, MVT::i32), ISD::SETNE);
    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
                                          DAG.getConstant(1, dl, MVT::i32));
    // Align GprIndex to be even if it isn't.
    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
                           GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                               DAG.getConstant(1, dl, MVT::i32));

  // fpr
  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    FprPtr, MachinePointerInfo(SV), MVT::i8);
  InChain = FprIndex.getValue(1);

  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                       DAG.getConstant(8, dl, MVT::i32));

  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                        DAG.getConstant(4, dl, MVT::i32));

  // areas
  SDValue OverflowArea =
      DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr, MachinePointerInfo());
  InChain = OverflowArea.getValue(1);

  SDValue RegSaveArea =
      DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr, MachinePointerInfo());
  InChain = RegSaveArea.getValue(1);

  // select overflow_area if index >= 8
  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
                            DAG.getConstant(8, dl, MVT::i32), ISD::SETLT);

  // adjustment constant gpr_index * 4/8
  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
                                    VT.isInteger() ? GprIndex : FprIndex,
                                    DAG.getConstant(VT.isInteger() ? 4 : 8, dl,
                                                    MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
                               RegConstant);

  // Floating types are 32 bytes into RegSaveArea.
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
                         DAG.getConstant(32, dl, MVT::i32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
                                   VT.isInteger() ? GprIndex : FprIndex,
                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1, dl,
                                                   MVT::i32));

  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
                              VT.isInteger() ? VAListPtr : FprPtr,
                              MachinePointerInfo(SV), MVT::i8);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result =
      DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg, OverflowArea);

  // increase overflow_area by 4/8 if gpr/fpr index >= 8
  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                          dl, MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea, OverflowAreaPtr,
                              MachinePointerInfo(), MVT::i32);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo());
}

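// Note: this walks the 32-bit SVR4 va_list directly: the gpr/fpr byte
// selects one of the eight saved argument registers, and once it reaches 8
// the SELECTs above switch the load over to the overflow area in the
// caller's frame and bump the overflow pointer instead.
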
SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of padding + 2*sizeof(char*) = 12 bytes
  return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
                       false, true, false, MachinePointerInfo(),
                       MachinePointerInfo());
}

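// Arithmetic check: the gpr and fpr indices are one char each, two bytes of
// padding align the pointer members, and the two char* fields are four bytes
// apiece on PPC32, so 1 + 1 + 2 + 4 + 4 = 12 bytes are copied.
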
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("ADJUST_TRAMPOLINE operation is not supported on AIX.");

  return Op.getOperand(0);
}

SDValue PPCTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo &MFI = *MF.getInfo<PPCFunctionInfo>();

  assert((Op.getOpcode() == ISD::INLINEASM ||
          Op.getOpcode() == ISD::INLINEASM_BR) &&
         "Expecting Inline ASM node.");

  // If an LR store is already known to be required then there is no point in
  // checking this ASM as well.
  if (MFI.isLRStoreRequired())
    return Op;

  // Inline ASM nodes have an optional last operand that is an incoming Flag of
  // type MVT::Glue. We want to ignore this last operand if that is the case.
  unsigned NumOps = Op.getNumOperands();
  if (Op.getOperand(NumOps - 1).getValueType() == MVT::Glue)
    --NumOps;

  // Check all operands that may contain the LR.
  for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
    unsigned Flags = cast<ConstantSDNode>(Op.getOperand(i))->getZExtValue();
    unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
    ++i; // Skip the ID value.

    switch (InlineAsm::getKind(Flags)) {
    default:
      llvm_unreachable("Bad flags!");
    case InlineAsm::Kind_RegUse:
    case InlineAsm::Kind_Imm:
    case InlineAsm::Kind_Mem:
      i += NumVals;
      break;
    case InlineAsm::Kind_Clobber:
    case InlineAsm::Kind_RegDef:
    case InlineAsm::Kind_RegDefEarlyClobber: {
      for (; NumVals; --NumVals, ++i) {
        Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
        if (Reg != PPC::LR && Reg != PPC::LR8)
          continue;
        MFI.setLRStoreRequired();
        return Op;
      }
      break;
    }
    }
  }

  return Op;
}

SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  if (Subtarget.isAIXABI())
    report_fatal_error("INIT_TRAMPOLINE operation is not supported on AIX.");

  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl,
                               isPPC64 ? MVT::i64 : MVT::i32);
  Args.push_back(Entry);

  Entry.Node = FPtr; Args.push_back(Entry);
  Entry.Node = Nest; Args.push_back(Entry);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg).
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}

SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  SDLoc dl(Op);

  if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We assume the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //                /* where r3:r10 and f1:f8 (if saved)
  //                 * are stored
  //                 */
  // } va_list[1];

  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32);
  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32);
  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
                                            PtrVT);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                 PtrVT);

  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, dl, PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, dl, PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, dl, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore =
      DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR, Op.getOperand(1),
                        MachinePointerInfo(SV), MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
      DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
                        MachinePointerInfo(SV, nextOffset), MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore = DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
                                    MachinePointerInfo(SV, nextOffset));
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr,
                      MachinePointerInfo(SV, nextOffset));
}

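// Offset check: with 4-byte pointers, gpr lands at offset 0, fpr at 1
// (FPROffset), overflow_arg_area at 4 (1 + StackOffset, where StackOffset =
// 4 - 1 = 3), and reg_save_area at 8 (4 + FrameOffset), matching the va_list
// layout sketched above.
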
/// FPR - The set of FP registers that should be allocated for arguments
/// on Darwin and AIX.
static const MCPhysReg FPR[] = {PPC::F1,  PPC::F2,  PPC::F3, PPC::F4, PPC::F5,
                                PPC::F6,  PPC::F7,  PPC::F8, PPC::F9, PPC::F10,
                                PPC::F11, PPC::F12, PPC::F13};

/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                       unsigned PtrByteSize) {
  unsigned ArgSize = ArgVT.getStoreSize();
  if (Flags.isByVal())
    ArgSize = Flags.getByValSize();

  // Round up to multiples of the pointer size, except for array members,
  // which are always packed.
  if (!Flags.isInConsecutiveRegs())
    ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;

  return ArgSize;
}

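// Example: with PtrByteSize == 8, a 13-byte byval argument rounds up to a
// 16-byte slot, while the same 13 bytes inside a consecutive-register array
// member stay packed at 13.
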
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
                                         ISD::ArgFlagsTy Flags,
                                         unsigned PtrByteSize) {
  Align Alignment(PtrByteSize);

  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
      ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
    Alignment = Align(16);

  // ByVal parameters are aligned as requested.
  if (Flags.isByVal()) {
    auto BVAlign = Flags.getNonZeroByValAlign();
    if (BVAlign > PtrByteSize) {
      if (BVAlign.value() % PtrByteSize != 0)
        llvm_unreachable(
            "ByVal alignment is not a multiple of the pointer size");

      Alignment = BVAlign;
    }
  }

  // Array members are always packed to their original alignment.
  if (Flags.isInConsecutiveRegs()) {
    // If the array member was split into multiple registers, the first
    // needs to be aligned to the size of the full type. (Except for
    // ppcf128, which is only aligned as its f64 components.)
    if (Flags.isSplit() && OrigVT != MVT::ppcf128)
      Alignment = Align(OrigVT.getStoreSize());
    else
      Alignment = Align(ArgVT.getStoreSize());
  }

  return Alignment;
}

/// CalculateStackSlotUsed - Return whether this argument will use its
/// stack slot (instead of being passed in registers). ArgOffset,
/// AvailableFPRs, and AvailableVRs must hold the current argument
/// position, and will be updated to account for this argument.
static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT, ISD::ArgFlagsTy Flags,
                                   unsigned PtrByteSize, unsigned LinkageSize,
                                   unsigned ParamAreaSize, unsigned &ArgOffset,
                                   unsigned &AvailableFPRs,
                                   unsigned &AvailableVRs) {
  bool UseMemory = false;

  // Respect alignment of argument on the stack.
  Align Alignment =
      CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
  ArgOffset = alignTo(ArgOffset, Alignment);
  // If there's no space left in the argument save area, we must
  // use memory (this check also catches zero-sized arguments).
  if (ArgOffset >= LinkageSize + ParamAreaSize)
    UseMemory = true;

  // Allocate argument on the stack.
  ArgOffset += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  if (Flags.isInConsecutiveRegsLast())
    ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
  // If we overran the argument save area, we must use memory
  // (this check catches arguments passed partially in memory).
  if (ArgOffset > LinkageSize + ParamAreaSize)
    UseMemory = true;

  // However, if the argument is actually passed in an FPR or a VR,
  // we don't use memory after all.
  if (!Flags.isByVal()) {
    if (ArgVT == MVT::f32 || ArgVT == MVT::f64)
      if (AvailableFPRs > 0) {
        --AvailableFPRs;
        return false;
      }
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
        ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
      if (AvailableVRs > 0) {
        --AvailableVRs;
        return false;
      }
  }

  return UseMemory;
}

/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
                                     unsigned NumBytes) {
  return alignTo(NumBytes, Lowering->getStackAlign());
}

SDValue PPCTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  if (Subtarget.isAIXABI())
    return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                    InVals);
  if (Subtarget.is64BitELFABI())
    return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                       InVals);
  assert(Subtarget.is32BitELFABI());
  return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG,
                                     InVals);
}

SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  const Align PtrAlign(4);

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                    *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
  CCInfo.AllocateStack(LinkageSize, PtrAlign);
  if (useSoftFloat())
    CCInfo.PreAnalyzeFormalArguments(Ins);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);
  CCInfo.clearWasPPCF128();

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      const TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      switch (ValVT.getSimpleVT().SimpleTy) {
        default:
          llvm_unreachable("ValVT not supported by formal arguments Lowering");
        case MVT::i1:
        case MVT::i32:
          RC = &PPC::GPRCRegClass;
          break;
        case MVT::f32:
          if (Subtarget.hasP8Vector())
            RC = &PPC::VSSRCRegClass;
          else if (Subtarget.hasSPE())
            RC = &PPC::GPRCRegClass;
          else
            RC = &PPC::F4RCRegClass;
          break;
        case MVT::f64:
          if (Subtarget.hasVSX())
            RC = &PPC::VSFRCRegClass;
          else if (Subtarget.hasSPE())
            // SPE passes doubles in GPR pairs.
            RC = &PPC::GPRCRegClass;
          else
            RC = &PPC::F8RCRegClass;
          break;
        case MVT::v16i8:
        case MVT::v8i16:
        case MVT::v4i32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v4f32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v2f64:
        case MVT::v2i64:
          RC = &PPC::VRRCRegClass;
          break;
      }

      SDValue ArgValue;
      // Transform the arguments stored in physical registers into
      // virtual ones.
      if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
        assert(i + 1 < e && "No second half of double precision argument");
        Register RegLo = MF.addLiveIn(VA.getLocReg(), RC);
        Register RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
        SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
        SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
        if (!Subtarget.isLittleEndian())
          std::swap(ArgValueLo, ArgValueHi);
        ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
                               ArgValueHi);
      } else {
        Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                      ValVT == MVT::i1 ? MVT::i32 : ValVT);
        if (ValVT == MVT::i1)
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
      }

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      // Get the extended size of the argument type on the stack.
      unsigned ArgSize = VA.getLocVT().getStoreSize();
      // Get the actual size of the argument type.
      unsigned ObjSize = VA.getValVT().getStoreSize();
      unsigned ArgOffset = VA.getLocMemOffset();
      // Stack objects in PPC32 are right justified.
      ArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(
          DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo()));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getStackSize();
  MinReservedArea = std::max(MinReservedArea, LinkageSize);

  // Set the size that is at least reserved in caller of this function. Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack size.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = std::size(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    unsigned NumFPArgRegs = std::size(FPArgRegs);

    if (useSoftFloat() || hasSPE())
      NumFPArgRegs = 0;

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * MVT(MVT::f64).getSizeInBits()/8;

    FuncInfo->setVarArgsStackOffset(MFI.CreateFixedObject(
        PtrVT.getSizeInBits() / 8, CCInfo.getStackSize(), true));

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateStackObject(Depth, Align(8), false));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      Register VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store.
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      Register VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store.
      SDValue PtrOff = DAG.getConstant(MVT(MVT::f64).getSizeInBits()/8, dl,
                                       PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
SDValue PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags,
                                             EVT ObjectVT, SelectionDAG &DAG,
                                             SDValue ArgVal,
                                             const SDLoc &dl) const {
  if (Flags.isSExt())
    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
  else if (Flags.isZExt())
    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));

  return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
}

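// Example: an i32 argument arrives occupying a full 64-bit GPR. The
// AssertSext/AssertZext node records which extension the caller performed,
// so later combines know the subsequent truncate back to i32 is lossless.
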
4319 SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
4320 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4321 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4322 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4323 // TODO: add description of PPC stack frame format, or at least some docs.
4325 bool isELFv2ABI = Subtarget.isELFv2ABI();
4326 bool isLittleEndian = Subtarget.isLittleEndian();
4327 MachineFunction &MF = DAG.getMachineFunction();
4328 MachineFrameInfo &MFI = MF.getFrameInfo();
4329 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
4331 assert(!(CallConv == CallingConv::Fast && isVarArg) &&
4332 "fastcc not supported on varargs functions");
4334 EVT PtrVT = getPointerTy(MF.getDataLayout());
4335 // Potential tail calls could cause overwriting of argument stack slots.
4336 bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
4337 (CallConv == CallingConv::Fast));
4338 unsigned PtrByteSize = 8;
4339 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
4341 static const MCPhysReg GPR[] = {
4342 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
4343 PPC::X7, PPC::X8, PPC::X9, PPC::X10,
4345 static const MCPhysReg VR[] = {
4346 PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
4347 PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
4350 const unsigned Num_GPR_Regs = std::size(GPR);
4351 const unsigned Num_FPR_Regs = useSoftFloat() ? 0 : 13;
4352 const unsigned Num_VR_Regs = std::size(VR);
4354 // Do a first pass over the arguments to determine whether the ABI
4355 // guarantees that our caller has allocated the parameter save area
4356 // on its stack frame. In the ELFv1 ABI, this is always the case;
4357 // in the ELFv2 ABI, it is true if this is a vararg function or if
4358 // any parameter is located in a stack slot.
4360 bool HasParameterArea = !isELFv2ABI || isVarArg;
4361 unsigned ParamAreaSize = Num_GPR_Regs * PtrByteSize;
4362 unsigned NumBytes = LinkageSize;
4363 unsigned AvailableFPRs = Num_FPR_Regs;
4364 unsigned AvailableVRs = Num_VR_Regs;
4365 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4366 if (Ins[i].Flags.isNest())
4369 if (CalculateStackSlotUsed(Ins[i].VT, Ins[i].ArgVT, Ins[i].Flags,
4370 PtrByteSize, LinkageSize, ParamAreaSize,
4371 NumBytes, AvailableFPRs, AvailableVRs))
4372 HasParameterArea = true;
  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
  SmallVector<SDValue, 8> MemOps;
  Function::const_arg_iterator FuncArg = MF.getFunction().arg_begin();
  unsigned CurArgIdx = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    EVT OrigVT = Ins[ArgNo].ArgVT;
    unsigned ObjSize = ObjectVT.getStoreSize();
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
    if (Ins[ArgNo].isOrigArg()) {
      std::advance(FuncArg, Ins[ArgNo].getOrigArgIndex() - CurArgIdx);
      CurArgIdx = Ins[ArgNo].getOrigArgIndex();
    }
    // We re-align the argument offset for each argument, except when using the
    // fast calling convention, when we need to make sure we do that only when
    // we'll actually use a stack slot.
    unsigned CurArgOffset;
    Align Alignment;
    auto ComputeArgOffset = [&]() {
      /* Respect alignment of argument on the stack.  */
      Alignment =
          CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
      ArgOffset = alignTo(ArgOffset, Alignment);
      CurArgOffset = ArgOffset;
    };

    if (CallConv != CallingConv::Fast) {
      ComputeArgOffset();

      /* Compute GPR index associated with argument offset.  */
      GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
      GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
    }
    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      assert(Ins[ArgNo].isOrigArg() && "Byval arguments cannot be implicit");

      if (CallConv == CallingConv::Fast)
        ComputeArgOffset();

      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Empty aggregate parameters do not take up registers.  Examples:
      //   struct { } a;
      //   union  { } b;
      //   int c[0];
      // etc.  However, we have to provide a place-holder in InVals, so
      // pretend we have an 8-byte item at the current address for that
      // purpose.
      if (!ObjSize) {
        int FI = MFI.CreateFixedObject(PtrByteSize, ArgOffset, true);
        SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
        InVals.push_back(FIN);
        continue;
      }

      // Create a stack object covering all stack doublewords occupied
      // by the argument.  If the argument is (fully or partially) on
      // the stack, or if the argument is fully in registers but the
      // caller has allocated the parameter save anyway, we can refer
      // directly to the caller's stack frame.  Otherwise, create a
      // local copy in our own frame.
      int FI;
      if (HasParameterArea ||
          ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
        FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
      else
        FI = MFI.CreateStackObject(ArgSize, Alignment, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);

      // Handle aggregates smaller than 8 bytes.
      if (ObjSize < PtrByteSize) {
        // The value of the object is its address, which differs from the
        // address of the enclosing doubleword on big-endian systems.
        SDValue Arg = FIN;
        if (!isLittleEndian) {
          SDValue ArgOff = DAG.getConstant(PtrByteSize - ObjSize, dl, PtrVT);
          Arg = DAG.getNode(ISD::ADD, dl, ArgOff.getValueType(), Arg, ArgOff);
        }
        InVals.push_back(Arg);
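
        // For example (illustrative): a 3-byte byval struct on big-endian
        // occupies the high-order bytes of its doubleword, so its address is
        // FIN + (8 - 3) = FIN + 5; on little-endian it is simply FIN.
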
        if (GPR_idx != Num_GPR_Regs) {
          Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
          FuncInfo->addLiveInAttr(VReg, Flags);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8);
          SDValue Store =
              DAG.getTruncStore(Val.getValue(1), dl, Val, Arg,
                                MachinePointerInfo(&*FuncArg), ObjType);
          MemOps.push_back(Store);
        }
        // Whether we copied from a register or not, advance the offset
        // into the parameter save area by a full doubleword.
        ArgOffset += PtrByteSize;
        continue;
      }

      // The value of the object is its address, which is the address of
      // its first stack doubleword.
      InVals.push_back(FIN);

      // Store whatever pieces of the object are in registers to memory.
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        if (GPR_idx == Num_GPR_Regs)
          break;

        Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
        SDValue Addr = FIN;
        if (j) {
          SDValue Off = DAG.getConstant(j, dl, PtrVT);
          Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off);
        }
        unsigned StoreSizeInBits = std::min(PtrByteSize, (ObjSize - j)) * 8;
        EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), StoreSizeInBits);
        SDValue Store =
            DAG.getTruncStore(Val.getValue(1), dl, Val, Addr,
                              MachinePointerInfo(&*FuncArg, j), ObjType);
        MemOps.push_back(Store);
        ++GPR_idx;
      }
      ArgOffset += ArgSize;
      continue;
    }
    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (Flags.isNest()) {
        // The 'nest' parameter, if any, is passed in R11.
        Register VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        break;
      }

      // These can be scalar arguments or elements of an integer array type
      // passed directly.  Clang may use those instead of "byval" aggregate
      // types to avoid forcing arguments to memory unnecessarily.
      if (GPR_idx != Num_GPR_Regs) {
        Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 8;
      break;
    case MVT::f32:
    case MVT::f64:
      // These can be scalar arguments or elements of a float array type
      // passed directly.  The latter are used to implement ELFv2 homogeneous
      // float aggregates.
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx],
                              Subtarget.hasP8Vector()
                                  ? &PPC::VSSRCRegClass
                                  : &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX()
                                                ? &PPC::VSFRCRegClass
                                                : &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else if (GPR_idx != Num_GPR_Regs && CallConv != CallingConv::Fast) {
        // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
        // once we support fp <-> gpr moves.

        // This can only ever happen in the presence of f32 array types,
        // since otherwise we never run out of FPRs before running out
        // of GPRs.
        Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
        FuncInfo->addLiveInAttr(VReg, Flags);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::f32) {
          if ((ArgOffset % PtrByteSize) == (isLittleEndian ? 4 : 0))
            ArgVal = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgVal,
                                 DAG.getConstant(32, dl, MVT::i32));
          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
        }

        ArgVal = DAG.getNode(ISD::BITCAST, dl, ObjectVT, ArgVal);
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();

        needsLoad = true;
      }
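
      // Illustrative note: when a homogeneous f32 array element spills to a
      // GPR, the 32-bit value sits in one half of the 64-bit register. On
      // big-endian an element at a doubleword-aligned offset occupies the
      // high half and is shifted right by 32 before truncation; on
      // little-endian the shift is needed for the element at offset 4.
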
      // When passing an array of floats, the array occupies consecutive
      // space in the argument area; only round up to the next doubleword
      // at the end of the array.  Otherwise, each float takes 8 bytes.
      if (CallConv != CallingConv::Fast || needsLoad) {
        ArgSize = Flags.isInConsecutiveRegs() ? ObjSize : PtrByteSize;
        ArgOffset += ArgSize;
        if (Flags.isInConsecutiveRegsLast())
          ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      }
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
    case MVT::v1i128:
    case MVT::f128:
      // These can be scalar arguments or elements of a vector array type
      // passed directly.  The latter are used to implement ELFv2 homogeneous
      // vector aggregates.
      if (VR_idx != Num_VR_Regs) {
        Register VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++VR_idx;
      } else {
        if (CallConv == CallingConv::Fast)
          ComputeArgOffset();
        needsLoad = true;
      }
      if (CallConv != CallingConv::Fast || needsLoad)
        ArgOffset += 16;
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      if (ObjSize < ArgSize && !isLittleEndian)
        CurArgOffset += ArgSize - ObjSize;
      int FI = MFI.CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo());
    }

    InVals.push_back(ArgVal);
  }
  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea;
  if (HasParameterArea)
    MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
  else
    MinReservedArea = LinkageSize;

  // Set the size that is at least reserved in the caller of this function.
  // Tail call optimized functions' reserved stack space needs to be aligned
  // so that taking the difference between two stack areas will result in an
  // aligned stack size.
  MinReservedArea =
      EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of
  // llvm.va_start. The ELFv2 ABI spec notes that C programs intended to be
  // *portable* across different compilers and architectures must use the
  // header file <stdarg.h> to deal with variable argument lists.
  if (isVarArg && MFI.hasVAStart()) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
        MFI.CreateFixedObject(PtrByteSize, Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
         GPR_idx < Num_GPR_Regs; ++GPR_idx) {
      Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
      MemOps.push_back(Store);
      // Increment the address by the size of a doubleword for the next
      // argument to store.
      SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }
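
  // Illustrative example: for "int sum(int n, ...)" the single fixed argument
  // leaves ArgOffset at LinkageSize + 8, so GPR_idx starts at 1 and X4..X10
  // are spilled to consecutive doublewords where va_arg can find them.
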
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tail call.
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                   unsigned ParamSize) {
  if (!isTailCall) return 0;

  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  unsigned CallerMinReservedArea = FI->getMinReservedArea();
  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
  // Remember only if the new adjustment is bigger.
  if (SPDiff < FI->getTailCallSPDelta())
    FI->setTailCallSPDelta(SPDiff);

  return SPDiff;
}
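
// Worked example (illustrative): if the caller reserved 112 bytes for its own
// incoming arguments but the tail-callee needs 128 bytes of argument space,
// SPDiff is 112 - 128 = -16, i.e. the stack must grow by 16 bytes before
// branching to the callee.
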
static bool isFunctionGlobalAddress(const GlobalValue *CalleeGV);

static bool callsShareTOCBase(const Function *Caller,
                              const GlobalValue *CalleeGV,
                              const TargetMachine &TM) {
  // It does not make sense to call callsShareTOCBase() with a caller that
  // is PC Relative since PC Relative callers do not have a TOC.
#ifndef NDEBUG
  const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
  assert(!STICaller->isUsingPCRelativeCalls() &&
         "PC Relative callers do not have a TOC and cannot share a TOC Base");
#endif

  // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
  // don't have enough information to determine if the caller and callee share
  // the same TOC base, so we have to pessimistically assume they don't for
  // correctness.
  if (!CalleeGV)
    return false;

  // If the callee is preemptable, then the static linker will use a plt-stub
  // which saves the toc to the stack, and needs a nop after the call
  // instruction to convert to a toc-restore.
  if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), CalleeGV))
    return false;

  // Functions with PC Relative enabled may clobber the TOC in the same DSO.
  // We may need a TOC restore in the situation where the caller requires a
  // valid TOC but the callee is PC Relative and does not.
  const Function *F = dyn_cast<Function>(CalleeGV);
  const GlobalAlias *Alias = dyn_cast<GlobalAlias>(CalleeGV);

  // If we have an Alias we can try to get the function from there.
  if (Alias) {
    const GlobalObject *GlobalObj = Alias->getAliaseeObject();
    F = dyn_cast<Function>(GlobalObj);
  }

  // If we still have no valid function pointer we do not have enough
  // information to determine if the callee uses PC Relative calls so we must
  // assume that it does.
  if (!F)
    return false;

  // If the callee uses PC Relative we cannot guarantee that the callee won't
  // clobber the TOC of the caller and so we must assume that the two
  // functions do not share a TOC base.
  const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
  if (STICallee->isUsingPCRelativeCalls())
    return false;

  // If the GV is not a strong definition then we need to assume it can be
  // replaced by another function at link time. The function that replaces
  // it may not share the same TOC as the caller since the callee may be
  // replaced by a PC Relative version of the same function.
  if (!CalleeGV->isStrongDefinitionForLinker())
    return false;

  // The medium and large code models are expected to provide a sufficiently
  // large TOC to provide all data addressing needs of a module with a
  // single TOC.
  if (CodeModel::Medium == TM.getCodeModel() ||
      CodeModel::Large == TM.getCodeModel())
    return true;

  // Any explicitly-specified sections and section prefixes must also match.
  // Also, if we're using -ffunction-sections, then each function is always in
  // a different section (the same is true for COMDAT functions).
  if (TM.getFunctionSections() || CalleeGV->hasComdat() ||
      Caller->hasComdat() || CalleeGV->getSection() != Caller->getSection())
    return false;
  if (const auto *F = dyn_cast<Function>(CalleeGV)) {
    if (F->getSectionPrefix() != Caller->getSectionPrefix())
      return false;
  }

  return true;
}

static bool
needStackSlotPassParameters(const PPCSubtarget &Subtarget,
                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
  assert(Subtarget.is64BitELFABI());

  const unsigned PtrByteSize = 8;
  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };

  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned NumGPRs = std::size(GPR);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = std::size(VR);
  const unsigned ParamAreaSize = NumGPRs * PtrByteSize;

  unsigned NumBytes = LinkageSize;
  unsigned AvailableFPRs = NumFPRs;
  unsigned AvailableVRs = NumVRs;

  for (const ISD::OutputArg& Param : Outs) {
    if (Param.Flags.isNest()) continue;

    if (CalculateStackSlotUsed(Param.VT, Param.ArgVT, Param.Flags, PtrByteSize,
                               LinkageSize, ParamAreaSize, NumBytes,
                               AvailableFPRs, AvailableVRs))
      return true;
  }
  return false;
}

static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
  if (CB.arg_size() != CallerFn->arg_size())
    return false;

  auto CalleeArgIter = CB.arg_begin();
  auto CalleeArgEnd = CB.arg_end();
  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();

  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
    const Value* CalleeArg = *CalleeArgIter;
    const Value* CallerArg = &(*CallerArgIter);
    if (CalleeArg == CallerArg)
      continue;

    // e.g. @caller([4 x i64] %a, [4 x i64] %b) {
    //        tail call @callee([4 x i64] undef, [4 x i64] %b)
    //      }
    // 1st argument of callee is undef and has the same type as caller.
    if (CalleeArg->getType() == CallerArg->getType() &&
        isa<UndefValue>(CalleeArg))
      continue;

    return false;
  }

  return true;
}

// Returns true if TCO is possible between the caller's and callee's calling
// conventions.
static bool
areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
                                    CallingConv::ID CalleeCC) {
  // Tail calls are possible with fastcc and ccc.
  auto isTailCallableCC = [] (CallingConv::ID CC) {
    return CC == CallingConv::C || CC == CallingConv::Fast;
  };
  if (!isTailCallableCC(CallerCC) || !isTailCallableCC(CalleeCC))
    return false;

  // We can safely tail call both fastcc and ccc callees from a c calling
  // convention caller. If the caller is fastcc, we may have less stack space
  // than a non-fastcc caller with the same signature so disable tail-calls in
  // that case.
  return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
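
// For example, a ccc caller may tail call either a ccc or a fastcc callee,
// and a fastcc caller may tail call a fastcc callee, but a fastcc caller may
// not tail call a ccc callee.
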
bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
    bool isCalleeExternalSymbol) const {
  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;

  if (DisableSCO && !TailCallOpt) return false;

  // Variadic argument functions are not supported.
  if (isVarArg) return false;

  // Check that the calling conventions are compatible for tco.
  if (!areCallingConvEligibleForTCO_64SVR4(CallerCC, CalleeCC))
    return false;

  // A caller with a byval parameter is not supported.
  if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
    return false;

  // A callee with a byval parameter is not supported either.
  // Note: This is a quick workaround, because in some cases, e.g. when the
  // caller's stack size > the callee's stack size, we are still able to apply
  // sibling call optimization. For example, gcc is able to do SCO for caller1
  // in the following example, but not for caller2.
  //   struct test {
  //     long int a;
  //     char ary[56];
  //   } gTest;
  //   __attribute__((noinline)) int callee(struct test v, struct test *b) {
  //     b->a = v.a;
  //     return 0;
  //   }
  //   void caller1(struct test a, struct test c, struct test *b) {
  //     callee(gTest, b); }
  //   void caller2(struct test *b) { callee(gTest, b); }
  if (any_of(Outs, [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
    return false;

  // If callee and caller use different calling conventions, we cannot pass
  // parameters on stack since offsets for the parameter area may be different.
  if (CallerCC != CalleeCC && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  // All variants of 64-bit ELF ABIs without PC-Relative addressing require
  // that the caller and callee share the same TOC for TCO/SCO. If the caller
  // and callee potentially have different TOC bases then we cannot tail call
  // since we need to restore the TOC pointer after the call.
  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
  // We cannot guarantee this for indirect calls or calls to external functions.
  // When PC-Relative addressing is used, the concept of the TOC is no longer
  // applicable so this check is not required.
  // Check first for indirect calls.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !isFunctionGlobalAddress(CalleeGV) && !isCalleeExternalSymbol)
    return false;

  // Check if we share the TOC base.
  if (!Subtarget.isUsingPCRelativeCalls() &&
      !callsShareTOCBase(CallerFunc, CalleeGV, getTargetMachine()))
    return false;

  // TCO allows altering callee ABI, so we don't have to check further.
  if (CalleeCC == CallingConv::Fast && TailCallOpt)
    return true;

  if (DisableSCO) return false;

  // If the callee uses the same argument list as the caller, we can apply SCO
  // in this case. If not, we need to check whether the callee needs stack for
  // passing arguments.
  // PC Relative tail calls may not have a CallBase.
  // If there is no CallBase we cannot verify if we have the same argument
  // list so assume that we don't have the same argument list.
  if (CB && !hasSameArgumentList(CallerFunc, *CB) &&
      needStackSlotPassParameters(Subtarget, Outs))
    return false;
  else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
    return false;

  return true;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool PPCTargetLowering::IsEligibleForTailCallOptimization(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins) const {
  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
    return false;

  // Variable argument functions are not supported.
  if (isVarArg)
    return false;

  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
    // Functions containing by val parameters are not supported.
    if (any_of(Ins, [](const ISD::InputArg &IA) { return IA.Flags.isByVal(); }))
      return false;

    // Non-PIC/GOT tail calls are supported.
    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
      return true;

    // At the moment we can only do local tail calls (in same module, hidden
    // or protected) if we are generating PIC.
    if (CalleeGV)
      return CalleeGV->hasHiddenVisibility() ||
             CalleeGV->hasProtectedVisibility();
  }

  return false;
}

/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return nullptr;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      SignExtend32<26>(Addr) != Addr)
    return nullptr;  // Top 6 bits have to be sext of immediate.

  return DAG
      .getConstant(
          (int)C->getZExtValue() >> 2, SDLoc(Op),
          DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()))
      .getNode();
}
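
// For example (illustrative): absolute address 0x2000 is word-aligned and
// fits in the signed 26-bit range, so it is encoded as the immediate
// 0x2000 >> 2 == 0x800; address 0x2002 fails the low-bit check and address
// 0x4000000 fails the sign-extension check, so both return nullptr.
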
namespace {

struct TailCallArgumentInfo {
  SDValue Arg;
  SDValue FrameIdxOp;
  int FrameIdx = 0;

  TailCallArgumentInfo() = default;
};

} // end anonymous namespace

/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void StoreTailCallArgumentsToStackSlot(
    SelectionDAG &DAG, SDValue Chain,
    const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
    SmallVectorImpl<SDValue> &MemOpChains, const SDLoc &dl) {
  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
    SDValue Arg = TailCallArgs[i].Arg;
    SDValue FIN = TailCallArgs[i].FrameIdxOp;
    int FI = TailCallArgs[i].FrameIdx;
    // Store relative to framepointer.
    MemOpChains.push_back(DAG.getStore(
        Chain, dl, Arg, FIN,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
  }
}

/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
                                             SDValue OldRetAddr, SDValue OldFP,
                                             int SPDiff, const SDLoc &dl) {
  if (SPDiff) {
    // Calculate the new stack slot for the return address.
    MachineFunction &MF = DAG.getMachineFunction();
    const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
    const PPCFrameLowering *FL = Subtarget.getFrameLowering();
    bool isPPC64 = Subtarget.isPPC64();
    int SlotSize = isPPC64 ? 8 : 4;
    int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset();
    int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                         NewRetAddrLoc, true);
    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                         MachinePointerInfo::getFixedStack(MF, NewRetAddr));
  }
  return Chain;
}

/// CalculateTailCallArgDest - Remember Argument for later processing. Calculate
/// the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                         SDValue Arg, int SPDiff, unsigned ArgOffset,
                         SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
  int Offset = ArgOffset + SPDiff;
  uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8;
  int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
  SDValue FIN = DAG.getFrameIndex(FI, VT);
  TailCallArgumentInfo Info;
  Info.Arg = Arg;
  Info.FrameIdxOp = FIN;
  Info.FrameIdx = FI;
  TailCallArguments.push_back(Info);
}

/// EmitTailCallLoadFPAndRetAddr - Emit a load from the frame pointer and
/// return address stack slot. Returns the chain as result and the loaded
/// frame pointer in LROpOut/FPOpOut. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
    SelectionDAG &DAG, int SPDiff, SDValue Chain, SDValue &LROpOut,
    SDValue &FPOpOut, const SDLoc &dl) const {
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
    Chain = SDValue(LROpOut.getNode(), 1);
  }
  return Chain;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" of size "Size".  Alignment information is
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
                       Flags.getNonZeroByValAlign(), false, false, false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// LowerMemOpCallTo - Store the argument to the stack or remember it in case
/// of tail calls.
static void LowerMemOpCallTo(
    SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue Arg,
    SDValue PtrOff, int SPDiff, unsigned ArgOffset, bool isPPC64,
    bool isTailCall, bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
    SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments, const SDLoc &dl) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
  if (!isTailCall) {
    if (isVector) {
      SDValue StackPtr;
      if (isPPC64)
        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
      else
        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                           DAG.getConstant(ArgOffset, dl, PtrVT));
    }
    MemOpChains.push_back(
        DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
    // Calculate and remember argument location.
  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
                                  TailCallArguments);
}

static void
PrepareTailCall(SelectionDAG &DAG, SDValue &InGlue, SDValue &Chain,
                const SDLoc &dl, int SPDiff, unsigned NumBytes, SDValue LROp,
                SDValue FPOp,
                SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InGlue = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                    MemOpChains2, dl);
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, Chain, LROp, FPOp, SPDiff, dl);

  // Emit callseq_end just before tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, dl);
  InGlue = Chain.getValue(1);
}

// Is this global address that of a function that can be called by name? (as
// opposed to something that must hold a descriptor for an indirect call).
static bool isFunctionGlobalAddress(const GlobalValue *GV) {
  if (GV) {
    if (GV->isThreadLocal())
      return false;

    return GV->getValueType()->isFunctionTy();
  }

  return false;
}

SDValue PPCTargetLowering::LowerCallResult(
    SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                    *DAG.getContext());

  CCRetInfo.AnalyzeCallResult(
      Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
               ? RetCC_PPC_Cold
               : RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val;

    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Lo.getValue(1);
      InGlue = Lo.getValue(2);
      VA = RVLocs[++i]; // skip ahead to next loc
      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
                                      InGlue);
      Chain = Hi.getValue(1);
      InGlue = Hi.getValue(2);
      if (!Subtarget.isLittleEndian())
        std::swap(Lo, Hi);
      Val = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, Lo, Hi);
    } else {
      Val = DAG.getCopyFromReg(Chain, dl,
                               VA.getLocReg(), VA.getLocVT(), InGlue);
      Chain = Val.getValue(1);
      InGlue = Val.getValue(2);
    }

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }

  return Chain;
}
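
// Note (illustrative): on SPE targets an f64 result arrives as two i32 halves
// in consecutive registers. The copies in the loop above are taken in
// register order, so on big-endian the first copy holds the high word and the
// pair is swapped before BUILD_SPE64 reassembles the double.
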
static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
                           const PPCSubtarget &Subtarget, bool isPatchPoint) {
  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;

  // PatchPoint calls are not indirect.
  if (isPatchPoint)
    return false;

  if (isFunctionGlobalAddress(GV) || isa<ExternalSymbolSDNode>(Callee))
    return false;

  // Darwin and 32-bit ELF can use a BLA. The descriptor-based ABIs cannot
  // because the immediate function pointer points to a descriptor instead of
  // a function entry point. The ELFv2 ABI cannot use a BLA because the
  // function pointer immediate points to the global entry point, while the
  // BLA would need to jump to the local entry point (see rL211174).
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI() &&
      isBLACompatibleAddress(Callee, DAG))
    return false;

  return true;
}

// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
  return Subtarget.isAIXABI() ||
         (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
}

static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
                              const Function &Caller, const SDValue &Callee,
                              const PPCSubtarget &Subtarget,
                              const TargetMachine &TM,
                              bool IsStrictFPCall = false) {
  if (CFlags.IsTailCall)
    return PPCISD::TC_RETURN;

  unsigned RetOpc = 0;
  // This is a call through a function pointer.
  if (CFlags.IsIndirect) {
    // AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
    // indirect calls. The save of the caller's TOC pointer to the stack will
    // be inserted into the DAG as part of call lowering. The restore of the
    // TOC pointer is modeled by using a pseudo instruction for the call
    // opcode that represents the 2 instruction sequence of an indirect branch
    // and link, immediately followed by a load of the TOC pointer from the
    // stack save slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not
    // restore the TOC as it is not saved or used.
    RetOpc = isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
                                                 : PPCISD::BCTRL;
  } else if (Subtarget.isUsingPCRelativeCalls()) {
    assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
    RetOpc = PPCISD::CALL_NOTOC;
  } else if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI()) {
    // The ABIs that maintain a TOC pointer across calls need to have a nop
    // immediately following the call instruction if the caller and callee may
    // have different TOC bases. At link time, if the linker determines the
    // calls may not share a TOC base, the call is redirected to a trampoline
    // inserted by the linker. The trampoline will (among other things) save
    // the caller's TOC pointer at an ABI designated offset in the linkage
    // area, and the linker will rewrite the nop to be a load of the TOC
    // pointer from the linkage area into gpr2.
    auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
    RetOpc =
        callsShareTOCBase(&Caller, GV, TM) ? PPCISD::CALL : PPCISD::CALL_NOP;
  } else
    RetOpc = PPCISD::CALL;
  if (IsStrictFPCall) {
    switch (RetOpc) {
    default:
      llvm_unreachable("Unknown call opcode");
    case PPCISD::BCTRL_LOAD_TOC:
      RetOpc = PPCISD::BCTRL_LOAD_TOC_RM;
      break;
    case PPCISD::BCTRL:
      RetOpc = PPCISD::BCTRL_RM;
      break;
    case PPCISD::CALL_NOTOC:
      RetOpc = PPCISD::CALL_NOTOC_RM;
      break;
    case PPCISD::CALL:
      RetOpc = PPCISD::CALL_RM;
      break;
    case PPCISD::CALL_NOP:
      RetOpc = PPCISD::CALL_NOP_RM;
      break;
    }
  }
  return RetOpc;
}

static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
                               const SDLoc &dl, const PPCSubtarget &Subtarget) {
  if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
      return SDValue(Dest, 0);

  // Returns true if the callee is local, and false otherwise.
  auto isLocalCallee = [&]() {
    const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;

    return DAG.getTarget().shouldAssumeDSOLocal(*Mod, GV) &&
           !isa_and_nonnull<GlobalIFunc>(GV);
  };

  // The PLT is only used in 32-bit ELF PIC mode.  Attempting to use the PLT in
  // a static relocation model causes some versions of GNU LD (2.17.50, at
  // least) to force BSS-PLT, instead of secure-PLT, even if all objects are
  // built with secure-PLT.
  const bool UsePlt =
      Subtarget.is32BitELFABI() && !isLocalCallee() &&
      Subtarget.getTargetMachine().getRelocationModel() == Reloc::PIC_;

  const auto getAIXFuncEntryPointSymbolSDNode = [&](const GlobalValue *GV) {
    const TargetMachine &TM = Subtarget.getTargetMachine();
    const TargetLoweringObjectFile *TLOF = TM.getObjFileLowering();
    MCSymbolXCOFF *S =
        cast<MCSymbolXCOFF>(TLOF->getFunctionEntryPointSymbol(GV, TM));

    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
    return DAG.getMCSymbol(S, PtrVT);
  };

  auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
  const GlobalValue *GV = G ? G->getGlobal() : nullptr;
  if (isFunctionGlobalAddress(GV)) {
    const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();

    if (Subtarget.isAIXABI()) {
      assert(!isa<GlobalIFunc>(GV) && "IFunc is not supported on AIX.");
      return getAIXFuncEntryPointSymbolSDNode(GV);
    }
    return DAG.getTargetGlobalAddress(GV, dl, Callee.getValueType(), 0,
                                      UsePlt ? PPCII::MO_PLT : 0);
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    const char *SymName = S->getSymbol();
    if (Subtarget.isAIXABI()) {
      // If there exists a user-declared function whose name is the same as the
      // ExternalSymbol's, then we pick up the user-declared version.
      const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
      if (const Function *F =
              dyn_cast_or_null<Function>(Mod->getNamedValue(SymName)))
        return getAIXFuncEntryPointSymbolSDNode(F);

      // On AIX, direct function calls reference the symbol for the function's
      // entry point, which is named by prepending a "." before the function's
      // C-linkage name. A Qualname is returned here because an external
      // function entry point is a csect with XTY_ER property.
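      // For example, an external call to "memcpy" is emitted as a reference
      // to the ".memcpy" entry-point csect.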
      const auto getExternalFunctionEntryPointSymbol = [&](StringRef SymName) {
        auto &Context = DAG.getMachineFunction().getMMI().getContext();
        MCSectionXCOFF *Sec = Context.getXCOFFSection(
            (Twine(".") + Twine(SymName)).str(), SectionKind::getMetadata(),
            XCOFF::CsectProperties(XCOFF::XMC_PR, XCOFF::XTY_ER));
        return Sec->getQualNameSymbol();
      };

      SymName = getExternalFunctionEntryPointSymbol(SymName)->getName().data();
    }
    return DAG.getTargetExternalSymbol(SymName, Callee.getValueType(),
                                       UsePlt ? PPCII::MO_PLT : 0);
  }

  // No transformation needed.
  assert(Callee.getNode() && "What no callee?");
  return Callee;
}

static SDValue getOutputChainFromCallSeq(SDValue CallSeqStart) {
  assert(CallSeqStart.getOpcode() == ISD::CALLSEQ_START &&
         "Expected a CALLSEQ_STARTSDNode.");

  // The last operand is the chain, except when the node has glue. If the node
  // has glue, then the last operand is the glue, and the chain is the second
  // to last operand.
  SDValue LastValue = CallSeqStart.getValue(CallSeqStart->getNumValues() - 1);
  if (LastValue.getValueType() != MVT::Glue)
    return LastValue;

  return CallSeqStart.getValue(CallSeqStart->getNumValues() - 2);
}

// Creates the node that moves a function's address into the count register
// to prepare for an indirect call instruction.
static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                SDValue &Glue, SDValue &Chain,
                                const SDLoc &dl) {
  SDValue MTCTROps[] = {Chain, Callee, Glue};
  EVT ReturnTypes[] = {MVT::Other, MVT::Glue};
  Chain = DAG.getNode(PPCISD::MTCTR, dl, ArrayRef(ReturnTypes, 2),
                      ArrayRef(MTCTROps, Glue.getNode() ? 3 : 2));
  // The glue is the second value produced.
  Glue = Chain.getValue(1);
  Callee = SDValue();
}

static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
                                          SDValue &Glue, SDValue &Chain,
                                          SDValue CallSeqStart,
                                          const CallBase *CB, const SDLoc &dl,
                                          bool hasNest,
                                          const PPCSubtarget &Subtarget) {
  // Function pointers in the 64-bit SVR4 ABI do not point to the function
  // entry point, but to the function descriptor (the function entry point
  // address is part of the function descriptor though).
  // The function descriptor is a three doubleword structure with the
  // following fields: function entry point, TOC base address and
  // environment pointer.
  // Thus for a call through a function pointer, the following actions need
  // to be performed:
  //   1. Save the TOC of the caller in the TOC save area of its stack
  //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
  //   2. Load the address of the function entry point from the function
  //      descriptor.
  //   3. Load the TOC of the callee from the function descriptor into r2.
  //   4. Load the environment pointer from the function descriptor into
  //      r11.
  //   5. Branch to the function entry point address.
  //   6. On return of the callee, the TOC of the caller needs to be
  //      restored (this is done in FinishCall()).
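  //
  // Illustrative descriptor layout on 64-bit ELFv1 (the actual offsets come
  // from descriptorTOCAnchorOffset() and
  // descriptorEnvironmentPointerOffset()):
  //   offset  0: function entry point address
  //   offset  8: TOC base address (loaded into r2)
  //   offset 16: environment pointer (loaded into r11)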
  //
  // The loads are scheduled at the beginning of the call sequence, and the
  // register copies are flagged together to ensure that no other
  // operations can be scheduled in between. E.g. without flagging the
  // copies together, a TOC access in the caller could be scheduled between
  // the assignment of the callee TOC and the branch to the callee, which
  // leads to incorrect code.

  // Start by loading the function address from the descriptor.
  SDValue LDChain = getOutputChainFromCallSeq(CallSeqStart);
  auto MMOFlags = Subtarget.hasInvariantFunctionDescriptors()
                      ? (MachineMemOperand::MODereferenceable |
                         MachineMemOperand::MOInvariant)
                      : MachineMemOperand::MONone;

  MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);

  // Registers used in building the DAG.
  const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
  const MCRegister TOCReg = Subtarget.getTOCPointerRegister();

  // Offsets of descriptor members.
  const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset();
  const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset();

  const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
  const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);

  // One load for the function's entry point address.
  SDValue LoadFuncPtr = DAG.getLoad(RegVT, dl, LDChain, Callee, MPI,
                                    Alignment, MMOFlags);

  // One for loading the TOC anchor for the module that contains the called
  // function.
  SDValue TOCOff = DAG.getIntPtrConstant(TOCAnchorOffset, dl);
  SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, Callee, TOCOff);
  SDValue TOCPtr =
      DAG.getLoad(RegVT, dl, LDChain, AddTOC,
                  MPI.getWithOffset(TOCAnchorOffset), Alignment, MMOFlags);

  // One for loading the environment pointer.
  SDValue PtrOff = DAG.getIntPtrConstant(EnvPtrOffset, dl);
  SDValue AddPtr = DAG.getNode(ISD::ADD, dl, RegVT, Callee, PtrOff);
  SDValue LoadEnvPtr =
      DAG.getLoad(RegVT, dl, LDChain, AddPtr,
                  MPI.getWithOffset(EnvPtrOffset), Alignment, MMOFlags);

  // Then copy the newly loaded TOC anchor to the TOC pointer.
  SDValue TOCVal = DAG.getCopyToReg(Chain, dl, TOCReg, TOCPtr, Glue);
  Chain = TOCVal.getValue(0);
  Glue = TOCVal.getValue(1);

  // If the function call has an explicit 'nest' parameter, it takes the
  // place of the environment pointer.
  assert((!hasNest || !Subtarget.isAIXABI()) &&
         "Nest parameter is not supported on AIX.");
  if (!hasNest) {
    SDValue EnvVal = DAG.getCopyToReg(Chain, dl, EnvPtrReg, LoadEnvPtr, Glue);
    Chain = EnvVal.getValue(0);
    Glue = EnvVal.getValue(1);
  }

  // The rest of the indirect call sequence is the same as the non-descriptor
  // DAG.
  prepareIndirectCall(DAG, LoadFuncPtr, Glue, Chain, dl);
}

static void
buildCallOperands(SmallVectorImpl<SDValue> &Ops,
                  PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
                  SelectionDAG &DAG,
                  SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
                  SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
                  const PPCSubtarget &Subtarget) {
  const bool IsPPC64 = Subtarget.isPPC64();
  // MVT for a general purpose register.
  const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;

  // First operand is always the chain.
  Ops.push_back(Chain);

  // If it's a direct call pass the callee as the second operand.
  if (!CFlags.IsIndirect)
    Ops.push_back(Callee);
  else {
    assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");

    // For the TOC based ABIs, we have saved the TOC pointer to the linkage
    // area on the stack (this would have been done in `LowerCall_64SVR4` or
    // `LowerCall_AIX`). The call instruction is a pseudo instruction that
    // represents both the indirect branch and a load that restores the TOC
    // pointer from the linkage area. The operand for the TOC restore is an
    // add of the TOC save offset to the stack pointer. This must be the
    // second operand: after the chain input but before any other variadic
    // arguments. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as
    // it is not saved or used.
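    //
    // Illustrative final code for such an indirect call on 64-bit ELFv2
    // (the actual offset comes from getTOCSaveOffset()):
    //   bctrl
    //   ld r2, 24(r1)   # reload the caller's TOC pointer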
    if (isTOCSaveRestoreRequired(Subtarget)) {
      const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();

      SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
      unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
      SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, RegVT, StackPtr, TOCOff);
      Ops.push_back(AddTOC);
    }

    // Add the register used for the environment pointer.
    if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
      Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
                                    RegVT));

    // Add CTR register as callee so a bctr can be emitted later.
    if (CFlags.IsTailCall)
      Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
  }

  // If this is a tail call add stack pointer delta.
  if (CFlags.IsTailCall)
    Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
  // no way to mark dependencies as implicit here.
  // We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
  if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
      !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
    Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls.
  if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *Mask =
      TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  // If the glue is valid, it is the last operand.
  if (Glue.getNode())
    Ops.push_back(Glue);
}

SDValue PPCTargetLowering::FinishCall(
    CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
    SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
    unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
    SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {

  if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
      Subtarget.isAIXABI())
    setUsesTOCBasePtr(DAG);

  unsigned CallOpc =
      getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
                    Subtarget, DAG.getTarget(), CB ? CB->isStrictFP() : false);

  if (!CFlags.IsIndirect)
    Callee = transformCallee(Callee, DAG, dl, Subtarget);
  else if (Subtarget.usesFunctionDescriptors())
    prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
                                  dl, CFlags.HasNest, Subtarget);
  else
    prepareIndirectCall(DAG, Callee, Glue, Chain, dl);

  // Build the operand list for the call instruction.
  SmallVector<SDValue, 8> Ops;
  buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
                    SPDiff, Subtarget);

  // Emit the tail call.
  if (CFlags.IsTailCall) {
    // Indirect tail calls when using PC Relative calls do not have the same
    // constraints.
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee) ||
            (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
           "Expecting a global address, external symbol, absolute value, "
           "register or an indirect tail call when PC Relative calls are "
           "used.");
    // PC Relative calls also use TC_RETURN as the way to mark tail calls.
    assert(CallOpc == PPCISD::TC_RETURN &&
           "Unexpected call opcode for a tail call.");
    DAG.getMachineFunction().getFrameInfo().setHasTailCall();
    SDValue Ret = DAG.getNode(CallOpc, dl, MVT::Other, Ops);
    DAG.addNoMergeSiteInfo(Ret.getNode(), CFlags.NoMerge);
    return Ret;
  }

  std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
  Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
  DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
  Glue = Chain.getValue(1);

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
                         getTargetMachine().Options.GuaranteedTailCallOpt)
                            ? NumBytes
                            : 0;

  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, BytesCalleePops, Glue, dl);
  Glue = Chain.getValue(1);

  return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins,
                         dl, DAG, InVals);
}

bool PPCTargetLowering::supportsTailCallFor(const CallBase *CB) const {
  CallingConv::ID CalleeCC = CB->getCallingConv();
  const Function *CallerFunc = CB->getCaller();
  CallingConv::ID CallerCC = CallerFunc->getCallingConv();
  const Function *CalleeFunc = CB->getCalledFunction();
  if (!CalleeFunc)
    return false;
  const GlobalValue *CalleeGV = dyn_cast<GlobalValue>(CalleeFunc);

  SmallVector<ISD::OutputArg, 2> Outs;
  SmallVector<ISD::InputArg, 2> Ins;

  GetReturnInfo(CalleeCC, CalleeFunc->getReturnType(),
                CalleeFunc->getAttributes(), Outs, *this,
                CalleeFunc->getParent()->getDataLayout());

  return isEligibleForTCO(CalleeGV, CalleeCC, CallerCC, CB,
                          CalleeFunc->isVarArg(), Outs, Ins, CallerFunc,
                          false /*isCalleeExternalSymbol*/);
}

bool PPCTargetLowering::isEligibleForTCO(
    const GlobalValue *CalleeGV, CallingConv::ID CalleeCC,
    CallingConv::ID CallerCC, const CallBase *CB, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<ISD::InputArg> &Ins, const Function *CallerFunc,
    bool isCalleeExternalSymbol) const {
  if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
    return false;

  if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
    return IsEligibleForTailCallOptimization_64SVR4(
        CalleeGV, CalleeCC, CallerCC, CB, isVarArg, Outs, Ins, CallerFunc,
        isCalleeExternalSymbol);

  return IsEligibleForTailCallOptimization(CalleeGV, CalleeCC, CallerCC,
                                           isVarArg, Ins);
}

SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;
  bool isPatchPoint = CLI.IsPatchPoint;
  const CallBase *CB = CLI.CB;

  if (isTailCall) {
    MachineFunction &MF = DAG.getMachineFunction();
    CallingConv::ID CallerCC = MF.getFunction().getCallingConv();
    auto *G = dyn_cast<GlobalAddressSDNode>(Callee);
    const GlobalValue *GV = G ? G->getGlobal() : nullptr;
    bool IsCalleeExternalSymbol = isa<ExternalSymbolSDNode>(Callee);

    isTailCall =
        isEligibleForTCO(GV, CallConv, CallerCC, CB, isVarArg, Outs, Ins,
                         &(MF.getFunction()), IsCalleeExternalSymbol);
    if (isTailCall) {
      ++NumTailCalls;
      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
        ++NumSiblingCalls;

      // PC Relative calls no longer guarantee that the callee is a Global
      // Address Node. The callee could be an indirect tail call in which
      // case the SDValue for the callee could be a load (to load the address
      // of a function pointer) or it may be a register copy (to move the
      // address of the callee from a function parameter into a virtual
      // register). It may also be an ExternalSymbolSDNode (ex memcopy).
      assert((Subtarget.isUsingPCRelativeCalls() ||
              isa<GlobalAddressSDNode>(Callee)) &&
             "Callee should be an llvm::Function object.");

      LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
                        << "\nTCO callee: ");
      LLVM_DEBUG(Callee.dump());
    }
  }

  if (!isTailCall && CB && CB->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  // When long calls (i.e. indirect calls) are always used, calls are always
  // made via function pointer. If we have a function name, first translate it
  // into a pointer.
  if (Subtarget.useLongCalls() && isa<GlobalAddressSDNode>(Callee) &&
      !isTailCall)
    Callee = LowerGlobalAddress(Callee, DAG);

  CallFlags CFlags(
      CallConv, isTailCall, isVarArg, isPatchPoint,
      isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
      // hasNest
      Subtarget.is64BitELFABI() &&
          any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
      CLI.NoMerge);

  if (Subtarget.isAIXABI())
    return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                         InVals, CB);

  assert(Subtarget.isSVR4ABI());
  if (Subtarget.isPPC64())
    return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                            InVals, CB);
  return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
                          InVals, CB);
}

SDValue PPCTargetLowering::LowerCall_32SVR4(
    SDValue Chain, SDValue Callee, CallFlags CFlags,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    const CallBase *CB) const {
  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
  // of the 32-bit SVR4 ABI stack frame layout.

  const CallingConv::ID CallConv = CFlags.CallConv;
  const bool IsVarArg = CFlags.IsVarArg;
  const bool IsTailCall = CFlags.IsTailCall;

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Cold ||
          CallConv == CallingConv::Fast) && "Unknown calling convention!");

  const Align PtrAlign(4);

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence, the frame pointer will be used for dynamic
  // allocation and for restoring the caller's stack pointer in this
  // function's epilogue. This is done because the tail-called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, parameter list area and the part of the local variable space which
  // contains copies of aggregates which are passed by value.

  // Assign locations to all of the outgoing arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
                       PtrAlign);
  if (useSoftFloat())
    CCInfo.PreAnalyzeCallOperands(Outs);

  if (IsVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Fixed vector arguments go into registers as long as registers are
    // available. Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      bool Result;

      if (Outs[i].IsFixed) {
        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
                               CCInfo);
      } else {
        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
                                      ArgFlags, CCInfo);
      }

      if (Result) {
#ifndef NDEBUG
        errs() << "Call operand #" << i << " has unhandled type "
               << EVT(ArgVT).getEVTString() << "\n";
#endif
        llvm_unreachable(nullptr);
      }
    }
  } else {
    // All arguments are treated the same.
    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
  }
  CCInfo.clearWasPPCF128();
5875 // Assign locations to all of the outgoing aggregate by value arguments.
5876 SmallVector<CCValAssign, 16> ByValArgLocs;
5877 CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
5879 // Reserve stack space for the allocations in CCInfo.
5880 CCByValInfo.AllocateStack(CCInfo.getStackSize(), PtrAlign);
5882 CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
5884 // Size of the linkage area, parameter list area and the part of the local
5885 // space variable where copies of aggregates which are passed by value are
5887 unsigned NumBytes = CCByValInfo.getStackSize();
5889 // Calculate by how many bytes the stack has to be adjusted in case of tail
5890 // call optimization.
5891 int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
5893 // Adjust the stack pointer for the new arguments...
5894 // These operations are automatically eliminated by the prolog/epilog pass
5895 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
5896 SDValue CallSeqStart = Chain;
5898 // Load the return address and frame pointer so it can be moved somewhere else
5901 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
5903 // Set up a copy of the stack pointer for use loading and storing any
5904 // arguments that may not fit in the registers available for argument
5905 // passing.
5906 SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
5908 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
5909 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
5910 SmallVector<SDValue, 8> MemOpChains;
5912 bool seenFloatArg = false;
5913 // Walk the register/memloc assignments, inserting copies/loads.
5914 // i - Tracks the index into the list of registers allocated for the call
5915 // RealArgIdx - Tracks the index into the list of actual function arguments
5916 // j - Tracks the index into the list of byval arguments
5917 for (unsigned i = 0, RealArgIdx = 0, j = 0, e = ArgLocs.size();
5918      i != e;
5919      ++i, ++RealArgIdx) {
5920 CCValAssign &VA = ArgLocs[i];
5921 SDValue Arg = OutVals[RealArgIdx];
5922 ISD::ArgFlagsTy Flags = Outs[RealArgIdx].Flags;
5924 if (Flags.isByVal()) {
5925 // Argument is an aggregate which is passed by value, thus we need to
5926 // create a copy of it in the local variable space of the current stack
5927 // frame (which is the stack frame of the caller) and pass the address of
5928 // this copy to the callee.
5929 assert((j < ByValArgLocs.size()) && "Index out of bounds!");
5930 CCValAssign &ByValVA = ByValArgLocs[j++];
5931 assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
5933 // Memory reserved in the local variable space of the callers stack frame.
5934 unsigned LocMemOffset = ByValVA.getLocMemOffset();
5936 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
5937     PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
5938                          StackPtr, PtrOff);
5940     // Create a copy of the argument in the local area of the current
5941     // stack frame.
5942     SDValue MemcpyCall =
5943       CreateCopyOfByValArgument(Arg, PtrOff,
5944                                 CallSeqStart.getNode()->getOperand(0),
5945                                 Flags, DAG, dl);
5947     // This must go outside the CALLSEQ_START..END.
5948     SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0,
5949                                                    SDLoc(MemcpyCall));
5950     DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
5951                            NewCallSeqStart.getNode());
5952     Chain = CallSeqStart = NewCallSeqStart;
5954 // Pass the address of the aggregate copy on the stack either in a
5955 // physical register or in the parameter list area of the current stack
5956     // frame to the callee.
5957     Arg = PtrOff;
5958   }
5960   // When useCRBits() is true, there can be i1 arguments.
5961 // It is because getRegisterType(MVT::i1) => MVT::i1,
5962 // and for other integer types getRegisterType() => MVT::i32.
5963 // Extend i1 and ensure callee will get i32.
5964 if (Arg.getValueType() == MVT::i1)
5965     Arg = DAG.getNode(Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
5966                       dl, MVT::i32, Arg);
5968 if (VA.isRegLoc()) {
5969 seenFloatArg |= VA.getLocVT().isFloatingPoint();
5970 // Put argument in a physical register.
5971 if (Subtarget.hasSPE() && Arg.getValueType() == MVT::f64) {
5972 bool IsLE = Subtarget.isLittleEndian();
5973 SDValue SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
5974 DAG.getIntPtrConstant(IsLE ? 0 : 1, dl));
5975 RegsToPass.push_back(std::make_pair(VA.getLocReg(), SVal.getValue(0)));
5976 SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
5977 DAG.getIntPtrConstant(IsLE ? 1 : 0, dl));
5978       RegsToPass.push_back(std::make_pair(ArgLocs[++i].getLocReg(),
5979                                           SVal.getValue(0)));
5980     } else
5981       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
5982   } else {
5983     // Put argument in the parameter list area of the current stack frame.
5984     assert(VA.isMemLoc());
5985     unsigned LocMemOffset = VA.getLocMemOffset();
5987     if (!IsTailCall) {
5988       SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
5989       PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
5990                            StackPtr, PtrOff);
5992       MemOpChains.push_back(
5993           DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
5994     } else {
5995       // Calculate and remember argument location.
5996       CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
5997                                TailCallArguments);
5998     }
5999   }
6000 }
6002 if (!MemOpChains.empty())
6003 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
6005 // Build a sequence of copy-to-reg nodes chained together with token chain
6006 // and flag operands which copy the outgoing args into the appropriate regs.
6007 SDValue InGlue;
6008 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
6009 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
6010 RegsToPass[i].second, InGlue);
6011   InGlue = Chain.getValue(1);
6012 }
6014 // Set CR bit 6 to true if this is a vararg call with floating args passed in
6015 // registers.
6016 if (IsVarArg) {
6017   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
6018   SDValue Ops[] = { Chain, InGlue };
6020   Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, dl,
6021                       VTs, ArrayRef(Ops, InGlue.getNode() ? 2 : 1));
6023   InGlue = Chain.getValue(1);
6024 }
6026 if (IsTailCall)
6027   PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6028                   TailCallArguments);
6030 return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
6031                   Callee, SPDiff, NumBytes, Ins, InVals, CB);
6032 }
6034 // Copy an argument into memory, being careful to do this outside the
6035 // call sequence for the call to which the argument belongs.
6036 SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
6037 SDValue Arg, SDValue PtrOff, SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
6038 SelectionDAG &DAG, const SDLoc &dl) const {
6039 SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
6040                      CallSeqStart.getNode()->getOperand(0),
6041                      Flags, DAG, dl);
6042 // The MEMCPY must go outside the CALLSEQ_START..END.
6043 int64_t FrameSize = CallSeqStart.getConstantOperandVal(1);
6044 SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0,
6045                                                SDLoc(MemcpyCall));
6046 DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
6047 NewCallSeqStart.getNode());
6048 return NewCallSeqStart;
6049 }
6051 SDValue PPCTargetLowering::LowerCall_64SVR4(
6052 SDValue Chain, SDValue Callee, CallFlags CFlags,
6053 const SmallVectorImpl<ISD::OutputArg> &Outs,
6054 const SmallVectorImpl<SDValue> &OutVals,
6055 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
6056 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
6057 const CallBase *CB) const {
6058 bool isELFv2ABI = Subtarget.isELFv2ABI();
6059 bool isLittleEndian = Subtarget.isLittleEndian();
6060 unsigned NumOps = Outs.size();
6061 bool IsSibCall = false;
6062 bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
6064 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6065 unsigned PtrByteSize = 8;
6067 MachineFunction &MF = DAG.getMachineFunction();
6069 if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
6070   IsSibCall = true;
6072 // Mark this function as potentially containing a function that contains a
6073 // tail call. As a consequence, the frame pointer will be used for dynamic
6074 // allocas and for restoring the caller's stack pointer in this function's
6075 // epilogue. This is done because the tail-called function might overwrite
6076 // the value in this function's (MF) stack pointer stack slot 0(SP).
6077 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6078 MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
6080 assert(!(IsFastCall && CFlags.IsVarArg) &&
6081 "fastcc not supported on varargs functions");
6083 // Count how many bytes are to be pushed on the stack, including the linkage
6084 // area, and parameter passing area. On ELFv1, the linkage area is 48 bytes
6085 // reserved space for [SP][CR][LR][2 x unused][TOC]; on ELFv2, the linkage
6086 // area is 32 bytes reserved space for [SP][CR][LR][TOC].
6087 unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
6088 unsigned NumBytes = LinkageSize;
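// Editor's note (worked arithmetic, not from the original source): with
// 8-byte slots, the ELFv1 linkage area is 6 * 8 == 48 bytes
// ([SP][CR][LR][2 x unused][TOC]) and the ELFv2 linkage area is
// 4 * 8 == 32 bytes ([SP][CR][LR][TOC]); getLinkageSize() is expected to
// return one of these values here.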
6089 unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
6091 static const MCPhysReg GPR[] = {
6092 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6093   PPC::X7, PPC::X8, PPC::X9, PPC::X10,
6094 };
6095 static const MCPhysReg VR[] = {
6096   PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
6097   PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
6098 };
6100 const unsigned NumGPRs = std::size(GPR);
6101 const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
6102 const unsigned NumVRs = std::size(VR);
6104 // On ELFv2, we can avoid allocating the parameter area if all the arguments
6105 // can be passed to the callee in registers.
6106 // For the fast calling convention, there is another check below.
6107 // Note: We should keep consistent with LowerFormalArguments_64SVR4()
6108 bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
6109 if (!HasParameterArea) {
6110 unsigned ParamAreaSize = NumGPRs * PtrByteSize;
6111 unsigned AvailableFPRs = NumFPRs;
6112 unsigned AvailableVRs = NumVRs;
6113 unsigned NumBytesTmp = NumBytes;
6114 for (unsigned i = 0; i != NumOps; ++i) {
6115 if (Outs[i].Flags.isNest()) continue;
6116 if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
6117 PtrByteSize, LinkageSize, ParamAreaSize,
6118                                NumBytesTmp, AvailableFPRs, AvailableVRs))
6119       HasParameterArea = true;
6120   }
6121 }
6123 // When using the fast calling convention, we don't provide backing for
6124 // arguments that will be in registers.
6125 unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
6127 // Avoid allocating parameter area for fastcc functions if all the arguments
6128 // can be passed in the registers.
6129 if (IsFastCall)
6130   HasParameterArea = false;
6132 // Add up all the space actually used.
6133 for (unsigned i = 0; i != NumOps; ++i) {
6134 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6135 EVT ArgVT = Outs[i].VT;
6136   EVT OrigVT = Outs[i].ArgVT;
6138   if (Flags.isNest())
6139     continue;
6141   if (IsFastCall) {
6142     if (Flags.isByVal()) {
6143       NumGPRsUsed += (Flags.getByValSize()+7)/8;
6144       if (NumGPRsUsed > NumGPRs)
6145         HasParameterArea = true;
6146     } else {
6147       switch (ArgVT.getSimpleVT().SimpleTy) {
6148       default: llvm_unreachable("Unexpected ValueType for argument!");
6149       case MVT::i1:
6150       case MVT::i32:
6151       case MVT::i64:
6152         if (++NumGPRsUsed <= NumGPRs)
6153           continue;
6154         break;
6155       case MVT::v4i32:
6156       case MVT::v8i16:
6157       case MVT::v16i8:
6158       case MVT::v2f64:
6159       case MVT::v2i64:
6160       case MVT::v1i128:
6161       case MVT::f128:
6162         if (++NumVRsUsed <= NumVRs)
6163           continue;
6164         break;
6165       case MVT::v4f32:
6166         if (++NumVRsUsed <= NumVRs)
6167           continue;
6168         break;
6169       case MVT::f32:
6170       case MVT::f64:
6171         if (++NumFPRsUsed <= NumFPRs)
6172           continue;
6173         break;
6174       }
6175       HasParameterArea = true;
6176     }
6177   }
6179   /* Respect alignment of argument on the stack. */
6180   auto Alignment =
6181       CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6182   NumBytes = alignTo(NumBytes, Alignment);
6184   NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
6185   if (Flags.isInConsecutiveRegsLast())
6186     NumBytes = ((NumBytes + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6187 }
6189 unsigned NumBytesActuallyUsed = NumBytes;
6191 // In the old ELFv1 ABI,
6192 // the prolog code of the callee may store up to 8 GPR argument registers to
6193 // the stack, allowing va_start to index over them in memory if it is varargs.
6194 // Because we cannot tell if this is needed on the caller side, we have to
6195 // conservatively assume that it is needed. As such, make sure we have at
6196 // least enough stack space for the caller to store the 8 GPRs.
6197 // In the ELFv2 ABI, we allocate the parameter area iff a callee
6198 // really requires memory operands, e.g. a vararg function.
6199 if (HasParameterArea)
6200   NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
6201 else
6202   NumBytes = LinkageSize;
6204 // Tail call needs the stack to be aligned.
6205 if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
6206   NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
6208 int SPDiff = 0;
6210 // Calculate by how many bytes the stack has to be adjusted in case of tail
6211 // call optimization.
6212 if (!IsSibCall)
6213   SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
6215 // To protect arguments on the stack from being clobbered in a tail call,
6216 // force all the loads to happen before doing any other lowering.
6217 if (CFlags.IsTailCall)
6218 Chain = DAG.getStackArgumentTokenFactor(Chain);
6220 // Adjust the stack pointer for the new arguments...
6221 // These operations are automatically eliminated by the prolog/epilog pass.
6222 if (!IsSibCall)
6223   Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
6224 SDValue CallSeqStart = Chain;
6226 // Load the return address and frame pointer so they can be moved somewhere
6227 // else later.
6228 SDValue LROp, FPOp;
6229 Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, dl);
6231 // Set up a copy of the stack pointer for use loading and storing any
6232 // arguments that may not fit in the registers available for argument
6233 // passing.
6234 SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
6236 // Figure out which arguments are going to go in registers, and which in
6237 // memory. Also, if this is a vararg function, floating point operations
6238 // must be stored to our stack, and loaded into integer regs as well, if
6239 // any integer regs are available for argument passing.
6240 unsigned ArgOffset = LinkageSize;
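// Editor's note (illustrative, not from the original source): because each
// GPR shadows one pointer-sized slot of the parameter save area, the GPR
// that corresponds to a given offset can be recovered arithmetically, e.g.
//   GPR index = (ArgOffset - LinkageSize) / PtrByteSize;
// so an ArgOffset of LinkageSize + 16 corresponds to GPR X5.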
6242 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6243 SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
6245 SmallVector<SDValue, 8> MemOpChains;
6246 for (unsigned i = 0; i != NumOps; ++i) {
6247 SDValue Arg = OutVals[i];
6248 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6249 EVT ArgVT = Outs[i].VT;
6250 EVT OrigVT = Outs[i].ArgVT;
6252   // PtrOff will be used to store the current argument to the stack if a
6253   // register cannot be found for it.
6254   SDValue PtrOff;
6256 // We re-align the argument offset for each argument, except when using the
6257 // fast calling convention, when we need to make sure we do that only when
6258 // we'll actually use a stack slot.
6259 auto ComputePtrOff = [&]() {
6260     /* Respect alignment of argument on the stack. */
6261     auto Alignment =
6262         CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
6263     ArgOffset = alignTo(ArgOffset, Alignment);
6265 PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
6267     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
6268   };
6270   if (!IsFastCall) {
6271     ComputePtrOff();
6273     /* Compute GPR index associated with argument offset. */
6274     GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
6275     GPR_idx = std::min(GPR_idx, NumGPRs);
6276   }
6278 // Promote integers to 64-bit values.
6279 if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
6280 // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
6281 unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6282     Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
6283   }
6285 // FIXME memcpy is used way more than necessary. Correctness first.
6286   // Note: "by value" is code for passing a structure by value, not
6287   // basic types.
6288 if (Flags.isByVal()) {
6289 // Note: Size includes alignment padding, so
6290 // struct x { short a; char b; }
6291 // will have Size = 4. With #pragma pack(1), it will have Size = 3.
6292 // These are the proper values we need for right-justifying the
6293 // aggregate in a parameter register.
6294 unsigned Size = Flags.getByValSize();
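// Editor's note (worked example, not from the original source): for
//   struct x { short a; char b; };   // Size == 4 after padding
// passed on big-endian PPC64, the aggregate must end up in the
// least-significant bytes of its 8-byte slot, so the copy below is placed
// at PtrOff + (8 - Size) before the whole slot is loaded into a GPR.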
6296     // An empty aggregate parameter takes up no storage and no
6297     // registers.
6298     if (Size == 0)
6299       continue;
6301     if (IsFastCall)
6302       ComputePtrOff();
6304 // All aggregates smaller than 8 bytes must be passed right-justified.
6305 if (Size==1 || Size==2 || Size==4) {
6306 EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
6307 if (GPR_idx != NumGPRs) {
6308 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
6309 MachinePointerInfo(), VT);
6310 MemOpChains.push_back(Load.getValue(1));
6311 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
6313         ArgOffset += PtrByteSize;
6314         continue;
6315       }
6316     }
6318     if (GPR_idx == NumGPRs && Size < 8) {
6319 SDValue AddPtr = PtrOff;
6320 if (!isLittleEndian) {
6321 SDValue Const = DAG.getConstant(PtrByteSize - Size, dl,
6322 PtrOff.getValueType());
6323         AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
6324       }
6325       Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
6326                                                         CallSeqStart,
6327                                                         Flags, DAG, dl);
6328       ArgOffset += PtrByteSize;
6329       continue;
6330     }
6331     // Copy the object to the parameter save area if it cannot be entirely
6332     // passed by registers.
6333 // FIXME: we only need to copy the parts which need to be passed in
6334 // parameter save area. For the parts passed by registers, we don't need
6335 // to copy them to the stack although we need to allocate space for them
6336 // in parameter save area.
6337 if ((NumGPRs - GPR_idx) * PtrByteSize < Size)
6338       Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
6339                                                         CallSeqStart,
6340                                                         Flags, DAG, dl);
6342 // When a register is available, pass a small aggregate right-justified.
6343 if (Size < 8 && GPR_idx != NumGPRs) {
6344 // The easiest way to get this right-justified in a register
6345 // is to copy the structure into the rightmost portion of a
6346       // local variable slot, then load the whole slot into the
6347       // register.
6348 // FIXME: The memcpy seems to produce pretty awful code for
6349 // small aggregates, particularly for packed ones.
6350 // FIXME: It would be preferable to use the slot in the
6351 // parameter save area instead of a new local variable.
6352 SDValue AddPtr = PtrOff;
6353 if (!isLittleEndian) {
6354 SDValue Const = DAG.getConstant(8 - Size, dl, PtrOff.getValueType());
6355         AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
6356       }
6357       Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
6358                                                         CallSeqStart,
6359                                                         Flags, DAG, dl);
6361       // Load the slot into the register.
6362       SDValue Load =
6363           DAG.getLoad(PtrVT, dl, Chain, PtrOff, MachinePointerInfo());
6364 MemOpChains.push_back(Load.getValue(1));
6365 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
6367 // Done with this argument.
6368       ArgOffset += PtrByteSize;
6369       continue;
6370     }
6372     // For aggregates larger than PtrByteSize, copy the pieces of the
6373 // object that fit into registers from the parameter save area.
6374 for (unsigned j=0; j<Size; j+=PtrByteSize) {
6375 SDValue Const = DAG.getConstant(j, dl, PtrOff.getValueType());
6376 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
6377 if (GPR_idx != NumGPRs) {
6378 unsigned LoadSizeInBits = std::min(PtrByteSize, (Size - j)) * 8;
6379 EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), LoadSizeInBits);
6380 SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, AddArg,
6381 MachinePointerInfo(), ObjType);
6383 MemOpChains.push_back(Load.getValue(1));
6384 RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
6385         ArgOffset += PtrByteSize;
6386       } else {
6387         ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
6388         break;
6389       }
6390     }
6391     continue;
6392   }
6394   switch (Arg.getSimpleValueType().SimpleTy) {
6395   default: llvm_unreachable("Unexpected ValueType for argument!");
6396   case MVT::i1:
6397   case MVT::i32:
6398   case MVT::i64:
6399     if (Flags.isNest()) {
6400 // The 'nest' parameter, if any, is passed in R11.
6401       RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
6402       break;
6403     }
6405     // These can be scalar arguments or elements of an integer array type
6406 // passed directly. Clang may use those instead of "byval" aggregate
6407 // types to avoid forcing arguments to memory unnecessarily.
6408 if (GPR_idx != NumGPRs) {
6409       RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
6410     } else {
6411       if (IsFastCall)
6412         ComputePtrOff();
6414       assert(HasParameterArea &&
6415 "Parameter area must exist to pass an argument in memory.");
6416 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6417 true, CFlags.IsTailCall, false, MemOpChains,
6418                        TailCallArguments, dl);
6419       if (IsFastCall)
6420         ArgOffset += PtrByteSize;
6421     }
6422     if (!IsFastCall)
6423       ArgOffset += PtrByteSize;
6424     break;
6425   case MVT::f32:
6426   case MVT::f64: {
6427     // These can be scalar arguments or elements of a float array type
6428 // passed directly. The latter are used to implement ELFv2 homogenous
6429 // float aggregates.
6431 // Named arguments go into FPRs first, and once they overflow, the
6432 // remaining arguments go into GPRs and then the parameter save area.
6433 // Unnamed arguments for vararg functions always go to GPRs and
6434 // then the parameter save area. For now, put all arguments to vararg
6435 // routines always in both locations (FPR *and* GPR or stack slot).
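    // Editor's note (illustrative, not from the original source): for a
    // hypothetical call like
    //   void g(double a, ...);  g(1.0, 2.0);
    // the variadic 2.0 is loaded into an FPR *and* mirrored into a GPR (or a
    // stack slot), since the callee may read it through va_arg.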
6436 bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
6437 bool NeededLoad = false;
6439 // First load the argument into the next available FPR.
6440 if (FPR_idx != NumFPRs)
6441 RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
6443 // Next, load the argument into GPR or stack slot if needed.
6444     if (!NeedGPROrStack)
6445       ;
6446 else if (GPR_idx != NumGPRs && !IsFastCall) {
6447 // FIXME: We may want to re-enable this for CallingConv::Fast on the P8
6448 // once we support fp <-> gpr moves.
6450 // In the non-vararg case, this can only ever happen in the
6451 // presence of f32 array types, since otherwise we never run
6452       // out of FPRs before running out of GPRs.
6453       SDValue ArgVal;
6455 // Double values are always passed in a single GPR.
6456 if (Arg.getValueType() != MVT::f32) {
6457 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
6459 // Non-array float values are extended and passed in a GPR.
6460 } else if (!Flags.isInConsecutiveRegs()) {
6461 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
6462 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
6464 // If we have an array of floats, we collect every odd element
6465 // together with its predecessor into one GPR.
6466       } else if (ArgOffset % PtrByteSize != 0) {
6467         SDValue Lo, Hi;
6468         Lo = DAG.getNode(ISD::BITCAST, dl, MVT::i32, OutVals[i - 1]);
6469         Hi = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
6470         if (!isLittleEndian)
6471           std::swap(Lo, Hi);
6472         ArgVal = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6474 // The final element, if even, goes into the first half of a GPR.
6475 } else if (Flags.isInConsecutiveRegsLast()) {
6476 ArgVal = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Arg);
6477 ArgVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i64, ArgVal);
6478 if (!isLittleEndian)
6479 ArgVal = DAG.getNode(ISD::SHL, dl, MVT::i64, ArgVal,
6480 DAG.getConstant(32, dl, MVT::i32));
6482 // Non-final even elements are skipped; they will be handled
6483       // together with the subsequent argument on the next go-around.
6484       } else
6485         ArgVal = SDValue();
6487       if (ArgVal.getNode())
6488         RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
6489     } else {
6490       if (IsFastCall)
6491         ComputePtrOff();
6493 // Single-precision floating-point values are mapped to the
6494 // second (rightmost) word of the stack doubleword.
6495 if (Arg.getValueType() == MVT::f32 &&
6496 !isLittleEndian && !Flags.isInConsecutiveRegs()) {
6497 SDValue ConstFour = DAG.getConstant(4, dl, PtrOff.getValueType());
6498         PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
6499       }
6501       assert(HasParameterArea &&
6502 "Parameter area must exist to pass an argument in memory.");
6503 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6504 true, CFlags.IsTailCall, false, MemOpChains,
6505                        TailCallArguments, dl);
6507       NeededLoad = true;
6508     }
6509     // When passing an array of floats, the array occupies consecutive
6510 // space in the argument area; only round up to the next doubleword
6511 // at the end of the array. Otherwise, each float takes 8 bytes.
6512 if (!IsFastCall || NeededLoad) {
6513 ArgOffset += (Arg.getValueType() == MVT::f32 &&
6514 Flags.isInConsecutiveRegs()) ? 4 : 8;
6515 if (Flags.isInConsecutiveRegsLast())
6516         ArgOffset = ((ArgOffset + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
6517     }
6518     break;
6519   }
6520   case MVT::v4f32:
6521   case MVT::v4i32:
6522   case MVT::v8i16:
6523   case MVT::v16i8:
6524   case MVT::v2f64:
6525   case MVT::v2i64:
6526   case MVT::v1i128:
6527   case MVT::f128:
6528     // These can be scalar arguments or elements of a vector array type
6529 // passed directly. The latter are used to implement ELFv2 homogenous
6530 // vector aggregates.
6532 // For a varargs call, named arguments go into VRs or on the stack as
6533 // usual; unnamed arguments always go to the stack or the corresponding
6534 // GPRs when within range. For now, we always put the value in both
6535 // locations (or even all three).
6536 if (CFlags.IsVarArg) {
6537 assert(HasParameterArea &&
6538 "Parameter area must exist if we have a varargs call.");
6539 // We could elide this store in the case where the object fits
6540       // entirely in R registers. Maybe later.
6541       SDValue Store =
6542           DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
6543 MemOpChains.push_back(Store);
6544 if (VR_idx != NumVRs) {
6545         SDValue Load =
6546             DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, MachinePointerInfo());
6547 MemOpChains.push_back(Load.getValue(1));
6548         RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
6549       }
6551       for (unsigned i=0; i<16; i+=PtrByteSize) {
6552         if (GPR_idx == NumGPRs)
6553           break;
6554 SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
6555                                  DAG.getConstant(i, dl, PtrVT));
6556         SDValue Load =
6557             DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo());
6558 MemOpChains.push_back(Load.getValue(1));
6559         RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
6560       }
6561       break;
6562     }
6564     // Non-varargs Altivec params go into VRs or on the stack.
6565 if (VR_idx != NumVRs) {
6566       RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
6567     } else {
6568       if (IsFastCall)
6569         ComputePtrOff();
6571       assert(HasParameterArea &&
6572 "Parameter area must exist to pass an argument in memory.");
6573 LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
6574 true, CFlags.IsTailCall, true, MemOpChains,
6575                        TailCallArguments, dl);
6576       if (IsFastCall)
6577         ArgOffset += 16;
6578     }
6580     if (!IsFastCall)
6581       ArgOffset += 16;
6582     break;
6583   }
6584 }
6586 assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
6587 "mismatch in size of parameter area");
6588 (void)NumBytesActuallyUsed;
6590 if (!MemOpChains.empty())
6591 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
6593 // Check if this is an indirect call (MTCTR/BCTRL).
6594 // See prepareDescriptorIndirectCall and buildCallOperands for more
6595 // information about calls through function pointers in the 64-bit SVR4 ABI.
6596 if (CFlags.IsIndirect) {
6597 // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
6598 // caller in the TOC save area.
6599 if (isTOCSaveRestoreRequired(Subtarget)) {
6600 assert(!CFlags.IsTailCall && "Indirect tails calls not supported");
6601 // Load r2 into a virtual register and store it to the TOC save area.
6602 setUsesTOCBasePtr(DAG);
6603 SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
6604 // TOC save area offset.
6605 unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
6606 SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
6607 SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
6608 Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
6609 MachinePointerInfo::getStack(
6610                                  DAG.getMachineFunction(), TOCSaveOffset));
6611   }
6612   // In the ELFv2 ABI, R12 must contain the address of an indirect callee.
6613 // This does not mean the MTCTR instruction must use R12; it's easier
6614 // to model this as an extra parameter, so do that.
6615 if (isELFv2ABI && !CFlags.IsPatchPoint)
6616     RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
6617 }
6619 // Build a sequence of copy-to-reg nodes chained together with token chain
6620 // and flag operands which copy the outgoing args into the appropriate regs.
6621 SDValue InGlue;
6622 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
6623 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
6624 RegsToPass[i].second, InGlue);
6625   InGlue = Chain.getValue(1);
6626 }
6628 if (CFlags.IsTailCall && !IsSibCall)
6629   PrepareTailCall(DAG, InGlue, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
6630                   TailCallArguments);
6632 return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
6633                   Callee, SPDiff, NumBytes, Ins, InVals, CB);
6634 }
6636 // Returns true when the shadow of a general purpose argument register
6637 // in the parameter save area is aligned to at least 'RequiredAlign'.
6638 static bool isGPRShadowAligned(MCPhysReg Reg, Align RequiredAlign) {
6639 assert(RequiredAlign.value() <= 16 &&
6640 "Required alignment greater than stack alignment.");
6643 report_fatal_error("called on invalid register.");
6650 // These registers are 16 byte aligned which is the most strict aligment
6659 // The shadow of these registers in the PSA is 8 byte aligned.
6660 return RequiredAlign <= 8;
6665 return RequiredAlign <= 4;
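// Editor's note (worked arithmetic, not from the original source): with the
// 64-bit AIX linkage area of 48 bytes, X3's shadow sits at offset 48
// (48 % 16 == 0, so 16-byte aligned), X4's at 56 (8-byte aligned), and so
// on, alternating; that alternation is exactly what the switch above encodes.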
6669 static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
6670                    CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
6671                    CCState &S) {
6672 AIXCCState &State = static_cast<AIXCCState &>(S);
6673 const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
6674 State.getMachineFunction().getSubtarget());
6675 const bool IsPPC64 = Subtarget.isPPC64();
6676 const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
6677 const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
6679 if (ValVT == MVT::f128)
6680 report_fatal_error("f128 is unimplemented on AIX.");
6682 if (ArgFlags.isNest())
6683 report_fatal_error("Nest arguments are unimplemented.");
6685 static const MCPhysReg GPR_32[] = {// 32-bit registers.
6686 PPC::R3, PPC::R4, PPC::R5, PPC::R6,
6687 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
6688 static const MCPhysReg GPR_64[] = {// 64-bit registers.
6689 PPC::X3, PPC::X4, PPC::X5, PPC::X6,
6690 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
6692 static const MCPhysReg VR[] = {// Vector registers.
6693 PPC::V2, PPC::V3, PPC::V4, PPC::V5,
6694 PPC::V6, PPC::V7, PPC::V8, PPC::V9,
6695 PPC::V10, PPC::V11, PPC::V12, PPC::V13};
6697 if (ArgFlags.isByVal()) {
6698 if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
6699 report_fatal_error("Pass-by-value arguments with alignment greater than "
6700 "register width are not supported.");
6702 const unsigned ByValSize = ArgFlags.getByValSize();
6704 // An empty aggregate parameter takes up no storage and no registers,
6705 // but needs a MemLoc for a stack slot for the formal arguments side.
6706 if (ByValSize == 0) {
6707 State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
6708                                      State.getStackSize(), RegVT, LocInfo));
6709     return false;
6710   }
6712 const unsigned StackSize = alignTo(ByValSize, PtrAlign);
6713 unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
6714 for (const unsigned E = Offset + StackSize; Offset < E;
6715 Offset += PtrAlign.value()) {
6716 if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
6717     State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6718   else {
6719     State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
6720                                      Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
6721                                      LocInfo));
6722     break;
6723   }
6724 }
6725 return false;
6726 }
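// Editor's note (worked example, not from the original source): a 10-byte
// by-value aggregate on 64-bit AIX is rounded up to StackSize == 16, so the
// loop above tries to assign two GPRs; if only one remains, the second word
// is described by a single MemLoc and register allocation stops there.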
6728 // Arguments always reserve parameter save area.
6729 switch (ValVT.SimpleTy) {
6730 default:
6731   report_fatal_error("Unhandled value type for argument.");
6732 case MVT::i64:
6733   // i64 arguments should have been split to i32 for PPC32.
6734   assert(IsPPC64 && "PPC32 should have split i64 values.");
6735   [[fallthrough]];
6736 case MVT::i1:
6737 case MVT::i32: {
6738 const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
6739 // AIX integer arguments are always passed in register width.
6740 if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
6741 LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
6742 : CCValAssign::LocInfo::ZExt;
6743 if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
6744   State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6745 else
6746   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
6748 return false;
6749 }
6750 case MVT::f32:
6751 case MVT::f64: {
6752 // Parameter save area (PSA) is reserved even if the float passes in fpr.
6753 const unsigned StoreSize = LocVT.getStoreSize();
6754 // Floats are always 4-byte aligned in the PSA on AIX.
6755 // This includes f64 in 64-bit mode for ABI compatibility.
6756 const unsigned Offset =
6757 State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
6758 unsigned FReg = State.AllocateReg(FPR);
6759 if (FReg)
6760 State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
6762 // Reserve and initialize GPRs or initialize the PSA as required.
6763 for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
6764 if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
6765 assert(FReg && "An FPR should be available when a GPR is reserved.");
6766 if (State.isVarArg()) {
6767 // Successfully reserved GPRs are only initialized for vararg calls.
6768 // Custom handling is required for:
6769 // f64 in PPC32 needs to be split into 2 GPRs.
6770       //   f32 in PPC64 needs to occupy only the lower 32 bits of a 64-bit GPR.
6771       State.addLoc(
6772           CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6773     }
6774   } else {
6775 // If there are insufficient GPRs, the PSA needs to be initialized.
6776 // Initialization occurs even if an FPR was initialized for
6777 // compatibility with the AIX XL compiler. The full memory for the
6778 // argument will be initialized even if a prior word is saved in GPR.
6779 // A custom memLoc is used when the argument also passes in FPR so
6780     // that the callee handling can skip over it easily.
6781     State.addLoc(
6782         FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
6783                                          LocInfo)
6784              : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6785     break;
6786   }
6787 }
6788 return false;
6789 }
6790 case MVT::v4f32:
6791 case MVT::v4i32:
6792 case MVT::v8i16:
6793 case MVT::v16i8:
6794 case MVT::v2i64:
6795 case MVT::v2f64:
6796 case MVT::v1i128:
6797 case MVT::f128: {
6798 const unsigned VecSize = 16;
6799 const Align VecAlign(VecSize);
6801 if (!State.isVarArg()) {
6802   // If there are vector registers remaining we don't consume any stack
6803   // space.
6804   if (unsigned VReg = State.AllocateReg(VR)) {
6805     State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
6806     return false;
6807   }
6808   // Vectors passed on the stack do not shadow GPRs or FPRs even though they
6809   // might be allocated in the portion of the PSA that is shadowed by the
6810   // GPRs.
6811   const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
6812   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6813   return false;
6814 }
6816 const unsigned PtrSize = IsPPC64 ? 8 : 4;
6817 ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
6819 unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
6820 // Burn any underaligned registers and their shadowed stack space until
6821 // we reach the required alignment.
6822 while (NextRegIndex != GPRs.size() &&
6823 !isGPRShadowAligned(GPRs[NextRegIndex], VecAlign)) {
6824 // Shadow allocate register and its stack shadow.
6825 unsigned Reg = State.AllocateReg(GPRs);
6826 State.AllocateStack(PtrSize, PtrAlign);
6827   assert(Reg && "Allocating register unexpectedly failed.");
6828   (void)Reg;
6829   NextRegIndex = State.getFirstUnallocated(GPRs);
6830 }
6832 // Vectors that are passed as fixed arguments are handled differently.
6833 // They are passed in VRs if any are available (unlike arguments passed
6834 // through ellipses) and shadow GPRs (unlike arguments to non-vararg
6835 // functions).
6836 if (State.isFixed(ValNo)) {
6837 if (unsigned VReg = State.AllocateReg(VR)) {
6838 State.addLoc(CCValAssign::getReg(ValNo, ValVT, VReg, LocVT, LocInfo));
6839 // Shadow allocate GPRs and stack space even though we pass in a VR.
6840 for (unsigned I = 0; I != VecSize; I += PtrSize)
6841 State.AllocateReg(GPRs);
6842     State.AllocateStack(VecSize, VecAlign);
6843     return false;
6844   }
6845   // No vector registers remain, so pass on the stack.
6846   const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
6847   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6848   return false;
6849 }
6851 // If all GPRs are consumed, then we pass the argument fully on the stack.
6852 if (NextRegIndex == GPRs.size()) {
6853 const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
6854   State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6855   return false;
6856 }
6858 // Corner case for 32-bit codegen. We have 2 registers to pass the first
6859 // half of the argument, and then need to pass the remaining half on the
6860 // stack.
6861 if (GPRs[NextRegIndex] == PPC::R9) {
6862   const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
6863   State.addLoc(
6864       CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6866 const unsigned FirstReg = State.AllocateReg(PPC::R9);
6867 const unsigned SecondReg = State.AllocateReg(PPC::R10);
6868 assert(FirstReg && SecondReg &&
6869 "Allocating R9 or R10 unexpectedly failed.");
6871 CCValAssign::getCustomReg(ValNo, ValVT, FirstReg, RegVT, LocInfo));
6873 CCValAssign::getCustomReg(ValNo, ValVT, SecondReg, RegVT, LocInfo));
6877 // We have enough GPRs to fully pass the vector argument, and we have
6878 // already consumed any underaligned registers. Start with the custom
6879 // MemLoc and then the custom RegLocs.
6880 const unsigned Offset = State.AllocateStack(VecSize, VecAlign);
6881 State.addLoc(
6882     CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
6883 for (unsigned I = 0; I != VecSize; I += PtrSize) {
6884   const unsigned Reg = State.AllocateReg(GPRs);
6885   assert(Reg && "Failed to allocate register for vararg vector argument");
6886   State.addLoc(
6887       CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
6888 }
6889 return false;
6890 }
6891 }
6892 return true;
6893 }
6895 // So far, this function is only used by LowerFormalArguments_AIX()
6896 static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT,
6897                                                     bool IsPPC64,
6898                                                     bool HasP8Vector,
6899                                                     bool HasVSX) {
6900   assert((IsPPC64 || SVT != MVT::i64) &&
6901          "i64 should have been split for 32-bit codegen.");
6903   switch (SVT) {
6904   default:
6905     report_fatal_error("Unexpected value type for formal argument");
6906   case MVT::i1:
6907   case MVT::i32:
6908   case MVT::i64:
6909     return IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
6910   case MVT::f32:
6911     return HasP8Vector ? &PPC::VSSRCRegClass : &PPC::F4RCRegClass;
6912   case MVT::f64:
6913     return HasVSX ? &PPC::VSFRCRegClass : &PPC::F8RCRegClass;
6914   case MVT::v4f32:
6915   case MVT::v4i32:
6916   case MVT::v8i16:
6917   case MVT::v16i8:
6918   case MVT::v2i64:
6919   case MVT::v2f64:
6920   case MVT::v1i128:
6921     return &PPC::VRRCRegClass;
6922   }
6923 }
6925 static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
6926 SelectionDAG &DAG, SDValue ArgValue,
6927 MVT LocVT, const SDLoc &dl) {
6928 assert(ValVT.isScalarInteger() && LocVT.isScalarInteger());
6929 assert(ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits());
6931 if (Flags.isSExt())
6932 ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
6933 DAG.getValueType(ValVT));
6934 else if (Flags.isZExt())
6935 ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
6936 DAG.getValueType(ValVT));
6938 return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
6939 }
6941 static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
6942 const unsigned LASize = FL->getLinkageSize();
6944 if (PPC::GPRCRegClass.contains(Reg)) {
6945 assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
6946 "Reg must be a valid argument register!");
6947   return LASize + 4 * (Reg - PPC::R3);
6948 }
6950 if (PPC::G8RCRegClass.contains(Reg)) {
6951 assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
6952 "Reg must be a valid argument register!");
6953   return LASize + 8 * (Reg - PPC::X3);
6954 }
6956 llvm_unreachable("Only general purpose registers expected.");
6957 }
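// Editor's note (worked example, not from the original source): on 32-bit
// AIX the linkage area is 24 bytes, so R5, the third argument register,
// maps to 24 + 4 * (R5 - R3) == 32; on 64-bit AIX, X5 maps to 48 + 16 == 64.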
6959 // AIX ABI Stack Frame Layout:
6961 // Low Memory +--------------------------------------------+
6962 // SP +---> | Back chain | ---+
6963 // | +--------------------------------------------+ |
6964 // | | Saved Condition Register | |
6965 // | +--------------------------------------------+ |
6966 // | | Saved Linkage Register | |
6967 // | +--------------------------------------------+ | Linkage Area
6968 // | | Reserved for compilers | |
6969 // | +--------------------------------------------+ |
6970 // | | Reserved for binders | |
6971 // | +--------------------------------------------+ |
6972 // | | Saved TOC pointer | ---+
6973 // | +--------------------------------------------+
6974 // | | Parameter save area |
6975 // | +--------------------------------------------+
6976 // | | Alloca space |
6977 // | +--------------------------------------------+
6978 // | | Local variable space |
6979 // | +--------------------------------------------+
6980 // | | Float/int conversion temporary |
6981 // | +--------------------------------------------+
6982 // | | Save area for AltiVec registers |
6983 // | +--------------------------------------------+
6984 // | | AltiVec alignment padding |
6985 // | +--------------------------------------------+
6986 // | | Save area for VRSAVE register |
6987 // | +--------------------------------------------+
6988 // | | Save area for General Purpose registers |
6989 // | +--------------------------------------------+
6990 // | | Save area for Floating Point registers |
6991 // | +--------------------------------------------+
6992 // +---- | Back chain |
6993 // High Memory +--------------------------------------------+
6995 // Specified by:
6996 //   AIX 7.2 Assembler Language Reference
6997 //   Subroutine linkage convention
6999 SDValue PPCTargetLowering::LowerFormalArguments_AIX(
7000 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7001 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7002 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7004 assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold ||
7005 CallConv == CallingConv::Fast) &&
7006 "Unexpected calling convention!");
7008 if (getTargetMachine().Options.GuaranteedTailCallOpt)
7009   report_fatal_error("Tail call support is unimplemented on AIX.");
7011 if (useSoftFloat())
7012   report_fatal_error("Soft float support is unimplemented on AIX.");
7014 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7016 const bool IsPPC64 = Subtarget.isPPC64();
7017 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7019 // Assign locations to all of the incoming arguments.
7020 SmallVector<CCValAssign, 16> ArgLocs;
7021 MachineFunction &MF = DAG.getMachineFunction();
7022 MachineFrameInfo &MFI = MF.getFrameInfo();
7023 PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
7024 AIXCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7026 const EVT PtrVT = getPointerTy(MF.getDataLayout());
7027 // Reserve space for the linkage area on the stack.
7028 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7029 CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
7030 CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
7032 SmallVector<SDValue, 8> MemOps;
7034 for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
7035 CCValAssign &VA = ArgLocs[I++];
7036 MVT LocVT = VA.getLocVT();
7037 MVT ValVT = VA.getValVT();
7038 ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
7039 // For compatibility with the AIX XL compiler, the float args in the
7040 // parameter save area are initialized even if the argument is available
7041 // in register. The caller is required to initialize both the register
7042 // and memory, however, the callee can choose to expect it in either.
7043   // The memloc is dismissed here because the argument is retrieved from
7044   // its register.
7045   if (VA.isMemLoc() && VA.needsCustom() && ValVT.isFloatingPoint())
7046     continue;
7048 auto HandleMemLoc = [&]() {
7049 const unsigned LocSize = LocVT.getStoreSize();
7050 const unsigned ValSize = ValVT.getStoreSize();
7051 assert((ValSize <= LocSize) &&
7052 "Object size is larger than size of MemLoc");
7053 int CurArgOffset = VA.getLocMemOffset();
7054 // Objects are right-justified because AIX is big-endian.
7055 if (LocSize > ValSize)
7056 CurArgOffset += LocSize - ValSize;
7057 // Potential tail calls could cause overwriting of argument stack slots.
7058 const bool IsImmutable =
7059 !(getTargetMachine().Options.GuaranteedTailCallOpt &&
7060 (CallConv == CallingConv::Fast));
7061 int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
7062 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
7063   SDValue ArgValue =
7064       DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
7065   InVals.push_back(ArgValue);
7066 };
7068 // Vector arguments to VaArg functions are passed both on the stack, and
7069   // in any available GPRs. Load the value from the stack and add the GPRs
7070   // as live ins.
7071 if (VA.isMemLoc() && VA.needsCustom()) {
7072 assert(ValVT.isVector() && "Unexpected Custom MemLoc type.");
7073 assert(isVarArg && "Only use custom memloc for vararg.");
7074 // ValNo of the custom MemLoc, so we can compare it to the ValNo of the
7075 // matching custom RegLocs.
7076 const unsigned OriginalValNo = VA.getValNo();
7077 (void)OriginalValNo;
7079 auto HandleCustomVecRegLoc = [&]() {
7080 assert(I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7081 "Missing custom RegLoc.");
7083 assert(VA.getValVT().isVector() &&
7084 "Unexpected Val type for custom RegLoc.");
7085 assert(VA.getValNo() == OriginalValNo &&
7086 "ValNo mismatch between custom MemLoc and RegLoc.");
7087 MVT::SimpleValueType SVT = VA.getLocVT().SimpleTy;
7088 MF.addLiveIn(VA.getLocReg(),
7089 getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
7090                                      Subtarget.hasVSX()));
7091     };
7093     HandleMemLoc();
7094     // In 64-bit there will be exactly 2 custom RegLocs that follow, and
7095     // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7096     // R10.
7097 HandleCustomVecRegLoc();
7098 HandleCustomVecRegLoc();
7100 // If we are targeting 32-bit, there might be 2 extra custom RegLocs if
7101 // we passed the vector in R5, R6, R7 and R8.
7102     if (I != End && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom()) {
7103       assert(!IsPPC64 &&
7104              "Only 2 custom RegLocs expected for 64-bit codegen.");
7105 HandleCustomVecRegLoc();
7106       HandleCustomVecRegLoc();
7107     }
7109     continue;
7110   }
7112 if (VA.isRegLoc()) {
7113 if (VA.getValVT().isScalarInteger())
7114 FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
7115 else if (VA.getValVT().isFloatingPoint() && !VA.getValVT().isVector()) {
7116     switch (VA.getValVT().SimpleTy) {
7117     default:
7118       report_fatal_error("Unhandled value type for argument.");
7119     case MVT::f32:
7120       FuncInfo->appendParameterType(PPCFunctionInfo::ShortFloatingPoint);
7121       break;
7122     case MVT::f64:
7123       FuncInfo->appendParameterType(PPCFunctionInfo::LongFloatingPoint);
7124       break;
7125     }
7126   } else if (VA.getValVT().isVector()) {
7127     switch (VA.getValVT().SimpleTy) {
7128     default:
7129       report_fatal_error("Unhandled value type for argument.");
7130     case MVT::v16i8:
7131       FuncInfo->appendParameterType(PPCFunctionInfo::VectorChar);
7132       break;
7133     case MVT::v8i16:
7134       FuncInfo->appendParameterType(PPCFunctionInfo::VectorShort);
7135       break;
7136     case MVT::v4i32:
7137     case MVT::v2i64:
7138     case MVT::v1i128:
7139       FuncInfo->appendParameterType(PPCFunctionInfo::VectorInt);
7140       break;
7141     case MVT::v4f32:
7142     case MVT::v2f64:
7143       FuncInfo->appendParameterType(PPCFunctionInfo::VectorFloat);
7144       break;
7145     }
7146   }
7147 }
7149 if (Flags.isByVal() && VA.isMemLoc()) {
7150 const unsigned Size =
7151       alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
7152               PtrByteSize);
7153 const int FI = MF.getFrameInfo().CreateFixedObject(
7154 Size, VA.getLocMemOffset(), /* IsImmutable */ false,
7155 /* IsAliased */ true);
7156 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
7157   InVals.push_back(FIN);
7159   continue;
7160 }
7162 if (Flags.isByVal()) {
7163 assert(VA.isRegLoc() && "MemLocs should already be handled.");
7165 const MCPhysReg ArgReg = VA.getLocReg();
7166 const PPCFrameLowering *FL = Subtarget.getFrameLowering();
7168 if (Flags.getNonZeroByValAlign() > PtrByteSize)
7169 report_fatal_error("Over aligned byvals not supported yet.");
7171 const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
7172 const int FI = MF.getFrameInfo().CreateFixedObject(
7173 StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
7174 /* IsAliased */ true);
7175 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
7176 InVals.push_back(FIN);
7178 // Add live ins for all the RegLocs for the same ByVal.
7179 const TargetRegisterClass *RegClass =
7180 IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
7182 auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
7183                                          const unsigned Offset) {
7184 const Register VReg = MF.addLiveIn(PhysReg, RegClass);
7185   // Since the caller's side has left-justified the aggregate in the
7186   // register, we can simply store the entire register into the stack
7187   // slot.
7188   SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
7189   // The store to the fixed-stack object is needed because accessing a
7190   // field of the ByVal will use a GEP and load. Ideally we will optimize
7191   // to extracting the value from the register directly, and elide the
7192   // stores when the argument's address is not taken, but that will need
7193   // to be future work.
7194 SDValue Store = DAG.getStore(
7195 CopyFrom.getValue(1), dl, CopyFrom,
7196 DAG.getObjectPtrOffset(dl, FIN, TypeSize::Fixed(Offset)),
7197 MachinePointerInfo::getFixedStack(MF, FI, Offset));
7199   MemOps.push_back(Store);
7200 };
7202 unsigned Offset = 0;
7203 HandleRegLoc(VA.getLocReg(), Offset);
7204 Offset += PtrByteSize;
7205 for (; Offset != StackSize && ArgLocs[I].isRegLoc();
7206 Offset += PtrByteSize) {
7207 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7208 "RegLocs should be for ByVal argument.");
7210 const CCValAssign RL = ArgLocs[I++];
7211 HandleRegLoc(RL.getLocReg(), Offset);
7212   FuncInfo->appendParameterType(PPCFunctionInfo::FixedType);
7213 }
7215 if (Offset != StackSize) {
7216 assert(ArgLocs[I].getValNo() == VA.getValNo() &&
7217 "Expected MemLoc for remaining bytes.");
7218 assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
7219   // Consume the MemLoc. The InVal has already been emitted, so nothing
7220   // more needs to be done.
7221   I++;
7222 }
7224 continue;
7225 }
7227 if (VA.isRegLoc() && !VA.needsCustom()) {
7228   MVT::SimpleValueType SVT = ValVT.SimpleTy;
7229   Register VReg =
7230       MF.addLiveIn(VA.getLocReg(),
7231 getRegClassForSVT(SVT, IsPPC64, Subtarget.hasP8Vector(),
7232 Subtarget.hasVSX()));
7233 SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
7234 if (ValVT.isScalarInteger() &&
7235       (ValVT.getFixedSizeInBits() < LocVT.getFixedSizeInBits())) {
7236     ArgValue =
7237         truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
7238   }
7239   InVals.push_back(ArgValue);
7240   continue;
7241 }
7242 if (VA.isMemLoc()) {
7243   HandleMemLoc();
7244   continue;
7245 }
7246 }
7248 // On AIX a minimum of 8 words is saved to the parameter save area.
7249 const unsigned MinParameterSaveArea = 8 * PtrByteSize;
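// Editor's note (worked arithmetic, not from the original source): on
// 64-bit AIX this is 8 * 8 == 64 bytes on top of the 48-byte linkage area,
// so a caller always reserves at least 112 bytes; on 32-bit AIX it is
// 8 * 4 == 32 bytes on top of 24, i.e. at least 56 bytes.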
7250 // Area that is at least reserved in the caller of this function.
7251 unsigned CallerReservedArea = std::max<unsigned>(
7252 CCInfo.getStackSize(), LinkageSize + MinParameterSaveArea);
7254 // Set the size that is at least reserved in caller of this function. Tail
7255 // call optimized function's reserved stack space needs to be aligned so
7256 // that taking the difference between two stack areas will result in an
7257 // aligned stack.
7258 CallerReservedArea =
7259 EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
7260 FuncInfo->setMinReservedArea(CallerReservedArea);
7262 if (isVarArg) {
7263   FuncInfo->setVarArgsFrameIndex(
7264 MFI.CreateFixedObject(PtrByteSize, CCInfo.getStackSize(), true));
7265 SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
7267 static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
7268 PPC::R7, PPC::R8, PPC::R9, PPC::R10};
7270 static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
7271 PPC::X7, PPC::X8, PPC::X9, PPC::X10};
7272 const unsigned NumGPArgRegs = std::size(IsPPC64 ? GPR_64 : GPR_32);
7274 // The fixed integer arguments of a variadic function are stored to the
7275 // VarArgsFrameIndex on the stack so that they may be loaded by
7276 // dereferencing the result of va_next.
7277 for (unsigned GPRIndex =
7278 (CCInfo.getStackSize() - LinkageSize) / PtrByteSize;
7279 GPRIndex < NumGPArgRegs; ++GPRIndex) {
7281 const Register VReg =
7282 IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
7283 : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
7285     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
7286     SDValue Store =
7287         DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
7288 MemOps.push_back(Store);
7289 // Increment the address for the next argument to store.
7290 SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
7291     FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
7292   }
7293 }
7295 if (!MemOps.empty())
7296   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
7298 return Chain;
7299 }
7301 SDValue PPCTargetLowering::LowerCall_AIX(
7302 SDValue Chain, SDValue Callee, CallFlags CFlags,
7303 const SmallVectorImpl<ISD::OutputArg> &Outs,
7304 const SmallVectorImpl<SDValue> &OutVals,
7305 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
7306 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
7307 const CallBase *CB) const {
7308 // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
7309 // AIX ABI stack frame layout.
7311 assert((CFlags.CallConv == CallingConv::C ||
7312 CFlags.CallConv == CallingConv::Cold ||
7313 CFlags.CallConv == CallingConv::Fast) &&
7314 "Unexpected calling convention!");
7316 if (CFlags.IsPatchPoint)
7317 report_fatal_error("This call type is unimplemented on AIX.");
7319 const PPCSubtarget &Subtarget = DAG.getSubtarget<PPCSubtarget>();
7321 MachineFunction &MF = DAG.getMachineFunction();
7322 SmallVector<CCValAssign, 16> ArgLocs;
7323 AIXCCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
7324                   *DAG.getContext());
7326 // Reserve space for the linkage save area (LSA) on the stack.
7327 // In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
7328 // [SP][CR][LR][2 x reserved][TOC].
7329 // The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
7330 const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
7331 const bool IsPPC64 = Subtarget.isPPC64();
7332 const EVT PtrVT = getPointerTy(DAG.getDataLayout());
7333 const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
7334 CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
7335 CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
7337 // The prolog code of the callee may store up to 8 GPR argument registers to
7338 // the stack, allowing va_start to index over them in memory if the callee
7339 // is variadic.
7340 // Because we cannot tell if this is needed on the caller side, we have to
7341 // conservatively assume that it is needed. As such, make sure we have at
7342 // least enough stack space for the caller to store the 8 GPRs.
7343 const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
7344 const unsigned NumBytes = std::max<unsigned>(
7345 LinkageSize + MinParameterSaveAreaSize, CCInfo.getStackSize());
7347 // Adjust the stack pointer for the new arguments...
7348 // These operations are automatically eliminated by the prolog/epilog pass.
7349 Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
7350 SDValue CallSeqStart = Chain;
7352 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
7353 SmallVector<SDValue, 8> MemOpChains;
7355 // Set up a copy of the stack pointer for loading and storing any
7356 // arguments that may not fit in the registers available for argument
7357 // passing.
7358 const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
7359 : DAG.getRegister(PPC::R1, MVT::i32);
7361 for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
7362 const unsigned ValNo = ArgLocs[I].getValNo();
7363 SDValue Arg = OutVals[ValNo];
7364 ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
7366 if (Flags.isByVal()) {
7367 const unsigned ByValSize = Flags.getByValSize();
7369     // Nothing to do for zero-sized ByVals on the caller side.
7370     if (ByValSize == 0) {
7371       ++I;
7372       continue;
7373     }
7375 auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
7376 return DAG.getExtLoad(
7377           ISD::ZEXTLOAD, dl, PtrVT, Chain,
7378           (LoadOffset != 0)
7379               ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
7380               : Arg,
7381           MachinePointerInfo(), VT);
7382     };
7384     unsigned LoadOffset = 0;
7386 // Initialize registers, which are fully occupied by the by-val argument.
7387 while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
7388 SDValue Load = GetLoad(PtrVT, LoadOffset);
7389 MemOpChains.push_back(Load.getValue(1));
7390 LoadOffset += PtrByteSize;
7391 const CCValAssign &ByValVA = ArgLocs[I++];
7392 assert(ByValVA.getValNo() == ValNo &&
7393 "Unexpected location for pass-by-value argument.");
7394       RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
7395     }
7397     if (LoadOffset == ByValSize)
7398       continue;
7400 // There must be one more loc to handle the remainder.
7401 assert(ArgLocs[I].getValNo() == ValNo &&
7402 "Expected additional location for by-value argument.");
7404 if (ArgLocs[I].isMemLoc()) {
7405 assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
7406 const CCValAssign &ByValVA = ArgLocs[I++];
7407 ISD::ArgFlagsTy MemcpyFlags = Flags;
7408 // Only memcpy the bytes that don't pass in register.
7409 MemcpyFlags.setByValSize(ByValSize - LoadOffset);
7410 Chain = CallSeqStart = createMemcpyOutsideCallSeq(
7412 ? DAG.getObjectPtrOffset(dl, Arg, TypeSize::Fixed(LoadOffset))
7414 DAG.getObjectPtrOffset(dl, StackPtr,
7415 TypeSize::Fixed(ByValVA.getLocMemOffset())),
7416           CallSeqStart, MemcpyFlags, DAG, dl);
7417       continue;
7418     }
7420     // Initialize the final register residue.
7421 // Any residue that occupies the final by-val arg register must be
7422 // left-justified on AIX. Loads must be a power-of-2 size and cannot be
7423 // larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
7424 // 2 and 1 byte loads.
7425 const unsigned ResidueBytes = ByValSize % PtrByteSize;
7426 assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
7427 "Unexpected register residue for by-value argument.");
7428     SDValue ResidueVal;
7429     for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
7430       const unsigned N = llvm::bit_floor(ResidueBytes - Bytes);
7431       const EVT VT =
7432           N == 1 ? MVT::i8
7433                  : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
7434       SDValue Load = GetLoad(VT, LoadOffset);
7435       MemOpChains.push_back(Load.getValue(1));
7436       LoadOffset += N;
7437       Bytes += N;
7439       // By-val arguments are passed left-justified in register.
7440 // Every load here needs to be shifted, otherwise a full register load
7441 // should have been used.
7442 assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
7443 "Unexpected load emitted during handling of pass-by-value "
7445 unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
7447 getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
7448 SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
7449 SDValue ShiftedLoad =
7450 DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
7451       ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
7452                                             ShiftedLoad)
7453                               : ShiftedLoad;
7454     }
7456 const CCValAssign &ByValVA = ArgLocs[I++];
7457     RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
7458     continue;
7459   }
7461   CCValAssign &VA = ArgLocs[I++];
7462 const MVT LocVT = VA.getLocVT();
7463 const MVT ValVT = VA.getValVT();
7465   switch (VA.getLocInfo()) {
7466   default:
7467     report_fatal_error("Unexpected argument extension type.");
7468   case CCValAssign::Full:
7469     break;
7470   case CCValAssign::ZExt:
7471     Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
7472     break;
7473   case CCValAssign::SExt:
7474     Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
7475     break;
7476   }
7478 if (VA.isRegLoc() && !VA.needsCustom()) {
7479     RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
7480     continue;
7481   }
7483 // Vector arguments passed to VarArg functions need custom handling when
7484 // they are passed (at least partially) in GPRs.
7485 if (VA.isMemLoc() && VA.needsCustom() && ValVT.isVector()) {
7486 assert(CFlags.IsVarArg && "Custom MemLocs only used for Vector args.");
7487     // Store value to its stack slot.
7488     SDValue PtrOff =
7489         DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
7490     PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
7491     SDValue Store =
7492         DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
7493 MemOpChains.push_back(Store);
7494 const unsigned OriginalValNo = VA.getValNo();
7495 // Then load the GPRs from the stack
7496 unsigned LoadOffset = 0;
7497 auto HandleCustomVecRegLoc = [&]() {
7498 assert(I != E && "Unexpected end of CCvalAssigns.");
7499 assert(ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
7500 "Expected custom RegLoc.");
7501 CCValAssign RegVA = ArgLocs[I++];
7502 assert(RegVA.getValNo() == OriginalValNo &&
7503 "Custom MemLoc ValNo and custom RegLoc ValNo must match.");
7504 SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
7505 DAG.getConstant(LoadOffset, dl, PtrVT));
7506 SDValue Load = DAG.getLoad(PtrVT, dl, Store, Add, MachinePointerInfo());
7507 MemOpChains.push_back(Load.getValue(1));
7508 RegsToPass.push_back(std::make_pair(RegVA.getLocReg(), Load));
7509 LoadOffset += PtrByteSize;
7512 // In 64-bit there will be exactly 2 custom RegLocs that follow, and in
7513 // in 32-bit there will be 2 custom RegLocs if we are passing in R9 and
7515 HandleCustomVecRegLoc();
7516 HandleCustomVecRegLoc();
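      // For example, on 64-bit AIX a 16-byte vector passed to a varargs
      // callee is stored once to its stack slot and then reloaded as two i64
      // GPR loads at offsets 0 and 8, so the same bytes travel both in
      // memory and in the parameter GPRs.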
      if (I != E && ArgLocs[I].isRegLoc() && ArgLocs[I].needsCustom() &&
          ArgLocs[I].getValNo() == OriginalValNo) {
        assert(!IsPPC64 &&
               "Only 2 custom RegLocs expected for 64-bit codegen.");
        HandleCustomVecRegLoc();
        HandleCustomVecRegLoc();
      }

      continue;
    }

    if (VA.isMemLoc()) {
      SDValue PtrOff =
          DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
      MemOpChains.push_back(
          DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));

      continue;
    }

    if (!ValVT.isFloatingPoint())
      report_fatal_error(
          "Unexpected register handling for calling convention.");

    // Custom handling is used for GPR initializations for vararg float
    // arguments.
    assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
           LocVT.isInteger() &&
           "Custom register handling only expected for VarArg.");

    SDValue ArgAsInt =
        DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);

    if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
      // f32 in 32-bit GPR
      // f64 in 64-bit GPR
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
    else if (Arg.getValueType().getFixedSizeInBits() <
             LocVT.getFixedSizeInBits())
      // f32 in 64-bit GPR.
      RegsToPass.push_back(std::make_pair(
          VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
    else {
      // f64 in two 32-bit GPRs
      // The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
      assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
             "Unexpected custom register for argument!");
      CCValAssign &GPR1 = VA;
      SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
                                     DAG.getConstant(32, dl, MVT::i8));
      RegsToPass.push_back(std::make_pair(
          GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));

      if (I != E) {
        // If only 1 GPR was available, there will only be one custom GPR and
        // the argument will also pass in memory.
        CCValAssign &PeekArg = ArgLocs[I];
        if (PeekArg.isRegLoc() && PeekArg.getValNo() == VA.getValNo()) {
          assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
          CCValAssign &GPR2 = ArgLocs[I++];
          RegsToPass.push_back(std::make_pair(
              GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
        }
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
  // For indirect calls, we need to save the TOC base to the stack for
  // restoration after the call.
  if (CFlags.IsIndirect) {
    assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
    const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
    const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
    const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    const unsigned TOCSaveOffset =
        Subtarget.getFrameLowering()->getTOCSaveOffset();

    setUsesTOCBasePtr(DAG);
    SDValue Val = DAG.getCopyFromReg(Chain, dl, TOCBaseReg, PtrVT);
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
    SDValue StackPtr = DAG.getRegister(StackPtrReg, PtrVT);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(
        Val.getValue(1), dl, Val, AddPtr,
        MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InGlue;
  for (auto Reg : RegsToPass) {
    Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, InGlue);
    InGlue = Chain.getValue(1);
  }

  const int SPDiff = 0;
  return FinishCall(CFlags, dl, DAG, RegsToPass, InGlue, Chain, CallSeqStart,
                    Callee, SPDiff, NumBytes, Ins, InVals, CB);
}

bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(
      Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                ? RetCC_PPC_Cold
                : RetCC_PPC);
}

SDValue
PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs,
                       (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
                           ? RetCC_PPC_Cold
                           : RetCC_PPC);

  SDValue Glue;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  for (unsigned i = 0, RealResIdx = 0; i != RVLocs.size(); ++i, ++RealResIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[RealResIdx];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }
    if (Subtarget.hasSPE() && VA.getLocVT() == MVT::f64) {
      bool isLittleEndian = Subtarget.isLittleEndian();
      // Legalize ret f64 -> ret 2 x i32.
      SDValue SVal =
          DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                      DAG.getIntPtrConstant(isLittleEndian ? 0 : 1, dl));
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue);
      RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
      SVal = DAG.getNode(PPCISD::EXTRACT_SPE, dl, MVT::i32, Arg,
                         DAG.getIntPtrConstant(isLittleEndian ? 1 : 0, dl));
      Glue = Chain.getValue(1);
      VA = RVLocs[++i]; // skip ahead to next loc
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), SVal, Glue);
    } else
      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
    Glue = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  RetOps[0] = Chain; // Update chain.

  // Add the glue if we have it.
  if (Glue.getNode())
    RetOps.push_back(Glue);

  return DAG.getNode(PPCISD::RET_GLUE, dl, MVT::Other, RetOps);
}

SDValue
PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc dl(Op);

  // Get the correct type for integers.
  EVT IntVT = Op.getValueType();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNAREAOFFSET node.
  SDValue Ops[2] = {Chain, FPSIdx};
  SDVTList VTs = DAG.getVTList(IntVT);
  return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op,
                                             SelectionDAG &DAG) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP =
      DAG.getLoad(PtrVT, dl, Chain, StackPtr, MachinePointerInfo());

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo());
}
SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current return address save index.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet, do so now.
  if (!RASI) {
    // Find out the fixed offset of the return address save area.
    int LROffset = Subtarget.getFrameLowering()->getReturnSaveOffset();
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, LROffset, false);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Get the current frame pointer save index. The users of this index will be
  // primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet, do so now.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = Subtarget.getFrameLowering()->getFramePointerSaveOffset();
    // Allocate the frame index for the frame pointer save area.
    FPSI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, FPOffset, true);
    // Save the result.
    FI->setFramePointerSaveIndex(FPSI);
  }
  return DAG.getFrameIndex(FPSI, PtrVT);
}
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                   SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  // Negate the size.
  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
                                DAG.getConstant(0, dl, PtrVT), Size);
  // Construct a node for the frame pointer save index.
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
  if (hasInlineStackProbe(MF))
    return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);
  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
SDValue PPCTargetLowering::LowerEH_DWARF_CFA(SDValue Op,
                                             SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();

  bool isPPC64 = Subtarget.isPPC64();
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  int FI = MF.getFrameInfo().CreateFixedObject(isPPC64 ? 8 : 4, 0, false);
  return DAG.getFrameIndex(FI, PtrVT);
}

SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return LowerVectorLoad(Op, DAG);

  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 loads");

  // First, load 8 bits into 32 bits, then truncate to 1 bit.

  SDLoc dl(Op);
  LoadSDNode *LD = cast<LoadSDNode>(Op);

  SDValue Chain = LD->getChain();
  SDValue BasePtr = LD->getBasePtr();
  MachineMemOperand *MMO = LD->getMemOperand();

  SDValue NewLD =
      DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(DAG.getDataLayout()), Chain,
                     BasePtr, MVT::i8, MMO);
  SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);

  SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
  return DAG.getMergeValues(Ops, dl);
}

SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
  if (Op.getOperand(1).getValueType().isVector())
    return LowerVectorStore(Op, DAG);

  assert(Op.getOperand(1).getValueType() == MVT::i1 &&
         "Custom lowering only for i1 stores");

  // First, zero extend to 32 bits, then use a truncating store to 8 bits.

  SDLoc dl(Op);
  StoreSDNode *ST = cast<StoreSDNode>(Op);

  SDValue Chain = ST->getChain();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Value = ST->getValue();
  MachineMemOperand *MMO = ST->getMemOperand();

  Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(DAG.getDataLayout()),
                      Value);
  return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
}

// FIXME: Remove this once the ANDI glue bug is fixed:
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i1 &&
         "Custom lowering only for i1 results");

  SDLoc DL(Op);
  return DAG.getNode(PPCISD::ANDI_rec_1_GT_BIT, DL, MVT::i1, Op.getOperand(0));
}
SDValue PPCTargetLowering::LowerTRUNCATEVector(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);

  // Implements a vector truncate that fits in a vector register as a shuffle.
  // We want to legalize vector truncates down to where the source fits in
  // a vector register (and target is therefore smaller than vector register
  // size). At that point legalization will try to custom lower the sub-legal
  // result and get here - where we can contain the truncate as a single target
  // operation.
  //
  // For example a trunc <2 x i16> to <2 x i8> could be visualized as follows:
  //   <MSB1|LSB1, MSB2|LSB2> to <LSB1, LSB2>
  //
  // We will implement it for big-endian ordering as this (where x denotes
  // undefined):
  //   < MSB1|LSB1, MSB2|LSB2, uu, uu, uu, uu, uu, uu> to
  //   < LSB1, LSB2, u, u, u, u, u, u, u, u, u, u, u, u, u, u>
  //
  // The same operation in little-endian ordering will be:
  //   <uu, uu, uu, uu, uu, uu, LSB2|MSB2, LSB1|MSB1> to
  //   <u, u, u, u, u, u, u, u, u, u, u, u, u, u, LSB2, LSB1>

  EVT TrgVT = Op.getValueType();
  assert(TrgVT.isVector() && "Vector type expected.");
  unsigned TrgNumElts = TrgVT.getVectorNumElements();
  EVT EltVT = TrgVT.getVectorElementType();
  if (!isOperationCustom(Op.getOpcode(), TrgVT) ||
      TrgVT.getSizeInBits() > 128 || !isPowerOf2_32(TrgNumElts) ||
      !llvm::has_single_bit<uint32_t>(EltVT.getSizeInBits()))
    return SDValue();

  SDValue N1 = Op.getOperand(0);
  EVT SrcVT = N1.getValueType();
  unsigned SrcSize = SrcVT.getSizeInBits();
  if (SrcSize > 256 || !isPowerOf2_32(SrcVT.getVectorNumElements()) ||
      !llvm::has_single_bit<uint32_t>(
          SrcVT.getVectorElementType().getSizeInBits()))
    return SDValue();
  if (SrcSize == 256 && SrcVT.getVectorNumElements() < 2)
    return SDValue();

  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  SDValue Op1, Op2;
  if (SrcSize == 256) {
    EVT VecIdxTy = getVectorIdxTy(DAG.getDataLayout());
    EVT SplitVT =
        N1.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
    unsigned SplitNumElts = SplitVT.getVectorNumElements();
    Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,
                      DAG.getConstant(0, DL, VecIdxTy));
    Op2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, N1,
                      DAG.getConstant(SplitNumElts, DL, VecIdxTy));
  } else {
    Op1 = SrcSize == 128 ? N1 : widenVec(DAG, N1, DL);
    Op2 = DAG.getUNDEF(WideVT);
  }

  // First list the elements we want to keep.
  unsigned SizeMult = SrcSize / TrgVT.getSizeInBits();
  SmallVector<int, 16> ShuffV;
  if (Subtarget.isLittleEndian())
    for (unsigned i = 0; i < TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult);
  else
    for (unsigned i = 1; i <= TrgNumElts; ++i)
      ShuffV.push_back(i * SizeMult - 1);

  // Populate the remaining elements with undefs.
  for (unsigned i = TrgNumElts; i < WideNumElts; ++i)
    // ShuffV.push_back(i + WideNumElts);
    ShuffV.push_back(WideNumElts + 1);

  Op1 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op1);
  Op2 = DAG.getNode(ISD::BITCAST, DL, WideVT, Op2);
  return DAG.getVectorShuffle(WideVT, DL, Op1, Op2, ShuffV);
}
/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
/// possible.
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
  EVT ResVT = Op.getValueType();
  EVT CmpVT = Op.getOperand(0).getValueType();
  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
  SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
  SDLoc dl(Op);

  // Without power9-vector, we don't have a native instruction for f128
  // comparison. The following transformation to a libcall is needed for setcc:
  // select_cc lhs, rhs, tv, fv, cc -> select_cc (setcc cc, x, y), 0, tv, fv, NE
  if (!Subtarget.hasP9Vector() && CmpVT == MVT::f128) {
    SDValue Z = DAG.getSetCC(
        dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT),
        LHS, RHS, CC);
    SDValue Zero = DAG.getConstant(0, dl, Z.getValueType());
    return DAG.getSelectCC(dl, Z, Zero, TV, FV, ISD::SETNE);
  }

  // Not FP, or using SPE? Not a fsel.
  if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
      !FV.getValueType().isFloatingPoint() || Subtarget.hasSPE())
    return Op;

  SDNodeFlags Flags = Op.getNode()->getFlags();

  // We have xsmaxc[dq]p/xsminc[dq]p which are OK to emit even in the
  // presence of infinities.
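  // For example, (select_cc x, y, x, y, setgt) computes max(x, y) and can be
  // emitted directly as a compare-insensitive max (e.g. xsmaxcdp for f64).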
  if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
    switch (CC) {
    default:
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      return DAG.getNode(PPCISD::XSMAXC, dl, Op.getValueType(), LHS, RHS);
    case ISD::SETOLT:
    case ISD::SETLT:
      return DAG.getNode(PPCISD::XSMINC, dl, Op.getValueType(), LHS, RHS);
    }
  }

  // We might be able to do better than this under some circumstances, but in
  // general, fsel-based lowering of select is a finite-math-only optimization.
  // For more information, see section F.3 of the 2.06 ISA specification.
  if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
      (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
    return Op;

  // If the RHS of the comparison is a 0.0, we don't need to do the
  // subtraction at all.
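  // For example, (select_cc lhs, 0.0, tv, fv, setge) can be emitted directly
  // as (fsel lhs, tv, fv), since fsel selects its second operand when the
  // first is greater than or equal to zero.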
  SDValue Sel1;
  if (isFloatingPointZero(RHS))
    switch (CC) {
    default: break;       // SETUO etc aren't handled by fsel.
    case ISD::SETNE:
      std::swap(TV, FV);
      [[fallthrough]];
    case ISD::SETEQ:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
      if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
        Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
    case ISD::SETULT:
    case ISD::SETLT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      [[fallthrough]];
    case ISD::SETOGE:
    case ISD::SETGE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
    case ISD::SETUGT:
    case ISD::SETGT:
      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
      [[fallthrough]];
    case ISD::SETOLE:
    case ISD::SETLE:
      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
    }

  SDValue Cmp;
  switch (CC) {
  default: break;       // SETUO etc aren't handled by fsel.
  case ISD::SETNE:
    std::swap(TV, FV);
    [[fallthrough]];
  case ISD::SETEQ:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
    if (Sel1.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT,
                       DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
  case ISD::SETULT:
  case ISD::SETLT:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
  case ISD::SETOGE:
  case ISD::SETGE:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
  case ISD::SETUGT:
  case ISD::SETGT:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
  case ISD::SETOLE:
  case ISD::SETLE:
    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, Flags);
    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
    return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
  }
  return Op;
}
static unsigned getPPCStrictOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("No strict version of this opcode!");
  case PPCISD::FCTIDZ:
    return PPCISD::STRICT_FCTIDZ;
  case PPCISD::FCTIWZ:
    return PPCISD::STRICT_FCTIWZ;
  case PPCISD::FCTIDUZ:
    return PPCISD::STRICT_FCTIDUZ;
  case PPCISD::FCTIWUZ:
    return PPCISD::STRICT_FCTIWUZ;
  case PPCISD::FCFID:
    return PPCISD::STRICT_FCFID;
  case PPCISD::FCFIDU:
    return PPCISD::STRICT_FCFIDU;
  case PPCISD::FCFIDS:
    return PPCISD::STRICT_FCFIDS;
  case PPCISD::FCFIDUS:
    return PPCISD::STRICT_FCFIDUS;
  }
}
static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget) {
  SDLoc dl(Op);
  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // For strict nodes, source is the second operand.
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
  MVT DestTy = Op.getSimpleValueType();
  assert(Src.getValueType().isFloatingPoint() &&
         (DestTy == MVT::i8 || DestTy == MVT::i16 || DestTy == MVT::i32 ||
          DestTy == MVT::i64) &&
         "Invalid FP_TO_INT types");
  if (Src.getValueType() == MVT::f32) {
    if (IsStrict) {
      Src =
          DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
                      DAG.getVTList(MVT::f64, MVT::Other), {Chain, Src}, Flags);
      Chain = Src.getValue(1);
    } else
      Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
  }
  if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector())
    DestTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
  unsigned Opc = ISD::DELETED_NODE;
  switch (DestTy.SimpleTy) {
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    Opc = IsSigned ? PPCISD::FCTIWZ
                   : (Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ : PPCISD::FCTIDZ);
    break;
  case MVT::i64:
    assert((IsSigned || Subtarget.hasFPCVT()) &&
           "i64 FP_TO_UINT is supported only with FPCVT");
    Opc = IsSigned ? PPCISD::FCTIDZ : PPCISD::FCTIDUZ;
  }
  EVT ConvTy = Src.getValueType() == MVT::f128 ? MVT::f128 : MVT::f64;
  SDValue Conv;
  if (IsStrict) {
    Opc = getPPCStrictOpcode(Opc);
    Conv = DAG.getNode(Opc, dl, DAG.getVTList(ConvTy, MVT::Other), {Chain, Src},
                       Flags);
  } else {
    Conv = DAG.getNode(Opc, dl, ConvTy, Src);
  }
  return Conv;
}
void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
                                               SelectionDAG &DAG,
                                               const SDLoc &dl) const {
  SDValue Tmp = convertFPToInt(Op, DAG, Subtarget);
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  bool IsStrict = Op->isStrictFPOpcode();

  // Convert the FP value to an int value through memory.
  bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
                  (IsSigned || Subtarget.hasFPCVT());
  SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
  int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
  MachinePointerInfo MPI =
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

  // Emit a store to the stack slot.
  SDValue Chain = IsStrict ? Tmp.getValue(1) : DAG.getEntryNode();
  Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
  if (i32Stack) {
    MachineFunction &MF = DAG.getMachineFunction();
    Alignment = Align(4);
    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
    SDValue Ops[] = { Chain, Tmp, FIPtr };
    Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
                                    DAG.getVTList(MVT::Other), Ops, MVT::i32,
                                    MMO);
  } else
    Chain = DAG.getStore(Chain, dl, Tmp, FIPtr, MPI, Alignment);

  // Result is a load from the stack slot. If loading 4 bytes, make sure to
  // add in a bias on big endian.
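  // For example, fctiwz leaves the 32-bit result in the low-order word of the
  // FPR; after the 8-byte store on a big-endian subtarget those bits sit at
  // byte offset 4 of the slot, which is what the bias below accounts for.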
  if (Op.getValueType() == MVT::i32 && !i32Stack) {
    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                        DAG.getConstant(4, dl, FIPtr.getValueType()));
    MPI = MPI.getWithOffset(Subtarget.isLittleEndian() ? 0 : 4);
  }

  RLI.Chain = Chain;
  RLI.Ptr = FIPtr;
  RLI.MPI = MPI;
  RLI.Alignment = Alignment;
}

/// Custom lowers floating point to integer conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
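/// For example, an f64->i64 conversion can be emitted as a convert-to-integer
/// followed by mfvsrd, rather than a store/reload pair through the stack.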
SDValue PPCTargetLowering::LowerFP_TO_INTDirectMove(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    const SDLoc &dl) const {
  SDValue Conv = convertFPToInt(Op, DAG, Subtarget);
  SDValue Mov = DAG.getNode(PPCISD::MFVSR, dl, Op.getValueType(), Conv);
  if (Op->isStrictFPOpcode())
    return DAG.getMergeValues({Mov, Conv.getValue(1)}, dl);
  else
    return Mov;
}

SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                          const SDLoc &dl) const {
  bool IsStrict = Op->isStrictFPOpcode();
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
                  Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  // FP to INT conversions are legal for f128.
  if (SrcVT == MVT::f128)
    return Subtarget.hasP9Vector() ? Op : SDValue();

  // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
  // PPC (the libcall is not available).
  if (SrcVT == MVT::ppcf128) {
    if (DstVT == MVT::i32) {
      // TODO: Conservatively pass only nofpexcept flag here. Need to check and
      // set other fast-math flags to FP operations in both strict and
      // non-strict cases. (FP_TO_SINT, FSUB)
      SDNodeFlags Flags;
      Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

      if (IsSigned) {
        SDValue Lo, Hi;
        std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::f64, MVT::f64);

        // Add the two halves of the long double in round-to-zero mode, and use
        // a smaller FP_TO_SINT.
        if (IsStrict) {
          SDValue Res = DAG.getNode(PPCISD::STRICT_FADDRTZ, dl,
                                    DAG.getVTList(MVT::f64, MVT::Other),
                                    {Op.getOperand(0), Lo, Hi}, Flags);
          return DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
                             DAG.getVTList(MVT::i32, MVT::Other),
                             {Res.getValue(1), Res}, Flags);
        } else {
          SDValue Res = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
          return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
        }
      }

      const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
      APFloat APF = APFloat(APFloat::PPCDoubleDouble(), APInt(128, TwoE31));
      SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
      SDValue SignMask = DAG.getConstant(0x80000000, dl, DstVT);
      if (IsStrict) {
        // Sel = Src < 0x80000000
        // FltOfs = select Sel, 0.0, 0x80000000
        // IntOfs = select Sel, 0, 0x80000000
        // Result = fp_to_sint(Src - FltOfs) ^ IntOfs
        SDValue Chain = Op.getOperand(0);
        EVT SetCCVT =
            getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
        EVT DstSetCCVT =
            getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), DstVT);
        SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT,
                                   Chain, true);
        Chain = Sel.getValue(1);

        SDValue FltOfs = DAG.getSelect(
            dl, SrcVT, Sel, DAG.getConstantFP(0.0, dl, SrcVT), Cst);
        Sel = DAG.getBoolExtOrTrunc(Sel, dl, DstSetCCVT, DstVT);

        SDValue Val = DAG.getNode(ISD::STRICT_FSUB, dl,
                                  DAG.getVTList(SrcVT, MVT::Other),
                                  {Chain, Src, FltOfs}, Flags);
        Chain = Val.getValue(1);
        SDValue SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl,
                                   DAG.getVTList(DstVT, MVT::Other),
                                   {Chain, Val}, Flags);
        Chain = SInt.getValue(1);
        SDValue IntOfs = DAG.getSelect(
            dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT), SignMask);
        SDValue Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, IntOfs);
        return DAG.getMergeValues({Result, Chain}, dl);
      } else {
        // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
        // FIXME: generated code sucks.
        SDValue True = DAG.getNode(ISD::FSUB, dl, MVT::ppcf128, Src, Cst);
        True = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, True);
        True = DAG.getNode(ISD::ADD, dl, MVT::i32, True, SignMask);
        SDValue False = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
        return DAG.getSelectCC(dl, Src, Cst, True, False, ISD::SETGE);
      }
    }

    return SDValue();
  }

  if (Subtarget.hasDirectMove() && Subtarget.isPPC64())
    return LowerFP_TO_INTDirectMove(Op, DAG, dl);

  ReuseLoadInfo RLI;
  LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);

  return DAG.getLoad(Op.getValueType(), dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                     RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
}
// We're trying to insert a regular store, S, and then a load, L. If the
// incoming value, O, is a load, we might just be able to have our load use the
// address used by O. However, we don't know if anything else will store to
// that address before we can load from it. To prevent this situation, we need
// to insert our load, L, into the chain as a peer of O. To do this, we give L
// the same chain operand as O, we create a token factor from the chain results
// of O and L, and we replace all uses of O's chain result with that token
// factor (see spliceIntoChain below for this last part).
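// For example, any node that was ordered after O by using O's chain result
// becomes a user of the token factor, and is therefore ordered after both O
// and L, so no intervening store can slip in between them.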
bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
                                            ReuseLoadInfo &RLI,
                                            SelectionDAG &DAG,
                                            ISD::LoadExtType ET) const {
  // Conservatively skip reusing for constrained FP nodes.
  if (Op->isStrictFPOpcode())
    return false;

  SDLoc dl(Op);
  bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
                       (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
  if (ET == ISD::NON_EXTLOAD &&
      (ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
      isOperationLegalOrCustom(Op.getOpcode(),
                               Op.getOperand(0).getValueType())) {

    LowerFP_TO_INTForReuse(Op, RLI, DAG, dl);
    return true;
  }

  LoadSDNode *LD = dyn_cast<LoadSDNode>(Op);
  if (!LD || LD->getExtensionType() != ET || LD->isVolatile() ||
      LD->isNonTemporal())
    return false;
  if (LD->getMemoryVT() != MemVT)
    return false;

  // If the result of the load is an illegal type, then we can't build a
  // valid chain for reuse since the legalised loads and token factor node that
  // ties the legalised loads together uses a different output chain than the
  // original load.
  if (!isTypeLegal(LD->getValueType(0)))
    return false;

  RLI.Ptr = LD->getBasePtr();
  if (LD->isIndexed() && !LD->getOffset().isUndef()) {
    assert(LD->getAddressingMode() == ISD::PRE_INC &&
           "Non-pre-inc AM on PPC?");
    RLI.Ptr = DAG.getNode(ISD::ADD, dl, RLI.Ptr.getValueType(), RLI.Ptr,
                          LD->getOffset());
  }

  RLI.Chain = LD->getChain();
  RLI.MPI = LD->getPointerInfo();
  RLI.IsDereferenceable = LD->isDereferenceable();
  RLI.IsInvariant = LD->isInvariant();
  RLI.Alignment = LD->getAlign();
  RLI.AAInfo = LD->getAAInfo();
  RLI.Ranges = LD->getRanges();

  RLI.ResChain = SDValue(LD, LD->isIndexed() ? 2 : 1);
  return true;
}
// Given the head of the old chain, ResChain, insert a token factor containing
// it and NewResChain, and make users of ResChain now be users of that token
// factor.
// TODO: Remove and use DAG::makeEquivalentMemoryOrdering() instead.
void PPCTargetLowering::spliceIntoChain(SDValue ResChain,
                                        SDValue NewResChain,
                                        SelectionDAG &DAG) const {
  if (!ResChain)
    return;

  SDLoc dl(NewResChain);

  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                           NewResChain, DAG.getUNDEF(MVT::Other));
  assert(TF.getNode() != NewResChain.getNode() &&
         "A new TF really is required here");

  DAG.ReplaceAllUsesOfValueWith(ResChain, TF);
  DAG.UpdateNodeOperands(TF.getNode(), ResChain, NewResChain);
}

/// Analyze the profitability of a direct move: prefer a float load over an
/// int load plus a direct move when the loaded integer value has no integer
/// uses.
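/// For example, if an i32 load feeds only (u|s)int_to_fp nodes, it is better
/// lowered as a floating-point-side load (lfiwax/lfiwzx) feeding the convert,
/// avoiding the GPR-to-VSR move; a direct move only pays off when the loaded
/// value also has integer users.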
bool PPCTargetLowering::directMoveIsProfitable(const SDValue &Op) const {
  SDNode *Origin = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0).getNode();
  if (Origin->getOpcode() != ISD::LOAD)
    return false;

  // If there is no LXSIBZX/LXSIHZX, like on Power8,
  // prefer direct move if the memory size is 1 or 2 bytes.
  MachineMemOperand *MMO = cast<LoadSDNode>(Origin)->getMemOperand();
  if (!Subtarget.hasP9Vector() && MMO->getSize() <= 2)
    return true;

  for (SDNode::use_iterator UI = Origin->use_begin(),
                            UE = Origin->use_end();
       UI != UE; ++UI) {

    // Only look at the users of the loaded value.
    if (UI.getUse().get().getResNo() != 0)
      continue;

    if (UI->getOpcode() != ISD::SINT_TO_FP &&
        UI->getOpcode() != ISD::UINT_TO_FP &&
        UI->getOpcode() != ISD::STRICT_SINT_TO_FP &&
        UI->getOpcode() != ISD::STRICT_UINT_TO_FP)
      return true;
  }

  return false;
}

static SDValue convertIntToFP(SDValue Op, SDValue Src, SelectionDAG &DAG,
                              const PPCSubtarget &Subtarget,
                              SDValue Chain = SDValue()) {
  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
                  Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  SDLoc dl(Op);

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  // If we have FCFIDS, then use it when converting to single-precision.
  // Otherwise, convert to double-precision and then round.
  bool IsSingle = Op.getValueType() == MVT::f32 && Subtarget.hasFPCVT();
  unsigned ConvOpc = IsSingle ? (IsSigned ? PPCISD::FCFIDS : PPCISD::FCFIDUS)
                              : (IsSigned ? PPCISD::FCFID : PPCISD::FCFIDU);
  EVT ConvTy = IsSingle ? MVT::f32 : MVT::f64;
  if (Op->isStrictFPOpcode()) {
    if (!Chain)
      Chain = Op.getOperand(0);
    return DAG.getNode(getPPCStrictOpcode(ConvOpc), dl,
                       DAG.getVTList(ConvTy, MVT::Other), {Chain, Src}, Flags);
  } else
    return DAG.getNode(ConvOpc, dl, ConvTy, Src);
}
/// Custom lowers integer to floating point conversions to use
/// the direct move instructions available in ISA 2.07 to avoid the
/// need for load/store combinations.
SDValue PPCTargetLowering::LowerINT_TO_FPDirectMove(SDValue Op,
                                                    SelectionDAG &DAG,
                                                    const SDLoc &dl) const {
  assert((Op.getValueType() == MVT::f32 ||
          Op.getValueType() == MVT::f64) &&
         "Invalid floating point type as target of conversion");
  assert(Subtarget.hasFPCVT() &&
         "Int to FP conversions with direct moves require FPCVT");
  SDValue Src = Op.getOperand(Op->isStrictFPOpcode() ? 1 : 0);
  bool WordInt = Src.getSimpleValueType().SimpleTy == MVT::i32;
  bool Signed = Op.getOpcode() == ISD::SINT_TO_FP ||
                Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  unsigned MovOpc = (WordInt && !Signed) ? PPCISD::MTVSRZ : PPCISD::MTVSRA;
  SDValue Mov = DAG.getNode(MovOpc, dl, MVT::f64, Src);
  return convertIntToFP(Op, Mov, DAG, Subtarget);
}

static SDValue widenVec(SelectionDAG &DAG, SDValue Vec, const SDLoc &dl) {

  EVT VecVT = Vec.getValueType();
  assert(VecVT.isVector() && "Expected a vector type.");
  assert(VecVT.getSizeInBits() < 128 && "Vector is already full width.");

  EVT EltVT = VecVT.getVectorElementType();
  unsigned WideNumElts = 128 / EltVT.getSizeInBits();
  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), EltVT, WideNumElts);

  unsigned NumConcat = WideNumElts / VecVT.getVectorNumElements();
  SmallVector<SDValue, 16> Ops(NumConcat);
  Ops[0] = Vec;
  SDValue UndefVec = DAG.getUNDEF(VecVT);
  for (unsigned i = 1; i < NumConcat; ++i)
    Ops[i] = UndefVec;

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Ops);
}
SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
                                                const SDLoc &dl) const {
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned Opc = Op.getOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  assert((Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP ||
          Opc == ISD::STRICT_UINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP) &&
         "Unexpected conversion type");
  assert((Op.getValueType() == MVT::v2f64 || Op.getValueType() == MVT::v4f32) &&
         "Supports conversions to v2f64/v4f32 only.");

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  bool SignedConv = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
  bool FourEltRes = Op.getValueType() == MVT::v4f32;

  SDValue Wide = widenVec(DAG, Src, dl);
  EVT WideVT = Wide.getValueType();
  unsigned WideNumElts = WideVT.getVectorNumElements();
  MVT IntermediateVT = FourEltRes ? MVT::v4i32 : MVT::v2i64;

  SmallVector<int, 16> ShuffV;
  for (unsigned i = 0; i < WideNumElts; ++i)
    ShuffV.push_back(i + WideNumElts);

  int Stride = FourEltRes ? WideNumElts / 4 : WideNumElts / 2;
  int SaveElts = FourEltRes ? 4 : 2;
  if (Subtarget.isLittleEndian())
    for (int i = 0; i < SaveElts; i++)
      ShuffV[i * Stride] = i;
  else
    for (int i = 1; i <= SaveElts; i++)
      ShuffV[i * Stride - 1] = i - 1;

  SDValue ShuffleSrc2 =
      SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
  SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);

  SDValue Extend;
  if (SignedConv) {
    Arrange = DAG.getBitcast(IntermediateVT, Arrange);
    EVT ExtVT = Src.getValueType();
    if (Subtarget.hasP9Altivec())
      ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
                               IntermediateVT.getVectorNumElements());

    Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
                         DAG.getValueType(ExtVT));
  } else
    Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);

  if (IsStrict)
    return DAG.getNode(Opc, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
                       {Op.getOperand(0), Extend}, Flags);

  return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP ||
                  Op.getOpcode() == ISD::STRICT_SINT_TO_FP;
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();

  // TODO: Any other flags to propagate?
  SDNodeFlags Flags;
  Flags.setNoFPExcept(Op->getFlags().hasNoFPExcept());

  EVT InVT = Src.getValueType();
  EVT OutVT = Op.getValueType();
  if (OutVT.isVector() && OutVT.isFloatingPoint() &&
      isOperationCustom(Op.getOpcode(), InVT))
    return LowerINT_TO_FPVector(Op, DAG, dl);

  // Conversions to f128 are legal.
  if (Op.getValueType() == MVT::f128)
    return Subtarget.hasP9Vector() ? Op : SDValue();

  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
    return SDValue();

  if (Src.getValueType() == MVT::i1) {
    SDValue Sel = DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Src,
                              DAG.getConstantFP(1.0, dl, Op.getValueType()),
                              DAG.getConstantFP(0.0, dl, Op.getValueType()));
    if (IsStrict)
      return DAG.getMergeValues({Sel, Chain}, dl);
    else
      return Sel;
  }

  // If we have direct moves, we can do all the conversion and skip the
  // store/load; however, without FPCVT we can't do most conversions.
  if (Subtarget.hasDirectMove() && directMoveIsProfitable(Op) &&
      Subtarget.isPPC64() && Subtarget.hasFPCVT())
    return LowerINT_TO_FPDirectMove(Op, DAG, dl);

  assert((IsSigned || Subtarget.hasFPCVT()) &&
         "UINT_TO_FP is supported only with FPCVT");

  if (Src.getValueType() == MVT::i64) {
    SDValue SINT = Src;
    // When converting to single-precision, we actually need to convert
    // to double-precision first and then round to single-precision.
    // To avoid double-rounding effects during that operation, we have
    // to prepare the input operand. Bits that might be truncated when
    // converting to double-precision are replaced by a bit that won't
    // be lost at this stage, but is below the single-precision rounding
    // position.
    //
    // However, if -enable-unsafe-fp-math is in effect, accept double
    // rounding to avoid the extra overhead.
    if (Op.getValueType() == MVT::f32 &&
        !Subtarget.hasFPCVT() &&
        !DAG.getTarget().Options.UnsafeFPMath) {

      // Twiddle input to make sure the low 11 bits are zero. (If this
      // is the case, we are guaranteed the value will fit into the 53 bit
      // mantissa of an IEEE double-precision value without rounding.)
      // If any of those low 11 bits were not zero originally, make sure
      // bit 12 (value 2048) is set instead, so that the final rounding
      // to single-precision gets the correct result.
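      // For example, if the low 11 bits are 0x001, then 0x001 + 0x7FF
      // carries into the 0x800 bit; OR-ing the original value back in and
      // masking with ~0x7FF leaves that bit set and the bits below it clear,
      // acting as a sticky bit below the double-precision mantissa cut-off.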
      SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
                                  SINT, DAG.getConstant(2047, dl, MVT::i64));
      Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
                          Round, DAG.getConstant(2047, dl, MVT::i64));
      Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
      Round = DAG.getNode(ISD::AND, dl, MVT::i64,
                          Round, DAG.getConstant(-2048, dl, MVT::i64));

      // However, we cannot use that value unconditionally: if the magnitude
      // of the input value is small, the bit-twiddling we did above might
      // end up visibly changing the output. Fortunately, in that case, we
      // don't need to twiddle bits since the original input will convert
      // exactly to double-precision floating-point already. Therefore,
      // construct a conditional to use the original value if the top 11
      // bits are all sign-bit copies, and use the rounded value computed
      // otherwise.
      SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
                                 SINT, DAG.getConstant(53, dl, MVT::i32));
      Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
                         Cond, DAG.getConstant(1, dl, MVT::i64));
      Cond = DAG.getSetCC(
          dl,
          getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
          Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);

      SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
    }

    ReuseLoadInfo RLI;
    SDValue Bits;

    MachineFunction &MF = DAG.getMachineFunction();
    if (canReuseLoadAddress(SINT, MVT::i64, RLI, DAG)) {
      Bits = DAG.getLoad(MVT::f64, dl, RLI.Chain, RLI.Ptr, RLI.MPI,
                         RLI.Alignment, RLI.MMOFlags(), RLI.AAInfo, RLI.Ranges);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasLFIWAX() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::SEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
                                     DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (Subtarget.hasFPCVT() &&
               canReuseLoadAddress(SINT, MVT::i32, RLI, DAG, ISD::ZEXTLOAD)) {
      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWZX, dl,
                                     DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
      spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
    } else if (((Subtarget.hasLFIWAX() &&
                 SINT.getOpcode() == ISD::SIGN_EXTEND) ||
                (Subtarget.hasFPCVT() &&
                 SINT.getOpcode() == ISD::ZERO_EXTEND)) &&
               SINT.getOperand(0).getValueType() == MVT::i32) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      EVT PtrVT = getPointerTy(DAG.getDataLayout());

      int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

      SDValue Store = DAG.getStore(Chain, dl, SINT.getOperand(0), FIdx,
                                   MachinePointerInfo::getFixedStack(
                                       DAG.getMachineFunction(), FrameIdx));
      Chain = Store;

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Chain;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
      RLI.Alignment = Align(4);

      MachineMemOperand *MMO =
          MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                  RLI.Alignment, RLI.AAInfo, RLI.Ranges);
      SDValue Ops[] = { RLI.Chain, RLI.Ptr };
      Bits = DAG.getMemIntrinsicNode(SINT.getOpcode() == ISD::ZERO_EXTEND ?
                                     PPCISD::LFIWZX : PPCISD::LFIWAX,
                                     dl, DAG.getVTList(MVT::f64, MVT::Other),
                                     Ops, MVT::i32, MMO);
      Chain = Bits.getValue(1);
    } else
      Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);

    SDValue FP = convertIntToFP(Op, Bits, DAG, Subtarget, Chain);
    if (IsStrict)
      Chain = FP.getValue(1);

    if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
      if (IsStrict)
        FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,
                         DAG.getVTList(MVT::f32, MVT::Other),
                         {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);
      else
        FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                         DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
    }
    return FP;
  }

  assert(Src.getValueType() == MVT::i32 &&
         "Unhandled INT_TO_FP type in custom expander!");
  // Since we only generate this in 64-bit mode, we can take advantage of
  // 64-bit registers. In particular, sign extend the input value into the
  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
  // slot, and then lfd it and fcfid it.
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  SDValue Ld;
  if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
    ReuseLoadInfo RLI;
    bool ReusingLoad;
    if (!(ReusingLoad = canReuseLoadAddress(Src, MVT::i32, RLI, DAG))) {
      int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
      SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

      SDValue Store = DAG.getStore(Chain, dl, Src, FIdx,
                                   MachinePointerInfo::getFixedStack(
                                       DAG.getMachineFunction(), FrameIdx));
      Chain = Store;

      assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
             "Expected an i32 store");

      RLI.Ptr = FIdx;
      RLI.Chain = Chain;
      RLI.MPI =
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
      RLI.Alignment = Align(4);
    }

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
                                RLI.Alignment, RLI.AAInfo, RLI.Ranges);
    SDValue Ops[] = { RLI.Chain, RLI.Ptr };
    Ld = DAG.getMemIntrinsicNode(IsSigned ? PPCISD::LFIWAX : PPCISD::LFIWZX, dl,
                                 DAG.getVTList(MVT::f64, MVT::Other), Ops,
                                 MVT::i32, MMO);
    Chain = Ld.getValue(1);
    if (ReusingLoad)
      spliceIntoChain(RLI.ResChain, Ld.getValue(1), DAG);
  } else {
    assert(Subtarget.isPPC64() &&
           "i32->FP without LFIWAX supported only on PPC64");

    int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
    SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

    SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64, Src);

    // STD the extended value into the stack slot.
    SDValue Store = DAG.getStore(
        Chain, dl, Ext64, FIdx,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
    Chain = Store;

    // Load the value as a double.
    Ld = DAG.getLoad(
        MVT::f64, dl, Chain, FIdx,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx));
    Chain = Ld.getValue(1);
  }

  // FCFID it and return it.
  SDValue FP = convertIntToFP(Op, Ld, DAG, Subtarget, Chain);
  if (IsStrict)
    Chain = FP.getValue(1);
  if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
    if (IsStrict)
      FP = DAG.getNode(ISD::STRICT_FP_ROUND, dl,
                       DAG.getVTList(MVT::f32, MVT::Other),
                       {Chain, FP, DAG.getIntPtrConstant(0, dl)}, Flags);
    else
      FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
                       DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
  }
  return FP;
}
SDValue PPCTargetLowering::LowerGET_ROUNDING(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  /*
   The rounding mode is in bits 30:31 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to 0
     10 Round to +inf
     11 Round to -inf

  GET_ROUNDING, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
  */
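  // For example, FPSCR RN = 0b01 (round toward zero) gives
  // (1 ^ ((~1 & 3) >> 1)) = (1 ^ 1) = 0, matching GET_ROUNDING's encoding of
  // round-toward-zero.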
  MachineFunction &MF = DAG.getMachineFunction();
  EVT VT = Op.getValueType();
  EVT PtrVT = getPointerTy(MF.getDataLayout());

  // Save FP Control Word to register.
  SDValue Chain = Op.getOperand(0);
  SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
  Chain = MFFS.getValue(1);

  SDValue CWD;
  if (isTypeLegal(MVT::i64)) {
    CWD = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
                      DAG.getNode(ISD::BITCAST, dl, MVT::i64, MFFS));
  } else {
    // Save FP register to stack slot.
    int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());

    // Load FP Control Word from low 32 bits of stack slot.
    assert(hasBigEndianPartOrdering(MVT::i64, MF.getDataLayout()) &&
           "Stack slot adjustment is valid only on big endian subtargets!");
    SDValue Four = DAG.getConstant(4, dl, PtrVT);
    SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
    CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
    Chain = CWD.getValue(1);
  }

  // Transform as necessary.
  SDValue CWD1 =
      DAG.getNode(ISD::AND, dl, MVT::i32,
                  CWD, DAG.getConstant(3, dl, MVT::i32));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getNode(ISD::AND, dl, MVT::i32,
                              DAG.getNode(ISD::XOR, dl, MVT::i32,
                                          CWD, DAG.getConstant(3, dl, MVT::i32)),
                              DAG.getConstant(3, dl, MVT::i32)),
                  DAG.getConstant(1, dl, MVT::i32));

  SDValue RetVal =
      DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);

  RetVal =
      DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
                  dl, VT, RetVal);

  return DAG.getMergeValues({RetVal, Chain}, dl);
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  SDLoc dl(Op);
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SHL!");

  // Expand into a bunch of logical ops. Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
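  // For example, with BitWidth == 32 and Amt == 40: slw/srw produce 0 for
  // amounts in 32..63, so Tmp2, Tmp3 and OutLo below are all zero, while
  // Tmp5 == 8 makes OutHi == Lo << 8, exactly the expected 64-bit result.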
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRL!");

  // Expand into a bunch of logical ops. Note that these ops
  // depend on the PPC behavior for oversized shift amounts.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();
  assert(Op.getNumOperands() == 3 &&
         VT == Op.getOperand(1).getValueType() &&
         "Unexpected SRA!");

  // Expand into a bunch of logical ops, followed by a select_cc.
  SDValue Lo = Op.getOperand(0);
  SDValue Hi = Op.getOperand(1);
  SDValue Amt = Op.getOperand(2);
  EVT AmtVT = Amt.getValueType();

  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
                             DAG.getConstant(BitWidth, dl, AmtVT), Amt);
  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
                             DAG.getConstant(-BitWidth, dl, AmtVT));
  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, dl, AmtVT),
                                  Tmp4, Tmp6, ISD::SETLE);
  SDValue OutOps[] = { OutLo, OutHi };
  return DAG.getMergeValues(OutOps, dl);
}
SDValue PPCTargetLowering::LowerFunnelShift(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  EVT VT = Op.getValueType();
  unsigned BitWidth = VT.getSizeInBits();

  bool IsFSHL = Op.getOpcode() == ISD::FSHL;
  SDValue X = Op.getOperand(0);
  SDValue Y = Op.getOperand(1);
  SDValue Z = Op.getOperand(2);
  EVT AmtVT = Z.getValueType();

  // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
  // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
  // This is simpler than TargetLowering::expandFunnelShift because we can rely
  // on PowerPC shift by BW being well defined.
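  // For example, fshl(x, y, 10) on i64 becomes (x << 10) | (y >> 54); when
  // Z % BW == 0, SubZ equals BW and the PPC shift by BW yields 0, so the
  // result degenerates to just X (for fshl), as required.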
  Z = DAG.getNode(ISD::AND, dl, AmtVT, Z,
                  DAG.getConstant(BitWidth - 1, dl, AmtVT));
  SDValue SubZ =
      DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BitWidth, dl, AmtVT), Z);
  X = DAG.getNode(PPCISD::SHL, dl, VT, X, IsFSHL ? Z : SubZ);
  Y = DAG.getNode(PPCISD::SRL, dl, VT, Y, IsFSHL ? SubZ : Z);
  return DAG.getNode(ISD::OR, dl, VT, X, Y);
}

//===----------------------------------------------------------------------===//
// Vector related lowering.
//
/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
/// element size of SplatSize. Cast the result to VT.
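/// For example, getCanonicalConstSplat(1, 4, ...) builds the v4i32 splat
/// <1, 1, 1, 1> (matchable as a vspltisw 1), while an all-ones value of any
/// element size is first canonicalized to the byte splat 0xFF so only one
/// pattern is needed for it.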
static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
                                      SelectionDAG &DAG, const SDLoc &dl) {
  static const MVT VTys[] = { // canonical VT to use for each size.
    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
  };

  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];

  // For a splat with all ones, turn it to vspltisb 0xFF to canonicalize.
  if (Val == ((1LLU << (SplatSize * 8)) - 1)) {
    SplatSize = 1;
    Val = 0xFF;
  }

  EVT CanonicalVT = VTys[SplatSize-1];

  // Build a canonical splat for this value.
  return DAG.getBitcast(ReqVT, DAG.getConstant(Val, dl, CanonicalVT));
}
9051 /// BuildIntrinsicOp - Return a unary operator intrinsic node with the
9052 /// specified intrinsic ID.
9053 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op, SelectionDAG &DAG,
9054 const SDLoc &dl, EVT DestVT = MVT::Other) {
9055 if (DestVT == MVT::Other) DestVT = Op.getValueType();
9056 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9057 DAG.getConstant(IID, dl, MVT::i32), Op);
9060 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the
9061 /// specified intrinsic ID.
9062 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
9063 SelectionDAG &DAG, const SDLoc &dl,
9064 EVT DestVT = MVT::Other) {
9065 if (DestVT == MVT::Other) DestVT = LHS.getValueType();
9066 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9067 DAG.getConstant(IID, dl, MVT::i32), LHS, RHS);
9070 /// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
9071 /// specified intrinsic ID.
9072 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
9073 SDValue Op2, SelectionDAG &DAG, const SDLoc &dl,
9074 EVT DestVT = MVT::Other) {
9075 if (DestVT == MVT::Other) DestVT = Op0.getValueType();
9076 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
9077 DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2);

/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
/// amount. The result has the specified value type.
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, EVT VT,
                           SelectionDAG &DAG, const SDLoc &dl) {
  // Force LHS/RHS to be the right type.
  LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
  RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);

  int Ops[16];
  for (unsigned i = 0; i != 16; ++i)
    Ops[i] = i + Amt;
  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
  return DAG.getNode(ISD::BITCAST, dl, VT, T);
}
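
// Worked example (illustrative): with Amt == 4 the mask becomes
// <4, 5, ..., 19>, selecting bytes 4-19 of the concatenation [LHS | RHS],
// matching the big-endian vsldoi semantics of shifting the pair left by
// four bytes.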

/// Do we have an efficient pattern in a .td file for this node?
///
/// \param V - pointer to the BuildVectorSDNode being matched
/// \param HasDirectMove - does this subtarget have VSR <-> GPR direct moves?
///
/// There are some patterns where it is beneficial to keep a BUILD_VECTOR
/// node as a BUILD_VECTOR node rather than expanding it. The patterns where
/// the opposite is true (expansion is beneficial) are:
/// - The node builds a vector out of integers that are not 32 or 64-bits
/// - The node builds a vector out of constants
/// - The node is a "load-and-splat"
/// In all other cases, we will choose to keep the BUILD_VECTOR.
static bool haveEfficientBuildVectorPattern(BuildVectorSDNode *V,
                                            bool HasDirectMove,
                                            bool HasP8Vector) {
  EVT VecVT = V->getValueType(0);
  bool RightType = VecVT == MVT::v2f64 ||
    (HasP8Vector && VecVT == MVT::v4f32) ||
    (HasDirectMove && (VecVT == MVT::v2i64 || VecVT == MVT::v4i32));
  if (!RightType)
    return false;

  bool IsSplat = true;
  bool IsLoad = false;
  SDValue Op0 = V->getOperand(0);

  // This function is called in a block that confirms the node is not a constant
  // splat. So a constant BUILD_VECTOR here means the vector is built out of
  // different constants.
  if (V->isConstant())
    return false;
  for (int i = 0, e = V->getNumOperands(); i < e; ++i) {
    if (V->getOperand(i).isUndef())
      return false;
    // We want to expand nodes that represent load-and-splat even if the
    // loaded value is a floating point truncation or conversion to int.
    if (V->getOperand(i).getOpcode() == ISD::LOAD ||
        (V->getOperand(i).getOpcode() == ISD::FP_ROUND &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
        (V->getOperand(i).getOpcode() == ISD::FP_TO_SINT &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD) ||
        (V->getOperand(i).getOpcode() == ISD::FP_TO_UINT &&
         V->getOperand(i).getOperand(0).getOpcode() == ISD::LOAD))
      IsLoad = true;
    // If the operands are different or the input is not a load and has more
    // uses than just this BV node, then it isn't a splat.
    if (V->getOperand(i) != Op0 ||
        (!IsLoad && !V->isOnlyUserOf(V->getOperand(i).getNode())))
      IsSplat = false;
  }
  return !(IsSplat && IsLoad);
}
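
// Worked examples (illustrative): a v4i32 node splatting one loaded value has
// IsSplat and IsLoad both true, so this returns false and the node is
// expanded into a load-and-splat; a v4i32 built from four distinct
// non-constant values returns true and is kept as a BUILD_VECTOR.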

// Lower BITCAST(f128, (build_pair i64, i64)) to BUILD_FP128.
SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {

  SDLoc dl(Op);
  SDValue Op0 = Op->getOperand(0);

  if ((Op.getValueType() != MVT::f128) ||
      (Op0.getOpcode() != ISD::BUILD_PAIR) ||
      (Op0.getOperand(0).getValueType() != MVT::i64) ||
      (Op0.getOperand(1).getValueType() != MVT::i64))
    return SDValue();

  return DAG.getNode(PPCISD::BUILD_FP128, dl, MVT::f128, Op0.getOperand(0),
                     Op0.getOperand(1));
}

static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
  const SDValue *InputLoad = &Op;
  while (InputLoad->getOpcode() == ISD::BITCAST)
    InputLoad = &InputLoad->getOperand(0);
  if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
    IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
    InputLoad = &InputLoad->getOperand(0);
  }
  if (InputLoad->getOpcode() != ISD::LOAD)
    return nullptr;
  LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
  return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
}

// Convert the argument APFloat to a single precision APFloat if there is no
// loss in information during the conversion to single precision APFloat and
// the resulting number is not a denormal number. Return true if successful.
bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
  APFloat APFloatToConvert = ArgAPFloat;
  bool LosesInfo = true;
  APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
                           &LosesInfo);
  bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
  if (Success)
    ArgAPFloat = APFloatToConvert;
  return Success;
}
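
// Worked example (illustrative): 1.0 converts to single precision exactly and
// is normal, so the argument is narrowed and true is returned; 0.1 is inexact
// in single precision, so the argument is left unchanged and false is
// returned.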

// Bitcast the argument APInt to a double and convert it to a single precision
// APFloat, bitcast the APFloat to an APInt and assign it to the original
// argument if there is no loss in information during the conversion from
// double to single precision APFloat and the resulting number is not a
// denormal number. Return true if successful.
bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
  double DpValue = ArgAPInt.bitsToDouble();
  APFloat APFloatDp(DpValue);
  bool Success = convertToNonDenormSingle(APFloatDp);
  if (Success)
    ArgAPInt = APFloatDp.bitcastToAPInt();
  return Success;
}

// Nondestructive check for convertToNonDenormSingle.
bool llvm::checkConvertToNonDenormSingle(APFloat &ArgAPFloat) {
  // Only convert if it loses info, since XXSPLTIDP should
  // handle the other case.
  APFloat APFloatToConvert = ArgAPFloat;
  bool LosesInfo = true;
  APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
                           &LosesInfo);

  return (!LosesInfo && !APFloatToConvert.isDenormal());
}

static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
                             unsigned &Opcode) {
  LoadSDNode *InputNode = dyn_cast<LoadSDNode>(Op.getOperand(0));
  if (!InputNode || !Subtarget.hasVSX() || !ISD::isUNINDEXEDLoad(InputNode))
    return false;

  EVT Ty = Op->getValueType(0);
  // For v2f64, v4f32 and v4i32 types, we require the load to be non-extending
  // as we cannot handle extending loads for these types.
  if ((Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32) &&
      ISD::isNON_EXTLoad(InputNode))
    return true;

  EVT MemVT = InputNode->getMemoryVT();
  // For v8i16 and v16i8 types, extending loads can be handled as long as the
  // memory VT is the same vector element VT type.
  // The loads feeding into the v8i16 and v16i8 types will be extending because
  // scalar i8/i16 are not legal types.
  if ((Ty == MVT::v8i16 || Ty == MVT::v16i8) && ISD::isEXTLoad(InputNode) &&
      (MemVT == Ty.getVectorElementType()))
    return true;

  if (Ty == MVT::v2i64) {
    // Check the extend type, when the input type is i32, and the output vector
    // type is v2i64.
    if (MemVT == MVT::i32) {
      if (ISD::isZEXTLoad(InputNode))
        Opcode = PPCISD::ZEXT_LD_SPLAT;
      if (ISD::isSEXTLoad(InputNode))
        Opcode = PPCISD::SEXT_LD_SPLAT;
    }
    return true;
  }
  return false;
}
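
// Worked example (illustrative): a v2i64 BUILD_VECTOR splatting a
// zero-extending i32 load is accepted with Opcode updated to
// PPCISD::ZEXT_LD_SPLAT, whereas an extending load feeding a v4i32 node is
// rejected above.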

// If this is a case we can't handle, return null and let the default
// expansion code take care of it.  If we CAN select this case, and if it
// selects to a single instruction, return Op.  Otherwise, if we can codegen
// this case more efficiently than a constant pool load, lower it to the
// sequence of ops that should be used.
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc dl(Op);
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");

  // Check if this is a splat of a constant value.
  APInt APSplatBits, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  bool BVNIsConstantSplat =
      BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
                           HasAnyUndefs, 0, !Subtarget.isLittleEndian());

  // If it is a splat of a double, check if we can shrink it to a 32 bit
  // non-denormal float which when converted back to double gives us the same
  // double. This is to exploit the XXSPLTIDP instruction.
  // If we lose precision, we use XXSPLTI32DX.
  if (BVNIsConstantSplat && (SplatBitSize == 64) &&
      Subtarget.hasPrefixInstrs()) {
    // Check the type first to short-circuit so we don't modify APSplatBits if
    // this block isn't executed.
    if ((Op->getValueType(0) == MVT::v2f64) &&
        convertToNonDenormSingle(APSplatBits)) {
      SDValue SplatNode = DAG.getNode(
          PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
          DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
      return DAG.getBitcast(Op.getValueType(), SplatNode);
    } else {
      // We may lose precision, so we have to use XXSPLTI32DX.

      uint32_t Hi =
          (uint32_t)((APSplatBits.getZExtValue() & 0xFFFFFFFF00000000LL) >> 32);
      uint32_t Lo =
          (uint32_t)(APSplatBits.getZExtValue() & 0xFFFFFFFF);
      SDValue SplatNode = DAG.getUNDEF(MVT::v2i64);

      if (!Hi || !Lo)
        // If either load is 0, then we should generate XXLXOR to set to 0.
        SplatNode = DAG.getTargetConstant(0, dl, MVT::v2i64);

      if (Hi)
        SplatNode = DAG.getNode(
            PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
            DAG.getTargetConstant(0, dl, MVT::i32),
            DAG.getTargetConstant(Hi, dl, MVT::i32));

      if (Lo)
        SplatNode =
            DAG.getNode(PPCISD::XXSPLTI32DX, dl, MVT::v2i64, SplatNode,
                        DAG.getTargetConstant(1, dl, MVT::i32),
                        DAG.getTargetConstant(Lo, dl, MVT::i32));

      return DAG.getBitcast(Op.getValueType(), SplatNode);
    }
  }
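
  // Worked example (illustrative): splatting the double 0.1
  // (0x3FB999999999999A) cannot be shrunk losslessly, so the path above emits
  // XXSPLTI32DX with Hi = 0x3FB99999 and Lo = 0x9999999A, while splatting 1.0
  // shrinks exactly and takes the single XXSPLTI_SP_TO_DP node instead.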

  if (!BVNIsConstantSplat || SplatBitSize > 32) {
    unsigned NewOpcode = PPCISD::LD_SPLAT;

    // Handle load-and-splat patterns as we have instructions that will do this
    // in one go.
    if (DAG.isSplatValue(Op, true) &&
        isValidSplatLoad(Subtarget, Op, NewOpcode)) {
      const SDValue *InputLoad = &Op.getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);

      // If the input load is an extending load, it will be an i32 -> i64
      // extending load and isValidSplatLoad() will update NewOpcode.
      unsigned MemorySize = LD->getMemoryVT().getScalarSizeInBits();
      unsigned ElementSize =
          MemorySize * ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);

      assert(((ElementSize == 2 * MemorySize)
                  ? (NewOpcode == PPCISD::ZEXT_LD_SPLAT ||
                     NewOpcode == PPCISD::SEXT_LD_SPLAT)
                  : (NewOpcode == PPCISD::LD_SPLAT)) &&
             "Unmatched element size and opcode!\n");

      // Checking for a single use of this load, we have to check for vector
      // width (128 bits) / ElementSize uses (since each operand of the
      // BUILD_VECTOR is a separate use of the value).
      unsigned NumUsesOfInputLD = 128 / ElementSize;
      for (SDValue BVInOp : Op->ops())
        if (BVInOp.isUndef())
          NumUsesOfInputLD--;

      // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
      // Below cases should also happen for "lfiwzx/lfiwax + LE target + index
      // 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
      // 15", but function isValidSplatLoad() now will only return true when
      // the data at index 0 is not nullptr. So we will not get into trouble for
      // above cases.
      //
      // case 1 - lfiwzx/lfiwax
      // 1.1: load result is i32 and is sign/zero extend to i64;
      // 1.2: build a v2i64 vector type with above loaded value;
      // 1.3: the vector has only one value at index 0, others are all undef;
      // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
      if (NumUsesOfInputLD == 1 &&
          (Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
           !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
           Subtarget.hasLFIWAX()))
        return SDValue();

      // case 2 - lxvr[hb]x
      // 2.1: load result is at most i16;
      // 2.2: build a vector with above loaded value;
      // 2.3: the vector has only one value at index 0, others are all undef;
      // 2.4: on LE target, so that lxvr[hb]x does not need any permute.
      if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
          Subtarget.isISA3_1() && ElementSize <= 16)
        return SDValue();

      assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
      if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
          Subtarget.hasVSX()) {
        SDValue Ops[] = {
          LD->getChain(),    // Chain
          LD->getBasePtr(),  // Ptr
          DAG.getValueType(Op.getValueType())  // VT
        };
        SDValue LdSplt = DAG.getMemIntrinsicNode(
            NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
            LD->getMemoryVT(), LD->getMemOperand());
        // Replace all uses of the output chain of the original load with the
        // output chain of the new load.
        DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
                                      LdSplt.getValue(1));
        return LdSplt;
      }
    }
9393 // In 64BIT mode BUILD_VECTOR nodes that are not constant splats of up to
9394 // 32-bits can be lowered to VSX instructions under certain conditions.
9395 // Without VSX, there is no pattern more efficient than expanding the node.
9396 if (Subtarget.hasVSX() && Subtarget.isPPC64() &&
9397 haveEfficientBuildVectorPattern(BVN, Subtarget.hasDirectMove(),
9398 Subtarget.hasP8Vector()))

  uint64_t SplatBits = APSplatBits.getZExtValue();
  uint64_t SplatUndef = APSplatUndef.getZExtValue();
  unsigned SplatSize = SplatBitSize / 8;

  // First, handle single instruction cases.

  // All zeros?
  if (SplatBits == 0) {
    // Canonicalize all zero vectors to be v4i32.
    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
      SDValue Z = DAG.getConstant(0, dl, MVT::v4i32);
      Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
    }
    return Op;
  }

  // We have XXSPLTIW for constant splats four bytes wide.
  // Given vector length is a multiple of 4, 2-byte splats can be replaced
  // with 4-byte splats. We replicate the SplatBits in case of 2-byte splat to
  // make a 4-byte splat element. For example: 2-byte splat of 0xABAB can be
  // turned into a 4-byte splat of 0xABABABAB.
  if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
    return getCanonicalConstSplat(SplatBits | (SplatBits << 16), SplatSize * 2,
                                  Op.getValueType(), DAG, dl);

  if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
    return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
                                  dl);

  // We have XXSPLTIB for constant splats one byte wide.
  if (Subtarget.hasP9Vector() && SplatSize == 1)
    return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
                                  dl);

  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
  int32_t SextVal = (int32_t(SplatBits << (32 - SplatBitSize)) >>
                     (32 - SplatBitSize));
  if (SextVal >= -16 && SextVal <= 15)
    return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
                                  dl);

  // Two instruction sequences.

  // If this value is in the range [-32,30] and is even, use:
  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
  // If this value is in the range [17,31] and is odd, use:
  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
  // If this value is in the range [-31,-17] and is odd, use:
  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
  // Note the last two are three-instruction sequences.
  if (SextVal >= -32 && SextVal <= 31) {
    // To avoid having these optimizations undone by constant folding,
    // we convert to a pseudo that will be expanded later into one of
    // the above forms.
    SDValue Elt = DAG.getConstant(SextVal, dl, MVT::i32);
    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
    SDValue EltSize = DAG.getConstant(SplatSize, dl, MVT::i32);
    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
    if (VT == Op.getValueType())
      return RetVal;
    else
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
  }
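
  // Worked example (illustrative): a splat of 24 becomes a VADD_SPLAT pseudo
  // that is later expanded to a vsplti of 12 followed by an add of the value
  // to itself, per the "val/2 + val/2" pattern documented above.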

  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
  // for fneg/fabs.
  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
    // Make -1 and vspltisw -1:
    SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);

    // Make the VSLW intrinsic, computing 0x8000_0000.
    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
                                   OnesV, DAG, dl);

    // xor by OnesV to invert it.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
  }
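
  // Illustrative note: vspltisw -1 yields 0xFFFFFFFF in every word, and vslw
  // shifts each word left by the low 5 bits (31) of the corresponding word of
  // the shift operand, producing 0x8000_0000; the XOR with the all-ones
  // vector then flips that into the desired 0x7FFF_FFFF.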

  // Check to see if this is a wide variety of vsplti*, binop self cases.
  static const signed char SplatCsts[] = {
    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
  };

  for (unsigned idx = 0; idx < std::size(SplatCsts); ++idx) {
    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
    // cases which are ambiguous (e.g. formation of 0x8000_0000); 'vsplti -1'
    // appears first in the array and is therefore tried first.
    int i = SplatCsts[idx];

    // Figure out what shift amount will be used by altivec if shifted by i in
    // this splat size.
    unsigned TypeShiftAmt = i & (SplatBitSize-1);

    // vsplti + shl self.
    if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
        Intrinsic::ppc_altivec_vslw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + srl self.
    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
        Intrinsic::ppc_altivec_vsrw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // vsplti + rol self.
    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
      SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
      static const unsigned IIDs[] = { // Intrinsic to use for each size.
        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
        Intrinsic::ppc_altivec_vrlw
      };
      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
    }

    // t = vsplti c, result = vsldoi t, t, 1
    if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 2
    if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
    // t = vsplti c, result = vsldoi t, t, 3
    if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
      SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
      unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
      return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
    }
  }
  return SDValue();
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                      SDValue RHS, SelectionDAG &DAG,
                                      const SDLoc &dl) {
  unsigned OpNum = (PFEntry >> 26) & 0x0F;
  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);

  enum {
    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
    OP_VMRGHW,
    OP_VMRGLW,
    OP_VSPLTISW0,
    OP_VSPLTISW1,
    OP_VSPLTISW2,
    OP_VSPLTISW3,
    OP_VSLDOI4,
    OP_VSLDOI8,
    OP_VSLDOI12
  };

  if (OpNum == OP_COPY) {
    if (LHSID == (1*9+2)*9+3) return LHS;
    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
    return RHS;
  }

  SDValue OpLHS, OpRHS;
  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);

  int ShufIdxs[16];
  switch (OpNum) {
  default: llvm_unreachable("Unknown i32 permute!");
  case OP_VMRGHW:
    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
    break;
  case OP_VMRGLW:
    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
    break;
  case OP_VSPLTISW0:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+0;
    break;
  case OP_VSPLTISW1:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+4;
    break;
  case OP_VSPLTISW2:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+8;
    break;
  case OP_VSPLTISW3:
    for (unsigned i = 0; i != 16; ++i)
      ShufIdxs[i] = (i&3)+12;
    break;
  case OP_VSLDOI4:
    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI8:
    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
  case OP_VSLDOI12:
    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
  }
  EVT VT = OpLHS.getValueType();
  OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
  OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
  return DAG.getNode(ISD::BITCAST, dl, VT, T);
}

/// lowerToVINSERTB - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTB instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTB(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned BytesInVector = 16;
  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the byte we want at element 7.
  unsigned LittleEndianShifts[] = {8, 7,  6,  5,  4,  3,  2,  1,
                                   0, 15, 14, 13, 12, 11, 10, 9};
  unsigned BigEndianShifts[] = {9, 10, 11, 12, 13, 14, 15, 0,
                                1, 2,  3,  4,  5,  6,  7,  8};

  ArrayRef<int> Mask = N->getMask();
  int OriginalOrder[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.
  // Possible permutations inserting an element from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   0, X, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  //   ...
  //   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [16,31].

  bool FoundCandidate = false;
  // If both vector operands for the shuffle are the same vector, the mask
  // will contain only elements from the first one and the second one will be
  // undef.
  unsigned VINSERTBSrcElem = IsLE ? 8 : 7;
  // Go through the mask of bytes to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < BytesInVector; ++i) {
    unsigned CurrentElement = Mask[i];
    // If 2nd operand is undefined, we should only look for element 7 in the
    // Mask.
    if (V2.isUndef() && CurrentElement != VINSERTBSrcElem)
      continue;

    bool OtherElementsInOrder = true;
    // Examine the other elements in the Mask to see if they're in original
    // order.
    for (unsigned j = 0; j < BytesInVector; ++j) {
      if (j == i)
        continue;
      // If CurrentElement is from V1 [0,15], then we need the rest of the Mask
      // to be from V2 [16,31] and vice versa, unless the 2nd operand is
      // undefined, in which case we always assume we're picking from the 1st
      // operand.
      int MaskOffset =
          (!V2.isUndef() && CurrentElement < BytesInVector) ? BytesInVector : 0;
      if (Mask[j] != OriginalOrder[j] + MaskOffset) {
        OtherElementsInOrder = false;
        break;
      }
    }
    // If other elements are in original order, we record the number of shifts
    // we need to get the element we want into element 7. Also record which byte
    // in the vector we should insert into.
    if (OtherElementsInOrder) {
      // If 2nd operand is undefined, we assume no shifts and no swapping.
      if (V2.isUndef()) {
        ShiftElts = 0;
        Swap = false;
      } else {
        // Only need the last 4 bits for shifts because operands will be
        // swapped if CurrentElement is >= 2^4.
        ShiftElts = IsLE ? LittleEndianShifts[CurrentElement & 0xF]
                         : BigEndianShifts[CurrentElement & 0xF];
        Swap = CurrentElement < BytesInVector;
      }
      InsertAtByte = IsLE ? BytesInVector - (i + 1) : i;
      FoundCandidate = true;
      break;
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTB,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  if (ShiftElts) {
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, Shl,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return DAG.getNode(PPCISD::VECINSERT, dl, MVT::v16i8, V1, V2,
                     DAG.getConstant(InsertAtByte, dl, MVT::i32));
}

/// lowerToVINSERTH - Return the SDValue if this VECTOR_SHUFFLE can be handled
/// by the VINSERTH instruction introduced in ISA 3.0, else just return default
/// SDValue.
SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
                                           SelectionDAG &DAG) const {
  const unsigned NumHalfWords = 8;
  const unsigned BytesInVector = NumHalfWords * 2;
  // Check that the shuffle is on half-words.
  if (!isNByteElemShuffleMask(N, 2, 1))
    return SDValue();

  bool IsLE = Subtarget.isLittleEndian();
  SDLoc dl(N);
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned ShiftElts = 0, InsertAtByte = 0;
  bool Swap = false;

  // Shifts required to get the half-word we want at element 3.
  unsigned LittleEndianShifts[] = {4, 3, 2, 1, 0, 7, 6, 5};
  unsigned BigEndianShifts[] = {5, 6, 7, 0, 1, 2, 3, 4};

  uint32_t Mask = 0;
  uint32_t OriginalOrderLow = 0x1234567;
  uint32_t OriginalOrderHigh = 0x89ABCDEF;
  // Now we look at mask elements 0,2,4,6,8,10,12,14.  Pack the mask into a
  // 32-bit space, only need 4-bit nibbles per element.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    Mask |= ((uint32_t)(N->getMaskElt(i * 2) / 2) << MaskShift);
  }

  // For each mask element, find out if we're just inserting something
  // from V2 into V1 or vice versa.  Possible permutations inserting an element
  // from V2 into V1:
  //   X, 1, 2, 3, 4, 5, 6, 7
  //   0, X, 2, 3, 4, 5, 6, 7
  //   0, 1, X, 3, 4, 5, 6, 7
  //   0, 1, 2, X, 4, 5, 6, 7
  //   0, 1, 2, 3, X, 5, 6, 7
  //   0, 1, 2, 3, 4, X, 6, 7
  //   0, 1, 2, 3, 4, 5, X, 7
  //   0, 1, 2, 3, 4, 5, 6, X
  // Inserting from V1 into V2 will be similar, except mask range will be
  // [8,15].

  bool FoundCandidate = false;
  // Go through the mask of half-words to find an element that's being moved
  // from one vector to the other.
  for (unsigned i = 0; i < NumHalfWords; ++i) {
    unsigned MaskShift = (NumHalfWords - 1 - i) * 4;
    uint32_t MaskOneElt = (Mask >> MaskShift) & 0xF;
    uint32_t MaskOtherElts = ~(0xF << MaskShift);
    uint32_t TargetOrder = 0x0;

    // If both vector operands for the shuffle are the same vector, the mask
    // will contain only elements from the first one and the second one will
    // be undef.
    if (V2.isUndef()) {
      ShiftElts = 0;
      unsigned VINSERTHSrcElem = IsLE ? 4 : 3;
      TargetOrder = OriginalOrderLow;
      Swap = false;
      // Skip if not the correct element or the mask of other elements doesn't
      // equal our expected order.
      if (MaskOneElt == VINSERTHSrcElem &&
          (Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        FoundCandidate = true;
        break;
      }
    } else { // If both operands are defined.
      // Target order is [8,15] if the current mask is between [0,7].
      TargetOrder =
          (MaskOneElt < NumHalfWords) ? OriginalOrderHigh : OriginalOrderLow;
      // Skip if the mask of other elements doesn't equal our expected order.
      if ((Mask & MaskOtherElts) == (TargetOrder & MaskOtherElts)) {
        // We only need the last 3 bits for the number of shifts.
        ShiftElts = IsLE ? LittleEndianShifts[MaskOneElt & 0x7]
                         : BigEndianShifts[MaskOneElt & 0x7];
        InsertAtByte = IsLE ? BytesInVector - (i + 1) * 2 : i * 2;
        Swap = MaskOneElt < NumHalfWords;
        FoundCandidate = true;
        break;
      }
    }
  }

  if (!FoundCandidate)
    return SDValue();

  // Candidate found, construct the proper SDAG sequence with VINSERTH,
  // optionally with VECSHL if shift is required.
  if (Swap)
    std::swap(V1, V2);
  if (V2.isUndef())
    V2 = V1;
  SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  if (ShiftElts) {
    // Double ShiftElts because we're left shifting on v16i8 type.
    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v16i8, V2, V2,
                              DAG.getConstant(2 * ShiftElts, dl, MVT::i32));
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, Shl);
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }
  SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v8i16, Conv1, Conv2,
                            DAG.getConstant(InsertAtByte, dl, MVT::i32));
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
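
// Worked example (illustrative, big endian): the half-word mask
// <0, 1, 2, 11, 4, 5, 6, 7> moves element 3 of V2 into element 3 of the
// result while everything else keeps its original order, so it maps to a
// single VINSERTH with ShiftElts == 0 and InsertAtByte == 6.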

/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
/// return the default SDValue.
SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
                                              SelectionDAG &DAG) const {
  // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
  // to v16i8. Peek through the bitcasts to get the actual operands.
  SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
  SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));

  auto ShuffleMask = SVN->getMask();
  SDValue VecShuffle(SVN, 0);
  SDLoc DL(SVN);

  // Check that we have a four byte shuffle.
  if (!isNByteElemShuffleMask(SVN, 4, 1))
    return SDValue();

  // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
  if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
    std::swap(LHS, RHS);
    VecShuffle = peekThroughBitcasts(DAG.getCommutedVectorShuffle(*SVN));
    ShuffleVectorSDNode *CommutedSV = dyn_cast<ShuffleVectorSDNode>(VecShuffle);
    if (!CommutedSV)
      return SDValue();
    ShuffleMask = CommutedSV->getMask();
  }

  // Ensure that the RHS is a vector of constants.
  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  if (!BVN)
    return SDValue();

  // Check if RHS is a splat of 4-bytes (or smaller).
  APInt APSplatValue, APSplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
                            HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
      SplatBitSize > 32)
    return SDValue();

  // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
  // The instruction splats a constant C into two words of the source vector
  // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C },
  // so we check that the shuffle mask is the equivalent of
  // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
  // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
  // within each word are consecutive, so we only need to check the first byte.
  SDValue Index;
  bool IsLE = Subtarget.isLittleEndian();
  if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
      (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
       ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
    Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
  else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
           (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
            ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
    Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
  else
    return SDValue();

  // If the splat is narrower than 32 bits, we need to get the 32-bit value
  // of the splat.
  unsigned SplatVal = APSplatValue.getZExtValue();
  for (; SplatBitSize < 32; SplatBitSize <<= 1)
    SplatVal |= (SplatVal << SplatBitSize);
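
  // Worked example (illustrative): an 8-bit splat value of 0xAB is widened by
  // the loop above to 0xABAB and then to 0xABABABAB before being encoded into
  // the XXSPLTI32DX node below.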

  SDValue SplatNode = DAG.getNode(
      PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
      Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
  return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
}

/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if shift amount is
/// a multiple of 8. Otherwise convert it to a scalar rotation(i128)
/// i.e (or (shl x, C1), (srl x, 128-C1)).
SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
  assert(Op.getValueType() == MVT::v1i128 &&
         "Only set v1i128 as custom, other type shouldn't reach here!");
  SDLoc dl(Op);
  SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
  SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
  unsigned SHLAmt = N1.getConstantOperandVal(0);
  if (SHLAmt % 8 == 0) {
    std::array<int, 16> Mask;
    std::iota(Mask.begin(), Mask.end(), 0);
    std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
    if (SDValue Shuffle =
            DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
                                 DAG.getUNDEF(MVT::v16i8), Mask))
      return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
  }
  SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
  SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
                              DAG.getConstant(SHLAmt, dl, MVT::i32));
  SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
                              DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
  SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
  return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
}
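
// Worked example (illustrative): rotating v1i128 by 16 rotates whole bytes,
// so the mask becomes <2, 3, ..., 15, 0, 1> and a single byte shuffle
// suffices; a rotate by 5 falls through to the scalar (shl | srl) expansion.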

/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
/// return the code it can be lowered into.  Worst case, it can always be
/// lowered into a vperm.
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);

  // Any nodes that were combined in the target-independent combiner prior
  // to vector legalization will not be sent to the target combine. Try to
  // combine it here.
  if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
    if (!isa<ShuffleVectorSDNode>(NewShuffle))
      return NewShuffle;
    Op = NewShuffle;
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = Op.getOperand(0);
    V2 = Op.getOperand(1);
  }
  EVT VT = Op.getValueType();
  bool isLittleEndian = Subtarget.isLittleEndian();

  unsigned ShiftElts, InsertAtByte;
  bool Swap = false;

  // If this is a load-and-splat, we can do that with a single instruction
  // in some cases. However if the load has multiple uses, we don't want to
  // combine it because that will just produce multiple loads.
  bool IsPermutedLoad = false;
  const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
  if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
      (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
      InputLoad->hasOneUse()) {
    bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
    int SplatIdx =
        PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);

    // The splat index for permuted loads will be in the left half of the vector
    // which is strictly wider than the loaded value by 8 bytes. So we need to
    // adjust the splat index to point to the correct address in memory.
    if (IsPermutedLoad) {
      assert((isLittleEndian || IsFourByte) &&
             "Unexpected size for permuted load on big endian target");
      SplatIdx += IsFourByte ? 2 : 1;
      assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
             "Splat of a value outside of the loaded memory");
    }

    LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
    // For 4-byte load-and-splat, we need Power9.
    if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
      uint64_t Offset = 0;
      if (IsFourByte)
        Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
      else
        Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;

      // If the width of the load is the same as the width of the splat,
      // loading with an offset would load the wrong memory.
      if (LD->getValueType(0).getSizeInBits() == (IsFourByte ? 32 : 64))
        Offset = 0;

      SDValue BasePtr = LD->getBasePtr();
      if (Offset != 0)
        BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                              BasePtr, DAG.getIntPtrConstant(Offset, dl));
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        BasePtr,           // BasePtr
        DAG.getValueType(Op.getValueType()) // VT
      };
      SDVTList VTL =
          DAG.getVTList(IsFourByte ? MVT::v4i32 : MVT::v2i64, MVT::Other);
      SDValue LdSplt =
          DAG.getMemIntrinsicNode(PPCISD::LD_SPLAT, dl, VTL,
                                  Ops, LD->getMemoryVT(), LD->getMemOperand());
      DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1), LdSplt.getValue(1));
      if (LdSplt.getValueType() != SVOp->getValueType(0))
        LdSplt = DAG.getBitcast(SVOp->getValueType(0), LdSplt);
      return LdSplt;
    }
  }

  // All v2i64 and v2f64 shuffles are legal
  if (VT == MVT::v2i64 || VT == MVT::v2f64)
    return Op;

  if (Subtarget.hasP9Vector() &&
      PPC::isXXINSERTWMask(SVOp, ShiftElts, InsertAtByte, Swap,
                           isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2);
    if (ShiftElts) {
      SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv2, Conv2,
                                DAG.getConstant(ShiftElts, dl, MVT::i32));
      SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Shl,
                                DAG.getConstant(InsertAtByte, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
    }
    SDValue Ins = DAG.getNode(PPCISD::VECINSERT, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(InsertAtByte, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
  }

  if (Subtarget.hasPrefixInstrs()) {
    SDValue SplatInsertNode;
    if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
      return SplatInsertNode;
  }

  if (Subtarget.hasP9Altivec()) {
    SDValue NewISDNode;
    if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
      return NewISDNode;

    if ((NewISDNode = lowerToVINSERTB(SVOp, DAG)))
      return NewISDNode;
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? V1 : V2);

    SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2,
                              DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl);
  }

  if (Subtarget.hasVSX() &&
      PPC::isXXPERMDIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) {
    if (Swap)
      std::swap(V1, V2);
    SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
    SDValue Conv2 =
        DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2.isUndef() ? V1 : V2);

    SDValue PermDI = DAG.getNode(PPCISD::XXPERMDI, dl, MVT::v2i64, Conv1, Conv2,
                                 DAG.getConstant(ShiftElts, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, PermDI);
  }

  if (Subtarget.hasP9Vector()) {
    if (PPC::isXXBRHShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
      SDValue ReveHWord = DAG.getNode(ISD::BSWAP, dl, MVT::v8i16, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveHWord);
    } else if (PPC::isXXBRWShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue ReveWord = DAG.getNode(ISD::BSWAP, dl, MVT::v4i32, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveWord);
    } else if (PPC::isXXBRDShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1);
      SDValue ReveDWord = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveDWord);
    } else if (PPC::isXXBRQShuffleMask(SVOp)) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, V1);
      SDValue ReveQWord = DAG.getNode(ISD::BSWAP, dl, MVT::v1i128, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, ReveQWord);
    }
  }

  if (Subtarget.hasVSX()) {
    if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) {
      int SplatIdx = PPC::getSplatIdxForPPCMnemonics(SVOp, 4, DAG);

      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
      SDValue Splat = DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Conv,
                                  DAG.getConstant(SplatIdx, dl, MVT::i32));
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Splat);
    }

    // Left shifts of 8 bytes are actually swaps. Convert accordingly.
    if (V2.isUndef() && PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) == 8) {
      SDValue Conv = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
      SDValue Swap = DAG.getNode(PPCISD::SWAP_NO_CHAIN, dl, MVT::v2f64, Conv);
      return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Swap);
    }
  }

  // Cases that are handled by instructions that take permute immediates
  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
  // selected by the instruction selector.
  if (V2.isUndef()) {
    if (PPC::isSplatShuffleMask(SVOp, 1) ||
        PPC::isSplatShuffleMask(SVOp, 2) ||
        PPC::isSplatShuffleMask(SVOp, 4) ||
        PPC::isVPKUWUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVPKUHUMShuffleMask(SVOp, 1, DAG) ||
        PPC::isVSLDOIShuffleMask(SVOp, 1, DAG) != -1 ||
        PPC::isVMRGLShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGLShuffleMask(SVOp, 4, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 1, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 2, 1, DAG) ||
        PPC::isVMRGHShuffleMask(SVOp, 4, 1, DAG) ||
        (Subtarget.hasP8Altivec() && (
         PPC::isVPKUDUMShuffleMask(SVOp, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, true, 1, DAG) ||
         PPC::isVMRGEOShuffleMask(SVOp, false, 1, DAG)))) {
      return Op;
    }
  }

  // Altivec has a variety of "shuffle immediates" that take two vector inputs
  // and produce a fixed permutation.  If any of these match, do not lower to
  // vperm.
  unsigned int ShuffleKind = isLittleEndian ? 2 : 0;
  if (PPC::isVPKUWUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVPKUHUMShuffleMask(SVOp, ShuffleKind, DAG) ||
      PPC::isVSLDOIShuffleMask(SVOp, ShuffleKind, DAG) != -1 ||
      PPC::isVMRGLShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGLShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 1, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 2, ShuffleKind, DAG) ||
      PPC::isVMRGHShuffleMask(SVOp, 4, ShuffleKind, DAG) ||
      (Subtarget.hasP8Altivec() && (
       PPC::isVPKUDUMShuffleMask(SVOp, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, true, ShuffleKind, DAG) ||
       PPC::isVMRGEOShuffleMask(SVOp, false, ShuffleKind, DAG))))
    return Op;

  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
  // perfect shuffle table to emit an optimal matching sequence.
  ArrayRef<int> PermMask = SVOp->getMask();

  if (!DisablePerfectShuffle && !isLittleEndian) {
    unsigned PFIndexes[4];
    bool isFourElementShuffle = true;
    for (unsigned i = 0; i != 4 && isFourElementShuffle;
         ++i) {                           // Element number
      unsigned EltNo = 8;                 // Start out undef.
      for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
        if (PermMask[i * 4 + j] < 0)
          continue; // Undef, ignore it.

        unsigned ByteSource = PermMask[i * 4 + j];
        if ((ByteSource & 3) != j) {
          isFourElementShuffle = false;
          break;
        }

        if (EltNo == 8) {
          EltNo = ByteSource / 4;
        } else if (EltNo != ByteSource / 4) {
          isFourElementShuffle = false;
          break;
        }
      }
      PFIndexes[i] = EltNo;
    }

    // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
    // perfect shuffle vector to determine if it is cost effective to do this as
    // discrete instructions, or whether we should use a vperm.
    // For now, we skip this for little endian until such time as we have a
    // little-endian perfect shuffle table.
    if (isFourElementShuffle) {
      // Compute the index in the perfect shuffle table.
      unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                              PFIndexes[2] * 9 + PFIndexes[3];

      unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
      unsigned Cost = (PFEntry >> 30);
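
      // Illustrative note: each table entry packs a cost in bits 31-30, an
      // operation number in bits 29-26, and two 13-bit operand indices,
      // mirroring the decoding at the top of GeneratePerfectShuffle().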

      // Determining when to avoid vperm is tricky.  Many things affect the cost
      // of vperm, particularly how many times the perm mask needs to be
      // computed.  For example, if the perm mask can be hoisted out of a loop or
      // is already used (perhaps because there are multiple permutes with the
      // same shuffle mask?) the vperm has a cost of 1.  OTOH, hoisting the
      // permute mask out of the loop requires an extra register.
      //
      // As a compromise, we only emit discrete instructions if the shuffle can
      // be generated in 3 or fewer operations.  When we have loop information
      // available, if this block is within a loop, we should avoid using vperm
      // for 3-operation perms and use a constant pool load instead.
      if (Cost < 3)
        return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
    }
  }

  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
  // vector that will get spilled to the constant pool.
  if (V2.isUndef()) V2 = V1;

  return LowerVPERM(Op, DAG, PermMask, VT, V1, V2);
}

SDValue PPCTargetLowering::LowerVPERM(SDValue Op, SelectionDAG &DAG,
                                      ArrayRef<int> PermMask, EVT VT,
                                      SDValue V1, SDValue V2) const {
  unsigned Opcode = PPCISD::VPERM;
  EVT ValType = V1.getValueType();
  SDLoc dl(Op);
  bool NeedSwap = false;
  bool isLittleEndian = Subtarget.isLittleEndian();
  bool isPPC64 = Subtarget.isPPC64();

  // Only need to place items backwards in LE,
  // the mask will be properly calculated.
  if (isLittleEndian)
    std::swap(V1, V2);

  if (Subtarget.hasVSX() && Subtarget.hasP9Vector() &&
      (V1->hasOneUse() || V2->hasOneUse())) {
    LLVM_DEBUG(dbgs() << "At least one of two input vectors are dead - using "
                         "XXPERM instead\n");
    Opcode = PPCISD::XXPERM;

    // The second input to XXPERM is also an output so if the second input has
    // multiple uses then copying is necessary, as a result we want the
    // single-use operand to be used as the second input to prevent copying.
    if (!V2->hasOneUse() && V1->hasOneUse()) {
      std::swap(V1, V2);
      NeedSwap = !NeedSwap;
    }
  }

  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
  // that it is in input element units, not in bytes.  Convert now.

  // For little endian, the order of the input vectors is reversed, and
  // the permutation mask is complemented with respect to 31.  This is
  // necessary to produce proper semantics with the big-endian-based vperm
  // instruction.
  EVT EltVT = V1.getValueType().getVectorElementType();
  unsigned BytesPerElement = EltVT.getSizeInBits() / 8;

  bool V1HasXXSWAPD = V1->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;
  bool V2HasXXSWAPD = V2->getOperand(0)->getOpcode() == PPCISD::XXSWAPD;

  /*
  Vectors will be appended like so: [ V1 | V2 ]
  XXSWAPD on V1:
  [   A   |   B   |   C   |   D   ] -> [   C   |   D   |   A   |   B   ]
     0-3     4-7     8-11   12-15         0-3     4-7     8-11   12-15
  i.e.  index of A, B += 8, and index of C, D -= 8.
  XXSWAPD on V2:
  [   E   |   F   |   G   |   H   ] -> [   G   |   H   |   E   |   F   ]
    16-19   20-23   24-27   28-31        16-19   20-23   24-27   28-31
  i.e.  index of E, F += 8, index of G, H -= 8
  Swap V1 and V2:
  [   V1   |   V2   ] -> [   V2   |   V1   ]
     0-15     16-31         0-15     16-31
  i.e.  index of V1 += 16, index of V2 -= 16
  */

  SmallVector<SDValue, 16> ResultMask;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];

    if (Opcode == PPCISD::XXPERM) {
      if (V1HasXXSWAPD) {
        if (SrcElt < 8)
          SrcElt += 8;
        else if (SrcElt < 16)
          SrcElt -= 8;
      }
      if (V2HasXXSWAPD) {
        if (SrcElt > 23)
          SrcElt -= 8;
        else if (SrcElt > 15)
          SrcElt += 8;
      }
      if (NeedSwap) {
        if (SrcElt < 16)
          SrcElt += 16;
        else
          SrcElt -= 16;
      }
    }

    for (unsigned j = 0; j != BytesPerElement; ++j)
      if (isLittleEndian)
        ResultMask.push_back(
            DAG.getConstant(31 - (SrcElt * BytesPerElement + j), dl, MVT::i32));
      else
        ResultMask.push_back(
            DAG.getConstant(SrcElt * BytesPerElement + j, dl, MVT::i32));
  }

  if (Opcode == PPCISD::XXPERM && (V1HasXXSWAPD || V2HasXXSWAPD)) {
    if (V1HasXXSWAPD) {
      dl = SDLoc(V1->getOperand(0));
      V1 = V1->getOperand(0)->getOperand(1);
    }
    if (V2HasXXSWAPD) {
      dl = SDLoc(V2->getOperand(0));
      V2 = V2->getOperand(0)->getOperand(1);
    }
    if (isPPC64 && ValType != MVT::v2f64)
      V1 = DAG.getBitcast(MVT::v2f64, V1);
    if (isPPC64 && V2.getValueType() != MVT::v2f64)
      V2 = DAG.getBitcast(MVT::v2f64, V2);
  }

  ShufflesHandledWithVPERM++;
  SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
  LLVM_DEBUG({
    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
    if (Opcode == PPCISD::XXPERM) {
      dbgs() << "Emitting a XXPERM for the following shuffle:\n";
    } else {
      dbgs() << "Emitting a VPERM for the following shuffle:\n";
    }
    SVOp->dump();
    dbgs() << "With the following permute control vector:\n";
    VPermMask.dump();
  });

  if (Opcode == PPCISD::XXPERM)
    VPermMask = DAG.getBitcast(MVT::v4i32, VPermMask);

  SDValue VPERMNode =
      DAG.getNode(Opcode, dl, V1.getValueType(), V1, V2, VPermMask);

  VPERMNode = DAG.getBitcast(ValType, VPERMNode);
  return VPERMNode;
}
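
// Worked example (illustrative): on little endian with 4-byte elements, a
// mask entry of 0 produces control bytes 31, 30, 29, 28 (31 - (0*4 + j)) for
// that element, matching the reversed operand order seen by the
// big-endian-based vperm instruction.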

/// getVectorCompareInfo - Given an intrinsic, return false if it is not a
/// vector comparison. If it is, return true and fill in Opc/isDot with
/// information about the intrinsic.
static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
                                 bool &isDot, const PPCSubtarget &Subtarget) {
  unsigned IntrinsicID =
      cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
  CompareOpc = -1;
  isDot = false;
  switch (IntrinsicID) {
  default:
    return false;
  // Comparison predicates.
  case Intrinsic::ppc_altivec_vcmpbfp_p:
    CompareOpc = 966;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp_p:
    CompareOpc = 198;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequb_p:
    CompareOpc = 6;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequh_p:
    CompareOpc = 70;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequw_p:
    CompareOpc = 134;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpequd_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 199;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb_p:
  case Intrinsic::ppc_altivec_vcmpneh_p:
  case Intrinsic::ppc_altivec_vcmpnew_p:
  case Intrinsic::ppc_altivec_vcmpnezb_p:
  case Intrinsic::ppc_altivec_vcmpnezh_p:
  case Intrinsic::ppc_altivec_vcmpnezw_p:
    if (Subtarget.hasP9Altivec()) {
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb_p:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh_p:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew_p:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb_p:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh_p:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw_p:
        CompareOpc = 391;
        break;
      }
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp_p:
    CompareOpc = 454;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp_p:
    CompareOpc = 710;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb_p:
    CompareOpc = 774;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh_p:
    CompareOpc = 838;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw_p:
    CompareOpc = 902;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 967;
      isDot = true;
    } else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub_p:
    CompareOpc = 518;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh_p:
    CompareOpc = 582;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw_p:
    CompareOpc = 646;
    isDot = true;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud_p:
    if (Subtarget.hasVSX() || Subtarget.hasP8Altivec()) {
      CompareOpc = 711;
      isDot = true;
    } else
      return false;
    break;

  case Intrinsic::ppc_altivec_vcmpequq:
  case Intrinsic::ppc_altivec_vcmpgtsq:
  case Intrinsic::ppc_altivec_vcmpgtuq:
    if (!Subtarget.isISA3_1())
      return false;
    switch (IntrinsicID) {
    default:
      llvm_unreachable("Unknown comparison intrinsic.");
    case Intrinsic::ppc_altivec_vcmpequq:
      CompareOpc = 455;
      break;
    case Intrinsic::ppc_altivec_vcmpgtsq:
      CompareOpc = 903;
      break;
    case Intrinsic::ppc_altivec_vcmpgtuq:
      CompareOpc = 647;
      break;
    }
    break;

  // VSX predicate comparisons use the same infrastructure
  case Intrinsic::ppc_vsx_xvcmpeqdp_p:
  case Intrinsic::ppc_vsx_xvcmpgedp_p:
  case Intrinsic::ppc_vsx_xvcmpgtdp_p:
  case Intrinsic::ppc_vsx_xvcmpeqsp_p:
  case Intrinsic::ppc_vsx_xvcmpgesp_p:
  case Intrinsic::ppc_vsx_xvcmpgtsp_p:
    if (Subtarget.hasVSX()) {
      switch (IntrinsicID) {
      case Intrinsic::ppc_vsx_xvcmpeqdp_p:
        CompareOpc = 99;
        break;
      case Intrinsic::ppc_vsx_xvcmpgedp_p:
        CompareOpc = 115;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtdp_p:
        CompareOpc = 107;
        break;
      case Intrinsic::ppc_vsx_xvcmpeqsp_p:
        CompareOpc = 67;
        break;
      case Intrinsic::ppc_vsx_xvcmpgesp_p:
        CompareOpc = 83;
        break;
      case Intrinsic::ppc_vsx_xvcmpgtsp_p:
        CompareOpc = 75;
        break;
      }
      isDot = true;
    } else
      return false;
    break;

  // Normal Comparisons.
  case Intrinsic::ppc_altivec_vcmpbfp:
    CompareOpc = 966;
    break;
  case Intrinsic::ppc_altivec_vcmpeqfp:
    CompareOpc = 198;
    break;
  case Intrinsic::ppc_altivec_vcmpequb:
    CompareOpc = 6;
    break;
  case Intrinsic::ppc_altivec_vcmpequh:
    CompareOpc = 70;
    break;
  case Intrinsic::ppc_altivec_vcmpequw:
    CompareOpc = 134;
    break;
  case Intrinsic::ppc_altivec_vcmpequd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 199;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpneb:
  case Intrinsic::ppc_altivec_vcmpneh:
  case Intrinsic::ppc_altivec_vcmpnew:
  case Intrinsic::ppc_altivec_vcmpnezb:
  case Intrinsic::ppc_altivec_vcmpnezh:
  case Intrinsic::ppc_altivec_vcmpnezw:
    if (Subtarget.hasP9Altivec())
      switch (IntrinsicID) {
      default:
        llvm_unreachable("Unknown comparison intrinsic.");
      case Intrinsic::ppc_altivec_vcmpneb:
        CompareOpc = 7;
        break;
      case Intrinsic::ppc_altivec_vcmpneh:
        CompareOpc = 71;
        break;
      case Intrinsic::ppc_altivec_vcmpnew:
        CompareOpc = 135;
        break;
      case Intrinsic::ppc_altivec_vcmpnezb:
        CompareOpc = 263;
        break;
      case Intrinsic::ppc_altivec_vcmpnezh:
        CompareOpc = 327;
        break;
      case Intrinsic::ppc_altivec_vcmpnezw:
        CompareOpc = 391;
        break;
      }
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgefp:
    CompareOpc = 454;
    break;
  case Intrinsic::ppc_altivec_vcmpgtfp:
    CompareOpc = 710;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsb:
    CompareOpc = 774;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsh:
    CompareOpc = 838;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsw:
    CompareOpc = 902;
    break;
  case Intrinsic::ppc_altivec_vcmpgtsd:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 967;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpgtub:
    CompareOpc = 518;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuh:
    CompareOpc = 582;
    break;
  case Intrinsic::ppc_altivec_vcmpgtuw:
    CompareOpc = 646;
    break;
  case Intrinsic::ppc_altivec_vcmpgtud:
    if (Subtarget.hasP8Altivec())
      CompareOpc = 711;
    else
      return false;
    break;
  case Intrinsic::ppc_altivec_vcmpequq_p:
  case Intrinsic::ppc_altivec_vcmpgtsq_p:
  case Intrinsic::ppc_altivec_vcmpgtuq_p:
    if (!Subtarget.isISA3_1())
      return false;
    switch (IntrinsicID) {
    default:
      llvm_unreachable("Unknown comparison intrinsic.");
    case Intrinsic::ppc_altivec_vcmpequq_p:
      CompareOpc = 455;
      break;
    case Intrinsic::ppc_altivec_vcmpgtsq_p:
      CompareOpc = 903;
      break;
    case Intrinsic::ppc_altivec_vcmpgtuq_p:
      CompareOpc = 647;
      break;
    }
    isDot = true;
    break;
  }
  return true;
}

/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
/// lower, do it, otherwise return null.
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                   SelectionDAG &DAG) const {
  unsigned IntrinsicID =
      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  SDLoc dl(Op);

  switch (IntrinsicID) {
  case Intrinsic::thread_pointer:
    // Reads the thread pointer register, used for __builtin_thread_pointer.
    if (Subtarget.isPPC64())
      return DAG.getRegister(PPC::X13, MVT::i64);
    return DAG.getRegister(PPC::R2, MVT::i32);

  case Intrinsic::ppc_mma_disassemble_acc: {
    if (Subtarget.isISAFuture()) {
      EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
      SDValue WideVec = SDValue(DAG.getMachineNode(PPC::DMXXEXTFDMR512, dl,
                                                   ArrayRef(ReturnTypes, 2),
                                                   Op.getOperand(1)),
                                0);
      SmallVector<SDValue, 4> RetOps;
      SDValue Value = SDValue(WideVec.getNode(), 0);
      SDValue Value2 = SDValue(WideVec.getNode(), 1);

      SDValue Extract;
      Extract = DAG.getNode(
          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
          Subtarget.isLittleEndian() ? Value2 : Value,
          DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,
                          dl, getPointerTy(DAG.getDataLayout())));
      RetOps.push_back(Extract);
      Extract = DAG.getNode(
          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
          Subtarget.isLittleEndian() ? Value2 : Value,
          DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,
                          dl, getPointerTy(DAG.getDataLayout())));
      RetOps.push_back(Extract);
      Extract = DAG.getNode(
          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
          Subtarget.isLittleEndian() ? Value : Value2,
          DAG.getConstant(Subtarget.isLittleEndian() ? 1 : 0,
                          dl, getPointerTy(DAG.getDataLayout())));
      RetOps.push_back(Extract);
      Extract = DAG.getNode(
          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
          Subtarget.isLittleEndian() ? Value : Value2,
          DAG.getConstant(Subtarget.isLittleEndian() ? 0 : 1,
                          dl, getPointerTy(DAG.getDataLayout())));
      RetOps.push_back(Extract);
      return DAG.getMergeValues(RetOps, dl);
    }
    [[fallthrough]];
  }
  case Intrinsic::ppc_vsx_disassemble_pair: {
    int NumVecs = 2;
    SDValue WideVec = Op.getOperand(1);
    if (IntrinsicID == Intrinsic::ppc_mma_disassemble_acc) {
      NumVecs = 4;
      WideVec = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, WideVec);
    }
    SmallVector<SDValue, 4> RetOps;
    for (int VecNo = 0; VecNo < NumVecs; VecNo++) {
      SDValue Extract = DAG.getNode(
          PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec,
          DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo
                                                     : VecNo,
                          dl, getPointerTy(DAG.getDataLayout())));
      RetOps.push_back(Extract);
    }
    return DAG.getMergeValues(RetOps, dl);
  }

  case Intrinsic::ppc_mma_xxmfacc:
  case Intrinsic::ppc_mma_xxmtacc: {
    // Allow pre-isa-future subtargets to lower as normal.
    if (!Subtarget.isISAFuture())
      return SDValue();
    // The intrinsics for xxmtacc and xxmfacc take one argument of
    // type v512i1, for future cpu the corresponding wacc instruction
    // dmxx[inst|extf]dmr512 is always generated for type v512i1, negating
    // the need to produce the xxm[t|f]acc.
    SDValue WideVec = Op.getOperand(1);
    DAG.ReplaceAllUsesWith(Op, WideVec);
    return SDValue();
  }
10755 case Intrinsic::ppc_unpack_longdouble: {
10756 auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
10757 assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
10758 "Argument of long double unpack must be 0 or 1!");
10759 return DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::f64, Op.getOperand(1),
10760 DAG.getConstant(!!(Idx->getSExtValue()), dl,
10761 Idx->getValueType(0)));
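  // Illustrative note for the case above (not from the original source):
  //   call double @llvm.ppc.unpack.longdouble(ppc_fp128 %x, i32 %i)
  // with %i constant 0 or 1 extracts the corresponding f64 element of the
  // ppc_fp128 pair; ppc_pack_longdouble (see ReplaceNodeResults below)
  // performs the inverse BUILD_PAIR.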
  case Intrinsic::ppc_compare_exp_lt:
  case Intrinsic::ppc_compare_exp_gt:
  case Intrinsic::ppc_compare_exp_eq:
  case Intrinsic::ppc_compare_exp_uo: {
    unsigned Pred;
    switch (IntrinsicID) {
    case Intrinsic::ppc_compare_exp_lt:
      Pred = PPC::PRED_LT;
      break;
    case Intrinsic::ppc_compare_exp_gt:
      Pred = PPC::PRED_GT;
      break;
    case Intrinsic::ppc_compare_exp_eq:
      Pred = PPC::PRED_EQ;
      break;
    case Intrinsic::ppc_compare_exp_uo:
      Pred = PPC::PRED_UN;
      break;
    }
    return SDValue(
        DAG.getMachineNode(
            PPC::SELECT_CC_I4, dl, MVT::i32,
            {SDValue(DAG.getMachineNode(PPC::XSCMPEXPDP, dl, MVT::i32,
                                        Op.getOperand(1), Op.getOperand(2)),
                     0),
             DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
             DAG.getTargetConstant(Pred, dl, MVT::i32)}),
        0);
  }
  case Intrinsic::ppc_test_data_class: {
    EVT OpVT = Op.getOperand(1).getValueType();
    unsigned CmprOpc = OpVT == MVT::f128 ? PPC::XSTSTDCQP
                                         : (OpVT == MVT::f64 ? PPC::XSTSTDCDP
                                                             : PPC::XSTSTDCSP);
    return SDValue(
        DAG.getMachineNode(
            PPC::SELECT_CC_I4, dl, MVT::i32,
            {SDValue(DAG.getMachineNode(CmprOpc, dl, MVT::i32, Op.getOperand(2),
                                        Op.getOperand(1)),
                     0),
             DAG.getConstant(1, dl, MVT::i32), DAG.getConstant(0, dl, MVT::i32),
             DAG.getTargetConstant(PPC::PRED_EQ, dl, MVT::i32)}),
        0);
  }
  case Intrinsic::ppc_fnmsub: {
    EVT VT = Op.getOperand(1).getValueType();
    if (!Subtarget.hasVSX() || (!Subtarget.hasFloat128() && VT == MVT::f128))
      return DAG.getNode(
          ISD::FNEG, dl, VT,
          DAG.getNode(ISD::FMA, dl, VT, Op.getOperand(1), Op.getOperand(2),
                      DAG.getNode(ISD::FNEG, dl, VT, Op.getOperand(3))));
    return DAG.getNode(PPCISD::FNMSUB, dl, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  }
  case Intrinsic::ppc_convert_f128_to_ppcf128:
  case Intrinsic::ppc_convert_ppcf128_to_f128: {
    RTLIB::Libcall LC = IntrinsicID == Intrinsic::ppc_convert_ppcf128_to_f128
                            ? RTLIB::CONVERT_PPCF128_F128
                            : RTLIB::CONVERT_F128_PPCF128;
    MakeLibCallOptions CallOptions;
    std::pair<SDValue, SDValue> Result =
        makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(1), CallOptions,
                    dl, SDValue());
    return Result.first;
  }
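  // Illustrative note for the ppc_fnmsub case above (not from the original
  // source): fnmsub(a, b, c) computes -(a*b - c). The non-VSX expansion
  // builds fneg(fma(a, b, fneg(c))), which is algebraically the same value:
  //   -(a*b + (-c)) == -(a*b - c).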
  case Intrinsic::ppc_maxfe:
  case Intrinsic::ppc_maxfl:
  case Intrinsic::ppc_maxfs:
  case Intrinsic::ppc_minfe:
  case Intrinsic::ppc_minfl:
  case Intrinsic::ppc_minfs: {
    EVT VT = Op.getValueType();
    assert(
        all_of(Op->ops().drop_front(4),
               [VT](const SDUse &Use) { return Use.getValueType() == VT; }) &&
        "ppc_[max|min]f[e|l|s] must have uniform type arguments");

    ISD::CondCode CC = ISD::SETGT;
    if (IntrinsicID == Intrinsic::ppc_minfe ||
        IntrinsicID == Intrinsic::ppc_minfl ||
        IntrinsicID == Intrinsic::ppc_minfs)
      CC = ISD::SETLT;
    unsigned I = Op.getNumOperands() - 2, Cnt = I;
    SDValue Res = Op.getOperand(I);
    for (--I; Cnt != 0; --Cnt, I = (--I == 0 ? (Op.getNumOperands() - 1) : I)) {
      Res =
          DAG.getSelectCC(dl, Res, Op.getOperand(I), Res, Op.getOperand(I), CC);
    }
    return Res;
  }
  }
  // If this is a lowered altivec predicate compare, CompareOpc is set to the
  // opcode number of the comparison.
  int CompareOpc;
  bool isDot;
  if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget))
    return SDValue();    // Don't custom lower most intrinsics.

  // If this is a non-dot comparison, make the VCMP node and we are done.
  if (!isDot) {
    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
                              Op.getOperand(1), Op.getOperand(2),
                              DAG.getConstant(CompareOpc, dl, MVT::i32));
    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
  }

  // Create the PPCISD altivec 'dot' comparison node.
  SDValue Ops[] = {
    Op.getOperand(2),  // LHS
    Op.getOperand(3),  // RHS
    DAG.getConstant(CompareOpc, dl, MVT::i32)
  };
  EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
  SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);

  // Now that we have the comparison, emit a copy from the CR to a GPR.
  // This is flagged to the above dot comparison.
  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
                              DAG.getRegister(PPC::CR6, MVT::i32),
                              CompNode.getValue(1));

  // Unpack the result based on how the target uses it.
  unsigned BitNo;   // Bit # of CR6.
  bool InvertBit;   // Invert result?
  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
  default:  // Can't happen, don't crash on invalid number though.
  case 0:   // Return the value of the EQ bit of CR6.
    BitNo = 0; InvertBit = false;
    break;
  case 1:   // Return the inverted value of the EQ bit of CR6.
    BitNo = 0; InvertBit = true;
    break;
  case 2:   // Return the value of the LT bit of CR6.
    BitNo = 2; InvertBit = false;
    break;
  case 3:   // Return the inverted value of the LT bit of CR6.
    BitNo = 2; InvertBit = true;
    break;
  }

  // Shift the bit into the low position.
  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
                      DAG.getConstant(8 - (3 - BitNo), dl, MVT::i32));
  // Isolate the bit.
  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
                      DAG.getConstant(1, dl, MVT::i32));

  // If we are supposed to, toggle the bit.
  if (InvertBit)
    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
                        DAG.getConstant(1, dl, MVT::i32));
  return Flags;
}
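
// Illustrative only (not from the original source): a front-end predicate
// such as vec_all_eq(a, b) arrives here as
//   %r = call i32 @llvm.ppc.altivec.vcmpequw.p(i32 2, <4 x i32> %a, <4 x i32> %b)
// where the leading i32 selects the CR6 bit (2 = the LT bit, which the dot
// form sets when every element compares true); the code above reduces the
// mfocrf result to a 0/1 value accordingly.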
SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
  // SelectionDAGBuilder::visitTargetIntrinsic may insert one extra chain to
  // the beginning of the argument list.
  int ArgStart = isa<ConstantSDNode>(Op.getOperand(0)) ? 0 : 1;
  SDLoc DL(Op);
  switch (cast<ConstantSDNode>(Op.getOperand(ArgStart))->getZExtValue()) {
  case Intrinsic::ppc_cfence: {
    assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument.");
    assert(Subtarget.isPPC64() && "Only 64-bit is supported for now.");
    SDValue Val = Op.getOperand(ArgStart + 1);
    EVT Ty = Val.getValueType();
    if (Ty == MVT::i128) {
      // FIXME: Testing one of two paired registers is sufficient to guarantee
      // ordering?
      Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val);
    }
    return SDValue(
        DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other,
                           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Val),
                           Op.getOperand(0)),
        0);
  }
  default:
    break;
  }
  return SDValue();
}
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (!Subtarget.isPPC64())
    return Op;
  // MTVSRDD
  Op = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, Op.getOperand(0),
                   Op.getOperand(0));
  // XXBRD
  Op = DAG.getNode(ISD::BSWAP, dl, MVT::v2i64, Op);
  // MFVSRD
  int VectorIndex = 0;
  if (Subtarget.isLittleEndian())
    VectorIndex = 1;
  Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
                   DAG.getTargetConstant(VectorIndex, dl, MVT::i32));
  return Op;
}
// ATOMIC_CMP_SWAP for i8/i16 needs to zero-extend its input since it will be
// compared to a value that is atomically loaded (atomic loads zero-extend).
SDValue PPCTargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
                                                SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::ATOMIC_CMP_SWAP &&
         "Expecting an atomic compare-and-swap here.");
  SDLoc dl(Op);
  auto *AtomicNode = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = AtomicNode->getMemoryVT();
  if (MemVT.getSizeInBits() >= 32)
    return Op;

  SDValue CmpOp = Op.getOperand(2);
  // If this is already correctly zero-extended, leave it alone.
  auto HighBits = APInt::getHighBitsSet(32, 32 - MemVT.getSizeInBits());
  if (DAG.MaskedValueIsZero(CmpOp, HighBits))
    return Op;

  // Clear the high bits of the compare operand.
  unsigned MaskVal = (1 << MemVT.getSizeInBits()) - 1;
  SDValue NewCmpOp =
      DAG.getNode(ISD::AND, dl, MVT::i32, CmpOp,
                  DAG.getConstant(MaskVal, dl, MVT::i32));

  // Replace the existing compare operand with the properly zero-extended one.
  SmallVector<SDValue, 4> Ops;
  for (int i = 0, e = AtomicNode->getNumOperands(); i < e; i++)
    Ops.push_back(AtomicNode->getOperand(i));
  Ops[2] = NewCmpOp;
  MachineMemOperand *MMO = AtomicNode->getMemOperand();
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::Other);
  unsigned NodeTy =
      (MemVT == MVT::i8) ? PPCISD::ATOMIC_CMP_SWAP_8
                         : PPCISD::ATOMIC_CMP_SWAP_16;
  return DAG.getMemIntrinsicNode(NodeTy, dl, Tys, Ops, MemVT, MMO);
}
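
// Illustrative only (not from the original source): for an i8 cmpxchg whose
// expected value is -1, the compare operand arrives as the i32 0xFFFFFFFF;
// the AND above reduces it to 0xFF so it can match the zero-extended byte
// produced by the atomic load.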
SDValue PPCTargetLowering::LowerATOMIC_LOAD_STORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  AtomicSDNode *N = cast<AtomicSDNode>(Op.getNode());
  EVT MemVT = N->getMemoryVT();
  assert(MemVT.getSimpleVT() == MVT::i128 &&
         "Expect quadword atomic operations");
  SDLoc dl(N);
  unsigned Opc = N->getOpcode();
  switch (Opc) {
  case ISD::ATOMIC_LOAD: {
    // Lower quadword atomic load to int_ppc_atomic_load_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(0),
        DAG.getConstant(Intrinsic::ppc_atomic_load_i128, dl, MVT::i32)};
    for (int I = 1, E = N->getNumOperands(); I < E; ++I)
      Ops.push_back(N->getOperand(I));
    SDValue LoadedVal = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl, Tys,
                                                Ops, MemVT, N->getMemOperand());
    SDValue ValLo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal);
    SDValue ValHi =
        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i128, LoadedVal.getValue(1));
    ValHi = DAG.getNode(ISD::SHL, dl, MVT::i128, ValHi,
                        DAG.getConstant(64, dl, MVT::i32));
    SDValue Val =
        DAG.getNode(ISD::OR, dl, {MVT::i128, MVT::Other}, {ValLo, ValHi});
    return DAG.getNode(ISD::MERGE_VALUES, dl, {MVT::i128, MVT::Other},
                       {Val, LoadedVal.getValue(2)});
  }
  case ISD::ATOMIC_STORE: {
    // Lower quadword atomic store to int_ppc_atomic_store_i128 which will be
    // lowered to ppc instructions by pattern matching instruction selector.
    SDVTList Tys = DAG.getVTList(MVT::Other);
    SmallVector<SDValue, 4> Ops{
        N->getOperand(0),
        DAG.getConstant(Intrinsic::ppc_atomic_store_i128, dl, MVT::i32)};
    SDValue Val = N->getOperand(2);
    SDValue ValLo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, Val);
    SDValue ValHi = DAG.getNode(ISD::SRL, dl, MVT::i128, Val,
                                DAG.getConstant(64, dl, MVT::i32));
    ValHi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i64, ValHi);
    Ops.push_back(ValLo);
    Ops.push_back(ValHi);
    Ops.push_back(N->getOperand(1));
    return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, dl, Tys, Ops, MemVT,
                                   N->getMemOperand());
  }
  default:
    llvm_unreachable("Unexpected atomic opcode");
  }
}
static SDValue getDataClassTest(SDValue Op, FPClassTest Mask, const SDLoc &Dl,
                                SelectionDAG &DAG,
                                const PPCSubtarget &Subtarget) {
  assert(Mask <= fcAllFlags && "Invalid fp_class flags!");

  enum DataClassMask {
    DC_NAN = 1 << 6,
    DC_NEG_INF = 1 << 4,
    DC_POS_INF = 1 << 5,
    DC_NEG_ZERO = 1 << 2,
    DC_POS_ZERO = 1 << 3,
    DC_NEG_SUBNORM = 1,
    DC_POS_SUBNORM = 1 << 1,
  };

  EVT VT = Op.getValueType();

  unsigned TestOp = VT == MVT::f128  ? PPC::XSTSTDCQP
                    : VT == MVT::f64 ? PPC::XSTSTDCDP
                                     : PPC::XSTSTDCSP;

  if (Mask == fcAllFlags)
    return DAG.getBoolConstant(true, Dl, MVT::i1, VT);
  if (Mask == 0)
    return DAG.getBoolConstant(false, Dl, MVT::i1, VT);

  // When it's cheaper or necessary to test reverse flags.
  if ((Mask & fcNormal) == fcNormal || Mask == ~fcQNan || Mask == ~fcSNan) {
    SDValue Rev = getDataClassTest(Op, ~Mask, Dl, DAG, Subtarget);
    return DAG.getNOT(Dl, Rev, MVT::i1);
  }

  // Power doesn't support testing whether a value is 'normal'. Test the rest
  // first, and test if it's 'not not-normal' with expected sign.
  if (Mask & fcNormal) {
    SDValue Rev(DAG.getMachineNode(
                    TestOp, Dl, MVT::i32,
                    DAG.getTargetConstant(DC_NAN | DC_NEG_INF | DC_POS_INF |
                                              DC_NEG_ZERO | DC_POS_ZERO |
                                              DC_NEG_SUBNORM | DC_POS_SUBNORM,
                                          Dl, MVT::i32),
                    Op),
                0);
    // The sign is stored in CR bit 0 (lt); the class-test result is in CR
    // bit 2 (eq).
    SDValue Sign(
        DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev,
                           DAG.getTargetConstant(PPC::sub_lt, Dl, MVT::i32)),
        0);
    SDValue Normal(DAG.getNOT(
        Dl,
        SDValue(DAG.getMachineNode(
                    TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1, Rev,
                    DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)),
                0),
        MVT::i1));
    if (Mask & fcPosNormal)
      Sign = DAG.getNOT(Dl, Sign, MVT::i1);
    SDValue Result = DAG.getNode(ISD::AND, Dl, MVT::i1, Sign, Normal);
    if (Mask == fcPosNormal || Mask == fcNegNormal)
      return Result;

    return DAG.getNode(
        ISD::OR, Dl, MVT::i1,
        getDataClassTest(Op, Mask & ~fcNormal, Dl, DAG, Subtarget), Result);
  }

  // The instruction doesn't differentiate between signaling or quiet NaN. Test
  // the rest first, and test if it 'is NaN and is signaling/quiet'.
  if ((Mask & fcNan) == fcQNan || (Mask & fcNan) == fcSNan) {
    bool IsQuiet = Mask & fcQNan;
    SDValue NanCheck = getDataClassTest(Op, fcNan, Dl, DAG, Subtarget);

    // Quietness is determined by the first bit in the fraction field.
    uint64_t QuietMask = 0;
    SDValue HighWord;
    if (VT == MVT::f128) {
      HighWord = DAG.getNode(
          ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Op),
          DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 3 : 0, Dl));
      QuietMask = 0x8000;
    } else if (VT == MVT::f64) {
      if (Subtarget.isPPC64()) {
        HighWord = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32,
                               DAG.getBitcast(MVT::i64, Op),
                               DAG.getConstant(1, Dl, MVT::i32));
      } else {
        SDValue Vec = DAG.getBitcast(
            MVT::v4i32, DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v2f64, Op));
        HighWord = DAG.getNode(
            ISD::EXTRACT_VECTOR_ELT, Dl, MVT::i32, Vec,
            DAG.getVectorIdxConstant(Subtarget.isLittleEndian() ? 1 : 0, Dl));
      }
      QuietMask = 0x80000;
    } else if (VT == MVT::f32) {
      HighWord = DAG.getBitcast(MVT::i32, Op);
      QuietMask = 0x400000;
    }
    SDValue NanRes = DAG.getSetCC(
        Dl, MVT::i1,
        DAG.getNode(ISD::AND, Dl, MVT::i32, HighWord,
                    DAG.getConstant(QuietMask, Dl, MVT::i32)),
        DAG.getConstant(0, Dl, MVT::i32), IsQuiet ? ISD::SETNE : ISD::SETEQ);
    NanRes = DAG.getNode(ISD::AND, Dl, MVT::i1, NanCheck, NanRes);
    if (Mask == fcQNan || Mask == fcSNan)
      return NanRes;

    return DAG.getNode(ISD::OR, Dl, MVT::i1,
                       getDataClassTest(Op, Mask & ~fcNan, Dl, DAG, Subtarget),
                       NanRes);
  }

  unsigned NativeMask = 0;
  if ((Mask & fcNan) == fcNan)
    NativeMask |= DC_NAN;
  if (Mask & fcNegInf)
    NativeMask |= DC_NEG_INF;
  if (Mask & fcPosInf)
    NativeMask |= DC_POS_INF;
  if (Mask & fcNegZero)
    NativeMask |= DC_NEG_ZERO;
  if (Mask & fcPosZero)
    NativeMask |= DC_POS_ZERO;
  if (Mask & fcNegSubnormal)
    NativeMask |= DC_NEG_SUBNORM;
  if (Mask & fcPosSubnormal)
    NativeMask |= DC_POS_SUBNORM;
  return SDValue(
      DAG.getMachineNode(
          TargetOpcode::EXTRACT_SUBREG, Dl, MVT::i1,
          SDValue(DAG.getMachineNode(
                      TestOp, Dl, MVT::i32,
                      DAG.getTargetConstant(NativeMask, Dl, MVT::i32), Op),
                  0),
          DAG.getTargetConstant(PPC::sub_eq, Dl, MVT::i32)),
      0);
}
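
// Illustrative only (not from the original source): a mask of just the
// infinities (fcNegInf | fcPosInf) maps directly to the native mask
// DC_NEG_INF | DC_POS_INF, so a single xststdc[dp|sp|qp] plus an EQ-bit
// extract suffices; only fcNormal and the quiet/signaling NaN split need the
// multi-step expansions above.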
SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
                                           SelectionDAG &DAG) const {
  assert(Subtarget.hasP9Vector() && "Test data class requires Power9");
  SDValue LHS = Op.getOperand(0);
  const auto *RHS = cast<ConstantSDNode>(Op.getOperand(1));
  SDLoc Dl(Op);
  FPClassTest Category = static_cast<FPClassTest>(RHS->getZExtValue());
  return getDataClassTest(LHS, Category, Dl, DAG, Subtarget);
}
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // Create a stack slot that is 16-byte aligned.
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);

  // Store the input value into Value#0 of the stack slot.
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
                               MachinePointerInfo());
  // Load it out.
  return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Should only be called for ISD::INSERT_VECTOR_ELT");

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(2));

  EVT VT = Op.getValueType();
  SDLoc dl(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  if (VT == MVT::v2f64 && C)
    return Op;

  if (Subtarget.hasP9Vector()) {
    // A f32 load feeding into a v4f32 insert_vector_elt is handled in this way
    // because on P10, it allows this specific insert_vector_elt load pattern to
    // utilize the refactored load and store infrastructure in order to exploit
    // prefixed loads.
    // On targets with inexpensive direct moves (Power9 and up), a
    // (insert_vector_elt v4f32:$vec, (f32 load)) is always better as an integer
    // load since a single precision load will involve conversion to double
    // precision on the load followed by another conversion to single precision.
    if ((VT == MVT::v4f32) && (V2.getValueType() == MVT::f32) &&
        (isa<LoadSDNode>(V2))) {
      SDValue BitcastVector = DAG.getBitcast(MVT::v4i32, V1);
      SDValue BitcastLoad = DAG.getBitcast(MVT::i32, V2);
      SDValue InsVecElt =
          DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4i32, BitcastVector,
                      BitcastLoad, Op.getOperand(2));
      return DAG.getBitcast(MVT::v4f32, InsVecElt);
    }
  }

  if (Subtarget.isISA3_1()) {
    if ((VT == MVT::v2i64 || VT == MVT::v2f64) && !Subtarget.isPPC64())
      return SDValue();
    // On P10, we have legal lowering for constant and variable indices for
    // all vectors.
    if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
        VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
      return Op;
  }

  // Before P10, we have legal lowering for constant indices but not for
  // variable ones.
  if (!C)
    return SDValue();

  // We can use MTVSRZ + VECINSERT for v8i16 and v16i8 types.
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue Mtvsrz = DAG.getNode(PPCISD::MTVSRZ, dl, VT, V2);
    unsigned BytesInEachElement = VT.getVectorElementType().getSizeInBits() / 8;
    unsigned InsertAtElement = C->getZExtValue();
    unsigned InsertAtByte = InsertAtElement * BytesInEachElement;
    if (Subtarget.isLittleEndian()) {
      InsertAtByte = (16 - BytesInEachElement) - InsertAtByte;
    }
    return DAG.getNode(PPCISD::VECINSERT, dl, VT, V1, Mtvsrz,
                       DAG.getConstant(InsertAtByte, dl, MVT::i32));
  }
  return Op;
}
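
// Illustrative only (not from the original source): inserting element 2 of a
// little-endian v8i16 gives InsertAtByte = (16 - 2) - 2*2 = 10, so VECINSERT
// places the halfword that MTVSRZ moved into the VSR at byte offset 10 of
// the target vector.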
SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
  SDValue LoadChain = LN->getChain();
  SDValue BasePtr = LN->getBasePtr();
  EVT VT = Op.getValueType();

  if (VT != MVT::v256i1 && VT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
  // 2 or 4 vsx registers.
  assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = LN->getAlign();
  SmallVector<SDValue, 4> Loads;
  SmallVector<SDValue, 4> LoadChains;
  unsigned NumVecs = VT.getSizeInBits() / 128;
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    SDValue Load =
        DAG.getLoad(MVT::v16i8, dl, LoadChain, BasePtr,
                    LN->getPointerInfo().getWithOffset(Idx * 16),
                    commonAlignment(Alignment, Idx * 16),
                    LN->getMemOperand()->getFlags(), LN->getAAInfo());
    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                          DAG.getConstant(16, dl, BasePtr.getValueType()));
    Loads.push_back(Load);
    LoadChains.push_back(Load.getValue(1));
  }
  if (Subtarget.isLittleEndian()) {
    std::reverse(Loads.begin(), Loads.end());
    std::reverse(LoadChains.begin(), LoadChains.end());
  }
  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
  SDValue Value =
      DAG.getNode(VT == MVT::v512i1 ? PPCISD::ACC_BUILD : PPCISD::PAIR_BUILD,
                  dl, VT, Loads);
  SDValue RetOps[] = {Value, TF};
  return DAG.getMergeValues(RetOps, dl);
}
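
// Illustrative only (not from the original source): a v512i1 accumulator load
// becomes four consecutive 16-byte loads at offsets 0, 16, 32 and 48; on
// little-endian targets the loaded registers are fed to ACC_BUILD in
// reversed order.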
SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc dl(Op);
  StoreSDNode *SN = cast<StoreSDNode>(Op.getNode());
  SDValue StoreChain = SN->getChain();
  SDValue BasePtr = SN->getBasePtr();
  SDValue Value = SN->getValue();
  SDValue Value2 = SN->getValue();
  EVT StoreVT = Value.getValueType();

  if (StoreVT != MVT::v256i1 && StoreVT != MVT::v512i1)
    return Op;

  // Type v256i1 is used for pairs and v512i1 is used for accumulators.
  // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
  // underlying registers individually.
  assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
         "Type unsupported without MMA");
  assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
         "Type unsupported without paired vector support");
  Align Alignment = SN->getAlign();
  SmallVector<SDValue, 4> Stores;
  unsigned NumVecs = 2;
  if (StoreVT == MVT::v512i1) {
    if (Subtarget.isISAFuture()) {
      EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
      MachineSDNode *ExtNode = DAG.getMachineNode(
          PPC::DMXXEXTFDMR512, dl, ArrayRef(ReturnTypes, 2), Op.getOperand(1));

      Value = SDValue(ExtNode, 0);
      Value2 = SDValue(ExtNode, 1);
    } else
      Value = DAG.getNode(PPCISD::XXMFACC, dl, MVT::v512i1, Value);
    NumVecs = 4;
  }
  for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
    unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx;
    SDValue Elt;
    if (Subtarget.isISAFuture()) {
      VecNum = Subtarget.isLittleEndian() ? 1 - (Idx % 2) : (Idx % 2);
      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8,
                        Idx > 1 ? Value2 : Value,
                        DAG.getConstant(VecNum, dl,
                                        getPointerTy(DAG.getDataLayout())));
    } else
      Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value,
                        DAG.getConstant(VecNum, dl,
                                        getPointerTy(DAG.getDataLayout())));

    SDValue Store =
        DAG.getStore(StoreChain, dl, Elt, BasePtr,
                     SN->getPointerInfo().getWithOffset(Idx * 16),
                     commonAlignment(Alignment, Idx * 16),
                     SN->getMemOperand()->getFlags(), SN->getAAInfo());
    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                          DAG.getConstant(16, dl, BasePtr.getValueType()));
    Stores.push_back(Store);
  }

  SDValue TF = DAG.getTokenFactor(dl, Stores);
  return TF;
}
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  if (Op.getValueType() == MVT::v4i32) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);

    SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
    // +16 as shift amt.
    SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
    SDValue RHSSwap =   // = vrlw RHS, 16
        BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);

    // Shrinkify inputs to v8i16.
    LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
    RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
    RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);

    // Low parts multiplied together, generating 32-bit results (we ignore the
    // top parts).
    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
                                      LHS, RHS, DAG, dl, MVT::v4i32);

    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
    // Shift the high parts up 16 bits.
    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
                              Neg16, DAG, dl);
    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
  } else if (Op.getValueType() == MVT::v16i8) {
    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
    bool isLittleEndian = Subtarget.isLittleEndian();

    // Multiply the even 8-bit parts, producing 16-bit sums.
    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
                                         LHS, RHS, DAG, dl, MVT::v8i16);
    EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);

    // Multiply the odd 8-bit parts, producing 16-bit sums.
    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
                                        LHS, RHS, DAG, dl, MVT::v8i16);
    OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);

    // Merge the results together.  Because vmuleub and vmuloub are
    // instructions with a big-endian bias, we must reverse the
    // element numbering and reverse the meaning of "odd" and "even"
    // when generating little endian code.
    int Ops[16];
    for (unsigned i = 0; i != 8; ++i) {
      if (isLittleEndian) {
        Ops[i*2  ] = 2*i;
        Ops[i*2+1] = 2*i+16;
      } else {
        Ops[i*2  ] = 2*i+1;
        Ops[i*2+1] = 2*i+1+16;
      }
    }
    if (isLittleEndian)
      return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
    else
      return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
  } else {
    llvm_unreachable("Unknown mul to lower!");
  }
}
SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
  bool IsStrict = Op->isStrictFPOpcode();
  if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
      !Subtarget.hasP9Vector())
    return SDValue();
  return Op;
}
// Custom lowering for fpext v2f32 to v2f64
SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getOpcode() == ISD::FP_EXTEND &&
         "Should only be called for ISD::FP_EXTEND");

  // FIXME: handle extends from half precision float vectors on P9.
  // We only want to custom lower an extend from v2f32 to v2f64.
  if (Op.getValueType() != MVT::v2f64 ||
      Op.getOperand(0).getValueType() != MVT::v2f32)
    return SDValue();

  SDLoc dl(Op);
  SDValue Op0 = Op.getOperand(0);

  switch (Op0.getOpcode()) {
  default:
    return SDValue();
  case ISD::EXTRACT_SUBVECTOR: {
    assert(Op0.getNumOperands() == 2 &&
           isa<ConstantSDNode>(Op0->getOperand(1)) &&
           "Node should have 2 operands with second one being a constant!");

    if (Op0.getOperand(0).getValueType() != MVT::v4f32)
      return SDValue();

    // Custom lower is only done for high or low doubleword.
    int Idx = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (Idx % 2 != 0)
      return SDValue();

    // Since input is v4f32, at this point Idx is either 0 or 2.
    // Shift to get the doubleword position we want.
    int DWord = Idx >> 1;

    // High and low word positions are different on little endian.
    if (Subtarget.isLittleEndian())
      DWord ^= 0x1;

    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64,
                       Op0.getOperand(0), DAG.getConstant(DWord, dl, MVT::i32));
  }
  case ISD::FADD:
  case ISD::FMUL: {
    SDValue NewLoad[2];
    for (unsigned i = 0, ie = Op0.getNumOperands(); i != ie; ++i) {
      // Ensure both inputs are loads.
      SDValue LdOp = Op0.getOperand(i);
      if (LdOp.getOpcode() != ISD::LOAD)
        return SDValue();
      // Generate new load node.
      LoadSDNode *LD = cast<LoadSDNode>(LdOp);
      SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
      NewLoad[i] = DAG.getMemIntrinsicNode(
          PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
          LD->getMemoryVT(), LD->getMemOperand());
    }
    SDValue NewOp =
        DAG.getNode(Op0.getOpcode(), SDLoc(Op0), MVT::v4f32, NewLoad[0],
                    NewLoad[1], Op0.getNode()->getFlags());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewOp,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op0);
    SDValue LoadOps[] = {LD->getChain(), LD->getBasePtr()};
    SDValue NewLd = DAG.getMemIntrinsicNode(
        PPCISD::LD_VSX_LH, dl, DAG.getVTList(MVT::v4f32, MVT::Other), LoadOps,
        LD->getMemoryVT(), LD->getMemOperand());
    return DAG.getNode(PPCISD::FP_EXTEND_HALF, dl, MVT::v2f64, NewLd,
                       DAG.getConstant(0, dl, MVT::i32));
  }
  }
  llvm_unreachable("ERROR: Should return for all cases within switch.");
}
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
  case ISD::FPOW:               return lowerPow(Op, DAG);
  case ISD::FSIN:               return lowerSin(Op, DAG);
  case ISD::FCOS:               return lowerCos(Op, DAG);
  case ISD::FLOG:               return lowerLog(Op, DAG);
  case ISD::FLOG10:             return lowerLog10(Op, DAG);
  case ISD::FEXP:               return lowerExp(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS:
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);

  case ISD::INLINEASM:
  case ISD::INLINEASM_BR:       return LowerINLINEASM(Op, DAG);
  // Variable argument lowering.
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);

  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::GET_DYNAMIC_AREA_OFFSET:
    return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);

  // Exception handling lowering.
  case ISD::EH_DWARF_CFA:       return LowerEH_DWARF_CFA(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);

  case ISD::LOAD:               return LowerLOAD(Op, DAG);
  case ISD::STORE:              return LowerSTORE(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
  case ISD::STRICT_FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG, SDLoc(Op));
  case ISD::STRICT_UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
  case ISD::GET_ROUNDING:       return LowerGET_ROUNDING(Op, DAG);

  // Lower 64-bit shifts.
  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);

  case ISD::FSHL:               return LowerFunnelShift(Op, DAG);
  case ISD::FSHR:               return LowerFunnelShift(Op, DAG);

  // Vector-related lowering.
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::STRICT_FP_ROUND:
  case ISD::FP_ROUND:
    return LowerFP_ROUND(Op, DAG);
  case ISD::ROTL:               return LowerROTL(Op, DAG);

  // For counter-based loop handling.
  case ISD::INTRINSIC_W_CHAIN:  return SDValue();

  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);

  // Frame & Return address.
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);

  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::BSWAP:
    return LowerBSWAP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::ATOMIC_STORE:
    return LowerATOMIC_LOAD_STORE(Op, DAG);
  case ISD::IS_FPCLASS:
    return LowerIS_FPCLASS(Op, DAG);
  }
}
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case ISD::ATOMIC_LOAD: {
    SDValue Res = LowerATOMIC_LOAD_STORE(SDValue(N, 0), DAG);
    Results.push_back(Res);
    Results.push_back(Res.getValue(1));
    break;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));

    Results.push_back(
        DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
    Results.push_back(RTB.getValue(2));
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
        Intrinsic::loop_decrement)
      break;

    assert(N->getValueType(0) == MVT::i1 &&
           "Unexpected result type for CTR decrement intrinsic");
    EVT SVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
                                 N->getValueType(0));
    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
                                 N->getOperand(1));

    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewInt));
    Results.push_back(NewInt.getValue(1));
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
    case Intrinsic::ppc_pack_longdouble:
      Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
                                    N->getOperand(2), N->getOperand(1)));
      break;
    case Intrinsic::ppc_maxfe:
    case Intrinsic::ppc_minfe:
    case Intrinsic::ppc_fnmsub:
    case Intrinsic::ppc_convert_f128_to_ppcf128:
      Results.push_back(LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG));
      break;
    }
    break;
  }
  case ISD::VAARG: {
    if (!Subtarget.isSVR4ABI() || Subtarget.isPPC64())
      return;

    EVT VT = N->getValueType(0);

    if (VT == MVT::i64) {
      SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG);

      Results.push_back(NewNode);
      Results.push_back(NewNode.getValue(1));
    }
    return;
  }
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    // LowerFP_TO_INT() can only handle f32 and f64.
    if (N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
        MVT::ppcf128)
      return;
    SDValue LoweredValue = LowerFP_TO_INT(SDValue(N, 0), DAG, dl);
    Results.push_back(LoweredValue);
    if (N->isStrictFPOpcode())
      Results.push_back(LoweredValue.getValue(1));
    return;
  }
  case ISD::TRUNCATE: {
    if (!N->getValueType(0).isVector())
      return;
    SDValue Lowered = LowerTRUNCATEVector(SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Lowered);
    return;
  }
  case ISD::FSHL:
  case ISD::FSHR:
    // Don't handle funnel shifts here.
    return;
  case ISD::BITCAST:
    // Don't handle bitcast here.
    return;
  case ISD::FP_EXTEND:
    SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
    if (Lowered)
      Results.push_back(Lowered);
    return;
  }
}
//===----------------------------------------------------------------------===//
//  Other Lowering Code
//===----------------------------------------------------------------------===//

static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Function *Func = Intrinsic::getDeclaration(M, Id);
  return Builder.CreateCall(Func, {});
}
// The mappings for emitLeading/TrailingFence are taken from
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                 Instruction *Inst,
                                                 AtomicOrdering Ord) const {
  if (Ord == AtomicOrdering::SequentiallyConsistent)
    return callIntrinsic(Builder, Intrinsic::ppc_sync);
  if (isReleaseOrStronger(Ord))
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  return nullptr;
}

Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
  if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
    // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
    // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
    // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification.
    if (isa<LoadInst>(Inst) && Subtarget.isPPC64())
      return Builder.CreateCall(
          Intrinsic::getDeclaration(
              Builder.GetInsertBlock()->getParent()->getParent(),
              Intrinsic::ppc_cfence, {Inst->getType()}),
          {Inst});
    // FIXME: Can use isync for rmw operation.
    return callIntrinsic(Builder, Intrinsic::ppc_lwsync);
  }
  return nullptr;
}
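
// Illustrative only (not from the original source), reading the two hooks
// above together: a seq_cst load gets a leading sync and a trailing
// cfence/lwsync; a release store gets a leading lwsync and no trailing
// fence; a monotonic access gets no fences at all.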
MachineBasicBlock *
PPCTargetLowering::EmitAtomicBinary(MachineInstr &MI, MachineBasicBlock *BB,
                                    unsigned AtomicSize,
                                    unsigned BinOpcode,
                                    unsigned CmpOpcode,
                                    unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  auto LoadMnemonic = PPC::LDARX;
  auto StoreMnemonic = PPC::STDCX;
  switch (AtomicSize) {
  default:
    llvm_unreachable("Unexpected size of atomic entity");
  case 1:
    LoadMnemonic = PPC::LBARX;
    StoreMnemonic = PPC::STBCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 2:
    LoadMnemonic = PPC::LHARX;
    StoreMnemonic = PPC::STHCX;
    assert(Subtarget.hasPartwordAtomics() && "Call this only with size >=4");
    break;
  case 4:
    LoadMnemonic = PPC::LWARX;
    StoreMnemonic = PPC::STWCX;
    break;
  case 8:
    LoadMnemonic = PPC::LDARX;
    StoreMnemonic = PPC::STDCX;
    break;
  }

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register ptrA = MI.getOperand(1).getReg();
  Register ptrB = MI.getOperand(2).getReg();
  Register incr = MI.getOperand(3).getReg();
  DebugLoc dl = MI.getDebugLoc();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  Register TmpReg = (!BinOpcode) ? incr :
    RegInfo.createVirtualRegister( AtomicSize == 8 ? &PPC::G8RCRegClass
                                                   : &PPC::GPRCRegClass);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   add r0, dest, incr
  //   st[wd]cx. r0, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  //  For max/min:
  //  loopMBB:
  //   l[wd]arx dest, ptr
  //   cmpl?[wd] dest, incr
  //   bgt exitMBB
  //  loop2MBB:
  //   st[wd]cx. dest, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB

  BB = loopMBB;
  BuildMI(BB, dl, TII->get(LoadMnemonic), dest)
      .addReg(ptrA).addReg(ptrB);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
  if (CmpOpcode) {
    Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    // Signed comparisons of byte or halfword values must be sign-extended.
    if (CmpOpcode == PPC::CMPW && AtomicSize < 4) {
      Register ExtReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
      BuildMI(BB, dl, TII->get(AtomicSize == 1 ? PPC::EXTSB : PPC::EXTSH),
              ExtReg).addReg(dest);
      BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ExtReg).addReg(incr);
    } else
      BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(dest).addReg(incr);

    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(CmpPred)
        .addReg(CrReg)
        .addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, dl, TII->get(StoreMnemonic))
      .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
  BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  return BB;
}
static bool isSignExtended(MachineInstr &MI, const PPCInstrInfo *TII) {
  switch(MI.getOpcode()) {
  default:
    break;
  case PPC::COPY:
    return TII->isSignExtended(MI.getOperand(1).getReg(),
                               &MI.getMF()->getRegInfo());
  case PPC::LHA:
  case PPC::LHA8:
  case PPC::LHAU:
  case PPC::LHAU8:
  case PPC::LHAUX:
  case PPC::LHAUX8:
  case PPC::LHAX:
  case PPC::LHAX8:
  case PPC::LWA:
  case PPC::LWAUX:
  case PPC::LWAX:
  case PPC::LWAX_32:
  case PPC::LWA_32:
  case PPC::PLHA:
  case PPC::PLHA8:
  case PPC::PLHA8pc:
  case PPC::PLHApc:
  case PPC::PLWA:
  case PPC::PLWA8:
  case PPC::PLWA8pc:
  case PPC::PLWApc:
  case PPC::EXTSB:
  case PPC::EXTSB8:
  case PPC::EXTSB8_32_64:
  case PPC::EXTSB8_rec:
  case PPC::EXTSB_rec:
  case PPC::EXTSH:
  case PPC::EXTSH8:
  case PPC::EXTSH8_32_64:
  case PPC::EXTSH8_rec:
  case PPC::EXTSH_rec:
  case PPC::EXTSW:
  case PPC::EXTSWSLI:
  case PPC::EXTSWSLI_32_64:
  case PPC::EXTSWSLI_32_64_rec:
  case PPC::EXTSWSLI_rec:
  case PPC::EXTSW_32:
  case PPC::EXTSW_32_64:
  case PPC::EXTSW_32_64_rec:
  case PPC::EXTSW_rec:
  case PPC::SRAW:
  case PPC::SRAWI:
  case PPC::SRAWI_rec:
  case PPC::SRAW_rec:
    return true;
  }
  return false;
}
MachineBasicBlock *PPCTargetLowering::EmitPartwordAtomicBinary(
    MachineInstr &MI, MachineBasicBlock *BB,
    bool is8bit, // operation
    unsigned BinOpcode, unsigned CmpOpcode, unsigned CmpPred) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const PPCInstrInfo *TII = Subtarget.getInstrInfo();

  // If this is a signed comparison and the value being compared is not known
  // to be sign extended, sign extend it here.
  DebugLoc dl = MI.getDebugLoc();
  MachineFunction *F = BB->getParent();
  MachineRegisterInfo &RegInfo = F->getRegInfo();
  Register incr = MI.getOperand(3).getReg();
  bool IsSignExtended =
      incr.isVirtual() && isSignExtended(*RegInfo.getVRegDef(incr), TII);

  if (CmpOpcode == PPC::CMPW && !IsSignExtended) {
    Register ValueReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
    BuildMI(*BB, MI, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueReg)
        .addReg(MI.getOperand(3).getReg());
    MI.getOperand(3).setReg(ValueReg);
    incr = ValueReg;
  }
  // If we support part-word atomic mnemonics, just use them.
  if (Subtarget.hasPartwordAtomics())
    return EmitAtomicBinary(MI, BB, is8bit ? 1 : 2, BinOpcode, CmpOpcode,
                            CmpPred);

  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  bool is64bit = Subtarget.isPPC64();
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  Register dest = MI.getOperand(0).getReg();
  Register ptrA = MI.getOperand(1).getReg();
  Register ptrB = MI.getOperand(2).getReg();

  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *loop2MBB =
      CmpOpcode ? F->CreateMachineBasicBlock(LLVM_BB) : nullptr;
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  if (CmpOpcode)
    F->insert(It, loop2MBB);
  F->insert(It, exitMBB);
  exitMBB->splice(exitMBB->begin(), BB,
                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
  exitMBB->transferSuccessorsAndUpdatePHIs(BB);

  const TargetRegisterClass *RC =
      is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register PtrReg = RegInfo.createVirtualRegister(RC);
  Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
  Register ShiftReg =
      isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
  Register Incr2Reg = RegInfo.createVirtualRegister(GPRC);
  Register MaskReg = RegInfo.createVirtualRegister(GPRC);
  Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp3Reg = RegInfo.createVirtualRegister(GPRC);
  Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
  Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
  Register SrwDestReg = RegInfo.createVirtualRegister(GPRC);
  Register Ptr1Reg;
  Register TmpReg =
      (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(GPRC);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word.  Hence all this nasty bookkeeping code:
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  //  loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw SrwDest, tmpDest, shift
  //   rlwinm SrwDest, SrwDest, 0, 24 [16], 31
  if (ptrA != ZeroReg) {
    Ptr1Reg = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
        .addReg(ptrA)
        .addReg(ptrB);
  } else {
    Ptr1Reg = ptrB;
  }

  // We need to use a 32-bit subregister here to avoid a register-class
  // mismatch in 64-bit mode.
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
      .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
      .addImm(3)
      .addImm(27)
      .addImm(is8bit ? 28 : 27);
  if (!isLittleEndian)
    BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
        .addReg(Shift1Reg)
        .addImm(is8bit ? 24 : 16);
  if (is64bit)
    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
        .addReg(Ptr1Reg)
        .addImm(0)
        .addImm(61);
  else
    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
        .addReg(Ptr1Reg)
        .addImm(0)
        .addImm(0)
        .addImm(29);
  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg).addReg(incr).addReg(ShiftReg);
  if (is8bit)
    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
  else {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
    BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
        .addReg(Mask3Reg)
        .addImm(65535);
  }
  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
      .addReg(Mask2Reg)
      .addReg(ShiftReg);

  BB = loopMBB;
  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
      .addReg(ZeroReg)
      .addReg(PtrReg);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
        .addReg(Incr2Reg)
        .addReg(TmpDestReg);
  BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
      .addReg(TmpDestReg)
      .addReg(MaskReg);
  BuildMI(BB, dl, TII->get(PPC::AND), Tmp3Reg).addReg(TmpReg).addReg(MaskReg);
  if (CmpOpcode) {
    // For unsigned comparisons, we can directly compare the shifted values.
    // For signed comparisons we shift and sign extend.
    Register SReg = RegInfo.createVirtualRegister(GPRC);
    Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(BB, dl, TII->get(PPC::AND), SReg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    unsigned ValueReg = SReg;
    unsigned CmpReg = Incr2Reg;
    if (CmpOpcode == PPC::CMPW) {
      ValueReg = RegInfo.createVirtualRegister(GPRC);
      BuildMI(BB, dl, TII->get(PPC::SRW), ValueReg)
          .addReg(SReg)
          .addReg(ShiftReg);
      Register ValueSReg = RegInfo.createVirtualRegister(GPRC);
      BuildMI(BB, dl, TII->get(is8bit ? PPC::EXTSB : PPC::EXTSH), ValueSReg)
          .addReg(ValueReg);
      ValueReg = ValueSReg;
      CmpReg = incr;
    }
    BuildMI(BB, dl, TII->get(CmpOpcode), CrReg).addReg(ValueReg).addReg(CmpReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(CmpPred)
        .addReg(CrReg)
        .addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);
    BB = loop2MBB;
  }
  BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg).addReg(Tmp3Reg).addReg(Tmp2Reg);
  BuildMI(BB, dl, TII->get(PPC::STWCX))
      .addReg(Tmp4Reg)
      .addReg(ZeroReg)
      .addReg(PtrReg);
  BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE)
      .addReg(PPC::CR0)
      .addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  // Since the shift amount is not a constant, we need to clear
  // the upper bits with a separate RLWINM.
  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::RLWINM), dest)
      .addReg(SrwDestReg)
      .addImm(0)
      .addImm(is8bit ? 24 : 16)
      .addImm(31);
  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), SrwDestReg)
      .addReg(TmpDestReg)
      .addReg(ShiftReg);
  return BB;
}
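
// Illustrative only (not from the original source): for an i8 at big-endian
// byte offset 3 within its word, shift1 = (3 << 3) = 24 and
// shift = 24 xor 24 = 0, so the byte sits in the least significant bits of
// the lwarx result; on little endian, shift1 (8 * (addr & 3)) is used as-is.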
llvm::MachineBasicBlock *
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  Register DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
  Register mainDstReg = MRI.createVirtualRegister(RC);
  Register restoreDstReg = MRI.createVirtualRegister(RC);

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  SjLjSetup mainMBB
  //  bl mainMBB
  //  v_restore = 1
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be. Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill. Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and the stack address in the third. Following
  // the X86 target code, we'll store the jump address in the second slot.
  // We also need to save the TOC pointer (R2) to handle jumps between
  // shared libraries, and that will be stored in the fourth slot. The
  // thread identifier (R13) is not affected.

  // thisMBB:
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset = 3 * PVT.getStoreSize();
  const int64_t BPOffset = 4 * PVT.getStoreSize();

  // Prepare IP either in reg.
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
  Register LabelReg = MRI.createVirtualRegister(PtrRC);
  Register BufReg = MI.getOperand(1).getReg();

  if (Subtarget.is64BitELFABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
              .addReg(PPC::X2)
              .addImm(TOCOffset)
              .addReg(BufReg)
              .cloneMemRefs(MI);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned BaseReg;
  if (MF->getFunction().hasFnAttribute(Attribute::Naked))
    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  else
    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

  MIB = BuildMI(*thisMBB, MI, DL,
                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
            .addReg(BaseReg)
            .addImm(BPOffset)
            .addReg(BufReg)
            .cloneMemRefs(MI);

  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
  MIB.addRegMask(TRI->getNoPreservedMask());

  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);

  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
            .addMBB(mainMBB);
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);

  thisMBB->addSuccessor(mainMBB, BranchProbability::getZero());
  thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne());

  // mainMBB:
  //  mainDstReg = 0
  MIB =
      BuildMI(mainMBB, DL,
              TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);

  // Store IP
  if (Subtarget.isPPC64()) {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
              .addReg(LabelReg)
              .addImm(LabelOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
              .addReg(LabelReg)
              .addImm(LabelOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(PPC::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(restoreDstReg).addMBB(thisMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  Register Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
  unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
  unsigned BP =
      (PVT == MVT::i64)
          ? PPC::X30
          : (Subtarget.isSVR4ABI() && isPositionIndependent() ? PPC::R29
                                                              : PPC::R30);

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();
  const int64_t TOCOffset = 3 * PVT.getStoreSize();
  const int64_t BPOffset = 4 * PVT.getStoreSize();

  Register BufReg = MI.getOperand(0).getReg();

  // Reload FP (the jumped-to function may not have had a
  // frame pointer, and if so, then its r31 will be restored
  // from the stack slot).
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
              .addImm(0)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
              .addImm(0)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload IP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
              .addImm(LabelOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
              .addImm(LabelOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload SP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
              .addImm(SPOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
              .addImm(SPOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload BP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
              .addImm(BPOffset)
              .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
              .addImm(BPOffset)
              .addReg(BufReg);
  }
  MIB.cloneMemRefs(MI);

  // Reload TOC
  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
    setUsesTOCBasePtr(*MBB->getParent());
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
              .addImm(TOCOffset)
              .addReg(BufReg)
              .cloneMemRefs(MI);
  }

  // Jump
  BuildMI(*MBB, MI, DL,
          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));

  MI.eraseFromParent();
  return MBB;
}
bool PPCTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
  // If the function specifically requests inline stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
           "inline-asm";
  return false;
}
unsigned PPCTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
  unsigned StackAlign = TFI->getStackAlignment();
  assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
         "Unexpected stack alignment");
  // The default stack probe size is 4096 if the function has no
  // stack-probe-size attribute.
  const Function &Fn = MF.getFunction();
  unsigned StackProbeSize =
      Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096);
  // Round down to the stack alignment.
  StackProbeSize &= ~(StackAlign - 1);
  return StackProbeSize ? StackProbeSize : StackAlign;
}
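
// Illustrative only (not from the original source): with a 16-byte stack
// alignment, "stack-probe-size"=1000 rounds down to 1000 & ~15 = 992; a
// value smaller than the alignment rounds to 0 and falls back to StackAlign
// itself.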
12438 // Lower dynamic stack allocation with probing. `emitProbedAlloca` is splitted
12439 // into three phases. In the first phase, it uses pseudo instruction
12440 // PREPARE_PROBED_ALLOCA to get the future result of actual FramePointer and
12441 // FinalStackPtr. In the second phase, it generates a loop for probing blocks.
12442 // At last, it uses pseudo instruction DYNAREAOFFSET to get the future result of
12443 // MaxCallFrameSize so that it can calculate correct data area pointer.
MachineBasicBlock *
PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  const bool isPPC64 = Subtarget.isPPC64();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();
  const unsigned ProbeSize = getStackProbeSize(*MF);
  const BasicBlock *ProbedBB = MBB->getBasicBlock();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // The CFG of probing stack looks as
  //         +-----+
  //         | MBB |
  //         +--+--+
  //            |
  //       +----v----+
  //  +--->+ TestMBB +---+
  //  |    +----+----+   |
  //  |         |        |
  //  |   +-----v----+   |
  //  +---+ BlockMBB |   |
  //      +----------+   |
  //                     |
  //        +---------+  |
  //        | TailMBB +<-+
  //        +---------+
  // In MBB, calculate previous frame pointer and final stack pointer.
  // In TestMBB, test if sp is equal to final stack pointer, if so, jump to
  // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
  // TailMBB is spliced via \p MI.
  MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
  MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
  MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);

  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MF->insert(MBBIter, TestMBB);
  MF->insert(MBBIter, BlockMBB);
  MF->insert(MBBIter, TailMBB);

  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

  Register DstReg = MI.getOperand(0).getReg();
  Register NegSizeReg = MI.getOperand(1).getReg();
  Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
  Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  Register ActualNegSizeReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);

  // Since the value of NegSizeReg might be realigned in the prolog/epilog
  // insertion pass, insert a PREPARE_PROBED_ALLOCA pseudo instruction to get
  // the actual FramePointer and NegSize.
  unsigned ProbeOpc;
  if (!MRI.hasOneNonDBGUse(NegSizeReg))
    ProbeOpc =
        isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64 : PPC::PREPARE_PROBED_ALLOCA_32;
  else
    // By introducing PREPARE_PROBED_ALLOCA_NEGSIZE_OPT, ActualNegSizeReg
    // and NegSizeReg will be allocated in the same phyreg to avoid
    // redundant copy when NegSizeReg has only one use which is current MI and
    // will be replaced by PREPARE_PROBED_ALLOCA then.
    ProbeOpc = isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_64
                       : PPC::PREPARE_PROBED_ALLOCA_NEGSIZE_SAME_REG_32;
  BuildMI(*MBB, {MI}, DL, TII->get(ProbeOpc), FramePointer)
      .addDef(ActualNegSizeReg)
      .addReg(NegSizeReg)
      .add(MI.getOperand(2))
      .add(MI.getOperand(3));

  // Calculate final stack pointer, which equals to SP + ActualNegSize.
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4),
          FinalStackPtr)
      .addReg(SPReg)
      .addReg(ActualNegSizeReg);

  // Materialize a scratch register for update.
  int64_t NegProbeSize = -(int64_t)ProbeSize;
  assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
  Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  if (!isInt<16>(NegProbeSize)) {
    Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
        .addImm(NegProbeSize >> 16);
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
            ScratchReg)
        .addReg(TempReg)
        .addImm(NegProbeSize & 0xFFFF);
  } else
    BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
        .addImm(NegProbeSize);
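  // Worked example (illustrative): for ProbeSize = 1 MiB, NegProbeSize is
  // -1048576 (0x...FFF00000), which does not fit in 16 bits, so it is built
  // as LIS of (NegProbeSize >> 16) = -16 followed by ORI with the low half
  // (NegProbeSize & 0xFFFF) = 0.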
  // Probing leading residual part.
  Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
      .addReg(ActualNegSizeReg)
      .addReg(ScratchReg);
  Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
      .addReg(Div)
      .addReg(ScratchReg);
  Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
      .addReg(Mul)
      .addReg(ActualNegSizeReg);
  BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
      .addReg(FramePointer)
      .addReg(SPReg)
      .addReg(NegMod);

  // Remaining part should be multiple of ProbeSize.
  Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
  BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
      .addReg(SPReg)
      .addReg(FinalStackPtr);
  BuildMI(TestMBB, DL, TII->get(PPC::BCC))
      .addImm(PPC::PRED_EQ)
      .addReg(CmpResult)
      .addMBB(TailMBB);
  TestMBB->addSuccessor(BlockMBB);
  TestMBB->addSuccessor(TailMBB);

  // Touch the block.
  BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
      .addReg(FramePointer)
      .addReg(SPReg)
      .addReg(ScratchReg);
  BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
  BlockMBB->addSuccessor(TestMBB);

  // Calculation of MaxCallFrameSize is deferred to the prolog/epilog insertion
  // pass; use the DYNAREAOFFSET pseudo instruction to get the future result.
  Register MaxCallFrameSizeReg =
      MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
  BuildMI(TailMBB, DL,
          TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
          MaxCallFrameSizeReg)
      .add(MI.getOperand(2))
      .add(MI.getOperand(3));
  BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
      .addReg(SPReg)
      .addReg(MaxCallFrameSizeReg);

  // Splice instructions after MI to TailMBB.
  TailMBB->splice(TailMBB->end(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(TestMBB);

  // Delete the pseudo instruction.
  MI.eraseFromParent();

  ++NumDynamicAllocaProbed;
  return TailMBB;
}
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  if (MI.getOpcode() == TargetOpcode::STACKMAP ||
      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
    if (Subtarget.is64BitELFABI() &&
        MI.getOpcode() == TargetOpcode::PATCHPOINT &&
        !Subtarget.isUsingPCRelativeCalls()) {
      // Call lowering should have added an r2 operand to indicate a dependence
      // on the TOC base pointer value. It can't however, because there is no
      // way to mark the dependence as implicit there, and so the stackmap code
      // will confuse it with a regular operand. Instead, add the dependence
      // here.
      MI.addOperand(MachineOperand::CreateReg(PPC::X2, false, true));
    }

    return emitPatchPoint(MI, BB);
  }

  if (MI.getOpcode() == PPC::EH_SjLj_SetJmp32 ||
      MI.getOpcode() == PPC::EH_SjLj_SetJmp64) {
    return emitEHSjLjSetJmp(MI, BB);
  } else if (MI.getOpcode() == PPC::EH_SjLj_LongJmp32 ||
             MI.getOpcode() == PPC::EH_SjLj_LongJmp64) {
    return emitEHSjLjLongJmp(MI, BB);
  }
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // To "insert" these instructions we actually have to insert their
  // control-flow patterns.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = ++BB->getIterator();

  MachineFunction *F = BB->getParent();
  MachineRegisterInfo &MRI = F->getRegInfo();

  if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
      MI.getOpcode() == PPC::SELECT_CC_I8 || MI.getOpcode() == PPC::SELECT_I4 ||
      MI.getOpcode() == PPC::SELECT_I8) {
    SmallVector<MachineOperand, 2> Cond;
    if (MI.getOpcode() == PPC::SELECT_CC_I4 ||
        MI.getOpcode() == PPC::SELECT_CC_I8)
      Cond.push_back(MI.getOperand(4));
    else
      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
    Cond.push_back(MI.getOperand(1));

    DebugLoc dl = MI.getDebugLoc();
    TII->insertSelect(*BB, MI, dl, MI.getOperand(0).getReg(), Cond,
                      MI.getOperand(2).getReg(), MI.getOperand(3).getReg());
  } else if (MI.getOpcode() == PPC::SELECT_CC_F4 ||
             MI.getOpcode() == PPC::SELECT_CC_F8 ||
             MI.getOpcode() == PPC::SELECT_CC_F16 ||
             MI.getOpcode() == PPC::SELECT_CC_VRRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSFRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSSRC ||
             MI.getOpcode() == PPC::SELECT_CC_VSRC ||
             MI.getOpcode() == PPC::SELECT_CC_SPE4 ||
             MI.getOpcode() == PPC::SELECT_CC_SPE ||
             MI.getOpcode() == PPC::SELECT_F4 ||
             MI.getOpcode() == PPC::SELECT_F8 ||
             MI.getOpcode() == PPC::SELECT_F16 ||
             MI.getOpcode() == PPC::SELECT_SPE ||
             MI.getOpcode() == PPC::SELECT_SPE4 ||
             MI.getOpcode() == PPC::SELECT_VRRC ||
             MI.getOpcode() == PPC::SELECT_VSFRC ||
             MI.getOpcode() == PPC::SELECT_VSSRC ||
             MI.getOpcode() == PPC::SELECT_VSRC) {
    // The incoming instruction knows the destination vreg to set, the
    // condition code register to branch on, the true/false values to
    // select between, and a branch opcode to use.

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC sinkMBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI.getDebugLoc();
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    // Next, add the true and fallthrough blocks as its successors.
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    if (MI.getOpcode() == PPC::SELECT_I4 || MI.getOpcode() == PPC::SELECT_I8 ||
        MI.getOpcode() == PPC::SELECT_F4 || MI.getOpcode() == PPC::SELECT_F8 ||
        MI.getOpcode() == PPC::SELECT_F16 ||
        MI.getOpcode() == PPC::SELECT_SPE4 ||
        MI.getOpcode() == PPC::SELECT_SPE ||
        MI.getOpcode() == PPC::SELECT_VRRC ||
        MI.getOpcode() == PPC::SELECT_VSFRC ||
        MI.getOpcode() == PPC::SELECT_VSSRC ||
        MI.getOpcode() == PPC::SELECT_VSRC) {
      BuildMI(BB, dl, TII->get(PPC::BC))
          .addReg(MI.getOperand(1).getReg())
          .addMBB(sinkMBB);
    } else {
      unsigned SelectPred = MI.getOperand(4).getImm();
      BuildMI(BB, dl, TII->get(PPC::BCC))
          .addImm(SelectPred)
          .addReg(MI.getOperand(1).getReg())
          .addMBB(sinkMBB);
    }

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::PHI), MI.getOperand(0).getReg())
        .addReg(MI.getOperand(3).getReg())
        .addMBB(copy0MBB)
        .addReg(MI.getOperand(2).getReg())
        .addMBB(thisMBB);
  } else if (MI.getOpcode() == PPC::ReadTB) {
    // To read the 64-bit time-base register on a 32-bit target, we read the
    // two halves. Should the counter have wrapped while it was being read, we
    // need to try again.
    // ...
    // readLoop:
    // mfspr Rx,TBU # load from TBU
    // mfspr Ry,TB # load from TB
    // mfspr Rz,TBU # load from TBU
    // cmpw crX,Rx,Rz # check if 'old'='new'
    // bne readLoop # branch if they're not equal
    // ...

    MachineBasicBlock *readMBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI.getDebugLoc();
    F->insert(It, readMBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    BB->addSuccessor(readMBB);
    BB = readMBB;

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register ReadAgainReg = RegInfo.createVirtualRegister(&PPC::GPRCRegClass);
    Register LoReg = MI.getOperand(0).getReg();
    Register HiReg = MI.getOperand(1).getReg();

    BuildMI(BB, dl, TII->get(PPC::MFSPR), HiReg).addImm(269);
    BuildMI(BB, dl, TII->get(PPC::MFSPR), LoReg).addImm(268);
    BuildMI(BB, dl, TII->get(PPC::MFSPR), ReadAgainReg).addImm(269);

    Register CmpReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);

    BuildMI(BB, dl, TII->get(PPC::CMPW), CmpReg)
        .addReg(HiReg)
        .addReg(ReadAgainReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(CmpReg)
        .addMBB(readMBB);

    BB->addSuccessor(readMBB);
    BB->addSuccessor(sinkMBB);
  } else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::ADD4);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::ADD8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::AND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::AND8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::OR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::OR8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::XOR);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::XOR8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::NAND);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::NAND8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
    BB = EmitAtomicBinary(MI, BB, 4, PPC::SUBF);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
    BB = EmitAtomicBinary(MI, BB, 8, PPC::SUBF8);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MIN_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_LT);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_MAX_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPD, PPC::PRED_GT);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_LT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMIN_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_LT);

  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0, PPC::CMPLW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0, PPC::CMPLW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0, PPC::CMPLW, PPC::PRED_GT);
  else if (MI.getOpcode() == PPC::ATOMIC_LOAD_UMAX_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0, PPC::CMPLD, PPC::PRED_GT);

  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I32)
    BB = EmitAtomicBinary(MI, BB, 4, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_SWAP_I64)
    BB = EmitAtomicBinary(MI, BB, 8, 0);
  else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
           MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64 ||
           (Subtarget.hasPartwordAtomics() &&
            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8) ||
           (Subtarget.hasPartwordAtomics() &&
            MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16)) {
    bool is64bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
    auto LoadMnemonic = PPC::LDARX;
    auto StoreMnemonic = PPC::STDCX;
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable("Compare and swap of unknown size");
    case PPC::ATOMIC_CMP_SWAP_I8:
      LoadMnemonic = PPC::LBARX;
      StoreMnemonic = PPC::STBCX;
      assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
      break;
    case PPC::ATOMIC_CMP_SWAP_I16:
      LoadMnemonic = PPC::LHARX;
      StoreMnemonic = PPC::STHCX;
      assert(Subtarget.hasPartwordAtomics() && "No support partword atomics.");
      break;
    case PPC::ATOMIC_CMP_SWAP_I32:
      LoadMnemonic = PPC::LWARX;
      StoreMnemonic = PPC::STWCX;
      break;
    case PPC::ATOMIC_CMP_SWAP_I64:
      LoadMnemonic = PPC::LDARX;
      StoreMnemonic = PPC::STDCX;
      break;
    }
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register dest = MI.getOperand(0).getReg();
    Register ptrA = MI.getOperand(1).getReg();
    Register ptrB = MI.getOperand(2).getReg();
    Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    Register oldval = MI.getOperand(3).getReg();
    Register newval = MI.getOperand(4).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, loop1MBB);
    F->insert(It, loop2MBB);
    F->insert(It, exitMBB);
    exitMBB->splice(exitMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);

    //  thisMBB:
    //   ...
    //   fallthrough --> loopMBB
    BB->addSuccessor(loop1MBB);

    // loop1MBB:
    //   l[bhwd]arx dest, ptr
    //   cmp[wd] dest, oldval
    //   bne- exitMBB
    // loop2MBB:
    //   st[bhwd]cx. newval, ptr
    //   bne- loop1MBB
    //   b exitMBB
    // exitMBB:
    BB = loop1MBB;
    BuildMI(BB, dl, TII->get(LoadMnemonic), dest).addReg(ptrA).addReg(ptrB);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), CrReg)
        .addReg(dest)
        .addReg(oldval);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(CrReg)
        .addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);

    BB = loop2MBB;
    BuildMI(BB, dl, TII->get(StoreMnemonic))
        .addReg(newval)
        .addReg(ptrA)
        .addReg(ptrB);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(PPC::CR0)
        .addMBB(loop1MBB);
    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
    BB->addSuccessor(loop1MBB);
    BB->addSuccessor(exitMBB);

    BB = exitMBB;
  } else if (MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
             MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
    // We must use 64-bit registers for addresses when targeting 64-bit,
    // since we're actually doing arithmetic on them. Other registers
    // can be 32-bit.
    bool is64bit = Subtarget.isPPC64();
    bool isLittleEndian = Subtarget.isLittleEndian();
    bool is8bit = MI.getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;

    Register dest = MI.getOperand(0).getReg();
    Register ptrA = MI.getOperand(1).getReg();
    Register ptrB = MI.getOperand(2).getReg();
    Register oldval = MI.getOperand(3).getReg();
    Register newval = MI.getOperand(4).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, loop1MBB);
    F->insert(It, loop2MBB);
    F->insert(It, exitMBB);
    exitMBB->splice(exitMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    const TargetRegisterClass *RC =
        is64bit ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
    const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;

    Register PtrReg = RegInfo.createVirtualRegister(RC);
    Register Shift1Reg = RegInfo.createVirtualRegister(GPRC);
    Register ShiftReg =
        isLittleEndian ? Shift1Reg : RegInfo.createVirtualRegister(GPRC);
    Register NewVal2Reg = RegInfo.createVirtualRegister(GPRC);
    Register NewVal3Reg = RegInfo.createVirtualRegister(GPRC);
    Register OldVal2Reg = RegInfo.createVirtualRegister(GPRC);
    Register OldVal3Reg = RegInfo.createVirtualRegister(GPRC);
    Register MaskReg = RegInfo.createVirtualRegister(GPRC);
    Register Mask2Reg = RegInfo.createVirtualRegister(GPRC);
    Register Mask3Reg = RegInfo.createVirtualRegister(GPRC);
    Register Tmp2Reg = RegInfo.createVirtualRegister(GPRC);
    Register Tmp4Reg = RegInfo.createVirtualRegister(GPRC);
    Register TmpDestReg = RegInfo.createVirtualRegister(GPRC);
    Register Ptr1Reg;
    Register TmpReg = RegInfo.createVirtualRegister(GPRC);
    Register ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
    Register CrReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);

    //  thisMBB:
    //   ...
    //   fallthrough --> loopMBB
    BB->addSuccessor(loop1MBB);

    // The 4-byte load must be aligned, while a char or short may be
    // anywhere in the word. Hence all this nasty bookkeeping code.
    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
    //   xori shift, shift1, 24 [16]
    //   rlwinm ptr, ptr1, 0, 0, 29
    //   slw newval2, newval, shift
    //   slw oldval2, oldval, shift
    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
    //   slw mask, mask2, shift
    //   and newval3, newval2, mask
    //   and oldval3, oldval2, mask
    // loop1MBB:
    //   lwarx tmpDest, ptr
    //   and tmp, tmpDest, mask
    //   cmpw tmp, oldval3
    //   bne- exitMBB
    // loop2MBB:
    //   andc tmp2, tmpDest, mask
    //   or tmp4, tmp2, newval3
    //   stwcx. tmp4, ptr
    //   bne- loop1MBB
    //   b exitMBB
    // exitMBB:
    //   srw dest, tmpDest, shift
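    // Worked example (illustrative, big-endian i8): for a byte at
    // (ptr1 & 3) == 1, rlwinm yields shift1 = 8 and xori with 24 flips it to
    // shift = 16, so the byte's lane in the aligned word is bits 16..23 and
    // mask = 0xFF << 16 selects exactly that lane for the lwarx/stwcx. loop.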
    if (ptrA != ZeroReg) {
      Ptr1Reg = RegInfo.createVirtualRegister(RC);
      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
          .addReg(ptrA)
          .addReg(ptrB);
    } else {
      Ptr1Reg = ptrB;
    }

    // We need to use a 32-bit subregister here to avoid a register class
    // mismatch in 64-bit mode.
    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg)
        .addReg(Ptr1Reg, 0, is64bit ? PPC::sub_32 : 0)
        .addImm(3)
        .addImm(27)
        .addImm(is8bit ? 28 : 27);
    if (!isLittleEndian)
      BuildMI(BB, dl, TII->get(PPC::XORI), ShiftReg)
          .addReg(Shift1Reg)
          .addImm(is8bit ? 24 : 16);
    if (is64bit)
      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
          .addReg(Ptr1Reg)
          .addImm(0)
          .addImm(61);
    else
      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
          .addReg(Ptr1Reg)
          .addImm(0)
          .addImm(0)
          .addImm(29);
    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
        .addReg(newval)
        .addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
        .addReg(oldval)
        .addReg(ShiftReg);
    if (is8bit)
      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
    else {
      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
          .addReg(Mask3Reg)
          .addImm(65535);
    }
    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
        .addReg(Mask2Reg)
        .addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
        .addReg(NewVal2Reg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
        .addReg(OldVal2Reg)
        .addReg(MaskReg);

    BB = loop1MBB;
    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
        .addReg(ZeroReg)
        .addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::CMPW), CrReg)
        .addReg(TmpReg)
        .addReg(OldVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(CrReg)
        .addMBB(exitMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(exitMBB);

    BB = loop2MBB;
    BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
        .addReg(TmpDestReg)
        .addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
        .addReg(Tmp2Reg)
        .addReg(NewVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::STWCX))
        .addReg(Tmp4Reg)
        .addReg(ZeroReg)
        .addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(PPC::PRED_NE)
        .addReg(PPC::CR0)
        .addMBB(loop1MBB);
    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
    BB->addSuccessor(loop1MBB);
    BB->addSuccessor(exitMBB);

    //  exitMBB:
    //   ...
    BB = exitMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest)
        .addReg(TmpDestReg)
        .addReg(ShiftReg);
  } else if (MI.getOpcode() == PPC::FADDrtz) {
    // This pseudo performs an FADD with rounding mode temporarily forced
    // to round-to-zero. We emit this via custom inserter since the FPSCR
    // is not modeled at the SelectionDAG level.
    Register Dest = MI.getOperand(0).getReg();
    Register Src1 = MI.getOperand(1).getReg();
    Register Src2 = MI.getOperand(2).getReg();
    DebugLoc dl = MI.getDebugLoc();

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);

    // Set rounding mode to round-to-zero.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1))
        .addImm(31)
        .addReg(PPC::RM, RegState::ImplicitDefine);

    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0))
        .addImm(30)
        .addReg(PPC::RM, RegState::ImplicitDefine);

    // Perform addition.
    auto MIB = BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest)
                   .addReg(Src1)
                   .addReg(Src2);
    if (MI.getFlag(MachineInstr::NoFPExcept))
      MIB.setMIFlag(MachineInstr::NoFPExcept);

    // Restore FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSFb)).addImm(1).addReg(MFFSReg);
  } else if (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT ||
             MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
             MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8) {
    unsigned Opcode = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8 ||
                       MI.getOpcode() == PPC::ANDI_rec_1_GT_BIT8)
                          ? PPC::ANDI8_rec
                          : PPC::ANDI_rec;
    bool IsEQ = (MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT ||
                 MI.getOpcode() == PPC::ANDI_rec_1_EQ_BIT8);

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register Dest = RegInfo.createVirtualRegister(
        Opcode == PPC::ANDI_rec ? &PPC::GPRCRegClass : &PPC::G8RCRegClass);

    DebugLoc Dl = MI.getDebugLoc();
    BuildMI(*BB, MI, Dl, TII->get(Opcode), Dest)
        .addReg(MI.getOperand(1).getReg())
        .addImm(1);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(IsEQ ? PPC::CR0EQ : PPC::CR0GT);
  } else if (MI.getOpcode() == PPC::TCHECK_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
    BuildMI(*BB, MI, Dl, TII->get(PPC::TCHECK), CRReg);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(CRReg);
  } else if (MI.getOpcode() == PPC::TBEGIN_RET) {
    DebugLoc Dl = MI.getDebugLoc();
    unsigned Imm = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, Dl, TII->get(PPC::TBEGIN)).addImm(Imm);
    BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::COPY),
            MI.getOperand(0).getReg())
        .addReg(PPC::CR0EQ);
  } else if (MI.getOpcode() == PPC::SETRNDi) {
    DebugLoc dl = MI.getDebugLoc();
    Register OldFPSCRReg = MI.getOperand(0).getReg();

    // Save FPSCR value.
    if (MRI.use_empty(OldFPSCRReg))
      BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);
    else
      BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // The floating-point rounding mode is in bits 62:63 of FPSCR, and has
    // the following settings:
    //   00 Round to nearest
    //   01 Round to zero
    //   10 Round to +inf
    //   11 Round to -inf
    //
    // When the operand is an immediate, use its two least significant bits
    // to set bits 62:63 of FPSCR.
    unsigned Mode = MI.getOperand(1).getImm();
    BuildMI(*BB, MI, dl, TII->get((Mode & 1) ? PPC::MTFSB1 : PPC::MTFSB0))
        .addImm(31)
        .addReg(PPC::RM, RegState::ImplicitDefine);

    BuildMI(*BB, MI, dl, TII->get((Mode & 2) ? PPC::MTFSB1 : PPC::MTFSB0))
        .addImm(30)
        .addReg(PPC::RM, RegState::ImplicitDefine);
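    // For example (illustrative): SETRNDi 2 (round to +inf) emits mtfsb0 31
    // (clear FPSCR bit 63) followed by mtfsb1 30 (set bit 62), leaving
    // RN = 0b10.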
  } else if (MI.getOpcode() == PPC::SETRND) {
    DebugLoc dl = MI.getDebugLoc();

    // Copy register from F8RCRegClass::SrcReg to G8RCRegClass::DestReg
    // or copy register from G8RCRegClass::SrcReg to F8RCRegClass::DestReg.
    // If the target doesn't have DirectMove, we should use the stack to do the
    // conversion, because the target doesn't have instructions like mtvsrd
    // or mfvsrd to do this conversion directly.
    auto copyRegFromG8RCOrF8RC = [&](unsigned DestReg, unsigned SrcReg) {
      if (Subtarget.hasDirectMove()) {
        BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), DestReg)
            .addReg(SrcReg);
      } else {
        // Use stack to do the register copy.
        unsigned StoreOp = PPC::STD, LoadOp = PPC::LFD;
        MachineRegisterInfo &RegInfo = F->getRegInfo();
        const TargetRegisterClass *RC = RegInfo.getRegClass(SrcReg);
        if (RC == &PPC::F8RCRegClass) {
          // Copy register from F8RCRegClass to G8RCRegclass.
          assert((RegInfo.getRegClass(DestReg) == &PPC::G8RCRegClass) &&
                 "Unsupported RegClass.");

          StoreOp = PPC::STFD;
          LoadOp = PPC::LD;
        } else {
          // Copy register from G8RCRegClass to F8RCRegclass.
          assert((RegInfo.getRegClass(SrcReg) == &PPC::G8RCRegClass) &&
                 (RegInfo.getRegClass(DestReg) == &PPC::F8RCRegClass) &&
                 "Unsupported RegClass.");
        }

        MachineFrameInfo &MFI = F->getFrameInfo();
        int FrameIdx = MFI.CreateStackObject(8, Align(8), false);

        MachineMemOperand *MMOStore = F->getMachineMemOperand(
            MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
            MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
            MFI.getObjectAlign(FrameIdx));

        // Store the SrcReg into the stack.
        BuildMI(*BB, MI, dl, TII->get(StoreOp))
            .addReg(SrcReg)
            .addImm(0)
            .addFrameIndex(FrameIdx)
            .addMemOperand(MMOStore);

        MachineMemOperand *MMOLoad = F->getMachineMemOperand(
            MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
            MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
            MFI.getObjectAlign(FrameIdx));

        // Load from the stack where SrcReg is stored, and save to DestReg,
        // so we have done the RegClass conversion from RegClass::SrcReg to
        // RegClass::DestReg.
        BuildMI(*BB, MI, dl, TII->get(LoadOp), DestReg)
            .addImm(0)
            .addFrameIndex(FrameIdx)
            .addMemOperand(MMOLoad);
      }
    };
    Register OldFPSCRReg = MI.getOperand(0).getReg();

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // When the operand is a gprc register, use its two least significant bits
    // and the mtfsf instruction to set bits 62:63 of FPSCR.
    //
    //   copy OldFPSCRTmpReg, OldFPSCRReg
    //   (INSERT_SUBREG ExtSrcReg, (IMPLICIT_DEF ImDefReg), SrcOp, 1)
    //   rldimi NewFPSCRTmpReg, ExtSrcReg, OldFPSCRReg, 0, 62
    //   copy NewFPSCRReg, NewFPSCRTmpReg
    //   mtfsf 255, NewFPSCRReg
    MachineOperand SrcOp = MI.getOperand(1);
    MachineRegisterInfo &RegInfo = F->getRegInfo();
    Register OldFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);

    copyRegFromG8RCOrF8RC(OldFPSCRTmpReg, OldFPSCRReg);

    Register ImDefReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
    Register ExtSrcReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);

    // The first operand of INSERT_SUBREG should be a register which has
    // subregisters, we only care about its RegClass, so we should use an
    // IMPLICIT_DEF register.
    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::IMPLICIT_DEF), ImDefReg);
    BuildMI(*BB, MI, dl, TII->get(PPC::INSERT_SUBREG), ExtSrcReg)
        .addReg(ImDefReg)
        .add(SrcOp)
        .addImm(1);

    Register NewFPSCRTmpReg = RegInfo.createVirtualRegister(&PPC::G8RCRegClass);
    BuildMI(*BB, MI, dl, TII->get(PPC::RLDIMI), NewFPSCRTmpReg)
        .addReg(OldFPSCRTmpReg)
        .addReg(ExtSrcReg)
        .addImm(0)
        .addImm(62);

    Register NewFPSCRReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);
    copyRegFromG8RCOrF8RC(NewFPSCRReg, NewFPSCRTmpReg);

    // The mask 255 means: put bits 32:63 of NewFPSCRReg into bits 32:63 of
    // FPSCR.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF))
        .addImm(255)
        .addReg(NewFPSCRReg)
        .addImm(0)
        .addImm(0);
  } else if (MI.getOpcode() == PPC::SETFLM) {
    DebugLoc Dl = MI.getDebugLoc();

    // Result of setflm is previous FPSCR content, so we need to save it first.
    Register OldFPSCRReg = MI.getOperand(0).getReg();
    if (MRI.use_empty(OldFPSCRReg))
      BuildMI(*BB, MI, Dl, TII->get(TargetOpcode::IMPLICIT_DEF), OldFPSCRReg);
    else
      BuildMI(*BB, MI, Dl, TII->get(PPC::MFFS), OldFPSCRReg);

    // Put bits in 32:63 to FPSCR.
    Register NewFPSCRReg = MI.getOperand(1).getReg();
    BuildMI(*BB, MI, Dl, TII->get(PPC::MTFSF))
        .addImm(255)
        .addReg(NewFPSCRReg)
        .addImm(0)
        .addImm(0);
  } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
             MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
    return emitProbedAlloca(MI, BB);
  } else if (MI.getOpcode() == PPC::SPLIT_QUADWORD) {
    DebugLoc DL = MI.getDebugLoc();
    Register Src = MI.getOperand(2).getReg();
    Register Lo = MI.getOperand(0).getReg();
    Register Hi = MI.getOperand(1).getReg();
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
        .addDef(Lo)
        .addUse(Src, 0, PPC::sub_gp8_x1);
    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY))
        .addDef(Hi)
        .addUse(Src, 0, PPC::sub_gp8_x0);
  } else if (MI.getOpcode() == PPC::LQX_PSEUDO ||
             MI.getOpcode() == PPC::STQX_PSEUDO) {
    DebugLoc DL = MI.getDebugLoc();
    // Ptr is used as the ptr_rc_no_r0 part of LQ/STQ's memory operand and
    // holds the sum of RA and RB, so it has to be g8rc_and_g8rc_nox0.
    Register Ptr =
        F->getRegInfo().createVirtualRegister(&PPC::G8RC_and_G8RC_NOX0RegClass);
    Register Val = MI.getOperand(0).getReg();
    Register RA = MI.getOperand(1).getReg();
    Register RB = MI.getOperand(2).getReg();
    BuildMI(*BB, MI, DL, TII->get(PPC::ADD8), Ptr).addReg(RA).addReg(RB);
    BuildMI(*BB, MI, DL,
            MI.getOpcode() == PPC::LQX_PSEUDO ? TII->get(PPC::LQ)
                                              : TII->get(PPC::STQ))
        .addReg(Val, MI.getOpcode() == PPC::LQX_PSEUDO ? RegState::Define : 0)
        .addImm(0)
        .addReg(Ptr);
  } else {
    llvm_unreachable("Unexpected instr type to insert");
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
  // For the estimates, convergence is quadratic, so we essentially double the
  // number of digits correct after every iteration. For both FRE and FRSQRTE,
  // the minimum architected relative accuracy is 2^-5. When hasRecipPrec(),
  // this is 2^-14. IEEE float has 23 digits and double has 52 digits.
  int RefinementSteps = Subtarget.hasRecipPrec() ? 1 : 3;
  if (VT.getScalarType() == MVT::f64)
    RefinementSteps++;
  return RefinementSteps;
}
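// Worked numbers for getEstimateRefinementSteps (illustrative): starting from
// 2^-14 accuracy, one Newton-Raphson step reaches about 2^-28, enough for
// f32's 24-bit significand; f64 takes one extra step, about 2^-56, covering
// its 53 bits. Without hasRecipPrec(), 2^-5 needs three doublings (2^-40)
// for f32 and four for f64.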
SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
                                            const DenormalMode &Mode) const {
  // We only have VSX Vector Test for software Square Root.
  EVT VT = Op.getValueType();
  if (!isTypeLegal(MVT::i1) ||
      (VT != MVT::f64 &&
       ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX())))
    return TargetLowering::getSqrtInputTest(Op, DAG, Mode);

  SDLoc DL(Op);
  // The output register of FTSQRT is a CR field.
  SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op);
  // ftsqrt BF,FRB
  // Let e_b be the unbiased exponent of the double-precision
  // floating-point operand in register FRB.
  // fe_flag is set to 1 if either of the following conditions occurs.
  //   - The double-precision floating-point operand in register FRB is a zero,
  //     a NaN, an infinity, or a negative value.
  //   - e_b is less than or equal to -970.
  // Otherwise fe_flag is set to 0.
  // Both VSX and non-VSX versions would set the EQ bit in the CR if the number
  // is not eligible for iteration (zero/negative/infinity/NaN or the unbiased
  // exponent is less than or equal to -970).
  SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1,
                                    FTSQRT, SRIdxVal),
                 0);
}

SDValue
PPCTargetLowering::getSqrtResultForDenormInput(SDValue Op,
                                               SelectionDAG &DAG) const {
  // We only have VSX Vector Square Root.
  EVT VT = Op.getValueType();
  if (VT != MVT::f64 &&
      ((VT != MVT::v2f64 && VT != MVT::v4f32) || !Subtarget.hasVSX()))
    return TargetLowering::getSqrtResultForDenormInput(Op, DAG);

  return DAG.getNode(PPCISD::FSQRT, SDLoc(Op), VT, Op);
}
SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                           int Enabled, int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);

    // The Newton-Raphson computation with a single constant does not provide
    // enough accuracy on some CPUs.
    UseOneConstNR = !Subtarget.needsTwoConstNR();
    return DAG.getNode(PPCISD::FRSQRTE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}
SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Operand.getValueType();
  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
      (VT == MVT::f64 && Subtarget.hasFRE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
    return DAG.getNode(PPCISD::FRE, SDLoc(Operand), VT, Operand);
  }
  return SDValue();
}
unsigned PPCTargetLowering::combineRepeatedFPDivisors() const {
  // Note: This functionality is used only when unsafe-fp-math is enabled, and
  // on cores with reciprocal estimates (which are used when unsafe-fp-math is
  // enabled for division), this functionality is redundant with the default
  // combiner logic (once the division -> reciprocal/multiply transformation
  // has taken place). As a result, this matters more for older cores than for
  // newer ones.

  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are two or more FDIVs (for embedded cores with only
  // one FP pipeline) or three or more FDIVs (for generic OOO cores); see the
  // illustrative sketch below.
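  // Illustrative sketch (not from the source): with a shared divisor d,
  //   a = x / d; b = y / d; c = z / d
  // becomes
  //   t = 1.0 / d; a = x * t; b = y * t; c = z * t
  // once the number of divisions meets the threshold returned here.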
  switch (Subtarget.getCPUDirective()) {
  default:
    return 3;
  case PPC::DIR_440:
  case PPC::DIR_A2:
  case PPC::DIR_E500:
  case PPC::DIR_E500mc:
  case PPC::DIR_E5500:
    return 2;
  }
}
// isConsecutiveLSLoc needs to work even if all adds have not yet been
// collapsed, and so we need to look through chains of them.
static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base,
                                      int64_t &Offset, SelectionDAG &DAG) {
  if (DAG.isBaseWithConstantOffset(Loc)) {
    Base = Loc.getOperand(0);
    Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();

    // The base might itself be a base plus an offset, and if so, accumulate
    // that as well.
    getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG);
  }
}
static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base,
                               unsigned Bytes, int Dist,
                               SelectionDAG &DAG) {
  if (VT.getSizeInBits() / 8 != Bytes)
    return false;

  SDValue BaseLoc = Base->getBasePtr();
  if (Loc.getOpcode() == ISD::FrameIndex) {
    if (BaseLoc.getOpcode() != ISD::FrameIndex)
      return false;
    const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
    int FS = MFI.getObjectSize(FI);
    int BFS = MFI.getObjectSize(BFI);
    if (FS != BFS || FS != (int)Bytes) return false;
    return MFI.getObjectOffset(FI) == (MFI.getObjectOffset(BFI) + Dist*Bytes);
  }

  SDValue Base1 = Loc, Base2 = BaseLoc;
  int64_t Offset1 = 0, Offset2 = 0;
  getBaseWithConstantOffset(Loc, Base1, Offset1, DAG);
  getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG);
  if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes))
    return true;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const GlobalValue *GV1 = nullptr;
  const GlobalValue *GV2 = nullptr;
  Offset1 = 0;
  Offset2 = 0;
  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
  if (isGA1 && isGA2 && GV1 == GV2)
    return Offset1 == (Offset2 + Dist*Bytes);

  return false;
}
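// For example (illustrative use of isConsecutiveLSLoc): with Bytes = 4 and
// Dist = 1, a load from (Base + 4) is consecutive to a load from Base, since
// Offset1 (4) == Offset2 (0) + Dist * Bytes.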
// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
static bool isConsecutiveLS(SDNode *N, LSBaseSDNode *Base,
                            unsigned Bytes, int Dist,
                            SelectionDAG &DAG) {
  if (LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(N)) {
    EVT VT = LS->getMemoryVT();
    SDValue Loc = LS->getBasePtr();
    return isConsecutiveLSLoc(Loc, VT, Base, Bytes, Dist, DAG);
  }

  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    EVT VT;
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_altivec_lvx:
    case Intrinsic::ppc_altivec_lvxl:
    case Intrinsic::ppc_vsx_lxvw4x:
    case Intrinsic::ppc_vsx_lxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
    case Intrinsic::ppc_vsx_lxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(2), VT, Base, Bytes, Dist, DAG);
  }

  if (N->getOpcode() == ISD::INTRINSIC_VOID) {
    EVT VT;
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default: return false;
    case Intrinsic::ppc_altivec_stvx:
    case Intrinsic::ppc_altivec_stvxl:
    case Intrinsic::ppc_vsx_stxvw4x:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_vsx_stxvw4x_be:
      VT = MVT::v4i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    }

    return isConsecutiveLSLoc(N->getOperand(3), VT, Base, Bytes, Dist, DAG);
  }

  return false;
}
// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true result
// indicates that it is safe to create a new consecutive load adjacent to the
// load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done, otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(ChainNext).second)
      continue;

    if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(ChainNext)) {
      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;

      if (!Visited.count(ChainLD->getChain().getNode()))
        Queue.push_back(ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(O.getNode()))
          Queue.push_back(O.getNode());
    } else
      LoadRoots.insert(ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look
  // through all loads (just the chain uses) and token factors to find a
  // consecutive load.
  Visited.clear();
  Queue.clear();

  for (SDNode *I : LoadRoots) {
    Queue.push_back(I);

    while (!Queue.empty()) {
      SDNode *LoadRoot = Queue.pop_back_val();
      if (!Visited.insert(LoadRoot).second)
        continue;

      if (MemSDNode *ChainLD = dyn_cast<MemSDNode>(LoadRoot))
        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
          return true;

      for (SDNode *U : LoadRoot->uses())
        if (((isa<MemSDNode>(U) &&
              cast<MemSDNode>(U)->getChain().getNode() == LoadRoot) ||
             U->getOpcode() == ISD::TokenFactor) &&
            !Visited.count(U))
          Queue.push_back(U);
    }
  }

  return false;
}
/// This function is called when we have proved that a SETCC node can be
/// replaced by subtraction (and other supporting instructions) so that the
/// result of the comparison is kept in a GPR instead of a CR. This function is
/// purely for codegen purposes and has some flags to guide the codegen
/// process.
static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

  // Zero extend the operands to the largest legal integer. Originally, they
  // must be of a strictly smaller size.
  auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
                         DAG.getConstant(Size, DL, MVT::i32));
  auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
                         DAG.getConstant(Size, DL, MVT::i32));

  // Swap if needed, based on the condition code.
  if (Swap)
    std::swap(Op0, Op1);

  // Subtract extended integers.
  auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);

  // Move the sign bit to the least significant position and zero out the rest.
  // Now the least significant bit carries the result of original comparison.
  auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
                             DAG.getConstant(Size - 1, DL, MVT::i32));
  auto Final = Shifted;

  // Complement the result if needed, based on the condition code.
  if (Complement)
    Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
                        DAG.getConstant(1, DL, MVT::i64));

  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
}
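// Worked example (illustrative): for a 32-bit unsigned (setult a, b) with a
// 64-bit largest legal integer type, the replacement computes
//   t = zext(a) - zext(b);  result = t >> 63
// since the subtraction of zero-extended operands is negative exactly when
// a < b; SETULE/SETUGT/SETUGE are derived by swapping the operands and/or
// complementing the low bit.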
SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Size of integers being compared has a critical role in the following
  // analysis, so we prefer to do this when all types are legal.
  if (!DCI.isAfterLegalizeDAG())
    return SDValue();

  // If all users of SETCC extend its value to a legal integer type
  // then we replace SETCC with a subtraction.
  for (const SDNode *U : N->uses())
    if (U->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();

  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  auto OpSize = N->getOperand(0).getValueSizeInBits();

  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();

  if (OpSize < Size) {
    switch (CC) {
    default: break;
    case ISD::SETULT:
      return generateEquivalentSub(N, Size, false, false, DL, DAG);
    case ISD::SETULE:
      return generateEquivalentSub(N, Size, true, true, DL, DAG);
    case ISD::SETUGT:
      return generateEquivalentSub(N, Size, false, true, DL, DAG);
    case ISD::SETUGE:
      return generateEquivalentSub(N, Size, true, false, DL, DAG);
    }
  }

  return SDValue();
}
SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  assert(Subtarget.useCRBits() && "Expecting to be tracking CR bits");
  // If we're tracking CR bits, we need to be careful that we don't have:
  //   trunc(binary-ops(zext(x), zext(y)))
  // or
  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
  // such that we're unnecessarily moving things into GPRs when it would be
  // better to keep them in CR bits.

  // Note that trunc here can be an actual i1 trunc, or can be the effective
  // truncation that comes from a setcc or select_cc.
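  // For example (illustrative): with i1 values a and b living in CR bits,
  //   (trunc (and (zext a), (zext b)))
  // would move a and b into GPRs just to perform the AND; rewriting it as an
  // i1 AND (lowered to CRAND) lets both values stay in the condition register.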
  if (N->getOpcode() == ISD::TRUNCATE &&
      N->getValueType(0) != MVT::i1)
    return SDValue();

  if (N->getOperand(0).getValueType() != MVT::i32 &&
      N->getOperand(0).getValueType() != MVT::i64)
    return SDValue();

  if (N->getOpcode() == ISD::SETCC ||
      N->getOpcode() == ISD::SELECT_CC) {
    // If we're looking at a comparison, then we need to make sure that the
    // high bits (all except for the first) don't affect the result.
    ISD::CondCode CC =
      cast<CondCodeSDNode>(N->getOperand(
        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
    unsigned OpBits = N->getOperand(0).getValueSizeInBits();

    if (ISD::isSignedIntSetCC(CC)) {
      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
        return SDValue();
    } else if (ISD::isUnsignedIntSetCC(CC)) {
      if (!DAG.MaskedValueIsZero(N->getOperand(0),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
          !DAG.MaskedValueIsZero(N->getOperand(1),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
        return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
                                             : SDValue());
    } else {
      // This is neither a signed nor an unsigned comparison, just make sure
      // that the high bits are equal.
      KnownBits Op1Known = DAG.computeKnownBits(N->getOperand(0));
      KnownBits Op2Known = DAG.computeKnownBits(N->getOperand(1));

      // We don't really care about what is known about the first bit (if
      // anything), so pretend that it is known zero for both to ensure they
      // can be compared as constants.
      Op1Known.Zero.setBit(0); Op1Known.One.clearBit(0);
      Op2Known.Zero.setBit(0); Op2Known.One.clearBit(0);

      if (!Op1Known.isConstant() || !Op2Known.isConstant() ||
          Op1Known.getConstant() != Op2Known.getConstant())
        return SDValue();
    }
  }

  // We now know that the higher-order bits are irrelevant, we just need to
  // make sure that all of the intermediate operations are bit operations, and
  // all inputs are extensions.
  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
      N->getOperand(1).getOpcode() != ISD::AND &&
      N->getOperand(1).getOpcode() != ISD::OR &&
      N->getOperand(1).getOpcode() != ISD::XOR &&
      N->getOperand(1).getOpcode() != ISD::SELECT &&
      N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps, PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  for (unsigned i = 0; i < 2; ++i) {
    if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
         N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
        isa<ConstantSDNode>(N->getOperand(i)))
      Inputs.push_back(N->getOperand(i));
    else
      BinOps.push_back(N->getOperand(i));

    if (N->getOpcode() == ISD::TRUNCATE)
      break;
  }
  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by extensions.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.pop_back_val();

    if (!Visited.insert(BinOp.getNode()).second)
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
           BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
                 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
                 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not an extension or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }
  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (const SDNode *User : Inputs[i].getNode()->uses()) {
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) or SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i] ||
            User->getOperand(1) == Inputs[i])
          return SDValue();
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (const SDNode *User : PromOps[i].getNode()->uses()) {
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) or SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i] ||
            User->getOperand(1) == PromOps[i])
          return SDValue();
      }
    }
  }
  // Replace all inputs with the extension operand.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constants may have users outside the cluster of to-be-promoted nodes,
    // and so we need to replace those as we do the promotions.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
  }

  std::list<HandleSDNode> PromOpHandles;
  for (auto &PromOp : PromOps)
    PromOpHandles.emplace_back(PromOp);

  // Replace all operations (these are all the same, but have a different
  // (i1) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first. Any intermediate truncations or
  // extensions disappear.
  while (!PromOpHandles.empty()) {
    SDValue PromOp = PromOpHandles.back().getValue();
    PromOpHandles.pop_back();

    if (PromOp.getOpcode() == ISD::TRUNCATE ||
        PromOp.getOpcode() == ISD::SIGN_EXTEND ||
        PromOp.getOpcode() == ISD::ZERO_EXTEND ||
        PromOp.getOpcode() == ISD::ANY_EXTEND) {
      if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
          PromOp.getOperand(0).getValueType() != MVT::i1) {
        // The operand is not yet ready (see comment below).
        PromOpHandles.emplace_front(PromOp);
        continue;
      }

      SDValue RepValue = PromOp.getOperand(0);
      if (isa<ConstantSDNode>(RepValue))
        RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);

      DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
      continue;
    }

    unsigned C;
    switch (PromOp.getOpcode()) {
    default: C = 0; break;
    case ISD::SELECT: C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != MVT::i1) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOpHandles.emplace_front(PromOp);
      continue;
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If there are any constant inputs, make sure they're replaced now.
    for (unsigned i = 0; i < 2; ++i)
      if (isa<ConstantSDNode>(Ops[C+i]))
        Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
  }

  // Now we're left with the initial truncation itself.
  if (N->getOpcode() == ISD::TRUNCATE)
    return N->getOperand(0);

  // Otherwise, this is a comparison. The operands to be compared have just
  // changed type (to i1), but everything else is the same.
  return SDValue(N, 0);
}
14075 SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
14076 DAGCombinerInfo &DCI) const {
14077 SelectionDAG &DAG = DCI.DAG;
14080 // If we're tracking CR bits, we need to be careful that we don't have:
14081 // zext(binary-ops(trunc(x), trunc(y)))
14083 // zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
14084 // such that we're unnecessarily moving things into CR bits that can more
14085 // efficiently stay in GPRs. Note that if we're not certain that the high
14086 // bits are set as required by the final extension, we still may need to do
14087 // some masking to get the proper behavior.
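  // For example (an illustrative sketch; the exact result depends on the
  // masking logic below):
  //   (zext i32 (xor i1 (trunc i32 %x), (trunc i32 %y)))
  // can become (and i32 (xor i32 %x, %y), 1), keeping the xor in a GPR
  // rather than computing it in a CR bit.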
14089 // This same functionality is important on PPC64 when dealing with
14090 // 32-to-64-bit extensions; these occur often when 32-bit values are used as
  // the return values of functions. Because it is so similar, it is handled
  // here as well.
14094 if (N->getValueType(0) != MVT::i32 &&
14095 N->getValueType(0) != MVT::i64)
14098 if (!((N->getOperand(0).getValueType() == MVT::i1 && Subtarget.useCRBits()) ||
14099 (N->getOperand(0).getValueType() == MVT::i32 && Subtarget.isPPC64())))
14102 if (N->getOperand(0).getOpcode() != ISD::AND &&
14103 N->getOperand(0).getOpcode() != ISD::OR &&
14104 N->getOperand(0).getOpcode() != ISD::XOR &&
14105 N->getOperand(0).getOpcode() != ISD::SELECT &&
14106 N->getOperand(0).getOpcode() != ISD::SELECT_CC)
14109 SmallVector<SDValue, 4> Inputs;
14110 SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
14111 SmallPtrSet<SDNode *, 16> Visited;
14113 // Visit all inputs, collect all binary operations (and, or, xor and
14114 // select) that are all fed by truncations.
14115 while (!BinOps.empty()) {
14116 SDValue BinOp = BinOps.pop_back_val();
14118 if (!Visited.insert(BinOp.getNode()).second)
14121 PromOps.push_back(BinOp);
14123 for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
14124 // The condition of the select is not promoted.
14125 if (BinOp.getOpcode() == ISD::SELECT && i == 0)
14127 if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
14130 if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
14131 isa<ConstantSDNode>(BinOp.getOperand(i))) {
14132 Inputs.push_back(BinOp.getOperand(i));
14133 } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
14134 BinOp.getOperand(i).getOpcode() == ISD::OR ||
14135 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
14136 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
14137 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
14138 BinOps.push_back(BinOp.getOperand(i));
14140 // We have an input that is not a truncation or another binary
14141 // operation; we'll abort this transformation.
  // The operands of a select that must be truncated when the select is
  // promoted because they are actually part of the to-be-promoted set.
14149 DenseMap<SDNode *, EVT> SelectTruncOp[2];
14151 // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
14154 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
14155 if (isa<ConstantSDNode>(Inputs[i]))
14158 for (SDNode *User : Inputs[i].getNode()->uses()) {
14159 if (User != N && !Visited.count(User))
      // If we're going to promote the non-output-value operand(s) of SELECT or
      // SELECT_CC, record them for truncation.
14164 if (User->getOpcode() == ISD::SELECT) {
14165 if (User->getOperand(0) == Inputs[i])
14166 SelectTruncOp[0].insert(std::make_pair(User,
14167 User->getOperand(0).getValueType()));
14168 } else if (User->getOpcode() == ISD::SELECT_CC) {
14169 if (User->getOperand(0) == Inputs[i])
14170 SelectTruncOp[0].insert(std::make_pair(User,
14171 User->getOperand(0).getValueType()));
14172 if (User->getOperand(1) == Inputs[i])
14173 SelectTruncOp[1].insert(std::make_pair(User,
14174 User->getOperand(1).getValueType()));
14179 for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
14180 for (SDNode *User : PromOps[i].getNode()->uses()) {
14181 if (User != N && !Visited.count(User))
      // If we're going to promote the non-output-value operand(s) of SELECT or
      // SELECT_CC, record them for truncation.
14186 if (User->getOpcode() == ISD::SELECT) {
14187 if (User->getOperand(0) == PromOps[i])
14188 SelectTruncOp[0].insert(std::make_pair(User,
14189 User->getOperand(0).getValueType()));
14190 } else if (User->getOpcode() == ISD::SELECT_CC) {
14191 if (User->getOperand(0) == PromOps[i])
14192 SelectTruncOp[0].insert(std::make_pair(User,
14193 User->getOperand(0).getValueType()));
14194 if (User->getOperand(1) == PromOps[i])
14195 SelectTruncOp[1].insert(std::make_pair(User,
14196 User->getOperand(1).getValueType()));
14201 unsigned PromBits = N->getOperand(0).getValueSizeInBits();
14202 bool ReallyNeedsExt = false;
14203 if (N->getOpcode() != ISD::ANY_EXTEND) {
    // If any of the inputs is not already sign/zero extended, then
    // we'll still need to do that at the end.
14206 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
14207 if (isa<ConstantSDNode>(Inputs[i]))
      unsigned OpBits =
          Inputs[i].getOperand(0).getValueSizeInBits();
14212 assert(PromBits < OpBits && "Truncation not to a smaller bit count?");
14214 if ((N->getOpcode() == ISD::ZERO_EXTEND &&
14215 !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
14216 APInt::getHighBitsSet(OpBits,
14217 OpBits-PromBits))) ||
14218 (N->getOpcode() == ISD::SIGN_EXTEND &&
14219 DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
14220 (OpBits-(PromBits-1)))) {
14221 ReallyNeedsExt = true;
14227 // Replace all inputs, either with the truncation operand, or a
14228 // truncation or extension to the final output type.
14229 for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
14230 // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
14233 if (isa<ConstantSDNode>(Inputs[i]))
14236 SDValue InSrc = Inputs[i].getOperand(0);
14237 if (Inputs[i].getValueType() == N->getValueType(0))
14238 DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
14239 else if (N->getOpcode() == ISD::SIGN_EXTEND)
14240 DAG.ReplaceAllUsesOfValueWith(Inputs[i],
14241 DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
14242 else if (N->getOpcode() == ISD::ZERO_EXTEND)
14243 DAG.ReplaceAllUsesOfValueWith(Inputs[i],
14244 DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
14246 DAG.ReplaceAllUsesOfValueWith(Inputs[i],
14247 DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
14250 std::list<HandleSDNode> PromOpHandles;
14251 for (auto &PromOp : PromOps)
14252 PromOpHandles.emplace_back(PromOp);
14254 // Replace all operations (these are all the same, but have a different
14255 // (promoted) return type). DAG.getNode will validate that the types of
14256 // a binary operator match, so go through the list in reverse so that
14257 // we've likely promoted both operands first.
14258 while (!PromOpHandles.empty()) {
14259 SDValue PromOp = PromOpHandles.back().getValue();
14260 PromOpHandles.pop_back();
    unsigned C;
    switch (PromOp.getOpcode()) {
14264 default: C = 0; break;
14265 case ISD::SELECT: C = 1; break;
14266 case ISD::SELECT_CC: C = 2; break;
14269 if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
14270 PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
14271 (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
14272 PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
14273 // The to-be-promoted operands of this node have not yet been
14274 // promoted (this should be rare because we're going through the
14275 // list backward, but if one of the operands has several users in
14276 // this cluster of to-be-promoted nodes, it is possible).
14277 PromOpHandles.emplace_front(PromOp);
14281 // For SELECT and SELECT_CC nodes, we do a similar check for any
14282 // to-be-promoted comparison inputs.
14283 if (PromOp.getOpcode() == ISD::SELECT ||
14284 PromOp.getOpcode() == ISD::SELECT_CC) {
14285 if ((SelectTruncOp[0].count(PromOp.getNode()) &&
14286 PromOp.getOperand(0).getValueType() != N->getValueType(0)) ||
14287 (SelectTruncOp[1].count(PromOp.getNode()) &&
14288 PromOp.getOperand(1).getValueType() != N->getValueType(0))) {
14289 PromOpHandles.emplace_front(PromOp);
14294 SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
14295 PromOp.getNode()->op_end());
14297 // If this node has constant inputs, then they'll need to be promoted here.
14298 for (unsigned i = 0; i < 2; ++i) {
14299 if (!isa<ConstantSDNode>(Ops[C+i]))
14301 if (Ops[C+i].getValueType() == N->getValueType(0))
14304 if (N->getOpcode() == ISD::SIGN_EXTEND)
14305 Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
14306 else if (N->getOpcode() == ISD::ZERO_EXTEND)
14307 Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
14309 Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
14312 // If we've promoted the comparison inputs of a SELECT or SELECT_CC,
14313 // truncate them again to the original value type.
14314 if (PromOp.getOpcode() == ISD::SELECT ||
14315 PromOp.getOpcode() == ISD::SELECT_CC) {
14316 auto SI0 = SelectTruncOp[0].find(PromOp.getNode());
14317 if (SI0 != SelectTruncOp[0].end())
14318 Ops[0] = DAG.getNode(ISD::TRUNCATE, dl, SI0->second, Ops[0]);
14319 auto SI1 = SelectTruncOp[1].find(PromOp.getNode());
14320 if (SI1 != SelectTruncOp[1].end())
14321 Ops[1] = DAG.getNode(ISD::TRUNCATE, dl, SI1->second, Ops[1]);
14324 DAG.ReplaceAllUsesOfValueWith(PromOp,
14325 DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
14328 // Now we're left with the initial extension itself.
14329 if (!ReallyNeedsExt)
14330 return N->getOperand(0);
  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
14334 if (N->getOpcode() == ISD::ZERO_EXTEND)
14335 return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
14336 DAG.getConstant(APInt::getLowBitsSet(
14337 N->getValueSizeInBits(0), PromBits),
14338 dl, N->getValueType(0)));
14340 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
14341 "Invalid extension type");
14342 EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout());
  SDValue ShiftCst =
      DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy);
14345 return DAG.getNode(
14346 ISD::SRA, dl, N->getValueType(0),
      DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst),
      ShiftCst);
}
14351 SDValue PPCTargetLowering::combineSetCC(SDNode *N,
14352 DAGCombinerInfo &DCI) const {
14353 assert(N->getOpcode() == ISD::SETCC &&
14354 "Should be called with a SETCC node");
14356 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14357 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
14358 SDValue LHS = N->getOperand(0);
14359 SDValue RHS = N->getOperand(1);
14361 // If there is a '0 - y' pattern, canonicalize the pattern to the RHS.
14362 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
14364 std::swap(LHS, RHS);
14366 // x == 0-y --> x+y == 0
14367 // x != 0-y --> x+y != 0
14368 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
14371 SelectionDAG &DAG = DCI.DAG;
14372 EVT VT = N->getValueType(0);
14373 EVT OpVT = LHS.getValueType();
14374 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
14375 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
14379 return DAGCombineTruncBoolExt(N, DCI);
14382 // Is this an extending load from an f32 to an f64?
14383 static bool isFPExtLoad(SDValue Op) {
14384 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode()))
14385 return LD->getExtensionType() == ISD::EXTLOAD &&
           Op.getValueType() == MVT::f64;
  return false;
}
/// Reduces the number of fp-to-int conversions when building a vector.
14392 /// If this vector is built out of floating to integer conversions,
14393 /// transform it to a vector built out of floating point values followed by a
14394 /// single floating to integer conversion of the vector.
14395 /// Namely (build_vector (fptosi $A), (fptosi $B), ...)
14396 /// becomes (fptosi (build_vector ($A, $B, ...)))
14397 SDValue PPCTargetLowering::
14398 combineElementTruncationToVectorTruncation(SDNode *N,
14399 DAGCombinerInfo &DCI) const {
14400 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
14401 "Should be called with a BUILD_VECTOR node");
14403 SelectionDAG &DAG = DCI.DAG;
14406 SDValue FirstInput = N->getOperand(0);
14407 assert(FirstInput.getOpcode() == PPCISD::MFVSR &&
14408 "The input operand must be an fp-to-int conversion.");
  // This combine happens after legalization so the fp_to_[su]i nodes are
  // already converted to PPCISD nodes.
14412 unsigned FirstConversion = FirstInput.getOperand(0).getOpcode();
14413 if (FirstConversion == PPCISD::FCTIDZ ||
14414 FirstConversion == PPCISD::FCTIDUZ ||
14415 FirstConversion == PPCISD::FCTIWZ ||
14416 FirstConversion == PPCISD::FCTIWUZ) {
14417 bool IsSplat = true;
14418 bool Is32Bit = FirstConversion == PPCISD::FCTIWZ ||
14419 FirstConversion == PPCISD::FCTIWUZ;
14420 EVT SrcVT = FirstInput.getOperand(0).getValueType();
14421 SmallVector<SDValue, 4> Ops;
14422 EVT TargetVT = N->getValueType(0);
14423 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
14424 SDValue NextOp = N->getOperand(i);
14425 if (NextOp.getOpcode() != PPCISD::MFVSR)
14427 unsigned NextConversion = NextOp.getOperand(0).getOpcode();
14428 if (NextConversion != FirstConversion)
14430 // If we are converting to 32-bit integers, we need to add an FP_ROUND.
14431 // This is not valid if the input was originally double precision. It is
14432 // also not profitable to do unless this is an extending load in which
14433 // case doing this combine will allow us to combine consecutive loads.
14434 if (Is32Bit && !isFPExtLoad(NextOp.getOperand(0).getOperand(0)))
14436 if (N->getOperand(i) != FirstInput)
14440 // If this is a splat, we leave it as-is since there will be only a single
14441 // fp-to-int conversion followed by a splat of the integer. This is better
14442 // for 32-bit and smaller ints and neutral for 64-bit ints.
14446 // Now that we know we have the right type of node, get its operands
14447 for (int i = 0, e = N->getNumOperands(); i < e; ++i) {
14448 SDValue In = N->getOperand(i).getOperand(0);
14450 // For 32-bit values, we need to add an FP_ROUND node (if we made it
14451 // here, we know that all inputs are extending loads so this is safe).
14453 Ops.push_back(DAG.getUNDEF(SrcVT));
        SDValue Trunc =
            DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, In.getOperand(0),
14457 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
14458 Ops.push_back(Trunc);
14461 Ops.push_back(In.isUndef() ? DAG.getUNDEF(SrcVT) : In.getOperand(0));
    unsigned Opcode;
    if (FirstConversion == PPCISD::FCTIDZ ||
14466 FirstConversion == PPCISD::FCTIWZ)
14467 Opcode = ISD::FP_TO_SINT;
    else
      Opcode = ISD::FP_TO_UINT;
14471 EVT NewVT = TargetVT == MVT::v2i64 ? MVT::v2f64 : MVT::v4f32;
14472 SDValue BV = DAG.getBuildVector(NewVT, dl, Ops);
14473 return DAG.getNode(Opcode, dl, TargetVT, BV);
14478 /// Reduce the number of loads when building a vector.
/// A vector built out of multiple loads can be converted to a load
14481 /// of the vector type if the loads are consecutive. If the loads are
14482 /// consecutive but in descending order, a shuffle is added at the end
14483 /// to reorder the vector.
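///
/// For example (an illustrative sketch):
///   (v4i32 (build_vector (load %p), (load %p+4), (load %p+8), (load %p+12)))
/// becomes a single (v4i32 (load %p)); if the loads were instead at
/// descending addresses, the wide load is followed by a reversing
/// vector_shuffle.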
14484 static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) {
14485 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
14486 "Should be called with a BUILD_VECTOR node");
  // Return early for non-byte-sized types, as they can't be consecutive.
14491 if (!N->getValueType(0).getVectorElementType().isByteSized())
14494 bool InputsAreConsecutiveLoads = true;
14495 bool InputsAreReverseConsecutive = true;
14496 unsigned ElemSize = N->getValueType(0).getScalarType().getStoreSize();
14497 SDValue FirstInput = N->getOperand(0);
14498 bool IsRoundOfExtLoad = false;
14499 LoadSDNode *FirstLoad = nullptr;
14501 if (FirstInput.getOpcode() == ISD::FP_ROUND &&
14502 FirstInput.getOperand(0).getOpcode() == ISD::LOAD) {
14503 FirstLoad = cast<LoadSDNode>(FirstInput.getOperand(0));
14504 IsRoundOfExtLoad = FirstLoad->getExtensionType() == ISD::EXTLOAD;
14506 // Not a build vector of (possibly fp_rounded) loads.
14507 if ((!IsRoundOfExtLoad && FirstInput.getOpcode() != ISD::LOAD) ||
14508 N->getNumOperands() == 1)
14511 if (!IsRoundOfExtLoad)
14512 FirstLoad = cast<LoadSDNode>(FirstInput);
14514 SmallVector<LoadSDNode *, 4> InputLoads;
14515 InputLoads.push_back(FirstLoad);
14516 for (int i = 1, e = N->getNumOperands(); i < e; ++i) {
14517 // If any inputs are fp_round(extload), they all must be.
14518 if (IsRoundOfExtLoad && N->getOperand(i).getOpcode() != ISD::FP_ROUND)
14521 SDValue NextInput = IsRoundOfExtLoad ? N->getOperand(i).getOperand(0) :
14523 if (NextInput.getOpcode() != ISD::LOAD)
14526 SDValue PreviousInput =
14527 IsRoundOfExtLoad ? N->getOperand(i-1).getOperand(0) : N->getOperand(i-1);
14528 LoadSDNode *LD1 = cast<LoadSDNode>(PreviousInput);
14529 LoadSDNode *LD2 = cast<LoadSDNode>(NextInput);
14531 // If any inputs are fp_round(extload), they all must be.
14532 if (IsRoundOfExtLoad && LD2->getExtensionType() != ISD::EXTLOAD)
14535 // We only care about regular loads. The PPC-specific load intrinsics
14536 // will not lead to a merge opportunity.
14537 if (!DAG.areNonVolatileConsecutiveLoads(LD2, LD1, ElemSize, 1))
14538 InputsAreConsecutiveLoads = false;
14539 if (!DAG.areNonVolatileConsecutiveLoads(LD1, LD2, ElemSize, 1))
14540 InputsAreReverseConsecutive = false;
14542 // Exit early if the loads are neither consecutive nor reverse consecutive.
14543 if (!InputsAreConsecutiveLoads && !InputsAreReverseConsecutive)
14545 InputLoads.push_back(LD2);
14548 assert(!(InputsAreConsecutiveLoads && InputsAreReverseConsecutive) &&
14549 "The loads cannot be both consecutive and reverse consecutive.");
  SDValue WideLoad;
  SDValue ReturnSDVal;
14553 if (InputsAreConsecutiveLoads) {
14554 assert(FirstLoad && "Input needs to be a LoadSDNode.");
14555 WideLoad = DAG.getLoad(N->getValueType(0), dl, FirstLoad->getChain(),
14556 FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
14557 FirstLoad->getAlign());
14558 ReturnSDVal = WideLoad;
14559 } else if (InputsAreReverseConsecutive) {
14560 LoadSDNode *LastLoad = InputLoads.back();
14561 assert(LastLoad && "Input needs to be a LoadSDNode.");
14562 WideLoad = DAG.getLoad(N->getValueType(0), dl, LastLoad->getChain(),
14563 LastLoad->getBasePtr(), LastLoad->getPointerInfo(),
14564 LastLoad->getAlign());
14565 SmallVector<int, 16> Ops;
14566 for (int i = N->getNumOperands() - 1; i >= 0; i--)
14569 ReturnSDVal = DAG.getVectorShuffle(N->getValueType(0), dl, WideLoad,
14570 DAG.getUNDEF(N->getValueType(0)), Ops);
14574 for (auto *LD : InputLoads)
14575 DAG.makeEquivalentMemoryOrdering(LD, WideLoad);
14576 return ReturnSDVal;
// This function adds the vector_shuffle needed to get the elements of the
// vector extract into the correct positions, as specified by the
// CorrectElems encoding.
14582 static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
14583 SDValue Input, uint64_t Elems,
14584 uint64_t CorrectElems) {
14587 unsigned NumElems = Input.getValueType().getVectorNumElements();
14588 SmallVector<int, 16> ShuffleMask(NumElems, -1);
14590 // Knowing the element indices being extracted from the original
14591 // vector and the order in which they're being inserted, just put
14592 // them at element indices required for the instruction.
14593 for (unsigned i = 0; i < N->getNumOperands(); i++) {
14594 if (DAG.getDataLayout().isLittleEndian())
14595 ShuffleMask[CorrectElems & 0xF] = Elems & 0xF;
    else
      ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4;
14598 CorrectElems = CorrectElems >> 8;
14599 Elems = Elems >> 8;
  SDValue Shuffle =
      DAG.getVectorShuffle(Input.getValueType(), dl, Input,
14604 DAG.getUNDEF(Input.getValueType()), ShuffleMask);
14606 EVT VT = N->getValueType(0);
14607 SDValue Conv = DAG.getBitcast(VT, Shuffle);
14609 EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
14610 Input.getValueType().getVectorElementType(),
14611 VT.getVectorNumElements());
14612 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
14613 DAG.getValueType(ExtVT));
14616 // Look for build vector patterns where input operands come from sign
14617 // extended vector_extract elements of specific indices. If the correct indices
// aren't used, add a vector shuffle to fix up the indices and create a
// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
14620 // during instruction selection.
14621 static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
14622 // This array encodes the indices that the vector sign extend instructions
14623 // extract from when extending from one type to another for both BE and LE.
  // The right nibble of each byte corresponds to the LE indices,
  // and the left nibble of each byte corresponds to the BE indices.
14626 // For example: 0x3074B8FC byte->word
14627 // For LE: the allowed indices are: 0x0,0x4,0x8,0xC
14628 // For BE: the allowed indices are: 0x3,0x7,0xB,0xF
14629 // For example: 0x000070F8 byte->double word
14630 // For LE: the allowed indices are: 0x0,0x8
14631 // For BE: the allowed indices are: 0x7,0xF
14632 uint64_t TargetElems[] = {
14633 0x3074B8FC, // b->w
14634 0x000070F8, // b->d
14635 0x10325476, // h->w
14636 0x00003074, // h->d
14637 0x00001032, // w->d
  uint64_t Elems = 0;
  int Index;
  SDValue Input;
14644 auto isSExtOfVecExtract = [&](SDValue Op) -> bool {
14647 if (Op.getOpcode() != ISD::SIGN_EXTEND &&
        Op.getOpcode() != ISD::SIGN_EXTEND_INREG)
      return false;
14651 // A SIGN_EXTEND_INREG might be fed by an ANY_EXTEND to produce a value
14652 // of the right width.
14653 SDValue Extract = Op.getOperand(0);
14654 if (Extract.getOpcode() == ISD::ANY_EXTEND)
14655 Extract = Extract.getOperand(0);
    if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return false;
14659 ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
    if (!ExtOp)
      return false;

    Index = ExtOp->getZExtValue();
    if (Input && Input != Extract.getOperand(0))
      return false;
14668 Input = Extract.getOperand(0);
14670 Elems = Elems << 8;
    Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4;
    Elems |= Index;
    return true;
  };
  // If the build vector operands aren't sign-extended vector extracts
  // of the same input vector, then return.
14679 for (unsigned i = 0; i < N->getNumOperands(); i++) {
    if (!isSExtOfVecExtract(N->getOperand(i))) {
      return SDValue();
    }
  }
  // If the vector extract indices are not correct, add the appropriate
  // shuffle.
14687 int TgtElemArrayIdx;
14688 int InputSize = Input.getValueType().getScalarSizeInBits();
14689 int OutputSize = N->getValueType(0).getScalarSizeInBits();
14690 if (InputSize + OutputSize == 40)
14691 TgtElemArrayIdx = 0;
14692 else if (InputSize + OutputSize == 72)
14693 TgtElemArrayIdx = 1;
14694 else if (InputSize + OutputSize == 48)
14695 TgtElemArrayIdx = 2;
14696 else if (InputSize + OutputSize == 80)
14697 TgtElemArrayIdx = 3;
14698 else if (InputSize + OutputSize == 96)
    TgtElemArrayIdx = 4;
  else
    return SDValue();
14703 uint64_t CorrectElems = TargetElems[TgtElemArrayIdx];
14704 CorrectElems = DAG.getDataLayout().isLittleEndian()
14705 ? CorrectElems & 0x0F0F0F0F0F0F0F0F
14706 : CorrectElems & 0xF0F0F0F0F0F0F0F0;
14707 if (Elems != CorrectElems) {
14708 return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems);
  }

  // Regular lowering will catch cases where a shuffle is not needed.
  return SDValue();
}
14715 // Look for the pattern of a load from a narrow width to i128, feeding
14716 // into a BUILD_VECTOR of v1i128. Replace this sequence with a PPCISD node
14717 // (LXVRZX). This node represents a zero extending load that will be matched
14718 // to the Load VSX Vector Rightmost instructions.
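// For example (an illustrative sketch):
//   (v1i128 (build_vector (i128 (zextload i64 from %p))))
// becomes (PPCISD::LXVRZX %p, 64), where the final operand is the load
// width in bits.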
14719 static SDValue combineBVZEXTLOAD(SDNode *N, SelectionDAG &DAG) {
14722 // This combine is only eligible for a BUILD_VECTOR of v1i128.
14723 if (N->getValueType(0) != MVT::v1i128)
14726 SDValue Operand = N->getOperand(0);
14727 // Proceed with the transformation if the operand to the BUILD_VECTOR
14728 // is a load instruction.
14729 if (Operand.getOpcode() != ISD::LOAD)
14732 auto *LD = cast<LoadSDNode>(Operand);
14733 EVT MemoryType = LD->getMemoryVT();
  // This transformation is only valid if we are loading either a byte,
  // halfword, word, or doubleword.
14737 bool ValidLDType = MemoryType == MVT::i8 || MemoryType == MVT::i16 ||
14738 MemoryType == MVT::i32 || MemoryType == MVT::i64;
14740 // Ensure that the load from the narrow width is being zero extended to i128.
14741 if (!ValidLDType ||
14742 (LD->getExtensionType() != ISD::ZEXTLOAD &&
14743 LD->getExtensionType() != ISD::EXTLOAD))
14746 SDValue LoadOps[] = {
14747 LD->getChain(), LD->getBasePtr(),
14748 DAG.getIntPtrConstant(MemoryType.getScalarSizeInBits(), DL)};
14750 return DAG.getMemIntrinsicNode(PPCISD::LXVRZX, DL,
14751 DAG.getVTList(MVT::v1i128, MVT::Other),
14752 LoadOps, MemoryType, LD->getMemOperand());
14755 SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N,
14756 DAGCombinerInfo &DCI) const {
14757 assert(N->getOpcode() == ISD::BUILD_VECTOR &&
14758 "Should be called with a BUILD_VECTOR node");
14760 SelectionDAG &DAG = DCI.DAG;
14763 if (!Subtarget.hasVSX())
14766 // The target independent DAG combiner will leave a build_vector of
14767 // float-to-int conversions intact. We can generate MUCH better code for
14768 // a float-to-int conversion of a vector of floats.
14769 SDValue FirstInput = N->getOperand(0);
14770 if (FirstInput.getOpcode() == PPCISD::MFVSR) {
14771 SDValue Reduced = combineElementTruncationToVectorTruncation(N, DCI);
  // If we're building a vector out of consecutive loads, just load that
  // vector type.
14778 SDValue Reduced = combineBVOfConsecutiveLoads(N, DAG);
14782 // If we're building a vector out of extended elements from another vector
14783 // we have P9 vector integer extend instructions. The code assumes legal
  // input types (i.e. it can't handle things like v4i16), so do not run before
  // legalization.
14786 if (Subtarget.hasP9Altivec() && !DCI.isBeforeLegalize()) {
14787 Reduced = combineBVOfVecSExt(N, DAG);
14792 // On Power10, the Load VSX Vector Rightmost instructions can be utilized
14793 // if this is a BUILD_VECTOR of v1i128, and if the operand to the BUILD_VECTOR
14794 // is a load from <valid narrow width> to i128.
14795 if (Subtarget.isISA3_1()) {
14796 SDValue BVOfZLoad = combineBVZEXTLOAD(N, DAG);
14801 if (N->getValueType(0) != MVT::v2f64)
14805 // (build_vector ([su]int_to_fp (extractelt 0)), [su]int_to_fp (extractelt 1))
14806 if (FirstInput.getOpcode() != ISD::SINT_TO_FP &&
14807 FirstInput.getOpcode() != ISD::UINT_TO_FP)
14809 if (N->getOperand(1).getOpcode() != ISD::SINT_TO_FP &&
14810 N->getOperand(1).getOpcode() != ISD::UINT_TO_FP)
14812 if (FirstInput.getOpcode() != N->getOperand(1).getOpcode())
14815 SDValue Ext1 = FirstInput.getOperand(0);
14816 SDValue Ext2 = N->getOperand(1).getOperand(0);
  if (Ext1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14818 Ext2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14821 ConstantSDNode *Ext1Op = dyn_cast<ConstantSDNode>(Ext1.getOperand(1));
14822 ConstantSDNode *Ext2Op = dyn_cast<ConstantSDNode>(Ext2.getOperand(1));
14823 if (!Ext1Op || !Ext2Op)
14825 if (Ext1.getOperand(0).getValueType() != MVT::v4i32 ||
14826 Ext1.getOperand(0) != Ext2.getOperand(0))
14829 int FirstElem = Ext1Op->getZExtValue();
14830 int SecondElem = Ext2Op->getZExtValue();
  int SubvecIdx;
  if (FirstElem == 0 && SecondElem == 1)
14833 SubvecIdx = Subtarget.isLittleEndian() ? 1 : 0;
14834 else if (FirstElem == 2 && SecondElem == 3)
14835 SubvecIdx = Subtarget.isLittleEndian() ? 0 : 1;
14839 SDValue SrcVec = Ext1.getOperand(0);
14840 auto NodeType = (N->getOperand(1).getOpcode() == ISD::SINT_TO_FP) ?
14841 PPCISD::SINT_VEC_TO_FP : PPCISD::UINT_VEC_TO_FP;
14842 return DAG.getNode(NodeType, dl, MVT::v2f64,
14843 SrcVec, DAG.getIntPtrConstant(SubvecIdx, dl));
14846 SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N,
14847 DAGCombinerInfo &DCI) const {
14848 assert((N->getOpcode() == ISD::SINT_TO_FP ||
14849 N->getOpcode() == ISD::UINT_TO_FP) &&
14850 "Need an int -> FP conversion node here");
14852 if (useSoftFloat() || !Subtarget.has64BitSupport())
14855 SelectionDAG &DAG = DCI.DAG;
14859 // Don't handle ppc_fp128 here or conversions that are out-of-range capable
14860 // from the hardware.
14861 if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
14863 if (!Op.getOperand(0).getValueType().isSimple())
14865 if (Op.getOperand(0).getValueType().getSimpleVT() <= MVT(MVT::i1) ||
14866 Op.getOperand(0).getValueType().getSimpleVT() > MVT(MVT::i64))
14869 SDValue FirstOperand(Op.getOperand(0));
14870 bool SubWordLoad = FirstOperand.getOpcode() == ISD::LOAD &&
14871 (FirstOperand.getValueType() == MVT::i8 ||
14872 FirstOperand.getValueType() == MVT::i16);
14873 if (Subtarget.hasP9Vector() && Subtarget.hasP9Altivec() && SubWordLoad) {
14874 bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
14875 bool DstDouble = Op.getValueType() == MVT::f64;
14876 unsigned ConvOp = Signed ?
14877 (DstDouble ? PPCISD::FCFID : PPCISD::FCFIDS) :
14878 (DstDouble ? PPCISD::FCFIDU : PPCISD::FCFIDUS);
14879 SDValue WidthConst =
      DAG.getIntPtrConstant(FirstOperand.getValueType() == MVT::i8 ? 1 : 2,
                            dl, false);
14882 LoadSDNode *LDN = cast<LoadSDNode>(FirstOperand.getNode());
14883 SDValue Ops[] = { LDN->getChain(), LDN->getBasePtr(), WidthConst };
14884 SDValue Ld = DAG.getMemIntrinsicNode(PPCISD::LXSIZX, dl,
14885 DAG.getVTList(MVT::f64, MVT::Other),
14886 Ops, MVT::i8, LDN->getMemOperand());
    // For signed conversion, we need to sign-extend the value in the VSR.
    if (Signed) {
14890 SDValue ExtOps[] = { Ld, WidthConst };
14891 SDValue Ext = DAG.getNode(PPCISD::VEXTS, dl, MVT::f64, ExtOps);
14892 return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ext);
    }
    return DAG.getNode(ConvOp, dl, DstDouble ? MVT::f64 : MVT::f32, Ld);
  // For i32 intermediate values, unfortunately, the conversion functions
  // leave the upper 32 bits of the value undefined. Within the set of
14900 // scalar instructions, we have no method for zero- or sign-extending the
14901 // value. Thus, we cannot handle i32 intermediate values here.
14902 if (Op.getOperand(0).getValueType() == MVT::i32)
14905 assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
14906 "UINT_TO_FP is supported only with FPCVT");
14908 // If we have FCFIDS, then use it when converting to single-precision.
14909 // Otherwise, convert to double-precision and then round.
14910 unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                       ? (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDUS
                                                            : PPCISD::FCFIDS)
                       : (Op.getOpcode() == ISD::UINT_TO_FP ? PPCISD::FCFIDU
                                                            : PPCISD::FCFID);
  MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32)
                  ? MVT::f32
                  : MVT::f64;
  // If we're converting from a float to an int and back to a float again,
14920 // then we don't need the store/load pair at all.
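  // For example (an illustrative sketch):
  //   (f64 (sint_to_fp (fp_to_sint f64 %x)))
  // becomes (PPCISD::FCFID (PPCISD::FCTIDZ %x)) with no memory traffic.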
14921 if ((Op.getOperand(0).getOpcode() == ISD::FP_TO_UINT &&
14922 Subtarget.hasFPCVT()) ||
14923 (Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT)) {
14924 SDValue Src = Op.getOperand(0).getOperand(0);
14925 if (Src.getValueType() == MVT::f32) {
14926 Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
14927 DCI.AddToWorklist(Src.getNode());
14928 } else if (Src.getValueType() != MVT::f64) {
14929 // Make sure that we don't pick up a ppc_fp128 source value.
      return SDValue();
    }

    unsigned FCTOp =
      Op.getOperand(0).getOpcode() == ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
                                                        PPCISD::FCTIDUZ;
14937 SDValue Tmp = DAG.getNode(FCTOp, dl, MVT::f64, Src);
14938 SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Tmp);
14940 if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT()) {
14941 FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP,
14942 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
14943 DCI.AddToWorklist(FP.getNode());
14952 // expandVSXLoadForLE - Convert VSX loads (which may be intrinsics for
14953 // builtins) into loads with swaps.
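// For example, a little endian VSX load is expanded roughly as
// (illustrative):
//   (v2f64 (PPCISD::XXSWAPD (PPCISD::LXVD2X %addr)))
// with a bitcast back to the original type for non-v2f64 vectors.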
14954 SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
14955 DAGCombinerInfo &DCI) const {
  // Delay VSX load for LE combine until after LegalizeOps to prioritize other
  // combines.
14958 if (DCI.isBeforeLegalizeOps())
14961 SelectionDAG &DAG = DCI.DAG;
  SDValue Chain;
  SDValue Base;
  MachineMemOperand *MMO;
14967 switch (N->getOpcode()) {
14969 llvm_unreachable("Unexpected opcode for little endian VSX load");
14971 LoadSDNode *LD = cast<LoadSDNode>(N);
14972 Chain = LD->getChain();
14973 Base = LD->getBasePtr();
14974 MMO = LD->getMemOperand();
14975 // If the MMO suggests this isn't a load of a full vector, leave
14976 // things alone. For a built-in, we have to make the change for
14977 // correctness, so if there is a size problem that will be a bug.
14978 if (MMO->getSize() < 16)
14982 case ISD::INTRINSIC_W_CHAIN: {
14983 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
14984 Chain = Intrin->getChain();
14985 // Similarly to the store case below, Intrin->getBasePtr() doesn't get
14986 // us what we want. Get operand 2 instead.
14987 Base = Intrin->getOperand(2);
14988 MMO = Intrin->getMemOperand();
14993 MVT VecTy = N->getValueType(0).getSimpleVT();
14995 SDValue LoadOps[] = { Chain, Base };
14996 SDValue Load = DAG.getMemIntrinsicNode(PPCISD::LXVD2X, dl,
14997 DAG.getVTList(MVT::v2f64, MVT::Other),
14998 LoadOps, MVT::v2f64, MMO);
15000 DCI.AddToWorklist(Load.getNode());
15001 Chain = Load.getValue(1);
15002 SDValue Swap = DAG.getNode(
15003 PPCISD::XXSWAPD, dl, DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Load);
15004 DCI.AddToWorklist(Swap.getNode());
15006 // Add a bitcast if the resulting load type doesn't match v2f64.
15007 if (VecTy != MVT::v2f64) {
15008 SDValue N = DAG.getNode(ISD::BITCAST, dl, VecTy, Swap);
15009 DCI.AddToWorklist(N.getNode());
15010 // Package {bitcast value, swap's chain} to match Load's shape.
15011 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VecTy, MVT::Other),
15012 N, Swap.getValue(1));
15018 // expandVSXStoreForLE - Convert VSX stores (which may be intrinsics for
15019 // builtins) into stores with swaps.
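// Mirroring the load case, the value is swapped before being stored
// (illustrative):
//   (PPCISD::STXVD2X (PPCISD::XXSWAPD %val), %addr)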
15020 SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
15021 DAGCombinerInfo &DCI) const {
  // Delay VSX store for LE combine until after LegalizeOps to prioritize other
  // combines.
15024 if (DCI.isBeforeLegalizeOps())
15027 SelectionDAG &DAG = DCI.DAG;
  SDValue Chain;
  SDValue Base;
  unsigned SrcOpnd;
  MachineMemOperand *MMO;
15034 switch (N->getOpcode()) {
15036 llvm_unreachable("Unexpected opcode for little endian VSX store");
15038 StoreSDNode *ST = cast<StoreSDNode>(N);
15039 Chain = ST->getChain();
15040 Base = ST->getBasePtr();
15041 MMO = ST->getMemOperand();
15043 // If the MMO suggests this isn't a store of a full vector, leave
15044 // things alone. For a built-in, we have to make the change for
15045 // correctness, so if there is a size problem that will be a bug.
15046 if (MMO->getSize() < 16)
15050 case ISD::INTRINSIC_VOID: {
15051 MemIntrinsicSDNode *Intrin = cast<MemIntrinsicSDNode>(N);
15052 Chain = Intrin->getChain();
15053 // Intrin->getBasePtr() oddly does not get what we want.
15054 Base = Intrin->getOperand(3);
15055 MMO = Intrin->getMemOperand();
15061 SDValue Src = N->getOperand(SrcOpnd);
15062 MVT VecTy = Src.getValueType().getSimpleVT();
  // All stores are done as v2f64, with a bitcast added beforehand if needed.
15065 if (VecTy != MVT::v2f64) {
15066 Src = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Src);
15067 DCI.AddToWorklist(Src.getNode());
15070 SDValue Swap = DAG.getNode(PPCISD::XXSWAPD, dl,
15071 DAG.getVTList(MVT::v2f64, MVT::Other), Chain, Src);
15072 DCI.AddToWorklist(Swap.getNode());
15073 Chain = Swap.getValue(1);
15074 SDValue StoreOps[] = { Chain, Swap, Base };
15075 SDValue Store = DAG.getMemIntrinsicNode(PPCISD::STXVD2X, dl,
15076 DAG.getVTList(MVT::Other),
15077 StoreOps, VecTy, MMO);
15078 DCI.AddToWorklist(Store.getNode());
15082 // Handle DAG combine for STORE (FP_TO_INT F).
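// For example (an illustrative sketch):
//   (store (fp_to_sint i32 (f64 %f)), %p)
// can become a PPCISD::ST_VSR_SCAL_INT node that stores the converted
// value directly from a VSR, avoiding a round trip through a GPR.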
15083 SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
15084 DAGCombinerInfo &DCI) const {
15085 SelectionDAG &DAG = DCI.DAG;
15087 unsigned Opcode = N->getOperand(1).getOpcode();
15089 bool Strict = N->getOperand(1)->isStrictFPOpcode();
15091 assert((Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
15092 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT)
15093 && "Not a FP_TO_INT Instruction!");
15095 SDValue Val = N->getOperand(1).getOperand(Strict ? 1 : 0);
15096 EVT Op1VT = N->getOperand(1).getValueType();
15097 EVT ResVT = Val.getValueType();
15099 if (!Subtarget.hasVSX() || !Subtarget.hasFPCVT() || !isTypeLegal(ResVT))
15102 // Only perform combine for conversion to i64/i32 or power9 i16/i8.
15103 bool ValidTypeForStoreFltAsInt =
15104 (Op1VT == MVT::i32 || (Op1VT == MVT::i64 && Subtarget.isPPC64()) ||
15105 (Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
15107 // TODO: Lower conversion from f128 on all VSX targets
15108 if (ResVT == MVT::ppcf128 || (ResVT == MVT::f128 && !Subtarget.hasP9Vector()))
15111 if ((Op1VT != MVT::i64 && !Subtarget.hasP8Vector()) ||
15112 cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
15115 Val = convertFPToInt(N->getOperand(1), DAG, Subtarget);
15117 // Set number of bytes being converted.
15118 unsigned ByteSize = Op1VT.getScalarSizeInBits() / 8;
15119 SDValue Ops[] = {N->getOperand(0), Val, N->getOperand(2),
15120 DAG.getIntPtrConstant(ByteSize, dl, false),
15121 DAG.getValueType(Op1VT)};
15123 Val = DAG.getMemIntrinsicNode(PPCISD::ST_VSR_SCAL_INT, dl,
15124 DAG.getVTList(MVT::Other), Ops,
15125 cast<StoreSDNode>(N)->getMemoryVT(),
15126 cast<StoreSDNode>(N)->getMemOperand());
15131 static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
  // Check that the source of the element keeps flipping
  // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
15134 bool PrevElemFromFirstVec = Mask[0] < NumElts;
15135 for (int i = 1, e = Mask.size(); i < e; i++) {
    if (PrevElemFromFirstVec && Mask[i] < NumElts)
      return false;
    if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
      return false;
    PrevElemFromFirstVec = !PrevElemFromFirstVec;
  }
  return true;
}
15145 static bool isSplatBV(SDValue Op) {
  if (Op.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  SDValue FirstOp;

  // Find first non-undef input.
15151 for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
15152 FirstOp = Op.getOperand(i);
15153 if (!FirstOp.isUndef())
15157 // All inputs are undef or the same as the first non-undef input.
15158 for (int i = 1, e = Op.getNumOperands(); i < e; i++)
15159 if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
15164 static SDValue isScalarToVec(SDValue Op) {
15165 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
15167 if (Op.getOpcode() != ISD::BITCAST)
15169 Op = Op.getOperand(0);
15170 if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
15175 // Fix up the shuffle mask to account for the fact that the result of
15176 // scalar_to_vector is not in lane zero. This just takes all values in
15177 // the ranges specified by the min/max indices and adds the number of
15178 // elements required to ensure each element comes from the respective
15179 // position in the valid lane.
15180 // On little endian, that's just the corresponding element in the other
15181 // half of the vector. On big endian, it is in the same half but right
15182 // justified rather than left justified in that half.
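// For example (an illustrative sketch): for a v4i32 shuffle on little
// endian, HalfVec == 2, so a mask entry of 0 that refers to a permuted
// scalar_to_vector result becomes 2, the element where the scalar actually
// resides.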
15183 static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
15184 int LHSMaxIdx, int RHSMinIdx,
15185 int RHSMaxIdx, int HalfVec,
15186 unsigned ValidLaneWidth,
15187 const PPCSubtarget &Subtarget) {
15188 for (int i = 0, e = ShuffV.size(); i < e; i++) {
15189 int Idx = ShuffV[i];
15190 if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
      ShuffV[i] +=
          Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth;
  }
}
15196 // Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
15197 // the original is:
15198 // (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
15199 // In such a case, just change the shuffle mask to extract the element
15200 // from the permuted index.
15201 static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG,
15202 const PPCSubtarget &Subtarget) {
15203 SDLoc dl(OrigSToV);
15204 EVT VT = OrigSToV.getValueType();
15205 assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
15206 "Expecting a SCALAR_TO_VECTOR here");
15207 SDValue Input = OrigSToV.getOperand(0);
15209 if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15210 ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
15211 SDValue OrigVector = Input.getOperand(0);
15213 // Can't handle non-const element indices or different vector types
15214 // for the input to the extract and the output of the scalar_to_vector.
15215 if (Idx && VT == OrigVector.getValueType()) {
15216 unsigned NumElts = VT.getVectorNumElements();
      assert(NumElts > 1 &&
             "Cannot produce a permuted scalar_to_vector for one element vector");
15220 SmallVector<int, 16> NewMask(NumElts, -1);
15221 unsigned ResultInElt = NumElts / 2;
15222 ResultInElt -= Subtarget.isLittleEndian() ? 0 : 1;
15223 NewMask[ResultInElt] = Idx->getZExtValue();
15224 return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
15227 return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,
15228 OrigSToV.getOperand(0));
15231 // On little endian subtargets, combine shuffles such as:
15232 // vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
// into:
// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
15235 // because the latter can be matched to a single instruction merge.
15236 // Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
15237 // to put the value into element zero. Adjust the shuffle mask so that the
15238 // vector can remain in permuted form (to prevent a swap prior to a shuffle).
15239 // On big endian targets, this is still useful for SCALAR_TO_VECTOR
15240 // nodes with elements smaller than doubleword because all the ways
15241 // of getting scalar data into a vector register put the value in the
15242 // rightmost element of the left half of the vector.
15243 SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
15244 SelectionDAG &DAG) const {
15245 SDValue LHS = SVN->getOperand(0);
15246 SDValue RHS = SVN->getOperand(1);
15247 auto Mask = SVN->getMask();
15248 int NumElts = LHS.getValueType().getVectorNumElements();
15249 SDValue Res(SVN, 0);
15251 bool IsLittleEndian = Subtarget.isLittleEndian();
15253 // On big endian targets this is only useful for subtargets with direct moves.
15254 // On little endian targets it would be useful for all subtargets with VSX.
15255 // However adding special handling for LE subtargets without direct moves
15256 // would be wasted effort since the minimum arch for LE is ISA 2.07 (Power8)
15257 // which includes direct moves.
15258 if (!Subtarget.hasDirectMove())
15261 // If this is not a shuffle of a shuffle and the first element comes from
15262 // the second vector, canonicalize to the commuted form. This will make it
15263 // more likely to match one of the single instruction patterns.
15264 if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
15265 RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
15266 std::swap(LHS, RHS);
15267 Res = DAG.getCommutedVectorShuffle(*SVN);
15268 Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
15271 // Adjust the shuffle mask if either input vector comes from a
15272 // SCALAR_TO_VECTOR and keep the respective input vector in permuted
15273 // form (to prevent the need for a swap).
15274 SmallVector<int, 16> ShuffV(Mask);
15275 SDValue SToVLHS = isScalarToVec(LHS);
15276 SDValue SToVRHS = isScalarToVec(RHS);
15277 if (SToVLHS || SToVRHS) {
15278 // FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the
15279 // same type and have differing element sizes, then do not perform
15280 // the following transformation. The current transformation for
15281 // SCALAR_TO_VECTOR assumes that both input vectors have the same
15282 // element size. This will be updated in the future to account for
15283 // differing sizes of the LHS and RHS.
15284 if (SToVLHS && SToVRHS &&
15285 (SToVLHS.getValueType().getScalarSizeInBits() !=
15286 SToVRHS.getValueType().getScalarSizeInBits()))
15289 int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
15290 : SToVRHS.getValueType().getVectorNumElements();
15291 int NumEltsOut = ShuffV.size();
15292 // The width of the "valid lane" (i.e. the lane that contains the value that
15293 // is vectorized) needs to be expressed in terms of the number of elements
    // of the shuffle. It is thereby the ratio of the values before and after
    // any bitcasts.
15296 unsigned ValidLaneWidth =
15297 SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() /
15298 LHS.getValueType().getScalarSizeInBits()
15299 : SToVRHS.getValueType().getScalarSizeInBits() /
15300 RHS.getValueType().getScalarSizeInBits();
15302 // Initially assume that neither input is permuted. These will be adjusted
15303 // accordingly if either input is.
15304 int LHSMaxIdx = -1;
15305 int RHSMinIdx = -1;
15306 int RHSMaxIdx = -1;
15307 int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
15309 // Get the permuted scalar to vector nodes for the source(s) that come from
15310 // ISD::SCALAR_TO_VECTOR.
15311 // On big endian systems, this only makes sense for element sizes smaller
15312 // than 64 bits since for 64-bit elements, all instructions already put
    // the value into element zero. Since the scalar sizes of LHS and RHS may
    // differ after isScalarToVec, this should be checked using their own sizes.
15316 if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64)
15318 // Set up the values for the shuffle vector fixup.
15319 LHSMaxIdx = NumEltsOut / NumEltsIn;
15320 SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget);
15321 if (SToVLHS.getValueType() != LHS.getValueType())
15322 SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
15326 if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64)
15328 RHSMinIdx = NumEltsOut;
15329 RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
15330 SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget);
15331 if (SToVRHS.getValueType() != RHS.getValueType())
15332 SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
15336 // Fix up the shuffle mask to reflect where the desired element actually is.
15337 // The minimum and maximum indices that correspond to element zero for both
15338 // the LHS and RHS are computed and will control which shuffle mask entries
15339 // are to be changed. For example, if the RHS is permuted, any shuffle mask
15340 // entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted.
15341 fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
15342 HalfVec, ValidLaneWidth, Subtarget);
15343 Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
15345 // We may have simplified away the shuffle. We won't be able to do anything
15346 // further with it here.
15347 if (!isa<ShuffleVectorSDNode>(Res))
15349 Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
15352 SDValue TheSplat = IsLittleEndian ? RHS : LHS;
15353 // The common case after we commuted the shuffle is that the RHS is a splat
15354 // and we have elements coming in from the splat at indices that are not
15355 // conducive to using a merge.
  // Example:
  // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
15358 if (!isSplatBV(TheSplat))
15361 // We are looking for a mask such that all even elements are from
15362 // one vector and all odd elements from the other.
15363 if (!isAlternatingShuffMask(Mask, NumElts))
15366 // Adjust the mask so we are pulling in the same index from the splat
15367 // as the index from the interesting vector in consecutive elements.
15368 if (IsLittleEndian) {
15369 // Example (even elements from first vector):
15370 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
15371 if (Mask[0] < NumElts)
15372 for (int i = 1, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        ShuffV[i] = (ShuffV[i - 1] + NumElts);
      }
15377 // Example (odd elements from first vector):
15378 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
    else
      for (int i = 0, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        ShuffV[i] = (ShuffV[i + 1] + NumElts);
      }
  } else {
    // Example (even elements from first vector):
15387 // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> <zero>, t1
15388 if (Mask[0] < NumElts)
15389 for (int i = 0, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        ShuffV[i] = ShuffV[i + 1] - NumElts;
      }
15394 // Example (odd elements from first vector):
15395 // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> <zero>, t1
    else
      for (int i = 1, e = Mask.size(); i < e; i += 2) {
        if (ShuffV[i] < 0)
          continue;
        ShuffV[i] = ShuffV[i - 1] - NumElts;
      }
  }
15404 // If the RHS has undefs, we need to remove them since we may have created
15405 // a shuffle that adds those instead of the splat value.
  SDValue SplatVal =
      cast<BuildVectorSDNode>(TheSplat.getNode())->getSplatValue();
15408 TheSplat = DAG.getSplatBuildVector(TheSplat.getValueType(), dl, SplatVal);
  if (IsLittleEndian)
    RHS = TheSplat;
  else
    LHS = TheSplat;
15414 return DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
15417 SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
15418 LSBaseSDNode *LSBase,
15419 DAGCombinerInfo &DCI) const {
15420 assert((ISD::isNormalLoad(LSBase) || ISD::isNormalStore(LSBase)) &&
15421 "Not a reverse memop pattern!");
15423 auto IsElementReverse = [](const ShuffleVectorSDNode *SVN) -> bool {
15424 auto Mask = SVN->getMask();
    int i = 0;
    auto I = Mask.rbegin();
    auto E = Mask.rend();

    for (; I != E; ++I) {
      if (*I != i)
        return false;
      i++;
    }
    return true;
  };
15437 SelectionDAG &DAG = DCI.DAG;
15438 EVT VT = SVN->getValueType(0);
15440 if (!isTypeLegal(VT) || !Subtarget.isLittleEndian() || !Subtarget.hasVSX())
  // Before P9, we have the PPCVSXSwapRemoval pass to hack the element order.
  // See the comment in PPCVSXSwapRemoval.cpp.
  // This combine conflicts with that pass, so we don't do it pre-P9.
15446 if (!Subtarget.hasP9Vector())
  if (!IsElementReverse(SVN))
    return SDValue();
15452 if (LSBase->getOpcode() == ISD::LOAD) {
    // If value 0 of the load has any user other than this shufflevector
    // instruction, it is not profitable to replace the shufflevector with
    // a reverse load.
15456 for (SDNode::use_iterator UI = LSBase->use_begin(), UE = LSBase->use_end();
         UI != UE; ++UI) {
      if (UI.getUse().getResNo() == 0 && UI->getOpcode() != ISD::VECTOR_SHUFFLE)
        return SDValue();
    }
15462 SDValue LoadOps[] = {LSBase->getChain(), LSBase->getBasePtr()};
15463 return DAG.getMemIntrinsicNode(
15464 PPCISD::LOAD_VEC_BE, dl, DAG.getVTList(VT, MVT::Other), LoadOps,
15465 LSBase->getMemoryVT(), LSBase->getMemOperand());
15468 if (LSBase->getOpcode() == ISD::STORE) {
15469 // If there are other uses of the shuffle, the swap cannot be avoided.
15470 // Forcing the use of an X-Form (since swapped stores only have
15471 // X-Forms) without removing the swap is unprofitable.
15472 if (!SVN->hasOneUse())
15476 SDValue StoreOps[] = {LSBase->getChain(), SVN->getOperand(0),
15477 LSBase->getBasePtr()};
15478 return DAG.getMemIntrinsicNode(
15479 PPCISD::STORE_VEC_BE, dl, DAG.getVTList(MVT::Other), StoreOps,
15480 LSBase->getMemoryVT(), LSBase->getMemOperand());
15483 llvm_unreachable("Expected a load or store node here");
15486 static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) {
15487 unsigned IntrinsicID =
15488 cast<ConstantSDNode>(Intrin.getOperand(1))->getZExtValue();
  if (IntrinsicID == Intrinsic::ppc_stdcx)
    StoreWidth = 8;
  else if (IntrinsicID == Intrinsic::ppc_stwcx)
    StoreWidth = 4;
  else if (IntrinsicID == Intrinsic::ppc_sthcx)
    StoreWidth = 2;
  else if (IntrinsicID == Intrinsic::ppc_stbcx)
    StoreWidth = 1;
  else
    return false;
  return true;
}
15502 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
15503 DAGCombinerInfo &DCI) const {
15504 SelectionDAG &DAG = DCI.DAG;
15506 switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
    return combineADD(N, DCI);
  case ISD::AND: {
15511 // We don't want (and (zext (shift...)), C) if C fits in the width of the
15512 // original input as that will prevent us from selecting optimal rotates.
15513 // This only matters if the input to the extend is i32 widened to i64.
15514 SDValue Op1 = N->getOperand(0);
15515 SDValue Op2 = N->getOperand(1);
15516 if ((Op1.getOpcode() != ISD::ZERO_EXTEND &&
15517 Op1.getOpcode() != ISD::ANY_EXTEND) ||
15518 !isa<ConstantSDNode>(Op2) || N->getValueType(0) != MVT::i64 ||
15519 Op1.getOperand(0).getValueType() != MVT::i32)
15521 SDValue NarrowOp = Op1.getOperand(0);
15522 if (NarrowOp.getOpcode() != ISD::SHL && NarrowOp.getOpcode() != ISD::SRL &&
15523 NarrowOp.getOpcode() != ISD::ROTL && NarrowOp.getOpcode() != ISD::ROTR)
15526 uint64_t Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
15527 // Make sure that the constant is narrow enough to fit in the narrow type.
15528 if (!isUInt<32>(Imm))
15530 SDValue ConstOp = DAG.getConstant(Imm, dl, MVT::i32);
15531 SDValue NarrowAnd = DAG.getNode(ISD::AND, dl, MVT::i32, NarrowOp, ConstOp);
15532 return DAG.getAnyExtOrTrunc(NarrowAnd, dl, N->getValueType(0));
  }
  case ISD::SHL:
    return combineSHL(N, DCI);
  case ISD::SRA:
    return combineSRA(N, DCI);
  case ISD::SRL:
    return combineSRL(N, DCI);
  case ISD::MUL:
    return combineMUL(N, DCI);
  case ISD::FMA:
  case PPCISD::FNMSUB:
    return combineFMALike(N, DCI);
  case PPCISD::SHL:
    if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRL:
    if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0.
      return N->getOperand(0);
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isZero() ||  //  0 >>s V -> 0.
          C->isAllOnes()) // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;
15560 case ISD::SIGN_EXTEND:
15561 case ISD::ZERO_EXTEND:
15562 case ISD::ANY_EXTEND:
15563 return DAGCombineExtBoolTrunc(N, DCI);
15564 case ISD::TRUNCATE:
15565 return combineTRUNCATE(N, DCI);
  case ISD::SETCC:
    if (SDValue CSCC = combineSetCC(N, DCI))
      return CSCC;
    break;
15570 case ISD::SELECT_CC:
15571 return DAGCombineTruncBoolExt(N, DCI);
15572 case ISD::SINT_TO_FP:
15573 case ISD::UINT_TO_FP:
15574 return combineFPToIntToFP(N, DCI);
15575 case ISD::VECTOR_SHUFFLE:
15576 if (ISD::isNormalLoad(N->getOperand(0).getNode())) {
15577 LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
      return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
    }
15580 return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
  case ISD::STORE: {
    EVT Op1VT = N->getOperand(1).getValueType();
15584 unsigned Opcode = N->getOperand(1).getOpcode();
15586 if (Opcode == ISD::FP_TO_SINT || Opcode == ISD::FP_TO_UINT ||
15587 Opcode == ISD::STRICT_FP_TO_SINT || Opcode == ISD::STRICT_FP_TO_UINT) {
      SDValue Val = combineStoreFPToInt(N, DCI);
      if (Val)
        return Val;
    }
15593 if (Opcode == ISD::VECTOR_SHUFFLE && ISD::isNormalStore(N)) {
15594 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N->getOperand(1));
      SDValue Val = combineVReverseMemOP(SVN, cast<LSBaseSDNode>(N), DCI);
      if (Val)
        return Val;
    }
15600 // Turn STORE (BSWAP) -> sthbrx/stwbrx.
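    // For example (illustrative): (store (bswap i32 %x), %p) becomes a single
    // stwbrx of %x to %p, and the separate byte swap disappears.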
15601 if (cast<StoreSDNode>(N)->isUnindexed() && Opcode == ISD::BSWAP &&
15602 N->getOperand(1).getNode()->hasOneUse() &&
15603 (Op1VT == MVT::i32 || Op1VT == MVT::i16 ||
15604 (Subtarget.hasLDBRX() && Subtarget.isPPC64() && Op1VT == MVT::i64))) {
      // STBRX can only handle simple types and it makes no sense to store less
      // than two bytes in byte-reversed order.
15608 EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
15609 if (mVT.isExtended() || mVT.getSizeInBits() < 16)
15612 SDValue BSwapOp = N->getOperand(1).getOperand(0);
15613 // Do an any-extend to 32-bits if this is a half-word input.
15614 if (BSwapOp.getValueType() == MVT::i16)
15615 BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
      // If the type of the BSWAP operand is wider than the stored memory
      // width, it needs to be shifted to the right side before STBRX.
15619 if (Op1VT.bitsGT(mVT)) {
15620 int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
15621 BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
15622 DAG.getConstant(Shift, dl, MVT::i32));
15623 // Need to truncate if this is a bswap of i64 stored as i32/i16.
15624 if (Op1VT == MVT::i64)
15625 BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
15629 N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
15632 DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
15633 Ops, cast<StoreSDNode>(N)->getMemoryVT(),
15634 cast<StoreSDNode>(N)->getMemOperand());
15637 // STORE Constant:i32<0> -> STORE<trunc to i32> Constant:i64<0>
15638 // So it can increase the chance of CSE constant construction.
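    // For example (illustrative), storing the i32 constant 0 on PPC64 is
    // rewritten below into a truncating store of Constant:i64<0>, so one
    // materialized zero can be shared by both i32 and i64 stores.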
    if (Subtarget.isPPC64() && !DCI.isBeforeLegalize() &&
        isa<ConstantSDNode>(N->getOperand(1)) && Op1VT == MVT::i32) {
      // Need to sign-extend to 64 bits to handle negative values.
      EVT MemVT = cast<StoreSDNode>(N)->getMemoryVT();
      uint64_t Val64 = SignExtend64(N->getConstantOperandVal(1),
                                    MemVT.getSizeInBits());
      SDValue Const64 = DAG.getConstant(Val64, dl, MVT::i64);

      // DAG.getTruncStore() can't be used here because it doesn't accept
      // the general (base + offset) addressing mode.
      // So we use UpdateNodeOperands and setTruncatingStore instead.
      DAG.UpdateNodeOperands(N, N->getOperand(0), Const64, N->getOperand(2),
                             N->getOperand(3));
      cast<StoreSDNode>(N)->setTruncatingStore(true);
      return SDValue(N, 0);
    }

    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
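    // A sketch of what expandVSXStoreForLE produces here on little endian
    // pre-P9 subtargets: (store v4i32:x, ptr) first permutes the value with
    // an xxswapd and then stores it with stxvd2x.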
    if (Op1VT.isSimple()) {
      MVT StoreVT = Op1VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (StoreVT == MVT::v2f64 || StoreVT == MVT::v2i64 ||
           StoreVT == MVT::v4f32 || StoreVT == MVT::v4i32))
        return expandVSXStoreForLE(N, DCI);
    }
    break;
  }
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT VT = LD->getValueType(0);

    // For little endian, VSX loads require generating lxvd2x/xxswapd.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
    if (VT.isSimple()) {
      MVT LoadVT = VT.getSimpleVT();
      if (Subtarget.needsSwapsForVSXMemOps() &&
          (LoadVT == MVT::v2f64 || LoadVT == MVT::v2i64 ||
           LoadVT == MVT::v4f32 || LoadVT == MVT::v4i32))
        return expandVSXLoadForLE(N, DCI);
    }

    // We sometimes end up with a 64-bit integer load, from which we extract
    // two single-precision floating-point numbers. This happens with
    // std::complex<float>, and other similar structures, because of the way we
    // canonicalize structure copies. However, if we lack direct moves,
    // then the final bitcasts from the extracted integer values to the
    // floating-point numbers turn into store/load pairs. Even with direct
    // moves, just loading the two floating-point numbers is likely better.
    auto ReplaceTwoFloatLoad = [&]() {
      if (VT != MVT::i64)
        return false;

      if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
          LD->isVolatile())
        return false;

      // We're looking for a sequence like this:
      // t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
      //   t16: i64 = srl t13, Constant:i32<32>
      // t17: i32 = truncate t16
      // t18: f32 = bitcast t17
      // t19: i32 = truncate t13
      // t20: f32 = bitcast t19
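      // and we replace it with two f32 loads (an illustrative sketch; node
      // ids are made up), the second chained on the first:
      //   t21: f32,ch = load<LD4[%ref.tmp]> t0, t6, undef
      //   t22: f32,ch = load<LD4[%ref.tmp+4]> t21:1, (add t6, 4), undef
      // The two bitcasts are then combined to these loads below.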

      if (!LD->hasNUsesOfValue(2, 0))
        return false;

      auto UI = LD->use_begin();
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *Trunc = *UI++;
      while (UI.getUse().getResNo() != 0) ++UI;
      SDNode *RightShift = *UI;
      if (Trunc->getOpcode() != ISD::TRUNCATE)
        std::swap(Trunc, RightShift);

      if (Trunc->getOpcode() != ISD::TRUNCATE ||
          Trunc->getValueType(0) != MVT::i32 ||
          !Trunc->hasOneUse())
        return false;
      if (RightShift->getOpcode() != ISD::SRL ||
          !isa<ConstantSDNode>(RightShift->getOperand(1)) ||
          RightShift->getConstantOperandVal(1) != 32 ||
          !RightShift->hasOneUse())
        return false;

      SDNode *Trunc2 = *RightShift->use_begin();
      if (Trunc2->getOpcode() != ISD::TRUNCATE ||
          Trunc2->getValueType(0) != MVT::i32 ||
          !Trunc2->hasOneUse())
        return false;

      SDNode *Bitcast = *Trunc->use_begin();
      SDNode *Bitcast2 = *Trunc2->use_begin();

      if (Bitcast->getOpcode() != ISD::BITCAST ||
          Bitcast->getValueType(0) != MVT::f32)
        return false;
      if (Bitcast2->getOpcode() != ISD::BITCAST ||
          Bitcast2->getValueType(0) != MVT::f32)
        return false;

      if (Subtarget.isLittleEndian())
        std::swap(Bitcast, Bitcast2);

      // Bitcast has the second float (in memory-layout order) and Bitcast2
      // has the first one.

      SDValue BasePtr = LD->getBasePtr();
      if (LD->isIndexed()) {
        assert(LD->getAddressingMode() == ISD::PRE_INC &&
               "Non-pre-inc AM on PPC?");
        BasePtr =
          DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                      LD->getOffset());
      }

      auto MMOFlags =
          LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOVolatile;
      SDValue FloatLoad = DAG.getLoad(MVT::f32, dl, LD->getChain(), BasePtr,
                                      LD->getPointerInfo(), LD->getAlign(),
                                      MMOFlags, LD->getAAInfo());
      SDValue AddPtr =
        DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
                    BasePtr, DAG.getIntPtrConstant(4, dl));
      SDValue FloatLoad2 = DAG.getLoad(
          MVT::f32, dl, SDValue(FloatLoad.getNode(), 1), AddPtr,
          LD->getPointerInfo().getWithOffset(4),
          commonAlignment(LD->getAlign(), 4), MMOFlags, LD->getAAInfo());

      if (LD->isIndexed()) {
        // Note that DAGCombine should re-form any pre-increment load(s) from
        // what is produced here if that makes sense.
        DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), BasePtr);
      }

      DCI.CombineTo(Bitcast2, FloatLoad);
      DCI.CombineTo(Bitcast, FloatLoad2);

      DAG.ReplaceAllUsesOfValueWith(SDValue(LD, LD->isIndexed() ? 2 : 1),
                                    SDValue(FloatLoad2.getNode(), 1));
      return true;
    };

    if (ReplaceTwoFloatLoad())
      return SDValue(N, 0);

    EVT MemVT = LD->getMemoryVT();
    Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
    Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
    if (LD->isUnindexed() && VT.isVector() &&
        ((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
          // P8 and later hardware should just use LOAD.
          !Subtarget.hasP8Vector() &&
          (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
           VT == MVT::v4f32))) &&
        LD->getAlign() < ABIAlignment) {
      // This is a type-legal unaligned Altivec load.
      SDValue Chain = LD->getChain();
      SDValue Ptr = LD->getBasePtr();
      bool isLittleEndian = Subtarget.isLittleEndian();

      // This implements the loading of unaligned vectors as described in
      // the venerable Apple Velocity Engine overview. Specifically:
      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
      //
      // The general idea is to expand a sequence of one or more unaligned
      // loads into an alignment-based permutation-control instruction (lvsl
      // or lvsr), a series of regular vector loads (which always truncate
      // their input address to an aligned address), and a series of
      // permutations. The results of these permutations are the requested
      // loaded values. The trick is that the last "extra" load is not taken
      // from the address you might suspect (sizeof(vector) bytes after the
      // last requested load), but rather sizeof(vector) - 1 bytes after the
      // last requested vector. The point of this is to avoid a page fault if
      // the base address happened to be aligned. This works because if the
      // base address is aligned, then adding less than a full vector length
      // will cause the last vector in the sequence to be (re)loaded.
      // Otherwise, the next vector will be fetched as you might suspect was
      // necessary.
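      // A sketch of the expansion for one unaligned v4i32 load from Ptr on
      // big endian (little endian uses lvsr and swaps the vperm inputs):
      //   PermCntl  = lvsl Ptr
      //   BaseLoad  = lvx Ptr
      //   ExtraLoad = lvx (Ptr + 15)
      //   Result    = vperm BaseLoad, ExtraLoad, PermCntl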

      // We might be able to reuse the permutation generation from
      // a different base address offset from this one by an aligned amount.
      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
      // optimization later.
      Intrinsic::ID Intr, IntrLD, IntrPerm;
      MVT PermCntlTy, PermTy, LDTy;
      Intr = isLittleEndian ? Intrinsic::ppc_altivec_lvsr
                            : Intrinsic::ppc_altivec_lvsl;
      IntrLD = Intrinsic::ppc_altivec_lvx;
      IntrPerm = Intrinsic::ppc_altivec_vperm;
      PermCntlTy = MVT::v16i8;
      PermTy = MVT::v4i32;
      LDTy = MVT::v4i32;

      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, PermCntlTy);

      // Create the new MMO for the new base load. It is like the original MMO,
      // but represents an area in memory almost twice the vector size centered
      // on the original address. If the address is unaligned, we might start
      // reading up to (sizeof(vector)-1) bytes below the address of the
      // original unaligned load.
      MachineFunction &MF = DAG.getMachineFunction();
      MachineMemOperand *BaseMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                -(int64_t)MemVT.getStoreSize()+1,
                                2*MemVT.getStoreSize()-1);

      // Create the new base load.
      SDValue LDXIntID =
          DAG.getTargetConstant(IntrLD, dl, getPointerTy(MF.getDataLayout()));
      SDValue BaseLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue BaseLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                BaseLoadOps, LDTy, BaseMMO);

      // Note that the value of IncOffset (which is provided to the next
      // load's pointer info offset value, and thus used to calculate the
      // alignment), and the value of IncValue (which is actually used to
      // increment the pointer value) are different! This is because we
      // require the next load to appear to be aligned, even though it
      // is actually offset from the base pointer by a lesser amount.
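      // For example (illustrative): for a v4i32 load, IncOffset is 16 while
      // IncValue is decremented to 15 below unless a truly consecutive load
      // is found on the chain.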
      int IncOffset = VT.getSizeInBits() / 8;
      int IncValue = IncOffset;

      // Walk (both up and down) the chain looking for another load at the real
      // (aligned) offset (the alignment of the other load does not matter in
      // this case). If found, then do not use the offset reduction trick, as
      // that will prevent the loads from being later combined (as they would
      // otherwise be duplicates).
      if (!findConsecutiveLoad(LD, DAG))
        --IncValue;

      SDValue Increment =
          DAG.getConstant(IncValue, dl, getPointerTy(MF.getDataLayout()));
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

      MachineMemOperand *ExtraMMO =
        MF.getMachineMemOperand(LD->getMemOperand(),
                                1, 2*MemVT.getStoreSize()-1);
      SDValue ExtraLoadOps[] = { Chain, LDXIntID, Ptr };
      SDValue ExtraLoad =
        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, dl,
                                DAG.getVTList(PermTy, MVT::Other),
                                ExtraLoadOps, LDTy, ExtraMMO);

      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                               BaseLoad.getValue(1), ExtraLoad.getValue(1));

      // Because vperm has a big-endian bias, we must reverse the order
      // of the input vectors and complement the permute control vector
      // when generating little endian code. We have already handled the
      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
      // and ExtraLoad here.
      SDValue Perm;
      if (isLittleEndian)
        Perm = BuildIntrinsicOp(IntrPerm,
                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
      else
        Perm = BuildIntrinsicOp(IntrPerm,
                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);

      if (VT != MVT::v4i32)
        Perm = Subtarget.hasAltivec()
                   ? DAG.getNode(ISD::BITCAST, dl, VT, Perm)
                   : DAG.getNode(ISD::FP_ROUND, dl, VT, Perm,
                                 DAG.getTargetConstant(1, dl, MVT::i64));
                                 // second argument is 1 because this rounding
                                 // is always exact.

      // The output of the permutation is our loaded result, the TokenFactor
      // is our new chain.
      DCI.CombineTo(N, Perm, TF);
      return SDValue(N, 0);
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    bool isLittleEndian = Subtarget.isLittleEndian();
    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    Intrinsic::ID Intr = (isLittleEndian ? Intrinsic::ppc_altivec_lvsr
                                         : Intrinsic::ppc_altivec_lvsl);
    if (IID == Intr && N->getOperand(1)->getOpcode() == ISD::ADD) {
      SDValue Add = N->getOperand(1);

      int Bits = 4 /* 16 byte alignment */;

      if (DAG.MaskedValueIsZero(Add->getOperand(1),
                                APInt::getAllOnes(Bits /* alignment */)
                                    .zext(Add.getScalarValueSizeInBits()))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode *U : BasePtr->uses()) {
          if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
              cast<ConstantSDNode>(U->getOperand(0))->getZExtValue() == IID) {
            // We've found another LVSL/LVSR, and this address is an aligned
            // multiple of that one. The results will be the same, so use the
            // one we've just found instead.
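            // For example (illustrative): lvsl(base) and lvsl(base + 32)
            // produce identical permute-control vectors, since the control
            // depends only on the low four bits of the address.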

            return SDValue(U, 0);
          }
        }
      }

      if (isa<ConstantSDNode>(Add->getOperand(1))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode *U : BasePtr->uses()) {
          if (U->getOpcode() == ISD::ADD &&
              isa<ConstantSDNode>(U->getOperand(1)) &&
              (cast<ConstantSDNode>(Add->getOperand(1))->getZExtValue() -
               cast<ConstantSDNode>(U->getOperand(1))->getZExtValue()) %
                      (1ULL << Bits) ==
                  0) {
            SDNode *OtherAdd = U;
            for (SDNode *V : OtherAdd->uses()) {
              if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
                  cast<ConstantSDNode>(V->getOperand(0))->getZExtValue() ==
                      IID) {
                return SDValue(V, 0);
              }
            }
          }
        }
      }
    }

    // Combine vmaxsw/h/b(a, a's negation) to abs(a)
    // to expose the vabsduw/h/b opportunity downstream.
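    // For example (a sketch): vmaxsw((sub (build_vector 0,...), a), a)
    // becomes (abs a), and likewise for the v8i16 and v16i8 variants.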
    if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
        (IID == Intrinsic::ppc_altivec_vmaxsw ||
         IID == Intrinsic::ppc_altivec_vmaxsh ||
         IID == Intrinsic::ppc_altivec_vmaxsb)) {
      SDValue V1 = N->getOperand(1);
      SDValue V2 = N->getOperand(2);
      if ((V1.getSimpleValueType() == MVT::v4i32 ||
           V1.getSimpleValueType() == MVT::v8i16 ||
           V1.getSimpleValueType() == MVT::v16i8) &&
          V1.getSimpleValueType() == V2.getSimpleValueType()) {
        // (0-a, a)
        if (V1.getOpcode() == ISD::SUB &&
            ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
            V1.getOperand(1) == V2) {
          return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
        }
        // (a, 0-a)
        if (V2.getOpcode() == ISD::SUB &&
            ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
            V2.getOperand(1) == V1) {
          return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
        }
        // (x-y, y-x)
        if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
            V1.getOperand(0) == V2.getOperand(1) &&
            V1.getOperand(1) == V2.getOperand(0)) {
          return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
        }
      }
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
    default:
      break;
    case Intrinsic::ppc_altivec_vsum4sbs:
    case Intrinsic::ppc_altivec_vsum4shs:
    case Intrinsic::ppc_altivec_vsum4ubs: {
      // These sum-across intrinsics only have a chain due to the side effect
      // that they may set the SAT bit. If we know the SAT bit will not be set
      // for some inputs, we can replace any uses of their chain with the
      // input chain.
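      // For example (illustrative): with a zero accumulator, vsum4sbs sums
      // at most four i8 values per lane, which cannot overflow an i32 lane,
      // so the SAT bit cannot be set.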
      if (BuildVectorSDNode *BVN =
              dyn_cast<BuildVectorSDNode>(N->getOperand(3))) {
        APInt APSplatBits, APSplatUndef;
        unsigned SplatBitSize;
        bool HasAnyUndefs;
        bool BVNIsConstantSplat = BVN->isConstantSplat(
            APSplatBits, APSplatUndef, SplatBitSize, HasAnyUndefs, 0,
            !Subtarget.isLittleEndian());
        // If the constant splat vector is 0, the SAT bit will not be set.
        if (BVNIsConstantSplat && APSplatBits == 0)
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), N->getOperand(0));
      }
      return SDValue();
    }
    case Intrinsic::ppc_vsx_lxvw4x:
    case Intrinsic::ppc_vsx_lxvd2x:
      // For little endian, VSX loads require generating lxvd2x/xxswapd.
      // Not needed on ISA 3.0 based CPUs since we have a non-permuting load.
      if (Subtarget.needsSwapsForVSXMemOps())
        return expandVSXLoadForLE(N, DCI);
      break;
    }
    break;
  case ISD::INTRINSIC_VOID:
    // For little endian, VSX stores require generating xxswapd/stxvd2x.
    // Not needed on ISA 3.0 based CPUs since we have a non-permuting store.
    if (Subtarget.needsSwapsForVSXMemOps()) {
      switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
      default:
        break;
      case Intrinsic::ppc_vsx_stxvw4x:
      case Intrinsic::ppc_vsx_stxvd2x:
        return expandVSXStoreForLE(N, DCI);
      }
    }
    break;
  case ISD::BSWAP: {
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    // For subtargets without LDBRX, we can still do better than the default
    // expansion even for 64-bit BSWAP (LOAD).
    bool Is64BitBswapOn64BitTgt =
        Subtarget.isPPC64() && N->getValueType(0) == MVT::i64;
    bool IsSingleUseNormalLd = ISD::isNormalLoad(N->getOperand(0).getNode()) &&
                               N->getOperand(0).hasOneUse();
    if (IsSingleUseNormalLd &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
         (Subtarget.hasLDBRX() && Is64BitBswapOn64BitTgt))) {
      SDValue Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      SDValue Ops[] = {
        LD->getChain(),   // Chain
        LD->getBasePtr(), // Ptr
        DAG.getValueType(N->getValueType(0)) // VT
      };
      SDValue BSLoad =
        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
                                              MVT::i64 : MVT::i32, MVT::Other),
                                Ops, LD->getMemoryVT(), LD->getMemOperand());

      // If this is an i16 load, insert the truncate.
      SDValue ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

      // First, combine the bswap away. This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away; we give it a bogus result value but a
      // real chain result. The result value is dead because the bswap is
      // dead.
      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDValue(N, 0);
    }

    // Convert this to two 32-bit bswap loads and a BUILD_PAIR. Do this only
    // before legalization so that the BUILD_PAIR is handled correctly.
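    // A sketch of the split: the i64 load is replaced by two i32 loads at
    // ptr and ptr+4, each followed by an i32 BSWAP, and the halves are then
    // recombined with a BUILD_PAIR whose operand order depends on endianness.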
    if (!DCI.isBeforeLegalize() || !Is64BitBswapOn64BitTgt ||
        !IsSingleUseNormalLd)
      return SDValue();
    LoadSDNode *LD = cast<LoadSDNode>(N->getOperand(0));

    // Can't split volatile or atomic loads.
    if (!LD->isSimple())
      return SDValue();
    SDValue BasePtr = LD->getBasePtr();
    SDValue Lo = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr,
                             LD->getPointerInfo(), LD->getAlign());
    Lo = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Lo);
    BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                          DAG.getIntPtrConstant(4, dl));
    MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
        LD->getMemOperand(), 4, 4);
    SDValue Hi = DAG.getLoad(MVT::i32, dl, LD->getChain(), BasePtr, NewMMO);
    Hi = DAG.getNode(ISD::BSWAP, dl, MVT::i32, Hi);
    SDValue Res;
    if (Subtarget.isLittleEndian())
      Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Hi, Lo);
    else
      Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
    SDValue TF =
        DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                    Hi.getOperand(0).getValue(1), Lo.getOperand(0).getValue(1));
    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), TF);
    return Res;
  }
  case PPCISD::VCMP: {
    // If a VCMP_rec node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMP_rec computes both a CR6
    // and a normal output).
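    // For example (an illustrative sketch; node ids are made up):
    //   t10: v4i32,glue = PPCISD::VCMP_rec t1, t2, Constant:i32<Opc>
    //   t20: v4i32      = PPCISD::VCMP     t1, t2, Constant:i32<Opc>
    // Here t20 can reuse t10's first result instead of computing it again.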
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMP_rec's that match.
      SDNode *VCMPrecNode = nullptr;

      SDNode *LHSN = N->getOperand(0).getNode();
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if (UI->getOpcode() == PPCISD::VCMP_rec &&
            UI->getOperand(1) == N->getOperand(1) &&
            UI->getOperand(2) == N->getOperand(2) &&
            UI->getOperand(0) == N->getOperand(0)) {
          VCMPrecNode = *UI;
          break;
        }

      // If there is no VCMP_rec node, or if the flag value has a single use,
      // don't transform this.
      if (!VCMPrecNode || VCMPrecNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value. If it has a
      // chain, this transformation is more complex. Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = nullptr;
      for (SDNode::use_iterator UI = VCMPrecNode->use_begin();
           FlagUser == nullptr; ++UI) {
        assert(UI != VCMPrecNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDValue(VCMPrecNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFOCRF instruction, we know this is safe.
      // Otherwise we give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
        return SDValue(VCMPrecNode, 0);
    }
    break;
  }
  case ISD::BR_CC: {
    // If this is a branch on an altivec predicate comparison, lower this so
    // that we don't have to do a MFOCRF: instead, branch directly on CR6. This
    // lowering is done pre-legalize, because the legalizer lowers the predicate
    // compare down to code that is difficult to reassemble.
    // This code also handles branches that depend on the result of a store
    // conditional.
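    // For example (an illustrative sketch), a branch of the form
    //   (br_cc seteq, (intrinsic_wo_chain vcmpequw.p, a, b), 0, dest)
    // becomes a PPCISD::COND_BRANCH on the proper CR6 predicate, with the
    // comparison itself done by a single VCMP_rec node built below.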
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);

    int CompareOpc;
    bool isDot;

    if (!isa<ConstantSDNode>(RHS) || (CC != ISD::SETEQ && CC != ISD::SETNE))
      return SDValue();

    // Since we are doing this pre-legalize, the RHS can be a constant of
    // arbitrary bitwidth which may cause issues when trying to get the value
    // from the underlying APInt.
    auto RHSAPInt = cast<ConstantSDNode>(RHS)->getAPIntValue();
    if (!RHSAPInt.isIntN(64))
      return SDValue();

    unsigned Val = RHSAPInt.getZExtValue();
    auto isImpossibleCompare = [&]() {
      // If this is a comparison against something other than 0/1, then we know
      // that the condition is never/always true.
      if (Val != 0 && Val != 1) {
        if (CC == ISD::SETEQ) // Cond never true, remove branch.
          return N->getOperand(0);
        // Always !=, turn it into an unconditional branch.
        return DAG.getNode(ISD::BR, dl, MVT::Other,
                           N->getOperand(0), N->getOperand(4));
      }
      return SDValue();
    };

    // Combine branches fed by store conditional instructions (st[bhwd]cx).
    unsigned StoreWidth = 0;
    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        isStoreConditional(LHS, StoreWidth)) {
      if (SDValue Impossible = isImpossibleCompare())
        return Impossible;
      PPC::Predicate CompOpc;
      // eq 0 => ne
      // ne 0 => eq
      // eq 1 => eq
      // ne 1 => ne
      if (Val == 0)
        CompOpc = CC == ISD::SETEQ ? PPC::PRED_NE : PPC::PRED_EQ;
      else
        CompOpc = CC == ISD::SETEQ ? PPC::PRED_EQ : PPC::PRED_NE;

      SDValue Ops[] = {LHS.getOperand(0), LHS.getOperand(2), LHS.getOperand(3),
                       DAG.getConstant(StoreWidth, dl, MVT::i32)};
      auto *MemNode = cast<MemSDNode>(LHS);
      SDValue ConstSt = DAG.getMemIntrinsicNode(
          PPCISD::STORE_COND, dl,
          DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue), Ops,
          MemNode->getMemoryVT(), MemNode->getMemOperand());

      SDValue InChain;
      // Unchain the branch from the original store conditional.
      if (N->getOperand(0) == LHS.getValue(1))
        InChain = LHS.getOperand(0);
      else if (N->getOperand(0).getOpcode() == ISD::TokenFactor) {
        SmallVector<SDValue, 4> InChains;
        SDValue InTF = N->getOperand(0);
        for (int i = 0, e = InTF.getNumOperands(); i < e; i++)
          if (InTF.getOperand(i) != LHS.getValue(1))
            InChains.push_back(InTF.getOperand(i));
        InChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, InChains);
      }

      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, InChain,
                         DAG.getConstant(CompOpc, dl, MVT::i32),
                         DAG.getRegister(PPC::CR0, MVT::i32), N->getOperand(4),
                         ConstSt.getValue(2));
    }

    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) {
      assert(isDot && "Can't compare against a vector result!");

      if (SDValue Impossible = isImpossibleCompare())
        return Impossible;

      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
      // Create the PPCISD altivec 'dot' comparison node.
      SDValue Ops[] = {
        LHS.getOperand(2), // LHS of compare
        LHS.getOperand(3), // RHS of compare
        DAG.getConstant(CompareOpc, dl, MVT::i32)
      };
      EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
      SDValue CompNode = DAG.getNode(PPCISD::VCMP_rec, dl, VTs, Ops);

      // Unpack the result based on how the target uses it.
      PPC::Predicate CompOpc;
      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
      default: // Can't happen, don't crash on invalid number though.
      case 0:  // Branch on the value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
        break;
      case 1:  // Branch on the inverted value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
        break;
      case 2:  // Branch on the value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
        break;
      case 3:  // Branch on the inverted value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
        break;
      }

      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
                         DAG.getConstant(CompOpc, dl, MVT::i32),
                         DAG.getRegister(PPC::CR6, MVT::i32),
                         N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  case ISD::BUILD_VECTOR:
    return DAGCombineBuildVector(N, DCI);
  }

  return SDValue();
}

SDValue
PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const {
  // fold (sdiv X, pow2)
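  // A sketch for (sdiv i32:x, 4): PPCISD::SRA_ADDZE x, 2 later emits a
  // srawi by 2 (which sets CA when x is negative with a nonzero remainder)
  // followed by addze, giving correct rounding toward zero. For a negated
  // power of two the result is additionally subtracted from zero below.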
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && !Subtarget.isPPC64())
    return SDValue();
  if ((VT != MVT::i32 && VT != MVT::i64) ||
      !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);

  bool IsNegPow2 = Divisor.isNegatedPowerOf2();
  unsigned Lg2 = (IsNegPow2 ? -Divisor : Divisor).countr_zero();
  SDValue ShiftAmt = DAG.getConstant(Lg2, DL, VT);

  SDValue Op = DAG.getNode(PPCISD::SRA_ADDZE, DL, VT, N0, ShiftAmt);
  Created.push_back(Op.getNode());

  if (IsNegPow2) {
    Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
    Created.push_back(Op.getNode());
  }

  return Op;
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  Known.resetAll();
  switch (Op.getOpcode()) {
  default: break;
  case PPCISD::LBRX: {
    // lhbrx is known to have the top bits cleared out.
    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
      Known.Zero = 0xFFFF0000;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
    default: break;
    case Intrinsic::ppc_altivec_vcmpbfp_p:
    case Intrinsic::ppc_altivec_vcmpeqfp_p:
    case Intrinsic::ppc_altivec_vcmpequb_p:
    case Intrinsic::ppc_altivec_vcmpequh_p:
    case Intrinsic::ppc_altivec_vcmpequw_p:
    case Intrinsic::ppc_altivec_vcmpequd_p:
    case Intrinsic::ppc_altivec_vcmpequq_p:
    case Intrinsic::ppc_altivec_vcmpgefp_p:
    case Intrinsic::ppc_altivec_vcmpgtfp_p:
    case Intrinsic::ppc_altivec_vcmpgtsb_p:
    case Intrinsic::ppc_altivec_vcmpgtsh_p:
    case Intrinsic::ppc_altivec_vcmpgtsw_p:
    case Intrinsic::ppc_altivec_vcmpgtsd_p:
    case Intrinsic::ppc_altivec_vcmpgtsq_p:
    case Intrinsic::ppc_altivec_vcmpgtub_p:
    case Intrinsic::ppc_altivec_vcmpgtuh_p:
    case Intrinsic::ppc_altivec_vcmpgtuw_p:
    case Intrinsic::ppc_altivec_vcmpgtud_p:
    case Intrinsic::ppc_altivec_vcmpgtuq_p:
      Known.Zero = ~1U; // All bits but the low one are known to be zero.
      break;
    }
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
    default:
      break;
    case Intrinsic::ppc_load2r:
      // Top bits are cleared for load2r (which is the same as lhbrx).
      Known.Zero = 0xFFFF0000;
      break;
    }
    break;
  }
  }
}

Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
  switch (Subtarget.getCPUDirective()) {
  default:
    break;
  case PPC::DIR_PWR4:
  case PPC::DIR_PWR5:
  case PPC::DIR_PWR5X:
  case PPC::DIR_PWR6:
  case PPC::DIR_PWR6X:
  case PPC::DIR_PWR7:
  case PPC::DIR_PWR8:
  case PPC::DIR_PWR9:
  case PPC::DIR_PWR10:
  case PPC::DIR_PWR_FUTURE: {
    if (!ML)
      break;

    if (!DisableInnermostLoopAlign32) {
      // If the nested loop is an innermost loop, prefer a 32-byte alignment,
      // so that we can decrease cache misses and branch-prediction misses.
      // Actual alignment of the loop will depend on the hotness check and
      // other logic in alignBlocks.
      if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
        return Align(32);
    }

    const PPCInstrInfo *TII = Subtarget.getInstrInfo();

    // For small loops (between 5 and 8 instructions), align to a 32-byte
    // boundary so that the entire loop fits in one instruction-cache line.
    uint64_t LoopSize = 0;
    for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I)
      for (const MachineInstr &J : **I) {
        LoopSize += TII->getInstSizeInBytes(J);
        if (LoopSize > 32)
          break;
      }

    if (LoopSize > 16 && LoopSize <= 32)
      return Align(32);

    break;
  }
  }

  return TargetLowering::getPrefLoopAlignment(ML);
}

/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'f':
    case 'd':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws" ||
             Constraint == "wi" || Constraint == "ww") {
    return C_RegisterClass; // VSX registers.
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();

  // Look at the constraint type.
  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
    return CW_Register; // an individual CR bit.
  else if ((StringRef(constraint) == "wa" ||
            StringRef(constraint) == "wd" ||
            StringRef(constraint) == "wf") &&
           type->isVectorTy())
    return CW_Register;
  else if (StringRef(constraint) == "wi" && type->isIntegerTy(64))
    return CW_Register; // just holds 64-bit integer data.
  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
    return CW_Register;
  else if (StringRef(constraint) == "ww" && type->isFloatTy())
    return CW_Register;

  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'b':
    if (type->isIntegerTy())
      weight = CW_Register;
    break;
  case 'f':
    if (type->isFloatTy())
      weight = CW_Register;
    break;
  case 'd':
    if (type->isDoubleTy())
      weight = CW_Register;
    break;
  case 'v':
    if (type->isVectorTy())
      weight = CW_Register;
    break;
  case 'y':
    weight = CW_Register;
    break;
  case 'Z':
    weight = CW_Memory;
    break;
  }
  return weight;
}

std::pair<unsigned, const TargetRegisterClass *>
PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b': // R1-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
    case 'r': // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RCRegClass);
      return std::make_pair(0U, &PPC::GPRCRegClass);
    // 'd' and 'f' constraints are both defined to be "the floating point
    // registers", where one is for 32-bit and the other for 64-bit. We don't
    // really care overly much here so just give them all the same reg classes.
    case 'f':
    case 'd':
      if (Subtarget.hasSPE()) {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::GPRCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::SPERCRegClass);
      } else {
        if (VT == MVT::f32 || VT == MVT::i32)
          return std::make_pair(0U, &PPC::F4RCRegClass);
        if (VT == MVT::f64 || VT == MVT::i64)
          return std::make_pair(0U, &PPC::F8RCRegClass);
      }
      break;
    case 'v':
      if (Subtarget.hasAltivec() && VT.isVector())
        return std::make_pair(0U, &PPC::VRRCRegClass);
      else if (Subtarget.hasVSX())
        // Scalars in Altivec registers only make sense with VSX.
        return std::make_pair(0U, &PPC::VFRCRegClass);
      break;
    case 'y': // crrc (condition register)
      return std::make_pair(0U, &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc" && Subtarget.useCRBits()) {
    // An individual CR bit.
    return std::make_pair(0U, &PPC::CRBITRCRegClass);
  } else if ((Constraint == "wa" || Constraint == "wd" ||
              Constraint == "wf" || Constraint == "wi") &&
             Subtarget.hasVSX()) {
    // A VSX register for either a scalar (FP) or vector. There is no
    // support for single precision scalars on subtargets prior to Power8.
    if (VT.isVector())
      return std::make_pair(0U, &PPC::VSRCRegClass);
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    return std::make_pair(0U, &PPC::VSFRCRegClass);
  } else if ((Constraint == "ws" || Constraint == "ww") && Subtarget.hasVSX()) {
    if (VT == MVT::f32 && Subtarget.hasP8Vector())
      return std::make_pair(0U, &PPC::VSSRCRegClass);
    else
      return std::make_pair(0U, &PPC::VSFRCRegClass);
  } else if (Constraint == "lr") {
    if (VT == MVT::i64)
      return std::make_pair(0U, &PPC::LR8RCRegClass);
    else
      return std::make_pair(0U, &PPC::LRRCRegClass);
  }

  // Handle special cases of physical registers that are not properly handled
  // by the base class.
  if (Constraint[0] == '{' && Constraint[Constraint.size() - 1] == '}') {
    // If we name a VSX register, we can't defer to the base class because it
    // will not recognize the correct register (their names will be VSL{0-31}
    // and V{0-31} so they won't match). So we match them here.
    if (Constraint.size() > 3 && Constraint[1] == 'v' && Constraint[2] == 's') {
      int VSNum = atoi(Constraint.data() + 3);
      assert(VSNum >= 0 && VSNum <= 63 &&
             "Attempted to access a vsr out of range");
      if (VSNum < 32)
        return std::make_pair(PPC::VSL0 + VSNum, &PPC::VSRCRegClass);
      return std::make_pair(PPC::V0 + VSNum - 32, &PPC::VSRCRegClass);
    }

    // For float registers, we can't defer to the base class as it will match
    // the SPILLTOVSRRC class.
    if (Constraint.size() > 3 && Constraint[1] == 'f') {
      int RegNum = atoi(Constraint.data() + 2);
      if (RegNum > 31 || RegNum < 0)
        report_fatal_error("Invalid floating point register number");
      if (VT == MVT::f32 || VT == MVT::i32)
        return Subtarget.hasSPE()
                   ? std::make_pair(PPC::R0 + RegNum, &PPC::GPRCRegClass)
                   : std::make_pair(PPC::F0 + RegNum, &PPC::F4RCRegClass);
      if (VT == MVT::f64 || VT == MVT::i64)
        return Subtarget.hasSPE()
                   ? std::make_pair(PPC::S0 + RegNum, &PPC::SPERCRegClass)
                   : std::make_pair(PPC::F0 + RegNum, &PPC::F8RCRegClass);
    }
  }

  std::pair<unsigned, const TargetRegisterClass *> R =
      TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
  // register.
  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
  if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
      PPC::GPRCRegClass.contains(R.first))
    return std::make_pair(TRI->getMatchingSuperReg(R.first,
                            PPC::sub_32, &PPC::G8RCRegClass),
                          &PPC::G8RCRegClass);

  // GCC accepts 'cc' as an alias for 'cr0', and we need to do the same.
  if (!R.second && StringRef("{cc}").equals_insensitive(Constraint)) {
    R.first = PPC::CR0;
    R.second = &PPC::CRRCRegClass;
  }
  // FIXME: This warning should ideally be emitted in the front end.
  const auto &TM = getTargetMachine();
  if (Subtarget.isAIXABI() && !TM.getAIXExtendedAltivecABI()) {
    if (((R.first >= PPC::V20 && R.first <= PPC::V31) ||
         (R.first >= PPC::VF20 && R.first <= PPC::VF31)) &&
        (R.second == &PPC::VSRCRegClass || R.second == &PPC::VSFRCRegClass))
      errs() << "warning: vector registers 20 to 32 are reserved in the "
                "default AIX AltiVec ABI and cannot be used\n";
  }

  return R;
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints.
  if (Constraint.length() > 1) return;

  char Letter = Constraint[0];
  switch (Letter) {
  default: break;
  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
  case 'O':
  case 'P': {
    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
    if (!CST) return; // Must be an immediate to match.
    SDLoc dl(Op);
    int64_t Value = CST->getSExtValue();
    EVT TCVT = MVT::i64; // All constants taken to be 64 bits so that negative
                         // numbers are printed as such.
    switch (Letter) {
    default: llvm_unreachable("Unknown constraint letter!");
    case 'I': // "I" is a signed 16-bit constant.
      if (isInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'J': // "J" is a constant with only the high-order 16 bits nonzero.
      if (isShiftedUInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'L': // "L" is a signed 16-bit constant shifted left 16 bits.
      if (isShiftedInt<16, 16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'K': // "K" is a constant with only the low-order 16 bits nonzero.
      if (isUInt<16>(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'M': // "M" is a constant that is greater than 31.
      if (Value > 31)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'N': // "N" is a positive constant that is an exact power of two.
      if (Value > 0 && isPowerOf2_64(Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'O': // "O" is the constant zero.
      if (Value == 0)
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    case 'P': // "P" is a constant whose negation is a signed 16-bit constant.
      if (isInt<16>(-Value))
        Result = DAG.getTargetConstant(Value, dl, TCVT);
      break;
    }
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  // Handle standard constraint letters.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

void PPCTargetLowering::CollectTargetIntrinsicOperands(const CallInst &I,
                                              SmallVectorImpl<SDValue> &Ops,
                                              SelectionDAG &DAG) const {
  if (I.getNumOperands() <= 1)
    return;
  if (!isa<ConstantSDNode>(Ops[1].getNode()))
    return;
  auto IntrinsicID = cast<ConstantSDNode>(Ops[1].getNode())->getZExtValue();
  if (IntrinsicID != Intrinsic::ppc_tdw && IntrinsicID != Intrinsic::ppc_tw &&
      IntrinsicID != Intrinsic::ppc_trapd && IntrinsicID != Intrinsic::ppc_trap)
    return;

  if (I.hasMetadata("annotation")) {
    MDNode *MDN = I.getMetadata("annotation");
    Ops.push_back(DAG.getMDNode(MDN));
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
  // Vector type r+i form is supported since power9 as DQ form. We don't check
  // the offset matching DQ form requirement (off % 16 == 0), because on
  // PowerPC, the imm form is preferred and the offset can be adjusted to use
  // the imm form later in pass PPCLoopInstrFormPrep. Also, in LSR, for one
  // LSRUse, it uses the min and max offsets to check for a legal addressing
  // mode, so we should be a little aggressive here and also cover the other
  // offsets of that LSRUse.
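  // Illustrative examples of the checks below: "r+i" with a signed 16-bit
  // offset and "r+r" are legal; "r+r+i" is rejected, and a scale of 2 is
  // only accepted as plain "2*r" (i.e. treated as r+r).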
  if (Ty->isVectorTy() && AM.BaseOffs != 0 && !Subtarget.hasP9Vector())
    return false;

  // PPC allows a sign-extended 16-bit immediate field.
  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
    return false;

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // PPC only supports r+r:
  switch (AM.Scale) {
  case 0: // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default:
    // No other scales are supported.
    return false;
  }

  return true;
}

SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  auto PtrVT = getPointerTy(MF.getDataLayout());

  if (Depth > 0) {
    // The link register (return address) is saved in the caller's frame,
    // not the callee's stack frame. So we must get the caller's frame
    // address and load the return address at the LR offset from there.
    SDValue FrameAddr =
        DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                    LowerFRAMEADDR(Op, DAG), MachinePointerInfo());
    SDValue Offset =
        DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl,
                        isPPC64 ? MVT::i64 : MVT::i32);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  MFI.setFrameAddressIsTaken(true);

  EVT PtrVT = getPointerTy(MF.getDataLayout());
  bool isPPC64 = PtrVT == MVT::i64;

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction().hasFnAttribute(Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register PPCTargetLowering::getRegisterByName(const char *RegName, LLT VT,
                                              const MachineFunction &MF) const {
  bool isPPC64 = Subtarget.isPPC64();

  bool is64Bit = isPPC64 && VT == LLT::scalar(64);
  if (!is64Bit && VT != LLT::scalar(32))
    report_fatal_error("Invalid register global variable type");

  Register Reg = StringSwitch<Register>(RegName)
                     .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
                     .Case("r2", isPPC64 ? Register() : PPC::R2)
                     .Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
                     .Default(Register());

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}

bool PPCTargetLowering::isAccessedAsGotIndirect(SDValue GA) const {
  // The 32-bit SVR4 ABI accesses everything as got-indirect.
  if (Subtarget.is32BitELFABI())
    return true;

  // AIX accesses everything indirectly through the TOC, which is similar to
  // the GOT.
  if (Subtarget.isAIXABI())
    return true;

  CodeModel::Model CModel = getTargetMachine().getCodeModel();
  // If it is small or large code model, module locals are accessed
  // indirectly by loading their address from .toc/.got.
  if (CModel == CodeModel::Small || CModel == CodeModel::Large)
    return true;

  // JumpTable and BlockAddress are accessed as got-indirect.
  if (isa<JumpTableSDNode>(GA) || isa<BlockAddressSDNode>(GA))
    return true;

  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA))
    return Subtarget.isGVIndirectSymbol(G->getGlobal());

  return false;
}

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
  return false;
}

bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           MachineFunction &MF,
                                           unsigned Intrinsic) const {
  switch (Intrinsic) {
  case Intrinsic::ppc_atomicrmw_xchg_i128:
  case Intrinsic::ppc_atomicrmw_add_i128:
  case Intrinsic::ppc_atomicrmw_sub_i128:
  case Intrinsic::ppc_atomicrmw_nand_i128:
  case Intrinsic::ppc_atomicrmw_and_i128:
  case Intrinsic::ppc_atomicrmw_or_i128:
  case Intrinsic::ppc_atomicrmw_xor_i128:
  case Intrinsic::ppc_cmpxchg_i128:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
                 MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::ppc_atomic_load_i128:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::ppc_atomic_store_i128:
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::i128;
    Info.ptrVal = I.getArgOperand(2);
    Info.offset = 0;
    Info.align = Align(16);
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  case Intrinsic::ppc_altivec_lvx:
  case Intrinsic::ppc_altivec_lvxl:
  case Intrinsic::ppc_altivec_lvebx:
  case Intrinsic::ppc_altivec_lvehx:
  case Intrinsic::ppc_altivec_lvewx:
  case Intrinsic::ppc_vsx_lxvd2x:
  case Intrinsic::ppc_vsx_lxvw4x:
  case Intrinsic::ppc_vsx_lxvd2x_be:
  case Intrinsic::ppc_vsx_lxvw4x_be:
  case Intrinsic::ppc_vsx_lxvl:
  case Intrinsic::ppc_vsx_lxvll: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_lvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_lvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_lvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_lxvd2x:
    case Intrinsic::ppc_vsx_lxvd2x_be:
      VT = MVT::v2f64;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align(1);
    Info.flags = MachineMemOperand::MOLoad;
    return true;
  }
  case Intrinsic::ppc_altivec_stvx:
  case Intrinsic::ppc_altivec_stvxl:
  case Intrinsic::ppc_altivec_stvebx:
  case Intrinsic::ppc_altivec_stvehx:
  case Intrinsic::ppc_altivec_stvewx:
  case Intrinsic::ppc_vsx_stxvd2x:
  case Intrinsic::ppc_vsx_stxvw4x:
  case Intrinsic::ppc_vsx_stxvd2x_be:
  case Intrinsic::ppc_vsx_stxvw4x_be:
  case Intrinsic::ppc_vsx_stxvl:
  case Intrinsic::ppc_vsx_stxvll: {
    EVT VT;
    switch (Intrinsic) {
    case Intrinsic::ppc_altivec_stvebx:
      VT = MVT::i8;
      break;
    case Intrinsic::ppc_altivec_stvehx:
      VT = MVT::i16;
      break;
    case Intrinsic::ppc_altivec_stvewx:
      VT = MVT::i32;
      break;
    case Intrinsic::ppc_vsx_stxvd2x:
    case Intrinsic::ppc_vsx_stxvd2x_be:
      VT = MVT::v2f64;
      break;
    default:
      VT = MVT::v4i32;
      break;
    }

    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(1);
    Info.offset = -VT.getStoreSize()+1;
    Info.size = 2*VT.getStoreSize()-1;
    Info.align = Align(1);
    Info.flags = MachineMemOperand::MOStore;
    return true;
  }
  case Intrinsic::ppc_stdcx:
  case Intrinsic::ppc_stwcx:
  case Intrinsic::ppc_sthcx:
  case Intrinsic::ppc_stbcx: {
    EVT VT;
    auto Alignment = Align(8);
    switch (Intrinsic) {
    case Intrinsic::ppc_stdcx:
      VT = MVT::i64;
      break;
    case Intrinsic::ppc_stwcx:
      VT = MVT::i32;
      Alignment = Align(4);
      break;
    case Intrinsic::ppc_sthcx:
      VT = MVT::i16;
      Alignment = Align(2);
      break;
    case Intrinsic::ppc_stbcx:
      VT = MVT::i8;
      Alignment = Align(1);
      break;
    }
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = VT;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.align = Alignment;
    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
    return true;
  }
  default:
    break;
  }

  return false;
}

/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
    const MemOp &Op, const AttributeList &FuncAttributes) const {
  if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
    // We should use Altivec/VSX loads and stores when available. For unaligned
    // addresses, unaligned VSX loads are only fast starting with the P8.
    if (Subtarget.hasAltivec() && Op.size() >= 16 &&
        (Op.isAligned(Align(16)) ||
         ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
      return MVT::v4i32;
  }

  if (Subtarget.isPPC64()) {
    return MVT::i64;
  }

  return MVT::i32;
}

/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  return !(BitSize == 0 || BitSize > 64);
}

bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  // Generally speaking, zexts are not free, but they are free when they can be
  // folded with other operations.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Val)) {
    EVT MemVT = LD->getMemoryVT();
    if ((MemVT == MVT::i1 || MemVT == MVT::i8 || MemVT == MVT::i16 ||
         (Subtarget.isPPC64() && MemVT == MVT::i32)) &&
        (LD->getExtensionType() == ISD::NON_EXTLOAD ||
         LD->getExtensionType() == ISD::ZEXTLOAD))
      return true;
  }

  // FIXME: Add other cases...
  //  - 32-bit shifts with a zext to i64
  //  - zext after ctlz, bswap, etc.
  //  - zext after and by a constant mask

  return TargetLowering::isZExtFree(Val, VT2);
}

bool PPCTargetLowering::isFPExtFree(EVT DestVT, EVT SrcVT) const {
  assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
         "invalid fpext types");
  // Extending to float128 is not free.
  if (DestVT == MVT::f128)
    return false;
  return true;
}

bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, Align,
                                                       MachineMemOperand::Flags,
                                                       unsigned *Fast) const {
  if (DisablePPCUnaligned)
    return false;

  // PowerPC supports unaligned memory access for simple non-vector types.
  // Although accessing unaligned addresses is not as efficient as accessing
  // aligned addresses, it is generally more efficient than manual expansion,
  // and generally only traps for software emulation when crossing page
  // boundaries.

  if (!VT.isSimple())
    return false;

  if (VT.isFloatingPoint() && !VT.isVector() &&
      !Subtarget.allowsUnalignedFPAccess())
    return false;

  if (VT.getSimpleVT().isVector()) {
    if (Subtarget.hasVSX()) {
      if (VT != MVT::v2f64 && VT != MVT::v2i64 &&
          VT != MVT::v4f32 && VT != MVT::v4i32)
        return false;
    } else {
      return false;
    }
  }

  if (VT == MVT::ppcf128)
    return false;

  if (Fast)
    *Fast = 1;

  return true;
}

bool PPCTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
                                               SDValue C) const {
  // Check integral scalar types.
  if (!VT.isScalarInteger())
    return false;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
    if (!ConstNode->getAPIntValue().isSignedIntN(64))
      return false;
    // This transformation will generate >= 2 operations. But the following
    // cases will generate <= 2 instructions during ISEL. So exclude them.
    // 1. If the constant multiplier fits 16 bits, it can be handled by one
    //    HW instruction, i.e. MULLI.
    // 2. If the multiplier after the shift fits 16 bits, only one extra shift
    //    instruction is needed beyond case 1, i.e. MULLI and RLDICR.
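    // For example (illustrative): 65537 = 2^16 + 1 does not fit in 16 bits
    // even after stripping trailing zeros, but UImm - 1 is a power of two,
    // so we return true and (mul X, 65537) can become (add (shl X, 16), X).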
    int64_t Imm = ConstNode->getSExtValue();
    unsigned Shift = llvm::countr_zero<uint64_t>(Imm);
    Imm >>= Shift;
    if (isInt<16>(Imm))
      return false;
    uint64_t UImm = static_cast<uint64_t>(Imm);
    if (isPowerOf2_64(UImm + 1) || isPowerOf2_64(UImm - 1) ||
        isPowerOf2_64(1 - UImm) || isPowerOf2_64(-1 - UImm))
      return true;
  }
  return false;
}

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                   EVT VT) const {
  return isFMAFasterThanFMulAndFAdd(
      MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
}

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
                                                   Type *Ty) const {
  if (Subtarget.hasSPE())
    return false;
  switch (Ty->getScalarType()->getTypeID()) {
  case Type::FloatTyID:
  case Type::DoubleTyID:
    return true;
  case Type::FP128TyID:
    return Subtarget.hasP9Vector();
  default:
    return false;
  }
}

// FIXME: add more patterns which are not profitable to hoist.
bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
  if (!I->hasOneUse())
    return true;

  Instruction *User = I->user_back();
  assert(User && "A single use instruction with no uses.");

  switch (I->getOpcode()) {
  case Instruction::FMul: {
    // Don't break FMA; PowerPC prefers FMA.
    if (User->getOpcode() != Instruction::FSub &&
        User->getOpcode() != Instruction::FAdd)
      return true;

    const TargetOptions &Options = getTargetMachine().Options;
    const Function *F = I->getFunction();
    const DataLayout &DL = F->getParent()->getDataLayout();
    Type *Ty = User->getOperand(0)->getType();

    return !(
        isFMAFasterThanFMulAndFAdd(*F, Ty) &&
        isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
        (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
  }
  case Instruction::Load: {
    // Don't break the "store (load float*)" pattern; it will be combined
    // to "store (load int32)" in a later InstCombine pass. See function
    // combineLoadToOperationType. On PowerPC, loading a floating-point value
    // takes more cycles than loading a 32-bit integer.
    LoadInst *LI = cast<LoadInst>(I);
    // For loads where combineLoadToOperationType does nothing, like an
    // ordered load, it should be profitable to hoist them.
    // A swifterror load can only be used for pointer-to-pointer types, so
    // the later type check should get rid of this case.
    if (!LI->isUnordered())
      return true;

    if (User->getOpcode() != Instruction::Store)
      return true;

    if (I->getType()->getTypeID() != Type::FloatTyID)
      return true;

    return false;
  }
  default:
    return true;
  }
}
const MCPhysReg *
PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints. The same reasoning applies
  // to CTR, which is used by any indirect call.
  static const MCPhysReg ScratchRegs[] = {
    PPC::X12, PPC::LR8, PPC::CTR8, 0
  };

  return ScratchRegs;
}

Register PPCTargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}

Register PPCTargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}

bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
    EVT VT, unsigned DefinedValues) const {
  if (VT == MVT::v2i64)
    return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves

  if (Subtarget.hasVSX())
    return true;

  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}

Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  if (DisableILPPref || Subtarget.enableMachineScheduler())
    return TargetLowering::getSchedulingPreference(N);

  return Sched::ILP;
}

// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) const {
  return PPC::createFastISel(FuncInfo, LibInfo);
}

// 'Inverted' means the FMA opcode after negating one multiplicand.
// For example, (fma -a b c) = (fnmsub a b c)
static unsigned invertFMAOpcode(unsigned Opc) {
  switch (Opc) {
  default:
    llvm_unreachable("Invalid FMA opcode for PowerPC!");
  case ISD::FMA:
    return PPCISD::FNMSUB;
  case PPCISD::FNMSUB:
    return ISD::FMA;
  }
}

SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                                bool LegalOps, bool OptForSize,
                                                NegatibleCost &Cost,
                                                unsigned Depth) const {
  if (Depth > SelectionDAG::MaxRecursionDepth)
    return SDValue();

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  SDNodeFlags Flags = Op.getNode()->getFlags();

  switch (Opc) {
  case PPCISD::FNMSUB:
    if (!Op.hasOneUse() || !isTypeLegal(VT))
      break;

    const TargetOptions &Options = getTargetMachine().Options;
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    SDValue N2 = Op.getOperand(2);
    SDLoc Loc(Op);

    NegatibleCost N2Cost = NegatibleCost::Expensive;
    SDValue NegN2 =
        getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);

    if (!NegN2)
      return SDValue();

    // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
    // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
    // These transformations may change the sign of zeroes. For example,
    // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
    if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
      // Try and choose the cheaper one to negate.
      NegatibleCost N0Cost = NegatibleCost::Expensive;
      SDValue NegN0 =
          getNegatedExpression(N0, DAG, LegalOps, OptForSize, N0Cost, Depth + 1);

      NegatibleCost N1Cost = NegatibleCost::Expensive;
      SDValue NegN1 =
          getNegatedExpression(N1, DAG, LegalOps, OptForSize, N1Cost, Depth + 1);

      if (NegN0 && N0Cost <= N1Cost) {
        Cost = std::min(N0Cost, N2Cost);
        return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
      } else if (NegN1) {
        Cost = std::min(N1Cost, N2Cost);
        return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
      }
    }

    // (fneg (fnmsub a b c)) => (fma a b (fneg c))
    if (isOperationLegal(ISD::FMA, VT)) {
      Cost = N2Cost;
      return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
    }

    break;
  }

  return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
                                              Cost, Depth);
}

// Override to enable LOAD_STACK_GUARD lowering on Linux.
bool PPCTargetLowering::useLoadStackGuardNode() const {
  if (!Subtarget.isTargetLinux())
    return TargetLowering::useLoadStackGuardNode();
  return true;
}

// Override to disable global variable loading on Linux and insert AIX canary
// word declaration.
void PPCTargetLowering::insertSSPDeclarations(Module &M) const {
  if (Subtarget.isAIXABI()) {
    M.getOrInsertGlobal(AIXSSPCanaryWordName,
                        Type::getInt8PtrTy(M.getContext()));
    return;
  }
  if (!Subtarget.isTargetLinux())
    return TargetLowering::insertSSPDeclarations(M);
}

Value *PPCTargetLowering::getSDagStackGuard(const Module &M) const {
  if (Subtarget.isAIXABI())
    return M.getGlobalVariable(AIXSSPCanaryWordName);
  return TargetLowering::getSDagStackGuard(M);
}

bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
                                     bool ForCodeSize) const {
  if (!VT.isSimple() || !Subtarget.hasVSX())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    // For FP types that are currently not supported by the PPC backend, return
    // false. Examples: f16, f80.
    return false;
  case MVT::f32:
  case MVT::f64: {
    if (Subtarget.hasPrefixInstrs()) {
      // We can materialize all immediates via XXSPLTI32DX and XXSPLTIDP.
      return true;
    }
    bool IsExact;
    APSInt IntResult(16, false);
    // The rounding mode doesn't really matter because we only care about floats
    // that can be converted to integers exactly.
    Imm.convertToInteger(IntResult, APFloat::rmTowardZero, &IsExact);
    // For exact values in the range [-16, 15] we can materialize the float.
    if (IsExact && IntResult <= 15 && IntResult >= -16)
      return true;
    return Imm.isZero();
  }
  case MVT::ppcf128:
    return Imm.isPosZero();
  }
}

// For vector shift operation op, fold
// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y)
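// For example, (srl v4i32:x, (and v4i32:y, 31)) can drop the AND and become
// (PPCISD::SRL x, y): masking with numbits(x) - 1 is redundant because the
// target shift node only uses that many low bits of the shift amount anyway.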
static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
                                  SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();
  unsigned Opcode = N->getOpcode();
  unsigned TargetOpcode;

  switch (Opcode) {
  default:
    llvm_unreachable("Unexpected shift operation");
  case ISD::SHL:
    TargetOpcode = PPCISD::SHL;
    break;
  case ISD::SRL:
    TargetOpcode = PPCISD::SRL;
    break;
  case ISD::SRA:
    TargetOpcode = PPCISD::SRA;
    break;
  }

  if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) &&
      N1->getOpcode() == ISD::AND)
    if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1)))
      if (Mask->getZExtValue() == OpSizeInBits - 1)
        return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0));

  return SDValue();
}

SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;

  SDValue N0 = N->getOperand(0);
  ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!Subtarget.isISA3_0() || !Subtarget.isPPC64() ||
      N0.getOpcode() != ISD::SIGN_EXTEND ||
      N0.getOperand(0).getValueType() != MVT::i32 || CN1 == nullptr ||
      N->getValueType(0) != MVT::i64)
    return SDValue();

  // We can't save an operation here if the value is already extended, and
  // the existing shift is easier to combine.
  SDValue ExtsSrc = N0.getOperand(0);
  if (ExtsSrc.getOpcode() == ISD::TRUNCATE &&
      ExtsSrc.getOperand(0).getOpcode() == ISD::AssertSext)
    return SDValue();

  SDLoc DL(N0);
  SDValue ShiftBy = SDValue(CN1, 0);
  // We want the shift amount to be i32 on the extswli, but the shift could
  // have an i64 shift amount.
  if (ShiftBy.getValueType() == MVT::i64)
    ShiftBy = DCI.DAG.getConstant(CN1->getZExtValue(), DL, MVT::i32);

  return DCI.DAG.getNode(PPCISD::EXTSWSLI, DL, MVT::i64, N0->getOperand(0),
                         ShiftBy);
}

SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;
  return SDValue();
}

SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
    return Value;
  return SDValue();
}

// Transform (add X, (zext(setne Z, C))) -> (addze X, (addic (addi Z, -C), -1))
// Transform (add X, (zext(sete  Z, C))) -> (addze X, (subfic (addi Z, -C), 0))
// When C is zero, the equation (addi Z, -C) can be simplified to Z.
// Requirement: -C in [-32768, 32767], X and Z are MVT::i64 types.
static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
                                 const PPCSubtarget &Subtarget) {
  if (!Subtarget.isPPC64())
    return SDValue();

  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  auto isZextOfCompareWithConstant = [](SDValue Op) {
    if (Op.getOpcode() != ISD::ZERO_EXTEND || !Op.hasOneUse() ||
        Op.getValueType() != MVT::i64)
      return false;

    SDValue Cmp = Op.getOperand(0);
    if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse() ||
        Cmp.getOperand(0).getValueType() != MVT::i64)
      return false;

    if (auto *Constant = dyn_cast<ConstantSDNode>(Cmp.getOperand(1))) {
      int64_t NegConstant = 0 - Constant->getSExtValue();
      // Due to the limitations of the addi instruction,
      // -C is required to be [-32768, 32767].
      return isInt<16>(NegConstant);
    }

    return false;
  };

  bool LHSHasPattern = isZextOfCompareWithConstant(LHS);
  bool RHSHasPattern = isZextOfCompareWithConstant(RHS);

  // If there is a pattern, canonicalize a zext operand to the RHS.
  if (LHSHasPattern && !RHSHasPattern)
    std::swap(LHS, RHS);
  else if (!LHSHasPattern && !RHSHasPattern)
    return SDValue();

  SDLoc DL(N);
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue);
  SDValue Cmp = RHS.getOperand(0);
  SDValue Z = Cmp.getOperand(0);
  auto *Constant = cast<ConstantSDNode>(Cmp.getOperand(1));
  int64_t NegConstant = 0 - Constant->getSExtValue();

  switch (cast<CondCodeSDNode>(Cmp.getOperand(2))->get()) {
  default:
    break;
  case ISD::SETNE: {
    //                                 when C == 0
    //                             --> addze X, (addic Z, -1).carry
    //                            /
    // add X, (zext(setne Z, C))--
    //                            \    when -32768 <= -C <= 32767 && C != 0
    //                             --> addze X, (addic (addi Z, -C), -1).carry
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
                              DAG.getConstant(NegConstant, DL, MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               AddOrZ, DAG.getConstant(-1ULL, DL, MVT::i64));
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Addc.getNode(), 1));
  }
  case ISD::SETEQ: {
    //                                 when C == 0
    //                             --> addze X, (subfic Z, 0).carry
    //                            /
    // add X, (zext(sete Z, C))--
    //                            \    when -32768 <= -C <= 32767 && C != 0
    //                             --> addze X, (subfic (addi Z, -C), 0).carry
    SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z,
                              DAG.getConstant(NegConstant, DL, MVT::i64));
    SDValue AddOrZ = NegConstant != 0 ? Add : Z;
    SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue),
                               DAG.getConstant(0, DL, MVT::i64), AddOrZ);
    return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64),
                       SDValue(Subc.getNode(), 1));
  }
  }

  return SDValue();
}

// Transform
// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
// In this case both C1 and C2 must be known constants.
// C1+C2 must fit into a 34-bit signed integer.
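// For example, (add 16, (MAT_PCREL_ADDR foo+4)) folds to
// (MAT_PCREL_ADDR foo+20), avoiding a separate materialization of the 16.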
static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
                                          const PPCSubtarget &Subtarget) {
  if (!Subtarget.isUsingPCRelativeCalls())
    return SDValue();

  // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
  // If we find that node try to cast the Global Address and the Constant.
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
    std::swap(LHS, RHS);

  if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
    return SDValue();

  // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
  GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
  ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(RHS);

  // Check that both casts succeeded.
  if (!GSDN || !ConstNode)
    return SDValue();

  int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
  SDLoc DL(GSDN);

  // The signed int offset needs to fit in 34 bits.
  if (!isInt<34>(NewOffset))
    return SDValue();

  // The new global address is a copy of the old global address except
  // that it has the updated Offset.
  SDValue GA =
      DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
                                 NewOffset, GSDN->getTargetFlags());
  SDValue MatPCRel =
      DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
  return MatPCRel;
}

SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
  if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
    return Value;
  if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
    return Value;
  return SDValue();
}

// Detect TRUNCATE operations on bitcasts of float128 values.
// What we are looking for here is the situation where we extract a subset
// of bits from a 128-bit float.
// This can be of two forms:
// 1) BITCAST of f128 feeding TRUNCATE
// 2) BITCAST of f128 feeding SRL (a shift) feeding TRUNCATE
// The reason this is required is because we do not have a legal i128 type
// and so we want to prevent having to store the f128 and then reload part
// of it.
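// For example, on little-endian targets, (trunc (srl (bitcast f128 %x to
// i128), 64) to i64) extracts the high half of %x and becomes an
// EXTRACT_VECTOR_ELT of element 1 of (bitcast %x to v2i64), with no
// store/reload of the f128.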
SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  // If we are using CRBits then try that first.
  if (Subtarget.useCRBits()) {
    // Check if CRBits did anything and return that if it did.
    if (SDValue CRTruncValue = DAGCombineTruncBoolExt(N, DCI))
      return CRTruncValue;
  }

  SDLoc dl(N);
  SDValue Op0 = N->getOperand(0);

  // Looking for a truncate of i128 to i64.
  if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
    return SDValue();

  int EltToExtract = DCI.DAG.getDataLayout().isBigEndian() ? 1 : 0;

  // SRL feeding TRUNCATE.
  if (Op0.getOpcode() == ISD::SRL) {
    ConstantSDNode *ConstNode = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
    // The right shift has to be by 64 bits.
    if (!ConstNode || ConstNode->getZExtValue() != 64)
      return SDValue();

    // Switch the element number to extract.
    EltToExtract = EltToExtract ? 0 : 1;
    // Update Op0 past the SRL.
    Op0 = Op0.getOperand(0);
  }

  // BITCAST feeding a TRUNCATE possibly via SRL.
  if (Op0.getOpcode() == ISD::BITCAST &&
      Op0.getValueType() == MVT::i128 &&
      Op0.getOperand(0).getValueType() == MVT::f128) {
    SDValue Bitcast = DCI.DAG.getBitcast(MVT::v2i64, Op0.getOperand(0));
    return DCI.DAG.getNode(
        ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Bitcast,
        DCI.DAG.getTargetConstant(EltToExtract, dl, MVT::i32));
  }
  return SDValue();
}

SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  ConstantSDNode *ConstOpOrElement = isConstOrConstSplat(N->getOperand(1));
  if (!ConstOpOrElement)
    return SDValue();

  // An imul is usually smaller than the alternative sequence for legal type.
  if (DAG.getMachineFunction().getFunction().hasMinSize() &&
      isOperationLegal(ISD::MUL, N->getValueType(0)))
    return SDValue();

  auto IsProfitable = [this](bool IsNeg, bool IsAddOne, EVT VT) -> bool {
    switch (this->Subtarget.getCPUDirective()) {
    default:
      // TODO: enhance the condition for subtarget before pwr8
      return false;
    case PPC::DIR_PWR8:
      //  type        mul     add    shl
      //  scalar        4       1      1
      //  vector        7       2      2
      return true;
    case PPC::DIR_PWR9:
    case PPC::DIR_PWR10:
    case PPC::DIR_PWR_FUTURE:
      //  type        mul     add    shl
      //  scalar        5       2      2
      //  vector        7       2      2

      // The cycle ratios of the relevant operations are shown in the table
      // above. Because mul costs 5 (scalar) / 7 (vector) cycles while
      // add/sub/shl all cost 2 for both scalar and vector types, a
      // 2-instruction pattern (add/sub + shl, total 4) is always profitable;
      // but a 3-instruction pattern such as
      // (mul x, -(2^N + 1)) => -(add (shl x, N), x) (sub + add + shl, total 6)
      // is only profitable for vector types.
      return IsAddOne && IsNeg ? VT.isVector() : true;
    }
  };

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  const APInt &MulAmt = ConstOpOrElement->getAPIntValue();
  bool IsNeg = MulAmt.isNegative();
  APInt MulAmtAbs = MulAmt.abs();

  if ((MulAmtAbs - 1).isPowerOf2()) {
    // (mul x, 2^N + 1) => (add (shl x, N), x)
    // (mul x, -(2^N + 1)) => -(add (shl x, N), x)
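    // For example, (mul x, 9) => (add (shl x, 3), x), since 9 - 1 = 8 = 2^3.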
    if (!IsProfitable(IsNeg, true, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(0);
    SDValue Op1 =
        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                    DAG.getConstant((MulAmtAbs - 1).logBase2(), DL, VT));
    SDValue Res = DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);

    if (!IsNeg)
      return Res;

    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
  } else if ((MulAmtAbs + 1).isPowerOf2()) {
    // (mul x, 2^N - 1) => (sub (shl x, N), x)
    // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
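    // For example, (mul x, 7) => (sub (shl x, 3), x), since 7 + 1 = 8 = 2^3.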
    if (!IsProfitable(IsNeg, false, VT))
      return SDValue();

    SDValue Op0 = N->getOperand(0);
    SDValue Op1 =
        DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                    DAG.getConstant((MulAmtAbs + 1).logBase2(), DL, VT));

    if (!IsNeg)
      return DAG.getNode(ISD::SUB, DL, VT, Op1, Op0);
    else
      return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
  } else {
    return SDValue();
  }
}

// Combine fma-like op (like fnmsub) with fnegs to the appropriate op. Do this
// in the combiner since we need to check SD flags and other subtarget features.
SDValue PPCTargetLowering::combineFMALike(SDNode *N,
                                          DAGCombinerInfo &DCI) const {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  SDNodeFlags Flags = N->getFlags();
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  const TargetOptions &Options = getTargetMachine().Options;
  unsigned Opc = N->getOpcode();
  bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
  bool LegalOps = !DCI.isBeforeLegalizeOps();
  SDLoc Loc(N);

  if (!isOperationLegal(ISD::FMA, VT))
    return SDValue();

  // Allowing the transformation to FNMSUB may change the sign of zeroes when
  // ab-c=0, since (fnmsub a b c)=-0 while c-ab=+0.
  if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
    return SDValue();

  // (fma (fneg a) b c) => (fnmsub a b c)
  // (fnmsub (fneg a) b c) => (fma a b c)
  if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);

  // (fma a (fneg b) c) => (fnmsub a b c)
  // (fnmsub a (fneg b) c) => (fma a b c)
  if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
    return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);

  return SDValue();
}

bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  // Only duplicate to increase tail-calls for the 64-bit SysV ABIs.
  if (!Subtarget.is64BitELFABI())
    return false;

  // If not a tail call then no need to proceed.
  if (!CI->isTailCall())
    return false;

  // If sibling calls have been disabled and tail-calls aren't guaranteed
  // there is no reason to duplicate.
  auto &TM = getTargetMachine();
  if (!TM.Options.GuaranteedTailCallOpt && DisableSCO)
    return false;

  // Can't tail call a function called indirectly, or if it has variadic args.
  const Function *Callee = CI->getCalledFunction();
  if (!Callee || Callee->isVarArg())
    return false;

  // Make sure the callee and caller calling conventions are eligible for TCO.
  const Function *Caller = CI->getParent()->getParent();
  if (!areCallingConvEligibleForTCO_64SVR4(Caller->getCallingConv(),
                                           CI->getCallingConv()))
    return false;

  // If the function is local then we have a good chance at tail-calling it.
  return getTargetMachine().shouldAssumeDSOLocal(*Caller->getParent(), Callee);
}

bool PPCTargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  const Value *Mask = AndI.getOperand(1);
  // If the mask is suitable for andi. or andis. we should sink the and.
  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Mask)) {
    // Can't handle constants wider than 64 bits.
    if (CI->getBitWidth() > 64)
      return false;
    int64_t ConstVal = CI->getZExtValue();
    return isUInt<16>(ConstVal) ||
           (isUInt<16>(ConstVal >> 16) && !(ConstVal & 0xFFFF));
  }
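  // For example, 0x0000FFFF can be checked with andi. and 0xFFFF0000 with
  // andis., while 0x00FF00FF fits neither pattern and is rejected above.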

  // For non-constant masks, we can always use the record-form and.
  return true;
}

/// getAddrModeForFlags - Based on the set of address flags, select the
/// optimal instruction format to match by.
PPC::AddrMode PPCTargetLowering::getAddrModeForFlags(unsigned Flags) const {
  // This is not a node we should be handling here.
  if (Flags == PPC::MOF_None)
    return PPC::AM_None;
  // Unaligned D-Forms are tried first, followed by the aligned D-Forms.
  for (auto FlagSet : AddrModesMap.at(PPC::AM_DForm))
    if ((Flags & FlagSet) == FlagSet)
      return PPC::AM_DForm;
  for (auto FlagSet : AddrModesMap.at(PPC::AM_DSForm))
    if ((Flags & FlagSet) == FlagSet)
      return PPC::AM_DSForm;
  for (auto FlagSet : AddrModesMap.at(PPC::AM_DQForm))
    if ((Flags & FlagSet) == FlagSet)
      return PPC::AM_DQForm;
  for (auto FlagSet : AddrModesMap.at(PPC::AM_PrefixDForm))
    if ((Flags & FlagSet) == FlagSet)
      return PPC::AM_PrefixDForm;
  // If no other forms are selected, return an X-Form as it is the most
  // general addressing mode.
  return PPC::AM_XForm;
}

/// Set alignment flags based on whether or not the Frame Index is aligned.
/// Utilized when computing flags for address computation when selecting
/// load and store instructions.
static void setAlignFlagsForFI(SDValue N, unsigned &FlagSet,
                               SelectionDAG &DAG) {
  bool IsAdd = ((N.getOpcode() == ISD::ADD) || (N.getOpcode() == ISD::OR));
  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(IsAdd ? N.getOperand(0) : N);
  if (!FI)
    return;
  const MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  unsigned FrameIndexAlign = MFI.getObjectAlign(FI->getIndex()).value();
  // If this is (add $FI, $S16Imm), the alignment flags are already set
  // based on the immediate. We just need to clear the alignment flags
  // if the FI alignment is weaker.
  if ((FrameIndexAlign % 4) != 0)
    FlagSet &= ~PPC::MOF_RPlusSImm16Mult4;
  if ((FrameIndexAlign % 16) != 0)
    FlagSet &= ~PPC::MOF_RPlusSImm16Mult16;
  // If the address is a plain FrameIndex, set alignment flags based on
  // the FI alignment.
  if (!IsAdd) {
    if ((FrameIndexAlign % 4) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult4;
    if ((FrameIndexAlign % 16) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult16;
  }
}

/// Given a node, compute flags that are used for address computation when
/// selecting load and store instructions. The flags computed are stored in
/// FlagSet. This function takes into account whether the node is a constant,
/// an ADD or an OR, or something else, and computes the address flags
/// accordingly.
static void computeFlagsForAddressComputation(SDValue N, unsigned &FlagSet,
                                              SelectionDAG &DAG) {
  // Set the alignment flags for the node depending on if the node is
  // 4-byte or 16-byte aligned.
  auto SetAlignFlagsForImm = [&](uint64_t Imm) {
    if ((Imm & 0x3) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult4;
    if ((Imm & 0xf) == 0)
      FlagSet |= PPC::MOF_RPlusSImm16Mult16;
  };
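  // For example, an immediate of 48 sets both the multiple-of-4 and the
  // multiple-of-16 flags, while 20 sets only the multiple-of-4 flag.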

  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // All 32-bit constants can be computed as LIS + Disp.
    const APInt &ConstImm = CN->getAPIntValue();
    if (ConstImm.isSignedIntN(32)) { // Flag to handle 32-bit constants.
      FlagSet |= PPC::MOF_AddrIsSImm32;
      SetAlignFlagsForImm(ConstImm.getZExtValue());
      setAlignFlagsForFI(N, FlagSet, DAG);
    }
    if (ConstImm.isSignedIntN(34)) // Flag to handle 34-bit constants.
      FlagSet |= PPC::MOF_RPlusSImm34;
    else // Let constant materialization handle large constants.
      FlagSet |= PPC::MOF_NotAddNorCst;
  } else if (N.getOpcode() == ISD::ADD || provablyDisjointOr(DAG, N)) {
    // This address can be represented as an addition of:
    // - Register + Imm16 (possibly a multiple of 4/16)
    // - Register + Imm34
    // - Register + PPCISD::Lo
    // - Register + Register
    // In any case, we won't have to match this as Base + Zero.
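    // For example, (add x, 20) sets MOF_RPlusSImm16 and MOF_RPlusSImm16Mult4
    // below (20 is a multiple of 4 but not of 16), plus MOF_RPlusSImm34
    // since 20 also fits in 34 bits.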
    SDValue RHS = N.getOperand(1);
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(RHS)) {
      const APInt &ConstImm = CN->getAPIntValue();
      if (ConstImm.isSignedIntN(16)) {
        FlagSet |= PPC::MOF_RPlusSImm16; // Signed 16-bit immediates.
        SetAlignFlagsForImm(ConstImm.getZExtValue());
        setAlignFlagsForFI(N, FlagSet, DAG);
      }
      if (ConstImm.isSignedIntN(34))
        FlagSet |= PPC::MOF_RPlusSImm34; // Signed 34-bit immediates.
      else
        FlagSet |= PPC::MOF_RPlusR; // Register.
    } else if (RHS.getOpcode() == PPCISD::Lo &&
               !cast<ConstantSDNode>(RHS.getOperand(1))->getZExtValue())
      FlagSet |= PPC::MOF_RPlusLo; // PPCISD::Lo.
    else
      FlagSet |= PPC::MOF_RPlusR;
  } else { // The address computation is not a constant or an addition.
    setAlignFlagsForFI(N, FlagSet, DAG);
    FlagSet |= PPC::MOF_NotAddNorCst;
  }
}

static bool isPCRelNode(SDValue N) {
  return (N.getOpcode() == PPCISD::MAT_PCREL_ADDR ||
          isValidPCRelNode<ConstantPoolSDNode>(N) ||
          isValidPCRelNode<GlobalAddressSDNode>(N) ||
          isValidPCRelNode<JumpTableSDNode>(N) ||
          isValidPCRelNode<BlockAddressSDNode>(N));
}

/// computeMOFlags - Given a node N and its Parent (a MemSDNode), compute
/// the address flags of the load/store instruction that is to be matched.
unsigned PPCTargetLowering::computeMOFlags(const SDNode *Parent, SDValue N,
                                           SelectionDAG &DAG) const {
  unsigned FlagSet = PPC::MOF_None;

  // Compute subtarget flags.
  if (!Subtarget.hasP9Vector())
    FlagSet |= PPC::MOF_SubtargetBeforeP9;
  else {
    FlagSet |= PPC::MOF_SubtargetP9;
    if (Subtarget.hasPrefixInstrs())
      FlagSet |= PPC::MOF_SubtargetP10;
  }
  if (Subtarget.hasSPE())
    FlagSet |= PPC::MOF_SubtargetSPE;

  // Check if we have a PCRel node and return early.
  if ((FlagSet & PPC::MOF_SubtargetP10) && isPCRelNode(N))
    return FlagSet;

  // If the node is the paired load/store intrinsics, compute flags for
  // address computation and return early.
  unsigned ParentOp = Parent->getOpcode();
  if (Subtarget.isISA3_1() && ((ParentOp == ISD::INTRINSIC_W_CHAIN) ||
                               (ParentOp == ISD::INTRINSIC_VOID))) {
    unsigned ID = cast<ConstantSDNode>(Parent->getOperand(1))->getZExtValue();
    if ((ID == Intrinsic::ppc_vsx_lxvp) || (ID == Intrinsic::ppc_vsx_stxvp)) {
      SDValue IntrinOp = (ID == Intrinsic::ppc_vsx_lxvp)
                             ? Parent->getOperand(2)
                             : Parent->getOperand(3);
      computeFlagsForAddressComputation(IntrinOp, FlagSet, DAG);
      FlagSet |= PPC::MOF_Vector;
      return FlagSet;
    }
  }

  // Mark this as something we don't want to handle here if it is atomic
  // or a pre-increment instruction.
  if (const LSBaseSDNode *LSB = dyn_cast<LSBaseSDNode>(Parent))
    if (LSB->isIndexed())
      return PPC::MOF_None;

  // Compute in-memory type flags. This is based on if there are scalars,
  // floats or vectors.
  const MemSDNode *MN = dyn_cast<MemSDNode>(Parent);
  assert(MN && "Parent should be a MemSDNode!");
  EVT MemVT = MN->getMemoryVT();
  unsigned Size = MemVT.getSizeInBits();
  if (MemVT.isScalarInteger()) {
    assert(Size <= 128 &&
           "Not expecting scalar integers larger than 16 bytes!");
    if (Size < 32)
      FlagSet |= PPC::MOF_SubWordInt;
    else if (Size == 32)
      FlagSet |= PPC::MOF_WordInt;
    else
      FlagSet |= PPC::MOF_DoubleWordInt;
  } else if (MemVT.isVector() && !MemVT.isFloatingPoint()) { // Integer vectors.
    if (Size == 128)
      FlagSet |= PPC::MOF_Vector;
    else if (Size == 256) {
      assert(Subtarget.pairedVectorMemops() &&
             "256-bit vectors are only available when paired vector memops is "
             "enabled!");
      FlagSet |= PPC::MOF_Vector;
    } else
      llvm_unreachable("Not expecting illegal vectors!");
  } else { // Floating point type: can be scalar, f128 or vector types.
    if (Size == 32 || Size == 64)
      FlagSet |= PPC::MOF_ScalarFloat;
    else if (MemVT == MVT::f128 || MemVT.isVector())
      FlagSet |= PPC::MOF_Vector;
    else
      llvm_unreachable("Not expecting illegal scalar floats!");
  }

  // Compute flags for address computation.
  computeFlagsForAddressComputation(N, FlagSet, DAG);

  // Compute type extension flags.
  if (const LoadSDNode *LN = dyn_cast<LoadSDNode>(Parent)) {
    switch (LN->getExtensionType()) {
    case ISD::SEXTLOAD:
      FlagSet |= PPC::MOF_SExt;
      break;
    case ISD::EXTLOAD:
    case ISD::ZEXTLOAD:
      FlagSet |= PPC::MOF_ZExt;
      break;
    case ISD::NON_EXTLOAD:
      FlagSet |= PPC::MOF_NoExt;
      break;
    }
  } else
    FlagSet |= PPC::MOF_NoExt;

  // For integers, no extension is the same as zero extension.
  // We set the extension mode to zero extension so we don't have
  // to add separate entries in AddrModesMap for loads and stores.
  if (MemVT.isScalarInteger() && (FlagSet & PPC::MOF_NoExt)) {
    FlagSet |= PPC::MOF_ZExt;
    FlagSet &= ~PPC::MOF_NoExt;
  }

  // If we don't have prefixed instructions, 34-bit constants should be
  // treated as PPC::MOF_NotAddNorCst so they can match D-Forms.
  bool IsNonP1034BitConst =
      ((PPC::MOF_RPlusSImm34 | PPC::MOF_AddrIsSImm32 | PPC::MOF_SubtargetP10) &
       FlagSet) == PPC::MOF_RPlusSImm34;
  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::OR &&
      IsNonP1034BitConst)
    FlagSet |= PPC::MOF_NotAddNorCst;

  return FlagSet;
}

/// SelectForceXFormMode - Given the specified address, force it to be
/// represented as an indexed [r+r] operation (an XForm instruction).
PPC::AddrMode PPCTargetLowering::SelectForceXFormMode(SDValue N, SDValue &Disp,
                                                      SDValue &Base,
                                                      SelectionDAG &DAG) const {
  PPC::AddrMode Mode = PPC::AM_XForm;
  int16_t ForceXFormImm = 0;
  if (provablyDisjointOr(DAG, N) &&
      !isIntS16Immediate(N.getOperand(1), ForceXFormImm)) {
    Disp = N.getOperand(0);
    Base = N.getOperand(1);
    return Mode;
  }

  // If the address is the result of an add, we will utilize the fact that the
  // address calculation includes an implicit add. However, we can reduce
  // register pressure if we do not materialize a constant just for use as the
  // index register. We only get rid of the add if it is not an add of a
  // value and a 16-bit signed constant and both have a single use.
  if (N.getOpcode() == ISD::ADD &&
      (!isIntS16Immediate(N.getOperand(1), ForceXFormImm) ||
       !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) {
    Disp = N.getOperand(0);
    Base = N.getOperand(1);
    return Mode;
  }

  // Otherwise, use R0 as the base register.
  Disp = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                         N.getValueType());
  Base = N;

  return Mode;
}

bool PPCTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  EVT ValVT = Val.getValueType();
  // If we are splitting a scalar integer into f64 parts (i.e. so they
  // can be placed into VFRC registers), we need to zero extend and
  // bitcast the values. This will ensure the value is placed into a
  // VSR using direct moves or stack operations as needed.
  if (PartVT == MVT::f64 &&
      (ValVT == MVT::i32 || ValVT == MVT::i16 || ValVT == MVT::i8)) {
    Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
    Val = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Val);
    Parts[0] = Val;
    return true;
  }
  return false;
}

SDValue PPCTargetLowering::lowerToLibCall(const char *LibCallName, SDValue Op,
                                          SelectionDAG &DAG) const {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  TargetLowering::CallLoweringInfo CLI(DAG);
  EVT RetVT = Op.getValueType();
  Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
  SDValue Callee =
      DAG.getExternalSymbol(LibCallName, TLI.getPointerTy(DAG.getDataLayout()));
  bool SignExtend = TLI.shouldSignExtendTypeInLibCall(RetVT, false);
  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
  for (const SDValue &N : Op->op_values()) {
    EVT ArgVT = N.getValueType();
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Node = N;
    Entry.Ty = ArgTy;
    Entry.IsSExt = TLI.shouldSignExtendTypeInLibCall(ArgVT, SignExtend);
    Entry.IsZExt = !Entry.IsSExt;
    Args.push_back(Entry);
  }

  SDValue InChain = DAG.getEntryNode();
  SDValue TCChain = InChain;
  const Function &F = DAG.getMachineFunction().getFunction();
  bool isTailCall =
      TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
      (RetTy == F.getReturnType() || F.getReturnType()->isVoidTy());
  if (isTailCall)
    InChain = TCChain;
  CLI.setDebugLoc(SDLoc(Op))
      .setChain(InChain)
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
      .setTailCall(isTailCall)
      .setSExtResult(SignExtend)
      .setZExtResult(!SignExtend)
      .setIsPostTypeLegalization(true);
  return TLI.LowerCallTo(CLI).first;
}

SDValue PPCTargetLowering::lowerLibCallBasedOnType(
    const char *LibCallFloatName, const char *LibCallDoubleName, SDValue Op,
    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::f32)
    return lowerToLibCall(LibCallFloatName, Op, DAG);

  if (Op.getValueType() == MVT::f64)
    return lowerToLibCall(LibCallDoubleName, Op, DAG);

  return SDValue();
}

bool PPCTargetLowering::isLowringToMASSFiniteSafe(SDValue Op) const {
  SDNodeFlags Flags = Op.getNode()->getFlags();
  return isLowringToMASSSafe(Op) && Flags.hasNoSignedZeros() &&
         Flags.hasNoNaNs() && Flags.hasNoInfs();
}

bool PPCTargetLowering::isLowringToMASSSafe(SDValue Op) const {
  return Op.getNode()->getFlags().hasApproximateFuncs();
}

bool PPCTargetLowering::isScalarMASSConversionEnabled() const {
  return getTargetMachine().Options.PPCGenScalarMASSEntries;
}

SDValue PPCTargetLowering::lowerLibCallBase(const char *LibCallDoubleName,
                                            const char *LibCallFloatName,
                                            const char *LibCallDoubleNameFinite,
                                            const char *LibCallFloatNameFinite,
                                            SDValue Op,
                                            SelectionDAG &DAG) const {
  if (!isScalarMASSConversionEnabled() || !isLowringToMASSSafe(Op))
    return SDValue();

  if (!isLowringToMASSFiniteSafe(Op))
    return lowerLibCallBasedOnType(LibCallFloatName, LibCallDoubleName, Op,
                                   DAG);

  return lowerLibCallBasedOnType(LibCallFloatNameFinite,
                                 LibCallDoubleNameFinite, Op, DAG);
}

SDValue PPCTargetLowering::lowerPow(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_pow", "__xl_powf", "__xl_pow_finite",
                          "__xl_powf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerSin(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_sin", "__xl_sinf", "__xl_sin_finite",
                          "__xl_sinf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerCos(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_cos", "__xl_cosf", "__xl_cos_finite",
                          "__xl_cosf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerLog(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_log", "__xl_logf", "__xl_log_finite",
                          "__xl_logf_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerLog10(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_log10", "__xl_log10f", "__xl_log10_finite",
                          "__xl_log10f_finite", Op, DAG);
}

SDValue PPCTargetLowering::lowerExp(SDValue Op, SelectionDAG &DAG) const {
  return lowerLibCallBase("__xl_exp", "__xl_expf", "__xl_exp_finite",
                          "__xl_expf_finite", Op, DAG);
}

// If we happen to match to an aligned D-Form, check if the Frame Index is
// adequately aligned. If it is not, reset the mode to match to X-Form.
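// For example, DQ-Form accesses (lxv/stxv) require a 16-byte-aligned
// displacement, so a frame object only aligned to 8 bytes must instead be
// matched with X-Form (lxvx/stxvx).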
static void setXFormForUnalignedFI(SDValue N, unsigned Flags,
                                   PPC::AddrMode &Mode) {
  if (!isa<FrameIndexSDNode>(N))
    return;
  if ((Mode == PPC::AM_DSForm && !(Flags & PPC::MOF_RPlusSImm16Mult4)) ||
      (Mode == PPC::AM_DQForm && !(Flags & PPC::MOF_RPlusSImm16Mult16)))
    Mode = PPC::AM_XForm;
}

/// SelectOptimalAddrMode - Based on a node N and its Parent (a MemSDNode),
/// compute the address flags of the node, get the optimal address mode based
/// on the flags, and set the Base and Disp based on the address mode.
PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
                                                       SDValue N, SDValue &Disp,
                                                       SDValue &Base,
                                                       SelectionDAG &DAG,
                                                       MaybeAlign Align) const {
  SDLoc DL(Parent);

  // Compute the address flags.
  unsigned Flags = computeMOFlags(Parent, N, DAG);

  // Get the optimal address mode based on the Flags.
  PPC::AddrMode Mode = getAddrModeForFlags(Flags);

  // If the address mode is DS-Form or DQ-Form, check if the FI is aligned.
  // Select an X-Form load if it is not.
  setXFormForUnalignedFI(N, Flags, Mode);

  // Set the mode to PC-Relative addressing mode if we have a valid PC-Rel node.
  if ((Mode == PPC::AM_XForm) && isPCRelNode(N)) {
    assert(Subtarget.isUsingPCRelativeCalls() &&
           "Must be using PC-Relative calls when a valid PC-Relative node is "
           "present!");
    Mode = PPC::AM_PCRel;
  }

  // Set Base and Disp accordingly depending on the address mode.
  switch (Mode) {
  case PPC::AM_DForm:
  case PPC::AM_DSForm:
  case PPC::AM_DQForm: {
    // This is a register plus a 16-bit immediate. The base will be the
    // register and the displacement will be the immediate unless it
    // isn't sufficiently aligned.
    if (Flags & PPC::MOF_RPlusSImm16) {
      SDValue Op0 = N.getOperand(0);
      SDValue Op1 = N.getOperand(1);
      int16_t Imm = cast<ConstantSDNode>(Op1)->getZExtValue();
      if (!Align || isAligned(*Align, Imm)) {
        Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
        Base = Op0;
        if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0)) {
          Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
          fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
        }
        break;
      }
    }
    // This is a register plus the @lo relocation. The base is the register
    // and the displacement is the global address.
    else if (Flags & PPC::MOF_RPlusLo) {
      Disp = N.getOperand(1).getOperand(0); // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      break;
    }
    // This is a constant address at most 32 bits. The base will be
    // zero or load-immediate-shifted and the displacement will be
    // the low 16 bits of the address.
    else if (Flags & PPC::MOF_AddrIsSImm32) {
      auto *CN = cast<ConstantSDNode>(N);
      EVT CNType = CN->getValueType(0);
      uint64_t CNImm = CN->getZExtValue();
      // If this address fits entirely in a 16-bit sext immediate field, codegen
      // this as "d, 0".
      int16_t Imm;
      if (isIntS16Immediate(CN, Imm) && (!Align || isAligned(*Align, Imm))) {
        Disp = DAG.getTargetConstant(Imm, DL, CNType);
        Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                               CNType);
        break;
      }
      // Handle 32-bit sext immediate with LIS + Addr mode.
      if ((CNType == MVT::i32 || isInt<32>(CNImm)) &&
          (!Align || isAligned(*Align, CNImm))) {
        int32_t Addr = (int32_t)CNImm;
        // Otherwise, break this down into LIS + Disp.
        Disp = DAG.getTargetConstant((int16_t)Addr, DL, MVT::i32);
        Base =
            DAG.getTargetConstant((Addr - (int16_t)Addr) >> 16, DL, MVT::i32);
        uint32_t LIS = CNType == MVT::i32 ? PPC::LIS : PPC::LIS8;
        Base = SDValue(DAG.getMachineNode(LIS, DL, CNType, Base), 0);
        break;
      }
    }
    // Otherwise, the PPC::MOF_NotAddNorCst flag is set. Load/Store is
    // non-foldable.
    Disp = DAG.getTargetConstant(0, DL, getPointerTy(DAG.getDataLayout()));
    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
    } else
      Base = N;
    break;
  }
  case PPC::AM_PrefixDForm: {
    int64_t Imm34 = 0;
    unsigned Opcode = N.getOpcode();
    if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
        (isIntS34Immediate(N.getOperand(1), Imm34))) {
      // N is an Add/OR Node, and its operand is a 34-bit signed immediate.
      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      else
        Base = N.getOperand(0);
    } else if (isIntS34Immediate(N, Imm34)) {
      // The address is a 34-bit signed immediate.
      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
      Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
    }
    break;
  }
  case PPC::AM_PCRel: {
    // When selecting PC-Relative instructions, "Base" is not utilized as
    // we select the address as [PC+imm].
    Disp = N;
    break;
  }
  default: { // By default, X-Form is always available to be selected.
    // When a frame index is not aligned, we also match by XForm.
    FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N);
    Base = FI ? N : N.getOperand(1);
    Disp = FI ? DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                                N.getValueType())
              : N.getOperand(0);
    break;
  }
  }
  return Mode;
}

CCAssignFn *PPCTargetLowering::ccAssignFnForCall(CallingConv::ID CC,
                                                 bool Return,
                                                 bool IsVarArg) const {
  switch (CC) {
  case CallingConv::Cold:
    return (Return ? RetCC_PPC_Cold : CC_PPC64_ELF);
  default:
    return CC_PPC64_ELF;
  }
}

bool PPCTargetLowering::shouldInlineQuadwordAtomics() const {
  return Subtarget.isPPC64() && Subtarget.hasQuadwordAtomics();
}

TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  if (shouldInlineQuadwordAtomics() && Size == 128)
    return AtomicExpansionKind::MaskedIntrinsic;

  switch (AI->getOperation()) {
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return AtomicExpansionKind::CmpXChg;
  default:
    return TargetLowering::shouldExpandAtomicRMWInIR(AI);
  }

  llvm_unreachable("unreachable atomicrmw operation");
}

TargetLowering::AtomicExpansionKind
PPCTargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
  unsigned Size = AI->getNewValOperand()->getType()->getPrimitiveSizeInBits();
  if (shouldInlineQuadwordAtomics() && Size == 128)
    return AtomicExpansionKind::MaskedIntrinsic;
  return TargetLowering::shouldExpandAtomicCmpXchgInIR(AI);
}

static Intrinsic::ID
getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
  switch (BinOp) {
  default:
    llvm_unreachable("Unexpected AtomicRMW BinOp");
  case AtomicRMWInst::Xchg:
    return Intrinsic::ppc_atomicrmw_xchg_i128;
  case AtomicRMWInst::Add:
    return Intrinsic::ppc_atomicrmw_add_i128;
  case AtomicRMWInst::Sub:
    return Intrinsic::ppc_atomicrmw_sub_i128;
  case AtomicRMWInst::And:
    return Intrinsic::ppc_atomicrmw_and_i128;
  case AtomicRMWInst::Or:
    return Intrinsic::ppc_atomicrmw_or_i128;
  case AtomicRMWInst::Xor:
    return Intrinsic::ppc_atomicrmw_xor_i128;
  case AtomicRMWInst::Nand:
    return Intrinsic::ppc_atomicrmw_nand_i128;
  }
}

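// The masked-intrinsic expansion below splits the 128-bit operand into two
// i64 halves, calls the quadword RMW intrinsic, and reassembles the returned
// { lo, hi } pair into an i128. Illustrative IR (sketch, for an atomicrmw
// add):
//   %lo   = trunc i128 %incr to i64
//   %hi   = trunc i128 (lshr i128 %incr, 64) to i64
//   %pair = call { i64, i64 } @llvm.ppc.atomicrmw.add.i128(ptr %addr,
//                                                          i64 %lo, i64 %hi)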
Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
    IRBuilderBase &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
    Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = Incr->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Function *RMW = Intrinsic::getDeclaration(
      M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
  Type *Int64Ty = Type::getInt64Ty(M->getContext());
  Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo");
  Value *IncrHi =
      Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi");
  Value *Addr =
      Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
  Value *LoHi = Builder.CreateCall(RMW, {Addr, IncrLo, IncrHi});
  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
  return Builder.CreateOr(
      Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}

Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
    IRBuilderBase &Builder, AtomicCmpXchgInst *CI, Value *AlignedAddr,
    Value *CmpVal, Value *NewVal, Value *Mask, AtomicOrdering Ord) const {
  assert(shouldInlineQuadwordAtomics() && "Only support quadword now");
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = CmpVal->getType();
  assert(ValTy->getPrimitiveSizeInBits() == 128);
  Function *IntCmpXchg =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128);
  Type *Int64Ty = Type::getInt64Ty(M->getContext());
  Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo");
  Value *CmpHi =
      Builder.CreateTrunc(Builder.CreateLShr(CmpVal, 64), Int64Ty, "cmp_hi");
  Value *NewLo = Builder.CreateTrunc(NewVal, Int64Ty, "new_lo");
  Value *NewHi =
      Builder.CreateTrunc(Builder.CreateLShr(NewVal, 64), Int64Ty, "new_hi");
  Value *Addr =
      Builder.CreateBitCast(AlignedAddr, Type::getInt8PtrTy(M->getContext()));
  emitLeadingFence(Builder, CI, Ord);
  Value *LoHi =
      Builder.CreateCall(IntCmpXchg, {Addr, CmpLo, CmpHi, NewLo, NewHi});
  emitTrailingFence(Builder, CI, Ord);
  Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
  Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
  Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
  Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
  return Builder.CreateOr(
      Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}