1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the ARMSelectionDAGInfo class.
11 //===----------------------------------------------------------------------===//
13 #include "ARMTargetMachine.h"
14 #include "ARMTargetTransformInfo.h"
15 #include "llvm/CodeGen/SelectionDAG.h"
16 #include "llvm/IR/DerivedTypes.h"
17 #include "llvm/Support/CommandLine.h"
// Tag naming this file; presumably consumed by LLVM's debug-output macros
// (no use is visible in this file) -- TODO confirm.
20 #define DEBUG_TYPE "arm-selectiondag-info"
// Hidden command-line option "arm-memtransfer-tploop". It selects whether
// memory transfers may be lowered to tail-predicated (WLSTP) loops and accepts
// the values "force-disabled", "force-enabled" and "allow"; the default is
// TPLoop::ForceDisabled. The option is read by shouldGenerateInlineTPLoop().
// NOTE(review): the help strings mention only memcpy, but the memset lowering
// in this file consults the same option through shouldGenerateInlineTPLoop().
22 cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
23 "arm-memtransfer-tploop", cl::Hidden,
24 cl::desc("Control conversion of memcpy to "
25 "Tail predicated loops (WLSTP)"),
26 cl::init(TPLoop::ForceDisabled),
27 cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
28 "Don't convert memcpy to TP loop."),
29 clEnumValN(TPLoop::ForceEnabled, "force-enabled",
30 "Always convert memcpy to TP loop."),
31 clEnumValN(TPLoop::Allow, "allow",
32 "Allow (may be subject to certain conditions) "
33 "conversion of memcpy to TP loop.")));
35 // Emit, if possible, a specialized version of the given Libcall. Typically this
36 // means selecting the appropriately aligned version, but we also convert memset
// to memclr when the value being stored is a constant zero.
//
// Src is the source operand of the operation; for memset it is the value to
// store. Align is the known alignment in bytes and selects which
// alignment-suffixed entry point ("", "4" or "8") is called. LC is the generic
// libcall being lowered; the specialization is only considered when its
// default name starts with "__aeabi".
//
// Returns the second element of the pair produced by LowerCallTo (presumably
// the output chain of the emitted call -- TODO confirm).
38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
39 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
40 SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
41 const ARMSubtarget &Subtarget =
42 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
43 const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
45 // Only use a specialized AEABI function if the default version of this
46 // Libcall is an AEABI function.
// The test compares just the first 7 characters of the configured libcall
// name against the "__aeabi" prefix.
47 if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
50 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
51 // able to translate memset to memclr and use the value to index the function
// name table (FunctionNames, defined further down in this function).
61 AEABILibcall = AEABI_MEMCPY;
64 AEABILibcall = AEABI_MEMMOVE;
67 AEABILibcall = AEABI_MEMSET;
// A memset whose stored value is the constant 0 is redirected to memclr.
68 if (ConstantSDNode *ConstantSrc = dyn_cast<ConstantSDNode>(Src))
69 if (ConstantSrc->getZExtValue() == 0)
70 AEABILibcall = AEABI_MEMCLR;
76 // Choose the most-aligned libcall variant that we can
83 AlignVariant = ALIGN8;
// An alignment that is a multiple of 4 gets the 4-byte-aligned variant.
84 else if ((Align & 3) == 0)
85 AlignVariant = ALIGN4;
87 AlignVariant = ALIGN1;
// Build the argument list. Arguments start out typed as the target's
// pointer-sized integer; the memset value is passed as an i32 instead.
89 TargetLowering::ArgListTy Args;
90 TargetLowering::ArgListEntry Entry;
91 Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
93 Args.push_back(Entry);
94 if (AEABILibcall == AEABI_MEMCLR) {
96 Args.push_back(Entry);
97 } else if (AEABILibcall == AEABI_MEMSET) {
98 // Adjust parameters for memset, EABI uses format (ptr, size, value),
99 // GNU library uses (ptr, value, size)
100 // See RTABI section 4.3.4
102 Args.push_back(Entry);
104 // Extend or truncate the argument to be an i32 value for the call.
105 if (Src.getValueType().bitsGT(MVT::i32))
106 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
107 else if (Src.getValueType().bitsLT(MVT::i32))
108 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
111 Entry.Ty = Type::getInt32Ty(*DAG.getContext());
112 Entry.IsSExt = false;
113 Args.push_back(Entry);
116 Args.push_back(Entry);
119 Args.push_back(Entry);
// Name table indexed as FunctionNames[AEABILibcall][AlignVariant]: one row per
// operation (memcpy, memmove, memset, memclr) and one column per alignment
// variant (no suffix, "4", "8").
122 char const *FunctionNames[4][3] = {
123 { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" },
124 { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" },
125 { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" },
126 { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" }
// Describe the call: it uses the calling convention registered for LC, has a
// void result, and targets the selected name as an external symbol.
128 TargetLowering::CallLoweringInfo CLI(DAG);
132 TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
133 DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
134 TLI->getPointerTy(DAG.getDataLayout())),
137 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
139 return CallResult.second;
// Decide whether a memcpy/memset should be lowered to an inline
// tail-predicated (TP) loop. The decision consults, in order, the
// "arm-memtransfer-tploop" option (EnableMemtransferTPLoop), the
// optnone/optsize attributes of the function being compiled and, for memcpy,
// the size and alignment of the transfer.
//   Subtarget    - supplies the inline-size thresholds compared against below.
//   DAG          - used in the visible code only to reach the current function.
//   ConstantSize - the transfer size when it is a constant node, otherwise null.
//   Alignment    - known alignment of the transfer.
//   IsMemcpy     - the memcpy lowering in this file passes true; presumably
//                  false is passed for memset -- TODO confirm.
142 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
143 const SelectionDAG &DAG,
144 ConstantSDNode *ConstantSize,
145 Align Alignment, bool IsMemcpy) {
146 auto &F = DAG.getMachineFunction().getFunction();
// True when the option value converts to zero; presumably that is
// TPLoop::ForceDisabled -- confirm against the enum definition.
147 if (!EnableMemtransferTPLoop)
149 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
151 // Do not generate inline TP loop if optimizations are disabled,
152 // or if optimization for size (-Os or -Oz) is on.
153 if (F.hasOptNone() || F.hasOptSize())
155 // If cli option is unset, for memset always generate inline TP.
156 // For memcpy, check some conditions
// Condition 1: the size is not a constant and the alignment is at least 4.
159 if (!ConstantSize && Alignment >= Align(4))
// Condition 2: the constant size lies strictly between the generic inline
// threshold and the memcpy TP-loop inline threshold of the subtarget.
162 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
163 ConstantSize->getZExtValue() <
164 Subtarget.getMaxMemcpyTPInlineSizeThreshold())
// Target-specific lowering of memcpy. The visible code distinguishes:
//  * MVE integer ops available and shouldGenerateInlineTPLoop() agrees: emit a
//    single ARMISD::MEMCPYLOOP node.
//  * Size not a constant, or (unless AlwaysInline) larger than the subtarget's
//    inline threshold: call the alignment-specialized AEABI libcall.
//  * Otherwise: emit ARMISD::MEMCPY pseudo nodes that each copy a number of
//    4-byte words, followed by individual loads and stores for the trailing
//    1-3 bytes, and return a TokenFactor of the store chains.
// Alignment below 4 bytes is tested before the word-copy path is considered.
// NOTE(review): isVolatile is not referenced in the visible lines.
169 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
170 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
171 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
172 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
173 const ARMSubtarget &Subtarget =
174 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Null when the size operand is not a constant node.
175 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
// Tail-predicated loop path; the size operand is zero-extended or truncated
// to i32 for the MEMCPYLOOP node.
177 if (Subtarget.hasMVEIntegerOps() &&
178 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
179 return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
180 DAG.getZExtOrTrunc(Size, dl, MVT::i32));
182 // Do repeated 4-byte loads and stores. To be improved.
183 // This requires 4-byte alignment.
184 if (Alignment < Align(4))
186 // This requires the copy size to be a constant, preferably
187 // within a subtarget-specific limit.
189 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
190 Alignment.value(), RTLIB::MEMCPY);
191 uint64_t SizeVal = ConstantSize->getZExtValue();
192 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
193 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
194 Alignment.value(), RTLIB::MEMCPY);
// Split the size into whole 4-byte words (NumMemOps) and the 0-3 bytes that
// remain after them (BytesLeft).
196 unsigned BytesLeft = SizeVal & 3;
197 unsigned NumMemOps = SizeVal >> 2;
198 unsigned EmittedNumMemOps = 0;
202 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
203 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
206 uint64_t SrcOff = 0, DstOff = 0;
208 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
209 // VLDM/VSTM and make this code emit it when appropriate. This would reduce
210 // pressure on the general purpose registers. However this seems harder to map
211 // onto the register allocator's view of the world.
213 // The number of MEMCPY pseudo-instructions to emit. We use up to
214 // MaxLoadsInLDM registers per MEMCPY, which will get lowered into ldm/stm
215 // later on. This is a lower bound on the number of MEMCPY operations we must
// emit. It is computed as NumMemOps / MaxLoadsInLDM rounded up.
217 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
219 // Code size optimisation: do not inline memcpy if expansion results in
220 // more instructions than the library call.
221 if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
// Result types of each MEMCPY node. Below, result 0 becomes the new Dst,
// result 1 the new Src and result 2 the new Chain.
225 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
227 for (unsigned I = 0; I != NumMEMCPYs; ++I) {
228 // Evenly distribute registers among MEMCPY operations to reduce register
// pressure. NextEmittedNumMemOps is the cumulative word count after this
// node, so the difference to EmittedNumMemOps is this node's register count.
230 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
231 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
233 Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
234 DAG.getConstant(NumRegs, dl, MVT::i32));
235 Src = Dst.getValue(1);
236 Chain = Dst.getValue(2);
// Advance both pointer infos by NumRegs * VTSize bytes.
238 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
239 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
241 EmittedNumMemOps = NextEmittedNumMemOps;
247 // Issue loads / stores for the trailing (1 - 3) bytes.
// Both helpers pick a halfword (i16 / 2 bytes) while at least two bytes
// remain and a single byte (i8 / 1 byte) otherwise.
248 auto getRemainingValueType = [](unsigned BytesLeft) {
249 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
251 auto getRemainingSize = [](unsigned BytesLeft) {
252 return (BytesLeft >= 2) ? 2 : 1;
// Remember the byte count so that it can be restored before the stores are
// emitted (see the assignment from BytesLeftSave below).
255 unsigned BytesLeftSave = BytesLeft;
258 VT = getRemainingValueType(BytesLeft);
259 VTSize = getRemainingSize(BytesLeft);
260 Loads[i] = DAG.getLoad(VT, dl, Chain,
261 DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
262 DAG.getConstant(SrcOff, dl, MVT::i32)),
263 SrcPtrInfo.getWithOffset(SrcOff));
264 TFOps[i] = Loads[i].getValue(1);
// Merge the chains of the i loads into one chain that the stores build on.
269 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
270 makeArrayRef(TFOps, i));
273 BytesLeft = BytesLeftSave;
275 VT = getRemainingValueType(BytesLeft);
276 VTSize = getRemainingSize(BytesLeft);
277 TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
278 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
279 DAG.getConstant(DstOff, dl, MVT::i32)),
280 DstPtrInfo.getWithOffset(DstOff));
// The result is a TokenFactor over the chains of the trailing stores.
285 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
286 makeArrayRef(TFOps, i));
// Target-specific lowering of memmove: forwards to EmitSpecializedLibcall()
// with RTLIB::MEMMOVE, passing the alignment as a plain byte count, and
// returns whatever that call returns.
// NOTE(review): isVolatile, DstPtrInfo and SrcPtrInfo are not referenced in
// the visible lines.
289 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
290 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
291 SDValue Size, Align Alignment, bool isVolatile,
292 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
293 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
294 Alignment.value(), RTLIB::MEMMOVE);
// Target-specific lowering of memset. When the subtarget has MVE integer ops
// and shouldGenerateInlineTPLoop() agrees, an ARMISD::MEMSETLOOP node is
// emitted; otherwise the alignment-specialized AEABI libcall is used via
// EmitSpecializedLibcall() with RTLIB::MEMSET.
// Src is the value to store and Size the length of the region.
// NOTE(review): isVolatile and DstPtrInfo are not referenced in the visible
// lines.
297 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
298 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
299 SDValue Size, Align Alignment, bool isVolatile,
300 MachinePointerInfo DstPtrInfo) const {
302 const ARMSubtarget &Subtarget =
303 DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Null when the size operand is not a constant node.
305 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
307 // Generate TP loop for llvm.memset
308 if (Subtarget.hasMVEIntegerOps() &&
309 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
// The stored value is truncated to i8 and splatted across a v16i8 vector; the
// size operand is zero-extended or truncated to i32.
311 Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
312 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
313 return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
314 DAG.getZExtOrTrunc(Size, dl, MVT::i32));
317 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
318 Alignment.value(), RTLIB::MEMSET);