1 //===-- NVPTXTargetMachine.cpp - Define TargetMachine for NVPTX -----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // Top-level implementation for the NVPTX target.
12 //===----------------------------------------------------------------------===//
15 #include "NVPTXAllocaHoisting.h"
16 #include "NVPTXLowerAggrCopies.h"
17 #include "NVPTXTargetMachine.h"
18 #include "NVPTXTargetObjectFile.h"
19 #include "NVPTXTargetTransformInfo.h"
20 #include "llvm/ADT/STLExtras.h"
21 #include "llvm/ADT/Triple.h"
22 #include "llvm/Analysis/TargetTransformInfo.h"
23 #include "llvm/CodeGen/Passes.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/IR/LegacyPassManager.h"
26 #include "llvm/Pass.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/TargetRegistry.h"
29 #include "llvm/Target/TargetMachine.h"
30 #include "llvm/Target/TargetOptions.h"
31 #include "llvm/Transforms/Scalar.h"
32 #include "llvm/Transforms/Scalar/GVN.h"
33 #include "llvm/Transforms/Vectorize.h"
39 // LSV is still relatively new; this switch lets us turn it off in case we
40 // encounter (or suspect) a bug.
42 DisableLoadStoreVectorizer("disable-nvptx-load-store-vectorizer",
43 cl::desc("Disable load/store vectorizer"),
44 cl::init(false), cl::Hidden);
48 void initializeNVVMIntrRangePass(PassRegistry&);
49 void initializeNVVMReflectPass(PassRegistry&);
50 void initializeGenericToNVVMPass(PassRegistry&);
51 void initializeNVPTXAllocaHoistingPass(PassRegistry &);
52 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
53 void initializeNVPTXInferAddressSpacesPass(PassRegistry &);
54 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
55 void initializeNVPTXLowerArgsPass(PassRegistry &);
56 void initializeNVPTXLowerAllocaPass(PassRegistry &);
58 } // end namespace llvm
60 extern "C" void LLVMInitializeNVPTXTarget() {
61 // Register the target.
62 RegisterTargetMachine<NVPTXTargetMachine32> X(getTheNVPTXTarget32());
63 RegisterTargetMachine<NVPTXTargetMachine64> Y(getTheNVPTXTarget64());
65 // FIXME: This pass is really intended to be invoked during IR optimization,
66 // but it's very NVPTX-specific.
67 PassRegistry &PR = *PassRegistry::getPassRegistry();
68 initializeNVVMReflectPass(PR);
69 initializeNVVMIntrRangePass(PR);
70 initializeGenericToNVVMPass(PR);
71 initializeNVPTXAllocaHoistingPass(PR);
72 initializeNVPTXAssignValidGlobalNamesPass(PR);
73 initializeNVPTXInferAddressSpacesPass(PR);
74 initializeNVPTXLowerArgsPass(PR);
75 initializeNVPTXLowerAllocaPass(PR);
76 initializeNVPTXLowerAggrCopiesPass(PR);
// Build the NVPTX DataLayout string for the requested pointer width.
// Returns the string consumed by the LLVMTargetMachine constructor.
static std::string computeDataLayout(bool is64Bit) {
  std::string Ret = "e"; // NVPTX is little-endian.

  // 32-bit targets use 32-bit pointers; 64-bit pointers are the implicit
  // DataLayout default, so nothing is appended for the 64-bit case.
  if (!is64Bit)
    Ret += "-p:32:32";

  // i64 is naturally (64-bit) aligned; small vectors are element-aligned;
  // native integer widths are 16, 32 and 64 bits.
  Ret += "-i64:64-v16:16-v32:32-n16:32:64";

  return Ret;
}
90 NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
91 StringRef CPU, StringRef FS,
92 const TargetOptions &Options,
93 Optional<Reloc::Model> RM,
95 CodeGenOpt::Level OL, bool is64bit)
96 // The pic relocation model is used regardless of what the client has
97 // specified, as it is the only relocation model currently supported.
98 : LLVMTargetMachine(T, computeDataLayout(is64bit), TT, CPU, FS, Options,
101 TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
102 Subtarget(TT, CPU, FS, *this) {
103 if (TT.getOS() == Triple::NVCL)
104 drvInterface = NVPTX::NVCL;
106 drvInterface = NVPTX::CUDA;
// Out-of-line destructor definition. NOTE(review): presumably kept out of the
// header so the std::unique_ptr<NVPTXTargetObjectFile> member (TLOF) is
// destroyed where NVPTXTargetObjectFile is a complete type — confirm.
NVPTXTargetMachine::~NVPTXTargetMachine() = default;
112 void NVPTXTargetMachine32::anchor() {}
114 NVPTXTargetMachine32::NVPTXTargetMachine32(const Target &T, const Triple &TT,
115 StringRef CPU, StringRef FS,
116 const TargetOptions &Options,
117 Optional<Reloc::Model> RM,
119 CodeGenOpt::Level OL)
120 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
122 void NVPTXTargetMachine64::anchor() {}
124 NVPTXTargetMachine64::NVPTXTargetMachine64(const Target &T, const Triple &TT,
125 StringRef CPU, StringRef FS,
126 const TargetOptions &Options,
127 Optional<Reloc::Model> RM,
129 CodeGenOpt::Level OL)
130 : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
134 class NVPTXPassConfig : public TargetPassConfig {
136 NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
137 : TargetPassConfig(TM, PM) {}
139 NVPTXTargetMachine &getNVPTXTargetMachine() const {
140 return getTM<NVPTXTargetMachine>();
143 void addIRPasses() override;
144 bool addInstSelector() override;
145 void addPostRegAlloc() override;
146 void addMachineSSAOptimization() override;
148 FunctionPass *createTargetRegisterAllocator(bool) override;
149 void addFastRegAlloc(FunctionPass *RegAllocPass) override;
150 void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
153 // If the opt level is aggressive, add GVN; otherwise, add EarlyCSE. This
154 // function is only called in opt mode.
155 void addEarlyCSEOrGVNPass();
157 // Add passes that propagate special memory spaces.
158 void addAddressSpaceInferencePasses();
160 // Add passes that perform straight-line scalar optimizations.
161 void addStraightLineScalarOptimizationPasses();
164 } // end anonymous namespace
166 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
167 return new NVPTXPassConfig(this, PM);
170 void NVPTXTargetMachine::addEarlyAsPossiblePasses(PassManagerBase &PM) {
171 PM.add(createNVVMReflectPass());
172 PM.add(createNVVMIntrRangePass(Subtarget.getSmVersion()));
175 TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() {
176 return TargetIRAnalysis([this](const Function &F) {
177 return TargetTransformInfo(NVPTXTTIImpl(this, F));
181 void NVPTXPassConfig::addEarlyCSEOrGVNPass() {
182 if (getOptLevel() == CodeGenOpt::Aggressive)
183 addPass(createGVNPass());
185 addPass(createEarlyCSEPass());
188 void NVPTXPassConfig::addAddressSpaceInferencePasses() {
189 // NVPTXLowerArgs emits alloca for byval parameters which can often
190 // be eliminated by SROA.
191 addPass(createSROAPass());
192 addPass(createNVPTXLowerAllocaPass());
193 addPass(createNVPTXInferAddressSpacesPass());
196 void NVPTXPassConfig::addStraightLineScalarOptimizationPasses() {
197 addPass(createSeparateConstOffsetFromGEPPass());
198 addPass(createSpeculativeExecutionPass());
199 // ReassociateGEPs exposes more opportunites for SLSR. See
200 // the example in reassociate-geps-and-slsr.ll.
201 addPass(createStraightLineStrengthReducePass());
202 // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or
203 // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE
204 // for some of our benchmarks.
205 addEarlyCSEOrGVNPass();
206 // Run NaryReassociate after EarlyCSE/GVN to be more effective.
207 addPass(createNaryReassociatePass());
208 // NaryReassociate on GEPs creates redundant common expressions, so run
209 // EarlyCSE after it.
210 addPass(createEarlyCSEPass());
213 void NVPTXPassConfig::addIRPasses() {
214 // The following passes are known to not play well with virtual regs hanging
215 // around after register allocation (which in our case, is *all* registers).
216 // We explicitly disable them here. We do, however, need some functionality
217 // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
218 // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
219 disablePass(&PrologEpilogCodeInserterID);
220 disablePass(&MachineCopyPropagationID);
221 disablePass(&TailDuplicateID);
222 disablePass(&StackMapLivenessID);
223 disablePass(&LiveDebugValuesID);
224 disablePass(&PostRASchedulerID);
225 disablePass(&FuncletLayoutID);
226 disablePass(&PatchableFunctionID);
228 // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
229 // it here does nothing. But since we need it for correctness when lowering
230 // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
231 // call addEarlyAsPossiblePasses.
232 addPass(createNVVMReflectPass());
234 if (getOptLevel() != CodeGenOpt::None)
235 addPass(createNVPTXImageOptimizerPass());
236 addPass(createNVPTXAssignValidGlobalNamesPass());
237 addPass(createGenericToNVVMPass());
239 // NVPTXLowerArgs is required for correctness and should be run right
240 // before the address space inference passes.
241 addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
242 if (getOptLevel() != CodeGenOpt::None) {
243 addAddressSpaceInferencePasses();
244 if (!DisableLoadStoreVectorizer)
245 addPass(createLoadStoreVectorizerPass());
246 addStraightLineScalarOptimizationPasses();
249 // === LSR and other generic IR passes ===
250 TargetPassConfig::addIRPasses();
251 // EarlyCSE is not always strong enough to clean up what LSR produces. For
252 // example, GVN can combine
259 // %0 = shl nsw %a, 2
262 // but EarlyCSE can do neither of them.
263 if (getOptLevel() != CodeGenOpt::None)
264 addEarlyCSEOrGVNPass();
267 bool NVPTXPassConfig::addInstSelector() {
268 const NVPTXSubtarget &ST = *getTM<NVPTXTargetMachine>().getSubtargetImpl();
270 addPass(createLowerAggrCopies());
271 addPass(createAllocaHoisting());
272 addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
274 if (!ST.hasImageHandles())
275 addPass(createNVPTXReplaceImageHandlesPass());
280 void NVPTXPassConfig::addPostRegAlloc() {
281 addPass(createNVPTXPrologEpilogPass(), false);
282 if (getOptLevel() != CodeGenOpt::None) {
283 // NVPTXPrologEpilogPass calculates frame object offset and replace frame
284 // index with VRFrame register. NVPTXPeephole need to be run after that and
285 // will replace VRFrame with VRFrameLocal when possible.
286 addPass(createNVPTXPeephole());
290 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
291 return nullptr; // No reg alloc
294 void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
295 assert(!RegAllocPass && "NVPTX uses no regalloc!");
296 addPass(&PHIEliminationID);
297 addPass(&TwoAddressInstructionPassID);
300 void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
301 assert(!RegAllocPass && "NVPTX uses no regalloc!");
303 addPass(&ProcessImplicitDefsID);
304 addPass(&LiveVariablesID);
305 addPass(&MachineLoopInfoID);
306 addPass(&PHIEliminationID);
308 addPass(&TwoAddressInstructionPassID);
309 addPass(&RegisterCoalescerID);
311 // PreRA instruction scheduling.
312 if (addPass(&MachineSchedulerID))
313 printAndVerify("After Machine Scheduling");
316 addPass(&StackSlotColoringID);
318 // FIXME: Needs physical registers
319 //addPass(&PostRAMachineLICMID);
321 printAndVerify("After StackSlotColoring");
324 void NVPTXPassConfig::addMachineSSAOptimization() {
325 // Pre-ra tail duplication.
326 if (addPass(&EarlyTailDuplicateID))
327 printAndVerify("After Pre-RegAlloc TailDuplicate");
329 // Optimize PHIs before DCE: removing dead PHI cycles may make more
330 // instructions dead.
331 addPass(&OptimizePHIsID);
333 // This pass merges large allocas. StackSlotColoring is a different pass
334 // which merges spill slots.
335 addPass(&StackColoringID);
337 // If the target requests it, assign local variables to stack slots relative
338 // to one another and simplify frame index references where possible.
339 addPass(&LocalStackSlotAllocationID);
341 // With optimization, dead code should already be eliminated. However
342 // there is one known exception: lowered code for arguments that are only
343 // used by tail calls, where the tail calls reuse the incoming stack
344 // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
345 addPass(&DeadMachineInstructionElimID);
346 printAndVerify("After codegen DCE pass");
348 // Allow targets to insert passes that improve instruction level parallelism,
349 // like if-conversion. Such passes will typically need dominator trees and
350 // loop info, just like LICM and CSE below.
352 printAndVerify("After ILP optimizations");
354 addPass(&MachineLICMID);
355 addPass(&MachineCSEID);
357 addPass(&MachineSinkingID);
358 printAndVerify("After Machine LICM, CSE and Sinking passes");
360 addPass(&PeepholeOptimizerID);
361 printAndVerify("After codegen peephole optimization pass");