]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
Update lld to trunk r290819 and resolve conflicts.
[FreeBSD/FreeBSD.git] / contrib / llvm / tools / clang / lib / CodeGen / CGOpenMPRuntimeNVPTX.cpp
1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This provides a class for OpenMP runtime code generation specialized to NVPTX
11 // targets.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "CodeGenFunction.h"
18 #include "clang/AST/StmtOpenMP.h"
19
20 using namespace clang;
21 using namespace CodeGen;
22
23 /// \brief Get the GPU warp size.
24 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) {
25   CGBuilderTy &Bld = CGF.Builder;
26   return Bld.CreateCall(
27       llvm::Intrinsic::getDeclaration(
28           &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
29       llvm::None, "nvptx_warp_size");
30 }
31
32 /// \brief Get the id of the current thread on the GPU.
33 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) {
34   CGBuilderTy &Bld = CGF.Builder;
35   return Bld.CreateCall(
36       llvm::Intrinsic::getDeclaration(
37           &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
38       llvm::None, "nvptx_tid");
39 }
40
41 // \brief Get the maximum number of threads in a block of the GPU.
42 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) {
43   CGBuilderTy &Bld = CGF.Builder;
44   return Bld.CreateCall(
45       llvm::Intrinsic::getDeclaration(
46           &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
47       llvm::None, "nvptx_num_threads");
48 }
49
50 /// \brief Get barrier to synchronize all threads in a block.
51 void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) {
52   CGBuilderTy &Bld = CGF.Builder;
53   Bld.CreateCall(llvm::Intrinsic::getDeclaration(
54       &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
55 }
56
57 // \brief Synchronize all GPU threads in a block.
58 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
59   getNVPTXCTABarrier(CGF);
60 }
61
62 /// \brief Get the thread id of the OMP master thread.
63 /// The master thread id is the first thread (lane) of the last warp in the
64 /// GPU block.  Warp size is assumed to be some power of 2.
65 /// Thread id is 0 indexed.
66 /// E.g: If NumThreads is 33, master id is 32.
67 ///      If NumThreads is 64, master id is 32.
68 ///      If NumThreads is 1024, master id is 992.
69 llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) {
70   CGBuilderTy &Bld = CGF.Builder;
71   llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
72
73   // We assume that the warp size is a power of 2.
74   llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
75
76   return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
77                        Bld.CreateNot(Mask), "master_tid");
78 }
79
namespace {

/// Identifiers for the NVPTX-specific OpenMP runtime entry points that this
/// file knows how to declare (see createNVPTXRuntimeFunction).
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to
  /// void __kmpc_kernel_init(kmp_int32 omp_handle, kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init
};

/// NVPTX address space numbers used when creating globals.
enum ADDRESS_SPACE {
  /// Address space number of NVPTX shared memory.
  ADDRESS_SPACE_SHARED = 3
};

} // namespace
92
93 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
94     CodeGenModule &CGM)
95     : WorkerFn(nullptr), CGFI(nullptr) {
96   createWorkerFunction(CGM);
97 }
98
99 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
100     CodeGenModule &CGM) {
101   // Create an worker function with no arguments.
102   CGFI = &CGM.getTypes().arrangeNullaryFunction();
103
104   WorkerFn = llvm::Function::Create(
105       CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
106       /* placeholder */ "_worker", &CGM.getModule());
107   CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
108   WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage);
109   WorkerFn->addFnAttr(llvm::Attribute::NoInline);
110 }
111
112 void CGOpenMPRuntimeNVPTX::initializeEnvironment() {
113   //
114   // Initialize master-worker control state in shared memory.
115   //
116
117   auto DL = CGM.getDataLayout();
118   ActiveWorkers = new llvm::GlobalVariable(
119       CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false,
120       llvm::GlobalValue::CommonLinkage,
121       llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0,
122       llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
123   ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty));
124
125   WorkID = new llvm::GlobalVariable(
126       CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false,
127       llvm::GlobalValue::CommonLinkage,
128       llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0,
129       llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
130   WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty));
131 }
132
133 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
134   auto &Ctx = CGM.getContext();
135
136   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
137   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
138   emitWorkerLoop(CGF, WST);
139   CGF.FinishFunction();
140 }
141
142 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
143                                           WorkerFunctionState &WST) {
144   //
145   // The workers enter this loop and wait for parallel work from the master.
146   // When the master encounters a parallel region it sets up the work + variable
147   // arguments, and wakes up the workers.  The workers first check to see if
148   // they are required for the parallel region, i.e., within the # of requested
149   // parallel threads.  The activated workers load the variable arguments and
150   // execute the parallel work.
151   //
152
153   CGBuilderTy &Bld = CGF.Builder;
154
155   llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
156   llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
157   llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
158   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
159   llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
160   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
161
162   CGF.EmitBranch(AwaitBB);
163
164   // Workers wait for work from master.
165   CGF.EmitBlock(AwaitBB);
166   // Wait for parallel work
167   syncCTAThreads(CGF);
168   // On termination condition (workid == 0), exit loop.
169   llvm::Value *ShouldTerminate = Bld.CreateICmpEQ(
170       Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()),
171       llvm::Constant::getNullValue(WorkID->getType()->getElementType()),
172       "should_terminate");
173   Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
174
175   // Activate requested workers.
176   CGF.EmitBlock(SelectWorkersBB);
177   llvm::Value *ThreadID = getNVPTXThreadID(CGF);
178   llvm::Value *ActiveThread = Bld.CreateICmpSLT(
179       ThreadID,
180       Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()),
181       "active_thread");
182   Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB);
183
184   // Signal start of parallel region.
185   CGF.EmitBlock(ExecuteBB);
186   // TODO: Add parallel work.
187
188   // Signal end of parallel region.
189   CGF.EmitBlock(TerminateBB);
190   CGF.EmitBranch(BarrierBB);
191
192   // All active and inactive workers wait at a barrier after parallel region.
193   CGF.EmitBlock(BarrierBB);
194   // Barrier after parallel region.
195   syncCTAThreads(CGF);
196   CGF.EmitBranch(AwaitBB);
197
198   // Exit target region.
199   CGF.EmitBlock(ExitBB);
200 }
201
202 // Setup NVPTX threads for master-worker OpenMP scheme.
203 void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF,
204                                            EntryFunctionState &EST,
205                                            WorkerFunctionState &WST) {
206   CGBuilderTy &Bld = CGF.Builder;
207
208   // Get the master thread id.
209   llvm::Value *MasterID = getMasterThreadID(CGF);
210   // Current thread's identifier.
211   llvm::Value *ThreadID = getNVPTXThreadID(CGF);
212
213   // Setup BBs in entry function.
214   llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker");
215   llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
216   llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
217   EST.ExitBB = CGF.createBasicBlock(".exit");
218
219   // The head (master thread) marches on while its body of companion threads in
220   // the warp go to sleep.
221   llvm::Value *ShouldDie =
222       Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp");
223   Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB);
224
225   // Select worker threads...
226   CGF.EmitBlock(WorkerCheckBB);
227   llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker");
228   Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB);
229
230   // ... and send to worker loop, awaiting parallel invocation.
231   CGF.EmitBlock(WorkerBB);
232   CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
233   CGF.EmitBranch(EST.ExitBB);
234
235   // Only master thread executes subsequent serial code.
236   CGF.EmitBlock(MasterBB);
237
238   // First action in sequential region:
239   // Initialize the state of the OpenMP runtime library on the GPU.
240   llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)};
241   CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init),
242                       Args);
243 }
244
245 void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF,
246                                            EntryFunctionState &EST) {
247   if (!EST.ExitBB)
248     EST.ExitBB = CGF.createBasicBlock(".exit");
249
250   CGBuilderTy &Bld = CGF.Builder;
251   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
252   CGF.EmitBranch(TerminateBB);
253
254   CGF.EmitBlock(TerminateBB);
255   // Signal termination condition.
256   Bld.CreateAlignedStore(
257       llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID,
258       WorkID->getAlignment());
259   // Barrier to terminate worker threads.
260   syncCTAThreads(CGF);
261   // Master thread jumps to exit point.
262   CGF.EmitBranch(EST.ExitBB);
263
264   CGF.EmitBlock(EST.ExitBB);
265   EST.ExitBB = nullptr;
266 }
267
268 /// \brief Returns specified OpenMP runtime function for the current OpenMP
269 /// implementation.  Specialized for the NVPTX device.
270 /// \param Function OpenMP runtime function.
271 /// \return Specified function.
272 llvm::Constant *
273 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
274   llvm::Constant *RTLFn = nullptr;
275   switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
276   case OMPRTL_NVPTX__kmpc_kernel_init: {
277     // Build void __kmpc_kernel_init(kmp_int32 omp_handle,
278     // kmp_int32 thread_limit);
279     llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty};
280     llvm::FunctionType *FnTy =
281         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
282     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
283     break;
284   }
285   }
286   return RTLFn;
287 }
288
289 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
290                                               llvm::Constant *Addr,
291                                               uint64_t Size) {
292   auto *F = dyn_cast<llvm::Function>(Addr);
293   // TODO: Add support for global variables on the device after declare target
294   // support.
295   if (!F)
296     return;
297   llvm::Module *M = F->getParent();
298   llvm::LLVMContext &Ctx = M->getContext();
299
300   // Get "nvvm.annotations" metadata node
301   llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
302
303   llvm::Metadata *MDVals[] = {
304       llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
305       llvm::ConstantAsMetadata::get(
306           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
307   // Append metadata to nvvm.annotations
308   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
309 }
310
311 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
312     const OMPExecutableDirective &D, StringRef ParentName,
313     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
314     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
315   if (!IsOffloadEntry) // Nothing to do.
316     return;
317
318   assert(!ParentName.empty() && "Invalid target region parent name!");
319
320   EntryFunctionState EST;
321   WorkerFunctionState WST(CGM);
322
323   // Emit target region as a standalone region.
324   class NVPTXPrePostActionTy : public PrePostActionTy {
325     CGOpenMPRuntimeNVPTX &RT;
326     CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
327     CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
328
329   public:
330     NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
331                          CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
332                          CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
333         : RT(RT), EST(EST), WST(WST) {}
334     void Enter(CodeGenFunction &CGF) override {
335       RT.emitEntryHeader(CGF, EST, WST);
336     }
337     void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); }
338   } Action(*this, EST, WST);
339   CodeGen.setAction(Action);
340   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
341                                    IsOffloadEntry, CodeGen);
342
343   // Create the worker function
344   emitWorkerFunction(WST);
345
346   // Now change the name of the worker function to correspond to this target
347   // region's entry function.
348   WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
349 }
350
351 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
352     : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) {
353   if (!CGM.getLangOpts().OpenMPIsDevice)
354     llvm_unreachable("OpenMP NVPTX can only handle device code.");
355
356   // Called once per module during initialization.
357   initializeEnvironment();
358 }
359
360 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
361                                               const Expr *NumTeams,
362                                               const Expr *ThreadLimit,
363                                               SourceLocation Loc) {}
364
365 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
366     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
367     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
368
369   llvm::Function *OutlinedFun = nullptr;
370   if (isa<OMPTeamsDirective>(D)) {
371     llvm::Value *OutlinedFunVal =
372         CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
373             D, ThreadIDVar, InnermostKind, CodeGen);
374     OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
375     OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
376     OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
377   } else
378     llvm_unreachable("parallel directive is not yet supported for nvptx "
379                      "backend.");
380
381   return OutlinedFun;
382 }
383
384 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
385                                          const OMPExecutableDirective &D,
386                                          SourceLocation Loc,
387                                          llvm::Value *OutlinedFn,
388                                          ArrayRef<llvm::Value *> CapturedVars) {
389   if (!CGF.HaveInsertPoint())
390     return;
391
392   Address ZeroAddr =
393       CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
394                            /*Name*/ ".zero.addr");
395   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
396   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
397   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
398   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
399   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
400   CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
401 }