1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This provides a class for OpenMP runtime code generation specialized to NVPTX
13 //===----------------------------------------------------------------------===//
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "CodeGenFunction.h"
18 #include "clang/AST/StmtOpenMP.h"
20 using namespace clang;
21 using namespace CodeGen;
23 /// \brief Get the GPU warp size.
/// Emits a call to the llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize intrinsic
/// through \p CGF's builder.
/// \param CGF Code generation state whose builder receives the call.
/// \return The value produced by the call, named "nvptx_warp_size".
24 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) {
25 CGBuilderTy &Bld = CGF.Builder;
26 return Bld.CreateCall(
// The intrinsic is declared in (or fetched from) the module owned by CGM.
27 llvm::Intrinsic::getDeclaration(
28 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
// llvm::None: the call is emitted without arguments.
29 llvm::None, "nvptx_warp_size");
32 /// \brief Get the id of the current thread on the GPU.
/// Emits a call to the llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x intrinsic
/// through \p CGF's builder; only the x component is read.
/// \param CGF Code generation state whose builder receives the call.
/// \return The value produced by the call, named "nvptx_tid".
33 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) {
34 CGBuilderTy &Bld = CGF.Builder;
35 return Bld.CreateCall(
// The intrinsic is declared in (or fetched from) the module owned by CGM.
36 llvm::Intrinsic::getDeclaration(
37 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
// llvm::None: the call is emitted without arguments.
38 llvm::None, "nvptx_tid");
41 /// \brief Get the maximum number of threads in a block of the GPU.
/// Emits a call to the llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x intrinsic
/// through \p CGF's builder; only the x component is read.
/// \param CGF Code generation state whose builder receives the call.
/// \return The value produced by the call, named "nvptx_num_threads".
42 llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) {
43 CGBuilderTy &Bld = CGF.Builder;
44 return Bld.CreateCall(
// The intrinsic is declared in (or fetched from) the module owned by CGM.
45 llvm::Intrinsic::getDeclaration(
46 &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
// llvm::None: the call is emitted without arguments.
47 llvm::None, "nvptx_num_threads");
50 /// \brief Get barrier to synchronize all threads in a block.
/// Emits a call to the llvm::Intrinsic::nvvm_barrier0 intrinsic through
/// \p CGF's builder. Despite the "get" prefix, the function returns nothing;
/// its only effect is the emitted call.
/// \param CGF Code generation state whose builder receives the call.
51 void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) {
52 CGBuilderTy &Bld = CGF.Builder;
53 Bld.CreateCall(llvm::Intrinsic::getDeclaration(
54 &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
57 /// \brief Synchronize all GPU threads in a block.
/// Thin wrapper that forwards to getNVPTXCTABarrier, which emits the barrier
/// intrinsic call.
/// \param CGF Code generation state passed on to getNVPTXCTABarrier.
58 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
59 getNVPTXCTABarrier(CGF);
62 /// \brief Get the thread id of the OMP master thread.
63 /// The master thread id is the first thread (lane) of the last warp in the
64 /// GPU block. Warp size is assumed to be some power of 2.
65 /// Thread id is 0 indexed.
66 /// E.g: If NumThreads is 33, master id is 32.
67 /// If NumThreads is 64, master id is 32.
68 /// If NumThreads is 1024, master id is 992.
/// (The three examples above correspond to a warp size of 32.)
/// \param CGF Code generation state whose builder receives the instructions.
/// \return The value of (NumThreads - 1) & ~(WarpSize - 1), named
/// "master_tid".
69 llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) {
70 CGBuilderTy &Bld = CGF.Builder;
71 llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
73 // We assume that the warp size is a power of 2.
// Mask = WarpSize - 1, i.e. the bits selecting the lane within a warp when
// the warp size is a power of 2.
74 llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
// Clearing the lane bits of the last thread id (NumThreads - 1) rounds it down
// to a multiple of the warp size: the first lane of the last warp.
76 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
77 Bld.CreateNot(Mask), "master_tid");
// Identifiers of the NVPTX-specific runtime library functions. The values are
// passed (as unsigned) to createNVPTXRuntimeFunction, which casts them back to
// this enum and switches on them.
81 enum OpenMPRTLFunctionNVPTX {
82 /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle,
83 /// kmp_int32 thread_limit);
84 OMPRTL_NVPTX__kmpc_kernel_init,
87 // NVPTX Address space
// ADDRESS_SPACE_SHARED is the address-space argument used when creating the
// ActiveWorkers and WorkID globals in initializeEnvironment.
// NOTE(review): the declaration introducing this enumerator is not present in
// this listing.
89 ADDRESS_SPACE_SHARED = 3,
/// \brief Construct the per-target-region worker function state.
/// WorkerFn and CGFI start out null and are then filled in by
/// createWorkerFunction.
// NOTE(review): the parameter list is not present in this listing; the body
// uses a name CGM, which createWorkerFunction takes as CodeGenModule &.
93 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
95 : WorkerFn(nullptr), CGFI(nullptr) {
96 createWorkerFunction(CGM);
/// \brief Create the (still empty) worker function in the module of \p CGM.
/// Stores the nullary function info in CGFI and the new internal, noinline
/// llvm::Function in WorkerFn.
/// \param CGM Module-level code generation state used to build the function
/// type and to own the new function.
99 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
100 CodeGenModule &CGM) {
101 // Create a worker function with no arguments.
102 CGFI = &CGM.getTypes().arrangeNullaryFunction();
// "_worker" is only a temporary name; emitTargetOutlinedFunction renames the
// function to "<outlined function name>_worker" afterwards.
104 WorkerFn = llvm::Function::Create(
105 CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
106 /* placeholder */ "_worker", &CGM.getModule());
107 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
// NOTE(review): the function was already created with InternalLinkage above;
// this call looks redundant unless SetInternalFunctionAttributes changes the
// linkage -- confirm.
108 WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage);
// Keep the worker loop as a separate function rather than letting it be
// inlined into its caller.
109 WorkerFn->addFnAttr(llvm::Attribute::NoInline);
/// \brief Create the module-level globals used for master-worker signalling.
/// Two zero-initialized, non-constant, common-linkage globals are created in
/// address space ADDRESS_SPACE_SHARED:
///   - ActiveWorkers: 32-bit integer named "__omp_num_threads";
///   - WorkID: 64-bit integer named "__tgt_work_id".
/// Each is given the data layout's preferred alignment for its type.
/// Called from the CGOpenMPRuntimeNVPTX constructor.
112 void CGOpenMPRuntimeNVPTX::initializeEnvironment() {
114 // Initialize master-worker control state in shared memory.
// NOTE(review): plain `auto` deduces a value type here, so if getDataLayout()
// returns a reference this makes a copy -- confirm whether that is intended.
117 auto DL = CGM.getDataLayout();
// Compared against the thread id in emitWorkerLoop to pick the workers that
// take part in a parallel region.
118 ActiveWorkers = new llvm::GlobalVariable(
119 CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false,
120 llvm::GlobalValue::CommonLinkage,
121 llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0,
122 llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
123 ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty));
// A zero WorkID is the termination signal: emitWorkerLoop exits on it and
// emitEntryFooter stores it.
125 WorkID = new llvm::GlobalVariable(
126 CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false,
127 llvm::GlobalValue::CommonLinkage,
128 llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0,
129 llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED);
130 WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty));
/// \brief Emit the body of the worker function held in \p WST.
/// Starts code generation for WST.WorkerFn (void return, no declaration, empty
/// argument list), emits the worker loop into it and finishes the function.
/// \param WST Worker function state providing WorkerFn and its CGFI.
133 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
134 auto &Ctx = CGM.getContext();
// A fresh CodeGenFunction is used so the worker body is generated
// independently of whichever function emitted the target region.
136 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
137 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
138 emitWorkerLoop(CGF, WST);
139 CGF.FinishFunction();
/// \brief Emit the loop executed by worker threads inside the worker function.
///
/// Branches emitted explicitly between the blocks created here:
///   .await.work         -> .exit if the loaded WorkID equals zero, otherwise
///                          .select.workers
///   .select.workers     -> .execute.parallel or .barrier.parallel
///   .terminate.parallel -> .barrier.parallel
///   .barrier.parallel   -> .await.work
///
/// \param CGF Code generation state of the worker function.
/// \param WST Worker function state; not referenced by the lines visible in
/// this listing.
142 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
143 WorkerFunctionState &WST) {
145 // The workers enter this loop and wait for parallel work from the master.
146 // When the master encounters a parallel region it sets up the work + variable
147 // arguments, and wakes up the workers. The workers first check to see if
148 // they are required for the parallel region, i.e., within the # of requested
149 // parallel threads. The activated workers load the variable arguments and
150 // execute the parallel work.
153 CGBuilderTy &Bld = CGF.Builder;
// Create all blocks up front; they are placed into the function one by one
// with EmitBlock below.
155 llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
156 llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
157 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
158 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
159 llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
160 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
162 CGF.EmitBranch(AwaitBB);
164 // Workers wait for work from master.
165 CGF.EmitBlock(AwaitBB);
166 // Wait for parallel work
// NOTE(review): the statement implementing this wait is not present in this
// listing; presumably a syncCTAThreads(CGF) call -- confirm.
168 // On termination condition (workid == 0), exit loop.
// The loaded WorkID is compared for equality against a null value of the
// global's element type.
169 llvm::Value *ShouldTerminate = Bld.CreateICmpEQ(
170 Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()),
171 llvm::Constant::getNullValue(WorkID->getType()->getElementType()),
// NOTE(review): the closing line of the call above is not present in this
// listing.
173 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
175 // Activate requested workers.
176 CGF.EmitBlock(SelectWorkersBB);
177 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
// Signed less-than comparison whose visible operand is the loaded
// ActiveWorkers value.
// NOTE(review): the other operand (presumably ThreadID) and the closing line
// of the call are not present in this listing.
178 llvm::Value *ActiveThread = Bld.CreateICmpSLT(
180 Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()),
// Threads that are not selected skip the work and go straight to the barrier
// block.
182 Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB);
184 // Signal start of parallel region.
185 CGF.EmitBlock(ExecuteBB);
186 // TODO: Add parallel work.
188 // Signal end of parallel region.
// No explicit branch is emitted out of ExecuteBB before TerminateBB is
// placed; presumably EmitBlock links the two blocks -- confirm.
189 CGF.EmitBlock(TerminateBB);
190 CGF.EmitBranch(BarrierBB);
192 // All active and inactive workers wait at a barrier after parallel region.
193 CGF.EmitBlock(BarrierBB);
194 // Barrier after parallel region.
// NOTE(review): the barrier statement itself is not present in this listing.
// Loop back and wait for the next piece of work.
196 CGF.EmitBranch(AwaitBB);
198 // Exit target region.
199 CGF.EmitBlock(ExitBB);
202 // Setup NVPTX threads for master-worker OpenMP scheme.
/// Emits the prologue of a target entry function. Threads are split by
/// comparing their id (unsigned) with the master thread id:
///   - id >  master id: branch to EST.ExitBB;
///   - id <  master id: call WST.WorkerFn, then branch to EST.ExitBB;
///   - otherwise (id == master id): continue in the ".master" block, where the
///     call to __kmpc_kernel_init is emitted.
/// \param CGF Code generation state of the entry function.
/// \param EST Entry function state; its ExitBB member is created here.
/// \param WST Worker function state providing the function workers call.
203 void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF,
204 EntryFunctionState &EST,
205 WorkerFunctionState &WST) {
206 CGBuilderTy &Bld = CGF.Builder;
208 // Get the master thread id.
209 llvm::Value *MasterID = getMasterThreadID(CGF);
210 // Current thread's identifier.
211 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
213 // Setup BBs in entry function.
214 llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker");
215 llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
216 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
// ExitBB is only created here; it is placed into the function by
// emitEntryFooter.
217 EST.ExitBB = CGF.createBasicBlock(".exit");
219 // The head (master thread) marches on while its body of companion threads in
220 // the warp go to sleep.
221 llvm::Value *ShouldDie =
222 Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp");
223 Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB);
225 // Select worker threads...
226 CGF.EmitBlock(WorkerCheckBB);
227 llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker");
228 Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB);
230 // ... and send to worker loop, awaiting parallel invocation.
231 CGF.EmitBlock(WorkerBB);
232 CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
// Once the worker function returns, the worker leaves through the exit block.
233 CGF.EmitBranch(EST.ExitBB);
235 // Only master thread executes subsequent serial code.
236 CGF.EmitBlock(MasterBB);
238 // First action in sequential region:
239 // Initialize the state of the OpenMP runtime library on the GPU.
// The second argument is a fresh thread-id read rather than the ThreadID value
// above; on this path the thread id equals MasterID, so that is the value
// supplied for the thread_limit parameter.
240 llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)};
// NOTE(review): the remaining argument line of this call is not present in
// this listing (presumably passing Args).
241 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init),
/// \brief Emit the epilogue of a target entry function.
/// Branches into a new ".termination.notifier" block, stores zero to WorkID
/// (the value on which emitWorkerLoop leaves its loop), branches to
/// EST.ExitBB and finally places EST.ExitBB into the function.
/// \param CGF Code generation state of the entry function.
/// \param EST Entry function state whose ExitBB was created by
/// emitEntryHeader.
245 void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF,
246 EntryFunctionState &EST) {
247 CGBuilderTy &Bld = CGF.Builder;
248 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
249 CGF.EmitBranch(TerminateBB);
251 CGF.EmitBlock(TerminateBB);
252 // Signal termination condition.
// Store a null value of WorkID's element type, using the global's alignment.
253 Bld.CreateAlignedStore(
254 llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID,
255 WorkID->getAlignment());
256 // Barrier to terminate worker threads.
// NOTE(review): the barrier statement itself is not present in this listing.
258 // Master thread jumps to exit point.
259 CGF.EmitBranch(EST.ExitBB);
261 CGF.EmitBlock(EST.ExitBB);
264 /// \brief Returns specified OpenMP runtime function for the current OpenMP
265 /// implementation. Specialized for the NVPTX device.
266 /// \param Function OpenMP runtime function.
267 /// \return Specified function.
// \p Function is expected to hold an OpenMPRTLFunctionNVPTX value; it is cast
// to that enum before the switch.
// NOTE(review): the return-type line and the end of the switch/function are
// not present in this listing.
269 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
270 llvm::Constant *RTLFn = nullptr;
271 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
272 case OMPRTL_NVPTX__kmpc_kernel_init: {
273 // Build void __kmpc_kernel_init(kmp_int32 omp_handle,
274 // kmp_int32 thread_limit);
// Two 32-bit integer parameters, void result, not variadic.
275 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty};
276 llvm::FunctionType *FnTy =
277 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
278 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
/// \brief Register an offload entry by marking its function as a kernel.
/// Appends the metadata node {function, "kernel", i32 1} to the module's
/// "nvvm.annotations" named metadata.
/// \param ID Not referenced by the lines visible in this listing.
/// \param Addr Entry address; handled only when it is an llvm::Function.
// NOTE(review): the remaining parameter line(s) of the signature are not
// present in this listing.
285 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
286 llvm::Constant *Addr,
// dyn_cast yields null when Addr is not an llvm::Function.
288 auto *F = dyn_cast<llvm::Function>(Addr);
289 // TODO: Add support for global variables on the device after declare target
// NOTE(review): F is dereferenced below; a null guard is not visible in this
// listing (the lines following the TODO are absent) -- confirm that one
// exists.
293 llvm::Module *M = F->getParent();
294 llvm::LLVMContext &Ctx = M->getContext();
296 // Get "nvvm.annotations" metadata node
297 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
// Operands: the function, the string "kernel", and the 32-bit constant 1.
299 llvm::Metadata *MDVals[] = {
300 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
301 llvm::ConstantAsMetadata::get(
302 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
303 // Append metadata to nvvm.annotations
304 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
/// \brief Emit the outlined function for a target region together with its
/// companion worker function.
/// A pre/post action is attached to \p CodeGen so that emitEntryHeader runs on
/// entry to the region and emitEntryFooter on exit. The outlined function is
/// then produced by emitTargetOutlinedFunctionHelper, the worker function body
/// is emitted, and the worker is renamed to "<outlined function name>_worker".
/// \param D Target directive being emitted.
/// \param ParentName Name of the enclosing function; asserted to be non-empty.
/// \param OutlinedFn Receives the outlined function (its name is read after
/// the helper call).
/// \param OutlinedFnID Passed through to the helper.
/// \param IsOffloadEntry The `if (!IsOffloadEntry)` check below is commented
/// "Nothing to do" for the false case.
/// \param CodeGen Region code generator that receives the pre/post action.
307 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
308 const OMPExecutableDirective &D, StringRef ParentName,
309 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
310 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
// NOTE(review): the statement controlled by this `if` is not present in this
// listing (presumably an early return).
311 if (!IsOffloadEntry) // Nothing to do.
314 assert(!ParentName.empty() && "Invalid target region parent name!");
// Per-region state: EST carries the exit block, WST creates and carries the
// worker function.
316 EntryFunctionState EST;
317 WorkerFunctionState WST(CGM);
319 // Emit target region as a standalone region.
// Local action type that forwards region entry/exit to this runtime object.
320 class NVPTXPrePostActionTy : public PrePostActionTy {
321 CGOpenMPRuntimeNVPTX &RT;
322 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
323 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
326 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
327 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
328 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
329 : RT(RT), EST(EST), WST(WST) {}
330 void Enter(CodeGenFunction &CGF) override {
331 RT.emitEntryHeader(CGF, EST, WST);
333 void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); }
334 } Action(*this, EST, WST);
335 CodeGen.setAction(Action);
336 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
337 IsOffloadEntry, CodeGen);
339 // Create the worker function
340 emitWorkerFunction(WST);
342 // Now change the name of the worker function to correspond to this target
343 // region's entry function.
344 WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
/// \brief Construct the NVPTX OpenMP runtime code generator.
/// ActiveWorkers and WorkID start out null and are created by
/// initializeEnvironment. Construction with LangOpts.OpenMPIsDevice unset hits
/// llvm_unreachable.
/// \param CGM Module-level code generation state, forwarded to the base class.
347 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
348 : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) {
349 if (!CGM.getLangOpts().OpenMPIsDevice)
350 llvm_unreachable("OpenMP NVPTX can only handle device code.");
352 // Called once per module during initialization.
353 initializeEnvironment();
/// \brief Handle the num_teams / thread_limit clauses: intentionally a no-op
/// here (the body is empty), so no code is emitted for them.
/// \param CGF Unused.
/// \param NumTeams Unused.
/// \param ThreadLimit Unused.
/// \param Loc Unused.
356 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
357 const Expr *NumTeams,
358 const Expr *ThreadLimit,
359 SourceLocation Loc) {}
/// \brief Emit the outlined function for a teams (or parallel) directive.
/// For an OMPTeamsDirective the base-class implementation is used and the
/// resulting function is marked AlwaysInline. An llvm_unreachable covers the
/// unsupported parallel directive.
/// \param D Directive being outlined.
/// \param ThreadIDVar Forwarded to the base-class implementation.
/// \param InnermostKind Forwarded to the base-class implementation.
/// \param CodeGen Forwarded to the base-class implementation.
// NOTE(review): the `else` branch structure and the return statement are not
// present in this listing.
361 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
362 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
363 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
365 llvm::Function *OutlinedFun = nullptr;
366 if (isa<OMPTeamsDirective>(D)) {
367 llvm::Value *OutlinedFunVal =
368 CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
369 D, ThreadIDVar, InnermostKind, CodeGen);
// cast<> (not dyn_cast<>): the base class is expected to hand back an
// llvm::Function here.
370 OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
371 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
373 llvm_unreachable("parallel directive is not yet supported for nvptx "
/// \brief Emit a direct call to the outlined teams function.
/// The call's arguments are the pointer of a temporary 32-bit slot initialized
/// to zero, passed twice, followed by \p CapturedVars.
/// \param CGF Code generation state of the calling function.
/// \param D Not referenced by the lines visible in this listing.
/// \param OutlinedFn Function to call.
/// \param CapturedVars Values appended after the two leading arguments.
// NOTE(review): one parameter line of the signature, the statement controlled
// by the `if`, and the declaration line of ZeroAddr are not present in this
// listing.
379 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
380 const OMPExecutableDirective &D,
382 llvm::Value *OutlinedFn,
383 ArrayRef<llvm::Value *> CapturedVars) {
384 if (!CGF.HaveInsertPoint())
// Temporary 32-bit slot with 4-byte alignment, named ".zero.addr".
388 CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
389 /*Name*/ ".zero.addr");
390 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
391 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
// The same zero slot is used for both leading arguments; presumably these are
// the outlined function's two thread-id pointer parameters -- confirm against
// the outlined function's signature.
392 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
393 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
394 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
395 CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);