//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This provides a class for OpenMP runtime code generation specialized to NVPTX // targets. // //===----------------------------------------------------------------------===// #include "CGOpenMPRuntimeNVPTX.h" #include "clang/AST/DeclOpenMP.h" #include "CodeGenFunction.h" #include "clang/AST/StmtOpenMP.h" using namespace clang; using namespace CodeGen; namespace { enum OpenMPRTLFunctionNVPTX { /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); OMPRTL_NVPTX__kmpc_kernel_init, /// \brief Call to void __kmpc_kernel_deinit(); OMPRTL_NVPTX__kmpc_kernel_deinit, }; } // namespace /// Get the GPU warp size. static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; return Bld.CreateCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), llvm::None, "nvptx_warp_size"); } /// Get the id of the current thread on the GPU. static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; return Bld.CreateCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), llvm::None, "nvptx_tid"); } /// Get the maximum number of threads in a block of the GPU. static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; return Bld.CreateCall( llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), llvm::None, "nvptx_num_threads"); } /// Get barrier to synchronize all threads in a block. static void getNVPTXCTABarrier(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; Bld.CreateCall(llvm::Intrinsic::getDeclaration( &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); } /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } /// Get the value of the thread_limit clause in the teams directive. /// The runtime encodes thread_limit in the launch parameter, always starting /// thread_limit+warpSize threads per team. static llvm::Value *getThreadLimit(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), "thread_limit"); } /// Get the thread id of the OMP master thread. /// The master thread id is the first thread (lane) of the last warp in the /// GPU block. Warp size is assumed to be some power of 2. /// Thread id is 0 indexed. /// E.g: If NumThreads is 33, master id is 32. /// If NumThreads is 64, master id is 32. /// If NumThreads is 1024, master id is 992. static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; llvm::Value *NumThreads = getNVPTXNumThreads(CGF); // We assume that the warp size is a power of 2. llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), Bld.CreateNot(Mask), "master_tid"); } CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( CodeGenModule &CGM) : WorkerFn(nullptr), CGFI(nullptr) { createWorkerFunction(CGM); } void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( CodeGenModule &CGM) { // Create an worker function with no arguments. CGFI = &CGM.getTypes().arrangeNullaryFunction(); WorkerFn = llvm::Function::Create( CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, /* placeholder */ "_worker", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); } void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { EntryFunctionState EST; WorkerFunctionState WST(CGM); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { CGOpenMPRuntimeNVPTX &RT; CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; public: NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) : RT(RT), EST(EST), WST(WST) {} void Enter(CodeGenFunction &CGF) override { RT.emitGenericEntryHeader(CGF, EST, WST); } void Exit(CodeGenFunction &CGF) override { RT.emitGenericEntryFooter(CGF, EST); } } Action(*this, EST, WST); CodeGen.setAction(Action); emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); // Create the worker function emitWorkerFunction(WST); // Now change the name of the worker function to correspond to this target // region's entry function. WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); } // Setup NVPTX threads for master-worker OpenMP scheme. void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, WorkerFunctionState &WST) { CGBuilderTy &Bld = CGF.Builder; llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); EST.ExitBB = CGF.createBasicBlock(".exit"); auto *IsWorker = Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); CGF.EmitBlock(WorkerBB); CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); CGF.EmitBranch(EST.ExitBB); CGF.EmitBlock(MasterCheckBB); auto *IsMaster = Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); CGF.EmitBlock(MasterBB); // First action in sequential region: // Initialize the state of the OpenMP runtime library on the GPU. llvm::Value *Args[] = {getThreadLimit(CGF)}; CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); } void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST) { if (!EST.ExitBB) EST.ExitBB = CGF.createBasicBlock(".exit"); llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); CGF.EmitBranch(TerminateBB); CGF.EmitBlock(TerminateBB); // Signal termination condition. CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); // Barrier to terminate worker threads. syncCTAThreads(CGF); // Master thread jumps to exit point. CGF.EmitBranch(EST.ExitBB); CGF.EmitBlock(EST.ExitBB); EST.ExitBB = nullptr; } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { auto &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); emitWorkerLoop(CGF, WST); CGF.FinishFunction(); } void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST) { // // The workers enter this loop and wait for parallel work from the master. // When the master encounters a parallel region it sets up the work + variable // arguments, and wakes up the workers. The workers first check to see if // they are required for the parallel region, i.e., within the # of requested // parallel threads. The activated workers load the variable arguments and // execute the parallel work. // CGBuilderTy &Bld = CGF.Builder; llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); CGF.EmitBranch(AwaitBB); // Workers wait for work from master. CGF.EmitBlock(AwaitBB); // Wait for parallel work syncCTAThreads(CGF); Address WorkFn = CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); Address ExecStatus = CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); // TODO: Call into runtime to get parallel work. // On termination condition (workid == 0), exit loop. llvm::Value *ShouldTerminate = Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. CGF.EmitBlock(SelectWorkersBB); llvm::Value *IsActive = Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); // TODO: Add parallel work. // Signal end of parallel region. CGF.EmitBlock(TerminateBB); CGF.EmitBranch(BarrierBB); // All active and inactive workers wait at a barrier after parallel region. CGF.EmitBlock(BarrierBB); // Barrier after parallel region. syncCTAThreads(CGF); CGF.EmitBranch(AwaitBB); // Exit target region. CGF.EmitBlock(ExitBB); } /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. /// \return Specified function. llvm::Constant * CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast(Function)) { case OMPRTL_NVPTX__kmpc_kernel_init: { // Build void __kmpc_kernel_init(kmp_int32 thread_limit); llvm::Type *TypeParams[] = {CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); break; } case OMPRTL_NVPTX__kmpc_kernel_deinit: { // Build void __kmpc_kernel_deinit(); llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); break; } } return RTLFn; } void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t) { auto *F = dyn_cast(Addr); // TODO: Add support for global variables on the device after declare target // support. if (!F) return; llvm::Module *M = F->getParent(); llvm::LLVMContext &Ctx = M->getContext(); // Get "nvvm.annotations" metadata node llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); llvm::Metadata *MDVals[] = { llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), llvm::ConstantAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; // Append metadata to nvvm.annotations MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); } void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { if (!IsOffloadEntry) // Nothing to do. return; assert(!ParentName.empty() && "Invalid target region parent name!"); emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) : CGOpenMPRuntime(CGM) { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); } void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) {} llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { llvm::Function *OutlinedFun = nullptr; if (isa(D)) { llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( D, ThreadIDVar, InnermostKind, CodeGen); OutlinedFun = cast(OutlinedFunVal); OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); } else llvm_unreachable("parallel directive is not yet supported for nvptx " "backend."); return OutlinedFun; } void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Value *OutlinedFn, ArrayRef CapturedVars) { if (!CGF.HaveInsertPoint()) return; Address ZeroAddr = CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), /*Name*/ ".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); llvm::SmallVector OutlinedFnArgs; OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); }