//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// NVPTX targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
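
// NVPTXActionTy is used, for example, by emitGenericParallelCall below to
// bracket the serialized lowering of a parallel region with
// __kmpc_serialized_parallel(...) on entry and
// __kmpc_end_serialized_parallel(...) on exit.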
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      llvm::None, "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      llvm::None, "nvptx_tid");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      llvm::None, "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Get the value of the thread_limit clause in the teams directive.
/// The runtime encodes thread_limit in the launch parameter, always starting
/// thread_limit+warpSize threads per team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                       "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block.  Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI),
      llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}
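
// The entry header/footer pair emitted below lays out a generic target kernel
// roughly as follows (a sketch of the emitted control flow, not literal IR):
//
//   if (nvptx_tid < thread_limit)           // .worker
//     call the worker function;
//   else if (nvptx_tid == master_tid) {     // .master
//     __kmpc_kernel_init(thread_limit);
//     ... target region body ...
//     __kmpc_kernel_deinit();               // .termination.notifier
//     barrier();                            // release workers from their loop
//   }
//   // every thread falls through to .exit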
// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  auto &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work +
  // variable arguments, and wakes up the workers.  The workers first check to
  // see if they are required for the parallel region, i.e., within the # of
  // requested parallel threads.  The activated workers load the variable
  // arguments and execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);
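
  // Per worker, the blocks emitted below implement roughly the following loop
  // (a sketch, not the literal generated IR):
  //
  //   for (;;) {
  //     barrier();                                 // wait for the master
  //     active = __kmpc_kernel_parallel(&work_fn); // fetch work descriptor
  //     if (work_fn == NULL)                       // termination signal
  //       break;
  //     if (active) {
  //       work_fn(...);                            // run matching outlined fn
  //       __kmpc_kernel_end_parallel();
  //     }
  //     barrier();                                 // end of parallel region
  //   }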
  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (work function is null), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
    CGF.EmitCallOrInvoke(Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation.  Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
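
  // The annotation appended below marks the entry point as a CUDA kernel so
  // the NVPTX backend emits it as such.  In textual IR it looks roughly like
  // (the entry name and signature are illustrative):
  //
  //   !nvvm.annotations = !{!0}
  //   !0 = !{void (...)* @__omp_offloading_<...>, !"kernel", i32 1}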
  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                    CodeGen);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Function *OutlinedFun = nullptr;
  if (isa<OMPTeamsDirective>(D)) {
    llvm::Value *OutlinedFunVal =
        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
            D, ThreadIDVar, InnermostKind, CodeGen);
    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
    OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
    OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
  } else {
    llvm::Value *OutlinedFunVal =
        CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
            D, ThreadIDVar, InnermostKind, CodeGen);
    OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  }

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}
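
// For a parallel region in generic mode the master emits one of two sequences
// (a sketch of the emitted calls, not literal IR):
//
//   // Parallel path: hand the outlined function to the workers.
//   __kmpc_kernel_prepare_parallel(outlined_fn);
//   barrier();   // wake the workers (pairs with the wait in the worker loop)
//   barrier();   // implied barrier at the end of the parallel region
//
//   // Serialized path (e.g. an if clause that evaluates to false):
//   __kmpc_serialized_parallel(loc, global_tid);
//   outlined_fn(/*global_tid=*/NULL, /*bound_tid=*/NULL, captured vars...);
//   __kmpc_end_serialized_parallel(loc, global_tid);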
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  auto &&L0ParallelGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region.  Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers.  This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region.  After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                      PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}