//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//
15 #include "CGOpenMPRuntimeNVPTX.h"
16 #include "clang/AST/DeclOpenMP.h"
17 #include "CodeGenFunction.h"
18 #include "clang/AST/StmtOpenMP.h"
20 using namespace clang;
21 using namespace CodeGen;
24 enum OpenMPRTLFunctionNVPTX {
25 /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
26 /// int16_t RequiresOMPRuntime);
27 OMPRTL_NVPTX__kmpc_kernel_init,
28 /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
29 OMPRTL_NVPTX__kmpc_kernel_deinit,
30 /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
31 /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
32 OMPRTL_NVPTX__kmpc_spmd_kernel_init,
33 /// \brief Call to void __kmpc_spmd_kernel_deinit();
34 OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
35 /// \brief Call to void __kmpc_kernel_prepare_parallel(void
36 /// *outlined_function, void ***args, kmp_int32 nArgs, int16_t
37 /// IsOMPRuntimeInitialized);
38 OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
39 /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
40 /// ***args, int16_t IsOMPRuntimeInitialized);
41 OMPRTL_NVPTX__kmpc_kernel_parallel,
42 /// \brief Call to void __kmpc_kernel_end_parallel();
43 OMPRTL_NVPTX__kmpc_kernel_end_parallel,
44 /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
46 OMPRTL_NVPTX__kmpc_serialized_parallel,
47 /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
49 OMPRTL_NVPTX__kmpc_end_serialized_parallel,
50 /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
51 /// int16_t lane_offset, int16_t warp_size);
52 OMPRTL_NVPTX__kmpc_shuffle_int32,
53 /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
54 /// int16_t lane_offset, int16_t warp_size);
55 OMPRTL_NVPTX__kmpc_shuffle_int64,
56 /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
57 /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
58 /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
59 /// lane_offset, int16_t shortCircuit),
60 /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
61 OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
62 /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
63 /// int32_t num_vars, size_t reduce_size, void *reduce_data,
64 /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
65 /// lane_offset, int16_t shortCircuit),
66 /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
67 /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
68 /// int32_t index, int32_t width),
69 /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
70 /// index, int32_t width, int32_t reduce))
71 OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
72 /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
73 OMPRTL_NVPTX__kmpc_end_reduce_nowait
76 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
77 class NVPTXActionTy final : public PrePostActionTy {
78 llvm::Value *EnterCallee;
79 ArrayRef<llvm::Value *> EnterArgs;
80 llvm::Value *ExitCallee;
81 ArrayRef<llvm::Value *> ExitArgs;
83 llvm::BasicBlock *ContBlock = nullptr;
86 NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
87 llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
88 bool Conditional = false)
89 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
90 ExitArgs(ExitArgs), Conditional(Conditional) {}
91 void Enter(CodeGenFunction &CGF) override {
92 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
94 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
95 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
96 ContBlock = CGF.createBasicBlock("omp_if.end");
97 // Generate the branch (If-stmt)
98 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
99 CGF.EmitBlock(ThenBlock);
102 void Done(CodeGenFunction &CGF) {
103 // Emit the rest of blocks/branches
104 CGF.EmitBranch(ContBlock);
105 CGF.EmitBlock(ContBlock, true);
107 void Exit(CodeGenFunction &CGF) override {
108 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
112 // A class to track the execution mode when codegening directives within
113 // a target region. The appropriate mode (generic/spmd) is set on entry
114 // to the target region and used by containing directives such as 'parallel'
115 // to emit optimized code.
116 class ExecutionModeRAII {
118 CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
119 CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
122 ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
123 CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
128 ~ExecutionModeRAII() { Mode = SavedMode; }
131 /// GPU Configuration: This information can be derived from cuda registers,
132 /// however, providing compile time constants helps generate more efficient
133 /// code. For all practical purposes this is fine because the configuration
134 /// is the same for all known NVPTX architectures.
135 enum MachineConfiguration : unsigned {
137 /// Number of bits required to represent a lane identifier, which is
138 /// computed as log_2(WarpSize).
140 LaneIDMask = WarpSize - 1,
142 /// Global memory alignment for performance.
143 GlobalMemoryAlignment = 256,
146 enum NamedBarrier : unsigned {
147 /// Synchronize on this barrier #ID using a named barrier primitive.
148 /// Only the subset of active threads in a parallel region arrive at the
152 } // anonymous namespace
154 /// Get the GPU warp size.
155 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
156 return CGF.EmitRuntimeCall(
157 llvm::Intrinsic::getDeclaration(
158 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
162 /// Get the id of the current thread on the GPU.
163 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
164 return CGF.EmitRuntimeCall(
165 llvm::Intrinsic::getDeclaration(
166 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
170 /// Get the id of the warp in the block.
171 /// We assume that the warp size is 32, which is always the case
172 /// on the NVPTX device, to generate more efficient code.
173 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
174 CGBuilderTy &Bld = CGF.Builder;
175 return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
178 /// Get the id of the current lane in the Warp.
179 /// We assume that the warp size is 32, which is always the case
180 /// on the NVPTX device, to generate more efficient code.
181 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
182 CGBuilderTy &Bld = CGF.Builder;
183 return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
187 /// Get the maximum number of threads in a block of the GPU.
188 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
189 return CGF.EmitRuntimeCall(
190 llvm::Intrinsic::getDeclaration(
191 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
192 "nvptx_num_threads");
195 /// Get barrier to synchronize all threads in a block.
196 static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
197 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
198 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
201 /// Get barrier #ID to synchronize selected (multiple of warp size) threads in
203 static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
204 llvm::Value *NumThreads) {
205 CGBuilderTy &Bld = CGF.Builder;
206 llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
207 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
208 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
212 /// Synchronize all GPU threads in a block.
213 static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
215 /// Synchronize worker threads in a parallel region.
216 static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
217 return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
220 /// Get the value of the thread_limit clause in the teams directive.
221 /// For the 'generic' execution mode, the runtime encodes thread_limit in
222 /// the launch parameters, always starting thread_limit+warpSize threads per
223 /// CTA. The threads in the last warp are reserved for master execution.
224 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
225 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
226 bool IsInSpmdExecutionMode = false) {
227 CGBuilderTy &Bld = CGF.Builder;
228 return IsInSpmdExecutionMode
229 ? getNVPTXNumThreads(CGF)
230 : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
234 /// Get the thread id of the OMP master thread.
235 /// The master thread id is the first thread (lane) of the last warp in the
236 /// GPU block. Warp size is assumed to be some power of 2.
237 /// Thread id is 0 indexed.
238 /// E.g: If NumThreads is 33, master id is 32.
239 /// If NumThreads is 64, master id is 32.
240 /// If NumThreads is 1024, master id is 992.
241 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
242 CGBuilderTy &Bld = CGF.Builder;
243 llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
245 // We assume that the warp size is a power of 2.
246 llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
248 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
249 Bld.CreateNot(Mask), "master_tid");
252 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
254 : WorkerFn(nullptr), CGFI(nullptr) {
255 createWorkerFunction(CGM);
258 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
259 CodeGenModule &CGM) {
260 // Create an worker function with no arguments.
261 CGFI = &CGM.getTypes().arrangeNullaryFunction();
263 WorkerFn = llvm::Function::Create(
264 CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
265 /* placeholder */ "_worker", &CGM.getModule());
266 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
269 bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
270 return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
273 static CGOpenMPRuntimeNVPTX::ExecutionMode
274 getExecutionModeForDirective(CodeGenModule &CGM,
275 const OMPExecutableDirective &D) {
276 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
277 switch (DirectiveKind) {
279 case OMPD_target_teams:
280 return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
281 case OMPD_target_parallel:
282 case OMPD_target_parallel_for:
283 case OMPD_target_parallel_for_simd:
284 return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
286 llvm_unreachable("Unsupported directive on NVPTX device.");
288 llvm_unreachable("Unsupported directive on NVPTX device.");
291 void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
292 StringRef ParentName,
293 llvm::Function *&OutlinedFn,
294 llvm::Constant *&OutlinedFnID,
296 const RegionCodeGenTy &CodeGen) {
297 ExecutionModeRAII ModeRAII(CurrentExecutionMode,
298 CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
299 EntryFunctionState EST;
300 WorkerFunctionState WST(CGM);
302 WrapperFunctionsMap.clear();
304 // Emit target region as a standalone region.
305 class NVPTXPrePostActionTy : public PrePostActionTy {
306 CGOpenMPRuntimeNVPTX &RT;
307 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
308 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
311 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
312 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
313 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
314 : RT(RT), EST(EST), WST(WST) {}
315 void Enter(CodeGenFunction &CGF) override {
316 RT.emitGenericEntryHeader(CGF, EST, WST);
318 void Exit(CodeGenFunction &CGF) override {
319 RT.emitGenericEntryFooter(CGF, EST);
321 } Action(*this, EST, WST);
322 CodeGen.setAction(Action);
323 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
324 IsOffloadEntry, CodeGen);
326 // Create the worker function
327 emitWorkerFunction(WST);
329 // Now change the name of the worker function to correspond to this target
330 // region's entry function.
331 WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
334 // Setup NVPTX threads for master-worker OpenMP scheme.
335 void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
336 EntryFunctionState &EST,
337 WorkerFunctionState &WST) {
338 CGBuilderTy &Bld = CGF.Builder;
340 llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
341 llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
342 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
343 EST.ExitBB = CGF.createBasicBlock(".exit");
346 Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
347 Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
349 CGF.EmitBlock(WorkerBB);
350 emitCall(CGF, WST.WorkerFn);
351 CGF.EmitBranch(EST.ExitBB);
353 CGF.EmitBlock(MasterCheckBB);
355 Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
356 Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
358 CGF.EmitBlock(MasterBB);
359 // First action in sequential region:
360 // Initialize the state of the OpenMP runtime library on the GPU.
361 // TODO: Optimize runtime initialization and pass in correct value.
362 llvm::Value *Args[] = {getThreadLimit(CGF),
363 Bld.getInt16(/*RequiresOMPRuntime=*/1)};
365 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
368 void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
369 EntryFunctionState &EST) {
371 EST.ExitBB = CGF.createBasicBlock(".exit");
373 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
374 CGF.EmitBranch(TerminateBB);
376 CGF.EmitBlock(TerminateBB);
377 // Signal termination condition.
378 // TODO: Optimize runtime initialization and pass in correct value.
379 llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
381 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
382 // Barrier to terminate worker threads.
384 // Master thread jumps to exit point.
385 CGF.EmitBranch(EST.ExitBB);
387 CGF.EmitBlock(EST.ExitBB);
388 EST.ExitBB = nullptr;
391 void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
392 StringRef ParentName,
393 llvm::Function *&OutlinedFn,
394 llvm::Constant *&OutlinedFnID,
396 const RegionCodeGenTy &CodeGen) {
397 ExecutionModeRAII ModeRAII(CurrentExecutionMode,
398 CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
399 EntryFunctionState EST;
401 // Emit target region as a standalone region.
402 class NVPTXPrePostActionTy : public PrePostActionTy {
403 CGOpenMPRuntimeNVPTX &RT;
404 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
405 const OMPExecutableDirective &D;
408 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
409 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
410 const OMPExecutableDirective &D)
411 : RT(RT), EST(EST), D(D) {}
412 void Enter(CodeGenFunction &CGF) override {
413 RT.emitSpmdEntryHeader(CGF, EST, D);
415 void Exit(CodeGenFunction &CGF) override {
416 RT.emitSpmdEntryFooter(CGF, EST);
418 } Action(*this, EST, D);
419 CodeGen.setAction(Action);
420 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
421 IsOffloadEntry, CodeGen);
424 void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
425 CodeGenFunction &CGF, EntryFunctionState &EST,
426 const OMPExecutableDirective &D) {
427 auto &Bld = CGF.Builder;
429 // Setup BBs in entry function.
430 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
431 EST.ExitBB = CGF.createBasicBlock(".exit");
433 // Initialize the OMP state in the runtime; called by all active threads.
434 // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
435 // based on code analysis of the target region.
436 llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
437 /*RequiresOMPRuntime=*/Bld.getInt16(1),
438 /*RequiresDataSharing=*/Bld.getInt16(1)};
440 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
441 CGF.EmitBranch(ExecuteBB);
443 CGF.EmitBlock(ExecuteBB);
446 void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
447 EntryFunctionState &EST) {
449 EST.ExitBB = CGF.createBasicBlock(".exit");
451 llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
452 CGF.EmitBranch(OMPDeInitBB);
454 CGF.EmitBlock(OMPDeInitBB);
455 // DeInitialize the OMP state in the runtime; called by all active threads.
457 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
458 CGF.EmitBranch(EST.ExitBB);
460 CGF.EmitBlock(EST.ExitBB);
461 EST.ExitBB = nullptr;
464 // Create a unique global variable to indicate the execution mode of this target
465 // region. The execution mode is either 'generic', or 'spmd' depending on the
466 // target directive. This variable is picked up by the offload library to setup
467 // the device appropriately before kernel launch. If the execution mode is
468 // 'generic', the runtime reserves one warp for the master, otherwise, all
469 // warps participate in parallel work.
470 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
471 CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
472 (void)new llvm::GlobalVariable(
473 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
474 llvm::GlobalValue::WeakAnyLinkage,
475 llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
478 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
479 ASTContext &Ctx = CGM.getContext();
481 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
482 CGF.disableDebugInfo();
483 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
484 emitWorkerLoop(CGF, WST);
485 CGF.FinishFunction();
488 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
489 WorkerFunctionState &WST) {
491 // The workers enter this loop and wait for parallel work from the master.
492 // When the master encounters a parallel region it sets up the work + variable
493 // arguments, and wakes up the workers. The workers first check to see if
494 // they are required for the parallel region, i.e., within the # of requested
495 // parallel threads. The activated workers load the variable arguments and
496 // execute the parallel work.
499 CGBuilderTy &Bld = CGF.Builder;
501 llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
502 llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
503 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
504 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
505 llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
506 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
508 CGF.EmitBranch(AwaitBB);
510 // Workers wait for work from master.
511 CGF.EmitBlock(AwaitBB);
512 // Wait for parallel work
516 CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
518 CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
519 CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
520 CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
522 // Set up shared arguments
524 CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
525 // TODO: Optimize runtime initialization and pass in correct value.
526 llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer(),
527 /*RequiresOMPRuntime=*/Bld.getInt16(1)};
528 llvm::Value *Ret = CGF.EmitRuntimeCall(
529 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
530 Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
532 // On termination condition (workid == 0), exit loop.
533 llvm::Value *ShouldTerminate =
534 Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
535 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
537 // Activate requested workers.
538 CGF.EmitBlock(SelectWorkersBB);
539 llvm::Value *IsActive =
540 Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
541 Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
543 // Signal start of parallel region.
544 CGF.EmitBlock(ExecuteBB);
547 ASTContext &Ctx = CGF.getContext();
549 // Process work items: outlined parallel functions.
550 for (auto *W : Work) {
551 // Try to match this outlined function.
552 auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
554 llvm::Value *WorkFnMatch =
555 Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
557 llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
558 llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
559 Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
561 // Execute this outlined function.
562 CGF.EmitBlock(ExecuteFNBB);
564 // Insert call to work function via shared wrapper. The shared
565 // wrapper takes exactly three arguments:
566 // - the parallelism level;
567 // - the master thread ID;
568 // - the list of references to shared arguments.
570 // TODO: Assert that the function is a wrapper function.s
571 Address Capture = CGF.EmitLoadOfPointer(SharedArgs,
573 Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>());
574 emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0),
575 getMasterThreadID(CGF), Capture.getPointer()});
577 // Go to end of parallel region.
578 CGF.EmitBranch(TerminateBB);
580 CGF.EmitBlock(CheckNextBB);
583 // Signal end of parallel region.
584 CGF.EmitBlock(TerminateBB);
586 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
588 CGF.EmitBranch(BarrierBB);
590 // All active and inactive workers wait at a barrier after parallel region.
591 CGF.EmitBlock(BarrierBB);
592 // Barrier after parallel region.
594 CGF.EmitBranch(AwaitBB);
596 // Exit target region.
597 CGF.EmitBlock(ExitBB);
600 /// \brief Returns specified OpenMP runtime function for the current OpenMP
601 /// implementation. Specialized for the NVPTX device.
602 /// \param Function OpenMP runtime function.
603 /// \return Specified function.
605 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
606 llvm::Constant *RTLFn = nullptr;
607 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
608 case OMPRTL_NVPTX__kmpc_kernel_init: {
609 // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
610 // RequiresOMPRuntime);
611 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
612 llvm::FunctionType *FnTy =
613 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
614 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
617 case OMPRTL_NVPTX__kmpc_kernel_deinit: {
618 // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
619 llvm::Type *TypeParams[] = {CGM.Int16Ty};
620 llvm::FunctionType *FnTy =
621 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
622 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
625 case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
626 // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
627 // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
628 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
629 llvm::FunctionType *FnTy =
630 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
631 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
634 case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
635 // Build void __kmpc_spmd_kernel_deinit();
636 llvm::FunctionType *FnTy =
637 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
638 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
641 case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
642 /// Build void __kmpc_kernel_prepare_parallel(
643 /// void *outlined_function, void ***args, kmp_int32 nArgs, int16_t
644 /// IsOMPRuntimeInitialized);
645 llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
646 CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty,
648 llvm::FunctionType *FnTy =
649 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
650 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
653 case OMPRTL_NVPTX__kmpc_kernel_parallel: {
654 /// Build bool __kmpc_kernel_parallel(void **outlined_function, void
655 /// ***args, int16_t IsOMPRuntimeInitialized);
656 llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
657 CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int16Ty};
658 llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
659 llvm::FunctionType *FnTy =
660 llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
661 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
664 case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
665 /// Build void __kmpc_kernel_end_parallel();
666 llvm::FunctionType *FnTy =
667 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
668 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
671 case OMPRTL_NVPTX__kmpc_serialized_parallel: {
672 // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
674 llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
675 llvm::FunctionType *FnTy =
676 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
677 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
680 case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
681 // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
683 llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
684 llvm::FunctionType *FnTy =
685 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
686 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
689 case OMPRTL_NVPTX__kmpc_shuffle_int32: {
690 // Build int32_t __kmpc_shuffle_int32(int32_t element,
691 // int16_t lane_offset, int16_t warp_size);
692 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
693 llvm::FunctionType *FnTy =
694 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
695 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
698 case OMPRTL_NVPTX__kmpc_shuffle_int64: {
699 // Build int64_t __kmpc_shuffle_int64(int64_t element,
700 // int16_t lane_offset, int16_t warp_size);
701 llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
702 llvm::FunctionType *FnTy =
703 llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
704 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
707 case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
708 // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
709 // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
710 // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
711 // lane_offset, int16_t Algorithm Version),
712 // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
713 llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
714 CGM.Int16Ty, CGM.Int16Ty};
715 auto *ShuffleReduceFnTy =
716 llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
718 llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
719 auto *InterWarpCopyFnTy =
720 llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
722 llvm::Type *TypeParams[] = {CGM.Int32Ty,
726 ShuffleReduceFnTy->getPointerTo(),
727 InterWarpCopyFnTy->getPointerTo()};
728 llvm::FunctionType *FnTy =
729 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
730 RTLFn = CGM.CreateRuntimeFunction(
731 FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
734 case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
735 // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
736 // int32_t num_vars, size_t reduce_size, void *reduce_data,
737 // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
738 // lane_offset, int16_t shortCircuit),
739 // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
740 // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
741 // int32_t index, int32_t width),
742 // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
743 // int32_t index, int32_t width, int32_t reduce))
744 llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
745 CGM.Int16Ty, CGM.Int16Ty};
746 auto *ShuffleReduceFnTy =
747 llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
749 llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
750 auto *InterWarpCopyFnTy =
751 llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
753 llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
754 CGM.Int32Ty, CGM.Int32Ty};
755 auto *CopyToScratchpadFnTy =
756 llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
758 llvm::Type *LoadReduceTypeParams[] = {
759 CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
760 auto *LoadReduceFnTy =
761 llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
763 llvm::Type *TypeParams[] = {CGM.Int32Ty,
767 ShuffleReduceFnTy->getPointerTo(),
768 InterWarpCopyFnTy->getPointerTo(),
769 CopyToScratchpadFnTy->getPointerTo(),
770 LoadReduceFnTy->getPointerTo()};
771 llvm::FunctionType *FnTy =
772 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
773 RTLFn = CGM.CreateRuntimeFunction(
774 FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
777 case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
778 // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
779 llvm::Type *TypeParams[] = {CGM.Int32Ty};
780 llvm::FunctionType *FnTy =
781 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
782 RTLFn = CGM.CreateRuntimeFunction(
783 FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
790 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
791 llvm::Constant *Addr,
792 uint64_t Size, int32_t) {
793 auto *F = dyn_cast<llvm::Function>(Addr);
794 // TODO: Add support for global variables on the device after declare target
798 llvm::Module *M = F->getParent();
799 llvm::LLVMContext &Ctx = M->getContext();
801 // Get "nvvm.annotations" metadata node
802 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
804 llvm::Metadata *MDVals[] = {
805 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
806 llvm::ConstantAsMetadata::get(
807 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
808 // Append metadata to nvvm.annotations
809 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
812 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
813 const OMPExecutableDirective &D, StringRef ParentName,
814 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
815 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
816 if (!IsOffloadEntry) // Nothing to do.
819 assert(!ParentName.empty() && "Invalid target region parent name!");
821 CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
822 getExecutionModeForDirective(CGM, D);
824 case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
825 emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
828 case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
829 emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
832 case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
834 "Unknown programming model for OpenMP directive on NVPTX target.");
837 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
840 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
841 : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
842 if (!CGM.getLangOpts().OpenMPIsDevice)
843 llvm_unreachable("OpenMP NVPTX can only handle device code.");
/// Emit the 'proc_bind' clause. In Spmd execution mode the clause is
/// suppressed for the level-0 parallel region; otherwise defer to the
/// generic host implementation.
void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
/// Emit the 'num_threads' clause. Mirrors emitProcBindClause: a no-op for
/// Spmd mode level-0 parallel, otherwise the generic implementation runs.
void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
/// Intentionally a no-op on the NVPTX device: nothing is emitted for
/// 'num_teams'/'thread_limit' here. NOTE(review): presumably these limits
/// are enforced elsewhere (e.g. host-side launch configuration / kernel
/// init) — confirm against the host codegen before relying on this.
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
/// Emit the outlined function for a 'parallel' directive. In Generic (non-
/// Spmd) mode a data-sharing wrapper is additionally created for the
/// outlined function and remembered in WrapperFunctionsMap so that
/// emitGenericParallelCall can hand the wrapper to the workers.
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  auto *OutlinedFun = cast<llvm::Function>(
      CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  if (!isInSpmdExecutionMode()) {
    // Generic mode: workers invoke the region through this wrapper.
    llvm::Function *WrapperFun =
        createDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
/// Emit the outlined function for a 'teams' directive and force it to be
/// inlined at its (single) call site by stripping NoInline/OptimizeNone and
/// adding AlwaysInline.
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  // Replace the default "don't inline" attributes with AlwaysInline so the
  // teams region is merged into the kernel entry.
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
/// Emit the call to a 'teams' outlined function. The first two arguments of
/// the outlined function (global-tid / bound-tid pointers) are satisfied
/// with a pointer to a zero-initialized temporary, followed by the captured
/// variables.
void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
  CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                       /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  // Both tid arguments point at the same zeroed slot.
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
/// Dispatch a 'parallel' region call to the scheme matching the current
/// execution mode: Spmd (all threads execute) or Generic (master/worker).
void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
/// Emit a 'parallel' region call in Generic (master/worker) mode.
/// The master thread publishes the wrapper function (and, if there are
/// captured variables, a shared-argument buffer) via
/// __kmpc_kernel_prepare_parallel, releases the workers with a CTA barrier,
/// waits for them at a second barrier, and records the wrapper in Work for
/// the worker loop. An 'if' clause falls back to a serialized execution path
/// (SeqGen) that calls the outlined function directly between
/// __kmpc_serialized_parallel / __kmpc_end_serialized_parallel.
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  llvm::Function *WFn = WrapperFunctionsMap[Fn];
  assert(WFn && "Wrapper function does not exist!");
  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  // Code generated on the master thread to launch the workers.
  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
    CGBuilderTy &Bld = CGF.Builder;
    // The wrapper function is the opaque "outlined function" token passed
    // to the runtime.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    if (!CapturedVars.empty()) {
      // There's something to share, add the attribute
      CGF.CurFn->addFnAttr("has-nvptx-shared-depot");
      // Prepare for parallel region. Indicate the outlined function.
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {ID, SharedArgsPtr,
                             Bld.getInt32(CapturedVars.size()),
                             /*RequiresOMPRuntime=*/Bld.getInt16(1)};
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
      // Store each captured variable, as a void*, into the runtime-provided
      // shared-argument array.
      ASTContext &Ctx = CGF.getContext();
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            CGF.EmitLoadOfPointer(SharedArgs,
                Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()),
            Idx, CGF.getPointerSize());
        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
      // No captured variables: pass a null argument buffer.
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {
          ID, llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
          /*nArgs=*/Bld.getInt32(0), /*RequiresOMPRuntime=*/Bld.getInt16(1)};
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);
    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};
  // Serialized fallback used when the 'if' clause evaluates to false: run
  // the outlined function inline on the current thread.
  auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
                                                    PrePostActionTy &Action) {
      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      // Null global-tid / bound-tid pointers for the serialized call.
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    RegionCodeGenTy RCG(CodeGen);
    // Bracket the serialized region with the runtime enter/exit calls.
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
    RCG.setAction(Action);
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
/// Emit a 'parallel' region call in Spmd mode: every thread is already
/// executing, so the outlined function is simply called directly with null
/// global-tid / bound-tid pointers followed by the captured variables.
void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&gtid, &zero, CapturedStruct);
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp: __kmpc_shuffle_int32 for elements of
/// up to 4 bytes and __kmpc_shuffle_int64 for elements of 5-8 bytes.
/// The element is sign-extended/bitcast to the 32/64-bit shuffle type before
/// the call and truncated/bitcast back afterwards.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");
  // Pick the 32- or 64-bit runtime shuffle based on the element size.
  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;
  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  // The runtime expects the warp size as a 16-bit value.
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());
  CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                      {ElemCast, Offset, WarpSize});
  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
/// Direction/kind of a Reduce-list copy performed by emitReductionListCopy.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
/// Optional parameters for emitReductionListCopy; unused fields are null.
struct CopyOptionsTy {
  // Lane delta for RemoteLaneToThread shuffles.
  llvm::Value *RemoteLaneOffset;
  // Element index into the scratchpad array (scratchpad copies only).
  llvm::Value *ScratchpadIndex;
  // Row width of the scratchpad array (scratchpad copies only).
  llvm::Value *ScratchpadWidth;
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
/// For each private element: compute the source and destination addresses
/// per \p Action, load the element, optionally shuffle it in from a remote
/// lane, store it, optionally repoint the destination list entry at a fresh
/// temporary, and (for scratchpad copies) advance the scratchpad base to the
/// next aligned row.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
  // Iterates, element-by-element, through the source Reduce list and
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      // Step 1.2: Create a temporary to store the element in the destination
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      // ThreadCopy:
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      Address ScratchpadPtr =
          Address(ScratchPadElemAbsolutePtrVal,
                  C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
      IncrScratchpadDest = true;
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;
      // Step 1.2: Create a temporary to store the element in the destination
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                         Private->getType(), SourceLocation());
    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());
    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      // Skip over the full row (width * element size) for this element.
      ScratchpadBasePtr = Bld.CreateAdd(
          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
                                             CGM.SizeTy, ElementSizeInChars)));
      // Take care of global memory alignment for performance
      // (round the base up to the next GlobalMemoryAlignment boundary).
      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateSDiv(
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateMul(
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
///  load_and_reduce(local, scratchpad, index, width, should_reduce)
///    reduce_data remote;
///    for elem in remote:
///      remote.elem = Scratchpad[elem_id][index]
///    if (should_reduce)
///      local = local @ remote
static llvm::Value *
emitReduceScratchpadFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce,
  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);
  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
  auto &Bld = CGF.Builder;
  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());
  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
  // Widen index and width to size_t for address arithmetic.
  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);
  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);
  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());
  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});
  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  // Branch on the should_reduce flag.
  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);
  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);
  CGF.EmitBlock(MergeBB);
  CGF.FinishFunction();
/// This function emits a helper that stores reduced data from the team
/// master to a scratchpad array in global memory.
///
///  for elem in Reduce List:
///    scratchpad[elem_id][index] = elem
static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                         ArrayRef<const Expr *> Privates,
                                         QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
  // Source of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A destination index into the scratchpad array, typically the team
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
  auto &Bld = CGF.Builder;
  // Get the local Reduce list (source of the copy).
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address SrcDataAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());
  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
  // Widen index and width to size_t for address arithmetic.
  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);
  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);
  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, DestDataAddr,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});
  CGF.FinishFunction();
/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
///  void inter_warp_copy_func(void* reduce_data, num_warps)
///    shared smem[warp_size];
///    For all data entries D in reduce_data:
///      If (I am the first lane in each warp)
///        Copy my local D to smem[warp_id]
///      if (I am the first warp)
///        Copy smem[thread_id] to my local D
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();
  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);
  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
  auto &Bld = CGF.Builder;
  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    // Lazily create the transfer buffer: WarpSize i64 slots in CUDA shared
    // memory address space.
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());
  for (auto &Private : Privates) {
    // Warp master copies reduce element to transfer medium in __shared__
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
    // if (lane_id == 0)
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);
    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    Bld.CreateStore(Elem, MediumPtr);
    Bld.CreateBr(MergeBB);
    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);
    CGF.EmitBlock(MergeBB);
    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
    // All participating threads wait until every warp master has published
    // its element to the transfer medium.
    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);
    // Warp 0 copies reduce element from transfer medium.
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
    CGF.EmitBlock(W0ThenBB);
    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);
    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);
    CGF.EmitBlock(W0MergeBB);
    // While warp 0 copies values from transfer medium, all other warps must
    syncParallelThreads(CGF, NumActiveThreads);
  CGF.FinishFunction();
1718 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1719 /// in the same warp. It uses shuffle instructions to copy over data from
1720 /// a remote lane's stack. The reduction algorithm performed is specified
1721 /// by the fourth parameter.
1723 /// Algorithm Versions.
1724 /// Full Warp Reduce (argument value 0):
1725 /// This algorithm assumes that all 32 lanes are active and gathers
1726 /// data from these 32 lanes, producing a single resultant value.
1727 /// Contiguous Partial Warp Reduce (argument value 1):
1728 /// This algorithm assumes that only a *contiguous* subset of lanes
1729 /// are active. This happens for the last warp in a parallel region
1730 /// when the user specified num_threads is not an integer multiple of
1731 /// 32. This contiguous subset always starts with the zeroth lane.
1732 /// Partial Warp Reduce (argument value 2):
1733 /// This algorithm gathers data from any number of lanes at any position.
1734 /// All reduced values are stored in the lowest possible lane. The set
1735 /// of problems every algorithm addresses is a super set of those
1736 /// addressable by algorithms with a lower version number. Overhead
1737 /// increases as algorithm version increases.
1741 /// Reduce element refers to the individual data field with primitive
1742 /// data types to be combined and reduced across threads.
1744 /// Reduce list refers to a collection of local, thread-private
1745 /// reduce elements.
1746 /// Remote Reduce list:
1747 /// Remote Reduce list refers to a collection of remote (relative to
1748 /// the current thread) reduce elements.
1750 /// We distinguish between three states of threads that are important to
1751 /// the implementation of this function.
1753 /// Threads in a warp executing the SIMT instruction, as distinguished from
1754 /// threads that are inactive due to divergent control flow.
1756 /// The minimal set of threads that has to be alive upon entry to this
1757 /// function. The computation is correct iff active threads are alive.
1758 /// Some threads are alive but they are not active because they do not
1759 /// contribute to the computation in any useful manner. Turning them off
1760 /// may introduce control flow overheads without any tangible benefits.
1761 /// Effective threads:
1762 /// In order to comply with the argument requirements of the shuffle
1763 /// function, we must keep all lanes holding data alive. But at most
1764 /// half of them perform value aggregation; we refer to this half of
1765 /// threads as effective. The other half is simply handing off their
1770 /// In this step active threads transfer data from higher lane positions
1771 /// in the warp to lower lane positions, creating Remote Reduce list.
1772 /// Value aggregation:
1773 /// In this step, effective threads combine their thread local Reduce list
1774 /// with Remote Reduce list and store the result in the thread local
1777 /// In this step, we deal with the assumption made by algorithm 2
1778 /// (i.e. contiguity assumption). When we have an odd number of lanes
1779 /// active, say 2k+1, only k threads will be effective and therefore k
1780 /// new values will be produced. However, the Reduce list owned by the
1781 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1782 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1783 /// that the contiguity assumption still holds.
// Builds the internal helper "_omp_reduction_shuffle_and_reduce_func" with
// the signature (void *reduce_list, int16_t lane_id,
// int16_t remote_lane_offset, int16_t algo_version), as set up by the
// ImplicitParamDecls below. The emitted function pulls a remote lane's
// Reduce list over via shuffle, conditionally reduces it into the local
// list via ReduceFn, and conditionally copies the remote list back
// (algorithm-version dependent; see the comment block above).
1784 static llvm::Value *
1785 emitShuffleAndReduceFunction(CodeGenModule &CGM,
1786 ArrayRef<const Expr *> Privates,
1787 QualType ReductionArrayTy, llvm::Value *ReduceFn) {
1788 auto &C = CGM.getContext();
1790 // Thread local Reduce list used to host the values of data to be reduced.
1791 ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
1792 // Current lane id; could be logical.
1793 ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
1794 // Offset of the remote source lane relative to the current lane.
1795 ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
1796 ImplicitParamDecl::Other);
1797 // Algorithm version. This is expected to be known at compile time.
1798 ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
1799 FunctionArgList Args;
1800 Args.push_back(&ReduceListArg);
1801 Args.push_back(&LaneIDArg);
1802 Args.push_back(&RemoteLaneOffsetArg);
1803 Args.push_back(&AlgoVerArg);
1805 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1806 auto *Fn = llvm::Function::Create(
1807 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1808 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
1809 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
1810 CodeGenFunction CGF(CGM);
1811 // We don't need debug information in this function as nothing here refers to
// user code.
1813 CGF.disableDebugInfo();
1814 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1816 auto &Bld = CGF.Builder;
// Reinterpret the opaque void* argument as a pointer to the reduction
// array so elements can be addressed by constant GEPs below.
1818 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1819 Address LocalReduceList(
1820 Bld.CreatePointerBitCastOrAddrSpaceCast(
1821 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1822 C.VoidPtrTy, SourceLocation()),
1823 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1824 CGF.getPointerAlign());
1826 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
1827 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
1828 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1830 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
1831 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
1832 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1834 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
1835 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
1836 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1838 // Create a local thread-private variable to host the Reduce list
1839 // from a remote lane.
1840 Address RemoteReduceList =
1841 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
1843 // This loop iterates through the list of reduce elements and copies,
1844 // element by element, from a remote lane in the warp to RemoteReduceList,
1845 // hosted on the thread's stack.
1846 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
1847 LocalReduceList, RemoteReduceList,
1848 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
1849 /*ScratchpadIndex=*/nullptr,
1850 /*ScratchpadWidth=*/nullptr});
1852 // The actions to be performed on the Remote Reduce list are dependent
1853 // on the algorithm version.
1855 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
1856 // LaneId % 2 == 0 && Offset > 0):
1857 // do the reduction value aggregation
1859 // The thread local variable Reduce list is mutated in place to host the
1860 // reduced data, which is the aggregated value produced from local and
1863 // Note that AlgoVer is expected to be a constant integer known at compile
1865 // When AlgoVer==0, the first conjunction evaluates to true, making
1866 // the entire predicate true during compile time.
1867 // When AlgoVer==1, the second conjunction has only the second part to be
1868 // evaluated during runtime. Other conjunctions evaluate to false
1869 // during compile time.
1870 // When AlgoVer==2, the third conjunction has only the second part to be
1871 // evaluated during runtime. Other conjunctions evaluate to false
1872 // during compile time.
// Materialize the three disjuncts of the predicate above as i1 values.
1873 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1875 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1876 auto CondAlgo1 = Bld.CreateAnd(
1877 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1879 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
// LaneId % 2 == 0 is computed via a bitwise AND with 1.
1880 auto CondAlgo2 = Bld.CreateAnd(
1882 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1884 CondAlgo2 = Bld.CreateAnd(
1885 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1887 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1888 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1890 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1891 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1892 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1893 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1895 CGF.EmitBlock(ThenBB);
1896 // reduce_function(LocalReduceList, RemoteReduceList)
1897 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1898 LocalReduceList.getPointer(), CGF.VoidPtrTy);
1899 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1900 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1901 CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
1902 Bld.CreateBr(MergeBB);
1904 CGF.EmitBlock(ElseBB);
1905 Bld.CreateBr(MergeBB);
1907 CGF.EmitBlock(MergeBB);
1909 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
// Reduce list: preserves the contiguity assumption for an odd number of
// active lanes (see "Reduce list copy" in the algorithm comment above).
1911 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1912 auto CondCopy = Bld.CreateAnd(
1913 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1915 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
1916 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
1917 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
1918 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1920 CGF.EmitBlock(CpyThenBB);
1921 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1922 RemoteReduceList, LocalReduceList);
1923 Bld.CreateBr(CpyMergeBB);
1925 CGF.EmitBlock(CpyElseBB);
1926 Bld.CreateBr(CpyMergeBB);
1928 CGF.EmitBlock(CpyMergeBB);
1930 CGF.FinishFunction();
1935 /// Design of OpenMP reductions on the GPU
1937 /// Consider a typical OpenMP program with one or more reduction
1942 /// #pragma omp target teams distribute parallel for \
1943 /// reduction(+:foo) reduction(*:bar)
1944 /// for (int i = 0; i < N; i++) {
1945 /// foo += A[i]; bar *= B[i];
1948 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1949 /// all teams. In our OpenMP implementation on the NVPTX device an
1950 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1951 /// within a team are mapped to CUDA threads within a threadblock.
1952 /// Our goal is to efficiently aggregate values across all OpenMP
1953 /// threads such that:
1955 /// - the compiler and runtime are logically concise, and
1956 /// - the reduction is performed efficiently in a hierarchical
1957 /// manner as follows: within OpenMP threads in the same warp,
1958 /// across warps in a threadblock, and finally across teams on
1959 /// the NVPTX device.
1961 /// Introduction to Decoupling
1963 /// We would like to decouple the compiler and the runtime so that the
1964 /// latter is ignorant of the reduction variables (number, data types)
1965 /// and the reduction operators. This allows a simpler interface
1966 /// and implementation while still attaining good performance.
1968 /// Pseudocode for the aforementioned OpenMP program generated by the
1969 /// compiler is as follows:
1971 /// 1. Create private copies of reduction variables on each OpenMP
1972 /// thread: 'foo_private', 'bar_private'
1973 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1974 /// to it and writes the result in 'foo_private' and 'bar_private'
1976 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1977 /// and store the result on the team master:
1979 /// __kmpc_nvptx_parallel_reduce_nowait(...,
1980 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1983 /// struct ReduceData {
1987 /// reduceData.foo = &foo_private
1988 /// reduceData.bar = &bar_private
1990 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1991 /// auxiliary functions generated by the compiler that operate on
1992 /// variables of type 'ReduceData'. They aid the runtime in performing
1993 /// algorithmic steps in a data-agnostic manner.
1995 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1996 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1997 /// same warp. It takes the following arguments as input:
1999 /// a. variable of type 'ReduceData' on the calling lane,
2001 /// c. an offset relative to the current lane_id to generate a
2002 /// remote_lane_id. The remote lane contains the second
2003 /// variable of type 'ReduceData' that is to be reduced.
2004 /// d. an algorithm version parameter determining which reduction
2005 /// algorithm to use.
2007 /// 'shuffleReduceFn' retrieves data from the remote lane using
2008 /// efficient GPU shuffle intrinsics and reduces, using the
2009 /// algorithm specified by the 4th parameter, the two operands
2010 /// element-wise. The result is written to the first operand.
2012 /// Different reduction algorithms are implemented in different
2013 /// runtime functions, all calling 'shuffleReduceFn' to perform
2014 /// the essential reduction step. Therefore, based on the 4th
2015 /// parameter, this function behaves slightly differently to
2016 /// cooperate with the runtime to ensure correctness under
2017 /// different circumstances.
2019 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2020 /// reduced variables across warps. It tunnels, through CUDA
2021 /// shared memory, the thread-private data of type 'ReduceData'
2022 /// from lane 0 of each warp to a lane in the first warp.
2023 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2024 /// The last team writes the global reduced value to memory.
2026 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2027 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2028 /// scratchpadCopyFn, loadAndReduceFn)
2030 /// 'scratchpadCopyFn' is a helper that stores reduced
2031 /// data from the team master to a scratchpad array in
2034 /// 'loadAndReduceFn' is a helper that loads data from
2035 /// the scratchpad array and reduces it with the input
2038 /// These compiler generated functions hide address
2039 /// calculation and alignment information from the runtime.
2041 /// The team master of the last team stores the reduced
2042 /// result to the globals in memory.
2043 /// foo += reduceData.foo; bar *= reduceData.bar
2046 /// Warp Reduction Algorithms
2048 /// On the warp level, we have three algorithms implemented in the
2049 /// OpenMP runtime depending on the number of active lanes:
2051 /// Full Warp Reduction
2053 /// The reduce algorithm within a warp where all lanes are active
2054 /// is implemented in the runtime as follows:
2056 /// full_warp_reduce(void *reduce_data,
2057 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2058 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2059 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2062 /// The algorithm completes in log(2, WARPSIZE) steps.
2064 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2065 /// not used therefore we save instructions by not retrieving lane_id
2066 /// from the corresponding special registers. The 4th parameter, which
2067 /// represents the version of the algorithm being used, is set to 0 to
2068 /// signify full warp reduction.
2070 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2072 /// #reduce_elem refers to an element in the local lane's data structure
2073 /// #remote_elem is retrieved from a remote lane
2074 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2075 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2077 /// Contiguous Partial Warp Reduction
2079 /// This reduce algorithm is used within a warp where only the first
2080 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2081 /// number of OpenMP threads in a parallel region is not a multiple of
2082 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2085 /// contiguous_partial_reduce(void *reduce_data,
2086 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2087 /// int size, int lane_id) {
2090 /// curr_size = size;
2091 /// mask = curr_size/2;
2092 /// while (offset>0) {
2093 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2094 /// curr_size = (curr_size+1)/2;
2095 /// offset = curr_size/2;
2099 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2101 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2102 /// if (lane_id < offset)
2103 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2105 /// reduce_elem = remote_elem
2107 /// This algorithm assumes that the data to be reduced are located in a
2108 /// contiguous subset of lanes starting from the first. When there is
2109 /// an odd number of active lanes, the data in the last lane is not
2110 /// aggregated with any other lane's data but is instead copied over.
2112 /// Dispersed Partial Warp Reduction
2114 /// This algorithm is used within a warp when any discontiguous subset of
2115 /// lanes are active. It is used to implement the reduction operation
2116 /// across lanes in an OpenMP simd region or in a nested parallel region.
2119 /// dispersed_partial_reduce(void *reduce_data,
2120 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2121 /// int size, remote_id;
2122 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2124 /// remote_id = next_active_lane_id_right_after_me();
2125 /// # the above function returns 0 if no active lane
2126 /// # is present right after the current lane.
2127 /// size = number_of_active_lanes_in_this_warp();
2128 /// logical_lane_id /= 2;
2129 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2130 /// remote_id-1-threadIdx.x, 2);
2131 /// } while (logical_lane_id % 2 == 0 && size > 1);
2134 /// There is no assumption made about the initial state of the reduction.
2135 /// Any number of lanes (>=1) could be active at any position. The reduction
2136 /// result is returned in the first active lane.
2138 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2140 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2141 /// if (lane_id % 2 == 0 && offset > 0)
2142 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2144 /// reduce_elem = remote_elem
2147 /// Intra-Team Reduction
2149 /// This function, as implemented in the runtime call
2150 /// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
2151 /// threads in a team. It first reduces within a warp using the
2152 /// aforementioned algorithms. We then proceed to gather all such
2153 /// reduced values at the first warp.
2155 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2156 /// data from each of the "warp master" (zeroth lane of each warp, where
2157 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2158 /// a mathematical sense) the problem of reduction across warp masters in
2159 /// a block to the problem of warp reduction.
2162 /// Inter-Team Reduction
2164 /// Once a team has reduced its data to a single value, it is stored in
2165 /// a global scratchpad array. Since each team has a distinct slot, this
2166 /// can be done without locking.
2168 /// The last team to write to the scratchpad array proceeds to reduce the
2169 /// scratchpad array. One or more workers in the last team use the helper
2170 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2171 /// the k'th worker reduces every k'th element.
2173 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
2174 /// reduce across workers and compute a globally reduced value.
/// Emit the NVPTX-specific reduction epilogue (design documented in the
/// large comment block above). Builds the RedList of pointers to the
/// thread-private reduction variables, emits the compiler-generated helper
/// functions (reduce_func, shuffle-and-reduce, inter-warp copy, and — for
/// teams reductions — the scratchpad helpers), calls the matching
/// __kmpc_nvptx_*_reduce_nowait runtime entry, and switches on its result
/// to run the final combiner on the master thread.
2176 void CGOpenMPRuntimeNVPTX::emitReduction(
2177 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2178 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2179 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2180 if (!CGF.HaveInsertPoint())
2183 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2184 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2185 // FIXME: Add support for simd reduction.
2186 assert((TeamsReduction || ParallelReduction) &&
2187 "Invalid reduction selection in emitReduction.");
2189 auto &C = CGM.getContext();
2191 // 1. Build a list of reduction variables.
2192 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2193 auto Size = RHSExprs.size();
2194 for (auto *E : Privates) {
2195 if (E->getType()->isVariablyModifiedType())
2196 // Reserve place for array size.
2199 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2200 QualType ReductionArrayTy =
2201 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
2202 /*IndexTypeQuals=*/0);
2203 Address ReductionList =
2204 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2205 auto IPriv = Privates.begin();
2207 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
// Store the address of each private copy as a void* element of RedList.
2208 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2209 CGF.getPointerSize());
2210 CGF.Builder.CreateStore(
2211 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2212 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
2214 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2215 // Store array size.
// VLA-typed privates use a second slot that smuggles the runtime
// element count through the void* array as an inttoptr value.
2217 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2218 CGF.getPointerSize());
2219 llvm::Value *Size = CGF.Builder.CreateIntCast(
2221 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2223 CGF.SizeTy, /*isSigned=*/false);
2224 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2229 // 2. Emit reduce_func().
2230 auto *ReductionFn = emitReductionFunction(
2231 CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
2232 LHSExprs, RHSExprs, ReductionOps);
2234 // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2235 // RedList, shuffle_reduce_func, interwarp_copy_func);
2236 auto *ThreadId = getThreadID(CGF, Loc);
2237 auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2238 auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2239 ReductionList.getPointer(), CGF.VoidPtrTy);
2241 auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2242 CGM, Privates, ReductionArrayTy, ReductionFn);
2243 auto *InterWarpCopyFn =
2244 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
2246 llvm::Value *Res = nullptr;
2247 if (ParallelReduction) {
2248 llvm::Value *Args[] = {ThreadId,
2249 CGF.Builder.getInt32(RHSExprs.size()),
2250 ReductionArrayTySize,
2255 Res = CGF.EmitRuntimeCall(
2256 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
2260 if (TeamsReduction) {
// Teams reductions additionally need the scratchpad copy/load helpers
// used for the inter-team phase (see "Inter-Team Reduction" above).
2261 auto *ScratchPadCopyFn =
2262 emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
2263 auto *LoadAndReduceFn = emitReduceScratchpadFunction(
2264 CGM, Privates, ReductionArrayTy, ReductionFn);
2266 llvm::Value *Args[] = {ThreadId,
2267 CGF.Builder.getInt32(RHSExprs.size()),
2268 ReductionArrayTySize,
2274 Res = CGF.EmitRuntimeCall(
2275 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
2279 // 5. Build switch(res)
2280 auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
2281 auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
2283 // 6. Build case 1: where we have reduced values in the master
2284 // thread in each team.
2285 // __kmpc_end_reduce{_nowait}(<gtid>);
2287 auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
2288 SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
2289 CGF.EmitBlock(Case1BB);
2291 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2292 llvm::Value *EndArgs[] = {ThreadId};
// Combine each LHS (original variable) with its RHS (reduced private copy)
// using the reduction operator expressions provided by Sema.
2293 auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
2294 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2295 auto IPriv = Privates.begin();
2296 auto ILHS = LHSExprs.begin();
2297 auto IRHS = RHSExprs.begin();
2298 for (auto *E : ReductionOps) {
2299 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2300 cast<DeclRefExpr>(*IRHS));
2306 RegionCodeGenTy RCG(CodeGen);
2307 NVPTXActionTy Action(
2308 nullptr, llvm::None,
2309 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
2311 RCG.setAction(Action);
2313 CGF.EmitBranch(DefaultBB);
2314 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
/// Translate a reference-typed native parameter into a device-side
/// parameter declaration: the reference becomes a pointer whose pointee
/// carries an explicit address space, and the parameter itself is placed
/// in the NVPTX local address space. Non-reference parameters are handled
/// by the early return below (on an elided line — presumably returning
/// NativeParam unchanged; confirm against upstream).
2318 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
2319 const VarDecl *NativeParam) const {
2320 if (!NativeParam->getType()->isReferenceType())
2322 QualType ArgType = NativeParam->getType();
2323 QualifierCollector QC;
2324 const Type *NonQualTy = QC.strip(ArgType);
2325 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2326 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2327 if (Attr->getCaptureKind() == OMPC_map) {
// Map-captured data lives in global memory; opencl_global is reused here
// as the language-level spelling of the NVPTX global address space.
2328 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2329 LangAS::opencl_global);
2332 ArgType = CGM.getContext().getPointerType(PointeeTy);
// NVPTX address space 5 is the local (per-thread) address space; the
// parameter pointer itself is local to the thread.
2334 enum { NVPTX_local_addr = 5 };
2335 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
2336 ArgType = QC.apply(CGM.getContext(), ArgType);
// Recreate the declaration with the translated type, preserving the
// original's identity (name, location, decl context) so later lookups
// of the parameter still work.
2337 if (isa<ImplicitParamDecl>(NativeParam)) {
2338 return ImplicitParamDecl::Create(
2339 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
2340 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
2342 return ParmVarDecl::Create(
2344 const_cast<DeclContext *>(NativeParam->getDeclContext()),
2345 NativeParam->getLocStart(), NativeParam->getLocation(),
2346 NativeParam->getIdentifier(), ArgType,
2347 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
/// Produce an address holding the native (reference-typed) view of a
/// translated target parameter: load the pointer stored in the target
/// parameter's local slot, cast it back through the generic address space
/// into the native pointee's address space, and store the result in a
/// fresh temporary whose address is returned.
2351 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
2352 const VarDecl *NativeParam,
2353 const VarDecl *TargetParam) const {
2354 assert(NativeParam != TargetParam &&
2355 NativeParam->getType()->isReferenceType() &&
2356 "Native arg must not be the same as target arg.");
2357 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
2358 QualType NativeParamType = NativeParam->getType();
2359 QualifierCollector QC;
2360 const Type *NonQualTy = QC.strip(NativeParamType);
2361 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
// Address space the caller expects the pointee to live in.
2362 unsigned NativePointeeAddrSpace =
2363 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
2364 QualType TargetTy = TargetParam->getType();
2365 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
2366 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
2367 // First cast to generic.
// NVPTX requires address-space casts to go through the generic address
// space rather than directly between specific spaces.
2368 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2369 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2371 // Cast from generic to native address space.
2372 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2373 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2374 NativePointeeAddrSpace));
// Materialize the re-cast pointer in a temporary so the caller can bind
// the native reference parameter to it.
2375 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
2376 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
2378 return NativeParamAddr;
/// Call an outlined function, first adapting each pointer argument to the
/// address space the callee's prototype expects. Non-pointer arguments and
/// variadic tail arguments are forwarded unchanged; pointer arguments are
/// cast through the generic address space to the parameter's pointer type.
2381 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
2382 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2383 ArrayRef<llvm::Value *> Args) const {
2384 SmallVector<llvm::Value *, 4> TargetArgs;
2385 TargetArgs.reserve(Args.size());
2387 cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
2388 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
// Arguments beyond the fixed parameter list of a vararg function have no
// declared type to adapt to; append the rest verbatim.
2389 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
2390 TargetArgs.append(std::next(Args.begin(), I), Args.end());
2393 llvm::Type *TargetType = FnType->getParamType(I);
2394 llvm::Value *NativeArg = Args[I];
2395 if (!TargetType->isPointerTy()) {
2396 TargetArgs.emplace_back(NativeArg);
// Two-step cast: first to the same pointee in the generic address space,
// then to the exact parameter type.
2399 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2400 NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
2402 TargetArgs.emplace_back(
2403 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
2405 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
2408 /// Emit function which wraps the outlined parallel region
2409 /// and controls the arguments which are passed to this function.
2410 /// The wrapper ensures that the outlined function is called
2411 /// with the correct arguments when data is shared.
2412 llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
2413 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
2414 ASTContext &Ctx = CGM.getContext();
2415 const auto &CS = *cast<CapturedStmt>(D.getAssociatedStmt());
2417 // Create a function that takes as argument the source thread.
2418 FunctionArgList WrapperArgs;
2420 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
2422 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
2423 QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
2424 QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
// Wrapper signature: (int16 parallel_level, int32 source_thread,
// void **shared_args).
2425 ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other);
2426 ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other);
2427 ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy,
2428 ImplicitParamDecl::Other);
2429 WrapperArgs.emplace_back(&ParallelLevelArg);
2430 WrapperArgs.emplace_back(&WrapperArg);
2431 WrapperArgs.emplace_back(&SharedArgsList);
2434 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
// The wrapper is named after the outlined function with a "_wrapper"
// suffix and kept internal to the module.
2436 auto *Fn = llvm::Function::Create(
2437 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2438 OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
2439 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
2440 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2442 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
2443 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs);
2445 const auto *RD = CS.getCapturedRecordDecl();
2446 auto CurField = RD->field_begin();
2448 // Get the array of arguments.
2449 SmallVector<llvm::Value *, 8> Args;
2451 // TODO: support SIMD and pass actual values
// Placeholder null pointers for the outlined function's leading
// (global_tid/bound_tid) int32* parameters.
2452 Args.emplace_back(llvm::ConstantPointerNull::get(
2453 CGM.Int32Ty->getPointerTo()));
2454 Args.emplace_back(llvm::ConstantPointerNull::get(
2455 CGM.Int32Ty->getPointerTo()));
2457 CGBuilderTy &Bld = CGF.Builder;
2458 auto CI = CS.capture_begin();
2460 // Load the start of the array
2462 CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList),
2463 VoidPtrPtrQTy->castAs<PointerType>());
2465 // For each captured variable
2466 for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
2467 // Name of captured variable
2469 if (CI->capturesThis())
2472 Name = CI->getCapturedVar()->getName();
2474 // We retrieve the CLANG type of the argument. We use it to create
2475 // an alloca which will give us the LLVM type.
2476 QualType ElemTy = CurField->getType();
2477 // If this is a capture by copy the element type has to be the pointer to
2479 if (CI->capturesVariableByCopy())
2480 ElemTy = Ctx.getPointerType(ElemTy);
2482 // Get shared address of the captured variable.
2483 Address ArgAddress = Bld.CreateConstInBoundsGEP(
2484 SharedArgs, I, CGF.getPointerSize());
2485 Address TypedArgAddress = Bld.CreateBitCast(
2486 ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy)));
// Load the I-th shared argument and forward it to the outlined function.
2487 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress,
2488 /*Volatile=*/false, Int32PtrQTy, SourceLocation());
2489 Args.emplace_back(Arg);
2492 emitCall(CGF, OutlinedParallelFn, Args);
2493 CGF.FinishFunction();