contrib/llvm/tools/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp

   1 //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8 //
   9 // This provides a class for OpenMP runtime code generation specialized to NVPTX
  10 // targets.
  11 //
  12 //===----------------------------------------------------------------------===//
  13
  14 #include "CGOpenMPRuntimeNVPTX.h"
  15 #include "CodeGenFunction.h"
  16 #include "clang/AST/DeclOpenMP.h"
  17 #include "clang/AST/StmtOpenMP.h"
  18 #include "clang/AST/StmtVisitor.h"
  19 #include "clang/Basic/Cuda.h"
  20 #include "llvm/ADT/SmallPtrSet.h"
  21
  22 using namespace clang;
  23 using namespace CodeGen;
  24
  25 namespace {
  26 enum OpenMPRTLFunctionNVPTX {
  27   /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  28   /// int16_t RequiresOMPRuntime);
  29   OMPRTL_NVPTX__kmpc_kernel_init,
  30   /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  31   OMPRTL_NVPTX__kmpc_kernel_deinit,
  32   /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  33   /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  34   OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  35   /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
  36   OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2,
  37   /// Call to void __kmpc_kernel_prepare_parallel(void
  38   /// *outlined_function, int16_t
  39   /// IsOMPRuntimeInitialized);
  40   OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  41   /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  42   /// int16_t IsOMPRuntimeInitialized);
  43   OMPRTL_NVPTX__kmpc_kernel_parallel,
  44   /// Call to void __kmpc_kernel_end_parallel();
  45   OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  46   /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  47   /// global_tid);
  48   OMPRTL_NVPTX__kmpc_serialized_parallel,
  49   /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  50   /// global_tid);
  51   OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  52   /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  53   /// int16_t lane_offset, int16_t warp_size);
  54   OMPRTL_NVPTX__kmpc_shuffle_int32,
  55   /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  56   /// int16_t lane_offset, int16_t warp_size);
  57   OMPRTL_NVPTX__kmpc_shuffle_int64,
  58   /// Call to __kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc, kmp_int32
  59   /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  60   /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  61   /// lane_offset, int16_t shortCircuit),
  62   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  63   OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2,
  64   /// Call to __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
  65   /// global_tid, void *global_buffer, int32_t num_of_records, void*
  66   /// reduce_data,
  67   /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  68   /// lane_offset, int16_t shortCircuit),
  69   /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
  70   /// (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
  71   /// void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
  72   /// void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
  73   /// int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
  74   /// *buffer, int idx, void *reduce_data));
  75   OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2,
  76   /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  77   OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  78   /// Call to void __kmpc_data_sharing_init_stack();
  79   OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  80   /// Call to void __kmpc_data_sharing_init_stack_spmd();
  81   OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  82   /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  83   /// int16_t UseSharedMemory);
  84   OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  85   /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  86   OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  87   /// Call to void __kmpc_begin_sharing_variables(void ***args,
  88   /// size_t n_args);
  89   OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  90   /// Call to void __kmpc_end_sharing_variables();
  91   OMPRTL_NVPTX__kmpc_end_sharing_variables,
  92   /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  93   OMPRTL_NVPTX__kmpc_get_shared_variables,
  94   /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  95   /// global_tid);
  96   OMPRTL_NVPTX__kmpc_parallel_level,
  97   /// Call to int8_t __kmpc_is_spmd_exec_mode();
  98   OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  99   /// Call to void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
 100   /// const void *buf, size_t size, int16_t is_shared, const void **res);
 101   OMPRTL_NVPTX__kmpc_get_team_static_memory,
 102   /// Call to void __kmpc_restore_team_static_memory(int16_t
 103   /// isSPMDExecutionMode, int16_t is_shared);
 104   OMPRTL_NVPTX__kmpc_restore_team_static_memory,
 105   /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
 106   OMPRTL__kmpc_barrier,
 107   /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
 108   /// global_tid);
 109   OMPRTL__kmpc_barrier_simple_spmd,
 110 };
 111
 112 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
 113 class NVPTXActionTy final : public PrePostActionTy {
 114   llvm::FunctionCallee EnterCallee = nullptr;
 115   ArrayRef<llvm::Value *> EnterArgs;
 116   llvm::FunctionCallee ExitCallee = nullptr;
 117   ArrayRef<llvm::Value *> ExitArgs;
 118   bool Conditional = false;
 119   llvm::BasicBlock *ContBlock = nullptr;
 120
 121 public:
 122   NVPTXActionTy(llvm::FunctionCallee EnterCallee,
 123                 ArrayRef<llvm::Value *> EnterArgs,
 124                 llvm::FunctionCallee ExitCallee,
 125                 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
 126       : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
 127         ExitArgs(ExitArgs), Conditional(Conditional) {}
 128   void Enter(CodeGenFunction &CGF) override {
 129     llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
 130     if (Conditional) {
 131       llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
 132       auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
 133       ContBlock = CGF.createBasicBlock("omp_if.end");
 134       // Generate the branch (If-stmt)
 135       CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
 136       CGF.EmitBlock(ThenBlock);
 137     }
 138   }
 139   void Done(CodeGenFunction &CGF) {
 140     // Emit the rest of blocks/branches
 141     CGF.EmitBranch(ContBlock);
 142     CGF.EmitBlock(ContBlock, true);
 143   }
 144   void Exit(CodeGenFunction &CGF) override {
 145     CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
 146   }
 147 };
 148
 149 /// A class to track the execution mode when codegening directives within
 150 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
 151 /// to the target region and used by containing directives such as 'parallel'
 152 /// to emit optimized code.
 153 class ExecutionRuntimeModesRAII {
 154 private:
 155   CGOpenMPRuntimeNVPTX::ExecutionMode SavedExecMode =
 156       CGOpenMPRuntimeNVPTX::EM_Unknown;
 157   CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode;
 158   bool SavedRuntimeMode = false;
 159   bool *RuntimeMode = nullptr;
 160
 161 public:
 162   /// Constructor for Non-SPMD mode.
 163   ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode)
 164       : ExecMode(ExecMode) {
 165     SavedExecMode = ExecMode;
 166     ExecMode = CGOpenMPRuntimeNVPTX::EM_NonSPMD;
 167   }
 168   /// Constructor for SPMD mode.
 169   ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode,
 170                             bool &RuntimeMode, bool FullRuntimeMode)
 171       : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
 172     SavedExecMode = ExecMode;
 173     SavedRuntimeMode = RuntimeMode;
 174     ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD;
 175     RuntimeMode = FullRuntimeMode;
 176   }
 177   ~ExecutionRuntimeModesRAII() {
 178     ExecMode = SavedExecMode;
 179     if (RuntimeMode)
 180       *RuntimeMode = SavedRuntimeMode;
 181   }
 182 };
 183
 184 /// GPU Configuration:  This information can be derived from cuda registers,
 185 /// however, providing compile time constants helps generate more efficient
 186 /// code.  For all practical purposes this is fine because the configuration
 187 /// is the same for all known NVPTX architectures.
 188 enum MachineConfiguration : unsigned {
 189   WarpSize = 32,
 190   /// Number of bits required to represent a lane identifier, which is
 191   /// computed as log_2(WarpSize).
 192   LaneIDBits = 5,
 193   LaneIDMask = WarpSize - 1,
 194
 195   /// Global memory alignment for performance.
 196   GlobalMemoryAlignment = 128,
 197
 198   /// Maximal size of the shared memory buffer.
 199   SharedMemorySize = 128,
 200 };
 201
 202 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
 203   RefExpr = RefExpr->IgnoreParens();
 204   if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
 205     const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
 206     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
 207       Base = TempASE->getBase()->IgnoreParenImpCasts();
 208     RefExpr = Base;
 209   } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
 210     const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
 211     while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
 212       Base = TempOASE->getBase()->IgnoreParenImpCasts();
 213     while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
 214       Base = TempASE->getBase()->IgnoreParenImpCasts();
 215     RefExpr = Base;
 216   }
 217   RefExpr = RefExpr->IgnoreParenImpCasts();
 218   if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
 219     return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
 220   const auto *ME = cast<MemberExpr>(RefExpr);
 221   return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
 222 }
 223
 224
 225 static RecordDecl *buildRecordForGlobalizedVars(
 226     ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
 227     ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
 228     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
 229         &MappedDeclsFields, int BufSize) {
 230   using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
 231   if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
 232     return nullptr;
 233   SmallVector<VarsDataTy, 4> GlobalizedVars;
 234   for (const ValueDecl *D : EscapedDecls)
 235     GlobalizedVars.emplace_back(
 236         CharUnits::fromQuantity(std::max(
 237             C.getDeclAlign(D).getQuantity(),
 238             static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
 239         D);
 240   for (const ValueDecl *D : EscapedDeclsForTeams)
 241     GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
 242   llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
 243     return L.first > R.first;
 244   });
 245
 246   // Build struct _globalized_locals_ty {
 247   //         /*  globalized vars  */[WarSize] align (max(decl_align,
 248   //         GlobalMemoryAlignment))
 249   //         /*  globalized vars  */ for EscapedDeclsForTeams
 250   //       };
 251   RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
 252   GlobalizedRD->startDefinition();
 253   llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
 254       EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
 255   for (const auto &Pair : GlobalizedVars) {
 256     const ValueDecl *VD = Pair.second;
 257     QualType Type = VD->getType();
 258     if (Type->isLValueReferenceType())
 259       Type = C.getPointerType(Type.getNonReferenceType());
 260     else
 261       Type = Type.getNonReferenceType();
 262     SourceLocation Loc = VD->getLocation();
 263     FieldDecl *Field;
 264     if (SingleEscaped.count(VD)) {
 265       Field = FieldDecl::Create(
 266           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
 267           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
 268           /*BW=*/nullptr, /*Mutable=*/false,
 269           /*InitStyle=*/ICIS_NoInit);
 270       Field->setAccess(AS_public);
 271       if (VD->hasAttrs()) {
 272         for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
 273              E(VD->getAttrs().end());
 274              I != E; ++I)
 275           Field->addAttr(*I);
 276       }
 277     } else {
 278       llvm::APInt ArraySize(32, BufSize);
 279       Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
 280       Field = FieldDecl::Create(
 281           C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
 282           C.getTrivialTypeSourceInfo(Type, SourceLocation()),
 283           /*BW=*/nullptr, /*Mutable=*/false,
 284           /*InitStyle=*/ICIS_NoInit);
 285       Field->setAccess(AS_public);
 286       llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
 287                                      static_cast<CharUnits::QuantityType>(
 288                                          GlobalMemoryAlignment)));
 289       Field->addAttr(AlignedAttr::CreateImplicit(
 290           C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
 291           IntegerLiteral::Create(C, Align,
 292                                  C.getIntTypeForBitwidth(32, /*Signed=*/0),
 293                                  SourceLocation())));
 294     }
 295     GlobalizedRD->addDecl(Field);
 296     MappedDeclsFields.try_emplace(VD, Field);
 297   }
 298   GlobalizedRD->completeDefinition();
 299   return GlobalizedRD;
 300 }
 301
 302 /// Get the list of variables that can escape their declaration context.
 303 class CheckVarsEscapingDeclContext final
 304     : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
 305   CodeGenFunction &CGF;
 306   llvm::SetVector<const ValueDecl *> EscapedDecls;
 307   llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
 308   llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
 309   RecordDecl *GlobalizedRD = nullptr;
 310   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
 311   bool AllEscaped = false;
 312   bool IsForCombinedParallelRegion = false;
 313
 314   void markAsEscaped(const ValueDecl *VD) {
 315     // Do not globalize declare target variables.
 316     if (!isa<VarDecl>(VD) ||
 317         OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
 318       return;
 319     VD = cast<ValueDecl>(VD->getCanonicalDecl());
 320     // Use user-specified allocation.
 321     if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
 322       return;
 323     // Variables captured by value must be globalized.
 324     if (auto *CSI = CGF.CapturedStmtInfo) {
 325       if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
 326         // Check if need to capture the variable that was already captured by
 327         // value in the outer region.
 328         if (!IsForCombinedParallelRegion) {
 329           if (!FD->hasAttrs())
 330             return;
 331           const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
 332           if (!Attr)
 333             return;
 334           if (((Attr->getCaptureKind() != OMPC_map) &&
 335                !isOpenMPPrivate(
 336                    static_cast<OpenMPClauseKind>(Attr->getCaptureKind()))) ||
 337               ((Attr->getCaptureKind() == OMPC_map) &&
 338                !FD->getType()->isAnyPointerType()))
 339             return;
 340         }
 341         if (!FD->getType()->isReferenceType()) {
 342           assert(!VD->getType()->isVariablyModifiedType() &&
 343                  "Parameter captured by value with variably modified type");
 344           EscapedParameters.insert(VD);
 345         } else if (!IsForCombinedParallelRegion) {
 346           return;
 347         }
 348       }
 349     }
 350     if ((!CGF.CapturedStmtInfo ||
 351          (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
 352         VD->getType()->isReferenceType())
 353       // Do not globalize variables with reference type.
 354       return;
 355     if (VD->getType()->isVariablyModifiedType())
 356       EscapedVariableLengthDecls.insert(VD);
 357     else
 358       EscapedDecls.insert(VD);
 359   }
 360
 361   void VisitValueDecl(const ValueDecl *VD) {
 362     if (VD->getType()->isLValueReferenceType())
 363       markAsEscaped(VD);
 364     if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
 365       if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
 366         const bool SavedAllEscaped = AllEscaped;
 367         AllEscaped = VD->getType()->isLValueReferenceType();
 368         Visit(VarD->getInit());
 369         AllEscaped = SavedAllEscaped;
 370       }
 371     }
 372   }
 373   void VisitOpenMPCapturedStmt(const CapturedStmt *S,
 374                                ArrayRef<OMPClause *> Clauses,
 375                                bool IsCombinedParallelRegion) {
 376     if (!S)
 377       return;
 378     for (const CapturedStmt::Capture &C : S->captures()) {
 379       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
 380         const ValueDecl *VD = C.getCapturedVar();
 381         bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
 382         if (IsCombinedParallelRegion) {
 383           // Check if the variable is privatized in the combined construct and
 384           // those private copies must be shared in the inner parallel
 385           // directive.
 386           IsForCombinedParallelRegion = false;
 387           for (const OMPClause *C : Clauses) {
 388             if (!isOpenMPPrivate(C->getClauseKind()) ||
 389                 C->getClauseKind() == OMPC_reduction ||
 390                 C->getClauseKind() == OMPC_linear ||
 391                 C->getClauseKind() == OMPC_private)
 392               continue;
 393             ArrayRef<const Expr *> Vars;
 394             if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
 395               Vars = PC->getVarRefs();
 396             else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
 397               Vars = PC->getVarRefs();
 398             else
 399               llvm_unreachable("Unexpected clause.");
 400             for (const auto *E : Vars) {
 401               const Decl *D =
 402                   cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
 403               if (D == VD->getCanonicalDecl()) {
 404                 IsForCombinedParallelRegion = true;
 405                 break;
 406               }
 407             }
 408             if (IsForCombinedParallelRegion)
 409               break;
 410           }
 411         }
 412         markAsEscaped(VD);
 413         if (isa<OMPCapturedExprDecl>(VD))
 414           VisitValueDecl(VD);
 415         IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
 416       }
 417     }
 418   }
 419
 420   void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
 421     assert(!GlobalizedRD &&
 422            "Record for globalized variables is built already.");
 423     ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
 424     if (IsInTTDRegion)
 425       EscapedDeclsForTeams = EscapedDecls.getArrayRef();
 426     else
 427       EscapedDeclsForParallel = EscapedDecls.getArrayRef();
 428     GlobalizedRD = ::buildRecordForGlobalizedVars(
 429         CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
 430         MappedDeclsFields, WarpSize);
 431   }
 432
 433 public:
 434   CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
 435                                ArrayRef<const ValueDecl *> TeamsReductions)
 436       : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
 437   }
 438   virtual ~CheckVarsEscapingDeclContext() = default;
 439   void VisitDeclStmt(const DeclStmt *S) {
 440     if (!S)
 441       return;
 442     for (const Decl *D : S->decls())
 443       if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
 444         VisitValueDecl(VD);
 445   }
 446   void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
 447     if (!D)
 448       return;
 449     if (!D->hasAssociatedStmt())
 450       return;
 451     if (const auto *S =
 452             dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
 453       // Do not analyze directives that do not actually require capturing,
 454       // like `omp for` or `omp simd` directives.
 455       llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
 456       getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
 457       if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
 458         VisitStmt(S->getCapturedStmt());
 459         return;
 460       }
 461       VisitOpenMPCapturedStmt(
 462           S, D->clauses(),
 463           CaptureRegions.back() == OMPD_parallel &&
 464               isOpenMPDistributeDirective(D->getDirectiveKind()));
 465     }
 466   }
 467   void VisitCapturedStmt(const CapturedStmt *S) {
 468     if (!S)
 469       return;
 470     for (const CapturedStmt::Capture &C : S->captures()) {
 471       if (C.capturesVariable() && !C.capturesVariableByCopy()) {
 472         const ValueDecl *VD = C.getCapturedVar();
 473         markAsEscaped(VD);
 474         if (isa<OMPCapturedExprDecl>(VD))
 475           VisitValueDecl(VD);
 476       }
 477     }
 478   }
 479   void VisitLambdaExpr(const LambdaExpr *E) {
 480     if (!E)
 481       return;
 482     for (const LambdaCapture &C : E->captures()) {
 483       if (C.capturesVariable()) {
 484         if (C.getCaptureKind() == LCK_ByRef) {
 485           const ValueDecl *VD = C.getCapturedVar();
 486           markAsEscaped(VD);
 487           if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
 488             VisitValueDecl(VD);
 489         }
 490       }
 491     }
 492   }
 493   void VisitBlockExpr(const BlockExpr *E) {
 494     if (!E)
 495       return;
 496     for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
 497       if (C.isByRef()) {
 498         const VarDecl *VD = C.getVariable();
 499         markAsEscaped(VD);
 500         if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
 501           VisitValueDecl(VD);
 502       }
 503     }
 504   }
 505   void VisitCallExpr(const CallExpr *E) {
 506     if (!E)
 507       return;
 508     for (const Expr *Arg : E->arguments()) {
 509       if (!Arg)
 510         continue;
 511       if (Arg->isLValue()) {
 512         const bool SavedAllEscaped = AllEscaped;
 513         AllEscaped = true;
 514         Visit(Arg);
 515         AllEscaped = SavedAllEscaped;
 516       } else {
 517         Visit(Arg);
 518       }
 519     }
 520     Visit(E->getCallee());
 521   }
 522   void VisitDeclRefExpr(const DeclRefExpr *E) {
 523     if (!E)
 524       return;
 525     const ValueDecl *VD = E->getDecl();
 526     if (AllEscaped)
 527       markAsEscaped(VD);
 528     if (isa<OMPCapturedExprDecl>(VD))
 529       VisitValueDecl(VD);
 530     else if (const auto *VarD = dyn_cast<VarDecl>(VD))
 531       if (VarD->isInitCapture())
 532         VisitValueDecl(VD);
 533   }
 534   void VisitUnaryOperator(const UnaryOperator *E) {
 535     if (!E)
 536       return;
 537     if (E->getOpcode() == UO_AddrOf) {
 538       const bool SavedAllEscaped = AllEscaped;
 539       AllEscaped = true;
 540       Visit(E->getSubExpr());
 541       AllEscaped = SavedAllEscaped;
 542     } else {
 543       Visit(E->getSubExpr());
 544     }
 545   }
 546   void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
 547     if (!E)
 548       return;
 549     if (E->getCastKind() == CK_ArrayToPointerDecay) {
 550       const bool SavedAllEscaped = AllEscaped;
 551       AllEscaped = true;
 552       Visit(E->getSubExpr());
 553       AllEscaped = SavedAllEscaped;
 554     } else {
 555       Visit(E->getSubExpr());
 556     }
 557   }
 558   void VisitExpr(const Expr *E) {
 559     if (!E)
 560       return;
 561     bool SavedAllEscaped = AllEscaped;
 562     if (!E->isLValue())
 563       AllEscaped = false;
 564     for (const Stmt *Child : E->children())
 565       if (Child)
 566         Visit(Child);
 567     AllEscaped = SavedAllEscaped;
 568   }
 569   void VisitStmt(const Stmt *S) {
 570     if (!S)
 571       return;
 572     for (const Stmt *Child : S->children())
 573       if (Child)
 574         Visit(Child);
 575   }
 576
 577   /// Returns the record that handles all the escaped local variables and used
 578   /// instead of their original storage.
 579   const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
 580     if (!GlobalizedRD)
 581       buildRecordForGlobalizedVars(IsInTTDRegion);
 582     return GlobalizedRD;
 583   }
 584
 585   /// Returns the field in the globalized record for the escaped variable.
 586   const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
 587     assert(GlobalizedRD &&
 588            "Record for globalized variables must be generated already.");
 589     auto I = MappedDeclsFields.find(VD);
 590     if (I == MappedDeclsFields.end())
 591       return nullptr;
 592     return I->getSecond();
 593   }
 594
 595   /// Returns the list of the escaped local variables/parameters.
 596   ArrayRef<const ValueDecl *> getEscapedDecls() const {
 597     return EscapedDecls.getArrayRef();
 598   }
 599
 600   /// Checks if the escaped local variable is actually a parameter passed by
 601   /// value.
 602   const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
 603     return EscapedParameters;
 604   }
 605
 606   /// Returns the list of the escaped variables with the variably modified
 607   /// types.
 608   ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
 609     return EscapedVariableLengthDecls.getArrayRef();
 610   }
 611 };
 612 } // anonymous namespace
 613
 614 /// Get the GPU warp size.
 615 static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
 616   return CGF.EmitRuntimeCall(
 617       llvm::Intrinsic::getDeclaration(
 618           &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
 619       "nvptx_warp_size");
 620 }
 621
 622 /// Get the id of the current thread on the GPU.
 623 static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
 624   return CGF.EmitRuntimeCall(
 625       llvm::Intrinsic::getDeclaration(
 626           &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
 627       "nvptx_tid");
 628 }
 629
 630 /// Get the id of the warp in the block.
 631 /// We assume that the warp size is 32, which is always the case
 632 /// on the NVPTX device, to generate more efficient code.
 633 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
 634   CGBuilderTy &Bld = CGF.Builder;
 635   return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
 636 }
 637
 638 /// Get the id of the current lane in the Warp.
 639 /// We assume that the warp size is 32, which is always the case
 640 /// on the NVPTX device, to generate more efficient code.
 641 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
 642   CGBuilderTy &Bld = CGF.Builder;
 643   return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
 644                        "nvptx_lane_id");
 645 }
 646
 647 /// Get the maximum number of threads in a block of the GPU.
 648 static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
 649   return CGF.EmitRuntimeCall(
 650       llvm::Intrinsic::getDeclaration(
 651           &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
 652       "nvptx_num_threads");
 653 }
 654
 655 /// Get the value of the thread_limit clause in the teams directive.
 656 /// For the 'generic' execution mode, the runtime encodes thread_limit in
 657 /// the launch parameters, always starting thread_limit+warpSize threads per
 658 /// CTA. The threads in the last warp are reserved for master execution.
 659 /// For the 'spmd' execution mode, all threads in a CTA are part of the team.
 660 static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
 661                                    bool IsInSPMDExecutionMode = false) {
 662   CGBuilderTy &Bld = CGF.Builder;
 663   return IsInSPMDExecutionMode
 664              ? getNVPTXNumThreads(CGF)
 665              : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
 666                                 "thread_limit");
 667 }
 668
 669 /// Get the thread id of the OMP master thread.
 670 /// The master thread id is the first thread (lane) of the last warp in the
 671 /// GPU block.  Warp size is assumed to be some power of 2.
 672 /// Thread id is 0 indexed.
 673 /// E.g: If NumThreads is 33, master id is 32.
 674 ///      If NumThreads is 64, master id is 32.
 675 ///      If NumThreads is 1024, master id is 992.
 676 static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
 677   CGBuilderTy &Bld = CGF.Builder;
 678   llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
 679
 680   // We assume that the warp size is a power of 2.
 681   llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
 682
 683   return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
 684                        Bld.CreateNot(Mask), "master_tid");
 685 }
 686
 687 CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
 688     CodeGenModule &CGM, SourceLocation Loc)
 689     : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
 690       Loc(Loc) {
 691   createWorkerFunction(CGM);
 692 }
 693
 694 void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
 695     CodeGenModule &CGM) {
 696   // Create an worker function with no arguments.
 697
 698   WorkerFn = llvm::Function::Create(
 699       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
 700       /*placeholder=*/"_worker", &CGM.getModule());
 701   CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
 702   WorkerFn->setDoesNotRecurse();
 703 }
 704
 705 CGOpenMPRuntimeNVPTX::ExecutionMode
 706 CGOpenMPRuntimeNVPTX::getExecutionMode() const {
 707   return CurrentExecutionMode;
 708 }
 709
 710 static CGOpenMPRuntimeNVPTX::DataSharingMode
 711 getDataSharingMode(CodeGenModule &CGM) {
 712   return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
 713                                           : CGOpenMPRuntimeNVPTX::Generic;
 714 }
 715
 716 /// Check for inner (nested) SPMD construct, if any
 717 static bool hasNestedSPMDDirective(ASTContext &Ctx,
 718                                    const OMPExecutableDirective &D) {
 719   const auto *CS = D.getInnermostCapturedStmt();
 720   const auto *Body =
 721       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
 722   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 723
 724   if (const auto *NestedDir =
 725           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 726     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
 727     switch (D.getDirectiveKind()) {
 728     case OMPD_target:
 729       if (isOpenMPParallelDirective(DKind))
 730         return true;
 731       if (DKind == OMPD_teams) {
 732         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
 733             /*IgnoreCaptured=*/true);
 734         if (!Body)
 735           return false;
 736         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 737         if (const auto *NND =
 738                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 739           DKind = NND->getDirectiveKind();
 740           if (isOpenMPParallelDirective(DKind))
 741             return true;
 742         }
 743       }
 744       return false;
 745     case OMPD_target_teams:
 746       return isOpenMPParallelDirective(DKind);
 747     case OMPD_target_simd:
 748     case OMPD_target_parallel:
 749     case OMPD_target_parallel_for:
 750     case OMPD_target_parallel_for_simd:
 751     case OMPD_target_teams_distribute:
 752     case OMPD_target_teams_distribute_simd:
 753     case OMPD_target_teams_distribute_parallel_for:
 754     case OMPD_target_teams_distribute_parallel_for_simd:
 755     case OMPD_parallel:
 756     case OMPD_for:
 757     case OMPD_parallel_for:
 758     case OMPD_parallel_sections:
 759     case OMPD_for_simd:
 760     case OMPD_parallel_for_simd:
 761     case OMPD_cancel:
 762     case OMPD_cancellation_point:
 763     case OMPD_ordered:
 764     case OMPD_threadprivate:
 765     case OMPD_allocate:
 766     case OMPD_task:
 767     case OMPD_simd:
 768     case OMPD_sections:
 769     case OMPD_section:
 770     case OMPD_single:
 771     case OMPD_master:
 772     case OMPD_critical:
 773     case OMPD_taskyield:
 774     case OMPD_barrier:
 775     case OMPD_taskwait:
 776     case OMPD_taskgroup:
 777     case OMPD_atomic:
 778     case OMPD_flush:
 779     case OMPD_teams:
 780     case OMPD_target_data:
 781     case OMPD_target_exit_data:
 782     case OMPD_target_enter_data:
 783     case OMPD_distribute:
 784     case OMPD_distribute_simd:
 785     case OMPD_distribute_parallel_for:
 786     case OMPD_distribute_parallel_for_simd:
 787     case OMPD_teams_distribute:
 788     case OMPD_teams_distribute_simd:
 789     case OMPD_teams_distribute_parallel_for:
 790     case OMPD_teams_distribute_parallel_for_simd:
 791     case OMPD_target_update:
 792     case OMPD_declare_simd:
 793     case OMPD_declare_target:
 794     case OMPD_end_declare_target:
 795     case OMPD_declare_reduction:
 796     case OMPD_declare_mapper:
 797     case OMPD_taskloop:
 798     case OMPD_taskloop_simd:
 799     case OMPD_requires:
 800     case OMPD_unknown:
 801       llvm_unreachable("Unexpected directive.");
 802     }
 803   }
 804
 805   return false;
 806 }
 807
 808 static bool supportsSPMDExecutionMode(ASTContext &Ctx,
 809                                       const OMPExecutableDirective &D) {
 810   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
 811   switch (DirectiveKind) {
 812   case OMPD_target:
 813   case OMPD_target_teams:
 814     return hasNestedSPMDDirective(Ctx, D);
 815   case OMPD_target_parallel:
 816   case OMPD_target_parallel_for:
 817   case OMPD_target_parallel_for_simd:
 818   case OMPD_target_teams_distribute_parallel_for:
 819   case OMPD_target_teams_distribute_parallel_for_simd:
 820   case OMPD_target_simd:
 821   case OMPD_target_teams_distribute_simd:
 822     return true;
 823   case OMPD_target_teams_distribute:
 824     return false;
 825   case OMPD_parallel:
 826   case OMPD_for:
 827   case OMPD_parallel_for:
 828   case OMPD_parallel_sections:
 829   case OMPD_for_simd:
 830   case OMPD_parallel_for_simd:
 831   case OMPD_cancel:
 832   case OMPD_cancellation_point:
 833   case OMPD_ordered:
 834   case OMPD_threadprivate:
 835   case OMPD_allocate:
 836   case OMPD_task:
 837   case OMPD_simd:
 838   case OMPD_sections:
 839   case OMPD_section:
 840   case OMPD_single:
 841   case OMPD_master:
 842   case OMPD_critical:
 843   case OMPD_taskyield:
 844   case OMPD_barrier:
 845   case OMPD_taskwait:
 846   case OMPD_taskgroup:
 847   case OMPD_atomic:
 848   case OMPD_flush:
 849   case OMPD_teams:
 850   case OMPD_target_data:
 851   case OMPD_target_exit_data:
 852   case OMPD_target_enter_data:
 853   case OMPD_distribute:
 854   case OMPD_distribute_simd:
 855   case OMPD_distribute_parallel_for:
 856   case OMPD_distribute_parallel_for_simd:
 857   case OMPD_teams_distribute:
 858   case OMPD_teams_distribute_simd:
 859   case OMPD_teams_distribute_parallel_for:
 860   case OMPD_teams_distribute_parallel_for_simd:
 861   case OMPD_target_update:
 862   case OMPD_declare_simd:
 863   case OMPD_declare_target:
 864   case OMPD_end_declare_target:
 865   case OMPD_declare_reduction:
 866   case OMPD_declare_mapper:
 867   case OMPD_taskloop:
 868   case OMPD_taskloop_simd:
 869   case OMPD_requires:
 870   case OMPD_unknown:
 871     break;
 872   }
 873   llvm_unreachable(
 874       "Unknown programming model for OpenMP directive on NVPTX target.");
 875 }
 876
 877 /// Check if the directive is loops based and has schedule clause at all or has
 878 /// static scheduling.
 879 static bool hasStaticScheduling(const OMPExecutableDirective &D) {
 880   assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
 881          isOpenMPLoopDirective(D.getDirectiveKind()) &&
 882          "Expected loop-based directive.");
 883   return !D.hasClausesOfKind<OMPOrderedClause>() &&
 884          (!D.hasClausesOfKind<OMPScheduleClause>() ||
 885           llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
 886                        [](const OMPScheduleClause *C) {
 887                          return C->getScheduleKind() == OMPC_SCHEDULE_static;
 888                        }));
 889 }
 890
 891 /// Check for inner (nested) lightweight runtime construct, if any
 892 static bool hasNestedLightweightDirective(ASTContext &Ctx,
 893                                           const OMPExecutableDirective &D) {
 894   assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
 895   const auto *CS = D.getInnermostCapturedStmt();
 896   const auto *Body =
 897       CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
 898   const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 899
 900   if (const auto *NestedDir =
 901           dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 902     OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
 903     switch (D.getDirectiveKind()) {
 904     case OMPD_target:
 905       if (isOpenMPParallelDirective(DKind) &&
 906           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
 907           hasStaticScheduling(*NestedDir))
 908         return true;
 909       if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
 910         return true;
 911       if (DKind == OMPD_parallel) {
 912         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
 913             /*IgnoreCaptured=*/true);
 914         if (!Body)
 915           return false;
 916         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 917         if (const auto *NND =
 918                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 919           DKind = NND->getDirectiveKind();
 920           if (isOpenMPWorksharingDirective(DKind) &&
 921               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
 922             return true;
 923         }
 924       } else if (DKind == OMPD_teams) {
 925         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
 926             /*IgnoreCaptured=*/true);
 927         if (!Body)
 928           return false;
 929         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 930         if (const auto *NND =
 931                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 932           DKind = NND->getDirectiveKind();
 933           if (isOpenMPParallelDirective(DKind) &&
 934               isOpenMPWorksharingDirective(DKind) &&
 935               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
 936             return true;
 937           if (DKind == OMPD_parallel) {
 938             Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
 939                 /*IgnoreCaptured=*/true);
 940             if (!Body)
 941               return false;
 942             ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 943             if (const auto *NND =
 944                     dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 945               DKind = NND->getDirectiveKind();
 946               if (isOpenMPWorksharingDirective(DKind) &&
 947                   isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
 948                 return true;
 949             }
 950           }
 951         }
 952       }
 953       return false;
 954     case OMPD_target_teams:
 955       if (isOpenMPParallelDirective(DKind) &&
 956           isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
 957           hasStaticScheduling(*NestedDir))
 958         return true;
 959       if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
 960         return true;
 961       if (DKind == OMPD_parallel) {
 962         Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
 963             /*IgnoreCaptured=*/true);
 964         if (!Body)
 965           return false;
 966         ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
 967         if (const auto *NND =
 968                 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
 969           DKind = NND->getDirectiveKind();
 970           if (isOpenMPWorksharingDirective(DKind) &&
 971               isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
 972             return true;
 973         }
 974       }
 975       return false;
 976     case OMPD_target_parallel:
 977       if (DKind == OMPD_simd)
 978         return true;
 979       return isOpenMPWorksharingDirective(DKind) &&
 980              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
 981     case OMPD_target_teams_distribute:
 982     case OMPD_target_simd:
 983     case OMPD_target_parallel_for:
 984     case OMPD_target_parallel_for_simd:
 985     case OMPD_target_teams_distribute_simd:
 986     case OMPD_target_teams_distribute_parallel_for:
 987     case OMPD_target_teams_distribute_parallel_for_simd:
 988     case OMPD_parallel:
 989     case OMPD_for:
 990     case OMPD_parallel_for:
 991     case OMPD_parallel_sections:
 992     case OMPD_for_simd:
 993     case OMPD_parallel_for_simd:
 994     case OMPD_cancel:
 995     case OMPD_cancellation_point:
 996     case OMPD_ordered:
 997     case OMPD_threadprivate:
 998     case OMPD_allocate:
 999     case OMPD_task:
1000     case OMPD_simd:
1001     case OMPD_sections:
1002     case OMPD_section:
1003     case OMPD_single:
1004     case OMPD_master:
1005     case OMPD_critical:
1006     case OMPD_taskyield:
1007     case OMPD_barrier:
1008     case OMPD_taskwait:
1009     case OMPD_taskgroup:
1010     case OMPD_atomic:
1011     case OMPD_flush:
1012     case OMPD_teams:
1013     case OMPD_target_data:
1014     case OMPD_target_exit_data:
1015     case OMPD_target_enter_data:
1016     case OMPD_distribute:
1017     case OMPD_distribute_simd:
1018     case OMPD_distribute_parallel_for:
1019     case OMPD_distribute_parallel_for_simd:
1020     case OMPD_teams_distribute:
1021     case OMPD_teams_distribute_simd:
1022     case OMPD_teams_distribute_parallel_for:
1023     case OMPD_teams_distribute_parallel_for_simd:
1024     case OMPD_target_update:
1025     case OMPD_declare_simd:
1026     case OMPD_declare_target:
1027     case OMPD_end_declare_target:
1028     case OMPD_declare_reduction:
1029     case OMPD_declare_mapper:
1030     case OMPD_taskloop:
1031     case OMPD_taskloop_simd:
1032     case OMPD_requires:
1033     case OMPD_unknown:
1034       llvm_unreachable("Unexpected directive.");
1035     }
1036   }
1037
1038   return false;
1039 }
1040
1041 /// Checks if the construct supports lightweight runtime. It must be SPMD
1042 /// construct + inner loop-based construct with static scheduling.
1043 static bool supportsLightweightRuntime(ASTContext &Ctx,
1044                                        const OMPExecutableDirective &D) {
1045   if (!supportsSPMDExecutionMode(Ctx, D))
1046     return false;
1047   OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
1048   switch (DirectiveKind) {
1049   case OMPD_target:
1050   case OMPD_target_teams:
1051   case OMPD_target_parallel:
1052     return hasNestedLightweightDirective(Ctx, D);
1053   case OMPD_target_parallel_for:
1054   case OMPD_target_parallel_for_simd:
1055   case OMPD_target_teams_distribute_parallel_for:
1056   case OMPD_target_teams_distribute_parallel_for_simd:
1057     // (Last|First)-privates must be shared in parallel region.
1058     return hasStaticScheduling(D);
1059   case OMPD_target_simd:
1060   case OMPD_target_teams_distribute_simd:
1061     return true;
1062   case OMPD_target_teams_distribute:
1063     return false;
1064   case OMPD_parallel:
1065   case OMPD_for:
1066   case OMPD_parallel_for:
1067   case OMPD_parallel_sections:
1068   case OMPD_for_simd:
1069   case OMPD_parallel_for_simd:
1070   case OMPD_cancel:
1071   case OMPD_cancellation_point:
1072   case OMPD_ordered:
1073   case OMPD_threadprivate:
1074   case OMPD_allocate:
1075   case OMPD_task:
1076   case OMPD_simd:
1077   case OMPD_sections:
1078   case OMPD_section:
1079   case OMPD_single:
1080   case OMPD_master:
1081   case OMPD_critical:
1082   case OMPD_taskyield:
1083   case OMPD_barrier:
1084   case OMPD_taskwait:
1085   case OMPD_taskgroup:
1086   case OMPD_atomic:
1087   case OMPD_flush:
1088   case OMPD_teams:
1089   case OMPD_target_data:
1090   case OMPD_target_exit_data:
1091   case OMPD_target_enter_data:
1092   case OMPD_distribute:
1093   case OMPD_distribute_simd:
1094   case OMPD_distribute_parallel_for:
1095   case OMPD_distribute_parallel_for_simd:
1096   case OMPD_teams_distribute:
1097   case OMPD_teams_distribute_simd:
1098   case OMPD_teams_distribute_parallel_for:
1099   case OMPD_teams_distribute_parallel_for_simd:
1100   case OMPD_target_update:
1101   case OMPD_declare_simd:
1102   case OMPD_declare_target:
1103   case OMPD_end_declare_target:
1104   case OMPD_declare_reduction:
1105   case OMPD_declare_mapper:
1106   case OMPD_taskloop:
1107   case OMPD_taskloop_simd:
1108   case OMPD_requires:
1109   case OMPD_unknown:
1110     break;
1111   }
1112   llvm_unreachable(
1113       "Unknown programming model for OpenMP directive on NVPTX target.");
1114 }
1115
1116 void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
1117                                              StringRef ParentName,
1118                                              llvm::Function *&OutlinedFn,
1119                                              llvm::Constant *&OutlinedFnID,
1120                                              bool IsOffloadEntry,
1121                                              const RegionCodeGenTy &CodeGen) {
1122   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1123   EntryFunctionState EST;
1124   WorkerFunctionState WST(CGM, D.getBeginLoc());
1125   Work.clear();
1126   WrapperFunctionsMap.clear();
1127
1128   // Emit target region as a standalone region.
1129   class NVPTXPrePostActionTy : public PrePostActionTy {
1130     CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1131     CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
1132
1133   public:
1134     NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1135                          CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
1136         : EST(EST), WST(WST) {}
1137     void Enter(CodeGenFunction &CGF) override {
1138       auto &RT =
1139           static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1140       RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1141       // Skip target region initialization.
1142       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1143     }
1144     void Exit(CodeGenFunction &CGF) override {
1145       auto &RT =
1146           static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1147       RT.clearLocThreadIdInsertPt(CGF);
1148       RT.emitNonSPMDEntryFooter(CGF, EST);
1149     }
1150   } Action(EST, WST);
1151   CodeGen.setAction(Action);
1152   IsInTTDRegion = true;
1153   // Reserve place for the globalized memory.
1154   GlobalizedRecords.emplace_back();
1155   if (!KernelStaticGlobalized) {
1156     KernelStaticGlobalized = new llvm::GlobalVariable(
1157         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1158         llvm::GlobalValue::InternalLinkage,
1159         llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1160         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1161         llvm::GlobalValue::NotThreadLocal,
1162         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1163   }
1164   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1165                                    IsOffloadEntry, CodeGen);
1166   IsInTTDRegion = false;
1167
1168   // Now change the name of the worker function to correspond to this target
1169   // region's entry function.
1170   WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
1171
1172   // Create the worker function
1173   emitWorkerFunction(WST);
1174 }
1175
1176 // Setup NVPTX threads for master-worker OpenMP scheme.
1177 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
1178                                                   EntryFunctionState &EST,
1179                                                   WorkerFunctionState &WST) {
1180   CGBuilderTy &Bld = CGF.Builder;
1181
1182   llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
1183   llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
1184   llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
1185   EST.ExitBB = CGF.createBasicBlock(".exit");
1186
1187   llvm::Value *IsWorker =
1188       Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
1189   Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);
1190
1191   CGF.EmitBlock(WorkerBB);
1192   emitCall(CGF, WST.Loc, WST.WorkerFn);
1193   CGF.EmitBranch(EST.ExitBB);
1194
1195   CGF.EmitBlock(MasterCheckBB);
1196   llvm::Value *IsMaster =
1197       Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
1198   Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);
1199
1200   CGF.EmitBlock(MasterBB);
1201   IsInTargetMasterThreadRegion = true;
1202   // SEQUENTIAL (MASTER) REGION START
1203   // First action in sequential region:
1204   // Initialize the state of the OpenMP runtime library on the GPU.
1205   // TODO: Optimize runtime initialization and pass in correct value.
1206   llvm::Value *Args[] = {getThreadLimit(CGF),
1207                          Bld.getInt16(/*RequiresOMPRuntime=*/1)};
1208   CGF.EmitRuntimeCall(
1209       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
1210
1211   // For data sharing, we need to initialize the stack.
1212   CGF.EmitRuntimeCall(
1213       createNVPTXRuntimeFunction(
1214           OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
1215
1216   emitGenericVarsProlog(CGF, WST.Loc);
1217 }
1218
1219 void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
1220                                                   EntryFunctionState &EST) {
1221   IsInTargetMasterThreadRegion = false;
1222   if (!CGF.HaveInsertPoint())
1223     return;
1224
1225   emitGenericVarsEpilog(CGF);
1226
1227   if (!EST.ExitBB)
1228     EST.ExitBB = CGF.createBasicBlock(".exit");
1229
1230   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
1231   CGF.EmitBranch(TerminateBB);
1232
1233   CGF.EmitBlock(TerminateBB);
1234   // Signal termination condition.
1235   // TODO: Optimize runtime initialization and pass in correct value.
1236   llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
1237   CGF.EmitRuntimeCall(
1238       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
1239   // Barrier to terminate worker threads.
1240   syncCTAThreads(CGF);
1241   // Master thread jumps to exit point.
1242   CGF.EmitBranch(EST.ExitBB);
1243
1244   CGF.EmitBlock(EST.ExitBB);
1245   EST.ExitBB = nullptr;
1246 }
1247
1248 void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
1249                                           StringRef ParentName,
1250                                           llvm::Function *&OutlinedFn,
1251                                           llvm::Constant *&OutlinedFnID,
1252                                           bool IsOffloadEntry,
1253                                           const RegionCodeGenTy &CodeGen) {
1254   ExecutionRuntimeModesRAII ModeRAII(
1255       CurrentExecutionMode, RequiresFullRuntime,
1256       CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1257           !supportsLightweightRuntime(CGM.getContext(), D));
1258   EntryFunctionState EST;
1259
1260   // Emit target region as a standalone region.
1261   class NVPTXPrePostActionTy : public PrePostActionTy {
1262     CGOpenMPRuntimeNVPTX &RT;
1263     CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1264     const OMPExecutableDirective &D;
1265
1266   public:
1267     NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
1268                          CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
1269                          const OMPExecutableDirective &D)
1270         : RT(RT), EST(EST), D(D) {}
1271     void Enter(CodeGenFunction &CGF) override {
1272       RT.emitSPMDEntryHeader(CGF, EST, D);
1273       // Skip target region initialization.
1274       RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1275     }
1276     void Exit(CodeGenFunction &CGF) override {
1277       RT.clearLocThreadIdInsertPt(CGF);
1278       RT.emitSPMDEntryFooter(CGF, EST);
1279     }
1280   } Action(*this, EST, D);
1281   CodeGen.setAction(Action);
1282   IsInTTDRegion = true;
1283   // Reserve place for the globalized memory.
1284   GlobalizedRecords.emplace_back();
1285   if (!KernelStaticGlobalized) {
1286     KernelStaticGlobalized = new llvm::GlobalVariable(
1287         CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1288         llvm::GlobalValue::InternalLinkage,
1289         llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1290         "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1291         llvm::GlobalValue::NotThreadLocal,
1292         CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1293   }
1294   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1295                                    IsOffloadEntry, CodeGen);
1296   IsInTTDRegion = false;
1297 }
1298
1299 void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
1300     CodeGenFunction &CGF, EntryFunctionState &EST,
1301     const OMPExecutableDirective &D) {
1302   CGBuilderTy &Bld = CGF.Builder;
1303
1304   // Setup BBs in entry function.
1305   llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
1306   EST.ExitBB = CGF.createBasicBlock(".exit");
1307
1308   llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1309                          /*RequiresOMPRuntime=*/
1310                          Bld.getInt16(RequiresFullRuntime ? 1 : 0),
1311                          /*RequiresDataSharing=*/Bld.getInt16(0)};
1312   CGF.EmitRuntimeCall(
1313       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
1314
1315   if (RequiresFullRuntime) {
1316     // For data sharing, we need to initialize the stack.
1317     CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1318         OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1319   }
1320
1321   CGF.EmitBranch(ExecuteBB);
1322
1323   CGF.EmitBlock(ExecuteBB);
1324
1325   IsInTargetMasterThreadRegion = true;
1326 }
1327
1328 void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
1329                                                EntryFunctionState &EST) {
1330   IsInTargetMasterThreadRegion = false;
1331   if (!CGF.HaveInsertPoint())
1332     return;
1333
1334   if (!EST.ExitBB)
1335     EST.ExitBB = CGF.createBasicBlock(".exit");
1336
1337   llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1338   CGF.EmitBranch(OMPDeInitBB);
1339
1340   CGF.EmitBlock(OMPDeInitBB);
1341   // DeInitialize the OMP state in the runtime; called by all active threads.
1342   llvm::Value *Args[] = {/*RequiresOMPRuntime=*/
1343                          CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)};
1344   CGF.EmitRuntimeCall(
1345       createNVPTXRuntimeFunction(
1346           OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args);
1347   CGF.EmitBranch(EST.ExitBB);
1348
1349   CGF.EmitBlock(EST.ExitBB);
1350   EST.ExitBB = nullptr;
1351 }
1352
1353 // Create a unique global variable to indicate the execution mode of this target
1354 // region. The execution mode is either 'generic', or 'spmd' depending on the
1355 // target directive. This variable is picked up by the offload library to setup
1356 // the device appropriately before kernel launch. If the execution mode is
1357 // 'generic', the runtime reserves one warp for the master, otherwise, all
1358 // warps participate in parallel work.
1359 static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1360                                      bool Mode) {
1361   auto *GVMode =
1362       new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1363                                llvm::GlobalValue::WeakAnyLinkage,
1364                                llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1365                                Twine(Name, "_exec_mode"));
1366   CGM.addCompilerUsedGlobal(GVMode);
1367 }
1368
1369 void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
1370   ASTContext &Ctx = CGM.getContext();
1371
1372   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1373   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
1374                     WST.Loc, WST.Loc);
1375   emitWorkerLoop(CGF, WST);
1376   CGF.FinishFunction();
1377 }
1378
1379 void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1380                                           WorkerFunctionState &WST) {
1381   //
1382   // The workers enter this loop and wait for parallel work from the master.
1383   // When the master encounters a parallel region it sets up the work + variable
1384   // arguments, and wakes up the workers.  The workers first check to see if
1385   // they are required for the parallel region, i.e., within the # of requested
1386   // parallel threads.  The activated workers load the variable arguments and
1387   // execute the parallel work.
1388   //
1389
1390   CGBuilderTy &Bld = CGF.Builder;
1391
1392   llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1393   llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1394   llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1395   llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1396   llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1397   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1398
1399   CGF.EmitBranch(AwaitBB);
1400
1401   // Workers wait for work from master.
1402   CGF.EmitBlock(AwaitBB);
1403   // Wait for parallel work
1404   syncCTAThreads(CGF);
1405
1406   Address WorkFn =
1407       CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1408   Address ExecStatus =
1409       CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1410   CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1411   CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1412
1413   // TODO: Optimize runtime initialization and pass in correct value.
1414   llvm::Value *Args[] = {WorkFn.getPointer(),
1415                          /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1416   llvm::Value *Ret = CGF.EmitRuntimeCall(
1417       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1418   Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
1419
1420   // On termination condition (workid == 0), exit loop.
1421   llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1422   llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
1423   Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1424
1425   // Activate requested workers.
1426   CGF.EmitBlock(SelectWorkersBB);
1427   llvm::Value *IsActive =
1428       Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1429   Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
1430
1431   // Signal start of parallel region.
1432   CGF.EmitBlock(ExecuteBB);
1433   // Skip initialization.
1434   setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1435
1436   // Process work items: outlined parallel functions.
1437   for (llvm::Function *W : Work) {
1438     // Try to match this outlined function.
1439     llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
1440
1441     llvm::Value *WorkFnMatch =
1442         Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1443
1444     llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1445     llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1446     Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1447
1448     // Execute this outlined function.
1449     CGF.EmitBlock(ExecuteFNBB);
1450
1451     // Insert call to work function via shared wrapper. The shared
1452     // wrapper takes two arguments:
1453     //   - the parallelism level;
1454     //   - the thread ID;
1455     emitCall(CGF, WST.Loc, W,
1456              {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1457
1458     // Go to end of parallel region.
1459     CGF.EmitBranch(TerminateBB);
1460
1461     CGF.EmitBlock(CheckNextBB);
1462   }
1463   // Default case: call to outlined function through pointer if the target
1464   // region makes a declare target call that may contain an orphaned parallel
1465   // directive.
1466   auto *ParallelFnTy =
1467       llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1468                               /*isVarArg=*/false);
1469   llvm::Value *WorkFnCast =
1470       Bld.CreateBitCast(WorkID, ParallelFnTy->getPointerTo());
1471   // Insert call to work function via shared wrapper. The shared
1472   // wrapper takes two arguments:
1473   //   - the parallelism level;
1474   //   - the thread ID;
1475   emitCall(CGF, WST.Loc, {ParallelFnTy, WorkFnCast},
1476            {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1477   // Go to end of parallel region.
1478   CGF.EmitBranch(TerminateBB);
1479
1480   // Signal end of parallel region.
1481   CGF.EmitBlock(TerminateBB);
1482   CGF.EmitRuntimeCall(
1483       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1484       llvm::None);
1485   CGF.EmitBranch(BarrierBB);
1486
1487   // All active and inactive workers wait at a barrier after parallel region.
1488   CGF.EmitBlock(BarrierBB);
1489   // Barrier after parallel region.
1490   syncCTAThreads(CGF);
1491   CGF.EmitBranch(AwaitBB);
1492
1493   // Exit target region.
1494   CGF.EmitBlock(ExitBB);
1495   // Skip initialization.
1496   clearLocThreadIdInsertPt(CGF);
1497 }
1498
1499 /// Returns specified OpenMP runtime function for the current OpenMP
1500 /// implementation.  Specialized for the NVPTX device.
1501 /// \param Function OpenMP runtime function.
1502 /// \return Specified function.
1503 llvm::FunctionCallee
1504 CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1505   llvm::FunctionCallee RTLFn = nullptr;
1506   switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1507   case OMPRTL_NVPTX__kmpc_kernel_init: {
1508     // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1509     // RequiresOMPRuntime);
1510     llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
1511     auto *FnTy =
1512         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1513     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1514     break;
1515   }
1516   case OMPRTL_NVPTX__kmpc_kernel_deinit: {
1517     // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1518     llvm::Type *TypeParams[] = {CGM.Int16Ty};
1519     auto *FnTy =
1520         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1521     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1522     break;
1523   }
1524   case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1525     // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
1526     // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
1527     llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1528     auto *FnTy =
1529         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1530     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1531     break;
1532   }
1533   case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: {
1534     // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime);
1535     llvm::Type *TypeParams[] = {CGM.Int16Ty};
1536     auto *FnTy =
1537         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1538     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2");
1539     break;
1540   }
1541   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1542     /// Build void __kmpc_kernel_prepare_parallel(
1543     /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
1544     llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
1545     auto *FnTy =
1546         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1547     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1548     break;
1549   }
1550   case OMPRTL_NVPTX__kmpc_kernel_parallel: {
1551     /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1552     /// int16_t IsOMPRuntimeInitialized);
1553     llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
1554     llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
1555     auto *FnTy =
1556         llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1557     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1558     break;
1559   }
1560   case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1561     /// Build void __kmpc_kernel_end_parallel();
1562     auto *FnTy =
1563         llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1564     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1565     break;
1566   }
1567   case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1568     // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1569     // global_tid);
1570     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1571     auto *FnTy =
1572         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1573     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1574     break;
1575   }
1576   case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1577     // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1578     // global_tid);
1579     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1580     auto *FnTy =
1581         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1582     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1583     break;
1584   }
1585   case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1586     // Build int32_t __kmpc_shuffle_int32(int32_t element,
1587     // int16_t lane_offset, int16_t warp_size);
1588     llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
1589     auto *FnTy =
1590         llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1591     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1592     break;
1593   }
1594   case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1595     // Build int64_t __kmpc_shuffle_int64(int64_t element,
1596     // int16_t lane_offset, int16_t warp_size);
1597     llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
1598     auto *FnTy =
1599         llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1600     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1601     break;
1602   }
1603   case OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2: {
1604     // Build int32_t kmpc_nvptx_parallel_reduce_nowait_v2(ident_t *loc,
1605     // kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, void*
1606     // reduce_data, void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t
1607     // lane_id, int16_t lane_offset, int16_t Algorithm Version), void
1608     // (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1609     llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1610                                              CGM.Int16Ty, CGM.Int16Ty};
1611     auto *ShuffleReduceFnTy =
1612         llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1613                                 /*isVarArg=*/false);
1614     llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1615     auto *InterWarpCopyFnTy =
1616         llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1617                                 /*isVarArg=*/false);
1618     llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1619                                 CGM.Int32Ty,
1620                                 CGM.Int32Ty,
1621                                 CGM.SizeTy,
1622                                 CGM.VoidPtrTy,
1623                                 ShuffleReduceFnTy->getPointerTo(),
1624                                 InterWarpCopyFnTy->getPointerTo()};
1625     auto *FnTy =
1626         llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1627     RTLFn = CGM.CreateRuntimeFunction(
1628         FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait_v2");
1629     break;
1630   }
1631   case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1632     // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
1633     llvm::Type *TypeParams[] = {CGM.Int32Ty};
1634     auto *FnTy =
1635         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1636     RTLFn = CGM.CreateRuntimeFunction(
1637         FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1638     break;
1639   }
1640   case OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2: {
1641     // Build int32_t __kmpc_nvptx_teams_reduce_nowait_v2(ident_t *loc, kmp_int32
1642     // global_tid, void *global_buffer, int32_t num_of_records, void*
1643     // reduce_data,
1644     // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1645     // lane_offset, int16_t shortCircuit),
1646     // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), void
1647     // (*kmp_ListToGlobalCpyFctPtr)(void *buffer, int idx, void *reduce_data),
1648     // void (*kmp_GlobalToListCpyFctPtr)(void *buffer, int idx,
1649     // void *reduce_data), void (*kmp_GlobalToListCpyPtrsFctPtr)(void *buffer,
1650     // int idx, void *reduce_data), void (*kmp_GlobalToListRedFctPtr)(void
1651     // *buffer, int idx, void *reduce_data));
1652     llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1653                                              CGM.Int16Ty, CGM.Int16Ty};
1654     auto *ShuffleReduceFnTy =
1655         llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1656                                 /*isVarArg=*/false);
1657     llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1658     auto *InterWarpCopyFnTy =
1659         llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1660                                 /*isVarArg=*/false);
1661     llvm::Type *GlobalListTypeParams[] = {CGM.VoidPtrTy, CGM.IntTy,
1662                                           CGM.VoidPtrTy};
1663     auto *GlobalListFnTy =
1664         llvm::FunctionType::get(CGM.VoidTy, GlobalListTypeParams,
1665                                 /*isVarArg=*/false);
1666     llvm::Type *TypeParams[] = {getIdentTyPointerTy(),
1667                                 CGM.Int32Ty,
1668                                 CGM.VoidPtrTy,
1669                                 CGM.Int32Ty,
1670                                 CGM.VoidPtrTy,
1671                                 ShuffleReduceFnTy->getPointerTo(),
1672                                 InterWarpCopyFnTy->getPointerTo(),
1673                                 GlobalListFnTy->getPointerTo(),
1674                                 GlobalListFnTy->getPointerTo(),
1675                                 GlobalListFnTy->getPointerTo(),
1676                                 GlobalListFnTy->getPointerTo()};
1677     auto *FnTy =
1678         llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1679     RTLFn = CGM.CreateRuntimeFunction(
1680         FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait_v2");
1681     break;
1682   }
1683   case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1684     /// Build void __kmpc_data_sharing_init_stack();
1685     auto *FnTy =
1686         llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1687     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1688     break;
1689   }
1690   case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1691     /// Build void __kmpc_data_sharing_init_stack_spmd();
1692     auto *FnTy =
1693         llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1694     RTLFn =
1695         CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
1696     break;
1697   }
1698   case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
1699     // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
1700     // int16_t UseSharedMemory);
1701     llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
1702     auto *FnTy =
1703         llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1704     RTLFn = CGM.CreateRuntimeFunction(
1705         FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
1706     break;
1707   }
1708   case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1709     // Build void __kmpc_data_sharing_pop_stack(void *a);
1710     llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
1711     auto *FnTy =
1712         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1713     RTLFn = CGM.CreateRuntimeFunction(FnTy,
1714                                       /*Name=*/"__kmpc_data_sharing_pop_stack");
1715     break;
1716   }
1717   case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1718     /// Build void __kmpc_begin_sharing_variables(void ***args,
1719     /// size_t n_args);
1720     llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
1721     auto *FnTy =
1722         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1723     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1724     break;
1725   }
1726   case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1727     /// Build void __kmpc_end_sharing_variables();
1728     auto *FnTy =
1729         llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1730     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1731     break;
1732   }
1733   case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1734     /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1735     llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
1736     auto *FnTy =
1737         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1738     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1739     break;
1740   }
1741   case OMPRTL_NVPTX__kmpc_parallel_level: {
1742     // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1743     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1744     auto *FnTy =
1745         llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1746     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1747     break;
1748   }
1749   case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1750     // Build int8_t __kmpc_is_spmd_exec_mode();
1751     auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1752     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1753     break;
1754   }
1755   case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
1756     // Build void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
1757     // const void *buf, size_t size, int16_t is_shared, const void **res);
1758     llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.VoidPtrTy, CGM.SizeTy,
1759                                 CGM.Int16Ty, CGM.VoidPtrPtrTy};
1760     auto *FnTy =
1761         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1762     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
1763     break;
1764   }
1765   case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
1766     // Build void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
1767     // int16_t is_shared);
1768     llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty};
1769     auto *FnTy =
1770         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1771     RTLFn =
1772         CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
1773     break;
1774   }
1775   case OMPRTL__kmpc_barrier: {
1776     // Build void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
1777     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1778     auto *FnTy =
1779         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1780     RTLFn = CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier");
1781     cast<llvm::Function>(RTLFn.getCallee())
1782         ->addFnAttr(llvm::Attribute::Convergent);
1783     break;
1784   }
1785   case OMPRTL__kmpc_barrier_simple_spmd: {
1786     // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
1787     // global_tid);
1788     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1789     auto *FnTy =
1790         llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1791     RTLFn =
1792         CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
1793     cast<llvm::Function>(RTLFn.getCallee())
1794         ->addFnAttr(llvm::Attribute::Convergent);
1795     break;
1796   }
1797   }
1798   return RTLFn;
1799 }
1800
1801 void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1802                                               llvm::Constant *Addr,
1803                                               uint64_t Size, int32_t,
1804                                               llvm::GlobalValue::LinkageTypes) {
1805   // TODO: Add support for global variables on the device after declare target
1806   // support.
1807   if (!isa<llvm::Function>(Addr))
1808     return;
1809   llvm::Module &M = CGM.getModule();
1810   llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1811
1812   // Get "nvvm.annotations" metadata node
1813   llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1814
1815   llvm::Metadata *MDVals[] = {
1816       llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1817       llvm::ConstantAsMetadata::get(
1818           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1819   // Append metadata to nvvm.annotations
1820   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1821 }
1822
1823 void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1824     const OMPExecutableDirective &D, StringRef ParentName,
1825     llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1826     bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1827   if (!IsOffloadEntry) // Nothing to do.
1828     return;
1829
1830   assert(!ParentName.empty() && "Invalid target region parent name!");
1831
1832   bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1833   if (Mode)
1834     emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1835                    CodeGen);
1836   else
1837     emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1838                       CodeGen);
1839
1840   setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1841 }
1842
1843 namespace {
1844 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1845 /// Enum for accesseing the reserved_2 field of the ident_t struct.
1846 enum ModeFlagsTy : unsigned {
1847   /// Bit set to 1 when in SPMD mode.
1848   KMP_IDENT_SPMD_MODE = 0x01,
1849   /// Bit set to 1 when a simplified runtime is used.
1850   KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1851   LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1852 };
1853
1854 /// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1855 static const ModeFlagsTy UndefinedMode =
1856     (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1857 } // anonymous namespace
1858
1859 unsigned CGOpenMPRuntimeNVPTX::getDefaultLocationReserved2Flags() const {
1860   switch (getExecutionMode()) {
1861   case EM_SPMD:
1862     if (requiresFullRuntime())
1863       return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1864     return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1865   case EM_NonSPMD:
1866     assert(requiresFullRuntime() && "Expected full runtime.");
1867     return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1868   case EM_Unknown:
1869     return UndefinedMode;
1870   }
1871   llvm_unreachable("Unknown flags are requested.");
1872 }
1873
1874 CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
1875     : CGOpenMPRuntime(CGM, "_", "$") {
1876   if (!CGM.getLangOpts().OpenMPIsDevice)
1877     llvm_unreachable("OpenMP NVPTX can only handle device code.");
1878 }
1879
1880 void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1881                                               OpenMPProcBindClauseKind ProcBind,
1882                                               SourceLocation Loc) {
1883   // Do nothing in case of SPMD mode and L0 parallel.
1884   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1885     return;
1886
1887   CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1888 }
1889
1890 void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1891                                                 llvm::Value *NumThreads,
1892                                                 SourceLocation Loc) {
1893   // Do nothing in case of SPMD mode and L0 parallel.
1894   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
1895     return;
1896
1897   CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1898 }
1899
1900 void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1901                                               const Expr *NumTeams,
1902                                               const Expr *ThreadLimit,
1903                                               SourceLocation Loc) {}
1904
1905 llvm::Function *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1906     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1907     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1908   // Emit target region as a standalone region.
1909   class NVPTXPrePostActionTy : public PrePostActionTy {
1910     bool &IsInParallelRegion;
1911     bool PrevIsInParallelRegion;
1912
1913   public:
1914     NVPTXPrePostActionTy(bool &IsInParallelRegion)
1915         : IsInParallelRegion(IsInParallelRegion) {}
1916     void Enter(CodeGenFunction &CGF) override {
1917       PrevIsInParallelRegion = IsInParallelRegion;
1918       IsInParallelRegion = true;
1919     }
1920     void Exit(CodeGenFunction &CGF) override {
1921       IsInParallelRegion = PrevIsInParallelRegion;
1922     }
1923   } Action(IsInParallelRegion);
1924   CodeGen.setAction(Action);
1925   bool PrevIsInTTDRegion = IsInTTDRegion;
1926   IsInTTDRegion = false;
1927   bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1928   IsInTargetMasterThreadRegion = false;
1929   auto *OutlinedFun =
1930       cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1931           D, ThreadIDVar, InnermostKind, CodeGen));
1932   if (CGM.getLangOpts().Optimize) {
1933     OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
1934     OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
1935     OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
1936   }
1937   IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1938   IsInTTDRegion = PrevIsInTTDRegion;
1939   if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1940       !IsInParallelRegion) {
1941     llvm::Function *WrapperFun =
1942         createParallelDataSharingWrapper(OutlinedFun, D);
1943     WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1944   }
1945
1946   return OutlinedFun;
1947 }
1948
1949 /// Get list of lastprivate variables from the teams distribute ... or
1950 /// teams {distribute ...} directives.
1951 static void
1952 getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1953                              llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1954   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1955          "expected teams directive.");
1956   const OMPExecutableDirective *Dir = &D;
1957   if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1958     if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1959             Ctx,
1960             D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1961                 /*IgnoreCaptured=*/true))) {
1962       Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1963       if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1964         Dir = nullptr;
1965     }
1966   }
1967   if (!Dir)
1968     return;
1969   for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1970     for (const Expr *E : C->getVarRefs())
1971       Vars.push_back(getPrivateItem(E));
1972   }
1973 }
1974
1975 /// Get list of reduction variables from the teams ... directives.
1976 static void
1977 getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1978                       llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1979   assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1980          "expected teams directive.");
1981   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1982     for (const Expr *E : C->privates())
1983       Vars.push_back(getPrivateItem(E));
1984   }
1985 }
1986
1987 llvm::Function *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
1988     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1989     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1990   SourceLocation Loc = D.getBeginLoc();
1991
1992   const RecordDecl *GlobalizedRD = nullptr;
1993   llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1994   llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1995   // Globalize team reductions variable unconditionally in all modes.
1996   if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
1997     getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1998   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
1999     getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
2000     if (!LastPrivatesReductions.empty()) {
2001       GlobalizedRD = ::buildRecordForGlobalizedVars(
2002           CGM.getContext(), llvm::None, LastPrivatesReductions,
2003           MappedDeclsFields, WarpSize);
2004     }
2005   } else if (!LastPrivatesReductions.empty()) {
2006     assert(!TeamAndReductions.first &&
2007            "Previous team declaration is not expected.");
2008     TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
2009     std::swap(TeamAndReductions.second, LastPrivatesReductions);
2010   }
2011
2012   // Emit target region as a standalone region.
2013   class NVPTXPrePostActionTy : public PrePostActionTy {
2014     SourceLocation &Loc;
2015     const RecordDecl *GlobalizedRD;
2016     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2017         &MappedDeclsFields;
2018
2019   public:
2020     NVPTXPrePostActionTy(
2021         SourceLocation &Loc, const RecordDecl *GlobalizedRD,
2022         llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2023             &MappedDeclsFields)
2024         : Loc(Loc), GlobalizedRD(GlobalizedRD),
2025           MappedDeclsFields(MappedDeclsFields) {}
2026     void Enter(CodeGenFunction &CGF) override {
2027       auto &Rt =
2028           static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
2029       if (GlobalizedRD) {
2030         auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2031         I->getSecond().GlobalRecord = GlobalizedRD;
2032         I->getSecond().MappedParams =
2033             llvm::make_unique<CodeGenFunction::OMPMapVars>();
2034         DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2035         for (const auto &Pair : MappedDeclsFields) {
2036           assert(Pair.getFirst()->isCanonicalDecl() &&
2037                  "Expected canonical declaration");
2038           Data.insert(std::make_pair(Pair.getFirst(),
2039                                      MappedVarData(Pair.getSecond(),
2040                                                    /*IsOnePerTeam=*/true)));
2041         }
2042       }
2043       Rt.emitGenericVarsProlog(CGF, Loc);
2044     }
2045     void Exit(CodeGenFunction &CGF) override {
2046       static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
2047           .emitGenericVarsEpilog(CGF);
2048     }
2049   } Action(Loc, GlobalizedRD, MappedDeclsFields);
2050   CodeGen.setAction(Action);
2051   llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
2052       D, ThreadIDVar, InnermostKind, CodeGen);
2053   if (CGM.getLangOpts().Optimize) {
2054     OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
2055     OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
2056     OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
2057   }
2058
2059   return OutlinedFun;
2060 }
2061
2062 void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
2063                                                  SourceLocation Loc,
2064                                                  bool WithSPMDCheck) {
2065   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2066       getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2067     return;
2068
2069   CGBuilderTy &Bld = CGF.Builder;
2070
2071   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2072   if (I == FunctionGlobalizedDecls.end())
2073     return;
2074   if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
2075     QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
2076     QualType SecGlobalRecTy;
2077
2078     // Recover pointer to this function's global record. The runtime will
2079     // handle the specifics of the allocation of the memory.
2080     // Use actual memory size of the record including the padding
2081     // for alignment purposes.
2082     unsigned Alignment =
2083         CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2084     unsigned GlobalRecordSize =
2085         CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
2086     GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2087
2088     llvm::PointerType *GlobalRecPtrTy =
2089         CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
2090     llvm::Value *GlobalRecCastAddr;
2091     llvm::Value *IsTTD = nullptr;
2092     if (!IsInTTDRegion &&
2093         (WithSPMDCheck ||
2094          getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2095       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2096       llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2097       llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2098       if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2099         llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2100         llvm::Value *ThreadID = getThreadID(CGF, Loc);
2101         llvm::Value *PL = CGF.EmitRuntimeCall(
2102             createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2103             {RTLoc, ThreadID});
2104         IsTTD = Bld.CreateIsNull(PL);
2105       }
2106       llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2107           createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2108       Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2109       // There is no need to emit line number for unconditional branch.
2110       (void)ApplyDebugLocation::CreateEmpty(CGF);
2111       CGF.EmitBlock(SPMDBB);
2112       Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2113                                CharUnits::fromQuantity(Alignment));
2114       CGF.EmitBranch(ExitBB);
2115       // There is no need to emit line number for unconditional branch.
2116       (void)ApplyDebugLocation::CreateEmpty(CGF);
2117       CGF.EmitBlock(NonSPMDBB);
2118       llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2119       if (const RecordDecl *SecGlobalizedVarsRecord =
2120               I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2121         SecGlobalRecTy =
2122             CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2123
2124         // Recover pointer to this function's global record. The runtime will
2125         // handle the specifics of the allocation of the memory.
2126         // Use actual memory size of the record including the padding
2127         // for alignment purposes.
2128         unsigned Alignment =
2129             CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2130         unsigned GlobalRecordSize =
2131             CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2132         GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2133         Size = Bld.CreateSelect(
2134             IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2135       }
2136       // TODO: allow the usage of shared memory to be controlled by
2137       // the user, for now, default to global.
2138       llvm::Value *GlobalRecordSizeArg[] = {
2139           Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2140       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2141           createNVPTXRuntimeFunction(
2142               OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2143           GlobalRecordSizeArg);
2144       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2145           GlobalRecValue, GlobalRecPtrTy);
2146       CGF.EmitBlock(ExitBB);
2147       auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
2148                                 /*NumReservedValues=*/2, "_select_stack");
2149       Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2150       Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2151       GlobalRecCastAddr = Phi;
2152       I->getSecond().GlobalRecordAddr = Phi;
2153       I->getSecond().IsInSPMDModeFlag = IsSPMD;
2154     } else if (IsInTTDRegion) {
2155       assert(GlobalizedRecords.back().Records.size() < 2 &&
2156              "Expected less than 2 globalized records: one for target and one "
2157              "for teams.");
2158       unsigned Offset = 0;
2159       for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2160         QualType RDTy = CGM.getContext().getRecordType(RD);
2161         unsigned Alignment =
2162             CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2163         unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2164         Offset =
2165             llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2166       }
2167       unsigned Alignment =
2168           CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2169       Offset = llvm::alignTo(Offset, Alignment);
2170       GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2171       ++GlobalizedRecords.back().RegionCounter;
2172       if (GlobalizedRecords.back().Records.size() == 1) {
2173         assert(KernelStaticGlobalized &&
2174                "Kernel static pointer must be initialized already.");
2175         auto *UseSharedMemory = new llvm::GlobalVariable(
2176             CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2177             llvm::GlobalValue::InternalLinkage, nullptr,
2178             "_openmp_static_kernel$is_shared");
2179         UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2180         QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2181             /*DestWidth=*/16, /*Signed=*/0);
2182         llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2183             Address(UseSharedMemory,
2184                     CGM.getContext().getTypeAlignInChars(Int16Ty)),
2185             /*Volatile=*/false, Int16Ty, Loc);
2186         auto *StaticGlobalized = new llvm::GlobalVariable(
2187             CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2188             llvm::GlobalValue::CommonLinkage, nullptr);
2189         auto *RecSize = new llvm::GlobalVariable(
2190             CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2191             llvm::GlobalValue::InternalLinkage, nullptr,
2192             "_openmp_static_kernel$size");
2193         RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2194         llvm::Value *Ld = CGF.EmitLoadOfScalar(
2195             Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
2196             CGM.getContext().getSizeType(), Loc);
2197         llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2198             KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2199         llvm::Value *GlobalRecordSizeArg[] = {
2200             llvm::ConstantInt::get(
2201                 CGM.Int16Ty,
2202                 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2203             StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
2204         CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2205                                 OMPRTL_NVPTX__kmpc_get_team_static_memory),
2206                             GlobalRecordSizeArg);
2207         GlobalizedRecords.back().Buffer = StaticGlobalized;
2208         GlobalizedRecords.back().RecSize = RecSize;
2209         GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2210         GlobalizedRecords.back().Loc = Loc;
2211       }
2212       assert(KernelStaticGlobalized && "Global address must be set already.");
2213       Address FrameAddr = CGF.EmitLoadOfPointer(
2214           Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2215           CGM.getContext()
2216               .getPointerType(CGM.getContext().VoidPtrTy)
2217               .castAs<PointerType>());
2218       llvm::Value *GlobalRecValue =
2219           Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
2220       I->getSecond().GlobalRecordAddr = GlobalRecValue;
2221       I->getSecond().IsInSPMDModeFlag = nullptr;
2222       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2223           GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
2224     } else {
2225       // TODO: allow the usage of shared memory to be controlled by
2226       // the user, for now, default to global.
2227       llvm::Value *GlobalRecordSizeArg[] = {
2228           llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2229           CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2230       llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2231           createNVPTXRuntimeFunction(
2232               OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2233           GlobalRecordSizeArg);
2234       GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2235           GlobalRecValue, GlobalRecPtrTy);
2236       I->getSecond().GlobalRecordAddr = GlobalRecValue;
2237       I->getSecond().IsInSPMDModeFlag = nullptr;
2238     }
2239     LValue Base =
2240         CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
2241
2242     // Emit the "global alloca" which is a GEP from the global declaration
2243     // record using the pointer returned by the runtime.
2244     LValue SecBase;
2245     decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2246     if (IsTTD) {
2247       SecIt = I->getSecond().SecondaryLocalVarData->begin();
2248       llvm::PointerType *SecGlobalRecPtrTy =
2249           CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2250       SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2251           Bld.CreatePointerBitCastOrAddrSpaceCast(
2252               I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2253           SecGlobalRecTy);
2254     }
2255     for (auto &Rec : I->getSecond().LocalVarData) {
2256       bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2257       llvm::Value *ParValue;
2258       if (EscapedParam) {
2259         const auto *VD = cast<VarDecl>(Rec.first);
2260         LValue ParLVal =
2261             CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2262         ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2263       }
2264       LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2265       // Emit VarAddr basing on lane-id if required.
2266       QualType VarTy;
2267       if (Rec.second.IsOnePerTeam) {
2268         VarTy = Rec.second.FD->getType();
2269       } else {
2270         llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2271             VarAddr.getAddress().getPointer(),
2272             {Bld.getInt32(0), getNVPTXLaneID(CGF)});
2273         VarTy =
2274             Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
2275         VarAddr = CGF.MakeAddrLValue(
2276             Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2277             AlignmentSource::Decl);
2278       }
2279       Rec.second.PrivateAddr = VarAddr.getAddress();
2280       if (!IsInTTDRegion &&
2281           (WithSPMDCheck ||
2282            getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2283         assert(I->getSecond().IsInSPMDModeFlag &&
2284                "Expected unknown execution mode or required SPMD check.");
2285         if (IsTTD) {
2286           assert(SecIt->second.IsOnePerTeam &&
2287                  "Secondary glob data must be one per team.");
2288           LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2289           VarAddr.setAddress(
2290               Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2291                                        VarAddr.getPointer()),
2292                       VarAddr.getAlignment()));
2293           Rec.second.PrivateAddr = VarAddr.getAddress();
2294         }
2295         Address GlobalPtr = Rec.second.PrivateAddr;
2296         Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2297         Rec.second.PrivateAddr = Address(
2298             Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2299                              LocalAddr.getPointer(), GlobalPtr.getPointer()),
2300             LocalAddr.getAlignment());
2301       }
2302       if (EscapedParam) {
2303         const auto *VD = cast<VarDecl>(Rec.first);
2304         CGF.EmitStoreOfScalar(ParValue, VarAddr);
2305         I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2306       }
2307       if (IsTTD)
2308         ++SecIt;
2309     }
2310   }
2311   for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2312     // Recover pointer to this function's global record. The runtime will
2313     // handle the specifics of the allocation of the memory.
2314     // Use actual memory size of the record including the padding
2315     // for alignment purposes.
2316     CGBuilderTy &Bld = CGF.Builder;
2317     llvm::Value *Size = CGF.getTypeSize(VD->getType());
2318     CharUnits Align = CGM.getContext().getDeclAlign(VD);
2319     Size = Bld.CreateNUWAdd(
2320         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2321     llvm::Value *AlignVal =
2322         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2323     Size = Bld.CreateUDiv(Size, AlignVal);
2324     Size = Bld.CreateNUWMul(Size, AlignVal);
2325     // TODO: allow the usage of shared memory to be controlled by
2326     // the user, for now, default to global.
2327     llvm::Value *GlobalRecordSizeArg[] = {
2328         Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2329     llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2330         createNVPTXRuntimeFunction(
2331             OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2332         GlobalRecordSizeArg);
2333     llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2334         GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2335     LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2336                                      CGM.getContext().getDeclAlign(VD),
2337                                      AlignmentSource::Decl);
2338     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2339                                             Base.getAddress());
2340     I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
2341   }
2342   I->getSecond().MappedParams->apply(CGF);
2343 }
2344
2345 void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2346                                                  bool WithSPMDCheck) {
2347   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2348       getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
2349     return;
2350
2351   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2352   if (I != FunctionGlobalizedDecls.end()) {
2353     I->getSecond().MappedParams->restore(CGF);
2354     if (!CGF.HaveInsertPoint())
2355       return;
2356     for (llvm::Value *Addr :
2357          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2358       CGF.EmitRuntimeCall(
2359           createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2360           Addr);
2361     }
2362     if (I->getSecond().GlobalRecordAddr) {
2363       if (!IsInTTDRegion &&
2364           (WithSPMDCheck ||
2365            getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
2366         CGBuilderTy &Bld = CGF.Builder;
2367         llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2368         llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2369         Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2370         // There is no need to emit line number for unconditional branch.
2371         (void)ApplyDebugLocation::CreateEmpty(CGF);
2372         CGF.EmitBlock(NonSPMDBB);
2373         CGF.EmitRuntimeCall(
2374             createNVPTXRuntimeFunction(
2375                 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2376             CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2377         CGF.EmitBlock(ExitBB);
2378       } else if (IsInTTDRegion) {
2379         assert(GlobalizedRecords.back().RegionCounter > 0 &&
2380                "region counter must be > 0.");
2381         --GlobalizedRecords.back().RegionCounter;
2382         // Emit the restore function only in the target region.
2383         if (GlobalizedRecords.back().RegionCounter == 0) {
2384           QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2385               /*DestWidth=*/16, /*Signed=*/0);
2386           llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2387               Address(GlobalizedRecords.back().UseSharedMemory,
2388                       CGM.getContext().getTypeAlignInChars(Int16Ty)),
2389               /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
2390           llvm::Value *Args[] = {
2391               llvm::ConstantInt::get(
2392                   CGM.Int16Ty,
2393                   getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD ? 1 : 0),
2394               IsInSharedMemory};
2395           CGF.EmitRuntimeCall(
2396               createNVPTXRuntimeFunction(
2397                   OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2398               Args);
2399         }
2400       } else {
2401         CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2402                                 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2403                             I->getSecond().GlobalRecordAddr);
2404       }
2405     }
2406   }
2407 }
2408
2409 void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2410                                          const OMPExecutableDirective &D,
2411                                          SourceLocation Loc,
2412                                          llvm::Function *OutlinedFn,
2413                                          ArrayRef<llvm::Value *> CapturedVars) {
2414   if (!CGF.HaveInsertPoint())
2415     return;
2416
2417   Address ZeroAddr = CGF.CreateMemTemp(
2418       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2419       /*Name*/ ".zero.addr");
2420   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2421   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2422   OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
2423   OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2424   OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2425   emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2426 }
2427
2428 void CGOpenMPRuntimeNVPTX::emitParallelCall(
2429     CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2430     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2431   if (!CGF.HaveInsertPoint())
2432     return;
2433
2434   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
2435     emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2436   else
2437     emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
2438 }
2439
2440 void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
2441     CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2442     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2443   llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
2444
2445   // Force inline this outlined function at its call site.
2446   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2447
2448   Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2449                                            /*DestWidth=*/32, /*Signed=*/1),
2450                                        ".zero.addr");
2451   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2452   // ThreadId for serialized parallels is 0.
2453   Address ThreadIDAddr = ZeroAddr;
2454   auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
2455                        CodeGenFunction &CGF, PrePostActionTy &Action) {
2456     Action.Enter(CGF);
2457
2458     llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2459     OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2460     OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2461     OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2462     emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2463   };
2464   auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2465                                         PrePostActionTy &) {
2466
2467     RegionCodeGenTy RCG(CodeGen);
2468     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2469     llvm::Value *ThreadID = getThreadID(CGF, Loc);
2470     llvm::Value *Args[] = {RTLoc, ThreadID};
2471
2472     NVPTXActionTy Action(
2473         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2474         Args,
2475         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2476         Args);
2477     RCG.setAction(Action);
2478     RCG(CGF);
2479   };
2480
2481   auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2482                                                   PrePostActionTy &Action) {
2483     CGBuilderTy &Bld = CGF.Builder;
2484     llvm::Function *WFn = WrapperFunctionsMap[Fn];
2485     assert(WFn && "Wrapper function does not exist!");
2486     llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2487
2488     // Prepare for parallel region. Indicate the outlined function.
2489     llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
2490     CGF.EmitRuntimeCall(
2491         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2492         Args);
2493
2494     // Create a private scope that will globalize the arguments
2495     // passed from the outside of the target region.
2496     CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2497
2498     // There's something to share.
2499     if (!CapturedVars.empty()) {
2500       // Prepare for parallel region. Indicate the outlined function.
2501       Address SharedArgs =
2502           CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2503       llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2504
2505       llvm::Value *DataSharingArgs[] = {
2506           SharedArgsPtr,
2507           llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2508       CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2509                               OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2510                           DataSharingArgs);
2511
2512       // Store variable address in a list of references to pass to workers.
2513       unsigned Idx = 0;
2514       ASTContext &Ctx = CGF.getContext();
2515       Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2516           SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2517                           .castAs<PointerType>());
2518       for (llvm::Value *V : CapturedVars) {
2519         Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
2520         llvm::Value *PtrV;
2521         if (V->getType()->isIntegerTy())
2522           PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2523         else
2524           PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
2525         CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2526                               Ctx.getPointerType(Ctx.VoidPtrTy));
2527         ++Idx;
2528       }
2529     }
2530
2531     // Activate workers. This barrier is used by the master to signal
2532     // work for the workers.
2533     syncCTAThreads(CGF);
2534
2535     // OpenMP [2.5, Parallel Construct, p.49]
2536     // There is an implied barrier at the end of a parallel region. After the
2537     // end of a parallel region, only the master thread of the team resumes
2538     // execution of the enclosing task region.
2539     //
2540     // The master waits at this barrier until all workers are done.
2541     syncCTAThreads(CGF);
2542
2543     if (!CapturedVars.empty())
2544       CGF.EmitRuntimeCall(
2545           createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2546
2547     // Remember for post-processing in worker loop.
2548     Work.emplace_back(WFn);
2549   };
2550
2551   auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2552                              CodeGenFunction &CGF, PrePostActionTy &Action) {
2553     if (IsInParallelRegion) {
2554       SeqGen(CGF, Action);
2555     } else if (IsInTargetMasterThreadRegion) {
2556       L0ParallelGen(CGF, Action);
2557     } else {
2558       // Check for master and then parallelism:
2559       // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
2560       //   Serialized execution.
2561       // } else {
2562       //   Worker call.
2563       // }
2564       CGBuilderTy &Bld = CGF.Builder;
2565       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2566       llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2567       llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
2568       llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
2569       llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2570           createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2571       Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
2572       // There is no need to emit line number for unconditional branch.
2573       (void)ApplyDebugLocation::CreateEmpty(CGF);
2574       CGF.EmitBlock(ParallelCheckBB);
2575       llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2576       llvm::Value *ThreadID = getThreadID(CGF, Loc);
2577       llvm::Value *PL = CGF.EmitRuntimeCall(
2578           createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2579           {RTLoc, ThreadID});
2580       llvm::Value *Res = Bld.CreateIsNotNull(PL);
2581       Bld.CreateCondBr(Res, SeqBB, MasterBB);
2582       CGF.EmitBlock(SeqBB);
2583       SeqGen(CGF, Action);
2584       CGF.EmitBranch(ExitBB);
2585       // There is no need to emit line number for unconditional branch.
2586       (void)ApplyDebugLocation::CreateEmpty(CGF);
2587       CGF.EmitBlock(MasterBB);
2588       L0ParallelGen(CGF, Action);
2589       CGF.EmitBranch(ExitBB);
2590       // There is no need to emit line number for unconditional branch.
2591       (void)ApplyDebugLocation::CreateEmpty(CGF);
2592       // Emit the continuation block for code after the if.
2593       CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2594     }
2595   };
2596
2597   if (IfCond) {
2598     emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
2599   } else {
2600     CodeGenFunction::RunCleanupsScope Scope(CGF);
2601     RegionCodeGenTy ThenRCG(LNParallelGen);
2602     ThenRCG(CGF);
2603   }
2604 }
2605
2606 void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
2607     CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn,
2608     ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2609   // Just call the outlined function to execute the parallel region.
2610   // OutlinedFn(&GTid, &zero, CapturedStruct);
2611   //
2612   llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2613
2614   Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2615                                            /*DestWidth=*/32, /*Signed=*/1),
2616                                        ".zero.addr");
2617   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2618   // ThreadId for serialized parallels is 0.
2619   Address ThreadIDAddr = ZeroAddr;
2620   auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
2621                     &ThreadIDAddr](CodeGenFunction &CGF,
2622                                    PrePostActionTy &Action) {
2623     Action.Enter(CGF);
2624
2625     llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2626     OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2627     OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2628     OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2629     emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2630   };
2631   auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2632                                         PrePostActionTy &) {
2633
2634     RegionCodeGenTy RCG(CodeGen);
2635     llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2636     llvm::Value *ThreadID = getThreadID(CGF, Loc);
2637     llvm::Value *Args[] = {RTLoc, ThreadID};
2638
2639     NVPTXActionTy Action(
2640         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2641         Args,
2642         createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2643         Args);
2644     RCG.setAction(Action);
2645     RCG(CGF);
2646   };
2647
2648   if (IsInTargetMasterThreadRegion) {
2649     // In the worker need to use the real thread id.
2650     ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
2651     RegionCodeGenTy RCG(CodeGen);
2652     RCG(CGF);
2653   } else {
2654     // If we are not in the target region, it is definitely L2 parallelism or
2655     // more, because for SPMD mode we always has L1 parallel level, sowe don't
2656     // need to check for orphaned directives.
2657     RegionCodeGenTy RCG(SeqGen);
2658     RCG(CGF);
2659   }
2660 }
2661
2662 void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
2663   // Always emit simple barriers!
2664   if (!CGF.HaveInsertPoint())
2665     return;
2666   // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
2667   // This function does not use parameters, so we can emit just default values.
2668   llvm::Value *Args[] = {
2669       llvm::ConstantPointerNull::get(
2670           cast<llvm::PointerType>(getIdentTyPointerTy())),
2671       llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
2672   CGF.EmitRuntimeCall(
2673       createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
2674 }
2675
2676 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
2677                                            SourceLocation Loc,
2678                                            OpenMPDirectiveKind Kind, bool,
2679                                            bool) {
2680   // Always emit simple barriers!
2681   if (!CGF.HaveInsertPoint())
2682     return;
2683   // Build call __kmpc_cancel_barrier(loc, thread_id);
2684   unsigned Flags = getDefaultFlagsForBarriers(Kind);
2685   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
2686                          getThreadID(CGF, Loc)};
2687   CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier), Args);
2688 }
2689
2690 void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2691     CodeGenFunction &CGF, StringRef CriticalName,
2692     const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2693     const Expr *Hint) {
2694   llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2695   llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2696   llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2697   llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2698   llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2699
2700   // Fetch team-local id of the thread.
2701   llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2702
2703   // Get the width of the team.
2704   llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2705
2706   // Initialize the counter variable for the loop.
2707   QualType Int32Ty =
2708       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2709   Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2710   LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2711   CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2712                         /*isInit=*/true);
2713
2714   // Block checks if loop counter exceeds upper bound.
2715   CGF.EmitBlock(LoopBB);
2716   llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2717   llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2718   CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2719
2720   // Block tests which single thread should execute region, and which threads
2721   // should go straight to synchronisation point.
2722   CGF.EmitBlock(TestBB);
2723   CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2724   llvm::Value *CmpThreadToCounter =
2725       CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2726   CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2727
2728   // Block emits the body of the critical region.
2729   CGF.EmitBlock(BodyBB);
2730
2731   // Output the critical statement.
2732   CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
2733                                       Hint);
2734
2735   // After the body surrounded by the critical region, the single executing
2736   // thread will jump to the synchronisation point.
2737   // Block waits for all threads in current team to finish then increments the
2738   // counter variable and returns to the loop.
2739   CGF.EmitBlock(SyncBB);
2740   emitBarrierCall(CGF, Loc, OMPD_unknown, /*EmitChecks=*/false,
2741                   /*ForceSimpleCall=*/true);
2742
2743   llvm::Value *IncCounterVal =
2744       CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2745   CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2746   CGF.EmitBranch(LoopBB);
2747
2748   // Block that is reached when  all threads in the team complete the region.
2749   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2750 }
2751
2752 /// Cast value to the specified type.
2753 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2754                                     QualType ValTy, QualType CastTy,
2755                                     SourceLocation Loc) {
2756   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2757          "Cast type must sized.");
2758   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2759          "Val type must sized.");
2760   llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2761   if (ValTy == CastTy)
2762     return Val;
2763   if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2764       CGF.getContext().getTypeSizeInChars(CastTy))
2765     return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2766   if (CastTy->isIntegerType() && ValTy->isIntegerType())
2767     return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2768                                      CastTy->hasSignedIntegerRepresentation());
2769   Address CastItem = CGF.CreateMemTemp(CastTy);
2770   Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2771       CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
2772   CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2773   return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
2774 }
2775
2776 /// This function creates calls to one of two shuffle functions to copy
2777 /// variables between lanes in a warp.
2778 static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
2779                                                  llvm::Value *Elem,
2780                                                  QualType ElemType,
2781                                                  llvm::Value *Offset,
2782                                                  SourceLocation Loc) {
2783   CodeGenModule &CGM = CGF.CGM;
2784   CGBuilderTy &Bld = CGF.Builder;
2785   CGOpenMPRuntimeNVPTX &RT =
2786       *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2787
2788   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2789   assert(Size.getQuantity() <= 8 &&
2790          "Unsupported bitwidth in shuffle instruction.");
2791
2792   OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
2793                                          ? OMPRTL_NVPTX__kmpc_shuffle_int32
2794                                          : OMPRTL_NVPTX__kmpc_shuffle_int64;
2795
2796   // Cast all types to 32- or 64-bit values before calling shuffle routines.
2797   QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2798       Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2799   llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
2800   llvm::Value *WarpSize =
2801       Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
2802
2803   llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2804       RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
2805
2806   return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
2807 }
2808
2809 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2810                             Address DestAddr, QualType ElemType,
2811                             llvm::Value *Offset, SourceLocation Loc) {
2812   CGBuilderTy &Bld = CGF.Builder;
2813
2814   CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2815   // Create the loop over the big sized data.
2816   // ptr = (void*)Elem;
2817   // ptrEnd = (void*) Elem + 1;
2818   // Step = 8;
2819   // while (ptr + Step < ptrEnd)
2820   //   shuffle((int64_t)*ptr);
2821   // Step = 4;
2822   // while (ptr + Step < ptrEnd)
2823   //   shuffle((int32_t)*ptr);
2824   // ...
2825   Address ElemPtr = DestAddr;
2826   Address Ptr = SrcAddr;
2827   Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2828       Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
2829   for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2830     if (Size < CharUnits::fromQuantity(IntSize))
2831       continue;
2832     QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2833         CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2834         /*Signed=*/1);
2835     llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2836     Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2837     ElemPtr =
2838         Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2839     if (Size.getQuantity() / IntSize > 1) {
2840       llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2841       llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2842       llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2843       llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2844       CGF.EmitBlock(PreCondBB);
2845       llvm::PHINode *PhiSrc =
2846           Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2847       PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2848       llvm::PHINode *PhiDest =
2849           Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2850       PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2851       Ptr = Address(PhiSrc, Ptr.getAlignment());
2852       ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2853       llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2854           PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2855                                    Ptr.getPointer(), CGF.VoidPtrTy));
2856       Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2857                        ThenBB, ExitBB);
2858       CGF.EmitBlock(ThenBB);
2859       llvm::Value *Res = createRuntimeShuffleFunction(
2860           CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2861           IntType, Offset, Loc);
2862       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2863       Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
2864       Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2865       PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
2866       PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
2867       CGF.EmitBranch(PreCondBB);
2868       CGF.EmitBlock(ExitBB);
2869     } else {
2870       llvm::Value *Res = createRuntimeShuffleFunction(
2871           CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2872           IntType, Offset, Loc);
2873       CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2874       Ptr = Bld.CreateConstGEP(Ptr, 1);
2875       ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
2876     }
2877     Size = Size % IntSize;
2878   }
2879 }
2880
2881 namespace {
2882 enum CopyAction : unsigned {
2883   // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2884   // the warp using shuffle instructions.
2885   RemoteLaneToThread,
2886   // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2887   ThreadCopy,
2888   // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2889   ThreadToScratchpad,
2890   // ScratchpadToThread: Copy from a scratchpad array in global memory
2891   // containing team-reduced data to a thread's stack.
2892   ScratchpadToThread,
2893 };
2894 } // namespace
2895
2896 struct CopyOptionsTy {
2897   llvm::Value *RemoteLaneOffset;
2898   llvm::Value *ScratchpadIndex;
2899   llvm::Value *ScratchpadWidth;
2900 };
2901
2902 /// Emit instructions to copy a Reduce list, which contains partially
2903 /// aggregated values, in the specified direction.
2904 static void emitReductionListCopy(
2905     CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2906     ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2907     CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
2908
2909   CodeGenModule &CGM = CGF.CGM;
2910   ASTContext &C = CGM.getContext();
2911   CGBuilderTy &Bld = CGF.Builder;
2912
2913   llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2914   llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2915   llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
2916
2917   // Iterates, element-by-element, through the source Reduce list and
2918   // make a copy.
2919   unsigned Idx = 0;
2920   unsigned Size = Privates.size();
2921   for (const Expr *Private : Privates) {
2922     Address SrcElementAddr = Address::invalid();
2923     Address DestElementAddr = Address::invalid();
2924     Address DestElementPtrAddr = Address::invalid();
2925     // Should we shuffle in an element from a remote lane?
2926     bool ShuffleInElement = false;
2927     // Set to true to update the pointer in the dest Reduce list to a
2928     // newly created element.
2929     bool UpdateDestListPtr = false;
2930     // Increment the src or dest pointer to the scratchpad, for each
2931     // new element.
2932     bool IncrScratchpadSrc = false;
2933     bool IncrScratchpadDest = false;
2934
2935     switch (Action) {
2936     case RemoteLaneToThread: {
2937       // Step 1.1: Get the address for the src element in the Reduce list.
2938       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2939       SrcElementAddr = CGF.EmitLoadOfPointer(
2940           SrcElementPtrAddr,
2941           C.getPointerType(Private->getType())->castAs<PointerType>());
2942
2943       // Step 1.2: Create a temporary to store the element in the destination
2944       // Reduce list.
2945       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2946       DestElementAddr =
2947           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2948       ShuffleInElement = true;
2949       UpdateDestListPtr = true;
2950       break;
2951     }
2952     case ThreadCopy: {
2953       // Step 1.1: Get the address for the src element in the Reduce list.
2954       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2955       SrcElementAddr = CGF.EmitLoadOfPointer(
2956           SrcElementPtrAddr,
2957           C.getPointerType(Private->getType())->castAs<PointerType>());
2958
2959       // Step 1.2: Get the address for dest element.  The destination
2960       // element has already been created on the thread's stack.
2961       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
2962       DestElementAddr = CGF.EmitLoadOfPointer(
2963           DestElementPtrAddr,
2964           C.getPointerType(Private->getType())->castAs<PointerType>());
2965       break;
2966     }
2967     case ThreadToScratchpad: {
2968       // Step 1.1: Get the address for the src element in the Reduce list.
2969       Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
2970       SrcElementAddr = CGF.EmitLoadOfPointer(
2971           SrcElementPtrAddr,
2972           C.getPointerType(Private->getType())->castAs<PointerType>());
2973
2974       // Step 1.2: Get the address for dest element:
2975       // address = base + index * ElementSizeInChars.
2976       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2977       llvm::Value *CurrentOffset =
2978           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2979       llvm::Value *ScratchPadElemAbsolutePtrVal =
2980           Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
2981       ScratchPadElemAbsolutePtrVal =
2982           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2983       DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2984                                 C.getTypeAlignInChars(Private->getType()));
2985       IncrScratchpadDest = true;
2986       break;
2987     }
2988     case ScratchpadToThread: {
2989       // Step 1.1: Get the address for the src element in the scratchpad.
2990       // address = base + index * ElementSizeInChars.
2991       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2992       llvm::Value *CurrentOffset =
2993           Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
2994       llvm::Value *ScratchPadElemAbsolutePtrVal =
2995           Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
2996       ScratchPadElemAbsolutePtrVal =
2997           Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2998       SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2999                                C.getTypeAlignInChars(Private->getType()));
3000       IncrScratchpadSrc = true;
3001
3002       // Step 1.2: Create a temporary to store the element in the destination
3003       // Reduce list.
3004       DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
3005       DestElementAddr =
3006           CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
3007       UpdateDestListPtr = true;
3008       break;
3009     }
3010     }
3011
3012     // Regardless of src and dest of copy, we emit the load of src
3013     // element as this is required in all directions
3014     SrcElementAddr = Bld.CreateElementBitCast(
3015         SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
3016     DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
3017                                                SrcElementAddr.getElementType());
3018
3019     // Now that all active lanes have read the element in the
3020     // Reduce list, shuffle over the value from the remote lane.
3021     if (ShuffleInElement) {
3022       shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
3023                       RemoteLaneOffset, Private->getExprLoc());
3024     } else {
3025       switch (CGF.getEvaluationKind(Private->getType())) {
3026       case TEK_Scalar: {
3027         llvm::Value *Elem =
3028             CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
3029                                  Private->getType(), Private->getExprLoc());
3030         // Store the source element value to the dest element address.
3031         CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
3032                               Private->getType());
3033         break;
3034       }
3035       case TEK_Complex: {
3036         CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
3037             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3038             Private->getExprLoc());
3039         CGF.EmitStoreOfComplex(
3040             Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3041             /*isInit=*/false);
3042         break;
3043       }
3044       case TEK_Aggregate:
3045         CGF.EmitAggregateCopy(
3046             CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
3047             CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
3048             Private->getType(), AggValueSlot::DoesNotOverlap);
3049         break;
3050       }
3051     }
3052
3053     // Step 3.1: Modify reference in dest Reduce list as needed.
3054     // Modifying the reference in Reduce list to point to the newly
3055     // created element.  The element is live in the current function
3056     // scope and that of functions it invokes (i.e., reduce_function).
3057     // RemoteReduceData[i] = (void*)&RemoteElem
3058     if (UpdateDestListPtr) {
3059       CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
3060                                 DestElementAddr.getPointer(), CGF.VoidPtrTy),
3061                             DestElementPtrAddr, /*Volatile=*/false,
3062                             C.VoidPtrTy);
3063     }
3064
3065     // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
3066     // address of the next element in scratchpad memory, unless we're currently
3067     // processing the last one.  Memory alignment is also taken care of here.
3068     if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
3069       llvm::Value *ScratchpadBasePtr =
3070           IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
3071       llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
3072       ScratchpadBasePtr = Bld.CreateNUWAdd(
3073           ScratchpadBasePtr,
3074           Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
3075
3076       // Take care of global memory alignment for performance
3077       ScratchpadBasePtr = Bld.CreateNUWSub(
3078           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3079       ScratchpadBasePtr = Bld.CreateUDiv(
3080           ScratchpadBasePtr,
3081           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3082       ScratchpadBasePtr = Bld.CreateNUWAdd(
3083           ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3084       ScratchpadBasePtr = Bld.CreateNUWMul(
3085           ScratchpadBasePtr,
3086           llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3087
3088       if (IncrScratchpadDest)
3089         DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3090       else /* IncrScratchpadSrc = true */
3091         SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3092     }
3093
3094     ++Idx;
3095   }
3096 }
3097
3098 /// This function emits a helper that gathers Reduce lists from the first
3099 /// lane of every active warp to lanes in the first warp.
3100 ///
3101 /// void inter_warp_copy_func(void* reduce_data, num_warps)
3102 ///   shared smem[warp_size];
3103 ///   For all data entries D in reduce_data:
3104 ///     sync
3105 ///     If (I am the first lane in each warp)
3106 ///       Copy my local D to smem[warp_id]
3107 ///     sync
3108 ///     if (I am the first warp)
3109 ///       Copy smem[thread_id] to my local D
3110 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3111                                               ArrayRef<const Expr *> Privates,
3112                                               QualType ReductionArrayTy,
3113                                               SourceLocation Loc) {
3114   ASTContext &C = CGM.getContext();
3115   llvm::Module &M = CGM.getModule();
3116
3117   // ReduceList: thread local Reduce list.
3118   // At the stage of the computation when this function is called, partially
3119   // aggregated values reside in the first lane of every active warp.
3120   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3121                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3122   // NumWarps: number of warps active in the parallel region.  This could
3123   // be smaller than 32 (max warps in a CTA) for partial block reduction.
3124   ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3125                                 C.getIntTypeForBitwidth(32, /* Signed */ true),
3126                                 ImplicitParamDecl::Other);
3127   FunctionArgList Args;
3128   Args.push_back(&ReduceListArg);
3129   Args.push_back(&NumWarpsArg);
3130
3131   const CGFunctionInfo &CGFI =
3132       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3133   auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
3134                                     llvm::GlobalValue::InternalLinkage,
3135                                     "_omp_reduction_inter_warp_copy_func", &M);
3136   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3137   Fn->setDoesNotRecurse();
3138   CodeGenFunction CGF(CGM);
3139   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3140
3141   CGBuilderTy &Bld = CGF.Builder;
3142
3143   // This array is used as a medium to transfer, one reduce element at a time,
3144   // the data from the first lane of every warp to lanes in the first warp
3145   // in order to perform the final step of a reduction in a parallel region
3146   // (reduction across warps).  The array is placed in NVPTX __shared__ memory
3147   // for reduced latency, as well as to have a distinct copy for concurrently
3148   // executing target regions.  The array is declared with common linkage so
3149   // as to be shared across compilation units.
3150   StringRef TransferMediumName =
3151       "__openmp_nvptx_data_transfer_temporary_storage";
3152   llvm::GlobalVariable *TransferMedium =
3153       M.getGlobalVariable(TransferMediumName);
3154   if (!TransferMedium) {
3155     auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
3156     unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3157     TransferMedium = new llvm::GlobalVariable(
3158         M, Ty, /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3159         llvm::Constant::getNullValue(Ty), TransferMediumName,
3160         /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3161         SharedAddressSpace);
3162     CGM.addCompilerUsedGlobal(TransferMedium);
3163   }
3164
3165   // Get the CUDA thread id of the current OpenMP thread on the GPU.
3166   llvm::Value *ThreadID = getNVPTXThreadID(CGF);
3167   // nvptx_lane_id = nvptx_id % warpsize
3168   llvm::Value *LaneID = getNVPTXLaneID(CGF);
3169   // nvptx_warp_id = nvptx_id / warpsize
3170   llvm::Value *WarpID = getNVPTXWarpID(CGF);
3171
3172   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3173   Address LocalReduceList(
3174       Bld.CreatePointerBitCastOrAddrSpaceCast(
3175           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3176                                C.VoidPtrTy, Loc),
3177           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3178       CGF.getPointerAlign());
3179
3180   unsigned Idx = 0;
3181   for (const Expr *Private : Privates) {
3182     //
3183     // Warp master copies reduce element to transfer medium in __shared__
3184     // memory.
3185     //
3186     unsigned RealTySize =
3187         C.getTypeSizeInChars(Private->getType())
3188             .alignTo(C.getTypeAlignInChars(Private->getType()))
3189             .getQuantity();
3190     for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
3191       unsigned NumIters = RealTySize / TySize;
3192       if (NumIters == 0)
3193         continue;
3194       QualType CType = C.getIntTypeForBitwidth(
3195           C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
3196       llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
3197       CharUnits Align = CharUnits::fromQuantity(TySize);
3198       llvm::Value *Cnt = nullptr;
3199       Address CntAddr = Address::invalid();
3200       llvm::BasicBlock *PrecondBB = nullptr;
3201       llvm::BasicBlock *ExitBB = nullptr;
3202       if (NumIters > 1) {
3203         CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
3204         CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
3205                               /*Volatile=*/false, C.IntTy);
3206         PrecondBB = CGF.createBasicBlock("precond");
3207         ExitBB = CGF.createBasicBlock("exit");
3208         llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
3209         // There is no need to emit line number for unconditional branch.
3210         (void)ApplyDebugLocation::CreateEmpty(CGF);
3211         CGF.EmitBlock(PrecondBB);
3212         Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
3213         llvm::Value *Cmp =
3214             Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
3215         Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
3216         CGF.EmitBlock(BodyBB);
3217       }
3218       // kmpc_barrier.
3219       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3220                                              /*EmitChecks=*/false,
3221                                              /*ForceSimpleCall=*/true);
3222       llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3223       llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3224       llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3225
3226       // if (lane_id == 0)
3227       llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
3228       Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3229       CGF.EmitBlock(ThenBB);
3230
3231       // Reduce element = LocalReduceList[i]
3232       Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3233       llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3234           ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3235       // elemptr = ((CopyType*)(elemptrptr)) + I
3236       Address ElemPtr = Address(ElemPtrPtr, Align);
3237       ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
3238       if (NumIters > 1) {
3239         ElemPtr = Address(Bld.CreateGEP(ElemPtr.getPointer(), Cnt),
3240                           ElemPtr.getAlignment());
3241       }
3242
3243       // Get pointer to location in transfer medium.
3244       // MediumPtr = &medium[warp_id]
3245       llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3246           TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3247       Address MediumPtr(MediumPtrVal, Align);
3248       // Casting to actual data type.
3249       // MediumPtr = (CopyType*)MediumPtrAddr;
3250       MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
3251
3252       // elem = *elemptr
3253       //*MediumPtr = elem
3254       llvm::Value *Elem =
3255           CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false, CType, Loc);
3256       // Store the source element value to the dest element address.
3257       CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType);
3258
3259       Bld.CreateBr(MergeBB);
3260
3261       CGF.EmitBlock(ElseBB);
3262       Bld.CreateBr(MergeBB);
3263
3264       CGF.EmitBlock(MergeBB);
3265
3266       // kmpc_barrier.
3267       CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
3268                                              /*EmitChecks=*/false,
3269                                              /*ForceSimpleCall=*/true);
3270
3271       //
3272       // Warp 0 copies reduce element from transfer medium.
3273       //
3274       llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3275       llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3276       llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3277
3278       Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3279       llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3280           AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
3281
3282       // Up to 32 threads in warp 0 are active.
3283       llvm::Value *IsActiveThread =
3284           Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3285       Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3286
3287       CGF.EmitBlock(W0ThenBB);
3288
3289       // SrcMediumPtr = &medium[tid]
3290       llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3291           TransferMedium,
3292           {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3293       Address SrcMediumPtr(SrcMediumPtrVal, Align);
3294       // SrcMediumVal = *SrcMediumPtr;
3295       SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
3296
3297       // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3298       Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3299       llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3300           TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
3301       Address TargetElemPtr = Address(TargetElemPtrVal, Align);
3302       TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
3303       if (NumIters > 1) {
3304         TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getPointer(), Cnt),
3305                                 TargetElemPtr.getAlignment());
3306       }
3307
3308       // *TargetElemPtr = SrcMediumVal;
3309       llvm::Value *SrcMediumValue =
3310           CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
3311       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3312                             CType);
3313       Bld.CreateBr(W0MergeBB);
3314
3315       CGF.EmitBlock(W0ElseBB);
3316       Bld.CreateBr(W0MergeBB);
3317
3318       CGF.EmitBlock(W0MergeBB);
3319
3320       if (NumIters > 1) {
3321         Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
3322         CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
3323         CGF.EmitBranch(PrecondBB);
3324         (void)ApplyDebugLocation::CreateEmpty(CGF);
3325         CGF.EmitBlock(ExitBB);
3326       }
3327       RealTySize %= TySize;
3328     }
3329     ++Idx;
3330   }
3331
3332   CGF.FinishFunction();
3333   return Fn;
3334 }
3335
3336 /// Emit a helper that reduces data across two OpenMP threads (lanes)
3337 /// in the same warp.  It uses shuffle instructions to copy over data from
3338 /// a remote lane's stack.  The reduction algorithm performed is specified
3339 /// by the fourth parameter.
3340 ///
3341 /// Algorithm Versions.
3342 /// Full Warp Reduce (argument value 0):
3343 ///   This algorithm assumes that all 32 lanes are active and gathers
3344 ///   data from these 32 lanes, producing a single resultant value.
3345 /// Contiguous Partial Warp Reduce (argument value 1):
3346 ///   This algorithm assumes that only a *contiguous* subset of lanes
3347 ///   are active.  This happens for the last warp in a parallel region
3348 ///   when the user specified num_threads is not an integer multiple of
3349 ///   32.  This contiguous subset always starts with the zeroth lane.
3350 /// Partial Warp Reduce (argument value 2):
3351 ///   This algorithm gathers data from any number of lanes at any position.
3352 /// All reduced values are stored in the lowest possible lane.  The set
3353 /// of problems every algorithm addresses is a super set of those
3354 /// addressable by algorithms with a lower version number.  Overhead
3355 /// increases as algorithm version increases.
3356 ///
3357 /// Terminology
3358 /// Reduce element:
3359 ///   Reduce element refers to the individual data field with primitive
3360 ///   data types to be combined and reduced across threads.
3361 /// Reduce list:
3362 ///   Reduce list refers to a collection of local, thread-private
3363 ///   reduce elements.
3364 /// Remote Reduce list:
3365 ///   Remote Reduce list refers to a collection of remote (relative to
3366 ///   the current thread) reduce elements.
3367 ///
3368 /// We distinguish between three states of threads that are important to
3369 /// the implementation of this function.
3370 /// Alive threads:
3371 ///   Threads in a warp executing the SIMT instruction, as distinguished from
3372 ///   threads that are inactive due to divergent control flow.
3373 /// Active threads:
3374 ///   The minimal set of threads that has to be alive upon entry to this
3375 ///   function.  The computation is correct iff active threads are alive.
3376 ///   Some threads are alive but they are not active because they do not
3377 ///   contribute to the computation in any useful manner.  Turning them off
3378 ///   may introduce control flow overheads without any tangible benefits.
3379 /// Effective threads:
3380 ///   In order to comply with the argument requirements of the shuffle
3381 ///   function, we must keep all lanes holding data alive.  But at most
3382 ///   half of them perform value aggregation; we refer to this half of
3383 ///   threads as effective. The other half is simply handing off their
3384 ///   data.
3385 ///
3386 /// Procedure
3387 /// Value shuffle:
3388 ///   In this step active threads transfer data from higher lane positions
3389 ///   in the warp to lower lane positions, creating Remote Reduce list.
3390 /// Value aggregation:
3391 ///   In this step, effective threads combine their thread local Reduce list
3392 ///   with Remote Reduce list and store the result in the thread local
3393 ///   Reduce list.
3394 /// Value copy:
3395 ///   In this step, we deal with the assumption made by algorithm 2
3396 ///   (i.e. contiguity assumption).  When we have an odd number of lanes
3397 ///   active, say 2k+1, only k threads will be effective and therefore k
3398 ///   new values will be produced.  However, the Reduce list owned by the
3399 ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
3400 ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3401 ///   that the contiguity assumption still holds.
3402 static llvm::Function *emitShuffleAndReduceFunction(
3403     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3404     QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
3405   ASTContext &C = CGM.getContext();
3406
3407   // Thread local Reduce list used to host the values of data to be reduced.
3408   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3409                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3410   // Current lane id; could be logical.
3411   ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3412                               ImplicitParamDecl::Other);
3413   // Offset of the remote source lane relative to the current lane.
3414   ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3415                                         C.ShortTy, ImplicitParamDecl::Other);
3416   // Algorithm version.  This is expected to be known at compile time.
3417   ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3418                                C.ShortTy, ImplicitParamDecl::Other);
3419   FunctionArgList Args;
3420   Args.push_back(&ReduceListArg);
3421   Args.push_back(&LaneIDArg);
3422   Args.push_back(&RemoteLaneOffsetArg);
3423   Args.push_back(&AlgoVerArg);
3424
3425   const CGFunctionInfo &CGFI =
3426       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3427   auto *Fn = llvm::Function::Create(
3428       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3429       "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
3430   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3431   Fn->setDoesNotRecurse();
3432   if (CGM.getLangOpts().Optimize) {
3433     Fn->removeFnAttr(llvm::Attribute::NoInline);
3434     Fn->removeFnAttr(llvm::Attribute::OptimizeNone);
3435     Fn->addFnAttr(llvm::Attribute::AlwaysInline);
3436   }
3437
3438   CodeGenFunction CGF(CGM);
3439   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3440
3441   CGBuilderTy &Bld = CGF.Builder;
3442
3443   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3444   Address LocalReduceList(
3445       Bld.CreatePointerBitCastOrAddrSpaceCast(
3446           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3447                                C.VoidPtrTy, SourceLocation()),
3448           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3449       CGF.getPointerAlign());
3450
3451   Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3452   llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3453       AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3454
3455   Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3456   llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3457       AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3458
3459   Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3460   llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3461       AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3462
3463   // Create a local thread-private variable to host the Reduce list
3464   // from a remote lane.
3465   Address RemoteReduceList =
3466       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3467
3468   // This loop iterates through the list of reduce elements and copies,
3469   // element by element, from a remote lane in the warp to RemoteReduceList,
3470   // hosted on the thread's stack.
3471   emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3472                         LocalReduceList, RemoteReduceList,
3473                         {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3474                          /*ScratchpadIndex=*/nullptr,
3475                          /*ScratchpadWidth=*/nullptr});
3476
3477   // The actions to be performed on the Remote Reduce list is dependent
3478   // on the algorithm version.
3479   //
3480   //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3481   //  LaneId % 2 == 0 && Offset > 0):
3482   //    do the reduction value aggregation
3483   //
3484   //  The thread local variable Reduce list is mutated in place to host the
3485   //  reduced data, which is the aggregated value produced from local and
3486   //  remote lanes.
3487   //
3488   //  Note that AlgoVer is expected to be a constant integer known at compile
3489   //  time.
3490   //  When AlgoVer==0, the first conjunction evaluates to true, making
3491   //    the entire predicate true during compile time.
3492   //  When AlgoVer==1, the second conjunction has only the second part to be
3493   //    evaluated during runtime.  Other conjunctions evaluates to false
3494   //    during compile time.
3495   //  When AlgoVer==2, the third conjunction has only the second part to be
3496   //    evaluated during runtime.  Other conjunctions evaluates to false
3497   //    during compile time.
3498   llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
3499
3500   llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3501   llvm::Value *CondAlgo1 = Bld.CreateAnd(
3502       Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3503
3504   llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3505   llvm::Value *CondAlgo2 = Bld.CreateAnd(
3506       Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
3507   CondAlgo2 = Bld.CreateAnd(
3508       CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3509
3510   llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
3511   CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3512
3513   llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3514   llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3515   llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3516   Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3517
3518   CGF.EmitBlock(ThenBB);
3519   // reduce_function(LocalReduceList, RemoteReduceList)
3520   llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3521       LocalReduceList.getPointer(), CGF.VoidPtrTy);
3522   llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3523       RemoteReduceList.getPointer(), CGF.VoidPtrTy);
3524   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3525       CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
3526   Bld.CreateBr(MergeBB);
3527
3528   CGF.EmitBlock(ElseBB);
3529   Bld.CreateBr(MergeBB);
3530
3531   CGF.EmitBlock(MergeBB);
3532
3533   // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3534   // Reduce list.
3535   Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3536   llvm::Value *CondCopy = Bld.CreateAnd(
3537       Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3538
3539   llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3540   llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3541   llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3542   Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3543
3544   CGF.EmitBlock(CpyThenBB);
3545   emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3546                         RemoteReduceList, LocalReduceList);
3547   Bld.CreateBr(CpyMergeBB);
3548
3549   CGF.EmitBlock(CpyElseBB);
3550   Bld.CreateBr(CpyMergeBB);
3551
3552   CGF.EmitBlock(CpyMergeBB);
3553
3554   CGF.FinishFunction();
3555   return Fn;
3556 }
3557
3558 /// This function emits a helper that copies all the reduction variables from
3559 /// the team into the provided global buffer for the reduction variables.
3560 ///
3561 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3562 ///   For all data entries D in reduce_data:
3563 ///     Copy local D to buffer.D[Idx]
3564 static llvm::Value *emitListToGlobalCopyFunction(
3565     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3566     QualType ReductionArrayTy, SourceLocation Loc,
3567     const RecordDecl *TeamReductionRec,
3568     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3569         &VarFieldMap) {
3570   ASTContext &C = CGM.getContext();
3571
3572   // Buffer: global reduction buffer.
3573   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3574                               C.VoidPtrTy, ImplicitParamDecl::Other);
3575   // Idx: index of the buffer.
3576   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3577                            ImplicitParamDecl::Other);
3578   // ReduceList: thread local Reduce list.
3579   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3580                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3581   FunctionArgList Args;
3582   Args.push_back(&BufferArg);
3583   Args.push_back(&IdxArg);
3584   Args.push_back(&ReduceListArg);
3585
3586   const CGFunctionInfo &CGFI =
3587       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3588   auto *Fn = llvm::Function::Create(
3589       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3590       "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
3591   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3592   Fn->setDoesNotRecurse();
3593   CodeGenFunction CGF(CGM);
3594   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3595
3596   CGBuilderTy &Bld = CGF.Builder;
3597
3598   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3599   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3600   Address LocalReduceList(
3601       Bld.CreatePointerBitCastOrAddrSpaceCast(
3602           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3603                                C.VoidPtrTy, Loc),
3604           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3605       CGF.getPointerAlign());
3606   QualType StaticTy = C.getRecordType(TeamReductionRec);
3607   llvm::Type *LLVMReductionsBufferTy =
3608       CGM.getTypes().ConvertTypeForMem(StaticTy);
3609   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3610       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3611       LLVMReductionsBufferTy->getPointerTo());
3612   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3613                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3614                                               /*Volatile=*/false, C.IntTy,
3615                                               Loc)};
3616   unsigned Idx = 0;
3617   for (const Expr *Private : Privates) {
3618     // Reduce element = LocalReduceList[i]
3619     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3620     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3621         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3622     // elemptr = ((CopyType*)(elemptrptr)) + I
3623     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3624         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3625     Address ElemPtr =
3626         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3627     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3628     // Global = Buffer.VD[Idx];
3629     const FieldDecl *FD = VarFieldMap.lookup(VD);
3630     LValue GlobLVal = CGF.EmitLValueForField(
3631         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3632     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3633     GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3634     switch (CGF.getEvaluationKind(Private->getType())) {
3635     case TEK_Scalar: {
3636       llvm::Value *V = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3637                                             Private->getType(), Loc);
3638       CGF.EmitStoreOfScalar(V, GlobLVal);
3639       break;
3640     }
3641     case TEK_Complex: {
3642       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
3643           CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
3644       CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
3645       break;
3646     }
3647     case TEK_Aggregate:
3648       CGF.EmitAggregateCopy(GlobLVal,
3649                             CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3650                             Private->getType(), AggValueSlot::DoesNotOverlap);
3651       break;
3652     }
3653     ++Idx;
3654   }
3655
3656   CGF.FinishFunction();
3657   return Fn;
3658 }
3659
3660 /// This function emits a helper that reduces all the reduction variables from
3661 /// the team into the provided global buffer for the reduction variables.
3662 ///
3663 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
3664 ///  void *GlobPtrs[];
3665 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3666 ///  ...
3667 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3668 ///  reduce_function(GlobPtrs, reduce_data);
3669 static llvm::Value *emitListToGlobalReduceFunction(
3670     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3671     QualType ReductionArrayTy, SourceLocation Loc,
3672     const RecordDecl *TeamReductionRec,
3673     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3674         &VarFieldMap,
3675     llvm::Function *ReduceFn) {
3676   ASTContext &C = CGM.getContext();
3677
3678   // Buffer: global reduction buffer.
3679   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3680                               C.VoidPtrTy, ImplicitParamDecl::Other);
3681   // Idx: index of the buffer.
3682   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3683                            ImplicitParamDecl::Other);
3684   // ReduceList: thread local Reduce list.
3685   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3686                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3687   FunctionArgList Args;
3688   Args.push_back(&BufferArg);
3689   Args.push_back(&IdxArg);
3690   Args.push_back(&ReduceListArg);
3691
3692   const CGFunctionInfo &CGFI =
3693       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3694   auto *Fn = llvm::Function::Create(
3695       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3696       "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
3697   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3698   Fn->setDoesNotRecurse();
3699   CodeGenFunction CGF(CGM);
3700   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3701
3702   CGBuilderTy &Bld = CGF.Builder;
3703
3704   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3705   QualType StaticTy = C.getRecordType(TeamReductionRec);
3706   llvm::Type *LLVMReductionsBufferTy =
3707       CGM.getTypes().ConvertTypeForMem(StaticTy);
3708   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3709       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3710       LLVMReductionsBufferTy->getPointerTo());
3711
3712   // 1. Build a list of reduction variables.
3713   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3714   Address ReductionList =
3715       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3716   auto IPriv = Privates.begin();
3717   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3718                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3719                                               /*Volatile=*/false, C.IntTy,
3720                                               Loc)};
3721   unsigned Idx = 0;
3722   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3723     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3724     // Global = Buffer.VD[Idx];
3725     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3726     const FieldDecl *FD = VarFieldMap.lookup(VD);
3727     LValue GlobLVal = CGF.EmitLValueForField(
3728         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3729     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3730     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3731     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3732     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3733       // Store array size.
3734       ++Idx;
3735       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3736       llvm::Value *Size = CGF.Builder.CreateIntCast(
3737           CGF.getVLASize(
3738                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3739               .NumElts,
3740           CGF.SizeTy, /*isSigned=*/false);
3741       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3742                               Elem);
3743     }
3744   }
3745
3746   // Call reduce_function(GlobalReduceList, ReduceList)
3747   llvm::Value *GlobalReduceList =
3748       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3749   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3750   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3751       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3752   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3753       CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
3754   CGF.FinishFunction();
3755   return Fn;
3756 }
3757
3758 /// This function emits a helper that copies all the reduction variables from
3759 /// the team into the provided global buffer for the reduction variables.
3760 ///
3761 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
3762 ///   For all data entries D in reduce_data:
3763 ///     Copy buffer.D[Idx] to local D;
3764 static llvm::Value *emitGlobalToListCopyFunction(
3765     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3766     QualType ReductionArrayTy, SourceLocation Loc,
3767     const RecordDecl *TeamReductionRec,
3768     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3769         &VarFieldMap) {
3770   ASTContext &C = CGM.getContext();
3771
3772   // Buffer: global reduction buffer.
3773   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3774                               C.VoidPtrTy, ImplicitParamDecl::Other);
3775   // Idx: index of the buffer.
3776   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3777                            ImplicitParamDecl::Other);
3778   // ReduceList: thread local Reduce list.
3779   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3780                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3781   FunctionArgList Args;
3782   Args.push_back(&BufferArg);
3783   Args.push_back(&IdxArg);
3784   Args.push_back(&ReduceListArg);
3785
3786   const CGFunctionInfo &CGFI =
3787       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3788   auto *Fn = llvm::Function::Create(
3789       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3790       "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
3791   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3792   Fn->setDoesNotRecurse();
3793   CodeGenFunction CGF(CGM);
3794   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3795
3796   CGBuilderTy &Bld = CGF.Builder;
3797
3798   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3799   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3800   Address LocalReduceList(
3801       Bld.CreatePointerBitCastOrAddrSpaceCast(
3802           CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3803                                C.VoidPtrTy, Loc),
3804           CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3805       CGF.getPointerAlign());
3806   QualType StaticTy = C.getRecordType(TeamReductionRec);
3807   llvm::Type *LLVMReductionsBufferTy =
3808       CGM.getTypes().ConvertTypeForMem(StaticTy);
3809   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3810       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3811       LLVMReductionsBufferTy->getPointerTo());
3812
3813   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3814                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3815                                               /*Volatile=*/false, C.IntTy,
3816                                               Loc)};
3817   unsigned Idx = 0;
3818   for (const Expr *Private : Privates) {
3819     // Reduce element = LocalReduceList[i]
3820     Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
3821     llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3822         ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3823     // elemptr = ((CopyType*)(elemptrptr)) + I
3824     ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3825         ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
3826     Address ElemPtr =
3827         Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3828     const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
3829     // Global = Buffer.VD[Idx];
3830     const FieldDecl *FD = VarFieldMap.lookup(VD);
3831     LValue GlobLVal = CGF.EmitLValueForField(
3832         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3833     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3834     GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment()));
3835     switch (CGF.getEvaluationKind(Private->getType())) {
3836     case TEK_Scalar: {
3837       llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
3838       CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType());
3839       break;
3840     }
3841     case TEK_Complex: {
3842       CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
3843       CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3844                              /*isInit=*/false);
3845       break;
3846     }
3847     case TEK_Aggregate:
3848       CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3849                             GlobLVal, Private->getType(),
3850                             AggValueSlot::DoesNotOverlap);
3851       break;
3852     }
3853     ++Idx;
3854   }
3855
3856   CGF.FinishFunction();
3857   return Fn;
3858 }
3859
3860 /// This function emits a helper that reduces all the reduction variables from
3861 /// the team into the provided global buffer for the reduction variables.
3862 ///
3863 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
3864 ///  void *GlobPtrs[];
3865 ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
3866 ///  ...
3867 ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
3868 ///  reduce_function(reduce_data, GlobPtrs);
3869 static llvm::Value *emitGlobalToListReduceFunction(
3870     CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3871     QualType ReductionArrayTy, SourceLocation Loc,
3872     const RecordDecl *TeamReductionRec,
3873     const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
3874         &VarFieldMap,
3875     llvm::Function *ReduceFn) {
3876   ASTContext &C = CGM.getContext();
3877
3878   // Buffer: global reduction buffer.
3879   ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3880                               C.VoidPtrTy, ImplicitParamDecl::Other);
3881   // Idx: index of the buffer.
3882   ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
3883                            ImplicitParamDecl::Other);
3884   // ReduceList: thread local Reduce list.
3885   ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3886                                   C.VoidPtrTy, ImplicitParamDecl::Other);
3887   FunctionArgList Args;
3888   Args.push_back(&BufferArg);
3889   Args.push_back(&IdxArg);
3890   Args.push_back(&ReduceListArg);
3891
3892   const CGFunctionInfo &CGFI =
3893       CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
3894   auto *Fn = llvm::Function::Create(
3895       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3896       "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
3897   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3898   Fn->setDoesNotRecurse();
3899   CodeGenFunction CGF(CGM);
3900   CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
3901
3902   CGBuilderTy &Bld = CGF.Builder;
3903
3904   Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
3905   QualType StaticTy = C.getRecordType(TeamReductionRec);
3906   llvm::Type *LLVMReductionsBufferTy =
3907       CGM.getTypes().ConvertTypeForMem(StaticTy);
3908   llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3909       CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
3910       LLVMReductionsBufferTy->getPointerTo());
3911
3912   // 1. Build a list of reduction variables.
3913   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3914   Address ReductionList =
3915       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3916   auto IPriv = Privates.begin();
3917   llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
3918                          CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
3919                                               /*Volatile=*/false, C.IntTy,
3920                                               Loc)};
3921   unsigned Idx = 0;
3922   for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
3923     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3924     // Global = Buffer.VD[Idx];
3925     const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
3926     const FieldDecl *FD = VarFieldMap.lookup(VD);
3927     LValue GlobLVal = CGF.EmitLValueForField(
3928         CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
3929     llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs);
3930     llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
3931     CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
3932     if ((*IPriv)->getType()->isVariablyModifiedType()) {
3933       // Store array size.
3934       ++Idx;
3935       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3936       llvm::Value *Size = CGF.Builder.CreateIntCast(
3937           CGF.getVLASize(
3938                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3939               .NumElts,
3940           CGF.SizeTy, /*isSigned=*/false);
3941       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3942                               Elem);
3943     }
3944   }
3945
3946   // Call reduce_function(ReduceList, GlobalReduceList)
3947   llvm::Value *GlobalReduceList =
3948       CGF.EmitCastToVoidPtr(ReductionList.getPointer());
3949   Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3950   llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
3951       AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
3952   CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3953       CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
3954   CGF.FinishFunction();
3955   return Fn;
3956 }
3957
3958 ///
3959 /// Design of OpenMP reductions on the GPU
3960 ///
3961 /// Consider a typical OpenMP program with one or more reduction
3962 /// clauses:
3963 ///
3964 /// float foo;
3965 /// double bar;
3966 /// #pragma omp target teams distribute parallel for \
3967 ///             reduction(+:foo) reduction(*:bar)
3968 /// for (int i = 0; i < N; i++) {
3969 ///   foo += A[i]; bar *= B[i];
3970 /// }
3971 ///
3972 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
3973 /// all teams.  In our OpenMP implementation on the NVPTX device an
3974 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3975 /// within a team are mapped to CUDA threads within a threadblock.
3976 /// Our goal is to efficiently aggregate values across all OpenMP
3977 /// threads such that:
3978 ///
3979 ///   - the compiler and runtime are logically concise, and
3980 ///   - the reduction is performed efficiently in a hierarchical
3981 ///     manner as follows: within OpenMP threads in the same warp,
3982 ///     across warps in a threadblock, and finally across teams on
3983 ///     the NVPTX device.
3984 ///
3985 /// Introduction to Decoupling
3986 ///
3987 /// We would like to decouple the compiler and the runtime so that the
3988 /// latter is ignorant of the reduction variables (number, data types)
3989 /// and the reduction operators.  This allows a simpler interface
3990 /// and implementation while still attaining good performance.
3991 ///
3992 /// Pseudocode for the aforementioned OpenMP program generated by the
3993 /// compiler is as follows:
3994 ///
3995 /// 1. Create private copies of reduction variables on each OpenMP
3996 ///    thread: 'foo_private', 'bar_private'
3997 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3998 ///    to it and writes the result in 'foo_private' and 'bar_private'
3999 ///    respectively.
4000 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
4001 ///    and store the result on the team master:
4002 ///
4003 ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
4004 ///        reduceData, shuffleReduceFn, interWarpCpyFn)
4005 ///
4006 ///     where:
4007 ///       struct ReduceData {
4008 ///         double *foo;
4009 ///         double *bar;
4010 ///       } reduceData
4011 ///       reduceData.foo = &foo_private
4012 ///       reduceData.bar = &bar_private
4013 ///
4014 ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
4015 ///     auxiliary functions generated by the compiler that operate on
4016 ///     variables of type 'ReduceData'.  They aid the runtime perform
4017 ///     algorithmic steps in a data agnostic manner.
4018 ///
4019 ///     'shuffleReduceFn' is a pointer to a function that reduces data
4020 ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
4021 ///     same warp.  It takes the following arguments as input:
4022 ///
4023 ///     a. variable of type 'ReduceData' on the calling lane,
4024 ///     b. its lane_id,
4025 ///     c. an offset relative to the current lane_id to generate a
4026 ///        remote_lane_id.  The remote lane contains the second
4027 ///        variable of type 'ReduceData' that is to be reduced.
4028 ///     d. an algorithm version parameter determining which reduction
4029 ///        algorithm to use.
4030 ///
4031 ///     'shuffleReduceFn' retrieves data from the remote lane using
4032 ///     efficient GPU shuffle intrinsics and reduces, using the
4033 ///     algorithm specified by the 4th parameter, the two operands
4034 ///     element-wise.  The result is written to the first operand.
4035 ///
4036 ///     Different reduction algorithms are implemented in different
4037 ///     runtime functions, all calling 'shuffleReduceFn' to perform
4038 ///     the essential reduction step.  Therefore, based on the 4th
4039 ///     parameter, this function behaves slightly differently to
4040 ///     cooperate with the runtime to ensure correctness under
4041 ///     different circumstances.
4042 ///
4043 ///     'InterWarpCpyFn' is a pointer to a function that transfers
4044 ///     reduced variables across warps.  It tunnels, through CUDA
4045 ///     shared memory, the thread-private data of type 'ReduceData'
4046 ///     from lane 0 of each warp to a lane in the first warp.
4047 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
4048 ///    The last team writes the global reduced value to memory.
4049 ///
4050 ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
4051 ///             reduceData, shuffleReduceFn, interWarpCpyFn,
4052 ///             scratchpadCopyFn, loadAndReduceFn)
4053 ///
4054 ///     'scratchpadCopyFn' is a helper that stores reduced
4055 ///     data from the team master to a scratchpad array in
4056 ///     global memory.
4057 ///
4058 ///     'loadAndReduceFn' is a helper that loads data from
4059 ///     the scratchpad array and reduces it with the input
4060 ///     operand.
4061 ///
4062 ///     These compiler generated functions hide address
4063 ///     calculation and alignment information from the runtime.
4064 /// 5. if ret == 1:
4065 ///     The team master of the last team stores the reduced
4066 ///     result to the globals in memory.
4067 ///     foo += reduceData.foo; bar *= reduceData.bar
4068 ///
4069 ///
4070 /// Warp Reduction Algorithms
4071 ///
4072 /// On the warp level, we have three algorithms implemented in the
4073 /// OpenMP runtime depending on the number of active lanes:
4074 ///
4075 /// Full Warp Reduction
4076 ///
4077 /// The reduce algorithm within a warp where all lanes are active
4078 /// is implemented in the runtime as follows:
4079 ///
4080 /// full_warp_reduce(void *reduce_data,
4081 ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4082 ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
4083 ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
4084 /// }
4085 ///
4086 /// The algorithm completes in log(2, WARPSIZE) steps.
4087 ///
4088 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
4089 /// not used therefore we save instructions by not retrieving lane_id
4090 /// from the corresponding special registers.  The 4th parameter, which
4091 /// represents the version of the algorithm being used, is set to 0 to
4092 /// signify full warp reduction.
4093 ///
4094 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4095 ///
4096 /// #reduce_elem refers to an element in the local lane's data structure
4097 /// #remote_elem is retrieved from a remote lane
4098 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4099 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
4100 ///
4101 /// Contiguous Partial Warp Reduction
4102 ///
4103 /// This reduce algorithm is used within a warp where only the first
4104 /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
4105 /// number of OpenMP threads in a parallel region is not a multiple of
4106 /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
4107 ///
4108 /// void
4109 /// contiguous_partial_reduce(void *reduce_data,
4110 ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
4111 ///                           int size, int lane_id) {
4112 ///   int curr_size;
4113 ///   int offset;
4114 ///   curr_size = size;
4115 ///   mask = curr_size/2;
4116 ///   while (offset>0) {
4117 ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
4118 ///     curr_size = (curr_size+1)/2;
4119 ///     offset = curr_size/2;
4120 ///   }
4121 /// }
4122 ///
4123 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4124 ///
4125 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4126 /// if (lane_id < offset)
4127 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4128 /// else
4129 ///     reduce_elem = remote_elem
4130 ///
4131 /// This algorithm assumes that the data to be reduced are located in a
4132 /// contiguous subset of lanes starting from the first.  When there is
4133 /// an odd number of active lanes, the data in the last lane is not
4134 /// aggregated with any other lane's dat but is instead copied over.
4135 ///
4136 /// Dispersed Partial Warp Reduction
4137 ///
4138 /// This algorithm is used within a warp when any discontiguous subset of
4139 /// lanes are active.  It is used to implement the reduction operation
4140 /// across lanes in an OpenMP simd region or in a nested parallel region.
4141 ///
4142 /// void
4143 /// dispersed_partial_reduce(void *reduce_data,
4144 ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
4145 ///   int size, remote_id;
4146 ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
4147 ///   do {
4148 ///       remote_id = next_active_lane_id_right_after_me();
4149 ///       # the above function returns 0 of no active lane
4150 ///       # is present right after the current lane.
4151 ///       size = number_of_active_lanes_in_this_warp();
4152 ///       logical_lane_id /= 2;
4153 ///       ShuffleReduceFn(reduce_data, logical_lane_id,
4154 ///                       remote_id-1-threadIdx.x, 2);
4155 ///   } while (logical_lane_id % 2 == 0 && size > 1);
4156 /// }
4157 ///
4158 /// There is no assumption made about the initial state of the reduction.
4159 /// Any number of lanes (>=1) could be active at any position.  The reduction
4160 /// result is returned in the first active lane.
4161 ///
4162 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
4163 ///
4164 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
4165 /// if (lane_id % 2 == 0 && offset > 0)
4166 ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
4167 /// else
4168 ///     reduce_elem = remote_elem
4169 ///
4170 ///
4171 /// Intra-Team Reduction
4172 ///
4173 /// This function, as implemented in the runtime call
4174 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
4175 /// threads in a team.  It first reduces within a warp using the
4176 /// aforementioned algorithms.  We then proceed to gather all such
4177 /// reduced values at the first warp.
4178 ///
4179 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
4180 /// data from each of the "warp master" (zeroth lane of each warp, where
4181 /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
4182 /// a mathematical sense) the problem of reduction across warp masters in
4183 /// a block to the problem of warp reduction.
4184 ///
4185 ///
4186 /// Inter-Team Reduction
4187 ///
4188 /// Once a team has reduced its data to a single value, it is stored in
4189 /// a global scratchpad array.  Since each team has a distinct slot, this
4190 /// can be done without locking.
4191 ///
4192 /// The last team to write to the scratchpad array proceeds to reduce the
4193 /// scratchpad array.  One or more workers in the last team use the helper
4194 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
4195 /// the k'th worker reduces every k'th element.
4196 ///
4197 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
4198 /// reduce across workers and compute a globally reduced value.
4199 ///
4200 void CGOpenMPRuntimeNVPTX::emitReduction(
4201     CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
4202     ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
4203     ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
4204   if (!CGF.HaveInsertPoint())
4205     return;
4206
4207   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
4208 #ifndef NDEBUG
4209   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
4210 #endif
4211
4212   if (Options.SimpleReduction) {
4213     assert(!TeamsReduction && !ParallelReduction &&
4214            "Invalid reduction selection in emitReduction.");
4215     CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
4216                                    ReductionOps, Options);
4217     return;
4218   }
4219
4220   assert((TeamsReduction || ParallelReduction) &&
4221          "Invalid reduction selection in emitReduction.");
4222
4223   // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4224   // RedList, shuffle_reduce_func, interwarp_copy_func);
4225   // or
4226   // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4227   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
4228   llvm::Value *ThreadId = getThreadID(CGF, Loc);
4229
4230   llvm::Value *Res;
4231   ASTContext &C = CGM.getContext();
4232   // 1. Build a list of reduction variables.
4233   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4234   auto Size = RHSExprs.size();
4235   for (const Expr *E : Privates) {
4236     if (E->getType()->isVariablyModifiedType())
4237       // Reserve place for array size.
4238       ++Size;
4239   }
4240   llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
4241   QualType ReductionArrayTy =
4242       C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
4243                              /*IndexTypeQuals=*/0);
4244   Address ReductionList =
4245       CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
4246   auto IPriv = Privates.begin();
4247   unsigned Idx = 0;
4248   for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
4249     Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4250     CGF.Builder.CreateStore(
4251         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4252             CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
4253         Elem);
4254     if ((*IPriv)->getType()->isVariablyModifiedType()) {
4255       // Store array size.
4256       ++Idx;
4257       Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
4258       llvm::Value *Size = CGF.Builder.CreateIntCast(
4259           CGF.getVLASize(
4260                  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
4261               .NumElts,
4262           CGF.SizeTy, /*isSigned=*/false);
4263       CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
4264                               Elem);
4265     }
4266   }
4267
4268   llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4269       ReductionList.getPointer(), CGF.VoidPtrTy);
4270   llvm::Function *ReductionFn = emitReductionFunction(
4271       Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
4272       LHSExprs, RHSExprs, ReductionOps);
4273   llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
4274   llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
4275       CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
4276   llvm::Value *InterWarpCopyFn =
4277       emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
4278
4279   if (ParallelReduction) {
4280     llvm::Value *Args[] = {RTLoc,
4281                            ThreadId,
4282                            CGF.Builder.getInt32(RHSExprs.size()),
4283                            ReductionArrayTySize,
4284                            RL,
4285                            ShuffleAndReduceFn,
4286                            InterWarpCopyFn};
4287
4288     Res = CGF.EmitRuntimeCall(
4289         createNVPTXRuntimeFunction(
4290             OMPRTL_NVPTX__kmpc_nvptx_parallel_reduce_nowait_v2),
4291         Args);
4292   } else {
4293     assert(TeamsReduction && "expected teams reduction.");
4294     llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
4295     llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
4296     int Cnt = 0;
4297     for (const Expr *DRE : Privates) {
4298       PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
4299       ++Cnt;
4300     }
4301     const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
4302         CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
4303         C.getLangOpts().OpenMPCUDAReductionBufNum);
4304     TeamsReductions.push_back(TeamReductionRec);
4305     if (!KernelTeamsReductionPtr) {
4306       KernelTeamsReductionPtr = new llvm::GlobalVariable(
4307           CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
4308           llvm::GlobalValue::InternalLinkage, nullptr,
4309           "_openmp_teams_reductions_buffer_$_$ptr");
4310     }
4311     llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
4312         Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
4313         /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
4314     llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
4315         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4316     llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
4317         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4318         ReductionFn);
4319     llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
4320         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
4321     llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
4322         CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
4323         ReductionFn);
4324
4325     llvm::Value *Args[] = {
4326         RTLoc,
4327         ThreadId,
4328         GlobalBufferPtr,
4329         CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
4330         RL,
4331         ShuffleAndReduceFn,
4332         InterWarpCopyFn,
4333         GlobalToBufferCpyFn,
4334         GlobalToBufferRedFn,
4335         BufferToGlobalCpyFn,
4336         BufferToGlobalRedFn};
4337
4338     Res = CGF.EmitRuntimeCall(
4339         createNVPTXRuntimeFunction(
4340             OMPRTL_NVPTX__kmpc_nvptx_teams_reduce_nowait_v2),
4341         Args);
4342   }
4343
4344   // 5. Build if (res == 1)
4345   llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
4346   llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
4347   llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
4348       Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
4349   CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4350
4351   // 6. Build then branch: where we have reduced values in the master
4352   //    thread in each team.
4353   //    __kmpc_end_reduce{_nowait}(<gtid>);
4354   //    break;
4355   CGF.EmitBlock(ThenBB);
4356
4357   // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4358   auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
4359                     this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4360     auto IPriv = Privates.begin();
4361     auto ILHS = LHSExprs.begin();
4362     auto IRHS = RHSExprs.begin();
4363     for (const Expr *E : ReductionOps) {
4364       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4365                                   cast<DeclRefExpr>(*IRHS));
4366       ++IPriv;
4367       ++ILHS;
4368       ++IRHS;
4369     }
4370   };
4371   llvm::Value *EndArgs[] = {ThreadId};
4372   RegionCodeGenTy RCG(CodeGen);
4373   NVPTXActionTy Action(
4374       nullptr, llvm::None,
4375       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4376       EndArgs);
4377   RCG.setAction(Action);
4378   RCG(CGF);
4379   // There is no need to emit line number for unconditional branch.
4380   (void)ApplyDebugLocation::CreateEmpty(CGF);
4381   CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
4382 }
4383
4384 const VarDecl *
4385 CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4386                                          const VarDecl *NativeParam) const {
4387   if (!NativeParam->getType()->isReferenceType())
4388     return NativeParam;
4389   QualType ArgType = NativeParam->getType();
4390   QualifierCollector QC;
4391   const Type *NonQualTy = QC.strip(ArgType);
4392   QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4393   if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4394     if (Attr->getCaptureKind() == OMPC_map) {
4395       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4396                                                         LangAS::opencl_global);
4397     } else if (Attr->getCaptureKind() == OMPC_firstprivate &&
4398                PointeeTy.isConstant(CGM.getContext())) {
4399       PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4400                                                         LangAS::opencl_generic);
4401     }
4402   }
4403   ArgType = CGM.getContext().getPointerType(PointeeTy);
4404   QC.addRestrict();
4405   enum { NVPTX_local_addr = 5 };
4406   QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
4407   ArgType = QC.apply(CGM.getContext(), ArgType);
4408   if (isa<ImplicitParamDecl>(NativeParam))
4409     return ImplicitParamDecl::Create(
4410         CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4411         NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
4412   return ParmVarDecl::Create(
4413       CGM.getContext(),
4414       const_cast<DeclContext *>(NativeParam->getDeclContext()),
4415       NativeParam->getBeginLoc(), NativeParam->getLocation(),
4416       NativeParam->getIdentifier(), ArgType,
4417       /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
4418 }
4419
4420 Address
4421 CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4422                                           const VarDecl *NativeParam,
4423                                           const VarDecl *TargetParam) const {
4424   assert(NativeParam != TargetParam &&
4425          NativeParam->getType()->isReferenceType() &&
4426          "Native arg must not be the same as target arg.");
4427   Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4428   QualType NativeParamType = NativeParam->getType();
4429   QualifierCollector QC;
4430   const Type *NonQualTy = QC.strip(NativeParamType);
4431   QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4432   unsigned NativePointeeAddrSpace =
4433       CGF.getContext().getTargetAddressSpace(NativePointeeTy);
4434   QualType TargetTy = TargetParam->getType();
4435   llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
4436       LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
4437   // First cast to generic.
4438   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4439       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4440                       /*AddrSpace=*/0));
4441   // Cast from generic to native address space.
4442   TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4443       TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4444                       NativePointeeAddrSpace));
4445   Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4446   CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
4447                         NativeParamType);
4448   return NativeParamAddr;
4449 }
4450
4451 void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
4452     CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
4453     ArrayRef<llvm::Value *> Args) const {
4454   SmallVector<llvm::Value *, 4> TargetArgs;
4455   TargetArgs.reserve(Args.size());
4456   auto *FnType = OutlinedFn.getFunctionType();
4457   for (unsigned I = 0, E = Args.size(); I < E; ++I) {
4458     if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4459       TargetArgs.append(std::next(Args.begin(), I), Args.end());
4460       break;
4461     }
4462     llvm::Type *TargetType = FnType->getParamType(I);
4463     llvm::Value *NativeArg = Args[I];
4464     if (!TargetType->isPointerTy()) {
4465       TargetArgs.emplace_back(NativeArg);
4466       continue;
4467     }
4468     llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4469         NativeArg,
4470         NativeArg->getType()->getPointerElementType()->getPointerTo());
4471     TargetArgs.emplace_back(
4472         CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4473   }
4474   CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
4475 }
4476
4477 /// Emit function which wraps the outline parallel region
4478 /// and controls the arguments which are passed to this function.
4479 /// The wrapper ensures that the outlined function is called
4480 /// with the correct arguments when data is shared.
4481 llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4482     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4483   ASTContext &Ctx = CGM.getContext();
4484   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4485
4486   // Create a function that takes as argument the source thread.
4487   FunctionArgList WrapperArgs;
4488   QualType Int16QTy =
4489       Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4490   QualType Int32QTy =
4491       Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
4492   ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4493                                      /*Id=*/nullptr, Int16QTy,
4494                                      ImplicitParamDecl::Other);
4495   ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
4496                                /*Id=*/nullptr, Int32QTy,
4497                                ImplicitParamDecl::Other);
4498   WrapperArgs.emplace_back(&ParallelLevelArg);
4499   WrapperArgs.emplace_back(&WrapperArg);
4500
4501   const CGFunctionInfo &CGFI =
4502       CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4503
4504   auto *Fn = llvm::Function::Create(
4505       CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
4506       Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
4507   CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
4508   Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
4509   Fn->setDoesNotRecurse();
4510
4511   CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4512   CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
4513                     D.getBeginLoc(), D.getBeginLoc());
4514
4515   const auto *RD = CS.getCapturedRecordDecl();
4516   auto CurField = RD->field_begin();
4517
4518   Address ZeroAddr = CGF.CreateMemTemp(
4519       CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4520       /*Name*/ ".zero.addr");
4521   CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
4522   // Get the array of arguments.
4523   SmallVector<llvm::Value *, 8> Args;
4524
4525   Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4526   Args.emplace_back(ZeroAddr.getPointer());
4527
4528   CGBuilderTy &Bld = CGF.Builder;
4529   auto CI = CS.capture_begin();
4530
4531   // Use global memory for data sharing.
4532   // Handle passing of global args to workers.
4533   Address GlobalArgs =
4534       CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4535   llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4536   llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4537   CGF.EmitRuntimeCall(
4538       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4539       DataSharingArgs);
4540
4541   // Retrieve the shared variables from the list of references returned
4542   // by the runtime. Pass the variables to the outlined function.
4543   Address SharedArgListAddress = Address::invalid();
4544   if (CS.capture_size() > 0 ||
4545       isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4546     SharedArgListAddress = CGF.EmitLoadOfPointer(
4547         GlobalArgs, CGF.getContext()
4548                         .getPointerType(CGF.getContext().getPointerType(
4549                             CGF.getContext().VoidPtrTy))
4550                         .castAs<PointerType>());
4551   }
4552   unsigned Idx = 0;
4553   if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4554     Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4555     Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4556         Src, CGF.SizeTy->getPointerTo());
4557     llvm::Value *LB = CGF.EmitLoadOfScalar(
4558         TypedAddress,
4559         /*Volatile=*/false,
4560         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4561         cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4562     Args.emplace_back(LB);
4563     ++Idx;
4564     Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
4565     TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4566         Src, CGF.SizeTy->getPointerTo());
4567     llvm::Value *UB = CGF.EmitLoadOfScalar(
4568         TypedAddress,
4569         /*Volatile=*/false,
4570         CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4571         cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4572     Args.emplace_back(UB);
4573     ++Idx;
4574   }
4575   if (CS.capture_size() > 0) {
4576     ASTContext &CGFContext = CGF.getContext();
4577     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4578       QualType ElemTy = CurField->getType();
4579       Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
4580       Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4581           Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4582       llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4583                                               /*Volatile=*/false,
4584                                               CGFContext.getPointerType(ElemTy),
4585                                               CI->getLocation());
4586       if (CI->capturesVariableByCopy() &&
4587           !CI->getCapturedVar()->getType()->isAnyPointerType()) {
4588         Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4589                               CI->getLocation());
4590       }
4591       Args.emplace_back(Arg);
4592     }
4593   }
4594
4595   emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
4596   CGF.FinishFunction();
4597   return Fn;
4598 }
4599
4600 void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
4601                                               const Decl *D) {
4602   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4603     return;
4604
4605   assert(D && "Expected function or captured|block decl.");
4606   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
4607          "Function is registered already.");
4608   assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
4609          "Team is set but not processed.");
4610   const Stmt *Body = nullptr;
4611   bool NeedToDelayGlobalization = false;
4612   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
4613     Body = FD->getBody();
4614   } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
4615     Body = BD->getBody();
4616   } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
4617     Body = CD->getBody();
4618     NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
4619     if (NeedToDelayGlobalization &&
4620         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
4621       return;
4622   }
4623   if (!Body)
4624     return;
4625   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
4626   VarChecker.Visit(Body);
4627   const RecordDecl *GlobalizedVarsRecord =
4628       VarChecker.getGlobalizedRecord(IsInTTDRegion);
4629   TeamAndReductions.first = nullptr;
4630   TeamAndReductions.second.clear();
4631   ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
4632       VarChecker.getEscapedVariableLengthDecls();
4633   if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
4634     return;
4635   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
4636   I->getSecond().MappedParams =
4637       llvm::make_unique<CodeGenFunction::OMPMapVars>();
4638   I->getSecond().GlobalRecord = GlobalizedVarsRecord;
4639   I->getSecond().EscapedParameters.insert(
4640       VarChecker.getEscapedParameters().begin(),
4641       VarChecker.getEscapedParameters().end());
4642   I->getSecond().EscapedVariableLengthDecls.append(
4643       EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
4644   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
4645   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4646     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4647     const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4648     Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
4649   }
4650   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
4651     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
4652     VarChecker.Visit(Body);
4653     I->getSecond().SecondaryGlobalRecord =
4654         VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
4655     I->getSecond().SecondaryLocalVarData.emplace();
4656     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
4657     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
4658       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
4659       const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
4660       Data.insert(
4661           std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
4662     }
4663   }
4664   if (!NeedToDelayGlobalization) {
4665     emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
4666     struct GlobalizationScope final : EHScopeStack::Cleanup {
4667       GlobalizationScope() = default;
4668
4669       void Emit(CodeGenFunction &CGF, Flags flags) override {
4670         static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
4671             .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
4672       }
4673     };
4674     CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
4675   }
4676 }
4677
4678 Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4679                                                         const VarDecl *VD) {
4680   if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
4681     const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4682     switch (A->getAllocatorType()) {
4683       // Use the default allocator here as by default local vars are
4684       // threadlocal.
4685     case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4686     case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4687     case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4688     case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4689       // Follow the user decision - use default allocation.
4690       return Address::invalid();
4691     case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4692       // TODO: implement aupport for user-defined allocators.
4693       return Address::invalid();
4694     case OMPAllocateDeclAttr::OMPConstMemAlloc: {
4695       llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4696       auto *GV = new llvm::GlobalVariable(
4697           CGM.getModule(), VarTy, /*isConstant=*/false,
4698           llvm::GlobalValue::InternalLinkage,
4699           llvm::Constant::getNullValue(VarTy), VD->getName(),
4700           /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4701           CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant));
4702       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4703       GV->setAlignment(Align.getQuantity());
4704       return Address(GV, Align);
4705     }
4706     case OMPAllocateDeclAttr::OMPPTeamMemAlloc: {
4707       llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4708       auto *GV = new llvm::GlobalVariable(
4709           CGM.getModule(), VarTy, /*isConstant=*/false,
4710           llvm::GlobalValue::InternalLinkage,
4711           llvm::Constant::getNullValue(VarTy), VD->getName(),
4712           /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
4713           CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
4714       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4715       GV->setAlignment(Align.getQuantity());
4716       return Address(GV, Align);
4717     }
4718     case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4719     case OMPAllocateDeclAttr::OMPCGroupMemAlloc: {
4720       llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
4721       auto *GV = new llvm::GlobalVariable(
4722           CGM.getModule(), VarTy, /*isConstant=*/false,
4723           llvm::GlobalValue::InternalLinkage,
4724           llvm::Constant::getNullValue(VarTy), VD->getName());
4725       CharUnits Align = CGM.getContext().getDeclAlign(VD);
4726       GV->setAlignment(Align.getQuantity());
4727       return Address(GV, Align);
4728     }
4729     }
4730   }
4731
4732   if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4733     return Address::invalid();
4734
4735   VD = VD->getCanonicalDecl();
4736   auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4737   if (I == FunctionGlobalizedDecls.end())
4738     return Address::invalid();
4739   auto VDI = I->getSecond().LocalVarData.find(VD);
4740   if (VDI != I->getSecond().LocalVarData.end())
4741     return VDI->second.PrivateAddr;
4742   if (VD->hasAttrs()) {
4743     for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4744          E(VD->attr_end());
4745          IT != E; ++IT) {
4746       auto VDI = I->getSecond().LocalVarData.find(
4747           cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4748               ->getCanonicalDecl());
4749       if (VDI != I->getSecond().LocalVarData.end())
4750         return VDI->second.PrivateAddr;
4751     }
4752   }
4753
4754   return Address::invalid();
4755 }
4756
4757 void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
4758   FunctionGlobalizedDecls.erase(CGF.CurFn);
4759   CGOpenMPRuntime::functionFinished(CGF);
4760 }
4761
4762 void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4763     CodeGenFunction &CGF, const OMPLoopDirective &S,
4764     OpenMPDistScheduleClauseKind &ScheduleKind,
4765     llvm::Value *&Chunk) const {
4766   if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4767     ScheduleKind = OMPC_DIST_SCHEDULE_static;
4768     Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4769         CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4770         S.getIterationVariable()->getType(), S.getBeginLoc());
4771     return;
4772   }
4773   CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4774       CGF, S, ScheduleKind, Chunk);
4775 }
4776
4777 void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4778     CodeGenFunction &CGF, const OMPLoopDirective &S,
4779     OpenMPScheduleClauseKind &ScheduleKind,
4780     const Expr *&ChunkExpr) const {
4781   ScheduleKind = OMPC_SCHEDULE_static;
4782   // Chunk size is 1 in this case.
4783   llvm::APInt ChunkSize(32, 1);
4784   ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4785       CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4786       SourceLocation());
4787 }
4788
4789 void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
4790     CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
4791   assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
4792          " Expected target-based directive.");
4793   const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
4794   for (const CapturedStmt::Capture &C : CS->captures()) {
4795     // Capture variables captured by reference in lambdas for target-based
4796     // directives.
4797     if (!C.capturesVariable())
4798       continue;
4799     const VarDecl *VD = C.getCapturedVar();
4800     const auto *RD = VD->getType()
4801                          .getCanonicalType()
4802                          .getNonReferenceType()
4803                          ->getAsCXXRecordDecl();
4804     if (!RD || !RD->isLambda())
4805       continue;
4806     Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4807     LValue VDLVal;
4808     if (VD->getType().getCanonicalType()->isReferenceType())
4809       VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
4810     else
4811       VDLVal = CGF.MakeAddrLValue(
4812           VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
4813     llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
4814     FieldDecl *ThisCapture = nullptr;
4815     RD->getCaptureFields(Captures, ThisCapture);
4816     if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
4817       LValue ThisLVal =
4818           CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
4819       llvm::Value *CXXThis = CGF.LoadCXXThis();
4820       CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
4821     }
4822     for (const LambdaCapture &LC : RD->captures()) {
4823       if (LC.getCaptureKind() != LCK_ByRef)
4824         continue;
4825       const VarDecl *VD = LC.getCapturedVar();
4826       if (!CS->capturesVariable(VD))
4827         continue;
4828       auto It = Captures.find(VD);
4829       assert(It != Captures.end() && "Found lambda capture without field.");
4830       LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
4831       Address VDAddr = CGF.GetAddrOfLocalVar(VD);
4832       if (VD->getType().getCanonicalType()->isReferenceType())
4833         VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
4834                                                VD->getType().getCanonicalType())
4835                      .getAddress();
4836       CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
4837     }
4838   }
4839 }
4840
4841 unsigned CGOpenMPRuntimeNVPTX::getDefaultFirstprivateAddressSpace() const {
4842   return CGM.getContext().getTargetAddressSpace(LangAS::cuda_constant);
4843 }
4844
4845 bool CGOpenMPRuntimeNVPTX::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
4846                                                             LangAS &AS) {
4847   if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
4848     return false;
4849   const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
4850   switch(A->getAllocatorType()) {
4851   case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
4852   // Not supported, fallback to the default mem space.
4853   case OMPAllocateDeclAttr::OMPThreadMemAlloc:
4854   case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
4855   case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
4856   case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
4857   case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
4858     AS = LangAS::Default;
4859     return true;
4860   case OMPAllocateDeclAttr::OMPConstMemAlloc:
4861     AS = LangAS::cuda_constant;
4862     return true;
4863   case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
4864     AS = LangAS::cuda_shared;
4865     return true;
4866   case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
4867     llvm_unreachable("Expected predefined allocator for the variables with the "
4868                      "static storage.");
4869   }
4870   return false;
4871 }
4872
4873 // Get current CudaArch and ignore any unknown values
4874 static CudaArch getCudaArch(CodeGenModule &CGM) {
4875   if (!CGM.getTarget().hasFeature("ptx"))
4876     return CudaArch::UNKNOWN;
4877   llvm::StringMap<bool> Features;
4878   CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
4879                                  CGM.getTarget().getTargetOpts().CPU,
4880                                  CGM.getTarget().getTargetOpts().Features);
4881   for (const auto &Feature : Features) {
4882     if (Feature.getValue()) {
4883       CudaArch Arch = StringToCudaArch(Feature.getKey());
4884       if (Arch != CudaArch::UNKNOWN)
4885         return Arch;
4886     }
4887   }
4888   return CudaArch::UNKNOWN;
4889 }
4890
4891 /// Check to see if target architecture supports unified addressing which is
4892 /// a restriction for OpenMP requires clause "unified_shared_memory".
4893 void CGOpenMPRuntimeNVPTX::checkArchForUnifiedAddressing(
4894     const OMPRequiresDecl *D) {
4895   for (const OMPClause *Clause : D->clauselists()) {
4896     if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
4897       switch (getCudaArch(CGM)) {
4898       case CudaArch::SM_20:
4899       case CudaArch::SM_21:
4900       case CudaArch::SM_30:
4901       case CudaArch::SM_32:
4902       case CudaArch::SM_35:
4903       case CudaArch::SM_37:
4904       case CudaArch::SM_50:
4905       case CudaArch::SM_52:
4906       case CudaArch::SM_53:
4907       case CudaArch::SM_60:
4908       case CudaArch::SM_61:
4909       case CudaArch::SM_62:
4910         CGM.Error(Clause->getBeginLoc(),
4911                   "Target architecture does not support unified addressing");
4912         return;
4913       case CudaArch::SM_70:
4914       case CudaArch::SM_72:
4915       case CudaArch::SM_75:
4916       case CudaArch::GFX600:
4917       case CudaArch::GFX601:
4918       case CudaArch::GFX700:
4919       case CudaArch::GFX701:
4920       case CudaArch::GFX702:
4921       case CudaArch::GFX703:
4922       case CudaArch::GFX704:
4923       case CudaArch::GFX801:
4924       case CudaArch::GFX802:
4925       case CudaArch::GFX803:
4926       case CudaArch::GFX810:
4927       case CudaArch::GFX900:
4928       case CudaArch::GFX902:
4929       case CudaArch::GFX904:
4930       case CudaArch::GFX906:
4931       case CudaArch::GFX908:
4932       case CudaArch::GFX909:
4933       case CudaArch::GFX1010:
4934       case CudaArch::GFX1011:
4935       case CudaArch::GFX1012:
4936       case CudaArch::UNKNOWN:
4937         break;
4938       case CudaArch::LAST:
4939         llvm_unreachable("Unexpected Cuda arch.");
4940       }
4941     }
4942   }
4943   CGOpenMPRuntime::checkArchForUnifiedAddressing(D);
4944 }
4945
4946 /// Get number of SMs and number of blocks per SM.
4947 static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
4948   std::pair<unsigned, unsigned> Data;
4949   if (CGM.getLangOpts().OpenMPCUDANumSMs)
4950     Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
4951   if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
4952     Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
4953   if (Data.first && Data.second)
4954     return Data;
4955   switch (getCudaArch(CGM)) {
4956   case CudaArch::SM_20:
4957   case CudaArch::SM_21:
4958   case CudaArch::SM_30:
4959   case CudaArch::SM_32:
4960   case CudaArch::SM_35:
4961   case CudaArch::SM_37:
4962   case CudaArch::SM_50:
4963   case CudaArch::SM_52:
4964   case CudaArch::SM_53:
4965     return {16, 16};
4966   case CudaArch::SM_60:
4967   case CudaArch::SM_61:
4968   case CudaArch::SM_62:
4969     return {56, 32};
4970   case CudaArch::SM_70:
4971   case CudaArch::SM_72:
4972   case CudaArch::SM_75:
4973     return {84, 32};
4974   case CudaArch::GFX600:
4975   case CudaArch::GFX601:
4976   case CudaArch::GFX700:
4977   case CudaArch::GFX701:
4978   case CudaArch::GFX702:
4979   case CudaArch::GFX703:
4980   case CudaArch::GFX704:
4981   case CudaArch::GFX801:
4982   case CudaArch::GFX802:
4983   case CudaArch::GFX803:
4984   case CudaArch::GFX810:
4985   case CudaArch::GFX900:
4986   case CudaArch::GFX902:
4987   case CudaArch::GFX904:
4988   case CudaArch::GFX906:
4989   case CudaArch::GFX908:
4990   case CudaArch::GFX909:
4991   case CudaArch::GFX1010:
4992   case CudaArch::GFX1011:
4993   case CudaArch::GFX1012:
4994   case CudaArch::UNKNOWN:
4995     break;
4996   case CudaArch::LAST:
4997     llvm_unreachable("Unexpected Cuda arch.");
4998   }
4999   llvm_unreachable("Unexpected NVPTX target without ptx feature.");
5000 }
5001
5002 void CGOpenMPRuntimeNVPTX::clear() {
5003   if (!GlobalizedRecords.empty()) {
5004     ASTContext &C = CGM.getContext();
5005     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
5006     llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
5007     RecordDecl *StaticRD = C.buildImplicitRecord(
5008         "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5009     StaticRD->startDefinition();
5010     RecordDecl *SharedStaticRD = C.buildImplicitRecord(
5011         "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
5012     SharedStaticRD->startDefinition();
5013     for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
5014       if (Records.Records.empty())
5015         continue;
5016       unsigned Size = 0;
5017       unsigned RecAlignment = 0;
5018       for (const RecordDecl *RD : Records.Records) {
5019         QualType RDTy = C.getRecordType(RD);
5020         unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
5021         RecAlignment = std::max(RecAlignment, Alignment);
5022         unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
5023         Size =
5024             llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
5025       }
5026       Size = llvm::alignTo(Size, RecAlignment);
5027       llvm::APInt ArySize(/*numBits=*/64, Size);
5028       QualType SubTy = C.getConstantArrayType(
5029           C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5030       const bool UseSharedMemory = Size <= SharedMemorySize;
5031       auto *Field =
5032           FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
5033                             SourceLocation(), SourceLocation(), nullptr, SubTy,
5034                             C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5035                             /*BW=*/nullptr, /*Mutable=*/false,
5036                             /*InitStyle=*/ICIS_NoInit);
5037       Field->setAccess(AS_public);
5038       if (UseSharedMemory) {
5039         SharedStaticRD->addDecl(Field);
5040         SharedRecs.push_back(&Records);
5041       } else {
5042         StaticRD->addDecl(Field);
5043         GlobalRecs.push_back(&Records);
5044       }
5045       Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
5046       Records.UseSharedMemory->setInitializer(
5047           llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
5048     }
5049     // Allocate SharedMemorySize buffer for the shared memory.
5050     // FIXME: nvlink does not handle weak linkage correctly (object with the
5051     // different size are reported as erroneous).
5052     // Restore this code as sson as nvlink is fixed.
5053     if (!SharedStaticRD->field_empty()) {
5054       llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
5055       QualType SubTy = C.getConstantArrayType(
5056           C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
5057       auto *Field = FieldDecl::Create(
5058           C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
5059           C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
5060           /*BW=*/nullptr, /*Mutable=*/false,
5061           /*InitStyle=*/ICIS_NoInit);
5062       Field->setAccess(AS_public);
5063       SharedStaticRD->addDecl(Field);
5064     }
5065     SharedStaticRD->completeDefinition();
5066     if (!SharedStaticRD->field_empty()) {
5067       QualType StaticTy = C.getRecordType(SharedStaticRD);
5068       llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
5069       auto *GV = new llvm::GlobalVariable(
5070           CGM.getModule(), LLVMStaticTy,
5071           /*isConstant=*/false, llvm::GlobalValue::CommonLinkage,
5072           llvm::Constant::getNullValue(LLVMStaticTy),
5073           "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
5074           llvm::GlobalValue::NotThreadLocal,
5075           C.getTargetAddressSpace(LangAS::cuda_shared));
5076       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5077           GV, CGM.VoidPtrTy);
5078       for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
5079         Rec->Buffer->replaceAllUsesWith(Replacement);
5080         Rec->Buffer->eraseFromParent();
5081       }
5082     }
5083     StaticRD->completeDefinition();
5084     if (!StaticRD->field_empty()) {
5085       QualType StaticTy = C.getRecordType(StaticRD);
5086       std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
5087       llvm::APInt Size1(32, SMsBlockPerSM.second);
5088       QualType Arr1Ty =
5089           C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
5090                                  /*IndexTypeQuals=*/0);
5091       llvm::APInt Size2(32, SMsBlockPerSM.first);
5092       QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
5093                                                /*IndexTypeQuals=*/0);
5094       llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
5095       // FIXME: nvlink does not handle weak linkage correctly (object with the
5096       // different size are reported as erroneous).
5097       // Restore CommonLinkage as soon as nvlink is fixed.
5098       auto *GV = new llvm::GlobalVariable(
5099           CGM.getModule(), LLVMArr2Ty,
5100           /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5101           llvm::Constant::getNullValue(LLVMArr2Ty),
5102           "_openmp_static_glob_rd_$_");
5103       auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
5104           GV, CGM.VoidPtrTy);
5105       for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
5106         Rec->Buffer->replaceAllUsesWith(Replacement);
5107         Rec->Buffer->eraseFromParent();
5108       }
5109     }
5110   }
5111   if (!TeamsReductions.empty()) {
5112     ASTContext &C = CGM.getContext();
5113     RecordDecl *StaticRD = C.buildImplicitRecord(
5114         "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
5115     StaticRD->startDefinition();
5116     for (const RecordDecl *TeamReductionRec : TeamsReductions) {
5117       QualType RecTy = C.getRecordType(TeamReductionRec);
5118       auto *Field = FieldDecl::Create(
5119           C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
5120           C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
5121           /*BW=*/nullptr, /*Mutable=*/false,
5122           /*InitStyle=*/ICIS_NoInit);
5123       Field->setAccess(AS_public);
5124       StaticRD->addDecl(Field);
5125     }
5126     StaticRD->completeDefinition();
5127     QualType StaticTy = C.getRecordType(StaticRD);
5128     llvm::Type *LLVMReductionsBufferTy =
5129         CGM.getTypes().ConvertTypeForMem(StaticTy);
5130     // FIXME: nvlink does not handle weak linkage correctly (object with the
5131     // different size are reported as erroneous).
5132     // Restore CommonLinkage as soon as nvlink is fixed.
5133     auto *GV = new llvm::GlobalVariable(
5134         CGM.getModule(), LLVMReductionsBufferTy,
5135         /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
5136         llvm::Constant::getNullValue(LLVMReductionsBufferTy),
5137         "_openmp_teams_reductions_buffer_$_");
5138     KernelTeamsReductionPtr->setInitializer(
5139         llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
5140                                                              CGM.VoidPtrTy));
5141   }
5142   CGOpenMPRuntime::clear();
5143 }