//===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
11 #include "InputInfo.h"
12 #include "clang/Basic/Cuda.h"
13 #include "clang/Basic/VirtualFileSystem.h"
14 #include "clang/Driver/Compilation.h"
15 #include "clang/Driver/Driver.h"
16 #include "clang/Driver/DriverDiagnostic.h"
17 #include "clang/Driver/Options.h"
18 #include "llvm/Option/ArgList.h"
19 #include "llvm/Support/Path.h"
20 #include <system_error>
22 using namespace clang::driver;
23 using namespace clang::driver::toolchains;
24 using namespace clang::driver::tools;
25 using namespace clang;
26 using namespace llvm::opt;
28 // Parses the contents of version.txt in an CUDA installation. It should
29 // contain one line of the from e.g. "CUDA Version 7.5.2".
30 static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
31 if (!V.startswith("CUDA Version "))
32 return CudaVersion::UNKNOWN;
33 V = V.substr(strlen("CUDA Version "));
34 int Major = -1, Minor = -1;
35 auto First = V.split('.');
36 auto Second = First.second.split('.');
37 if (First.first.getAsInteger(10, Major) ||
38 Second.first.getAsInteger(10, Minor))
39 return CudaVersion::UNKNOWN;
41 if (Major == 7 && Minor == 0) {
42 // This doesn't appear to ever happen -- version.txt doesn't exist in the
43 // CUDA 7 installs I've seen. But no harm in checking.
44 return CudaVersion::CUDA_70;
46 if (Major == 7 && Minor == 5)
47 return CudaVersion::CUDA_75;
48 if (Major == 8 && Minor == 0)
49 return CudaVersion::CUDA_80;
50 return CudaVersion::UNKNOWN;
53 CudaInstallationDetector::CudaInstallationDetector(
54 const Driver &D, const llvm::Triple &HostTriple,
55 const llvm::opt::ArgList &Args)
57 SmallVector<std::string, 4> CudaPathCandidates;
59 // In decreasing order so we prefer newer versions to older versions.
60 std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
62 if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
63 CudaPathCandidates.push_back(
64 Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ));
65 } else if (HostTriple.isOSWindows()) {
66 for (const char *Ver : Versions)
67 CudaPathCandidates.push_back(
68 D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
71 CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
72 for (const char *Ver : Versions)
73 CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-" + Ver);
76 for (const auto &CudaPath : CudaPathCandidates) {
77 if (CudaPath.empty() || !D.getVFS().exists(CudaPath))
80 InstallPath = CudaPath;
81 BinPath = CudaPath + "/bin";
82 IncludePath = InstallPath + "/include";
83 LibDevicePath = InstallPath + "/nvvm/libdevice";
85 auto &FS = D.getVFS();
86 if (!(FS.exists(IncludePath) && FS.exists(BinPath) &&
87 FS.exists(LibDevicePath)))
90 // On Linux, we have both lib and lib64 directories, and we need to choose
91 // based on our triple. On MacOS, we have only a lib directory.
93 // It's sufficient for our purposes to be flexible: If both lib and lib64
94 // exist, we choose whichever one matches our triple. Otherwise, if only
95 // lib exists, we use it.
96 if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
97 LibPath = InstallPath + "/lib64";
98 else if (FS.exists(InstallPath + "/lib"))
99 LibPath = InstallPath + "/lib";
103 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
104 FS.getBufferForFile(InstallPath + "/version.txt");
106 // CUDA 7.0 doesn't have a version.txt, so guess that's our version if
107 // version.txt isn't present.
108 Version = CudaVersion::CUDA_70;
110 Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
114 for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE;
115 !EC && LI != LE; LI = LI.increment(EC)) {
116 StringRef FilePath = LI->path();
117 StringRef FileName = llvm::sys::path::filename(FilePath);
118 // Process all bitcode filenames that look like libdevice.compute_XX.YY.bc
119 const StringRef LibDeviceName = "libdevice.";
120 if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
122 StringRef GpuArch = FileName.slice(
123 LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
124 LibDeviceMap[GpuArch] = FilePath.str();
125 // Insert map entries for specifc devices with this compute
126 // capability. NVCC's choice of the libdevice library version is
127 // rather peculiar and depends on the CUDA version.
128 if (GpuArch == "compute_20") {
129 LibDeviceMap["sm_20"] = FilePath;
130 LibDeviceMap["sm_21"] = FilePath;
131 LibDeviceMap["sm_32"] = FilePath;
132 } else if (GpuArch == "compute_30") {
133 LibDeviceMap["sm_30"] = FilePath;
134 if (Version < CudaVersion::CUDA_80) {
135 LibDeviceMap["sm_50"] = FilePath;
136 LibDeviceMap["sm_52"] = FilePath;
137 LibDeviceMap["sm_53"] = FilePath;
139 LibDeviceMap["sm_60"] = FilePath;
140 LibDeviceMap["sm_61"] = FilePath;
141 LibDeviceMap["sm_62"] = FilePath;
142 } else if (GpuArch == "compute_35") {
143 LibDeviceMap["sm_35"] = FilePath;
144 LibDeviceMap["sm_37"] = FilePath;
145 } else if (GpuArch == "compute_50") {
146 if (Version >= CudaVersion::CUDA_80) {
147 LibDeviceMap["sm_50"] = FilePath;
148 LibDeviceMap["sm_52"] = FilePath;
149 LibDeviceMap["sm_53"] = FilePath;
159 void CudaInstallationDetector::AddCudaIncludeArgs(
160 const ArgList &DriverArgs, ArgStringList &CC1Args) const {
161 if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
162 // Add cuda_wrappers/* to our system include path. This lets us wrap
163 // standard library headers.
164 SmallString<128> P(D.ResourceDir);
165 llvm::sys::path::append(P, "include");
166 llvm::sys::path::append(P, "cuda_wrappers");
167 CC1Args.push_back("-internal-isystem");
168 CC1Args.push_back(DriverArgs.MakeArgString(P));
171 if (DriverArgs.hasArg(options::OPT_nocudainc))
175 D.Diag(diag::err_drv_no_cuda_installation);
179 CC1Args.push_back("-internal-isystem");
180 CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
181 CC1Args.push_back("-include");
182 CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
185 void CudaInstallationDetector::CheckCudaVersionSupportsArch(
186 CudaArch Arch) const {
187 if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
188 ArchsWithVersionTooLowErrors.count(Arch) > 0)
191 auto RequiredVersion = MinVersionForCudaArch(Arch);
192 if (Version < RequiredVersion) {
193 ArchsWithVersionTooLowErrors.insert(Arch);
194 D.Diag(diag::err_drv_cuda_version_too_low)
195 << InstallPath << CudaArchToString(Arch) << CudaVersionToString(Version)
196 << CudaVersionToString(RequiredVersion);
200 void CudaInstallationDetector::print(raw_ostream &OS) const {
202 OS << "Found CUDA installation: " << InstallPath << ", version "
203 << CudaVersionToString(Version) << "\n";
206 void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
207 const InputInfo &Output,
208 const InputInfoList &Inputs,
210 const char *LinkingOutput) const {
212 static_cast<const toolchains::CudaToolChain &>(getToolChain());
213 assert(TC.getTriple().isNVPTX() && "Wrong platform");
215 // Obtain architecture from the action.
216 CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
217 assert(gpu_arch != CudaArch::UNKNOWN &&
218 "Device action expected to have an architecture.");
220 // Check that our installation's ptxas supports gpu_arch.
221 if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
222 TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);
225 ArgStringList CmdArgs;
226 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
227 if (Args.hasFlag(options::OPT_cuda_noopt_device_debug,
228 options::OPT_no_cuda_noopt_device_debug, false)) {
229 // ptxas does not accept -g option if optimization is enabled, so
230 // we ignore the compiler's -O* options if we want debug info.
231 CmdArgs.push_back("-g");
232 CmdArgs.push_back("--dont-merge-basicblocks");
233 CmdArgs.push_back("--return-at-end");
234 } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
235 // Map the -O we received to -O{0,1,2,3}.
237 // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
238 // default, so it may correspond more closely to the spirit of clang -O2.
240 // -O3 seems like the least-bad option when -Osomething is specified to
241 // clang but it isn't handled below.
242 StringRef OOpt = "3";
243 if (A->getOption().matches(options::OPT_O4) ||
244 A->getOption().matches(options::OPT_Ofast))
246 else if (A->getOption().matches(options::OPT_O0))
248 else if (A->getOption().matches(options::OPT_O)) {
249 // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.
250 OOpt = llvm::StringSwitch<const char *>(A->getValue())
258 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
260 // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
261 // to no optimizations, but ptxas's default is -O3.
262 CmdArgs.push_back("-O0");
265 CmdArgs.push_back("--gpu-name");
266 CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
267 CmdArgs.push_back("--output-file");
268 CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
269 for (const auto& II : Inputs)
270 CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
272 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
273 CmdArgs.push_back(Args.MakeArgString(A));
276 if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ))
277 Exec = A->getValue();
279 Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
280 C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
283 // All inputs to this linker must be from CudaDeviceActions, as we need to look
284 // at the Inputs' Actions in order to figure out which GPU architecture they
286 void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
287 const InputInfo &Output,
288 const InputInfoList &Inputs,
290 const char *LinkingOutput) const {
292 static_cast<const toolchains::CudaToolChain &>(getToolChain());
293 assert(TC.getTriple().isNVPTX() && "Wrong platform");
295 ArgStringList CmdArgs;
296 CmdArgs.push_back("--cuda");
297 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
298 CmdArgs.push_back(Args.MakeArgString("--create"));
299 CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
301 for (const auto& II : Inputs) {
302 auto *A = II.getAction();
303 assert(A->getInputs().size() == 1 &&
304 "Device offload action is expected to have a single input");
305 const char *gpu_arch_str = A->getOffloadingArch();
306 assert(gpu_arch_str &&
307 "Device action expected to have associated a GPU architecture!");
308 CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
310 // We need to pass an Arch of the form "sm_XX" for cubin files and
311 // "compute_XX" for ptx.
313 (II.getType() == types::TY_PP_Asm)
314 ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
316 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
317 Arch + ",file=" + II.getFilename()));
320 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
321 CmdArgs.push_back(Args.MakeArgString(A));
323 const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
324 C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
327 /// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
328 /// which isn't properly a linker but nonetheless performs the step of stitching
329 /// together object files from the assembler into a single blob.
331 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
332 const ToolChain &HostTC, const ArgList &Args)
333 : ToolChain(D, Triple, Args), HostTC(HostTC),
334 CudaInstallation(D, HostTC.getTriple(), Args) {
335 if (CudaInstallation.isValid())
336 getProgramPaths().push_back(CudaInstallation.getBinPath());
339 void CudaToolChain::addClangTargetOptions(
340 const llvm::opt::ArgList &DriverArgs,
341 llvm::opt::ArgStringList &CC1Args,
342 Action::OffloadKind DeviceOffloadingKind) const {
343 HostTC.addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadingKind);
345 StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
346 assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
347 assert((DeviceOffloadingKind == Action::OFK_OpenMP ||
348 DeviceOffloadingKind == Action::OFK_Cuda) &&
349 "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
351 if (DeviceOffloadingKind == Action::OFK_Cuda) {
352 CC1Args.push_back("-fcuda-is-device");
354 if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
355 options::OPT_fno_cuda_flush_denormals_to_zero, false))
356 CC1Args.push_back("-fcuda-flush-denormals-to-zero");
358 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
359 options::OPT_fno_cuda_approx_transcendentals, false))
360 CC1Args.push_back("-fcuda-approx-transcendentals");
362 if (DriverArgs.hasArg(options::OPT_nocudalib))
366 std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
368 if (LibDeviceFile.empty()) {
369 getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
373 CC1Args.push_back("-mlink-cuda-bitcode");
374 CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
376 // Libdevice in CUDA-7.0 requires PTX version that's more recent
377 // than LLVM defaults to. Use PTX4.2 which is the PTX version that
378 // came with CUDA-7.0.
379 CC1Args.push_back("-target-feature");
380 CC1Args.push_back("+ptx42");
383 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
384 ArgStringList &CC1Args) const {
385 // Check our CUDA version if we're going to include the CUDA headers.
386 if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
387 !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
388 StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
389 assert(!Arch.empty() && "Must have an explicit GPU arch.");
390 CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
392 CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
395 llvm::opt::DerivedArgList *
396 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
398 Action::OffloadKind DeviceOffloadKind) const {
399 DerivedArgList *DAL =
400 HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
402 DAL = new DerivedArgList(Args.getBaseArgs());
404 const OptTable &Opts = getDriver().getOpts();
406 // For OpenMP device offloading, append derived arguments. Make sure
407 // flags are not duplicated.
408 // TODO: Append the compute capability.
409 if (DeviceOffloadKind == Action::OFK_OpenMP) {
411 bool IsDuplicate = false;
412 for (Arg *DALArg : *DAL){
424 for (Arg *A : Args) {
425 if (A->getOption().matches(options::OPT_Xarch__)) {
426 // Skip this argument unless the architecture matches BoundArch
427 if (BoundArch.empty() || A->getValue(0) != BoundArch)
430 unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
431 unsigned Prev = Index;
432 std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
434 // If the argument parsing failed or more than one argument was
435 // consumed, the -Xarch_ argument's parameter tried to consume
436 // extra arguments. Emit an error and ignore.
438 // We also want to disallow any options which would alter the
439 // driver behavior; that isn't going to work in our model. We
440 // use isDriverOption() as an approximation, although things
441 // like -O4 are going to slip through.
442 if (!XarchArg || Index > Prev + 1) {
443 getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
444 << A->getAsString(Args);
446 } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
447 getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
448 << A->getAsString(Args);
451 XarchArg->setBaseArg(A);
452 A = XarchArg.release();
453 DAL->AddSynthesizedArg(A);
458 if (!BoundArch.empty()) {
459 DAL->eraseArg(options::OPT_march_EQ);
460 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
465 Tool *CudaToolChain::buildAssembler() const {
466 return new tools::NVPTX::Assembler(*this);
469 Tool *CudaToolChain::buildLinker() const {
470 return new tools::NVPTX::Linker(*this);
473 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
474 HostTC.addClangWarningOptions(CC1Args);
477 ToolChain::CXXStdlibType
478 CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
479 return HostTC.GetCXXStdlibType(Args);
482 void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
483 ArgStringList &CC1Args) const {
484 HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
487 void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
488 ArgStringList &CC1Args) const {
489 HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
492 void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
493 ArgStringList &CC1Args) const {
494 HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
497 SanitizerMask CudaToolChain::getSupportedSanitizers() const {
498 // The CudaToolChain only supports sanitizers in the sense that it allows
499 // sanitizer arguments on the command line if they are supported by the host
500 // toolchain. The CudaToolChain will actually ignore any command line
501 // arguments for any of these "supported" sanitizers. That means that no
502 // sanitization of device code is actually supported at this time.
504 // This behavior is necessary because the host and device toolchains
505 // invocations often share the command line, so the device toolchain must
506 // tolerate flags meant only for the host toolchain.
507 return HostTC.getSupportedSanitizers();
510 VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D,
511 const ArgList &Args) const {
512 return HostTC.computeMSVCVersion(D, Args);