1 //===--- Cuda.cpp - Cuda Tool and ToolChain Implementations -----*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 #include "InputInfo.h"
12 #include "clang/Basic/Cuda.h"
13 #include "clang/Basic/VirtualFileSystem.h"
14 #include "clang/Driver/Compilation.h"
15 #include "clang/Driver/Driver.h"
16 #include "clang/Driver/DriverDiagnostic.h"
17 #include "clang/Driver/Options.h"
18 #include "llvm/Option/ArgList.h"
19 #include "llvm/Support/Path.h"
20 #include <system_error>
22 using namespace clang::driver;
23 using namespace clang::driver::toolchains;
24 using namespace clang::driver::tools;
25 using namespace clang;
26 using namespace llvm::opt;
// Parses the contents of version.txt in a CUDA installation. It should
// contain one line of the form e.g. "CUDA Version 7.5.2".
30 static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
31 if (!V.startswith("CUDA Version "))
32 return CudaVersion::UNKNOWN;
33 V = V.substr(strlen("CUDA Version "));
34 int Major = -1, Minor = -1;
35 auto First = V.split('.');
36 auto Second = First.second.split('.');
37 if (First.first.getAsInteger(10, Major) ||
38 Second.first.getAsInteger(10, Minor))
39 return CudaVersion::UNKNOWN;
41 if (Major == 7 && Minor == 0) {
42 // This doesn't appear to ever happen -- version.txt doesn't exist in the
43 // CUDA 7 installs I've seen. But no harm in checking.
44 return CudaVersion::CUDA_70;
46 if (Major == 7 && Minor == 5)
47 return CudaVersion::CUDA_75;
48 if (Major == 8 && Minor == 0)
49 return CudaVersion::CUDA_80;
50 return CudaVersion::UNKNOWN;
53 CudaInstallationDetector::CudaInstallationDetector(
54 const Driver &D, const llvm::Triple &HostTriple,
55 const llvm::opt::ArgList &Args)
57 SmallVector<std::string, 4> CudaPathCandidates;
59 // In decreasing order so we prefer newer versions to older versions.
60 std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
62 if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
63 CudaPathCandidates.push_back(
64 Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ));
65 } else if (HostTriple.isOSWindows()) {
66 for (const char *Ver : Versions)
67 CudaPathCandidates.push_back(
68 D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
71 CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
72 for (const char *Ver : Versions)
73 CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-" + Ver);
76 for (const auto &CudaPath : CudaPathCandidates) {
77 if (CudaPath.empty() || !D.getVFS().exists(CudaPath))
80 InstallPath = CudaPath;
81 BinPath = CudaPath + "/bin";
82 IncludePath = InstallPath + "/include";
83 LibDevicePath = InstallPath + "/nvvm/libdevice";
85 auto &FS = D.getVFS();
86 if (!(FS.exists(IncludePath) && FS.exists(BinPath) &&
87 FS.exists(LibDevicePath)))
90 // On Linux, we have both lib and lib64 directories, and we need to choose
91 // based on our triple. On MacOS, we have only a lib directory.
93 // It's sufficient for our purposes to be flexible: If both lib and lib64
94 // exist, we choose whichever one matches our triple. Otherwise, if only
95 // lib exists, we use it.
96 if (HostTriple.isArch64Bit() && FS.exists(InstallPath + "/lib64"))
97 LibPath = InstallPath + "/lib64";
98 else if (FS.exists(InstallPath + "/lib"))
99 LibPath = InstallPath + "/lib";
103 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> VersionFile =
104 FS.getBufferForFile(InstallPath + "/version.txt");
106 // CUDA 7.0 doesn't have a version.txt, so guess that's our version if
107 // version.txt isn't present.
108 Version = CudaVersion::CUDA_70;
110 Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
114 for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE;
115 !EC && LI != LE; LI = LI.increment(EC)) {
116 StringRef FilePath = LI->path();
117 StringRef FileName = llvm::sys::path::filename(FilePath);
118 // Process all bitcode filenames that look like libdevice.compute_XX.YY.bc
119 const StringRef LibDeviceName = "libdevice.";
120 if (!(FileName.startswith(LibDeviceName) && FileName.endswith(".bc")))
122 StringRef GpuArch = FileName.slice(
123 LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
124 LibDeviceMap[GpuArch] = FilePath.str();
125 // Insert map entries for specifc devices with this compute
126 // capability. NVCC's choice of the libdevice library version is
127 // rather peculiar and depends on the CUDA version.
128 if (GpuArch == "compute_20") {
129 LibDeviceMap["sm_20"] = FilePath;
130 LibDeviceMap["sm_21"] = FilePath;
131 LibDeviceMap["sm_32"] = FilePath;
132 } else if (GpuArch == "compute_30") {
133 LibDeviceMap["sm_30"] = FilePath;
134 if (Version < CudaVersion::CUDA_80) {
135 LibDeviceMap["sm_50"] = FilePath;
136 LibDeviceMap["sm_52"] = FilePath;
137 LibDeviceMap["sm_53"] = FilePath;
139 LibDeviceMap["sm_60"] = FilePath;
140 LibDeviceMap["sm_61"] = FilePath;
141 LibDeviceMap["sm_62"] = FilePath;
142 } else if (GpuArch == "compute_35") {
143 LibDeviceMap["sm_35"] = FilePath;
144 LibDeviceMap["sm_37"] = FilePath;
145 } else if (GpuArch == "compute_50") {
146 if (Version >= CudaVersion::CUDA_80) {
147 LibDeviceMap["sm_50"] = FilePath;
148 LibDeviceMap["sm_52"] = FilePath;
149 LibDeviceMap["sm_53"] = FilePath;
159 void CudaInstallationDetector::AddCudaIncludeArgs(
160 const ArgList &DriverArgs, ArgStringList &CC1Args) const {
161 if (!DriverArgs.hasArg(options::OPT_nobuiltininc)) {
162 // Add cuda_wrappers/* to our system include path. This lets us wrap
163 // standard library headers.
164 SmallString<128> P(D.ResourceDir);
165 llvm::sys::path::append(P, "include");
166 llvm::sys::path::append(P, "cuda_wrappers");
167 CC1Args.push_back("-internal-isystem");
168 CC1Args.push_back(DriverArgs.MakeArgString(P));
171 if (DriverArgs.hasArg(options::OPT_nocudainc))
175 D.Diag(diag::err_drv_no_cuda_installation);
179 CC1Args.push_back("-internal-isystem");
180 CC1Args.push_back(DriverArgs.MakeArgString(getIncludePath()));
181 CC1Args.push_back("-include");
182 CC1Args.push_back("__clang_cuda_runtime_wrapper.h");
185 void CudaInstallationDetector::CheckCudaVersionSupportsArch(
186 CudaArch Arch) const {
187 if (Arch == CudaArch::UNKNOWN || Version == CudaVersion::UNKNOWN ||
188 ArchsWithVersionTooLowErrors.count(Arch) > 0)
191 auto RequiredVersion = MinVersionForCudaArch(Arch);
192 if (Version < RequiredVersion) {
193 ArchsWithVersionTooLowErrors.insert(Arch);
194 D.Diag(diag::err_drv_cuda_version_too_low)
195 << InstallPath << CudaArchToString(Arch) << CudaVersionToString(Version)
196 << CudaVersionToString(RequiredVersion);
200 void CudaInstallationDetector::print(raw_ostream &OS) const {
202 OS << "Found CUDA installation: " << InstallPath << ", version "
203 << CudaVersionToString(Version) << "\n";
206 void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
207 const InputInfo &Output,
208 const InputInfoList &Inputs,
210 const char *LinkingOutput) const {
212 static_cast<const toolchains::CudaToolChain &>(getToolChain());
213 assert(TC.getTriple().isNVPTX() && "Wrong platform");
215 // Obtain architecture from the action.
216 CudaArch gpu_arch = StringToCudaArch(JA.getOffloadingArch());
217 assert(gpu_arch != CudaArch::UNKNOWN &&
218 "Device action expected to have an architecture.");
220 // Check that our installation's ptxas supports gpu_arch.
221 if (!Args.hasArg(options::OPT_no_cuda_version_check)) {
222 TC.CudaInstallation.CheckCudaVersionSupportsArch(gpu_arch);
225 ArgStringList CmdArgs;
226 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
227 if (Args.hasFlag(options::OPT_cuda_noopt_device_debug,
228 options::OPT_no_cuda_noopt_device_debug, false)) {
229 // ptxas does not accept -g option if optimization is enabled, so
230 // we ignore the compiler's -O* options if we want debug info.
231 CmdArgs.push_back("-g");
232 CmdArgs.push_back("--dont-merge-basicblocks");
233 CmdArgs.push_back("--return-at-end");
234 } else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
235 // Map the -O we received to -O{0,1,2,3}.
237 // TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
238 // default, so it may correspond more closely to the spirit of clang -O2.
240 // -O3 seems like the least-bad option when -Osomething is specified to
241 // clang but it isn't handled below.
242 StringRef OOpt = "3";
243 if (A->getOption().matches(options::OPT_O4) ||
244 A->getOption().matches(options::OPT_Ofast))
246 else if (A->getOption().matches(options::OPT_O0))
248 else if (A->getOption().matches(options::OPT_O)) {
249 // -Os, -Oz, and -O(anything else) map to -O2, for lack of better options.
250 OOpt = llvm::StringSwitch<const char *>(A->getValue())
258 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
260 // If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
261 // to no optimizations, but ptxas's default is -O3.
262 CmdArgs.push_back("-O0");
265 CmdArgs.push_back("--gpu-name");
266 CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
267 CmdArgs.push_back("--output-file");
268 CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
269 for (const auto& II : Inputs)
270 CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
272 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
273 CmdArgs.push_back(Args.MakeArgString(A));
276 if (Arg *A = Args.getLastArg(options::OPT_ptxas_path_EQ))
277 Exec = A->getValue();
279 Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
280 C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
// All inputs to this linker must be from CudaDeviceActions, as we need to look
// at the Inputs' Actions in order to figure out which GPU architecture they
// correspond to.
286 void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
287 const InputInfo &Output,
288 const InputInfoList &Inputs,
290 const char *LinkingOutput) const {
292 static_cast<const toolchains::CudaToolChain &>(getToolChain());
293 assert(TC.getTriple().isNVPTX() && "Wrong platform");
295 ArgStringList CmdArgs;
296 CmdArgs.push_back("--cuda");
297 CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
298 CmdArgs.push_back(Args.MakeArgString("--create"));
299 CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
301 for (const auto& II : Inputs) {
302 auto *A = II.getAction();
303 assert(A->getInputs().size() == 1 &&
304 "Device offload action is expected to have a single input");
305 const char *gpu_arch_str = A->getOffloadingArch();
306 assert(gpu_arch_str &&
307 "Device action expected to have associated a GPU architecture!");
308 CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
310 // We need to pass an Arch of the form "sm_XX" for cubin files and
311 // "compute_XX" for ptx.
313 (II.getType() == types::TY_PP_Asm)
314 ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
316 CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
317 Arch + ",file=" + II.getFilename()));
320 for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
321 CmdArgs.push_back(Args.MakeArgString(A));
323 const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
324 C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
327 /// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary,
328 /// which isn't properly a linker but nonetheless performs the step of stitching
329 /// together object files from the assembler into a single blob.
331 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
332 const ToolChain &HostTC, const ArgList &Args)
333 : ToolChain(D, Triple, Args), HostTC(HostTC),
334 CudaInstallation(D, HostTC.getTriple(), Args) {
335 if (CudaInstallation.isValid())
336 getProgramPaths().push_back(CudaInstallation.getBinPath());
339 void CudaToolChain::addClangTargetOptions(
340 const llvm::opt::ArgList &DriverArgs,
341 llvm::opt::ArgStringList &CC1Args) const {
342 HostTC.addClangTargetOptions(DriverArgs, CC1Args);
344 CC1Args.push_back("-fcuda-is-device");
346 if (DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
347 options::OPT_fno_cuda_flush_denormals_to_zero, false))
348 CC1Args.push_back("-fcuda-flush-denormals-to-zero");
350 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
351 options::OPT_fno_cuda_approx_transcendentals, false))
352 CC1Args.push_back("-fcuda-approx-transcendentals");
354 if (DriverArgs.hasArg(options::OPT_nocudalib))
357 StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
358 assert(!GpuArch.empty() && "Must have an explicit GPU arch.");
359 std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
361 if (LibDeviceFile.empty()) {
362 getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
366 CC1Args.push_back("-mlink-cuda-bitcode");
367 CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
369 // Libdevice in CUDA-7.0 requires PTX version that's more recent
370 // than LLVM defaults to. Use PTX4.2 which is the PTX version that
371 // came with CUDA-7.0.
372 CC1Args.push_back("-target-feature");
373 CC1Args.push_back("+ptx42");
376 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
377 ArgStringList &CC1Args) const {
378 // Check our CUDA version if we're going to include the CUDA headers.
379 if (!DriverArgs.hasArg(options::OPT_nocudainc) &&
380 !DriverArgs.hasArg(options::OPT_no_cuda_version_check)) {
381 StringRef Arch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
382 assert(!Arch.empty() && "Must have an explicit GPU arch.");
383 CudaInstallation.CheckCudaVersionSupportsArch(StringToCudaArch(Arch));
385 CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
388 llvm::opt::DerivedArgList *
389 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
391 Action::OffloadKind DeviceOffloadKind) const {
392 DerivedArgList *DAL =
393 HostTC.TranslateArgs(Args, BoundArch, DeviceOffloadKind);
395 DAL = new DerivedArgList(Args.getBaseArgs());
397 const OptTable &Opts = getDriver().getOpts();
399 for (Arg *A : Args) {
400 if (A->getOption().matches(options::OPT_Xarch__)) {
401 // Skip this argument unless the architecture matches BoundArch
402 if (BoundArch.empty() || A->getValue(0) != BoundArch)
405 unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
406 unsigned Prev = Index;
407 std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));
409 // If the argument parsing failed or more than one argument was
410 // consumed, the -Xarch_ argument's parameter tried to consume
411 // extra arguments. Emit an error and ignore.
413 // We also want to disallow any options which would alter the
414 // driver behavior; that isn't going to work in our model. We
415 // use isDriverOption() as an approximation, although things
416 // like -O4 are going to slip through.
417 if (!XarchArg || Index > Prev + 1) {
418 getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
419 << A->getAsString(Args);
421 } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
422 getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
423 << A->getAsString(Args);
426 XarchArg->setBaseArg(A);
427 A = XarchArg.release();
428 DAL->AddSynthesizedArg(A);
433 if (!BoundArch.empty()) {
434 DAL->eraseArg(options::OPT_march_EQ);
435 DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
440 Tool *CudaToolChain::buildAssembler() const {
441 return new tools::NVPTX::Assembler(*this);
444 Tool *CudaToolChain::buildLinker() const {
445 return new tools::NVPTX::Linker(*this);
448 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {
449 HostTC.addClangWarningOptions(CC1Args);
452 ToolChain::CXXStdlibType
453 CudaToolChain::GetCXXStdlibType(const ArgList &Args) const {
454 return HostTC.GetCXXStdlibType(Args);
457 void CudaToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
458 ArgStringList &CC1Args) const {
459 HostTC.AddClangSystemIncludeArgs(DriverArgs, CC1Args);
462 void CudaToolChain::AddClangCXXStdlibIncludeArgs(const ArgList &Args,
463 ArgStringList &CC1Args) const {
464 HostTC.AddClangCXXStdlibIncludeArgs(Args, CC1Args);
467 void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
468 ArgStringList &CC1Args) const {
469 HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
472 SanitizerMask CudaToolChain::getSupportedSanitizers() const {
473 // The CudaToolChain only supports sanitizers in the sense that it allows
474 // sanitizer arguments on the command line if they are supported by the host
475 // toolchain. The CudaToolChain will actually ignore any command line
476 // arguments for any of these "supported" sanitizers. That means that no
477 // sanitization of device code is actually supported at this time.
479 // This behavior is necessary because the host and device toolchains
480 // invocations often share the command line, so the device toolchain must
481 // tolerate flags meant only for the host toolchain.
482 return HostTC.getSupportedSanitizers();
485 VersionTuple CudaToolChain::computeMSVCVersion(const Driver *D,
486 const ArgList &Args) const {
487 return HostTC.computeMSVCVersion(D, Args);