contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

   1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 ///
  12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
  13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
  14 /// an MCObjectStreamer it outputs binary code.
  15 //
  16 //===----------------------------------------------------------------------===//
  17 //
  18
  19 #include "AMDGPUAsmPrinter.h"
  20 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
  21 #include "InstPrinter/AMDGPUInstPrinter.h"
  22 #include "Utils/AMDGPUBaseInfo.h"
  23 #include "AMDGPU.h"
  24 #include "AMDKernelCodeT.h"
  25 #include "AMDGPUSubtarget.h"
  26 #include "R600Defines.h"
  27 #include "R600MachineFunctionInfo.h"
  28 #include "R600RegisterInfo.h"
  29 #include "SIDefines.h"
  30 #include "SIMachineFunctionInfo.h"
  31 #include "SIInstrInfo.h"
  32 #include "SIRegisterInfo.h"
  33 #include "llvm/CodeGen/MachineFrameInfo.h"
  34 #include "llvm/IR/DiagnosticInfo.h"
  35 #include "llvm/MC/MCContext.h"
  36 #include "llvm/MC/MCSectionELF.h"
  37 #include "llvm/MC/MCStreamer.h"
  38 #include "llvm/Support/ELF.h"
  39 #include "llvm/Support/MathExtras.h"
  40 #include "llvm/Support/TargetRegistry.h"
  41 #include "llvm/Target/TargetLoweringObjectFile.h"
  42 #include "AMDGPURuntimeMetadata.h"
  43
  44 using namespace ::AMDGPU;
  45 using namespace llvm;
  46
  47 // TODO: This should get the default rounding mode from the kernel. We just set
  48 // the default here, but this could change if the OpenCL rounding mode pragmas
  49 // are used.
  50 //
  51 // The denormal mode here should match what is reported by the OpenCL runtime
  52 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
  53 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
  54 //
  55 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
  56 // precision, and leaves single precision to flush all and does not report
  57 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
  58 // CL_FP_DENORM for both.
  59 //
  60 // FIXME: It seems some instructions do not support single precision denormals
  61 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
  62 // and sin_f32, cos_f32 on most parts).
  63
  64 // We want to use these instructions, and using fp32 denormals also causes
  65 // instructions to run at the double precision rate for the device so it's
  66 // probably best to just report no single precision denormals.
  67 static uint32_t getFPMode(const MachineFunction &F) {
  68   const SISubtarget& ST = F.getSubtarget<SISubtarget>();
  69   // TODO: Is there any real use for the flush in only / flush out only modes?
  70
  71   uint32_t FP32Denormals =
  72     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  73
  74   uint32_t FP64Denormals =
  75     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  76
  77   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
  78          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
  79          FP_DENORM_MODE_SP(FP32Denormals) |
  80          FP_DENORM_MODE_DP(FP64Denormals);
  81 }
  82
  83 static AsmPrinter *
  84 createAMDGPUAsmPrinterPass(TargetMachine &tm,
  85                            std::unique_ptr<MCStreamer> &&Streamer) {
  86   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
  87 }
  88
  89 extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  90   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
  91   TargetRegistry::RegisterAsmPrinter(TheGCNTarget, createAMDGPUAsmPrinterPass);
  92 }
  93
  94 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
  95                                    std::unique_ptr<MCStreamer> Streamer)
  96     : AsmPrinter(TM, std::move(Streamer)) {}
  97
  98 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
  99   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
 100     return;
 101
 102   // Need to construct an MCSubtargetInfo here in case we have no functions
 103   // in the module.
 104   std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
 105         TM.getTargetTriple().str(), TM.getTargetCPU(),
 106         TM.getTargetFeatureString()));
 107
 108   AMDGPUTargetStreamer *TS =
 109       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 110
 111   TS->EmitDirectiveHSACodeObjectVersion(2, 1);
 112
 113   AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
 114   TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
 115                                     "AMD", "AMDGPU");
 116   emitStartOfRuntimeMetadata(M);
 117 }
 118
 119 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
 120   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
 121   SIProgramInfo KernelInfo;
 122   if (STM.isAmdHsaOS()) {
 123     getSIProgramInfo(KernelInfo, *MF);
 124     EmitAmdKernelCodeT(*MF, KernelInfo);
 125   }
 126 }
 127
 128 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
 129   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 130   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
 131   if (MFI->isKernel() && STM.isAmdHsaOS()) {
 132     AMDGPUTargetStreamer *TS =
 133         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 134     TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(),
 135                              ELF::STT_AMDGPU_HSA_KERNEL);
 136   }
 137
 138   AsmPrinter::EmitFunctionEntryLabel();
 139 }
 140
 141 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 142
 143   // Group segment variables aren't emitted in HSA.
 144   if (AMDGPU::isGroupSegment(GV))
 145     return;
 146
 147   AsmPrinter::EmitGlobalVariable(GV);
 148 }
 149
 150 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 151
 152   // The starting address of all shader programs must be 256 bytes aligned.
 153   MF.setAlignment(8);
 154
 155   SetupMachineFunction(MF);
 156
 157   MCContext &Context = getObjFileLowering().getContext();
 158   MCSectionELF *ConfigSection =
 159       Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
 160   OutStreamer->SwitchSection(ConfigSection);
 161
 162   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
 163   SIProgramInfo KernelInfo;
 164   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 165     getSIProgramInfo(KernelInfo, MF);
 166     if (!STM.isAmdHsaOS()) {
 167       EmitProgramInfoSI(MF, KernelInfo);
 168     }
 169   } else {
 170     EmitProgramInfoR600(MF);
 171   }
 172
 173   DisasmLines.clear();
 174   HexLines.clear();
 175   DisasmLineMaxLen = 0;
 176
 177   EmitFunctionBody();
 178
 179   if (isVerbose()) {
 180     MCSectionELF *CommentSection =
 181         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
 182     OutStreamer->SwitchSection(CommentSection);
 183
 184     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 185       OutStreamer->emitRawComment(" Kernel info:", false);
 186       OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
 187                                   false);
 188       OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
 189                                   false);
 190       OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
 191                                   false);
 192       OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
 193                                   false);
 194       OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
 195                                   false);
 196       OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
 197                                   false);
 198       OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
 199                                   " bytes/workgroup (compile time only)", false);
 200
 201       OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
 202                                   false);
 203       OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
 204                                   false);
 205
 206       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
 207         OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
 208                                     Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
 209         OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
 210                                     Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
 211       }
 212
 213       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
 214                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
 215                                   false);
 216       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
 217                                   Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
 218                                   false);
 219       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
 220                                   Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
 221                                   false);
 222       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
 223                                   Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
 224                                   false);
 225       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
 226                                   Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
 227                                   false);
 228
 229     } else {
 230       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 231       OutStreamer->emitRawComment(
 232         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
 233     }
 234   }
 235
 236   if (STM.dumpCode()) {
 237
 238     OutStreamer->SwitchSection(
 239         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
 240
 241     for (size_t i = 0; i < DisasmLines.size(); ++i) {
 242       std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
 243       Comment += " ; " + HexLines[i] + "\n";
 244
 245       OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
 246       OutStreamer->EmitBytes(StringRef(Comment));
 247     }
 248   }
 249
 250   emitRuntimeMetadata(*MF.getFunction());
 251
 252   return false;
 253 }
 254
 255 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 256   unsigned MaxGPR = 0;
 257   bool killPixel = false;
 258   const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
 259   const R600RegisterInfo *RI = STM.getRegisterInfo();
 260   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 261
 262   for (const MachineBasicBlock &MBB : MF) {
 263     for (const MachineInstr &MI : MBB) {
 264       if (MI.getOpcode() == AMDGPU::KILLGT)
 265         killPixel = true;
 266       unsigned numOperands = MI.getNumOperands();
 267       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 268         const MachineOperand &MO = MI.getOperand(op_idx);
 269         if (!MO.isReg())
 270           continue;
 271         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
 272
 273         // Register with value > 127 aren't GPR
 274         if (HWReg > 127)
 275           continue;
 276         MaxGPR = std::max(MaxGPR, HWReg);
 277       }
 278     }
 279   }
 280
 281   unsigned RsrcReg;
 282   if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
 283     // Evergreen / Northern Islands
 284     switch (MF.getFunction()->getCallingConv()) {
 285     default: // Fall through
 286     case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
 287     case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
 288     case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
 289     case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
 290     }
 291   } else {
 292     // R600 / R700
 293     switch (MF.getFunction()->getCallingConv()) {
 294     default: // Fall through
 295     case CallingConv::AMDGPU_GS: // Fall through
 296     case CallingConv::AMDGPU_CS: // Fall through
 297     case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
 298     case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
 299     }
 300   }
 301
 302   OutStreamer->EmitIntValue(RsrcReg, 4);
 303   OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
 304                            S_STACK_SIZE(MFI->StackSize), 4);
 305   OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
 306   OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 307
 308   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
 309     OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
 310     OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4);
 311   }
 312 }
 313
 314 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 315                                         const MachineFunction &MF) const {
 316   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 317   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 318   uint64_t CodeSize = 0;
 319   unsigned MaxSGPR = 0;
 320   unsigned MaxVGPR = 0;
 321   bool VCCUsed = false;
 322   bool FlatUsed = false;
 323   const SIRegisterInfo *RI = STM.getRegisterInfo();
 324   const SIInstrInfo *TII = STM.getInstrInfo();
 325
 326   for (const MachineBasicBlock &MBB : MF) {
 327     for (const MachineInstr &MI : MBB) {
 328       // TODO: CodeSize should account for multiple functions.
 329
 330       // TODO: Should we count size of debug info?
 331       if (MI.isDebugValue())
 332         continue;
 333
 334       CodeSize += TII->getInstSizeInBytes(MI);
 335
 336       unsigned numOperands = MI.getNumOperands();
 337       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 338         const MachineOperand &MO = MI.getOperand(op_idx);
 339         unsigned width = 0;
 340         bool isSGPR = false;
 341
 342         if (!MO.isReg())
 343           continue;
 344
 345         unsigned reg = MO.getReg();
 346         switch (reg) {
 347         case AMDGPU::EXEC:
 348         case AMDGPU::EXEC_LO:
 349         case AMDGPU::EXEC_HI:
 350         case AMDGPU::SCC:
 351         case AMDGPU::M0:
 352           continue;
 353
 354         case AMDGPU::VCC:
 355         case AMDGPU::VCC_LO:
 356         case AMDGPU::VCC_HI:
 357           VCCUsed = true;
 358           continue;
 359
 360         case AMDGPU::FLAT_SCR:
 361         case AMDGPU::FLAT_SCR_LO:
 362         case AMDGPU::FLAT_SCR_HI:
 363           FlatUsed = true;
 364           continue;
 365
 366         case AMDGPU::TBA:
 367         case AMDGPU::TBA_LO:
 368         case AMDGPU::TBA_HI:
 369         case AMDGPU::TMA:
 370         case AMDGPU::TMA_LO:
 371         case AMDGPU::TMA_HI:
 372           llvm_unreachable("Trap Handler registers should not be used");
 373           continue;
 374
 375         default:
 376           break;
 377         }
 378
 379         if (AMDGPU::SReg_32RegClass.contains(reg)) {
 380           if (AMDGPU::TTMP_32RegClass.contains(reg)) {
 381             llvm_unreachable("Trap Handler registers should not be used");
 382           }
 383           isSGPR = true;
 384           width = 1;
 385         } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
 386           isSGPR = false;
 387           width = 1;
 388         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 389           if (AMDGPU::TTMP_64RegClass.contains(reg)) {
 390             llvm_unreachable("Trap Handler registers should not be used");
 391           }
 392           isSGPR = true;
 393           width = 2;
 394         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 395           isSGPR = false;
 396           width = 2;
 397         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
 398           isSGPR = false;
 399           width = 3;
 400         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 401           isSGPR = true;
 402           width = 4;
 403         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 404           isSGPR = false;
 405           width = 4;
 406         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 407           isSGPR = true;
 408           width = 8;
 409         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
 410           isSGPR = false;
 411           width = 8;
 412         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
 413           isSGPR = true;
 414           width = 16;
 415         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
 416           isSGPR = false;
 417           width = 16;
 418         } else {
 419           llvm_unreachable("Unknown register class");
 420         }
 421         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
 422         unsigned maxUsed = hwReg + width - 1;
 423         if (isSGPR) {
 424           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 425         } else {
 426           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 427         }
 428       }
 429     }
 430   }
 431
 432   unsigned ExtraSGPRs = 0;
 433
 434   if (VCCUsed)
 435     ExtraSGPRs = 2;
 436
 437   if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
 438     if (FlatUsed)
 439       ExtraSGPRs = 4;
 440   } else {
 441     if (STM.isXNACKEnabled())
 442       ExtraSGPRs = 4;
 443
 444     if (FlatUsed)
 445       ExtraSGPRs = 6;
 446   }
 447
 448   MaxSGPR += ExtraSGPRs;
 449
 450   // Record first reserved register and reserved register count fields, and
 451   // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
 452   // specified.
 453   if (STM.debuggerReserveRegs()) {
 454     ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
 455     ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
 456     MaxVGPR += MFI->getDebuggerReservedVGPRCount();
 457   }
 458
 459   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
 460   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
 461   // attribute was specified.
 462   if (STM.debuggerEmitPrologue()) {
 463     ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
 464       RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
 465     ProgInfo.DebuggerPrivateSegmentBufferSGPR =
 466       RI->getHWRegIndex(MFI->getScratchRSrcReg());
 467   }
 468
 469   // We found the maximum register index. They start at 0, so add one to get the
 470   // number of registers.
 471   ProgInfo.NumVGPR = MaxVGPR + 1;
 472   ProgInfo.NumSGPR = MaxSGPR + 1;
 473
 474   if (STM.hasSGPRInitBug()) {
 475     if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
 476       LLVMContext &Ctx = MF.getFunction()->getContext();
 477       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
 478                                        "SGPRs with SGPR init bug",
 479                                        ProgInfo.NumSGPR, DS_Error);
 480       Ctx.diagnose(Diag);
 481     }
 482
 483     ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
 484   }
 485
 486   if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
 487     LLVMContext &Ctx = MF.getFunction()->getContext();
 488     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
 489                                      MFI->NumUserSGPRs, DS_Error);
 490     Ctx.diagnose(Diag);
 491   }
 492
 493   if (MFI->LDSSize > static_cast<unsigned>(STM.getLocalMemorySize())) {
 494     LLVMContext &Ctx = MF.getFunction()->getContext();
 495     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
 496                                      MFI->LDSSize, DS_Error);
 497     Ctx.diagnose(Diag);
 498   }
 499
 500   ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
 501   ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
 502   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
 503   // register.
 504   ProgInfo.FloatMode = getFPMode(MF);
 505
 506   ProgInfo.IEEEMode = 0;
 507
 508   // Make clamp modifier on NaN input returns 0.
 509   ProgInfo.DX10Clamp = 1;
 510
 511   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 512   ProgInfo.ScratchSize = FrameInfo->getStackSize();
 513
 514   ProgInfo.FlatUsed = FlatUsed;
 515   ProgInfo.VCCUsed = VCCUsed;
 516   ProgInfo.CodeLen = CodeSize;
 517
 518   unsigned LDSAlignShift;
 519   if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
 520     // LDS is allocated in 64 dword blocks.
 521     LDSAlignShift = 8;
 522   } else {
 523     // LDS is allocated in 128 dword blocks.
 524     LDSAlignShift = 9;
 525   }
 526
 527   unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
 528                           MFI->getMaximumWorkGroupSize(MF);
 529
 530   ProgInfo.LDSSize = MFI->LDSSize + LDSSpillSize;
 531   ProgInfo.LDSBlocks =
 532       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 533
 534   // Scratch is allocated in 256 dword blocks.
 535   unsigned ScratchAlignShift = 10;
 536   // We need to program the hardware with the amount of scratch memory that
 537   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
 538   // scratch memory used per thread.
 539   ProgInfo.ScratchBlocks =
 540       alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
 541               1ULL << ScratchAlignShift) >>
 542       ScratchAlignShift;
 543
 544   ProgInfo.ComputePGMRSrc1 =
 545       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
 546       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
 547       S_00B848_PRIORITY(ProgInfo.Priority) |
 548       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
 549       S_00B848_PRIV(ProgInfo.Priv) |
 550       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
 551       S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
 552       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 553
 554   // 0 = X, 1 = XY, 2 = XYZ
 555   unsigned TIDIGCompCnt = 0;
 556   if (MFI->hasWorkItemIDZ())
 557     TIDIGCompCnt = 2;
 558   else if (MFI->hasWorkItemIDY())
 559     TIDIGCompCnt = 1;
 560
 561   ProgInfo.ComputePGMRSrc2 =
 562       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
 563       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
 564       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
 565       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
 566       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
 567       S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
 568       S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
 569       S_00B84C_EXCP_EN_MSB(0) |
 570       S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
 571       S_00B84C_EXCP_EN(0);
 572 }
 573
 574 static unsigned getRsrcReg(CallingConv::ID CallConv) {
 575   switch (CallConv) {
 576   default: // Fall through
 577   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
 578   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
 579   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
 580   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
 581   }
 582 }
 583
 584 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 585                                          const SIProgramInfo &KernelInfo) {
 586   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 587   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 588   unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
 589
 590   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
 591     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 592
 593     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
 594
 595     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
 596     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
 597
 598     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
 599     OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 600
 601     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
 602     // 0" comment but I don't see a corresponding field in the register spec.
 603   } else {
 604     OutStreamer->EmitIntValue(RsrcReg, 4);
 605     OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
 606                               S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
 607     if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
 608       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
 609       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 610     }
 611   }
 612
 613   if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
 614     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
 615     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
 616     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 617     OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
 618     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
 619     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
 620   }
 621
 622   OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
 623   OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
 624   OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
 625   OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
 626 }
 627
 628 // This is supposed to be log2(Size)
 629 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
 630   switch (Size) {
 631   case 4:
 632     return AMD_ELEMENT_4_BYTES;
 633   case 8:
 634     return AMD_ELEMENT_8_BYTES;
 635   case 16:
 636     return AMD_ELEMENT_16_BYTES;
 637   default:
 638     llvm_unreachable("invalid private_element_size");
 639   }
 640 }
 641
 642 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 643                                          const SIProgramInfo &KernelInfo) const {
 644   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 645   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 646   amd_kernel_code_t header;
 647
 648   AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
 649
 650   header.compute_pgm_resource_registers =
 651       KernelInfo.ComputePGMRSrc1 |
 652       (KernelInfo.ComputePGMRSrc2 << 32);
 653   header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
 654
 655
 656   AMD_HSA_BITS_SET(header.code_properties,
 657                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
 658                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
 659
 660   if (MFI->hasPrivateSegmentBuffer()) {
 661     header.code_properties |=
 662       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
 663   }
 664
 665   if (MFI->hasDispatchPtr())
 666     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 667
 668   if (MFI->hasQueuePtr())
 669     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
 670
 671   if (MFI->hasKernargSegmentPtr())
 672     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
 673
 674   if (MFI->hasDispatchID())
 675     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
 676
 677   if (MFI->hasFlatScratchInit())
 678     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
 679
 680   // TODO: Private segment size
 681
 682   if (MFI->hasGridWorkgroupCountX()) {
 683     header.code_properties |=
 684       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
 685   }
 686
 687   if (MFI->hasGridWorkgroupCountY()) {
 688     header.code_properties |=
 689       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
 690   }
 691
 692   if (MFI->hasGridWorkgroupCountZ()) {
 693     header.code_properties |=
 694       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
 695   }
 696
 697   if (MFI->hasDispatchPtr())
 698     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 699
 700   if (STM.debuggerSupported())
 701     header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
 702
 703   if (STM.isXNACKEnabled())
 704     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 705
 706   header.kernarg_segment_byte_size = MFI->ABIArgOffset;
 707   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
 708   header.workitem_vgpr_count = KernelInfo.NumVGPR;
 709   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
 710   header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
 711   header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
 712   header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
 713
 714   if (STM.debuggerEmitPrologue()) {
 715     header.debug_wavefront_private_segment_offset_sgpr =
 716       KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
 717     header.debug_private_segment_buffer_sgpr =
 718       KernelInfo.DebuggerPrivateSegmentBufferSGPR;
 719   }
 720
 721   AMDGPUTargetStreamer *TS =
 722       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 723
 724   OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
 725   TS->EmitAMDKernelCodeT(header);
 726 }
 727
 728 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
 729                                        unsigned AsmVariant,
 730                                        const char *ExtraCode, raw_ostream &O) {
 731   if (ExtraCode && ExtraCode[0]) {
 732     if (ExtraCode[1] != 0)
 733       return true; // Unknown modifier.
 734
 735     switch (ExtraCode[0]) {
 736     default:
 737       // See if this is a generic print operand
 738       return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
 739     case 'r':
 740       break;
 741     }
 742   }
 743
 744   AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
 745                    *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
 746   return false;
 747 }
 748
 749 // Emit a key and an integer value for runtime metadata.
 750 static void emitRuntimeMDIntValue(std::unique_ptr<MCStreamer> &Streamer,
 751                                   RuntimeMD::Key K, uint64_t V,
 752                                   unsigned Size) {
 753   Streamer->EmitIntValue(K, 1);
 754   Streamer->EmitIntValue(V, Size);
 755 }
 756
 757 // Emit a key and a string value for runtime metadata.
 758 static void emitRuntimeMDStringValue(std::unique_ptr<MCStreamer> &Streamer,
 759                                      RuntimeMD::Key K, StringRef S) {
 760   Streamer->EmitIntValue(K, 1);
 761   Streamer->EmitIntValue(S.size(), 4);
 762   Streamer->EmitBytes(S);
 763 }
 764
 765 // Emit a key and three integer values for runtime metadata.
 766 // The three integer values are obtained from MDNode \p Node;
 767 static void emitRuntimeMDThreeIntValues(std::unique_ptr<MCStreamer> &Streamer,
 768                                         RuntimeMD::Key K, MDNode *Node,
 769                                         unsigned Size) {
 770   Streamer->EmitIntValue(K, 1);
 771   Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
 772     Node->getOperand(0))->getZExtValue(), Size);
 773   Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
 774     Node->getOperand(1))->getZExtValue(), Size);
 775   Streamer->EmitIntValue(mdconst::extract<ConstantInt>(
 776     Node->getOperand(2))->getZExtValue(), Size);
 777 }
 778
 779 void AMDGPUAsmPrinter::emitStartOfRuntimeMetadata(const Module &M) {
 780   OutStreamer->SwitchSection(getObjFileLowering().getContext()
 781     .getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
 782
 783   emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyMDVersion,
 784                         RuntimeMD::MDVersion << 8 | RuntimeMD::MDRevision, 2);
 785   if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
 786     if (MD->getNumOperands()) {
 787       auto Node = MD->getOperand(0);
 788       if (Node->getNumOperands() > 1) {
 789         emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguage,
 790                               RuntimeMD::OpenCL_C, 1);
 791         uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
 792                          ->getZExtValue();
 793         uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
 794                          ->getZExtValue();
 795         emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyLanguageVersion,
 796                               Major * 100 + Minor * 10, 2);
 797       }
 798     }
 799   }
 800 }
 801
 802 static std::string getOCLTypeName(Type *Ty, bool isSigned) {
 803   if (VectorType* VecTy = dyn_cast<VectorType>(Ty)) {
 804     Type* EleTy = VecTy->getElementType();
 805     unsigned Size = VecTy->getVectorNumElements();
 806     return (Twine(getOCLTypeName(EleTy, isSigned)) + Twine(Size)).str();
 807   }
 808   switch (Ty->getTypeID()) {
 809   case Type::HalfTyID:   return "half";
 810   case Type::FloatTyID:  return "float";
 811   case Type::DoubleTyID: return "double";
 812   case Type::IntegerTyID: {
 813     if (!isSigned)
 814       return (Twine('u') + Twine(getOCLTypeName(Ty, true))).str();
 815     auto IntTy = cast<IntegerType>(Ty);
 816     auto BW = IntTy->getIntegerBitWidth();
 817     switch (BW) {
 818     case 8:
 819       return "char";
 820     case 16:
 821       return "short";
 822     case 32:
 823       return "int";
 824     case 64:
 825       return "long";
 826     default:
 827       return (Twine('i') + Twine(BW)).str();
 828     }
 829   }
 830   default:
 831     llvm_unreachable("invalid type");
 832   }
 833 }
 834
 835 static RuntimeMD::KernelArg::ValueType getRuntimeMDValueType(
 836          Type *Ty, StringRef TypeName) {
 837   if (auto VT = dyn_cast<VectorType>(Ty))
 838     return getRuntimeMDValueType(VT->getElementType(), TypeName);
 839   else if (auto PT = dyn_cast<PointerType>(Ty))
 840     return getRuntimeMDValueType(PT->getElementType(), TypeName);
 841   else if (Ty->isHalfTy())
 842     return RuntimeMD::KernelArg::F16;
 843   else if (Ty->isFloatTy())
 844     return RuntimeMD::KernelArg::F32;
 845   else if (Ty->isDoubleTy())
 846     return RuntimeMD::KernelArg::F64;
 847   else if (IntegerType* intTy = dyn_cast<IntegerType>(Ty)) {
 848     bool Signed = !TypeName.startswith("u");
 849     switch (intTy->getIntegerBitWidth()) {
 850     case 8:
 851       return Signed ? RuntimeMD::KernelArg::I8 : RuntimeMD::KernelArg::U8;
 852     case 16:
 853       return Signed ? RuntimeMD::KernelArg::I16 : RuntimeMD::KernelArg::U16;
 854     case 32:
 855       return Signed ? RuntimeMD::KernelArg::I32 : RuntimeMD::KernelArg::U32;
 856     case 64:
 857       return Signed ? RuntimeMD::KernelArg::I64 : RuntimeMD::KernelArg::U64;
 858     default:
 859       // Runtime does not recognize other integer types. Report as
 860       // struct type.
 861       return RuntimeMD::KernelArg::Struct;
 862     }
 863   } else
 864     return RuntimeMD::KernelArg::Struct;
 865 }
 866
 867 void AMDGPUAsmPrinter::emitRuntimeMetadata(const Function &F) {
 868   if (!F.getMetadata("kernel_arg_type"))
 869     return;
 870
 871   MCContext &Context = getObjFileLowering().getContext();
 872   OutStreamer->SwitchSection(
 873       Context.getELFSection(RuntimeMD::SectionName, ELF::SHT_PROGBITS, 0));
 874   OutStreamer->EmitIntValue(RuntimeMD::KeyKernelBegin, 1);
 875   emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyKernelName, F.getName());
 876
 877   for (auto &Arg:F.args()) {
 878     // Emit KeyArgBegin.
 879     unsigned I = Arg.getArgNo();
 880     OutStreamer->EmitIntValue(RuntimeMD::KeyArgBegin, 1);
 881
 882     // Emit KeyArgSize and KeyArgAlign.
 883     auto T = Arg.getType();
 884     auto DL = F.getParent()->getDataLayout();
 885     emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgSize,
 886                           DL.getTypeAllocSize(T), 4);
 887     emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAlign,
 888                           DL.getABITypeAlignment(T), 4);
 889
 890     // Emit KeyArgTypeName.
 891     auto TypeName = dyn_cast<MDString>(F.getMetadata(
 892       "kernel_arg_type")->getOperand(I))->getString();
 893     emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgTypeName, TypeName);
 894
 895     // Emit KeyArgName.
 896     if (auto ArgNameMD = F.getMetadata("kernel_arg_name")) {
 897       auto ArgName = cast<MDString>(ArgNameMD->getOperand(
 898         I))->getString();
 899       emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyArgName, ArgName);
 900     }
 901
 902     // Emit KeyArgIsVolatile, KeyArgIsRestrict, KeyArgIsConst and KeyArgIsPipe.
 903     auto TypeQual = cast<MDString>(F.getMetadata(
 904       "kernel_arg_type_qual")->getOperand(I))->getString();
 905     SmallVector<StringRef, 1> SplitQ;
 906     TypeQual.split(SplitQ, " ", -1, false/* drop empty entry*/);
 907     for (auto &I:SplitQ) {
 908       auto Key = StringSwitch<RuntimeMD::Key>(I)
 909         .Case("volatile", RuntimeMD::KeyArgIsVolatile)
 910         .Case("restrict", RuntimeMD::KeyArgIsRestrict)
 911         .Case("const",    RuntimeMD::KeyArgIsConst)
 912         .Case("pipe",     RuntimeMD::KeyArgIsPipe)
 913         .Default(RuntimeMD::KeyNull);
 914       OutStreamer->EmitIntValue(Key, 1);
 915     }
 916
 917     // Emit KeyArgTypeKind.
 918     auto BaseTypeName = cast<MDString>(
 919       F.getMetadata("kernel_arg_base_type")->getOperand(I))->getString();
 920     auto TypeKind = StringSwitch<RuntimeMD::KernelArg::TypeKind>(BaseTypeName)
 921       .Case("sampler_t", RuntimeMD::KernelArg::Sampler)
 922       .Case("queue_t",   RuntimeMD::KernelArg::Queue)
 923       .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
 924              "image2d_t" , "image2d_array_t",  RuntimeMD::KernelArg::Image)
 925       .Cases("image2d_depth_t", "image2d_array_depth_t",
 926              "image2d_msaa_t", "image2d_array_msaa_t",
 927              "image2d_msaa_depth_t",  RuntimeMD::KernelArg::Image)
 928       .Cases("image2d_array_msaa_depth_t", "image3d_t",
 929              RuntimeMD::KernelArg::Image)
 930       .Default(isa<PointerType>(T) ? RuntimeMD::KernelArg::Pointer :
 931                RuntimeMD::KernelArg::Value);
 932     emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgTypeKind, TypeKind, 1);
 933
 934     // Emit KeyArgValueType.
 935     emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgValueType,
 936                           getRuntimeMDValueType(T, BaseTypeName), 2);
 937
 938     // Emit KeyArgAccQual.
 939     auto AccQual = cast<MDString>(F.getMetadata(
 940       "kernel_arg_access_qual")->getOperand(I))->getString();
 941     auto AQ = StringSwitch<RuntimeMD::KernelArg::AccessQualifer>(AccQual)
 942       .Case("read_only",  RuntimeMD::KernelArg::ReadOnly)
 943       .Case("write_only", RuntimeMD::KernelArg::WriteOnly)
 944       .Case("read_write", RuntimeMD::KernelArg::ReadWrite)
 945       .Default(RuntimeMD::KernelArg::None);
 946     emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAccQual,
 947                           AQ, 1);
 948
 949     // Emit KeyArgAddrQual.
 950     if (isa<PointerType>(T))
 951       emitRuntimeMDIntValue(OutStreamer, RuntimeMD::KeyArgAddrQual,
 952                             T->getPointerAddressSpace(), 1);
 953
 954     // Emit KeyArgEnd
 955     OutStreamer->EmitIntValue(RuntimeMD::KeyArgEnd, 1);
 956   }
 957
 958   // Emit KeyReqdWorkGroupSize, KeyWorkGroupSizeHint, and KeyVecTypeHint.
 959   if (auto RWGS = F.getMetadata("reqd_work_group_size"))
 960     emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyReqdWorkGroupSize,
 961                                 RWGS, 4);
 962   if (auto WGSH = F.getMetadata("work_group_size_hint"))
 963     emitRuntimeMDThreeIntValues(OutStreamer, RuntimeMD::KeyWorkGroupSizeHint,
 964                                 WGSH, 4);
 965   if (auto VTH = F.getMetadata("vec_type_hint")) {
 966     auto TypeName = getOCLTypeName(cast<ValueAsMetadata>(
 967       VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
 968       VTH->getOperand(1))->getZExtValue());
 969     emitRuntimeMDStringValue(OutStreamer, RuntimeMD::KeyVecTypeHint,
 970                              TypeName);
 971   }
 972
 973   // Emit KeyKernelEnd
 974   OutStreamer->EmitIntValue(RuntimeMD::KeyKernelEnd, 1);
 975 }