contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

   1 //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 ///
  12 /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
  13 /// code.  When passed an MCAsmStreamer it prints assembly and when passed
  14 /// an MCObjectStreamer it outputs binary code.
  15 //
  16 //===----------------------------------------------------------------------===//
  17 //
  18
  19 #include "AMDGPUAsmPrinter.h"
  20 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
  21 #include "InstPrinter/AMDGPUInstPrinter.h"
  22 #include "Utils/AMDGPUBaseInfo.h"
  23 #include "AMDGPU.h"
  24 #include "AMDKernelCodeT.h"
  25 #include "AMDGPUSubtarget.h"
  26 #include "R600Defines.h"
  27 #include "R600MachineFunctionInfo.h"
  28 #include "R600RegisterInfo.h"
  29 #include "SIDefines.h"
  30 #include "SIMachineFunctionInfo.h"
  31 #include "SIInstrInfo.h"
  32 #include "SIRegisterInfo.h"
  33 #include "llvm/CodeGen/MachineFrameInfo.h"
  34 #include "llvm/IR/DiagnosticInfo.h"
  35 #include "llvm/MC/MCContext.h"
  36 #include "llvm/MC/MCSectionELF.h"
  37 #include "llvm/MC/MCStreamer.h"
  38 #include "llvm/Support/ELF.h"
  39 #include "llvm/Support/MathExtras.h"
  40 #include "llvm/Support/TargetRegistry.h"
  41 #include "llvm/Target/TargetLoweringObjectFile.h"
  42
  43 using namespace llvm;
  44
  45 // TODO: This should get the default rounding mode from the kernel. We just set
  46 // the default here, but this could change if the OpenCL rounding mode pragmas
  47 // are used.
  48 //
  49 // The denormal mode here should match what is reported by the OpenCL runtime
  50 // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
  51 // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
  52 //
  53 // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
  54 // precision, and leaves single precision to flush all and does not report
  55 // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
  56 // CL_FP_DENORM for both.
  57 //
  58 // FIXME: It seems some instructions do not support single precision denormals
  59 // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
  60 // and sin_f32, cos_f32 on most parts).
  61
  62 // We want to use these instructions, and using fp32 denormals also causes
  63 // instructions to run at the double precision rate for the device so it's
  64 // probably best to just report no single precision denormals.
  65 static uint32_t getFPMode(const MachineFunction &F) {
  66   const SISubtarget& ST = F.getSubtarget<SISubtarget>();
  67   // TODO: Is there any real use for the flush in only / flush out only modes?
  68
  69   uint32_t FP32Denormals =
  70     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  71
  72   uint32_t FP64Denormals =
  73     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
  74
  75   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
  76          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
  77          FP_DENORM_MODE_SP(FP32Denormals) |
  78          FP_DENORM_MODE_DP(FP64Denormals);
  79 }
  80
  81 static AsmPrinter *
  82 createAMDGPUAsmPrinterPass(TargetMachine &tm,
  83                            std::unique_ptr<MCStreamer> &&Streamer) {
  84   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
  85 }
  86
  87 extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
  88   TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
  89                                      createAMDGPUAsmPrinterPass);
  90   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
  91                                      createAMDGPUAsmPrinterPass);
  92 }
  93
  94 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
  95                                    std::unique_ptr<MCStreamer> Streamer)
  96   : AsmPrinter(TM, std::move(Streamer)) {}
  97
  98 StringRef AMDGPUAsmPrinter::getPassName() const {
  99   return "AMDGPU Assembly Printer";
 100 }
 101
 102 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
 103   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
 104     return;
 105
 106   // Need to construct an MCSubtargetInfo here in case we have no functions
 107   // in the module.
 108   std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
 109         TM.getTargetTriple().str(), TM.getTargetCPU(),
 110         TM.getTargetFeatureString()));
 111
 112   AMDGPUTargetStreamer *TS =
 113       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 114
 115   TS->EmitDirectiveHSACodeObjectVersion(2, 1);
 116
 117   AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
 118   TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
 119                                     "AMD", "AMDGPU");
 120
 121   // Emit runtime metadata.
 122   TS->EmitRuntimeMetadata(M);
 123 }
 124
 125 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
 126   const MachineBasicBlock *MBB) const {
 127   if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
 128     return false;
 129
 130   if (MBB->empty())
 131     return true;
 132
 133   // If this is a block implementing a long branch, an expression relative to
 134   // the start of the block is needed.  to the start of the block.
 135   // XXX - Is there a smarter way to check this?
 136   return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
 137 }
 138
 139
 140 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
 141   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
 142   SIProgramInfo KernelInfo;
 143   if (STM.isAmdCodeObjectV2(*MF)) {
 144     getSIProgramInfo(KernelInfo, *MF);
 145     EmitAmdKernelCodeT(*MF, KernelInfo);
 146   }
 147 }
 148
 149 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
 150   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
 151   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
 152   if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
 153     AMDGPUTargetStreamer *TS =
 154         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 155     SmallString<128> SymbolName;
 156     getNameWithPrefix(SymbolName, MF->getFunction()),
 157     TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
 158   }
 159
 160   AsmPrinter::EmitFunctionEntryLabel();
 161 }
 162
 163 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 164
 165   // Group segment variables aren't emitted in HSA.
 166   if (AMDGPU::isGroupSegment(GV))
 167     return;
 168
 169   AsmPrinter::EmitGlobalVariable(GV);
 170 }
 171
 172 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 173
 174   // The starting address of all shader programs must be 256 bytes aligned.
 175   MF.setAlignment(8);
 176
 177   SetupMachineFunction(MF);
 178
 179   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
 180   MCContext &Context = getObjFileLowering().getContext();
 181   if (!STM.isAmdHsaOS()) {
 182     MCSectionELF *ConfigSection =
 183         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
 184     OutStreamer->SwitchSection(ConfigSection);
 185   }
 186
 187   SIProgramInfo KernelInfo;
 188   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 189     getSIProgramInfo(KernelInfo, MF);
 190     if (!STM.isAmdHsaOS()) {
 191       EmitProgramInfoSI(MF, KernelInfo);
 192     }
 193   } else {
 194     EmitProgramInfoR600(MF);
 195   }
 196
 197   DisasmLines.clear();
 198   HexLines.clear();
 199   DisasmLineMaxLen = 0;
 200
 201   EmitFunctionBody();
 202
 203   if (isVerbose()) {
 204     MCSectionELF *CommentSection =
 205         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
 206     OutStreamer->SwitchSection(CommentSection);
 207
 208     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
 209       OutStreamer->emitRawComment(" Kernel info:", false);
 210       OutStreamer->emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
 211                                   false);
 212       OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
 213                                   false);
 214       OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
 215                                   false);
 216       OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
 217                                   false);
 218       OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
 219                                   false);
 220       OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
 221                                   false);
 222       OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
 223                                   " bytes/workgroup (compile time only)", false);
 224
 225       OutStreamer->emitRawComment(" SGPRBlocks: " +
 226                                   Twine(KernelInfo.SGPRBlocks), false);
 227       OutStreamer->emitRawComment(" VGPRBlocks: " +
 228                                   Twine(KernelInfo.VGPRBlocks), false);
 229
 230       OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
 231                                   Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
 232       OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
 233                                   Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
 234
 235       OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
 236                                   false);
 237       OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
 238                                   false);
 239
 240       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
 241         OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
 242                                     Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
 243         OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
 244                                     Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
 245       }
 246
 247       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
 248                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
 249                                   false);
 250       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
 251                                   Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
 252                                   false);
 253       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
 254                                   Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
 255                                   false);
 256       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
 257                                   Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
 258                                   false);
 259       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
 260                                   Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
 261                                   false);
 262
 263     } else {
 264       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 265       OutStreamer->emitRawComment(
 266         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
 267     }
 268   }
 269
 270   if (STM.dumpCode()) {
 271
 272     OutStreamer->SwitchSection(
 273         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
 274
 275     for (size_t i = 0; i < DisasmLines.size(); ++i) {
 276       std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
 277       Comment += " ; " + HexLines[i] + "\n";
 278
 279       OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
 280       OutStreamer->EmitBytes(StringRef(Comment));
 281     }
 282   }
 283
 284   return false;
 285 }
 286
 287 void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
 288   unsigned MaxGPR = 0;
 289   bool killPixel = false;
 290   const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
 291   const R600RegisterInfo *RI = STM.getRegisterInfo();
 292   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 293
 294   for (const MachineBasicBlock &MBB : MF) {
 295     for (const MachineInstr &MI : MBB) {
 296       if (MI.getOpcode() == AMDGPU::KILLGT)
 297         killPixel = true;
 298       unsigned numOperands = MI.getNumOperands();
 299       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 300         const MachineOperand &MO = MI.getOperand(op_idx);
 301         if (!MO.isReg())
 302           continue;
 303         unsigned HWReg = RI->getEncodingValue(MO.getReg()) & 0xff;
 304
 305         // Register with value > 127 aren't GPR
 306         if (HWReg > 127)
 307           continue;
 308         MaxGPR = std::max(MaxGPR, HWReg);
 309       }
 310     }
 311   }
 312
 313   unsigned RsrcReg;
 314   if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
 315     // Evergreen / Northern Islands
 316     switch (MF.getFunction()->getCallingConv()) {
 317     default: LLVM_FALLTHROUGH;
 318     case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
 319     case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
 320     case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
 321     case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
 322     }
 323   } else {
 324     // R600 / R700
 325     switch (MF.getFunction()->getCallingConv()) {
 326     default: LLVM_FALLTHROUGH;
 327     case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
 328     case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
 329     case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
 330     case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
 331     }
 332   }
 333
 334   OutStreamer->EmitIntValue(RsrcReg, 4);
 335   OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
 336                            S_STACK_SIZE(MFI->CFStackSize), 4);
 337   OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
 338   OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
 339
 340   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
 341     OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
 342     OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
 343   }
 344 }
 345
 346 void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 347                                         const MachineFunction &MF) const {
 348   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 349   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 350   uint64_t CodeSize = 0;
 351   unsigned MaxSGPR = 0;
 352   unsigned MaxVGPR = 0;
 353   bool VCCUsed = false;
 354   bool FlatUsed = false;
 355   const SIRegisterInfo *RI = STM.getRegisterInfo();
 356   const SIInstrInfo *TII = STM.getInstrInfo();
 357
 358   for (const MachineBasicBlock &MBB : MF) {
 359     for (const MachineInstr &MI : MBB) {
 360       // TODO: CodeSize should account for multiple functions.
 361
 362       // TODO: Should we count size of debug info?
 363       if (MI.isDebugValue())
 364         continue;
 365
 366       if (isVerbose())
 367         CodeSize += TII->getInstSizeInBytes(MI);
 368
 369       unsigned numOperands = MI.getNumOperands();
 370       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
 371         const MachineOperand &MO = MI.getOperand(op_idx);
 372         unsigned width = 0;
 373         bool isSGPR = false;
 374
 375         if (!MO.isReg())
 376           continue;
 377
 378         unsigned reg = MO.getReg();
 379         switch (reg) {
 380         case AMDGPU::EXEC:
 381         case AMDGPU::EXEC_LO:
 382         case AMDGPU::EXEC_HI:
 383         case AMDGPU::SCC:
 384         case AMDGPU::M0:
 385           continue;
 386
 387         case AMDGPU::VCC:
 388         case AMDGPU::VCC_LO:
 389         case AMDGPU::VCC_HI:
 390           VCCUsed = true;
 391           continue;
 392
 393         case AMDGPU::FLAT_SCR:
 394         case AMDGPU::FLAT_SCR_LO:
 395         case AMDGPU::FLAT_SCR_HI:
 396           // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
 397           // instructions aren't used to access the scratch buffer.
 398           if (MFI->hasFlatScratchInit())
 399             FlatUsed = true;
 400           continue;
 401
 402         case AMDGPU::TBA:
 403         case AMDGPU::TBA_LO:
 404         case AMDGPU::TBA_HI:
 405         case AMDGPU::TMA:
 406         case AMDGPU::TMA_LO:
 407         case AMDGPU::TMA_HI:
 408           llvm_unreachable("trap handler registers should not be used");
 409
 410         default:
 411           break;
 412         }
 413
 414         if (AMDGPU::SReg_32RegClass.contains(reg)) {
 415           assert(!AMDGPU::TTMP_32RegClass.contains(reg) &&
 416                  "trap handler registers should not be used");
 417           isSGPR = true;
 418           width = 1;
 419         } else if (AMDGPU::VGPR_32RegClass.contains(reg)) {
 420           isSGPR = false;
 421           width = 1;
 422         } else if (AMDGPU::SReg_64RegClass.contains(reg)) {
 423           assert(!AMDGPU::TTMP_64RegClass.contains(reg) &&
 424                  "trap handler registers should not be used");
 425           isSGPR = true;
 426           width = 2;
 427         } else if (AMDGPU::VReg_64RegClass.contains(reg)) {
 428           isSGPR = false;
 429           width = 2;
 430         } else if (AMDGPU::VReg_96RegClass.contains(reg)) {
 431           isSGPR = false;
 432           width = 3;
 433         } else if (AMDGPU::SReg_128RegClass.contains(reg)) {
 434           isSGPR = true;
 435           width = 4;
 436         } else if (AMDGPU::VReg_128RegClass.contains(reg)) {
 437           isSGPR = false;
 438           width = 4;
 439         } else if (AMDGPU::SReg_256RegClass.contains(reg)) {
 440           isSGPR = true;
 441           width = 8;
 442         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
 443           isSGPR = false;
 444           width = 8;
 445         } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
 446           isSGPR = true;
 447           width = 16;
 448         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
 449           isSGPR = false;
 450           width = 16;
 451         } else {
 452           llvm_unreachable("Unknown register class");
 453         }
 454         unsigned hwReg = RI->getEncodingValue(reg) & 0xff;
 455         unsigned maxUsed = hwReg + width - 1;
 456         if (isSGPR) {
 457           MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR;
 458         } else {
 459           MaxVGPR = maxUsed > MaxVGPR ? maxUsed : MaxVGPR;
 460         }
 461       }
 462     }
 463   }
 464
 465   unsigned ExtraSGPRs = 0;
 466
 467   if (VCCUsed)
 468     ExtraSGPRs = 2;
 469
 470   if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
 471     if (FlatUsed)
 472       ExtraSGPRs = 4;
 473   } else {
 474     if (STM.isXNACKEnabled())
 475       ExtraSGPRs = 4;
 476
 477     if (FlatUsed)
 478       ExtraSGPRs = 6;
 479   }
 480
 481   // Record first reserved register and reserved register count fields, and
 482   // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
 483   // requested.
 484   ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
 485   ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
 486
 487   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
 488   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
 489   // attribute was requested.
 490   if (STM.debuggerEmitPrologue()) {
 491     ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
 492       RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
 493     ProgInfo.DebuggerPrivateSegmentBufferSGPR =
 494       RI->getHWRegIndex(MFI->getScratchRSrcReg());
 495   }
 496
 497   // Check the addressable register limit before we add ExtraSGPRs.
 498   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
 499       !STM.hasSGPRInitBug()) {
 500     unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
 501     if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
 502       // This can happen due to a compiler bug or when using inline asm.
 503       LLVMContext &Ctx = MF.getFunction()->getContext();
 504       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
 505                                        "addressable scalar registers",
 506                                        MaxSGPR + 1, DS_Error,
 507                                        DK_ResourceLimit, MaxAddressableNumSGPRs);
 508       Ctx.diagnose(Diag);
 509       MaxSGPR = MaxAddressableNumSGPRs - 1;
 510     }
 511   }
 512
 513   // Account for extra SGPRs and VGPRs reserved for debugger use.
 514   MaxSGPR += ExtraSGPRs;
 515   MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
 516
 517   // We found the maximum register index. They start at 0, so add one to get the
 518   // number of registers.
 519   ProgInfo.NumVGPR = MaxVGPR + 1;
 520   ProgInfo.NumSGPR = MaxSGPR + 1;
 521
 522   // Adjust number of registers used to meet default/requested minimum/maximum
 523   // number of waves per execution unit request.
 524   ProgInfo.NumSGPRsForWavesPerEU = std::max(
 525     ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
 526   ProgInfo.NumVGPRsForWavesPerEU = std::max(
 527     ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
 528
 529   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
 530       STM.hasSGPRInitBug()) {
 531     unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
 532     if (ProgInfo.NumSGPR > MaxNumSGPRs) {
 533       // This can happen due to a compiler bug or when using inline asm to use the
 534       // registers which are usually reserved for vcc etc.
 535
 536       LLVMContext &Ctx = MF.getFunction()->getContext();
 537       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
 538                                        "scalar registers",
 539                                        ProgInfo.NumSGPR, DS_Error,
 540                                        DK_ResourceLimit, MaxNumSGPRs);
 541       Ctx.diagnose(Diag);
 542       ProgInfo.NumSGPR = MaxNumSGPRs;
 543       ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
 544     }
 545   }
 546
 547   if (STM.hasSGPRInitBug()) {
 548     ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
 549     ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
 550   }
 551
 552   if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
 553     LLVMContext &Ctx = MF.getFunction()->getContext();
 554     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
 555                                      MFI->NumUserSGPRs, DS_Error);
 556     Ctx.diagnose(Diag);
 557   }
 558
 559   if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
 560     LLVMContext &Ctx = MF.getFunction()->getContext();
 561     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
 562                                      MFI->getLDSSize(), DS_Error);
 563     Ctx.diagnose(Diag);
 564   }
 565
 566   // SGPRBlocks is actual number of SGPR blocks minus 1.
 567   ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
 568                                 RI->getSGPRAllocGranule());
 569   ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
 570
 571   // VGPRBlocks is actual number of VGPR blocks minus 1.
 572   ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
 573                                 RI->getVGPRAllocGranule());
 574   ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
 575
 576   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
 577   // register.
 578   ProgInfo.FloatMode = getFPMode(MF);
 579
 580   ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
 581
 582   // Make clamp modifier on NaN input returns 0.
 583   ProgInfo.DX10Clamp = 1;
 584
 585   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 586   ProgInfo.ScratchSize = FrameInfo.getStackSize();
 587
 588   ProgInfo.FlatUsed = FlatUsed;
 589   ProgInfo.VCCUsed = VCCUsed;
 590   ProgInfo.CodeLen = CodeSize;
 591
 592   unsigned LDSAlignShift;
 593   if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
 594     // LDS is allocated in 64 dword blocks.
 595     LDSAlignShift = 8;
 596   } else {
 597     // LDS is allocated in 128 dword blocks.
 598     LDSAlignShift = 9;
 599   }
 600
 601   unsigned LDSSpillSize =
 602     MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
 603
 604   ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
 605   ProgInfo.LDSBlocks =
 606       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
 607
 608   // Scratch is allocated in 256 dword blocks.
 609   unsigned ScratchAlignShift = 10;
 610   // We need to program the hardware with the amount of scratch memory that
 611   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
 612   // scratch memory used per thread.
 613   ProgInfo.ScratchBlocks =
 614       alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
 615               1ULL << ScratchAlignShift) >>
 616       ScratchAlignShift;
 617
 618   ProgInfo.ComputePGMRSrc1 =
 619       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
 620       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
 621       S_00B848_PRIORITY(ProgInfo.Priority) |
 622       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
 623       S_00B848_PRIV(ProgInfo.Priv) |
 624       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
 625       S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
 626       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
 627
 628   // 0 = X, 1 = XY, 2 = XYZ
 629   unsigned TIDIGCompCnt = 0;
 630   if (MFI->hasWorkItemIDZ())
 631     TIDIGCompCnt = 2;
 632   else if (MFI->hasWorkItemIDY())
 633     TIDIGCompCnt = 1;
 634
 635   ProgInfo.ComputePGMRSrc2 =
 636       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
 637       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
 638       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
 639       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
 640       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
 641       S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
 642       S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
 643       S_00B84C_EXCP_EN_MSB(0) |
 644       S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
 645       S_00B84C_EXCP_EN(0);
 646 }
 647
 648 static unsigned getRsrcReg(CallingConv::ID CallConv) {
 649   switch (CallConv) {
 650   default: LLVM_FALLTHROUGH;
 651   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
 652   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
 653   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
 654   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
 655   }
 656 }
 657
 658 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
 659                                          const SIProgramInfo &KernelInfo) {
 660   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 661   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 662   unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
 663
 664   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
 665     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
 666
 667     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
 668
 669     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
 670     OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
 671
 672     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
 673     OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 674
 675     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
 676     // 0" comment but I don't see a corresponding field in the register spec.
 677   } else {
 678     OutStreamer->EmitIntValue(RsrcReg, 4);
 679     OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
 680                               S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
 681     if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
 682       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
 683       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
 684     }
 685   }
 686
 687   if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
 688     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
 689     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
 690     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
 691     OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
 692     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
 693     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
 694   }
 695
 696   OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
 697   OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
 698   OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
 699   OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
 700 }
 701
 702 // This is supposed to be log2(Size)
 703 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
 704   switch (Size) {
 705   case 4:
 706     return AMD_ELEMENT_4_BYTES;
 707   case 8:
 708     return AMD_ELEMENT_8_BYTES;
 709   case 16:
 710     return AMD_ELEMENT_16_BYTES;
 711   default:
 712     llvm_unreachable("invalid private_element_size");
 713   }
 714 }
 715
 716 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 717                                          const SIProgramInfo &KernelInfo) const {
 718   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 719   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
 720   amd_kernel_code_t header;
 721
 722   AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
 723
 724   header.compute_pgm_resource_registers =
 725       KernelInfo.ComputePGMRSrc1 |
 726       (KernelInfo.ComputePGMRSrc2 << 32);
 727   header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
 728
 729
 730   AMD_HSA_BITS_SET(header.code_properties,
 731                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
 732                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
 733
 734   if (MFI->hasPrivateSegmentBuffer()) {
 735     header.code_properties |=
 736       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
 737   }
 738
 739   if (MFI->hasDispatchPtr())
 740     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 741
 742   if (MFI->hasQueuePtr())
 743     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
 744
 745   if (MFI->hasKernargSegmentPtr())
 746     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
 747
 748   if (MFI->hasDispatchID())
 749     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
 750
 751   if (MFI->hasFlatScratchInit())
 752     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
 753
 754   // TODO: Private segment size
 755
 756   if (MFI->hasGridWorkgroupCountX()) {
 757     header.code_properties |=
 758       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
 759   }
 760
 761   if (MFI->hasGridWorkgroupCountY()) {
 762     header.code_properties |=
 763       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
 764   }
 765
 766   if (MFI->hasGridWorkgroupCountZ()) {
 767     header.code_properties |=
 768       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
 769   }
 770
 771   if (MFI->hasDispatchPtr())
 772     header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 773
 774   if (STM.debuggerSupported())
 775     header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
 776
 777   if (STM.isXNACKEnabled())
 778     header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 779
 780   // FIXME: Should use getKernArgSize
 781   header.kernarg_segment_byte_size =
 782     STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
 783   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
 784   header.workitem_vgpr_count = KernelInfo.NumVGPR;
 785   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
 786   header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
 787   header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
 788   header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
 789
 790   // These alignment values are specified in powers of two, so alignment =
 791   // 2^n.  The minimum alignment is 2^4 = 16.
 792   header.kernarg_segment_alignment = std::max((size_t)4,
 793       countTrailingZeros(MFI->getMaxKernArgAlign()));
 794
 795   if (STM.debuggerEmitPrologue()) {
 796     header.debug_wavefront_private_segment_offset_sgpr =
 797       KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
 798     header.debug_private_segment_buffer_sgpr =
 799       KernelInfo.DebuggerPrivateSegmentBufferSGPR;
 800   }
 801
 802   AMDGPUTargetStreamer *TS =
 803       static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
 804
 805   OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
 806   TS->EmitAMDKernelCodeT(header);
 807 }
 808
 809 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
 810                                        unsigned AsmVariant,
 811                                        const char *ExtraCode, raw_ostream &O) {
 812   if (ExtraCode && ExtraCode[0]) {
 813     if (ExtraCode[1] != 0)
 814       return true; // Unknown modifier.
 815
 816     switch (ExtraCode[0]) {
 817     default:
 818       // See if this is a generic print operand
 819       return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
 820     case 'r':
 821       break;
 822     }
 823   }
 824
 825   AMDGPUInstPrinter::printRegOperand(MI->getOperand(OpNo).getReg(), O,
 826                    *TM.getSubtargetImpl(*MF->getFunction())->getRegisterInfo());
 827   return false;
 828 }