contrib/llvm/tools/llvm-mca/InstrBuilder.cpp

   1 //===--------------------- InstrBuilder.cpp ---------------------*- C++ -*-===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 ///
  11 /// This file implements the InstrBuilder interface.
  12 ///
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "InstrBuilder.h"
  16 #include "llvm/ADT/APInt.h"
  17 #include "llvm/ADT/DenseMap.h"
  18 #include "llvm/MC/MCInst.h"
  19 #include "llvm/Support/Debug.h"
  20 #include "llvm/Support/WithColor.h"
  21 #include "llvm/Support/raw_ostream.h"
  22
  23 #define DEBUG_TYPE "llvm-mca"
  24
  25 namespace mca {
  26
  27 using namespace llvm;
  28
  29 static void initializeUsedResources(InstrDesc &ID,
  30                                     const MCSchedClassDesc &SCDesc,
  31                                     const MCSubtargetInfo &STI,
  32                                     ArrayRef<uint64_t> ProcResourceMasks) {
  33   const MCSchedModel &SM = STI.getSchedModel();
  34
  35   // Populate resources consumed.
  36   using ResourcePlusCycles = std::pair<uint64_t, ResourceUsage>;
  37   std::vector<ResourcePlusCycles> Worklist;
  38
  39   // Track cycles contributed by resources that are in a "Super" relationship.
  40   // This is required if we want to correctly match the behavior of method
  41   // SubtargetEmitter::ExpandProcResource() in Tablegen. When computing the set
  42   // of "consumed" processor resources and resource cycles, the logic in
  43   // ExpandProcResource() doesn't update the number of resource cycles
  44   // contributed by a "Super" resource to a group.
  45   // We need to take this into account when we find that a processor resource is
  46   // part of a group, and it is also used as the "Super" of other resources.
  47   // This map stores the number of cycles contributed by sub-resources that are
  48   // part of a "Super" resource. The key value is the "Super" resource mask ID.
  49   DenseMap<uint64_t, unsigned> SuperResources;
  50
  51   for (unsigned I = 0, E = SCDesc.NumWriteProcResEntries; I < E; ++I) {
  52     const MCWriteProcResEntry *PRE = STI.getWriteProcResBegin(&SCDesc) + I;
  53     const MCProcResourceDesc &PR = *SM.getProcResource(PRE->ProcResourceIdx);
  54     uint64_t Mask = ProcResourceMasks[PRE->ProcResourceIdx];
  55     if (PR.BufferSize != -1)
  56       ID.Buffers.push_back(Mask);
  57     CycleSegment RCy(0, PRE->Cycles, false);
  58     Worklist.emplace_back(ResourcePlusCycles(Mask, ResourceUsage(RCy)));
  59     if (PR.SuperIdx) {
  60       uint64_t Super = ProcResourceMasks[PR.SuperIdx];
  61       SuperResources[Super] += PRE->Cycles;
  62     }
  63   }
  64
  65   // Sort elements by mask popcount, so that we prioritize resource units over
  66   // resource groups, and smaller groups over larger groups.
  67   llvm::sort(Worklist.begin(), Worklist.end(),
  68              [](const ResourcePlusCycles &A, const ResourcePlusCycles &B) {
  69                unsigned popcntA = countPopulation(A.first);
  70                unsigned popcntB = countPopulation(B.first);
  71                if (popcntA < popcntB)
  72                  return true;
  73                if (popcntA > popcntB)
  74                  return false;
  75                return A.first < B.first;
  76              });
  77
  78   uint64_t UsedResourceUnits = 0;
  79
  80   // Remove cycles contributed by smaller resources.
  81   for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
  82     ResourcePlusCycles &A = Worklist[I];
  83     if (!A.second.size()) {
  84       A.second.NumUnits = 0;
  85       A.second.setReserved();
  86       ID.Resources.emplace_back(A);
  87       continue;
  88     }
  89
  90     ID.Resources.emplace_back(A);
  91     uint64_t NormalizedMask = A.first;
  92     if (countPopulation(A.first) == 1) {
  93       UsedResourceUnits |= A.first;
  94     } else {
  95       // Remove the leading 1 from the resource group mask.
  96       NormalizedMask ^= PowerOf2Floor(NormalizedMask);
  97     }
  98
  99     for (unsigned J = I + 1; J < E; ++J) {
 100       ResourcePlusCycles &B = Worklist[J];
 101       if ((NormalizedMask & B.first) == NormalizedMask) {
 102         B.second.CS.Subtract(A.second.size() - SuperResources[A.first]);
 103         if (countPopulation(B.first) > 1)
 104           B.second.NumUnits++;
 105       }
 106     }
 107   }
 108
 109   // A SchedWrite may specify a number of cycles in which a resource group
 110   // is reserved. For example (on target x86; cpu Haswell):
 111   //
 112   //  SchedWriteRes<[HWPort0, HWPort1, HWPort01]> {
 113   //    let ResourceCycles = [2, 2, 3];
 114   //  }
 115   //
 116   // This means:
 117   // Resource units HWPort0 and HWPort1 are both used for 2cy.
 118   // Resource group HWPort01 is the union of HWPort0 and HWPort1.
 119   // Since this write touches both HWPort0 and HWPort1 for 2cy, HWPort01
 120   // will not be usable for 2 entire cycles from instruction issue.
 121   //
 122   // On top of those 2cy, SchedWriteRes explicitly specifies an extra latency
 123   // of 3 cycles for HWPort01. This tool assumes that the 3cy latency is an
 124   // extra delay on top of the 2 cycles latency.
 125   // During those extra cycles, HWPort01 is not usable by other instructions.
 126   for (ResourcePlusCycles &RPC : ID.Resources) {
 127     if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
 128       // Remove the leading 1 from the resource group mask.
 129       uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
 130       if ((Mask & UsedResourceUnits) == Mask)
 131         RPC.second.setReserved();
 132     }
 133   }
 134
 135   LLVM_DEBUG({
 136     for (const std::pair<uint64_t, ResourceUsage> &R : ID.Resources)
 137       dbgs() << "\t\tMask=" << R.first << ", cy=" << R.second.size() << '\n';
 138     for (const uint64_t R : ID.Buffers)
 139       dbgs() << "\t\tBuffer Mask=" << R << '\n';
 140   });
 141 }
 142
 143 static void computeMaxLatency(InstrDesc &ID, const MCInstrDesc &MCDesc,
 144                               const MCSchedClassDesc &SCDesc,
 145                               const MCSubtargetInfo &STI) {
 146   if (MCDesc.isCall()) {
 147     // We cannot estimate how long this call will take.
 148     // Artificially set an arbitrarily high latency (100cy).
 149     ID.MaxLatency = 100U;
 150     return;
 151   }
 152
 153   int Latency = MCSchedModel::computeInstrLatency(STI, SCDesc);
 154   // If latency is unknown, then conservatively assume a MaxLatency of 100cy.
 155   ID.MaxLatency = Latency < 0 ? 100U : static_cast<unsigned>(Latency);
 156 }
 157
 158 void InstrBuilder::populateWrites(InstrDesc &ID, const MCInst &MCI,
 159                                   unsigned SchedClassID) {
 160   const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
 161   const MCSchedModel &SM = STI.getSchedModel();
 162   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
 163
 164   // These are for now the (strong) assumptions made by this algorithm:
 165   //  * The number of explicit and implicit register definitions in a MCInst
 166   //    matches the number of explicit and implicit definitions according to
 167   //    the opcode descriptor (MCInstrDesc).
 168   //  * Register definitions take precedence over register uses in the operands
 169   //    list.
 170   //  * If an opcode specifies an optional definition, then the optional
 171   //    definition is always the last operand in the sequence, and it can be
 172   //    set to zero (i.e. "no register").
 173   //
 174   // These assumptions work quite well for most out-of-order in-tree targets
 175   // like x86. This is mainly because the vast majority of instructions is
 176   // expanded to MCInst using a straightforward lowering logic that preserves
 177   // the ordering of the operands.
 178   unsigned NumExplicitDefs = MCDesc.getNumDefs();
 179   unsigned NumImplicitDefs = MCDesc.getNumImplicitDefs();
 180   unsigned NumWriteLatencyEntries = SCDesc.NumWriteLatencyEntries;
 181   unsigned TotalDefs = NumExplicitDefs + NumImplicitDefs;
 182   if (MCDesc.hasOptionalDef())
 183     TotalDefs++;
 184   ID.Writes.resize(TotalDefs);
 185   // Iterate over the operands list, and skip non-register operands.
 186   // The first NumExplictDefs register operands are expected to be register
 187   // definitions.
 188   unsigned CurrentDef = 0;
 189   unsigned i = 0;
 190   for (; i < MCI.getNumOperands() && CurrentDef < NumExplicitDefs; ++i) {
 191     const MCOperand &Op = MCI.getOperand(i);
 192     if (!Op.isReg())
 193       continue;
 194
 195     WriteDescriptor &Write = ID.Writes[CurrentDef];
 196     Write.OpIndex = i;
 197     if (CurrentDef < NumWriteLatencyEntries) {
 198       const MCWriteLatencyEntry &WLE =
 199           *STI.getWriteLatencyEntry(&SCDesc, CurrentDef);
 200       // Conservatively default to MaxLatency.
 201       Write.Latency =
 202           WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
 203       Write.SClassOrWriteResourceID = WLE.WriteResourceID;
 204     } else {
 205       // Assign a default latency for this write.
 206       Write.Latency = ID.MaxLatency;
 207       Write.SClassOrWriteResourceID = 0;
 208     }
 209     Write.IsOptionalDef = false;
 210     LLVM_DEBUG({
 211       dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
 212              << ", Latency=" << Write.Latency
 213              << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
 214     });
 215     CurrentDef++;
 216   }
 217
 218   if (CurrentDef != NumExplicitDefs)
 219     llvm::report_fatal_error(
 220         "error: Expected more register operand definitions. ");
 221
 222   CurrentDef = 0;
 223   for (CurrentDef = 0; CurrentDef < NumImplicitDefs; ++CurrentDef) {
 224     unsigned Index = NumExplicitDefs + CurrentDef;
 225     WriteDescriptor &Write = ID.Writes[Index];
 226     Write.OpIndex = ~CurrentDef;
 227     Write.RegisterID = MCDesc.getImplicitDefs()[CurrentDef];
 228     if (Index < NumWriteLatencyEntries) {
 229       const MCWriteLatencyEntry &WLE =
 230           *STI.getWriteLatencyEntry(&SCDesc, Index);
 231       // Conservatively default to MaxLatency.
 232       Write.Latency =
 233           WLE.Cycles < 0 ? ID.MaxLatency : static_cast<unsigned>(WLE.Cycles);
 234       Write.SClassOrWriteResourceID = WLE.WriteResourceID;
 235     } else {
 236       // Assign a default latency for this write.
 237       Write.Latency = ID.MaxLatency;
 238       Write.SClassOrWriteResourceID = 0;
 239     }
 240
 241     Write.IsOptionalDef = false;
 242     assert(Write.RegisterID != 0 && "Expected a valid phys register!");
 243     LLVM_DEBUG({
 244       dbgs() << "\t\t[Def] OpIdx=" << Write.OpIndex
 245              << ", PhysReg=" << MRI.getName(Write.RegisterID)
 246              << ", Latency=" << Write.Latency
 247              << ", WriteResourceID=" << Write.SClassOrWriteResourceID << '\n';
 248     });
 249   }
 250
 251   if (MCDesc.hasOptionalDef()) {
 252     // Always assume that the optional definition is the last operand of the
 253     // MCInst sequence.
 254     const MCOperand &Op = MCI.getOperand(MCI.getNumOperands() - 1);
 255     if (i == MCI.getNumOperands() || !Op.isReg())
 256       llvm::report_fatal_error(
 257           "error: expected a register operand for an optional "
 258           "definition. Instruction has not be correctly analyzed.\n",
 259           false);
 260
 261     WriteDescriptor &Write = ID.Writes[TotalDefs - 1];
 262     Write.OpIndex = MCI.getNumOperands() - 1;
 263     // Assign a default latency for this write.
 264     Write.Latency = ID.MaxLatency;
 265     Write.SClassOrWriteResourceID = 0;
 266     Write.IsOptionalDef = true;
 267   }
 268 }
 269
 270 void InstrBuilder::populateReads(InstrDesc &ID, const MCInst &MCI,
 271                                  unsigned SchedClassID) {
 272   const MCInstrDesc &MCDesc = MCII.get(MCI.getOpcode());
 273   unsigned NumExplicitDefs = MCDesc.getNumDefs();
 274
 275   // Skip explicit definitions.
 276   unsigned i = 0;
 277   for (; i < MCI.getNumOperands() && NumExplicitDefs; ++i) {
 278     const MCOperand &Op = MCI.getOperand(i);
 279     if (Op.isReg())
 280       NumExplicitDefs--;
 281   }
 282
 283   if (NumExplicitDefs)
 284     llvm::report_fatal_error(
 285         "error: Expected more register operand definitions. ", false);
 286
 287   unsigned NumExplicitUses = MCI.getNumOperands() - i;
 288   unsigned NumImplicitUses = MCDesc.getNumImplicitUses();
 289   if (MCDesc.hasOptionalDef()) {
 290     assert(NumExplicitUses);
 291     NumExplicitUses--;
 292   }
 293   unsigned TotalUses = NumExplicitUses + NumImplicitUses;
 294   if (!TotalUses)
 295     return;
 296
 297   ID.Reads.resize(TotalUses);
 298   for (unsigned CurrentUse = 0; CurrentUse < NumExplicitUses; ++CurrentUse) {
 299     ReadDescriptor &Read = ID.Reads[CurrentUse];
 300     Read.OpIndex = i + CurrentUse;
 301     Read.UseIndex = CurrentUse;
 302     Read.SchedClassID = SchedClassID;
 303     LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex
 304                       << ", UseIndex=" << Read.UseIndex << '\n');
 305   }
 306
 307   for (unsigned CurrentUse = 0; CurrentUse < NumImplicitUses; ++CurrentUse) {
 308     ReadDescriptor &Read = ID.Reads[NumExplicitUses + CurrentUse];
 309     Read.OpIndex = ~CurrentUse;
 310     Read.UseIndex = NumExplicitUses + CurrentUse;
 311     Read.RegisterID = MCDesc.getImplicitUses()[CurrentUse];
 312     Read.SchedClassID = SchedClassID;
 313     LLVM_DEBUG(dbgs() << "\t\t[Use] OpIdx=" << Read.OpIndex << ", RegisterID="
 314                       << MRI.getName(Read.RegisterID) << '\n');
 315   }
 316 }
 317
 318 const InstrDesc &InstrBuilder::createInstrDescImpl(const MCInst &MCI) {
 319   assert(STI.getSchedModel().hasInstrSchedModel() &&
 320          "Itineraries are not yet supported!");
 321
 322   // Obtain the instruction descriptor from the opcode.
 323   unsigned short Opcode = MCI.getOpcode();
 324   const MCInstrDesc &MCDesc = MCII.get(Opcode);
 325   const MCSchedModel &SM = STI.getSchedModel();
 326
 327   // Then obtain the scheduling class information from the instruction.
 328   unsigned SchedClassID = MCDesc.getSchedClass();
 329   unsigned CPUID = SM.getProcessorID();
 330
 331   // Try to solve variant scheduling classes.
 332   if (SchedClassID) {
 333     while (SchedClassID && SM.getSchedClassDesc(SchedClassID)->isVariant())
 334       SchedClassID = STI.resolveVariantSchedClass(SchedClassID, &MCI, CPUID);
 335
 336     if (!SchedClassID)
 337       llvm::report_fatal_error("unable to resolve this variant class.");
 338   }
 339
 340   // Check if this instruction is supported. Otherwise, report a fatal error.
 341   const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SchedClassID);
 342   if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
 343     std::string ToString;
 344     llvm::raw_string_ostream OS(ToString);
 345     WithColor::error() << "found an unsupported instruction in the input"
 346                        << " assembly sequence.\n";
 347     MCIP.printInst(&MCI, OS, "", STI);
 348     OS.flush();
 349
 350     WithColor::note() << "instruction: " << ToString << '\n';
 351     llvm::report_fatal_error(
 352         "Don't know how to analyze unsupported instructions.");
 353   }
 354
 355   // Create a new empty descriptor.
 356   std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
 357   ID->NumMicroOps = SCDesc.NumMicroOps;
 358
 359   if (MCDesc.isCall()) {
 360     // We don't correctly model calls.
 361     WithColor::warning() << "found a call in the input assembly sequence.\n";
 362     WithColor::note() << "call instructions are not correctly modeled. "
 363                       << "Assume a latency of 100cy.\n";
 364   }
 365
 366   if (MCDesc.isReturn()) {
 367     WithColor::warning() << "found a return instruction in the input"
 368                          << " assembly sequence.\n";
 369     WithColor::note() << "program counter updates are ignored.\n";
 370   }
 371
 372   ID->MayLoad = MCDesc.mayLoad();
 373   ID->MayStore = MCDesc.mayStore();
 374   ID->HasSideEffects = MCDesc.hasUnmodeledSideEffects();
 375
 376   initializeUsedResources(*ID, SCDesc, STI, ProcResourceMasks);
 377   computeMaxLatency(*ID, MCDesc, SCDesc, STI);
 378   populateWrites(*ID, MCI, SchedClassID);
 379   populateReads(*ID, MCI, SchedClassID);
 380
 381   LLVM_DEBUG(dbgs() << "\t\tMaxLatency=" << ID->MaxLatency << '\n');
 382   LLVM_DEBUG(dbgs() << "\t\tNumMicroOps=" << ID->NumMicroOps << '\n');
 383
 384   // Now add the new descriptor.
 385   SchedClassID = MCDesc.getSchedClass();
 386   if (!SM.getSchedClassDesc(SchedClassID)->isVariant()) {
 387     Descriptors[MCI.getOpcode()] = std::move(ID);
 388     return *Descriptors[MCI.getOpcode()];
 389   }
 390
 391   VariantDescriptors[&MCI] = std::move(ID);
 392   return *VariantDescriptors[&MCI];
 393 }
 394
 395 const InstrDesc &InstrBuilder::getOrCreateInstrDesc(const MCInst &MCI) {
 396   if (Descriptors.find_as(MCI.getOpcode()) != Descriptors.end())
 397     return *Descriptors[MCI.getOpcode()];
 398
 399   if (VariantDescriptors.find(&MCI) != VariantDescriptors.end())
 400     return *VariantDescriptors[&MCI];
 401
 402   return createInstrDescImpl(MCI);
 403 }
 404
 405 std::unique_ptr<Instruction>
 406 InstrBuilder::createInstruction(const MCInst &MCI) {
 407   const InstrDesc &D = getOrCreateInstrDesc(MCI);
 408   std::unique_ptr<Instruction> NewIS = llvm::make_unique<Instruction>(D);
 409
 410   // Initialize Reads first.
 411   for (const ReadDescriptor &RD : D.Reads) {
 412     int RegID = -1;
 413     if (!RD.isImplicitRead()) {
 414       // explicit read.
 415       const MCOperand &Op = MCI.getOperand(RD.OpIndex);
 416       // Skip non-register operands.
 417       if (!Op.isReg())
 418         continue;
 419       RegID = Op.getReg();
 420     } else {
 421       // Implicit read.
 422       RegID = RD.RegisterID;
 423     }
 424
 425     // Skip invalid register operands.
 426     if (!RegID)
 427       continue;
 428
 429     // Okay, this is a register operand. Create a ReadState for it.
 430     assert(RegID > 0 && "Invalid register ID found!");
 431     NewIS->getUses().emplace_back(llvm::make_unique<ReadState>(RD, RegID));
 432   }
 433
 434   // Early exit if there are no writes.
 435   if (D.Writes.empty())
 436     return NewIS;
 437
 438   // Track register writes that implicitly clear the upper portion of the
 439   // underlying super-registers using an APInt.
 440   APInt WriteMask(D.Writes.size(), 0);
 441
 442   // Now query the MCInstrAnalysis object to obtain information about which
 443   // register writes implicitly clear the upper portion of a super-register.
 444   MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
 445
 446   // Check if this is a dependency breaking instruction.
 447   if (MCIA.isDependencyBreaking(STI, MCI))
 448     NewIS->setDependencyBreaking();
 449
 450   // Initialize writes.
 451   unsigned WriteIndex = 0;
 452   for (const WriteDescriptor &WD : D.Writes) {
 453     unsigned RegID = WD.isImplicitWrite() ? WD.RegisterID
 454                                           : MCI.getOperand(WD.OpIndex).getReg();
 455     // Check if this is a optional definition that references NoReg.
 456     if (WD.IsOptionalDef && !RegID) {
 457       ++WriteIndex;
 458       continue;
 459     }
 460
 461     assert(RegID && "Expected a valid register ID!");
 462     NewIS->getDefs().emplace_back(llvm::make_unique<WriteState>(
 463         WD, RegID, /* ClearsSuperRegs */ WriteMask[WriteIndex]));
 464     ++WriteIndex;
 465   }
 466
 467   return NewIS;
 468 }
 469 } // namespace mca