contrib/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp

   1 //====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 /// \file
  10 ///
  11 /// Provide a pass which mitigates speculative execution attacks which operate
  12 /// by speculating incorrectly past some predicate (a type check, bounds check,
  13 /// or other condition) to reach a load with invalid inputs and leak the data
  14 /// accessed by that load using a side channel out of the speculative domain.
  15 ///
  16 /// For details on the attacks, see the first variant in both the Project Zero
  17 /// writeup and the Spectre paper:
  18 /// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html
  19 /// https://spectreattack.com/spectre.pdf
  20 ///
  21 //===----------------------------------------------------------------------===//
  22
  23 #include "X86.h"
  24 #include "X86InstrBuilder.h"
  25 #include "X86InstrInfo.h"
  26 #include "X86Subtarget.h"
  27 #include "llvm/ADT/ArrayRef.h"
  28 #include "llvm/ADT/DenseMap.h"
  29 #include "llvm/ADT/Optional.h"
  30 #include "llvm/ADT/STLExtras.h"
  31 #include "llvm/ADT/ScopeExit.h"
  32 #include "llvm/ADT/SmallPtrSet.h"
  33 #include "llvm/ADT/SmallSet.h"
  34 #include "llvm/ADT/SmallVector.h"
  35 #include "llvm/ADT/SparseBitVector.h"
  36 #include "llvm/ADT/Statistic.h"
  37 #include "llvm/CodeGen/MachineBasicBlock.h"
  38 #include "llvm/CodeGen/MachineConstantPool.h"
  39 #include "llvm/CodeGen/MachineFunction.h"
  40 #include "llvm/CodeGen/MachineFunctionPass.h"
  41 #include "llvm/CodeGen/MachineInstr.h"
  42 #include "llvm/CodeGen/MachineInstrBuilder.h"
  43 #include "llvm/CodeGen/MachineModuleInfo.h"
  44 #include "llvm/CodeGen/MachineOperand.h"
  45 #include "llvm/CodeGen/MachineRegisterInfo.h"
  46 #include "llvm/CodeGen/MachineSSAUpdater.h"
  47 #include "llvm/CodeGen/TargetInstrInfo.h"
  48 #include "llvm/CodeGen/TargetRegisterInfo.h"
  49 #include "llvm/CodeGen/TargetSchedule.h"
  50 #include "llvm/CodeGen/TargetSubtargetInfo.h"
  51 #include "llvm/IR/DebugLoc.h"
  52 #include "llvm/MC/MCSchedule.h"
  53 #include "llvm/Pass.h"
  54 #include "llvm/Support/CommandLine.h"
  55 #include "llvm/Support/Debug.h"
  56 #include "llvm/Support/raw_ostream.h"
  57 #include <algorithm>
  58 #include <cassert>
  59 #include <iterator>
  60 #include <utility>
  61
  62 using namespace llvm;
  63
  64 #define PASS_KEY "x86-slh"
  65 #define DEBUG_TYPE PASS_KEY
  66
  67 STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced");
  68 STATISTIC(NumBranchesUntraced, "Number of branches unable to trace");
  69 STATISTIC(NumAddrRegsHardened,
  70           "Number of address mode used registers hardaned");
  71 STATISTIC(NumPostLoadRegsHardened,
  72           "Number of post-load register values hardened");
  73 STATISTIC(NumCallsOrJumpsHardened,
  74           "Number of calls or jumps requiring extra hardening");
  75 STATISTIC(NumInstsInserted, "Number of instructions inserted");
  76 STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
  77
  78 static cl::opt<bool> EnableSpeculativeLoadHardening(
  79     "x86-speculative-load-hardening",
  80     cl::desc("Force enable speculative load hardening"), cl::init(false),
  81     cl::Hidden);
  82
  83 static cl::opt<bool> HardenEdgesWithLFENCE(
  84     PASS_KEY "-lfence",
  85     cl::desc(
  86         "Use LFENCE along each conditional edge to harden against speculative "
  87         "loads rather than conditional movs and poisoned pointers."),
  88     cl::init(false), cl::Hidden);
  89
  90 static cl::opt<bool> EnablePostLoadHardening(
  91     PASS_KEY "-post-load",
  92     cl::desc("Harden the value loaded *after* it is loaded by "
  93              "flushing the loaded bits to 1. This is hard to do "
  94              "in general but can be done easily for GPRs."),
  95     cl::init(true), cl::Hidden);
  96
  97 static cl::opt<bool> FenceCallAndRet(
  98     PASS_KEY "-fence-call-and-ret",
  99     cl::desc("Use a full speculation fence to harden both call and ret edges "
 100              "rather than a lighter weight mitigation."),
 101     cl::init(false), cl::Hidden);
 102
 103 static cl::opt<bool> HardenInterprocedurally(
 104     PASS_KEY "-ip",
 105     cl::desc("Harden interprocedurally by passing our state in and out of "
 106              "functions in the high bits of the stack pointer."),
 107     cl::init(true), cl::Hidden);
 108
 109 static cl::opt<bool>
 110     HardenLoads(PASS_KEY "-loads",
 111                 cl::desc("Sanitize loads from memory. When disable, no "
 112                          "significant security is provided."),
 113                 cl::init(true), cl::Hidden);
 114
 115 static cl::opt<bool> HardenIndirectCallsAndJumps(
 116     PASS_KEY "-indirect",
 117     cl::desc("Harden indirect calls and jumps against using speculatively "
 118              "stored attacker controlled addresses. This is designed to "
 119              "mitigate Spectre v1.2 style attacks."),
 120     cl::init(true), cl::Hidden);
 121
 122 namespace {
 123
 124 class X86SpeculativeLoadHardeningPass : public MachineFunctionPass {
 125 public:
 126   X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) {
 127     initializeX86SpeculativeLoadHardeningPassPass(
 128         *PassRegistry::getPassRegistry());
 129   }
 130
 131   StringRef getPassName() const override {
 132     return "X86 speculative load hardening";
 133   }
 134   bool runOnMachineFunction(MachineFunction &MF) override;
 135   void getAnalysisUsage(AnalysisUsage &AU) const override;
 136
 137   /// Pass identification, replacement for typeid.
 138   static char ID;
 139
 140 private:
 141   /// The information about a block's conditional terminators needed to trace
 142   /// our predicate state through the exiting edges.
 143   struct BlockCondInfo {
 144     MachineBasicBlock *MBB;
 145
 146     // We mostly have one conditional branch, and in extremely rare cases have
 147     // two. Three and more are so rare as to be unimportant for compile time.
 148     SmallVector<MachineInstr *, 2> CondBrs;
 149
 150     MachineInstr *UncondBr;
 151   };
 152
 153   /// Manages the predicate state traced through the program.
 154   struct PredState {
 155     unsigned InitialReg;
 156     unsigned PoisonReg;
 157
 158     const TargetRegisterClass *RC;
 159     MachineSSAUpdater SSA;
 160
 161     PredState(MachineFunction &MF, const TargetRegisterClass *RC)
 162         : RC(RC), SSA(MF) {}
 163   };
 164
 165   const X86Subtarget *Subtarget;
 166   MachineRegisterInfo *MRI;
 167   const X86InstrInfo *TII;
 168   const TargetRegisterInfo *TRI;
 169
 170   Optional<PredState> PS;
 171
 172   void hardenEdgesWithLFENCE(MachineFunction &MF);
 173
 174   SmallVector<BlockCondInfo, 16> collectBlockCondInfo(MachineFunction &MF);
 175
 176   SmallVector<MachineInstr *, 16>
 177   tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
 178
 179   void unfoldCallAndJumpLoads(MachineFunction &MF);
 180
 181   SmallVector<MachineInstr *, 16>
 182   tracePredStateThroughIndirectBranches(MachineFunction &MF);
 183
 184   void tracePredStateThroughBlocksAndHarden(MachineFunction &MF);
 185
 186   unsigned saveEFLAGS(MachineBasicBlock &MBB,
 187                       MachineBasicBlock::iterator InsertPt, DebugLoc Loc);
 188   void restoreEFLAGS(MachineBasicBlock &MBB,
 189                      MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
 190                      unsigned OFReg);
 191
 192   void mergePredStateIntoSP(MachineBasicBlock &MBB,
 193                             MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
 194                             unsigned PredStateReg);
 195   unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
 196                                   MachineBasicBlock::iterator InsertPt,
 197                                   DebugLoc Loc);
 198
 199   void
 200   hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
 201                  MachineOperand &IndexMO,
 202                  SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
 203   MachineInstr *
 204   sinkPostLoadHardenedInst(MachineInstr &MI,
 205                            SmallPtrSetImpl<MachineInstr *> &HardenedInstrs);
 206   bool canHardenRegister(unsigned Reg);
 207   unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
 208                                  MachineBasicBlock::iterator InsertPt,
 209                                  DebugLoc Loc);
 210   unsigned hardenPostLoad(MachineInstr &MI);
 211   void hardenReturnInstr(MachineInstr &MI);
 212   void tracePredStateThroughCall(MachineInstr &MI);
 213   void hardenIndirectCallOrJumpInstr(
 214       MachineInstr &MI,
 215       SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
 216 };
 217
 218 } // end anonymous namespace
 219
  220 char X86SpeculativeLoadHardeningPass::ID = 0;
  221
      /// Declares the analyses this pass depends on. Nothing beyond what the
      /// MachineFunctionPass base class already requests is added.
  222 void X86SpeculativeLoadHardeningPass::getAnalysisUsage(
  223     AnalysisUsage &AU) const {
  224   MachineFunctionPass::getAnalysisUsage(AU);
  225 }
  226
      /// Splits one CFG edge from \p MBB to \p Succ by placing a fresh, empty block
      /// on it, and returns that new block.
      ///
      /// The new block is inserted in layout directly after \p MBB. Branches, the
      /// successor lists, the PHIs in \p Succ and the new block's live-ins are all
      /// updated to match.
      ///
      /// \param MBB       Source block of the edge.
      /// \param Succ      Destination block of the edge; must not be an EH pad.
      /// \param SuccCount Number of edges from \p MBB to \p Succ that still exist.
      ///                  When it is 1 the successor entry (and PHI entry) is
      ///                  replaced; otherwise a new one is added alongside it.
      /// \param Br        The branch in \p MBB that targets \p Succ, or null when
      ///                  \p Succ is reached by falling through.
      /// \param UncondBr  In/out: the unconditional branch terminating \p MBB, if
      ///                  any. Set to the newly created JMP_1 when one has to be
      ///                  appended to preserve the old fallthrough.
      /// \param TII       Instruction info used to build the new branches.
  227 static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB,
  228                                     MachineBasicBlock &Succ, int SuccCount,
  229                                     MachineInstr *Br, MachineInstr *&UncondBr,
  230                                     const X86InstrInfo &TII) {
  231   assert(!Succ.isEHPad() && "Shouldn't get edges to EH pads!");
  232
  233   MachineFunction &MF = *MBB.getParent();
  234
  235   MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock();
  236
  237   // We have to insert the new block immediately after the current one as we
  238   // don't know what layout-successor relationships the successor has and we
  239   // may not be able to (and generally don't want to) try to fix those up.
  240   MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB);
  241
  242   // Update the branch instruction if necessary.
  243   if (Br) {
  244     assert(Br->getOperand(0).getMBB() == &Succ &&
  245            "Didn't start with the right target!");
  246     Br->getOperand(0).setMBB(&NewMBB);
  247
  248     // If this successor was reached through a branch rather than fallthrough,
  249     // we might have *broken* fallthrough and so need to inject a new
  250     // unconditional branch.
  251     if (!UncondBr) {
            // NewMBB now sits between MBB and its former layout successor, so the
            // block following NewMBB is the one MBB used to fall through to.
  252       MachineBasicBlock &OldLayoutSucc =
  253           *std::next(MachineFunction::iterator(&NewMBB));
  254       assert(MBB.isSuccessor(&OldLayoutSucc) &&
  255              "Without an unconditional branch, the old layout successor should "
  256              "be an actual successor!");
  257       auto BrBuilder =
  258           BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc);
  259       // Update the unconditional branch now that we've added one.
  260       UncondBr = &*BrBuilder;
  261     }
  262
  263     // Insert unconditional "jump Succ" instruction in the new block if
  264     // necessary.
  265     if (!NewMBB.isLayoutSuccessor(&Succ)) {
  266       SmallVector<MachineOperand, 4> Cond;
  267       TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc());
  268     }
  269   } else {
          // Fallthrough edge: nothing to rewrite, only invariants to check.
  270     assert(!UncondBr &&
  271            "Cannot have a branchless successor and an unconditional branch!");
  272     assert(NewMBB.isLayoutSuccessor(&Succ) &&
  273            "A non-branch successor must have been a layout successor before "
  274            "and now is a layout successor of the new block.");
  275   }
  276
  277   // If this is the only edge to the successor, we can just replace it in the
  278   // CFG. Otherwise we need to add a new entry in the CFG for the new
  279   // successor.
  280   if (SuccCount == 1) {
  281     MBB.replaceSuccessor(&Succ, &NewMBB);
  282   } else {
  283     MBB.splitSuccessor(&Succ, &NewMBB);
  284   }
  285
  286   // Hook up the edge from the new basic block to the old successor in the CFG.
  287   NewMBB.addSuccessor(&Succ);
  288
  289   // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB.
  290   for (MachineInstr &MI : Succ) {
  291     if (!MI.isPHI())
  292       break;
          // PHI operands after the def come in (value, block) pairs, hence the
          // start at index 1 and the stride of 2.
  293     for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  294          OpIdx += 2) {
  295       MachineOperand &OpV = MI.getOperand(OpIdx);
  296       MachineOperand &OpMBB = MI.getOperand(OpIdx + 1);
  297       assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!");
  298       if (OpMBB.getMBB() != &MBB)
  299         continue;
  300
  301       // If this is the last edge to the successor, just replace MBB in the PHI.
  302       if (SuccCount == 1) {
  303         OpMBB.setMBB(&NewMBB);
  304         break;
  305       }
  306
  307       // Otherwise, append a new pair of operands for the new incoming edge.
  308       MI.addOperand(MF, OpV);
  309       MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB));
  310       break;
  311     }
  312   }
  313
  314   // Inherit live-ins from the successor
  315   for (auto &LI : Succ.liveins())
  316     NewMBB.addLiveIn(LI);
  317
  318   LLVM_DEBUG(dbgs() << "  Split edge from '" << MBB.getName() << "' to '"
  319                     << Succ.getName() << "'.\n");
  320   return NewMBB;
  321 }
 322
  323 /// Removing duplicate PHI operands to leave the PHI in a canonical and
  324 /// predictable form.
  325 ///
  326 /// FIXME: It's really frustrating that we have to do this, but SSA-form in MIR
  327 /// isn't what you might expect. We may have multiple entries in PHI nodes for
  328 /// a single predecessor. This makes CFG-updating extremely complex, so here we
  329 /// simplify all PHI nodes to a model even simpler than the IR's model: exactly
  330 /// one entry per predecessor, regardless of how many edges there are.
      ///
      /// Every PHI at the start of every block in \p MF is rewritten in place; the
      /// first entry seen for a predecessor is the one that is kept.
  331 static void canonicalizePHIOperands(MachineFunction &MF) {
        // Scratch containers shared by all PHIs: Preds is cleared and DupIndices is
        // drained before moving on to the next PHI.
  332   SmallPtrSet<MachineBasicBlock *, 4> Preds;
  333   SmallVector<int, 4> DupIndices;
  334   for (auto &MBB : MF)
  335     for (auto &MI : MBB) {
  336       if (!MI.isPHI())
  337         break;
  338
  339       // First we scan the operands of the PHI looking for duplicate entries
  340       // for a particular predecessor. We retain the operand index of each
  341       // duplicate entry found.
  342       for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps;
  343            OpIdx += 2)
  344         if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second)
  345           DupIndices.push_back(OpIdx);
  346
  347       // Now walk the duplicate indices, removing both the block and value.
  348       // Note that the operands are stored as a vector, which makes
  349       // removing them one element at a time (as done here) potentially
  350       // quadratic.
  351       //
  352       // FIXME: It is really frustrating that we have to use a quadratic
  353       // removal algorithm here. There should be a better way, but the use-def
  354       // updates required make that impossible using the public API.
  355       //
  356       // Note that we have to process these backwards so that we don't
  357       // invalidate other indices with each removal.
  358       while (!DupIndices.empty()) {
  359         int OpIdx = DupIndices.pop_back_val();
  360         // Remove both the block and value operand, again in reverse order to
  361         // preserve indices.
  362         MI.RemoveOperand(OpIdx + 1);
  363         MI.RemoveOperand(OpIdx);
  364       }
  365
  366       Preds.clear();
  367     }
  368 }
 369
  370 /// Helper to scan a function for loads vulnerable to misspeculation that we
  371 /// want to harden.
  372 ///
  373 /// We use this to avoid making changes to functions where there is nothing we
  374 /// need to do to harden against misspeculation.
      ///
      /// \returns true as soon as any block of \p MF contains an instruction that
      /// may load (other than an MFENCE) before the first LFENCE in that block;
      /// false if no block does.
  375 static bool hasVulnerableLoad(MachineFunction &MF) {
  376   for (MachineBasicBlock &MBB : MF) {
  377     for (MachineInstr &MI : MBB) {
  378       // Loads within this basic block after an LFENCE are not at risk of
  379       // speculatively executing with invalid predicates from prior control
  380       // flow. So break out of this block but continue scanning the function.
  381       if (MI.getOpcode() == X86::LFENCE)
  382         break;
  383
  384       // Looking for loads only.
  385       if (!MI.mayLoad())
  386         continue;
  387
  388       // An MFENCE is modeled as a load but isn't vulnerable to misspeculation.
  389       if (MI.getOpcode() == X86::MFENCE)
  390         continue;
  391
  392       // We found a load that is not shielded by a preceding LFENCE.
  393       return true;
  394     }
  395   }
  396
  397   // No loads found.
  398   return false;
  399 }
  400
      /// Runs speculative load hardening over \p MF.
      ///
      /// The pass does nothing (and returns false) unless it was force-enabled on
      /// the command line or the function carries the SpeculativeLoadHardening
      /// attribute, and likewise for a function without any blocks. Otherwise it
      /// either applies the LFENCE-per-edge strategy, or:
      ///   1. materializes the poison value and the initial predicate state in the
      ///      entry block,
      ///   2. canonicalizes PHIs and traces the predicate state along conditional
      ///      edges (and, optionally, indirect branches),
      ///   3. traces the state through each block, hardening as it goes, and
      ///   4. rewrites the predicate-state uses of the collected cmovs through the
      ///      SSA updater.
      ///
      /// \returns true on every path past the two initial bail-outs.
  401 bool X86SpeculativeLoadHardeningPass::runOnMachineFunction(
  402     MachineFunction &MF) {
  403   LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
  404                     << " **********\n");
  405
  406   // Only run if this pass is forced enabled or we detect the relevant function
  407   // attribute requesting SLH.
  408   if (!EnableSpeculativeLoadHardening &&
  409       !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
  410     return false;
  411
  412   Subtarget = &MF.getSubtarget<X86Subtarget>();
  413   MRI = &MF.getRegInfo();
  414   TII = Subtarget->getInstrInfo();
  415   TRI = Subtarget->getRegisterInfo();
  416
  417   // FIXME: Support for 32-bit.
  418   PS.emplace(MF, &X86::GR64_NOSPRegClass);
  419
  420   if (MF.begin() == MF.end())
  421     // Nothing to do for a degenerate empty function...
  422     return false;
  423
  424   // We support an alternative hardening technique based on a debug flag.
  425   if (HardenEdgesWithLFENCE) {
  426     hardenEdgesWithLFENCE(MF);
  427     return true;
  428   }
  429
  430   // Create a dummy debug loc to use for all the generated code here.
  431   DebugLoc Loc;
  432
  433   MachineBasicBlock &Entry = *MF.begin();
  434   auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin());
  435
  436   // Do a quick scan to see if we have any checkable loads.
  437   bool HasVulnerableLoad = hasVulnerableLoad(MF);
  438
  439   // See if we have any conditional branching blocks that we will need to trace
  440   // predicate state through.
  441   SmallVector<BlockCondInfo, 16> Infos = collectBlockCondInfo(MF);
  442
  443   // If we have no interesting conditions or loads, nothing to do here.
        // NOTE(review): no instruction appears to have been added or changed by
        // this point, so reporting "changed" looks conservative rather than
        // required -- confirm before relying on it.
  444   if (!HasVulnerableLoad && Infos.empty())
  445     return true;
  446
  447   // The poison value is required to be an all-ones value for many aspects of
  448   // this mitigation.
  449   const int PoisonVal = -1;
  450   PS->PoisonReg = MRI->createVirtualRegister(PS->RC);
  451   BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PS->PoisonReg)
  452       .addImm(PoisonVal);
  453   ++NumInstsInserted;
  454
  455   // If we have loads being hardened and we've asked for call and ret edges to
  456   // get a full fence-based mitigation, inject that fence.
  457   if (HasVulnerableLoad && FenceCallAndRet) {
  458     // We need to insert an LFENCE at the start of the function to suspend any
  459     // incoming misspeculation from the caller. This helps two-fold: the caller
  460     // may not have been protected as this code has been, and this code gets to
  461     // not take any specific action to protect across calls.
  462     // FIXME: We could skip this for functions which unconditionally return
  463     // a constant.
  464     BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE));
  465     ++NumInstsInserted;
  466     ++NumLFENCEsInserted;
  467   }
  468
  469   // If we guarded the entry with an LFENCE and have no conditionals to protect
  470   // in blocks, then we're done.
  471   if (FenceCallAndRet && Infos.empty())
  472     // We may have changed the function's code at this point to insert fences.
  473     return true;
  474
  475   // Establish the initial predicate state in the entry block.
  476   if (HardenInterprocedurally && !FenceCallAndRet) {
  477     // Set up the predicate state by extracting it from the incoming stack
  478     // pointer so we pick up any misspeculation in our caller.
  479     PS->InitialReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc);
  480   } else {
  481     // Otherwise, just build the predicate state itself by zeroing a register
  482     // as we don't need any initial state.
  483     PS->InitialReg = MRI->createVirtualRegister(PS->RC);
  484     unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  485     auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0),
  486                          PredStateSubReg);
  487     ++NumInstsInserted;
          // The zeroing instruction implicitly defines EFLAGS; nothing reads that
          // def, so mark it dead.
  488     MachineOperand *ZeroEFLAGSDefOp =
  489         ZeroI->findRegisterDefOperand(X86::EFLAGS);
  490     assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() &&
  491            "Must have an implicit def of EFLAGS!");
  492     ZeroEFLAGSDefOp->setIsDead(true);
          // Widen the zeroed 32-bit register into the full predicate-state
          // register.
  493     BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG),
  494             PS->InitialReg)
  495         .addImm(0)
  496         .addReg(PredStateSubReg)
  497         .addImm(X86::sub_32bit);
  498   }
  499
  500   // We're going to need to trace predicate state throughout the function's
  501   // CFG. Prepare for this by setting up our initial state of PHIs with unique
  502   // predecessor entries and all the initial predicate state.
  503   canonicalizePHIOperands(MF);
  504
  505   // Track the updated values in an SSA updater to rewrite into SSA form at the
  506   // end.
  507   PS->SSA.Initialize(PS->InitialReg);
  508   PS->SSA.AddAvailableValue(&Entry, PS->InitialReg);
  509
  510   // Trace through the CFG.
  511   auto CMovs = tracePredStateThroughCFG(MF, Infos);
  512
  513   // We may also enter basic blocks in this function via exception handling
  514   // control flow. Here, if we are hardening interprocedurally, we need to
  515   // re-capture the predicate state from the throwing code. In the Itanium ABI,
  516   // the throw will always look like a call to __cxa_throw and will have the
  517   // predicate state in the stack pointer, so extract fresh predicate state from
  518   // the stack pointer and make it available in SSA.
  519   // FIXME: Handle non-itanium ABI EH models.
  520   if (HardenInterprocedurally) {
  521     for (MachineBasicBlock &MBB : MF) {
  522       assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!");
  523       assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!");
  524       assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!");
  525       if (!MBB.isEHPad())
  526         continue;
  527       PS->SSA.AddAvailableValue(
  528           &MBB,
  529           extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc));
  530     }
  531   }
  532
  533   if (HardenIndirectCallsAndJumps) {
  534     // If we are going to harden calls and jumps we need to unfold their memory
  535     // operands.
  536     unfoldCallAndJumpLoads(MF);
  537
  538     // Then we trace predicate state through the indirect branches.
  539     auto IndirectBrCMovs = tracePredStateThroughIndirectBranches(MF);
  540     CMovs.append(IndirectBrCMovs.begin(), IndirectBrCMovs.end());
  541   }
  542
  543   // Now that we have the predicate state available at the start of each block
  544   // in the CFG, trace it through each block, hardening vulnerable instructions
  545   // as we go.
  546   tracePredStateThroughBlocksAndHarden(MF);
  547
  548   // Now rewrite all the uses of the pred state using the SSA updater to insert
  549   // PHIs connecting the state between blocks along the CFG edges.
  550   for (MachineInstr *CMovI : CMovs)
  551     for (MachineOperand &Op : CMovI->operands()) {
            // Only operands still naming the initial state register are
            // placeholders that need rewriting.
  552       if (!Op.isReg() || Op.getReg() != PS->InitialReg)
  553         continue;
  554
  555       PS->SSA.RewriteUse(Op);
  556     }
  557
  558   LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump();
  559              dbgs() << "\n"; MF.verify(this));
  560   return true;
  561 }
 562
  563 /// Implements the naive hardening approach of putting an LFENCE after every
  564 /// potentially mis-predicted control flow construct.
  565 ///
  566 /// We include this as an alternative mostly for the purpose of comparison. The
  567 /// performance impact of this is expected to be extremely severe and not
  568 /// practical for any real-world users.
      ///
      /// Concretely: every non-EH-pad successor of a block that has more than one
      /// successor and whose first terminator is a branch receives one LFENCE at
      /// its top.
  569 void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE(
  570     MachineFunction &MF) {
  571   // First, we scan the function looking for blocks that are reached along edges
  572   // that we might want to harden.
        // A set-vector is used so a block reached along several edges is recorded
        // (and later fenced) once.
  573   SmallSetVector<MachineBasicBlock *, 8> Blocks;
  574   for (MachineBasicBlock &MBB : MF) {
  575     // If there are no or only one successor, nothing to do here.
  576     if (MBB.succ_size() <= 1)
  577       continue;
  578
  579     // Skip blocks unless their terminators start with a branch. Other
  580     // terminators don't seem interesting for guarding against misspeculation.
  581     auto TermIt = MBB.getFirstTerminator();
  582     if (TermIt == MBB.end() || !TermIt->isBranch())
  583       continue;
  584
  585     // Add all the non-EH-pad successors to the blocks we want to harden. We
  586     // skip EH pads because there isn't really a condition of interest on
  587     // entering.
  588     for (MachineBasicBlock *SuccMBB : MBB.successors())
  589       if (!SuccMBB->isEHPad())
  590         Blocks.insert(SuccMBB);
  591   }
  592
        // Place the fence at the top of each collected block, after any PHIs and
        // labels.
  593   for (MachineBasicBlock *MBB : Blocks) {
  594     auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin());
  595     BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE));
  596     ++NumInstsInserted;
  597     ++NumLFENCEsInserted;
  598   }
  599 }
  600
      /// Summarizes the terminators of every block in \p MF that has more than one
      /// successor.
      ///
      /// For each such block the result records the conditional branches found
      /// while walking its terminators backwards (so CondBrs is in reverse program
      /// order) and the trailing unconditional or unanalyzable branch, if any.
      /// Blocks for which no conditional branch ends up recorded are counted in
      /// NumBranchesUntraced and left out of the result.
  601 SmallVector<X86SpeculativeLoadHardeningPass::BlockCondInfo, 16>
  602 X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) {
  603   SmallVector<BlockCondInfo, 16> Infos;
  604
  605   // Walk the function and build up a summary for each block's conditions that
  606   // we need to trace through.
  607   for (MachineBasicBlock &MBB : MF) {
  608     // If there are no or only one successor, nothing to do here.
  609     if (MBB.succ_size() <= 1)
  610       continue;
  611
  612     // We want to reliably handle any conditional branch terminators in the
  613     // MBB, so we manually analyze the branch. We can handle all of the
  614     // permutations here, including ones that analyze branch cannot.
  615     //
  616     // The approach is to walk backwards across the terminators, resetting at
  617     // any unconditional non-indirect branch, and track all conditional edges
  618     // to basic blocks as well as the fallthrough or unconditional successor
  619     // edge. For each conditional edge, we track the target and the opposite
  620     // condition code in order to inject a "no-op" cmov into that successor
  621     // that will harden the predicate. For the fallthrough/unconditional
  622     // edge, we inject a separate cmov for each conditional branch with
  623     // matching condition codes. This effectively implements an "and" of the
  624     // condition flags, even if there isn't a single condition flag that would
  625     // directly implement that. We don't bother trying to optimize either of
  626     // these cases because if such an optimization is possible, LLVM should
  627     // have optimized the conditional *branches* in that way already to reduce
  628     // instruction count. This late, we simply assume the minimal number of
  629     // branch instructions is being emitted and use that to guide our cmov
  630     // insertion.
  631
          // Start with no conditional branches and no unconditional branch.
  632     BlockCondInfo Info = {&MBB, {}, nullptr};
  633
  634     // Now walk backwards through the terminators and build up successors they
  635     // reach and the conditions.
  636     for (MachineInstr &MI : llvm::reverse(MBB)) {
  637       // Once we've handled all the terminators, we're done.
  638       if (!MI.isTerminator())
  639         break;
  640
  641       // If we see a non-branch terminator, we can't handle anything so bail.
            // Clearing CondBrs makes the block count as untraced below.
  642       if (!MI.isBranch()) {
  643         Info.CondBrs.clear();
  644         break;
  645       }
  646
  647       // If we see an unconditional branch, reset our state, clear any
  648       // fallthrough, and set this is the "else" successor.
  649       if (MI.getOpcode() == X86::JMP_1) {
  650         Info.CondBrs.clear();
  651         Info.UncondBr = &MI;
  652         continue;
  653       }
  654
  655       // If we get an invalid condition, we have an indirect branch or some
  656       // other unanalyzable "fallthrough" case. We model this as a nullptr for
  657       // the destination so we can still guard any conditional successors.
  658       // Consider code sequences like:
  659       // ```
  660       //   jCC L1
  661       //   jmpq *%rax
  662       // ```
  663       // We still want to harden the edge to `L1`.
  664       if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) {
  665         Info.CondBrs.clear();
  666         Info.UncondBr = &MI;
  667         continue;
  668       }
  669
  670       // We have a vanilla conditional branch, add it to our list.
  671       Info.CondBrs.push_back(&MI);
  672     }
          // Without any recorded conditional branch there is no edge we can guard.
  673     if (Info.CondBrs.empty()) {
  674       ++NumBranchesUntraced;
  675       LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n";
  676                  MBB.dump());
  677       continue;
  678     }
  679
  680     Infos.push_back(Info);
  681   }
  682
  683   return Infos;
  684 }
 685
 686 /// Trace the predicate state through the CFG, instrumenting each conditional
 687 /// branch such that misspeculation through an edge will poison the predicate
 688 /// state.
 689 ///
 690 /// Returns the list of inserted CMov instructions so that they can have their
 691 /// uses of the predicate state rewritten into proper SSA form once it is
 692 /// complete.
 693 SmallVector<MachineInstr *, 16>
 694 X86SpeculativeLoadHardeningPass::tracePredStateThroughCFG(
 695     MachineFunction &MF, ArrayRef<BlockCondInfo> Infos) {
 696   // Collect the inserted cmov instructions so we can rewrite their uses of the
 697   // predicate state into SSA form.
 698   SmallVector<MachineInstr *, 16> CMovs;
 699
 700   // Now walk all of the basic blocks looking for ones that end in conditional
 701   // jumps where we need to update this register along each edge.
 702   for (const BlockCondInfo &Info : Infos) {
 703     MachineBasicBlock &MBB = *Info.MBB;
 704     const SmallVectorImpl<MachineInstr *> &CondBrs = Info.CondBrs;
 705     MachineInstr *UncondBr = Info.UncondBr;
 706
 707     LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName()
 708                       << "\n");
 709     ++NumCondBranchesTraced;
 710
 711     // Compute the non-conditional successor as either the target of any
 712     // unconditional branch or the layout successor.
 713     MachineBasicBlock *UncondSucc =
 714         UncondBr ? (UncondBr->getOpcode() == X86::JMP_1
 715                         ? UncondBr->getOperand(0).getMBB()
 716                         : nullptr)
 717                  : &*std::next(MachineFunction::iterator(&MBB));
 718
 719     // Count how many edges there are to any given successor.
 720     SmallDenseMap<MachineBasicBlock *, int> SuccCounts;
 721     if (UncondSucc)
 722       ++SuccCounts[UncondSucc];
 723     for (auto *CondBr : CondBrs)
 724       ++SuccCounts[CondBr->getOperand(0).getMBB()];
 725
 726     // A lambda to insert cmov instructions into a block checking all of the
 727     // condition codes in a sequence.
 728     auto BuildCheckingBlockForSuccAndConds =
 729         [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount,
 730             MachineInstr *Br, MachineInstr *&UncondBr,
 731             ArrayRef<X86::CondCode> Conds) {
 732           // First, we split the edge to insert the checking block into a safe
 733           // location.
 734           auto &CheckingMBB =
 735               (SuccCount == 1 && Succ.pred_size() == 1)
 736                   ? Succ
 737                   : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII);
 738
 739           bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS);
 740           if (!LiveEFLAGS)
 741             CheckingMBB.addLiveIn(X86::EFLAGS);
 742
 743           // Now insert the cmovs to implement the checks.
 744           auto InsertPt = CheckingMBB.begin();
 745           assert((InsertPt == CheckingMBB.end() || !InsertPt->isPHI()) &&
 746                  "Should never have a PHI in the initial checking block as it "
 747                  "always has a single predecessor!");
 748
 749           // We will wire each cmov to each other, but need to start with the
 750           // incoming pred state.
 751           unsigned CurStateReg = PS->InitialReg;
 752
 753           for (X86::CondCode Cond : Conds) {
 754             int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
 755             auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes);
 756
 757             unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
 758             // Note that we intentionally use an empty debug location so that
 759             // this picks up the preceding location.
 760             auto CMovI = BuildMI(CheckingMBB, InsertPt, DebugLoc(),
 761                                  TII->get(CMovOp), UpdatedStateReg)
 762                              .addReg(CurStateReg)
 763                              .addReg(PS->PoisonReg);
 764             // If this is the last cmov and the EFLAGS weren't originally
 765             // live-in, mark them as killed.
 766             if (!LiveEFLAGS && Cond == Conds.back())
 767               CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
 768
 769             ++NumInstsInserted;
 770             LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump();
 771                        dbgs() << "\n");
 772
 773             // The first one of the cmovs will be using the top level
 774             // `PredStateReg` and need to get rewritten into SSA form.
 775             if (CurStateReg == PS->InitialReg)
 776               CMovs.push_back(&*CMovI);
 777
 778             // The next cmov should start from this one's def.
 779             CurStateReg = UpdatedStateReg;
 780           }
 781
 782           // And put the last one into the available values for SSA form of our
 783           // predicate state.
 784           PS->SSA.AddAvailableValue(&CheckingMBB, CurStateReg);
 785         };
 786
 787     std::vector<X86::CondCode> UncondCodeSeq;
 788     for (auto *CondBr : CondBrs) {
 789       MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB();
 790       int &SuccCount = SuccCounts[&Succ];
 791
 792       X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode());
 793       X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond);
 794       UncondCodeSeq.push_back(Cond);
 795
 796       BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr,
 797                                         {InvCond});
 798
 799       // Decrement the successor count now that we've split one of the edges.
 800       // We need to keep the count of edges to the successor accurate in order
 801       // to know above when to *replace* the successor in the CFG vs. just
 802       // adding the new successor.
 803       --SuccCount;
 804     }
 805
 806     // Since we may have split edges and changed the number of successors,
 807     // normalize the probabilities. This avoids doing it each time we split an
 808     // edge.
 809     MBB.normalizeSuccProbs();
 810
 811     // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we
 812     // need to intersect the other condition codes. We can do this by just
 813     // doing a cmov for each one.
 814     if (!UncondSucc)
 815       // If we have no fallthrough to protect (perhaps it is an indirect jump?)
 816       // just skip this and continue.
 817       continue;
 818
 819     assert(SuccCounts[UncondSucc] == 1 &&
 820            "We should never have more than one edge to the unconditional "
 821            "successor at this point because every other edge must have been "
 822            "split above!");
 823
 824     // Sort and unique the codes to minimize them.
 825     llvm::sort(UncondCodeSeq);
 826     UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()),
 827                         UncondCodeSeq.end());
 828
 829     // Build a checking version of the successor.
 830     BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1,
 831                                       UncondBr, UncondBr, UncondCodeSeq);
 832   }
 833
 834   return CMovs;
 835 }
 836
 837 /// Compute the register class for the unfolded load.
 838 ///
 839 /// FIXME: This should probably live in X86InstrInfo, potentially by adding
 840 /// a way to unfold into a newly created vreg rather than requiring a register
 841 /// input.
 842 static const TargetRegisterClass *
 843 getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
 844                            unsigned Opcode) {
 845   unsigned Index;
 846   unsigned UnfoldedOpc = TII.getOpcodeAfterMemoryUnfold(
 847       Opcode, /*UnfoldLoad*/ true, /*UnfoldStore*/ false, &Index);
 848   const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
 849   return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
 850 }
 851
 852 void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
 853     MachineFunction &MF) {
 854   for (MachineBasicBlock &MBB : MF)
 855     for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
 856       // Grab a reference and increment the iterator so we can remove this
 857       // instruction if needed without disturbing the iteration.
 858       MachineInstr &MI = *MII++;
 859
 860       // Must either be a call or a branch.
 861       if (!MI.isCall() && !MI.isBranch())
 862         continue;
 863       // We only care about loading variants of these instructions.
 864       if (!MI.mayLoad())
 865         continue;
 866
 867       switch (MI.getOpcode()) {
 868       default: {
 869         LLVM_DEBUG(
 870             dbgs() << "ERROR: Found an unexpected loading branch or call "
 871                       "instruction:\n";
 872             MI.dump(); dbgs() << "\n");
 873         report_fatal_error("Unexpected loading branch or call!");
 874       }
 875
 876       case X86::FARCALL16m:
 877       case X86::FARCALL32m:
 878       case X86::FARCALL64:
 879       case X86::FARJMP16m:
 880       case X86::FARJMP32m:
 881       case X86::FARJMP64:
 882         // We cannot mitigate far jumps or calls, but we also don't expect them
 883         // to be vulnerable to Spectre v1.2 style attacks.
 884         continue;
 885
 886       case X86::CALL16m:
 887       case X86::CALL16m_NT:
 888       case X86::CALL32m:
 889       case X86::CALL32m_NT:
 890       case X86::CALL64m:
 891       case X86::CALL64m_NT:
 892       case X86::JMP16m:
 893       case X86::JMP16m_NT:
 894       case X86::JMP32m:
 895       case X86::JMP32m_NT:
 896       case X86::JMP64m:
 897       case X86::JMP64m_NT:
 898       case X86::TAILJMPm64:
 899       case X86::TAILJMPm64_REX:
 900       case X86::TAILJMPm:
 901       case X86::TCRETURNmi64:
 902       case X86::TCRETURNmi: {
 903         // Use the generic unfold logic now that we know we're dealing with
 904         // expected instructions.
 905         // FIXME: We don't have test coverage for all of these!
 906         auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
 907         if (!UnfoldedRC) {
 908           LLVM_DEBUG(dbgs()
 909                          << "ERROR: Unable to unfold load from instruction:\n";
 910                      MI.dump(); dbgs() << "\n");
 911           report_fatal_error("Unable to unfold load!");
 912         }
 913         unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
 914         SmallVector<MachineInstr *, 2> NewMIs;
 915         // If we were able to compute an unfolded reg class, any failure here
 916         // is just a programming error so just assert.
 917         bool Unfolded =
 918             TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
 919                                      /*UnfoldStore*/ false, NewMIs);
 920         (void)Unfolded;
 921         assert(Unfolded &&
 922                "Computed unfolded register class but failed to unfold");
 923         // Now stitch the new instructions into place and erase the old one.
 924         for (auto *NewMI : NewMIs)
 925           MBB.insert(MI.getIterator(), NewMI);
 926         MI.eraseFromParent();
 927         LLVM_DEBUG({
 928           dbgs() << "Unfolded load successfully into:\n";
 929           for (auto *NewMI : NewMIs) {
 930             NewMI->dump();
 931             dbgs() << "\n";
 932           }
 933         });
 934         continue;
 935       }
 936       }
 937       llvm_unreachable("Escaped switch with default!");
 938     }
 939 }
 940
 941 /// Trace the predicate state through indirect branches, instrumenting them to
 942 /// poison the state if a target is reached that does not match the expected
 943 /// target.
 944 ///
 945 /// This is designed to mitigate Spectre variant 1 attacks where an indirect
 946 /// branch is trained to predict a particular target and then mispredicts that
 947 /// target in a way that can leak data. Despite using an indirect branch, this
 948 /// is really a variant 1 style attack: it does not steer execution to an
 949 /// arbitrary or attacker controlled address, and it does not require any
 950 /// special code executing next to the victim. This attack can also be mitigated
 951 /// through retpolines, but those require either replacing indirect branches
 952 /// with conditional direct branches or lowering them through a device that
 953 /// blocks speculation. This mitigation can replace these retpoline-style
 954 /// mitigations for jump tables and other indirect branches within a function
 955 /// when variant 2 isn't a risk while allowing limited speculation. Indirect
 956 /// calls, however, cannot be mitigated through this technique without changing
 957 /// the ABI in a fundamental way.
 958 SmallVector<MachineInstr *, 16>
 959 X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
 960     MachineFunction &MF) {
 961   // We use the SSAUpdater to insert PHI nodes for the target addresses of
 962   // indirect branches. We don't actually need the full power of the SSA updater
 963   // in this particular case as we always have immediately available values, but
 964   // this avoids us having to re-implement the PHI construction logic.
 965   MachineSSAUpdater TargetAddrSSA(MF);
 966   TargetAddrSSA.Initialize(MRI->createVirtualRegister(&X86::GR64RegClass));
 967
 968   // Track which blocks were terminated with an indirect branch.
 969   SmallPtrSet<MachineBasicBlock *, 4> IndirectTerminatedMBBs;
 970
 971   // We need to know what blocks end up reached via indirect branches. We
 972   // expect this to be a subset of those whose address is taken and so track it
 973   // directly via the CFG.
 974   SmallPtrSet<MachineBasicBlock *, 4> IndirectTargetMBBs;
 975
 976   // Walk all the blocks which end in an indirect branch and make the
 977   // target address available.
 978   for (MachineBasicBlock &MBB : MF) {
 979     // Find the last terminator.
 980     auto MII = MBB.instr_rbegin();
 981     while (MII != MBB.instr_rend() && MII->isDebugInstr())
 982       ++MII;
 983     if (MII == MBB.instr_rend())
 984       continue;
 985     MachineInstr &TI = *MII;
 986     if (!TI.isTerminator() || !TI.isBranch())
 987       // No terminator or non-branch terminator.
 988       continue;
 989
 990     unsigned TargetReg;
 991
 992     switch (TI.getOpcode()) {
 993     default:
 994       // Direct branch or conditional branch (leading to fallthrough).
 995       continue;
 996
 997     case X86::FARJMP16m:
 998     case X86::FARJMP32m:
 999     case X86::FARJMP64:
1000       // We cannot mitigate far jumps or calls, but we also don't expect them
1001       // to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
1002       continue;
1003
1004     case X86::JMP16m:
1005     case X86::JMP16m_NT:
1006     case X86::JMP32m:
1007     case X86::JMP32m_NT:
1008     case X86::JMP64m:
1009     case X86::JMP64m_NT:
1010       // Mostly as documentation.
1011       report_fatal_error("Memory operand jumps should have been unfolded!");
1012
1013     case X86::JMP16r:
1014       report_fatal_error(
1015           "Support for 16-bit indirect branches is not implemented.");
1016     case X86::JMP32r:
1017       report_fatal_error(
1018           "Support for 32-bit indirect branches is not implemented.");
1019
1020     case X86::JMP64r:
1021       TargetReg = TI.getOperand(0).getReg();
1022     }
1023
1024     // We have definitely found an indirect  branch. Verify that there are no
1025     // preceding conditional branches as we don't yet support that.
1026     if (llvm::any_of(MBB.terminators(), [&](MachineInstr &OtherTI) {
1027           return !OtherTI.isDebugInstr() && &OtherTI != &TI;
1028         })) {
1029       LLVM_DEBUG({
1030         dbgs() << "ERROR: Found other terminators in a block with an indirect "
1031                   "branch! This is not yet supported! Terminator sequence:\n";
1032         for (MachineInstr &MI : MBB.terminators()) {
1033           MI.dump();
1034           dbgs() << '\n';
1035         }
1036       });
1037       report_fatal_error("Unimplemented terminator sequence!");
1038     }
1039
1040     // Make the target register an available value for this block.
1041     TargetAddrSSA.AddAvailableValue(&MBB, TargetReg);
1042     IndirectTerminatedMBBs.insert(&MBB);
1043
1044     // Add all the successors to our target candidates.
1045     for (MachineBasicBlock *Succ : MBB.successors())
1046       IndirectTargetMBBs.insert(Succ);
1047   }
1048
1049   // Keep track of the cmov instructions we insert so we can return them.
1050   SmallVector<MachineInstr *, 16> CMovs;
1051
1052   // If we didn't find any indirect branches with targets, nothing to do here.
1053   if (IndirectTargetMBBs.empty())
1054     return CMovs;
1055
1056   // We found indirect branches and targets that need to be instrumented to
1057   // harden loads within them. Walk the blocks of the function (to get a stable
1058   // ordering) and instrument each target of an indirect branch.
1059   for (MachineBasicBlock &MBB : MF) {
1060     // Skip the blocks that aren't candidate targets.
1061     if (!IndirectTargetMBBs.count(&MBB))
1062       continue;
1063
1064     // We don't expect EH pads to ever be reached via an indirect branch. If
1065     // this is desired for some reason, we could simply skip them here rather
1066     // than asserting.
1067     assert(!MBB.isEHPad() &&
1068            "Unexpected EH pad as target of an indirect branch!");
1069
1070     // We should never end up threading EFLAGS into a block to harden
1071     // conditional jumps as there would be an additional successor via the
1072     // indirect branch. As a consequence, all such edges would be split before
1073     // reaching here, and the inserted block will handle the EFLAGS-based
1074     // hardening.
1075     assert(!MBB.isLiveIn(X86::EFLAGS) &&
1076            "Cannot check within a block that already has live-in EFLAGS!");
1077
1078     // We can't handle having non-indirect edges into this block unless this is
1079     // the only successor and we can synthesize the necessary target address.
1080     for (MachineBasicBlock *Pred : MBB.predecessors()) {
1081       // If we've already handled this by extracting the target directly,
1082       // nothing to do.
1083       if (IndirectTerminatedMBBs.count(Pred))
1084         continue;
1085
1086       // Otherwise, we have to be the only successor. We generally expect this
1087       // to be true as conditional branches should have had a critical edge
1088       // split already. We don't however need to worry about EH pad successors
1089       // as they'll happily ignore the target and their hardening strategy is
1090       // resilient to all ways in which they could be reached speculatively.
1091       if (!llvm::all_of(Pred->successors(), [&](MachineBasicBlock *Succ) {
1092             return Succ->isEHPad() || Succ == &MBB;
1093           })) {
1094         LLVM_DEBUG({
1095           dbgs() << "ERROR: Found conditional entry to target of indirect "
1096                     "branch!\n";
1097           Pred->dump();
1098           MBB.dump();
1099         });
1100         report_fatal_error("Cannot harden a conditional entry to a target of "
1101                            "an indirect branch!");
1102       }
1103
1104       // Now we need to compute the address of this block and install it as a
1105       // synthetic target in the predecessor. We do this at the bottom of the
1106       // predecessor.
1107       auto InsertPt = Pred->getFirstTerminator();
1108       unsigned TargetReg = MRI->createVirtualRegister(&X86::GR64RegClass);
1109       if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1110           !Subtarget->isPositionIndependent()) {
1111         // Directly materialize it into an immediate.
1112         auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(),
1113                              TII->get(X86::MOV64ri32), TargetReg)
1114                          .addMBB(&MBB);
1115         ++NumInstsInserted;
1116         (void)AddrI;
1117         LLVM_DEBUG(dbgs() << "  Inserting mov: "; AddrI->dump();
1118                    dbgs() << "\n");
1119       } else {
1120         auto AddrI = BuildMI(*Pred, InsertPt, DebugLoc(), TII->get(X86::LEA64r),
1121                              TargetReg)
1122                          .addReg(/*Base*/ X86::RIP)
1123                          .addImm(/*Scale*/ 1)
1124                          .addReg(/*Index*/ 0)
1125                          .addMBB(&MBB)
1126                          .addReg(/*Segment*/ 0);
1127         ++NumInstsInserted;
1128         (void)AddrI;
1129         LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump();
1130                    dbgs() << "\n");
1131       }
1132       // And make this available.
1133       TargetAddrSSA.AddAvailableValue(Pred, TargetReg);
1134     }
1135
1136     // Materialize the needed SSA value of the target. Note that we need the
1137     // middle of the block as this block might at the bottom have an indirect
1138     // branch back to itself. We can do this here because at this point, every
1139     // predecessor of this block has an available value. This is basically just
1140     // automating the construction of a PHI node for this target.
1141     unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
1142
1143     // Insert a comparison of the incoming target register with this block's
1144     // address. This also requires us to mark the block as having its address
1145     // taken explicitly.
1146     MBB.setHasAddressTaken();
1147     auto InsertPt = MBB.SkipPHIsLabelsAndDebug(MBB.begin());
1148     if (MF.getTarget().getCodeModel() == CodeModel::Small &&
1149         !Subtarget->isPositionIndependent()) {
1150       // Check directly against a relocated immediate when we can.
1151       auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64ri32))
1152                         .addReg(TargetReg, RegState::Kill)
1153                         .addMBB(&MBB);
1154       ++NumInstsInserted;
1155       (void)CheckI;
1156       LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1157     } else {
1158       // Otherwise compute the address into a register first.
1159       unsigned AddrReg = MRI->createVirtualRegister(&X86::GR64RegClass);
1160       auto AddrI =
1161           BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::LEA64r), AddrReg)
1162               .addReg(/*Base*/ X86::RIP)
1163               .addImm(/*Scale*/ 1)
1164               .addReg(/*Index*/ 0)
1165               .addMBB(&MBB)
1166               .addReg(/*Segment*/ 0);
1167       ++NumInstsInserted;
1168       (void)AddrI;
1169       LLVM_DEBUG(dbgs() << "  Inserting lea: "; AddrI->dump(); dbgs() << "\n");
1170       auto CheckI = BuildMI(MBB, InsertPt, DebugLoc(), TII->get(X86::CMP64rr))
1171                         .addReg(TargetReg, RegState::Kill)
1172                         .addReg(AddrReg, RegState::Kill);
1173       ++NumInstsInserted;
1174       (void)CheckI;
1175       LLVM_DEBUG(dbgs() << "  Inserting cmp: "; CheckI->dump(); dbgs() << "\n");
1176     }
1177
1178     // Now cmov over the predicate if the comparison wasn't equal.
1179     int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
1180     auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
1181     unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
1182     auto CMovI =
1183         BuildMI(MBB, InsertPt, DebugLoc(), TII->get(CMovOp), UpdatedStateReg)
1184             .addReg(PS->InitialReg)
1185             .addReg(PS->PoisonReg);
1186     CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
1187     ++NumInstsInserted;
1188     LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
1189     CMovs.push_back(&*CMovI);
1190
1191     // And put the new value into the available values for SSA form of our
1192     // predicate state.
1193     PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
1194   }
1195
1196   // Return all the newly inserted cmov instructions of the predicate state.
1197   return CMovs;
1198 }
1199
1200 /// Returns true if the instruction has no behavior (specified or otherwise)
1201 /// that is based on the value of any of its register operands
1202 ///
1203 /// A classical example of something that is inherently not data invariant is an
1204 /// indirect jump -- the destination is loaded into icache based on the bits set
1205 /// in the jump destination register.
1206 ///
1207 /// FIXME: This should become part of our instruction tables.
1208 static bool isDataInvariant(MachineInstr &MI) {
1209   switch (MI.getOpcode()) {
1210   default:
1211     // By default, assume that the instruction is not data invariant.
1212     return false;
1213
1214     // Some target-independent operations that trivially lower to data-invariant
1215     // instructions.
1216   case TargetOpcode::COPY:
1217   case TargetOpcode::INSERT_SUBREG:
1218   case TargetOpcode::SUBREG_TO_REG:
1219     return true;
1220
1221   // On x86 it is believed that imul is constant time w.r.t. the loaded data.
1222   // However, they set flags and are perhaps the most surprisingly constant
1223   // time operations so we call them out here separately.
1224   case X86::IMUL16rr:
1225   case X86::IMUL16rri8:
1226   case X86::IMUL16rri:
1227   case X86::IMUL32rr:
1228   case X86::IMUL32rri8:
1229   case X86::IMUL32rri:
1230   case X86::IMUL64rr:
1231   case X86::IMUL64rri32:
1232   case X86::IMUL64rri8:
1233
1234   // Bit scanning and counting instructions that are somewhat surprisingly
1235   // constant time as they scan across bits and do other fairly complex
1236   // operations like popcnt, but are believed to be constant time on x86.
1237   // However, these set flags.
1238   case X86::BSF16rr:
1239   case X86::BSF32rr:
1240   case X86::BSF64rr:
1241   case X86::BSR16rr:
1242   case X86::BSR32rr:
1243   case X86::BSR64rr:
1244   case X86::LZCNT16rr:
1245   case X86::LZCNT32rr:
1246   case X86::LZCNT64rr:
1247   case X86::POPCNT16rr:
1248   case X86::POPCNT32rr:
1249   case X86::POPCNT64rr:
1250   case X86::TZCNT16rr:
1251   case X86::TZCNT32rr:
1252   case X86::TZCNT64rr:
1253
1254   // Bit manipulation instructions are effectively combinations of basic
1255   // arithmetic ops, and should still execute in constant time. These also
1256   // set flags.
1257   case X86::BLCFILL32rr:
1258   case X86::BLCFILL64rr:
1259   case X86::BLCI32rr:
1260   case X86::BLCI64rr:
1261   case X86::BLCIC32rr:
1262   case X86::BLCIC64rr:
1263   case X86::BLCMSK32rr:
1264   case X86::BLCMSK64rr:
1265   case X86::BLCS32rr:
1266   case X86::BLCS64rr:
1267   case X86::BLSFILL32rr:
1268   case X86::BLSFILL64rr:
1269   case X86::BLSI32rr:
1270   case X86::BLSI64rr:
1271   case X86::BLSIC32rr:
1272   case X86::BLSIC64rr:
1273   case X86::BLSMSK32rr:
1274   case X86::BLSMSK64rr:
1275   case X86::BLSR32rr:
1276   case X86::BLSR64rr:
1277   case X86::TZMSK32rr:
1278   case X86::TZMSK64rr:
1279
1280   // Bit extracting and clearing instructions should execute in constant time,
1281   // and set flags.
1282   case X86::BEXTR32rr:
1283   case X86::BEXTR64rr:
1284   case X86::BEXTRI32ri:
1285   case X86::BEXTRI64ri:
1286   case X86::BZHI32rr:
1287   case X86::BZHI64rr:
1288
1289   // Shift and rotate.
1290   case X86::ROL8r1:  case X86::ROL16r1:  case X86::ROL32r1:  case X86::ROL64r1:
1291   case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
1292   case X86::ROL8ri:  case X86::ROL16ri:  case X86::ROL32ri:  case X86::ROL64ri:
1293   case X86::ROR8r1:  case X86::ROR16r1:  case X86::ROR32r1:  case X86::ROR64r1:
1294   case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
1295   case X86::ROR8ri:  case X86::ROR16ri:  case X86::ROR32ri:  case X86::ROR64ri:
1296   case X86::SAR8r1:  case X86::SAR16r1:  case X86::SAR32r1:  case X86::SAR64r1:
1297   case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
1298   case X86::SAR8ri:  case X86::SAR16ri:  case X86::SAR32ri:  case X86::SAR64ri:
1299   case X86::SHL8r1:  case X86::SHL16r1:  case X86::SHL32r1:  case X86::SHL64r1:
1300   case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
1301   case X86::SHL8ri:  case X86::SHL16ri:  case X86::SHL32ri:  case X86::SHL64ri:
1302   case X86::SHR8r1:  case X86::SHR16r1:  case X86::SHR32r1:  case X86::SHR64r1:
1303   case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
1304   case X86::SHR8ri:  case X86::SHR16ri:  case X86::SHR32ri:  case X86::SHR64ri:
1305   case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
1306   case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
1307   case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
1308   case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
1309
1310   // Basic arithmetic is constant time on the input but does set flags.
1311   case X86::ADC8rr:   case X86::ADC8ri:
1312   case X86::ADC16rr:  case X86::ADC16ri:   case X86::ADC16ri8:
1313   case X86::ADC32rr:  case X86::ADC32ri:   case X86::ADC32ri8:
1314   case X86::ADC64rr:  case X86::ADC64ri8:  case X86::ADC64ri32:
1315   case X86::ADD8rr:   case X86::ADD8ri:
1316   case X86::ADD16rr:  case X86::ADD16ri:   case X86::ADD16ri8:
1317   case X86::ADD32rr:  case X86::ADD32ri:   case X86::ADD32ri8:
1318   case X86::ADD64rr:  case X86::ADD64ri8:  case X86::ADD64ri32:
1319   case X86::AND8rr:   case X86::AND8ri:
1320   case X86::AND16rr:  case X86::AND16ri:   case X86::AND16ri8:
1321   case X86::AND32rr:  case X86::AND32ri:   case X86::AND32ri8:
1322   case X86::AND64rr:  case X86::AND64ri8:  case X86::AND64ri32:
1323   case X86::OR8rr:    case X86::OR8ri:
1324   case X86::OR16rr:   case X86::OR16ri:    case X86::OR16ri8:
1325   case X86::OR32rr:   case X86::OR32ri:    case X86::OR32ri8:
1326   case X86::OR64rr:   case X86::OR64ri8:   case X86::OR64ri32:
1327   case X86::SBB8rr:   case X86::SBB8ri:
1328   case X86::SBB16rr:  case X86::SBB16ri:   case X86::SBB16ri8:
1329   case X86::SBB32rr:  case X86::SBB32ri:   case X86::SBB32ri8:
1330   case X86::SBB64rr:  case X86::SBB64ri8:  case X86::SBB64ri32:
1331   case X86::SUB8rr:   case X86::SUB8ri:
1332   case X86::SUB16rr:  case X86::SUB16ri:   case X86::SUB16ri8:
1333   case X86::SUB32rr:  case X86::SUB32ri:   case X86::SUB32ri8:
1334   case X86::SUB64rr:  case X86::SUB64ri8:  case X86::SUB64ri32:
1335   case X86::XOR8rr:   case X86::XOR8ri:
1336   case X86::XOR16rr:  case X86::XOR16ri:   case X86::XOR16ri8:
1337   case X86::XOR32rr:  case X86::XOR32ri:   case X86::XOR32ri8:
1338   case X86::XOR64rr:  case X86::XOR64ri8:  case X86::XOR64ri32:
1339   // Arithmetic with just 32-bit and 64-bit variants and no immediates.
1340   case X86::ADCX32rr: case X86::ADCX64rr:
1341   case X86::ADOX32rr: case X86::ADOX64rr:
1342   case X86::ANDN32rr: case X86::ANDN64rr:
1343   // Unary arithmetic operations.
1344   case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
1345   case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
1346   case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
1347     // Check whether the EFLAGS implicit-def is dead. We assume that this will
1348     // always find the implicit-def because this code should only be reached
1349     // for instructions that do in fact implicitly def this.
1350     if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1351       // If we would clobber EFLAGS that are used, just bail for now.
1352       LLVM_DEBUG(dbgs() << "    Unable to harden post-load due to EFLAGS: ";
1353                  MI.dump(); dbgs() << "\n");
1354       return false;
1355     }
1356
1357     // Otherwise, fallthrough to handle these the same as instructions that
1358     // don't set EFLAGS.
1359     LLVM_FALLTHROUGH;
1360
1361   // Unlike other arithmetic, NOT doesn't set EFLAGS.
1362   case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
1363
1364   // Various move instructions used to zero or sign extend things. Note that we
1365   // intentionally don't support the _NOREX variants as we can't handle that
1366   // register constraint anyways.
1367   case X86::MOVSX16rr8:
1368   case X86::MOVSX32rr8: case X86::MOVSX32rr16:
1369   case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
1370   case X86::MOVZX16rr8:
1371   case X86::MOVZX32rr8: case X86::MOVZX32rr16:
1372   case X86::MOVZX64rr8: case X86::MOVZX64rr16:
1373   case X86::MOV32rr:
1374
1375   // Arithmetic instructions that are both constant time and don't set flags.
1376   case X86::RORX32ri:
1377   case X86::RORX64ri:
1378   case X86::SARX32rr:
1379   case X86::SARX64rr:
1380   case X86::SHLX32rr:
1381   case X86::SHLX64rr:
1382   case X86::SHRX32rr:
1383   case X86::SHRX64rr:
1384
1385   // LEA doesn't actually access memory, and its arithmetic is constant time.
1386   case X86::LEA16r:
1387   case X86::LEA32r:
1388   case X86::LEA64_32r:
1389   case X86::LEA64r:
1390     return true;
1391   }
1392 }
1393
1394 /// Returns true if the instruction has no behavior (specified or otherwise)
1395 /// that is based on the value loaded from memory or the value of any
1396 /// non-address register operands.
1397 ///
1398 /// For example, if the latency of the instruction is dependent on the
1399 /// particular bits set in any of the registers *or* any of the bits loaded from
1400 /// memory.
1401 ///
1402 /// A classical example of something that is inherently not data invariant is an
1403 /// indirect jump -- the destination is loaded into icache based on the bits set
1404 /// in the jump destination register.
1405 ///
1406 /// FIXME: This should become part of our instruction tables.
1407 static bool isDataInvariantLoad(MachineInstr &MI) {
1408   switch (MI.getOpcode()) {
1409   default:
1410     // By default, assume that the load will immediately leak.
1411     return false;
1412
1413   // On x86 it is believed that imul is constant time w.r.t. the loaded data.
1414   // However, they set flags and are perhaps the most surprisingly constant
1415   // time operations so we call them out here separately.
1416   case X86::IMUL16rm:
1417   case X86::IMUL16rmi8:
1418   case X86::IMUL16rmi:
1419   case X86::IMUL32rm:
1420   case X86::IMUL32rmi8:
1421   case X86::IMUL32rmi:
1422   case X86::IMUL64rm:
1423   case X86::IMUL64rmi32:
1424   case X86::IMUL64rmi8:
1425
1426   // Bit scanning and counting instructions that are somewhat surprisingly
1427   // constant time as they scan across bits and do other fairly complex
1428   // operations like popcnt, but are believed to be constant time on x86.
1429   // However, these set flags.
1430   case X86::BSF16rm:
1431   case X86::BSF32rm:
1432   case X86::BSF64rm:
1433   case X86::BSR16rm:
1434   case X86::BSR32rm:
1435   case X86::BSR64rm:
1436   case X86::LZCNT16rm:
1437   case X86::LZCNT32rm:
1438   case X86::LZCNT64rm:
1439   case X86::POPCNT16rm:
1440   case X86::POPCNT32rm:
1441   case X86::POPCNT64rm:
1442   case X86::TZCNT16rm:
1443   case X86::TZCNT32rm:
1444   case X86::TZCNT64rm:
1445
1446   // Bit manipulation instructions are effectively combinations of basic
1447   // arithmetic ops, and should still execute in constant time. These also
1448   // set flags.
1449   case X86::BLCFILL32rm:
1450   case X86::BLCFILL64rm:
1451   case X86::BLCI32rm:
1452   case X86::BLCI64rm:
1453   case X86::BLCIC32rm:
1454   case X86::BLCIC64rm:
1455   case X86::BLCMSK32rm:
1456   case X86::BLCMSK64rm:
1457   case X86::BLCS32rm:
1458   case X86::BLCS64rm:
1459   case X86::BLSFILL32rm:
1460   case X86::BLSFILL64rm:
1461   case X86::BLSI32rm:
1462   case X86::BLSI64rm:
1463   case X86::BLSIC32rm:
1464   case X86::BLSIC64rm:
1465   case X86::BLSMSK32rm:
1466   case X86::BLSMSK64rm:
1467   case X86::BLSR32rm:
1468   case X86::BLSR64rm:
1469   case X86::TZMSK32rm:
1470   case X86::TZMSK64rm:
1471
1472   // Bit extracting and clearing instructions should execute in constant time,
1473   // and set flags.
1474   case X86::BEXTR32rm:
1475   case X86::BEXTR64rm:
1476   case X86::BEXTRI32mi:
1477   case X86::BEXTRI64mi:
1478   case X86::BZHI32rm:
1479   case X86::BZHI64rm:
1480
1481   // Basic arithmetic is constant time on the input but does set flags.
1482   case X86::ADC8rm:
1483   case X86::ADC16rm:
1484   case X86::ADC32rm:
1485   case X86::ADC64rm:
1486   case X86::ADCX32rm:
1487   case X86::ADCX64rm:
1488   case X86::ADD8rm:
1489   case X86::ADD16rm:
1490   case X86::ADD32rm:
1491   case X86::ADD64rm:
1492   case X86::ADOX32rm:
1493   case X86::ADOX64rm:
1494   case X86::AND8rm:
1495   case X86::AND16rm:
1496   case X86::AND32rm:
1497   case X86::AND64rm:
1498   case X86::ANDN32rm:
1499   case X86::ANDN64rm:
1500   case X86::OR8rm:
1501   case X86::OR16rm:
1502   case X86::OR32rm:
1503   case X86::OR64rm:
1504   case X86::SBB8rm:
1505   case X86::SBB16rm:
1506   case X86::SBB32rm:
1507   case X86::SBB64rm:
1508   case X86::SUB8rm:
1509   case X86::SUB16rm:
1510   case X86::SUB32rm:
1511   case X86::SUB64rm:
1512   case X86::XOR8rm:
1513   case X86::XOR16rm:
1514   case X86::XOR32rm:
1515   case X86::XOR64rm:
1516     // Check whether the EFLAGS implicit-def is dead. We assume that this will
1517     // always find the implicit-def because this code should only be reached
1518     // for instructions that do in fact implicitly def this.
1519     if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
1520       // If we would clobber EFLAGS that are used, just bail for now.
1521       LLVM_DEBUG(dbgs() << "    Unable to harden post-load due to EFLAGS: ";
1522                  MI.dump(); dbgs() << "\n");
1523       return false;
1524     }
1525
1526     // Otherwise, fallthrough to handle these the same as instructions that
1527     // don't set EFLAGS.
1528     LLVM_FALLTHROUGH;
1529
1530   // Integer multiply w/o affecting flags is still believed to be constant
1531   // time on x86. Called out separately as this is among the most surprising
1532   // instructions to exhibit that behavior.
1533   case X86::MULX32rm:
1534   case X86::MULX64rm:
1535
1536   // Arithmetic instructions that are both constant time and don't set flags.
1537   case X86::RORX32mi:
1538   case X86::RORX64mi:
1539   case X86::SARX32rm:
1540   case X86::SARX64rm:
1541   case X86::SHLX32rm:
1542   case X86::SHLX64rm:
1543   case X86::SHRX32rm:
1544   case X86::SHRX64rm:
1545
1546   // Conversions are believed to be constant time and don't set flags.
1547   case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
1548   case X86::CVTTSD2SIrm:   case X86::VCVTTSD2SIrm:   case X86::VCVTTSD2SIZrm:
1549   case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
1550   case X86::CVTTSS2SIrm:   case X86::VCVTTSS2SIrm:   case X86::VCVTTSS2SIZrm:
1551   case X86::CVTSI2SDrm:    case X86::VCVTSI2SDrm:    case X86::VCVTSI2SDZrm:
1552   case X86::CVTSI2SSrm:    case X86::VCVTSI2SSrm:    case X86::VCVTSI2SSZrm:
1553   case X86::CVTSI642SDrm:  case X86::VCVTSI642SDrm:  case X86::VCVTSI642SDZrm:
1554   case X86::CVTSI642SSrm:  case X86::VCVTSI642SSrm:  case X86::VCVTSI642SSZrm:
1555   case X86::CVTSS2SDrm:    case X86::VCVTSS2SDrm:    case X86::VCVTSS2SDZrm:
1556   case X86::CVTSD2SSrm:    case X86::VCVTSD2SSrm:    case X86::VCVTSD2SSZrm:
1557   // AVX512 added unsigned integer conversions.
1558   case X86::VCVTTSD2USI64Zrm:
1559   case X86::VCVTTSD2USIZrm:
1560   case X86::VCVTTSS2USI64Zrm:
1561   case X86::VCVTTSS2USIZrm:
1562   case X86::VCVTUSI2SDZrm:
1563   case X86::VCVTUSI642SDZrm:
1564   case X86::VCVTUSI2SSZrm:
1565   case X86::VCVTUSI642SSZrm:
1566
1567   // Loads to register don't set flags.
1568   case X86::MOV8rm:
1569   case X86::MOV8rm_NOREX:
1570   case X86::MOV16rm:
1571   case X86::MOV32rm:
1572   case X86::MOV64rm:
1573   case X86::MOVSX16rm8:
1574   case X86::MOVSX32rm16:
1575   case X86::MOVSX32rm8:
1576   case X86::MOVSX32rm8_NOREX:
1577   case X86::MOVSX64rm16:
1578   case X86::MOVSX64rm32:
1579   case X86::MOVSX64rm8:
1580   case X86::MOVZX16rm8:
1581   case X86::MOVZX32rm16:
1582   case X86::MOVZX32rm8:
1583   case X86::MOVZX32rm8_NOREX:
1584   case X86::MOVZX64rm16:
1585   case X86::MOVZX64rm8:
1586     return true;
1587   }
1588 }
1589
1590 static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
1591                          const TargetRegisterInfo &TRI) {
1592   // Check if EFLAGS are alive by seeing if there is a def of them or they
1593   // live-in, and then seeing if that def is in turn used.
1594   for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) {
1595     if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
1596       // If the def is dead, then EFLAGS is not live.
1597       if (DefOp->isDead())
1598         return false;
1599
1600       // Otherwise we've def'ed it, and it is live.
1601       return true;
1602     }
1603     // While at this instruction, also check if we use and kill EFLAGS
1604     // which means it isn't live.
1605     if (MI.killsRegister(X86::EFLAGS, &TRI))
1606       return false;
1607   }
1608
1609   // If we didn't find anything conclusive (neither definitely alive or
1610   // definitely dead) return whether it lives into the block.
1611   return MBB.isLiveIn(X86::EFLAGS);
1612 }
1613
1614 /// Trace the predicate state through each of the blocks in the function,
1615 /// hardening everything necessary along the way.
1616 ///
1617 /// We call this routine once the initial predicate state has been established
1618 /// for each basic block in the function in the SSA updater. This routine traces
1619 /// it through the instructions within each basic block, and for non-returning
1620 /// blocks informs the SSA updater about the final state that lives out of the
1621 /// block. Along the way, it hardens any vulnerable instruction using the
1622 /// currently valid predicate state. We have to do these two things together
1623 /// because the SSA updater only works across blocks. Within a block, we track
1624 /// the current predicate state directly and update it as it changes.
1625 ///
1626 /// This operates in two passes over each block. First, we analyze the loads in
1627 /// the block to determine which strategy will be used to harden them: hardening
1628 /// the address or hardening the loaded value when loaded into a register
1629 /// amenable to hardening. We have to process these first because the two
1630 /// strategies may interact -- later hardening may change what strategy we wish
1631 /// to use. We also will analyze data dependencies between loads and avoid
1632 /// hardening those loads that are data dependent on a load with a hardened
1633 /// address. We also skip hardening loads already behind an LFENCE as that is
1634 /// sufficient to harden them against misspeculation.
1635 ///
1636 /// Second, we actively trace the predicate state through the block, applying
1637 /// the hardening steps we determined necessary in the first pass as we go.
1638 ///
1639 /// These two passes are applied to each basic block. We operate one block at a
1640 /// time to simplify reasoning about reachability and sequencing.
1641 void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
1642     MachineFunction &MF) {
1643   SmallPtrSet<MachineInstr *, 16> HardenPostLoad;
1644   SmallPtrSet<MachineInstr *, 16> HardenLoadAddr;
1645
1646   SmallSet<unsigned, 16> HardenedAddrRegs;
1647
1648   SmallDenseMap<unsigned, unsigned, 32> AddrRegToHardenedReg;
1649
1650   // Track the set of load-dependent registers through the basic block. Because
1651   // the values of these registers have an existing data dependency on a loaded
1652   // value which we would have checked, we can omit any checks on them.
1653   SparseBitVector<> LoadDepRegs;
1654
1655   for (MachineBasicBlock &MBB : MF) {
1656     // The first pass over the block: collect all the loads which can have their
1657     // loaded value hardened and all the loads that instead need their address
1658     // hardened. During this walk we propagate load dependence for address
1659     // hardened loads and also look for LFENCE to stop hardening wherever
1660     // possible. When deciding whether or not to harden the loaded value or not,
1661     // we check to see if any registers used in the address will have been
1662     // hardened at this point and if so, harden any remaining address registers
1663     // as that often successfully re-uses hardened addresses and minimizes
1664     // instructions.
1665     //
1666     // FIXME: We should consider an aggressive mode where we continue to keep as
1667     // many loads value hardened even when some address register hardening would
1668     // be free (due to reuse).
1669     //
1670     // Note that we only need this pass if we are actually hardening loads.
1671     if (HardenLoads)
1672       for (MachineInstr &MI : MBB) {
1673         // We naively assume that all def'ed registers of an instruction have
1674         // a data dependency on all of their operands.
1675         // FIXME: Do a more careful analysis of x86 to build a conservative
1676         // model here.
1677         if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) {
1678               return Op.isReg() && LoadDepRegs.test(Op.getReg());
1679             }))
1680           for (MachineOperand &Def : MI.defs())
1681             if (Def.isReg())
1682               LoadDepRegs.set(Def.getReg());
1683
1684         // Both Intel and AMD are guiding that they will change the semantics of
1685         // LFENCE to be a speculation barrier, so if we see an LFENCE, there is
1686         // no more need to guard things in this block.
1687         if (MI.getOpcode() == X86::LFENCE)
1688           break;
1689
1690         // If this instruction cannot load, nothing to do.
1691         if (!MI.mayLoad())
1692           continue;
1693
1694         // Some instructions which "load" are trivially safe or unimportant.
1695         if (MI.getOpcode() == X86::MFENCE)
1696           continue;
1697
1698         // Extract the memory operand information about this instruction.
1699         // FIXME: This doesn't handle loading pseudo instructions which we often
1700         // could handle with similarly generic logic. We probably need to add an
1701         // MI-layer routine similar to the MC-layer one we use here which maps
1702         // pseudos much like this maps real instructions.
1703         const MCInstrDesc &Desc = MI.getDesc();
1704         int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1705         if (MemRefBeginIdx < 0) {
1706           LLVM_DEBUG(dbgs()
1707                          << "WARNING: unable to harden loading instruction: ";
1708                      MI.dump());
1709           continue;
1710         }
1711
1712         MemRefBeginIdx += X86II::getOperandBias(Desc);
1713
1714         MachineOperand &BaseMO =
1715             MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1716         MachineOperand &IndexMO =
1717             MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1718
1719         // If we have at least one (non-frame-index, non-RIP) register operand,
1720         // and neither operand is load-dependent, we need to check the load.
1721         unsigned BaseReg = 0, IndexReg = 0;
1722         if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP &&
1723             BaseMO.getReg() != X86::NoRegister)
1724           BaseReg = BaseMO.getReg();
1725         if (IndexMO.getReg() != X86::NoRegister)
1726           IndexReg = IndexMO.getReg();
1727
1728         if (!BaseReg && !IndexReg)
1729           // No register operands!
1730           continue;
1731
1732         // If any register operand is dependent, this load is dependent and we
1733         // needn't check it.
1734         // FIXME: Is this true in the case where we are hardening loads after
1735         // they complete? Unclear, need to investigate.
1736         if ((BaseReg && LoadDepRegs.test(BaseReg)) ||
1737             (IndexReg && LoadDepRegs.test(IndexReg)))
1738           continue;
1739
1740         // If post-load hardening is enabled, this load is compatible with
1741         // post-load hardening, and we aren't already going to harden one of the
1742         // address registers, queue it up to be hardened post-load. Notably,
1743         // even once hardened this won't introduce a useful dependency that
1744         // could prune out subsequent loads.
1745         if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
1746             MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
1747             canHardenRegister(MI.getOperand(0).getReg()) &&
1748             !HardenedAddrRegs.count(BaseReg) &&
1749             !HardenedAddrRegs.count(IndexReg)) {
1750           HardenPostLoad.insert(&MI);
1751           HardenedAddrRegs.insert(MI.getOperand(0).getReg());
1752           continue;
1753         }
1754
1755         // Record this instruction for address hardening and record its register
1756         // operands as being address-hardened.
1757         HardenLoadAddr.insert(&MI);
1758         if (BaseReg)
1759           HardenedAddrRegs.insert(BaseReg);
1760         if (IndexReg)
1761           HardenedAddrRegs.insert(IndexReg);
1762
1763         for (MachineOperand &Def : MI.defs())
1764           if (Def.isReg())
1765             LoadDepRegs.set(Def.getReg());
1766       }
1767
1768     // Now re-walk the instructions in the basic block, and apply whichever
1769     // hardening strategy we have elected. Note that we do this in a second
1770     // pass specifically so that we have the complete set of instructions for
1771     // which we will do post-load hardening and can defer it in certain
1772     // circumstances.
1773     for (MachineInstr &MI : MBB) {
1774       if (HardenLoads) {
1775         // We cannot both require hardening the def of a load and its address.
1776         assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) &&
1777                "Requested to harden both the address and def of a load!");
1778
1779         // Check if this is a load whose address needs to be hardened.
1780         if (HardenLoadAddr.erase(&MI)) {
1781           const MCInstrDesc &Desc = MI.getDesc();
1782           int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
1783           assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!");
1784
1785           MemRefBeginIdx += X86II::getOperandBias(Desc);
1786
1787           MachineOperand &BaseMO =
1788               MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
1789           MachineOperand &IndexMO =
1790               MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
1791           hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
1792           continue;
1793         }
1794
1795         // Test if this instruction is one of our post load instructions (and
1796         // remove it from the set if so).
1797         if (HardenPostLoad.erase(&MI)) {
1798           assert(!MI.isCall() && "Must not try to post-load harden a call!");
1799
1800           // If this is a data-invariant load, we want to try and sink any
1801           // hardening as far as possible.
1802           if (isDataInvariantLoad(MI)) {
1803             // Sink the instruction we'll need to harden as far as we can down
1804             // the graph.
1805             MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
1806
1807             // If we managed to sink this instruction, update everything so we
1808             // harden that instruction when we reach it in the instruction
1809             // sequence.
1810             if (SunkMI != &MI) {
1811               // If in sinking there was no instruction needing to be hardened,
1812               // we're done.
1813               if (!SunkMI)
1814                 continue;
1815
1816               // Otherwise, add this to the set of defs we harden.
1817               HardenPostLoad.insert(SunkMI);
1818               continue;
1819             }
1820           }
1821
1822           unsigned HardenedReg = hardenPostLoad(MI);
1823
1824           // Mark the resulting hardened register as such so we don't re-harden.
1825           AddrRegToHardenedReg[HardenedReg] = HardenedReg;
1826
1827           continue;
1828         }
1829
1830         // Check for an indirect call or branch that may need its input hardened
1831         // even if we couldn't find the specific load used, or were able to
1832         // avoid hardening it for some reason. Note that here we cannot break
1833         // out afterward as we may still need to handle any call aspect of this
1834         // instruction.
1835         if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
1836           hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
1837       }
1838
1839       // After we finish hardening loads we handle interprocedural hardening if
1840       // enabled and relevant for this instruction.
1841       if (!HardenInterprocedurally)
1842         continue;
1843       if (!MI.isCall() && !MI.isReturn())
1844         continue;
1845
1846       // If this is a direct return (IE, not a tail call) just directly harden
1847       // it.
1848       if (MI.isReturn() && !MI.isCall()) {
1849         hardenReturnInstr(MI);
1850         continue;
1851       }
1852
1853       // Otherwise we have a call. We need to handle transferring the predicate
1854       // state into a call and recovering it after the call returns (unless this
1855       // is a tail call).
1856       assert(MI.isCall() && "Should only reach here for calls!");
1857       tracePredStateThroughCall(MI);
1858     }
1859
1860     HardenPostLoad.clear();
1861     HardenLoadAddr.clear();
1862     HardenedAddrRegs.clear();
1863     AddrRegToHardenedReg.clear();
1864
1865     // Currently, we only track data-dependent loads within a basic block.
1866     // FIXME: We should see if this is necessary or if we could be more
1867     // aggressive here without opening up attack avenues.
1868     LoadDepRegs.clear();
1869   }
1870 }
1871
1872 /// Save EFLAGS into the returned GPR. This can in turn be restored with
1873 /// `restoreEFLAGS`.
1874 ///
1875 /// Note that LLVM can only lower very simple patterns of saved and restored
1876 /// EFLAGS registers. The restore should always be within the same basic block
1877 /// as the save so that no PHI nodes are inserted.
1878 unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS(
1879     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1880     DebugLoc Loc) {
1881   // FIXME: Hard coding this to a 32-bit register class seems weird, but matches
1882   // what instruction selection does.
1883   unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass);
1884   // We directly copy the FLAGS register and rely on later lowering to clean
1885   // this up into the appropriate setCC instructions.
1886   BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS);
1887   ++NumInstsInserted;
1888   return Reg;
1889 }
1890
1891 /// Restore EFLAGS from the provided GPR. This should be produced by
1892 /// `saveEFLAGS`.
1893 ///
1894 /// This must be done within the same basic block as the save in order to
1895 /// reliably lower.
1896 void X86SpeculativeLoadHardeningPass::restoreEFLAGS(
1897     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1898     unsigned Reg) {
1899   BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg);
1900   ++NumInstsInserted;
1901 }
1902
1903 /// Takes the current predicate state (in a register) and merges it into the
1904 /// stack pointer. The state is essentially a single bit, but we merge this in
1905 /// a way that won't form non-canonical pointers and also will be preserved
1906 /// across normal stack adjustments.
1907 void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP(
1908     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
1909     unsigned PredStateReg) {
1910   unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
1911   // FIXME: This hard codes a shift distance based on the number of bits needed
1912   // to stay canonical on 64-bit. We should compute this somehow and support
1913   // 32-bit as part of that.
1914   auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg)
1915                     .addReg(PredStateReg, RegState::Kill)
1916                     .addImm(47);
1917   ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1918   ++NumInstsInserted;
1919   auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP)
1920                  .addReg(X86::RSP)
1921                  .addReg(TmpReg, RegState::Kill);
1922   OrI->addRegisterDead(X86::EFLAGS, TRI);
1923   ++NumInstsInserted;
1924 }
1925
1926 /// Extracts the predicate state stored in the high bits of the stack pointer.
1927 unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP(
1928     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
1929     DebugLoc Loc) {
1930   unsigned PredStateReg = MRI->createVirtualRegister(PS->RC);
1931   unsigned TmpReg = MRI->createVirtualRegister(PS->RC);
1932
1933   // We know that the stack pointer will have any preserved predicate state in
1934   // its high bit. We just want to smear this across the other bits. Turns out,
1935   // this is exactly what an arithmetic right shift does.
1936   BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg)
1937       .addReg(X86::RSP);
1938   auto ShiftI =
1939       BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg)
1940           .addReg(TmpReg, RegState::Kill)
1941           .addImm(TRI->getRegSizeInBits(*PS->RC) - 1);
1942   ShiftI->addRegisterDead(X86::EFLAGS, TRI);
1943   ++NumInstsInserted;
1944
1945   return PredStateReg;
1946 }
1947
1948 void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
1949     MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
1950     SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
1951   MachineBasicBlock &MBB = *MI.getParent();
1952   DebugLoc Loc = MI.getDebugLoc();
1953
1954   // Check if EFLAGS are alive by seeing if there is a def of them or they
1955   // live-in, and then seeing if that def is in turn used.
1956   bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI);
1957
1958   SmallVector<MachineOperand *, 2> HardenOpRegs;
1959
1960   if (BaseMO.isFI()) {
1961     // A frame index is never a dynamically controllable load, so only
1962     // harden it if we're covering fixed address loads as well.
1963     LLVM_DEBUG(
1964         dbgs() << "  Skipping hardening base of explicit stack frame load: ";
1965         MI.dump(); dbgs() << "\n");
1966   } else if (BaseMO.getReg() == X86::RIP ||
1967              BaseMO.getReg() == X86::NoRegister) {
1968     // For both RIP-relative addressed loads or absolute loads, we cannot
1969     // meaningfully harden them because the address being loaded has no
1970     // dynamic component.
1971     //
1972     // FIXME: When using a segment base (like TLS does) we end up with the
1973     // dynamic address being the base plus -1 because we can't mutate the
1974     // segment register here. This allows the signed 32-bit offset to point at
1975     // valid segment-relative addresses and load them successfully.
1976     LLVM_DEBUG(
1977         dbgs() << "  Cannot harden base of "
1978                << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base")
1979                << " address in a load!");
1980   } else {
1981     assert(BaseMO.isReg() &&
1982            "Only allowed to have a frame index or register base.");
1983     HardenOpRegs.push_back(&BaseMO);
1984   }
1985
1986   if (IndexMO.getReg() != X86::NoRegister &&
1987       (HardenOpRegs.empty() ||
1988        HardenOpRegs.front()->getReg() != IndexMO.getReg()))
1989     HardenOpRegs.push_back(&IndexMO);
1990
1991   assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) &&
1992          "Should have exactly one or two registers to harden!");
1993   assert((HardenOpRegs.size() == 1 ||
1994           HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) &&
1995          "Should not have two of the same registers!");
1996
1997   // Remove any registers that have alreaded been checked.
1998   llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) {
1999     // See if this operand's register has already been checked.
2000     auto It = AddrRegToHardenedReg.find(Op->getReg());
2001     if (It == AddrRegToHardenedReg.end())
2002       // Not checked, so retain this one.
2003       return false;
2004
2005     // Otherwise, we can directly update this operand and remove it.
2006     Op->setReg(It->second);
2007     return true;
2008   });
2009   // If there are none left, we're done.
2010   if (HardenOpRegs.empty())
2011     return;
2012
2013   // Compute the current predicate state.
2014   unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
2015
2016   auto InsertPt = MI.getIterator();
2017
2018   // If EFLAGS are live and we don't have access to instructions that avoid
2019   // clobbering EFLAGS we need to save and restore them. This in turn makes
2020   // the EFLAGS no longer live.
2021   unsigned FlagsReg = 0;
2022   if (EFLAGSLive && !Subtarget->hasBMI2()) {
2023     EFLAGSLive = false;
2024     FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
2025   }
2026
2027   for (MachineOperand *Op : HardenOpRegs) {
2028     unsigned OpReg = Op->getReg();
2029     auto *OpRC = MRI->getRegClass(OpReg);
2030     unsigned TmpReg = MRI->createVirtualRegister(OpRC);
2031
2032     // If this is a vector register, we'll need somewhat custom logic to handle
2033     // hardening it.
2034     if (!Subtarget->hasVLX() && (OpRC->hasSuperClassEq(&X86::VR128RegClass) ||
2035                                  OpRC->hasSuperClassEq(&X86::VR256RegClass))) {
2036       assert(Subtarget->hasAVX2() && "AVX2-specific register classes!");
2037       bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128RegClass);
2038
2039       // Move our state into a vector register.
2040       // FIXME: We could skip this at the cost of longer encodings with AVX-512
2041       // but that doesn't seem likely worth it.
2042       unsigned VStateReg = MRI->createVirtualRegister(&X86::VR128RegClass);
2043       auto MovI =
2044           BuildMI(MBB, InsertPt, Loc, TII->get(X86::VMOV64toPQIrr), VStateReg)
2045               .addReg(StateReg);
2046       (void)MovI;
2047       ++NumInstsInserted;
2048       LLVM_DEBUG(dbgs() << "  Inserting mov: "; MovI->dump(); dbgs() << "\n");
2049
2050       // Broadcast it across the vector register.
2051       unsigned VBStateReg = MRI->createVirtualRegister(OpRC);
2052       auto BroadcastI = BuildMI(MBB, InsertPt, Loc,
2053                                 TII->get(Is128Bit ? X86::VPBROADCASTQrr
2054                                                   : X86::VPBROADCASTQYrr),
2055                                 VBStateReg)
2056                             .addReg(VStateReg);
2057       (void)BroadcastI;
2058       ++NumInstsInserted;
2059       LLVM_DEBUG(dbgs() << "  Inserting broadcast: "; BroadcastI->dump();
2060                  dbgs() << "\n");
2061
2062       // Merge our potential poison state into the value with a vector or.
2063       auto OrI =
2064           BuildMI(MBB, InsertPt, Loc,
2065                   TII->get(Is128Bit ? X86::VPORrr : X86::VPORYrr), TmpReg)
2066               .addReg(VBStateReg)
2067               .addReg(OpReg);
2068       (void)OrI;
2069       ++NumInstsInserted;
2070       LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
2071     } else if (OpRC->hasSuperClassEq(&X86::VR128XRegClass) ||
2072                OpRC->hasSuperClassEq(&X86::VR256XRegClass) ||
2073                OpRC->hasSuperClassEq(&X86::VR512RegClass)) {
2074       assert(Subtarget->hasAVX512() && "AVX512-specific register classes!");
2075       bool Is128Bit = OpRC->hasSuperClassEq(&X86::VR128XRegClass);
2076       bool Is256Bit = OpRC->hasSuperClassEq(&X86::VR256XRegClass);
2077       if (Is128Bit || Is256Bit)
2078         assert(Subtarget->hasVLX() && "AVX512VL-specific register classes!");
2079
2080       // Broadcast our state into a vector register.
2081       unsigned VStateReg = MRI->createVirtualRegister(OpRC);
2082       unsigned BroadcastOp =
2083           Is128Bit ? X86::VPBROADCASTQrZ128r
2084                    : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
2085       auto BroadcastI =
2086           BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
2087               .addReg(StateReg);
2088       (void)BroadcastI;
2089       ++NumInstsInserted;
2090       LLVM_DEBUG(dbgs() << "  Inserting broadcast: "; BroadcastI->dump();
2091                  dbgs() << "\n");
2092
2093       // Merge our potential poison state into the value with a vector or.
2094       unsigned OrOp = Is128Bit ? X86::VPORQZ128rr
2095                                : Is256Bit ? X86::VPORQZ256rr : X86::VPORQZrr;
2096       auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOp), TmpReg)
2097                      .addReg(VStateReg)
2098                      .addReg(OpReg);
2099       (void)OrI;
2100       ++NumInstsInserted;
2101       LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
2102     } else {
2103       // FIXME: Need to support GR32 here for 32-bit code.
2104       assert(OpRC->hasSuperClassEq(&X86::GR64RegClass) &&
2105              "Not a supported register class for address hardening!");
2106
2107       if (!EFLAGSLive) {
2108         // Merge our potential poison state into the value with an or.
2109         auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg)
2110                        .addReg(StateReg)
2111                        .addReg(OpReg);
2112         OrI->addRegisterDead(X86::EFLAGS, TRI);
2113         ++NumInstsInserted;
2114         LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
2115       } else {
2116         // We need to avoid touching EFLAGS so shift out all but the least
2117         // significant bit using the instruction that doesn't update flags.
2118         auto ShiftI =
2119             BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg)
2120                 .addReg(OpReg)
2121                 .addReg(StateReg);
2122         (void)ShiftI;
2123         ++NumInstsInserted;
2124         LLVM_DEBUG(dbgs() << "  Inserting shrx: "; ShiftI->dump();
2125                    dbgs() << "\n");
2126       }
2127     }
2128
2129     // Record this register as checked and update the operand.
2130     assert(!AddrRegToHardenedReg.count(Op->getReg()) &&
2131            "Should not have checked this register yet!");
2132     AddrRegToHardenedReg[Op->getReg()] = TmpReg;
2133     Op->setReg(TmpReg);
2134     ++NumAddrRegsHardened;
2135   }
2136
2137   // And restore the flags if needed.
2138   if (FlagsReg)
2139     restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
2140 }
2141
2142 MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
2143     MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
2144   assert(isDataInvariantLoad(InitialMI) &&
2145          "Cannot get here with a non-invariant load!");
2146
2147   // See if we can sink hardening the loaded value.
2148   auto SinkCheckToSingleUse =
2149       [&](MachineInstr &MI) -> Optional<MachineInstr *> {
2150     unsigned DefReg = MI.getOperand(0).getReg();
2151
2152     // We need to find a single use which we can sink the check. We can
2153     // primarily do this because many uses may already end up checked on their
2154     // own.
2155     MachineInstr *SingleUseMI = nullptr;
2156     for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
2157       // If we're already going to harden this use, it is data invariant and
2158       // within our block.
2159       if (HardenedInstrs.count(&UseMI)) {
2160         if (!isDataInvariantLoad(UseMI)) {
2161           // If we've already decided to harden a non-load, we must have sunk
2162           // some other post-load hardened instruction to it and it must itself
2163           // be data-invariant.
2164           assert(isDataInvariant(UseMI) &&
2165                  "Data variant instruction being hardened!");
2166           continue;
2167         }
2168
2169         // Otherwise, this is a load and the load component can't be data
2170         // invariant so check how this register is being used.
2171         const MCInstrDesc &Desc = UseMI.getDesc();
2172         int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
2173         assert(MemRefBeginIdx >= 0 &&
2174                "Should always have mem references here!");
2175         MemRefBeginIdx += X86II::getOperandBias(Desc);
2176
2177         MachineOperand &BaseMO =
2178             UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
2179         MachineOperand &IndexMO =
2180             UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
2181         if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) ||
2182             (IndexMO.isReg() && IndexMO.getReg() == DefReg))
2183           // The load uses the register as part of its address making it not
2184           // invariant.
2185           return {};
2186
2187         continue;
2188       }
2189
2190       if (SingleUseMI)
2191         // We already have a single use, this would make two. Bail.
2192         return {};
2193
2194       // If this single use isn't data invariant, isn't in this block, or has
2195       // interfering EFLAGS, we can't sink the hardening to it.
2196       if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
2197         return {};
2198
2199       // If this instruction defines multiple registers bail as we won't harden
2200       // all of them.
2201       if (UseMI.getDesc().getNumDefs() > 1)
2202         return {};
2203
2204       // If this register isn't a virtual register we can't walk uses of sanely,
2205       // just bail. Also check that its register class is one of the ones we
2206       // can harden.
2207       unsigned UseDefReg = UseMI.getOperand(0).getReg();
2208       if (!TRI->isVirtualRegister(UseDefReg) ||
2209           !canHardenRegister(UseDefReg))
2210         return {};
2211
2212       SingleUseMI = &UseMI;
2213     }
2214
2215     // If SingleUseMI is still null, there is no use that needs its own
2216     // checking. Otherwise, it is the single use that needs checking.
2217     return {SingleUseMI};
2218   };
2219
2220   MachineInstr *MI = &InitialMI;
2221   while (Optional<MachineInstr *> SingleUse = SinkCheckToSingleUse(*MI)) {
2222     // Update which MI we're checking now.
2223     MI = *SingleUse;
2224     if (!MI)
2225       break;
2226   }
2227
2228   return MI;
2229 }
2230
2231 bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
2232   auto *RC = MRI->getRegClass(Reg);
2233   int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
2234   if (RegBytes > 8)
2235     // We don't support post-load hardening of vectors.
2236     return false;
2237
2238   // If this register class is explicitly constrained to a class that doesn't
2239   // require REX prefix, we may not be able to satisfy that constraint when
2240   // emitting the hardening instructions, so bail out here.
2241   // FIXME: This seems like a pretty lame hack. The way this comes up is when we
2242   // end up both with a NOREX and REX-only register as operands to the hardening
2243   // instructions. It would be better to fix that code to handle this situation
2244   // rather than hack around it in this way.
2245   const TargetRegisterClass *NOREXRegClasses[] = {
2246       &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
2247       &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
2248   if (RC == NOREXRegClasses[Log2_32(RegBytes)])
2249     return false;
2250
2251   const TargetRegisterClass *GPRRegClasses[] = {
2252       &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
2253       &X86::GR64RegClass};
2254   return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
2255 }
2256
2257 /// Harden a value in a register.
2258 ///
2259 /// This is the low-level logic to fully harden a value sitting in a register
2260 /// against leaking during speculative execution.
2261 ///
2262 /// Unlike hardening an address that is used by a load, this routine is required
2263 /// to hide *all* incoming bits in the register.
2264 ///
2265 /// `Reg` must be a virtual register. Currently, it is required to be a GPR no
2266 /// larger than the predicate state register. FIXME: We should support vector
2267 /// registers here by broadcasting the predicate state.
2268 ///
2269 /// The new, hardened virtual register is returned. It will have the same
2270 /// register class as `Reg`.
2271 unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
2272     unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
2273     DebugLoc Loc) {
2274   assert(canHardenRegister(Reg) && "Cannot harden this register!");
2275   assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
2276
2277   auto *RC = MRI->getRegClass(Reg);
2278   int Bytes = TRI->getRegSizeInBits(*RC) / 8;
2279
2280   unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
2281
2282   // FIXME: Need to teach this about 32-bit mode.
2283   if (Bytes != 8) {
2284     unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
2285     unsigned SubRegImm = SubRegImms[Log2_32(Bytes)];
2286     unsigned NarrowStateReg = MRI->createVirtualRegister(RC);
2287     BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
2288         .addReg(StateReg, 0, SubRegImm);
2289     StateReg = NarrowStateReg;
2290   }
2291
2292   unsigned FlagsReg = 0;
2293   if (isEFLAGSLive(MBB, InsertPt, *TRI))
2294     FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
2295
2296   unsigned NewReg = MRI->createVirtualRegister(RC);
2297   unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
2298   unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
2299   auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewReg)
2300                  .addReg(StateReg)
2301                  .addReg(Reg);
2302   OrI->addRegisterDead(X86::EFLAGS, TRI);
2303   ++NumInstsInserted;
2304   LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
2305
2306   if (FlagsReg)
2307     restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
2308
2309   return NewReg;
2310 }
2311
2312 /// Harden a load by hardening the loaded value in the defined register.
2313 ///
2314 /// We can harden a non-leaking load into a register without touching the
2315 /// address by just hiding all of the loaded bits during misspeculation. We use
2316 /// an `or` instruction to do this because we set up our poison value as all
2317 /// ones. And the goal is just for the loaded bits to not be exposed to
2318 /// execution and coercing them to one is sufficient.
2319 ///
2320 /// Returns the newly hardened register.
2321 unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
2322   MachineBasicBlock &MBB = *MI.getParent();
2323   DebugLoc Loc = MI.getDebugLoc();
2324
2325   auto &DefOp = MI.getOperand(0);
2326   unsigned OldDefReg = DefOp.getReg();
2327   auto *DefRC = MRI->getRegClass(OldDefReg);
2328
2329   // Because we want to completely replace the uses of this def'ed value with
2330   // the hardened value, create a dedicated new register that will only be used
2331   // to communicate the unhardened value to the hardening.
2332   unsigned UnhardenedReg = MRI->createVirtualRegister(DefRC);
2333   DefOp.setReg(UnhardenedReg);
2334
2335   // Now harden this register's value, getting a hardened reg that is safe to
2336   // use. Note that we insert the instructions to compute this *after* the
2337   // defining instruction, not before it.
2338   unsigned HardenedReg = hardenValueInRegister(
2339       UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
2340
2341   // Finally, replace the old register (which now only has the uses of the
2342   // original def) with the hardened register.
2343   MRI->replaceRegWith(/*FromReg*/ OldDefReg, /*ToReg*/ HardenedReg);
2344
2345   ++NumPostLoadRegsHardened;
2346   return HardenedReg;
2347 }
2348
2349 /// Harden a return instruction.
2350 ///
2351 /// Returns implicitly perform a load which we need to harden. Without hardening
2352 /// this load, an attacker my speculatively write over the return address to
2353 /// steer speculation of the return to an attacker controlled address. This is
2354 /// called Spectre v1.1 or Bounds Check Bypass Store (BCBS) and is described in
2355 /// this paper:
2356 /// https://people.csail.mit.edu/vlk/spectre11.pdf
2357 ///
2358 /// We can harden this by introducing an LFENCE that will delay any load of the
2359 /// return address until prior instructions have retired (and thus are not being
2360 /// speculated), or we can harden the address used by the implicit load: the
2361 /// stack pointer.
2362 ///
2363 /// If we are not using an LFENCE, hardening the stack pointer has an additional
2364 /// benefit: it allows us to pass the predicate state accumulated in this
2365 /// function back to the caller. In the absence of a BCBS attack on the return,
2366 /// the caller will typically be resumed and speculatively executed due to the
2367 /// Return Stack Buffer (RSB) prediction which is very accurate and has a high
2368 /// priority. It is possible that some code from the caller will be executed
2369 /// speculatively even during a BCBS-attacked return until the steering takes
2370 /// effect. Whenever this happens, the caller can recover the (poisoned)
2371 /// predicate state from the stack pointer and continue to harden loads.
2372 void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
2373   MachineBasicBlock &MBB = *MI.getParent();
2374   DebugLoc Loc = MI.getDebugLoc();
2375   auto InsertPt = MI.getIterator();
2376
2377   if (FenceCallAndRet)
2378     // No need to fence here as we'll fence at the return site itself. That
2379     // handles more cases than we can handle here.
2380     return;
2381
2382   // Take our predicate state, shift it to the high 17 bits (so that we keep
2383   // pointers canonical) and merge it into RSP. This will allow the caller to
2384   // extract it when we return (speculatively).
2385   mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
2386 }
2387
2388 /// Trace the predicate state through a call.
2389 ///
2390 /// There are several layers of this needed to handle the full complexity of
2391 /// calls.
2392 ///
2393 /// First, we need to send the predicate state into the called function. We do
2394 /// this by merging it into the high bits of the stack pointer.
2395 ///
2396 /// For tail calls, this is all we need to do.
2397 ///
2398 /// For calls where we might return and resume the control flow, we need to
2399 /// extract the predicate state from the high bits of the stack pointer after
2400 /// control returns from the called function.
2401 ///
2402 /// We also need to verify that we intended to return to this location in the
2403 /// code. An attacker might arrange for the processor to mispredict the return
2404 /// to this valid but incorrect return address in the program rather than the
2405 /// correct one. See the paper on this attack, called "ret2spec" by the
2406 /// researchers, here:
2407 /// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
2408 ///
2409 /// The way we verify that we returned to the correct location is by preserving
2410 /// the expected return address across the call. One technique involves taking
2411 /// advantage of the red-zone to load the return address from `8(%rsp)` where it
2412 /// was left by the RET instruction when it popped `%rsp`. Alternatively, we can
2413 /// directly save the address into a register that will be preserved across the
2414 /// call. We compare this intended return address against the address
2415 /// immediately following the call (the observed return address). If these
2416 /// mismatch, we have detected misspeculation and can poison our predicate
2417 /// state.
2418 void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
2419     MachineInstr &MI) {
2420   MachineBasicBlock &MBB = *MI.getParent();
2421   MachineFunction &MF = *MBB.getParent();
2422   auto InsertPt = MI.getIterator();
2423   DebugLoc Loc = MI.getDebugLoc();
2424
2425   if (FenceCallAndRet) {
2426     if (MI.isReturn())
2427       // Tail call, we don't return to this function.
2428       // FIXME: We should also handle noreturn calls.
2429       return;
2430
2431     // We don't need to fence before the call because the function should fence
2432     // in its entry. However, we do need to fence after the call returns.
2433     // Fencing before the return doesn't correctly handle cases where the return
2434     // itself is mispredicted.
2435     BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
2436     ++NumInstsInserted;
2437     ++NumLFENCEsInserted;
2438     return;
2439   }
2440
2441   // First, we transfer the predicate state into the called function by merging
2442   // it into the stack pointer. This will kill the current def of the state.
2443   unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
2444   mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
2445
2446   // If this call is also a return, it is a tail call and we don't need anything
2447   // else to handle it so just return. Also, if there are no further
2448   // instructions and no successors, this call does not return so we can also
2449   // bail.
2450   if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
2451     return;
2452
2453   // Create a symbol to track the return address and attach it to the call
2454   // machine instruction. We will lower extra symbols attached to call
2455   // instructions as label immediately following the call.
2456   MCSymbol *RetSymbol =
2457       MF.getContext().createTempSymbol("slh_ret_addr",
2458                                        /*AlwaysAddSuffix*/ true);
2459   MI.setPostInstrSymbol(MF, RetSymbol);
2460
2461   const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
2462   unsigned ExpectedRetAddrReg = 0;
2463
2464   // If we have no red zones or if the function returns twice (possibly without
2465   // using the `ret` instruction) like setjmp, we need to save the expected
2466   // return address prior to the call.
2467   if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone) ||
2468       MF.exposesReturnsTwice()) {
2469     // If we don't have red zones, we need to compute the expected return
2470     // address prior to the call and store it in a register that lives across
2471     // the call.
2472     //
2473     // In some ways, this is doubly satisfying as a mitigation because it will
2474     // also successfully detect stack smashing bugs in some cases (typically,
2475     // when a callee-saved register is used and the callee doesn't push it onto
2476     // the stack). But that isn't our primary goal, so we only use it as
2477     // a fallback.
2478     //
2479     // FIXME: It isn't clear that this is reliable in the face of
2480     // rematerialization in the register allocator. We somehow need to force
2481     // that to not occur for this particular instruction, and instead to spill
2482     // or otherwise preserve the value computed *prior* to the call.
2483     //
2484     // FIXME: It is even less clear why MachineCSE can't just fold this when we
2485     // end up having to use identical instructions both before and after the
2486     // call to feed the comparison.
2487     ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
2488     if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2489         !Subtarget->isPositionIndependent()) {
2490       BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
2491           .addSym(RetSymbol);
2492     } else {
2493       BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
2494           .addReg(/*Base*/ X86::RIP)
2495           .addImm(/*Scale*/ 1)
2496           .addReg(/*Index*/ 0)
2497           .addSym(RetSymbol)
2498           .addReg(/*Segment*/ 0);
2499     }
2500   }
2501
2502   // Step past the call to handle when it returns.
2503   ++InsertPt;
2504
2505   // If we didn't pre-compute the expected return address into a register, then
2506   // red zones are enabled and the return address is still available on the
2507   // stack immediately after the call. As the very first instruction, we load it
2508   // into a register.
2509   if (!ExpectedRetAddrReg) {
2510     ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
2511     BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
2512         .addReg(/*Base*/ X86::RSP)
2513         .addImm(/*Scale*/ 1)
2514         .addReg(/*Index*/ 0)
2515         .addImm(/*Displacement*/ -8) // The stack pointer has been popped, so
2516                                      // the return address is 8-bytes past it.
2517         .addReg(/*Segment*/ 0);
2518   }
2519
2520   // Now we extract the callee's predicate state from the stack pointer.
2521   unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
2522
2523   // Test the expected return address against our actual address. If we can
2524   // form this basic block's address as an immediate, this is easy. Otherwise
2525   // we compute it.
2526   if (MF.getTarget().getCodeModel() == CodeModel::Small &&
2527       !Subtarget->isPositionIndependent()) {
2528     // FIXME: Could we fold this with the load? It would require careful EFLAGS
2529     // management.
2530     BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
2531         .addReg(ExpectedRetAddrReg, RegState::Kill)
2532         .addSym(RetSymbol);
2533   } else {
2534     unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
2535     BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
2536         .addReg(/*Base*/ X86::RIP)
2537         .addImm(/*Scale*/ 1)
2538         .addReg(/*Index*/ 0)
2539         .addSym(RetSymbol)
2540         .addReg(/*Segment*/ 0);
2541     BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
2542         .addReg(ExpectedRetAddrReg, RegState::Kill)
2543         .addReg(ActualRetAddrReg, RegState::Kill);
2544   }
2545
2546   // Now conditionally update the predicate state we just extracted if we ended
2547   // up at a different return address than expected.
2548   int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
2549   auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
2550
2551   unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
2552   auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
2553                    .addReg(NewStateReg, RegState::Kill)
2554                    .addReg(PS->PoisonReg);
2555   CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
2556   ++NumInstsInserted;
2557   LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
2558
2559   PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
2560 }
2561
2562 /// An attacker may speculatively store over a value that is then speculatively
2563 /// loaded and used as the target of an indirect call or jump instruction. This
2564 /// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
2565 /// in this paper:
2566 /// https://people.csail.mit.edu/vlk/spectre11.pdf
2567 ///
2568 /// When this happens, the speculative execution of the call or jump will end up
2569 /// being steered to this attacker controlled address. While most such loads
2570 /// will be adequately hardened already, we want to ensure that they are
2571 /// definitively treated as needing post-load hardening. While address hardening
2572 /// is sufficient to prevent secret data from leaking to the attacker, it may
2573 /// not be sufficient to prevent an attacker from steering speculative
2574 /// execution. We forcibly unfolded all relevant loads above and so will always
2575 /// have an opportunity to post-load harden here, we just need to scan for cases
2576 /// not already flagged and add them.
2577 void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
2578     MachineInstr &MI,
2579     SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
2580   switch (MI.getOpcode()) {
2581   case X86::FARCALL16m:
2582   case X86::FARCALL32m:
2583   case X86::FARCALL64:
2584   case X86::FARJMP16m:
2585   case X86::FARJMP32m:
2586   case X86::FARJMP64:
2587     // We don't need to harden either far calls or far jumps as they are
2588     // safe from Spectre.
2589     return;
2590
2591   default:
2592     break;
2593   }
2594
2595   // We should never see a loading instruction at this point, as those should
2596   // have been unfolded.
2597   assert(!MI.mayLoad() && "Found a lingering loading instruction!");
2598
2599   // If the first operand isn't a register, this is a branch or call
2600   // instruction with an immediate operand which doesn't need to be hardened.
2601   if (!MI.getOperand(0).isReg())
2602     return;
2603
2604   // For all of these, the target register is the first operand of the
2605   // instruction.
2606   auto &TargetOp = MI.getOperand(0);
2607   unsigned OldTargetReg = TargetOp.getReg();
2608
2609   // Try to lookup a hardened version of this register. We retain a reference
2610   // here as we want to update the map to track any newly computed hardened
2611   // register.
2612   unsigned &HardenedTargetReg = AddrRegToHardenedReg[OldTargetReg];
2613
2614   // If we don't have a hardened register yet, compute one. Otherwise, just use
2615   // the already hardened register.
2616   //
2617   // FIXME: It is a little suspect that we use partially hardened registers that
2618   // only feed addresses. The complexity of partial hardening with SHRX
2619   // continues to pile up. Should definitively measure its value and consider
2620   // eliminating it.
2621   if (!HardenedTargetReg)
2622     HardenedTargetReg = hardenValueInRegister(
2623         OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
2624
2625   // Set the target operand to the hardened register.
2626   TargetOp.setReg(HardenedTargetReg);
2627
2628   ++NumCallsOrJumpsHardened;
2629 }
2630
2631 INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, PASS_KEY,
2632                       "X86 speculative load hardener", false, false)
2633 INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, PASS_KEY,
2634                     "X86 speculative load hardener", false, false)
2635
2636 FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() {
2637   return new X86SpeculativeLoadHardeningPass();
2638 }