contrib/llvm-project/lld/ELF/Arch/X86_64.cpp

   1 //===- X86_64.cpp ---------------------------------------------------------===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "InputFiles.h"
  10 #include "Symbols.h"
  11 #include "SyntheticSections.h"
  12 #include "Target.h"
  13 #include "lld/Common/ErrorHandler.h"
  14 #include "llvm/Object/ELF.h"
  15 #include "llvm/Support/Endian.h"
  16
  17 using namespace llvm;
  18 using namespace llvm::object;
  19 using namespace llvm::support::endian;
  20 using namespace llvm::ELF;
  21
  22 namespace lld {
  23 namespace elf {
  24
  25 namespace {
  26 class X86_64 : public TargetInfo {
  27 public:
  28   X86_64();
  29   int getTlsGdRelaxSkip(RelType type) const override;
  30   RelExpr getRelExpr(RelType type, const Symbol &s,
  31                      const uint8_t *loc) const override;
  32   RelType getDynRel(RelType type) const override;
  33   void writeGotPltHeader(uint8_t *buf) const override;
  34   void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
  35   void writePltHeader(uint8_t *buf) const override;
  36   void writePlt(uint8_t *buf, const Symbol &sym,
  37                 uint64_t pltEntryAddr) const override;
  38   void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override;
  39
  40   RelExpr adjustRelaxExpr(RelType type, const uint8_t *data,
  41                           RelExpr expr) const override;
  42   void relaxGot(uint8_t *loc, RelType type, uint64_t val) const override;
  43   void relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const override;
  44   void relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
  45   void relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const override;
  46   void relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const override;
  47   bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
  48                                         uint8_t stOther) const override;
  49 };
  50 } // namespace
  51
  52 X86_64::X86_64() {
  53   copyRel = R_X86_64_COPY;
  54   gotRel = R_X86_64_GLOB_DAT;
  55   noneRel = R_X86_64_NONE;
  56   pltRel = R_X86_64_JUMP_SLOT;
  57   relativeRel = R_X86_64_RELATIVE;
  58   iRelativeRel = R_X86_64_IRELATIVE;
  59   symbolicRel = R_X86_64_64;
  60   tlsDescRel = R_X86_64_TLSDESC;
  61   tlsGotRel = R_X86_64_TPOFF64;
  62   tlsModuleIndexRel = R_X86_64_DTPMOD64;
  63   tlsOffsetRel = R_X86_64_DTPOFF64;
  64   pltHeaderSize = 16;
  65   pltEntrySize = 16;
  66   ipltEntrySize = 16;
  67   trapInstr = {0xcc, 0xcc, 0xcc, 0xcc}; // 0xcc = INT3
  68
  69   // Align to the large page size (known as a superpage or huge page).
  70   // FreeBSD automatically promotes large, superpage-aligned allocations.
  71   defaultImageBase = 0x200000;
  72 }
  73
  74 int X86_64::getTlsGdRelaxSkip(RelType type) const { return 2; }
  75
  76 RelExpr X86_64::getRelExpr(RelType type, const Symbol &s,
  77                            const uint8_t *loc) const {
  78   if (type == R_X86_64_GOTTPOFF)
  79     config->hasStaticTlsModel = true;
  80
  81   switch (type) {
  82   case R_X86_64_8:
  83   case R_X86_64_16:
  84   case R_X86_64_32:
  85   case R_X86_64_32S:
  86   case R_X86_64_64:
  87     return R_ABS;
  88   case R_X86_64_DTPOFF32:
  89   case R_X86_64_DTPOFF64:
  90     return R_DTPREL;
  91   case R_X86_64_TPOFF32:
  92     return R_TLS;
  93   case R_X86_64_TLSDESC_CALL:
  94     return R_TLSDESC_CALL;
  95   case R_X86_64_TLSLD:
  96     return R_TLSLD_PC;
  97   case R_X86_64_TLSGD:
  98     return R_TLSGD_PC;
  99   case R_X86_64_SIZE32:
 100   case R_X86_64_SIZE64:
 101     return R_SIZE;
 102   case R_X86_64_PLT32:
 103     return R_PLT_PC;
 104   case R_X86_64_PC8:
 105   case R_X86_64_PC16:
 106   case R_X86_64_PC32:
 107   case R_X86_64_PC64:
 108     return R_PC;
 109   case R_X86_64_GOT32:
 110   case R_X86_64_GOT64:
 111     return R_GOTPLT;
 112   case R_X86_64_GOTPC32_TLSDESC:
 113     return R_TLSDESC_PC;
 114   case R_X86_64_GOTPCREL:
 115   case R_X86_64_GOTPCRELX:
 116   case R_X86_64_REX_GOTPCRELX:
 117   case R_X86_64_GOTTPOFF:
 118     return R_GOT_PC;
 119   case R_X86_64_GOTOFF64:
 120     return R_GOTPLTREL;
 121   case R_X86_64_GOTPC32:
 122   case R_X86_64_GOTPC64:
 123     return R_GOTPLTONLY_PC;
 124   case R_X86_64_NONE:
 125     return R_NONE;
 126   default:
 127     error(getErrorLocation(loc) + "unknown relocation (" + Twine(type) +
 128           ") against symbol " + toString(s));
 129     return R_NONE;
 130   }
 131 }
 132
 133 void X86_64::writeGotPltHeader(uint8_t *buf) const {
 134   // The first entry holds the value of _DYNAMIC. It is not clear why that is
 135   // required, but it is documented in the psabi and the glibc dynamic linker
 136   // seems to use it (note that this is relevant for linking ld.so, not any
 137   // other program).
 138   write64le(buf, mainPart->dynamic->getVA());
 139 }
 140
 141 void X86_64::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 142   // See comments in X86::writeGotPlt.
 143   write64le(buf, s.getPltVA() + 6);
 144 }
 145
 146 void X86_64::writePltHeader(uint8_t *buf) const {
 147   const uint8_t pltData[] = {
 148       0xff, 0x35, 0, 0, 0, 0, // pushq GOTPLT+8(%rip)
 149       0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
 150       0x0f, 0x1f, 0x40, 0x00, // nop
 151   };
 152   memcpy(buf, pltData, sizeof(pltData));
 153   uint64_t gotPlt = in.gotPlt->getVA();
 154   uint64_t plt = in.ibtPlt ? in.ibtPlt->getVA() : in.plt->getVA();
 155   write32le(buf + 2, gotPlt - plt + 2); // GOTPLT+8
 156   write32le(buf + 8, gotPlt - plt + 4); // GOTPLT+16
 157 }
 158
 159 void X86_64::writePlt(uint8_t *buf, const Symbol &sym,
 160                       uint64_t pltEntryAddr) const {
 161   const uint8_t inst[] = {
 162       0xff, 0x25, 0, 0, 0, 0, // jmpq *got(%rip)
 163       0x68, 0, 0, 0, 0,       // pushq <relocation index>
 164       0xe9, 0, 0, 0, 0,       // jmpq plt[0]
 165   };
 166   memcpy(buf, inst, sizeof(inst));
 167
 168   write32le(buf + 2, sym.getGotPltVA() - pltEntryAddr - 6);
 169   write32le(buf + 7, sym.pltIndex);
 170   write32le(buf + 12, in.plt->getVA() - pltEntryAddr - 16);
 171 }
 172
 173 RelType X86_64::getDynRel(RelType type) const {
 174   if (type == R_X86_64_64 || type == R_X86_64_PC64 || type == R_X86_64_SIZE32 ||
 175       type == R_X86_64_SIZE64)
 176     return type;
 177   return R_X86_64_NONE;
 178 }
 179
 180 void X86_64::relaxTlsGdToLe(uint8_t *loc, RelType type, uint64_t val) const {
 181   if (type == R_X86_64_TLSGD) {
 182     // Convert
 183     //   .byte 0x66
 184     //   leaq x@tlsgd(%rip), %rdi
 185     //   .word 0x6666
 186     //   rex64
 187     //   call __tls_get_addr@plt
 188     // to the following two instructions.
 189     const uint8_t inst[] = {
 190         0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
 191         0x00, 0x00,                            // mov %fs:0x0,%rax
 192         0x48, 0x8d, 0x80, 0,    0,    0,    0, // lea x@tpoff,%rax
 193     };
 194     memcpy(loc - 4, inst, sizeof(inst));
 195
 196     // The original code used a pc relative relocation and so we have to
 197     // compensate for the -4 in had in the addend.
 198     write32le(loc + 8, val + 4);
 199   } else {
 200     // Convert
 201     //   lea x@tlsgd(%rip), %rax
 202     //   call *(%rax)
 203     // to the following two instructions.
 204     assert(type == R_X86_64_GOTPC32_TLSDESC);
 205     if (memcmp(loc - 3, "\x48\x8d\x05", 3)) {
 206       error(getErrorLocation(loc - 3) + "R_X86_64_GOTPC32_TLSDESC must be used "
 207                                         "in callq *x@tlsdesc(%rip), %rax");
 208       return;
 209     }
 210     // movq $x@tpoff(%rip),%rax
 211     loc[-2] = 0xc7;
 212     loc[-1] = 0xc0;
 213     write32le(loc, val + 4);
 214     // xchg ax,ax
 215     loc[4] = 0x66;
 216     loc[5] = 0x90;
 217   }
 218 }
 219
 220 void X86_64::relaxTlsGdToIe(uint8_t *loc, RelType type, uint64_t val) const {
 221   if (type == R_X86_64_TLSGD) {
 222     // Convert
 223     //   .byte 0x66
 224     //   leaq x@tlsgd(%rip), %rdi
 225     //   .word 0x6666
 226     //   rex64
 227     //   call __tls_get_addr@plt
 228     // to the following two instructions.
 229     const uint8_t inst[] = {
 230         0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00,
 231         0x00, 0x00,                            // mov %fs:0x0,%rax
 232         0x48, 0x03, 0x05, 0,    0,    0,    0, // addq x@gottpoff(%rip),%rax
 233     };
 234     memcpy(loc - 4, inst, sizeof(inst));
 235
 236     // Both code sequences are PC relatives, but since we are moving the
 237     // constant forward by 8 bytes we have to subtract the value by 8.
 238     write32le(loc + 8, val - 8);
 239   } else {
 240     // Convert
 241     //   lea x@tlsgd(%rip), %rax
 242     //   call *(%rax)
 243     // to the following two instructions.
 244     assert(type == R_X86_64_GOTPC32_TLSDESC);
 245     if (memcmp(loc - 3, "\x48\x8d\x05", 3)) {
 246       error(getErrorLocation(loc - 3) + "R_X86_64_GOTPC32_TLSDESC must be used "
 247                                         "in callq *x@tlsdesc(%rip), %rax");
 248       return;
 249     }
 250     // movq x@gottpoff(%rip),%rax
 251     loc[-2] = 0x8b;
 252     write32le(loc, val);
 253     // xchg ax,ax
 254     loc[4] = 0x66;
 255     loc[5] = 0x90;
 256   }
 257 }
 258
 259 // In some conditions, R_X86_64_GOTTPOFF relocation can be optimized to
 260 // R_X86_64_TPOFF32 so that it does not use GOT.
 261 void X86_64::relaxTlsIeToLe(uint8_t *loc, RelType type, uint64_t val) const {
 262   uint8_t *inst = loc - 3;
 263   uint8_t reg = loc[-1] >> 3;
 264   uint8_t *regSlot = loc - 1;
 265
 266   // Note that ADD with RSP or R12 is converted to ADD instead of LEA
 267   // because LEA with these registers needs 4 bytes to encode and thus
 268   // wouldn't fit the space.
 269
 270   if (memcmp(inst, "\x48\x03\x25", 3) == 0) {
 271     // "addq foo@gottpoff(%rip),%rsp" -> "addq $foo,%rsp"
 272     memcpy(inst, "\x48\x81\xc4", 3);
 273   } else if (memcmp(inst, "\x4c\x03\x25", 3) == 0) {
 274     // "addq foo@gottpoff(%rip),%r12" -> "addq $foo,%r12"
 275     memcpy(inst, "\x49\x81\xc4", 3);
 276   } else if (memcmp(inst, "\x4c\x03", 2) == 0) {
 277     // "addq foo@gottpoff(%rip),%r[8-15]" -> "leaq foo(%r[8-15]),%r[8-15]"
 278     memcpy(inst, "\x4d\x8d", 2);
 279     *regSlot = 0x80 | (reg << 3) | reg;
 280   } else if (memcmp(inst, "\x48\x03", 2) == 0) {
 281     // "addq foo@gottpoff(%rip),%reg -> "leaq foo(%reg),%reg"
 282     memcpy(inst, "\x48\x8d", 2);
 283     *regSlot = 0x80 | (reg << 3) | reg;
 284   } else if (memcmp(inst, "\x4c\x8b", 2) == 0) {
 285     // "movq foo@gottpoff(%rip),%r[8-15]" -> "movq $foo,%r[8-15]"
 286     memcpy(inst, "\x49\xc7", 2);
 287     *regSlot = 0xc0 | reg;
 288   } else if (memcmp(inst, "\x48\x8b", 2) == 0) {
 289     // "movq foo@gottpoff(%rip),%reg" -> "movq $foo,%reg"
 290     memcpy(inst, "\x48\xc7", 2);
 291     *regSlot = 0xc0 | reg;
 292   } else {
 293     error(getErrorLocation(loc - 3) +
 294           "R_X86_64_GOTTPOFF must be used in MOVQ or ADDQ instructions only");
 295   }
 296
 297   // The original code used a PC relative relocation.
 298   // Need to compensate for the -4 it had in the addend.
 299   write32le(loc, val + 4);
 300 }
 301
 302 void X86_64::relaxTlsLdToLe(uint8_t *loc, RelType type, uint64_t val) const {
 303   if (type == R_X86_64_DTPOFF64) {
 304     write64le(loc, val);
 305     return;
 306   }
 307   if (type == R_X86_64_DTPOFF32) {
 308     write32le(loc, val);
 309     return;
 310   }
 311
 312   const uint8_t inst[] = {
 313       0x66, 0x66,                                           // .word 0x6666
 314       0x66,                                                 // .byte 0x66
 315       0x64, 0x48, 0x8b, 0x04, 0x25, 0x00, 0x00, 0x00, 0x00, // mov %fs:0,%rax
 316   };
 317
 318   if (loc[4] == 0xe8) {
 319     // Convert
 320     //   leaq bar@tlsld(%rip), %rdi           # 48 8d 3d <Loc>
 321     //   callq __tls_get_addr@PLT             # e8 <disp32>
 322     //   leaq bar@dtpoff(%rax), %rcx
 323     // to
 324     //   .word 0x6666
 325     //   .byte 0x66
 326     //   mov %fs:0,%rax
 327     //   leaq bar@tpoff(%rax), %rcx
 328     memcpy(loc - 3, inst, sizeof(inst));
 329     return;
 330   }
 331
 332   if (loc[4] == 0xff && loc[5] == 0x15) {
 333     // Convert
 334     //   leaq  x@tlsld(%rip),%rdi               # 48 8d 3d <Loc>
 335     //   call *__tls_get_addr@GOTPCREL(%rip)    # ff 15 <disp32>
 336     // to
 337     //   .long  0x66666666
 338     //   movq   %fs:0,%rax
 339     // See "Table 11.9: LD -> LE Code Transition (LP64)" in
 340     // https://raw.githubusercontent.com/wiki/hjl-tools/x86-psABI/x86-64-psABI-1.0.pdf
 341     loc[-3] = 0x66;
 342     memcpy(loc - 2, inst, sizeof(inst));
 343     return;
 344   }
 345
 346   error(getErrorLocation(loc - 3) +
 347         "expected R_X86_64_PLT32 or R_X86_64_GOTPCRELX after R_X86_64_TLSLD");
 348 }
 349
 350 void X86_64::relocateOne(uint8_t *loc, RelType type, uint64_t val) const {
 351   switch (type) {
 352   case R_X86_64_8:
 353     checkIntUInt(loc, val, 8, type);
 354     *loc = val;
 355     break;
 356   case R_X86_64_PC8:
 357     checkInt(loc, val, 8, type);
 358     *loc = val;
 359     break;
 360   case R_X86_64_16:
 361     checkIntUInt(loc, val, 16, type);
 362     write16le(loc, val);
 363     break;
 364   case R_X86_64_PC16:
 365     checkInt(loc, val, 16, type);
 366     write16le(loc, val);
 367     break;
 368   case R_X86_64_32:
 369     checkUInt(loc, val, 32, type);
 370     write32le(loc, val);
 371     break;
 372   case R_X86_64_32S:
 373   case R_X86_64_TPOFF32:
 374   case R_X86_64_GOT32:
 375   case R_X86_64_GOTPC32:
 376   case R_X86_64_GOTPC32_TLSDESC:
 377   case R_X86_64_GOTPCREL:
 378   case R_X86_64_GOTPCRELX:
 379   case R_X86_64_REX_GOTPCRELX:
 380   case R_X86_64_PC32:
 381   case R_X86_64_GOTTPOFF:
 382   case R_X86_64_PLT32:
 383   case R_X86_64_TLSGD:
 384   case R_X86_64_TLSLD:
 385   case R_X86_64_DTPOFF32:
 386   case R_X86_64_SIZE32:
 387     checkInt(loc, val, 32, type);
 388     write32le(loc, val);
 389     break;
 390   case R_X86_64_64:
 391   case R_X86_64_DTPOFF64:
 392   case R_X86_64_PC64:
 393   case R_X86_64_SIZE64:
 394   case R_X86_64_GOT64:
 395   case R_X86_64_GOTOFF64:
 396   case R_X86_64_GOTPC64:
 397     write64le(loc, val);
 398     break;
 399   default:
 400     llvm_unreachable("unknown relocation");
 401   }
 402 }
 403
 404 RelExpr X86_64::adjustRelaxExpr(RelType type, const uint8_t *data,
 405                                 RelExpr relExpr) const {
 406   if (type != R_X86_64_GOTPCRELX && type != R_X86_64_REX_GOTPCRELX)
 407     return relExpr;
 408   const uint8_t op = data[-2];
 409   const uint8_t modRm = data[-1];
 410
 411   // FIXME: When PIC is disabled and foo is defined locally in the
 412   // lower 32 bit address space, memory operand in mov can be converted into
 413   // immediate operand. Otherwise, mov must be changed to lea. We support only
 414   // latter relaxation at this moment.
 415   if (op == 0x8b)
 416     return R_RELAX_GOT_PC;
 417
 418   // Relax call and jmp.
 419   if (op == 0xff && (modRm == 0x15 || modRm == 0x25))
 420     return R_RELAX_GOT_PC;
 421
 422   // Relaxation of test, adc, add, and, cmp, or, sbb, sub, xor.
 423   // If PIC then no relaxation is available.
 424   // We also don't relax test/binop instructions without REX byte,
 425   // they are 32bit operations and not common to have.
 426   assert(type == R_X86_64_REX_GOTPCRELX);
 427   return config->isPic ? relExpr : R_RELAX_GOT_PC_NOPIC;
 428 }
 429
 430 // A subset of relaxations can only be applied for no-PIC. This method
 431 // handles such relaxations. Instructions encoding information was taken from:
 432 // "Intel 64 and IA-32 Architectures Software Developer's Manual V2"
 433 // (http://www.intel.com/content/dam/www/public/us/en/documents/manuals/
 434 //    64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf)
 435 static void relaxGotNoPic(uint8_t *loc, uint64_t val, uint8_t op,
 436                           uint8_t modRm) {
 437   const uint8_t rex = loc[-3];
 438   // Convert "test %reg, foo@GOTPCREL(%rip)" to "test $foo, %reg".
 439   if (op == 0x85) {
 440     // See "TEST-Logical Compare" (4-428 Vol. 2B),
 441     // TEST r/m64, r64 uses "full" ModR / M byte (no opcode extension).
 442
 443     // ModR/M byte has form XX YYY ZZZ, where
 444     // YYY is MODRM.reg(register 2), ZZZ is MODRM.rm(register 1).
 445     // XX has different meanings:
 446     // 00: The operand's memory address is in reg1.
 447     // 01: The operand's memory address is reg1 + a byte-sized displacement.
 448     // 10: The operand's memory address is reg1 + a word-sized displacement.
 449     // 11: The operand is reg1 itself.
 450     // If an instruction requires only one operand, the unused reg2 field
 451     // holds extra opcode bits rather than a register code
 452     // 0xC0 == 11 000 000 binary.
 453     // 0x38 == 00 111 000 binary.
 454     // We transfer reg2 to reg1 here as operand.
 455     // See "2.1.3 ModR/M and SIB Bytes" (Vol. 2A 2-3).
 456     loc[-1] = 0xc0 | (modRm & 0x38) >> 3; // ModR/M byte.
 457
 458     // Change opcode from TEST r/m64, r64 to TEST r/m64, imm32
 459     // See "TEST-Logical Compare" (4-428 Vol. 2B).
 460     loc[-2] = 0xf7;
 461
 462     // Move R bit to the B bit in REX byte.
 463     // REX byte is encoded as 0100WRXB, where
 464     // 0100 is 4bit fixed pattern.
 465     // REX.W When 1, a 64-bit operand size is used. Otherwise, when 0, the
 466     //   default operand size is used (which is 32-bit for most but not all
 467     //   instructions).
 468     // REX.R This 1-bit value is an extension to the MODRM.reg field.
 469     // REX.X This 1-bit value is an extension to the SIB.index field.
 470     // REX.B This 1-bit value is an extension to the MODRM.rm field or the
 471     // SIB.base field.
 472     // See "2.2.1.2 More on REX Prefix Fields " (2-8 Vol. 2A).
 473     loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
 474     write32le(loc, val);
 475     return;
 476   }
 477
 478   // If we are here then we need to relax the adc, add, and, cmp, or, sbb, sub
 479   // or xor operations.
 480
 481   // Convert "binop foo@GOTPCREL(%rip), %reg" to "binop $foo, %reg".
 482   // Logic is close to one for test instruction above, but we also
 483   // write opcode extension here, see below for details.
 484   loc[-1] = 0xc0 | (modRm & 0x38) >> 3 | (op & 0x3c); // ModR/M byte.
 485
 486   // Primary opcode is 0x81, opcode extension is one of:
 487   // 000b = ADD, 001b is OR, 010b is ADC, 011b is SBB,
 488   // 100b is AND, 101b is SUB, 110b is XOR, 111b is CMP.
 489   // This value was wrote to MODRM.reg in a line above.
 490   // See "3.2 INSTRUCTIONS (A-M)" (Vol. 2A 3-15),
 491   // "INSTRUCTION SET REFERENCE, N-Z" (Vol. 2B 4-1) for
 492   // descriptions about each operation.
 493   loc[-2] = 0x81;
 494   loc[-3] = (rex & ~0x4) | (rex & 0x4) >> 2;
 495   write32le(loc, val);
 496 }
 497
 498 void X86_64::relaxGot(uint8_t *loc, RelType type, uint64_t val) const {
 499   const uint8_t op = loc[-2];
 500   const uint8_t modRm = loc[-1];
 501
 502   // Convert "mov foo@GOTPCREL(%rip),%reg" to "lea foo(%rip),%reg".
 503   if (op == 0x8b) {
 504     loc[-2] = 0x8d;
 505     write32le(loc, val);
 506     return;
 507   }
 508
 509   if (op != 0xff) {
 510     // We are relaxing a rip relative to an absolute, so compensate
 511     // for the old -4 addend.
 512     assert(!config->isPic);
 513     relaxGotNoPic(loc, val + 4, op, modRm);
 514     return;
 515   }
 516
 517   // Convert call/jmp instructions.
 518   if (modRm == 0x15) {
 519     // ABI says we can convert "call *foo@GOTPCREL(%rip)" to "nop; call foo".
 520     // Instead we convert to "addr32 call foo" where addr32 is an instruction
 521     // prefix. That makes result expression to be a single instruction.
 522     loc[-2] = 0x67; // addr32 prefix
 523     loc[-1] = 0xe8; // call
 524     write32le(loc, val);
 525     return;
 526   }
 527
 528   // Convert "jmp *foo@GOTPCREL(%rip)" to "jmp foo; nop".
 529   // jmp doesn't return, so it is fine to use nop here, it is just a stub.
 530   assert(modRm == 0x25);
 531   loc[-2] = 0xe9; // jmp
 532   loc[3] = 0x90;  // nop
 533   write32le(loc - 1, val + 1);
 534 }
 535
 536 // A split-stack prologue starts by checking the amount of stack remaining
 537 // in one of two ways:
 538 // A) Comparing of the stack pointer to a field in the tcb.
 539 // B) Or a load of a stack pointer offset with an lea to r10 or r11.
 540 bool X86_64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
 541                                               uint8_t stOther) const {
 542   if (!config->is64) {
 543     error("Target doesn't support split stacks.");
 544     return false;
 545   }
 546
 547   if (loc + 8 >= end)
 548     return false;
 549
 550   // Replace "cmp %fs:0x70,%rsp" and subsequent branch
 551   // with "stc, nopl 0x0(%rax,%rax,1)"
 552   if (memcmp(loc, "\x64\x48\x3b\x24\x25", 5) == 0) {
 553     memcpy(loc, "\xf9\x0f\x1f\x84\x00\x00\x00\x00", 8);
 554     return true;
 555   }
 556
 557   // Adjust "lea X(%rsp),%rYY" to lea "(X - 0x4000)(%rsp),%rYY" where rYY could
 558   // be r10 or r11. The lea instruction feeds a subsequent compare which checks
 559   // if there is X available stack space. Making X larger effectively reserves
 560   // that much additional space. The stack grows downward so subtract the value.
 561   if (memcmp(loc, "\x4c\x8d\x94\x24", 4) == 0 ||
 562       memcmp(loc, "\x4c\x8d\x9c\x24", 4) == 0) {
 563     // The offset bytes are encoded four bytes after the start of the
 564     // instruction.
 565     write32le(loc + 4, read32le(loc + 4) - 0x4000);
 566     return true;
 567   }
 568   return false;
 569 }
 570
 571 // If Intel Indirect Branch Tracking is enabled, we have to emit special PLT
 572 // entries containing endbr64 instructions. A PLT entry will be split into two
 573 // parts, one in .plt.sec (writePlt), and the other in .plt (writeIBTPlt).
 574 namespace {
 575 class IntelIBT : public X86_64 {
 576 public:
 577   IntelIBT();
 578   void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
 579   void writePlt(uint8_t *buf, const Symbol &sym,
 580                 uint64_t pltEntryAddr) const override;
 581   void writeIBTPlt(uint8_t *buf, size_t numEntries) const override;
 582
 583   static const unsigned IBTPltHeaderSize = 16;
 584 };
 585 } // namespace
 586
 587 IntelIBT::IntelIBT() { pltHeaderSize = 0; }
 588
 589 void IntelIBT::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 590   uint64_t va =
 591       in.ibtPlt->getVA() + IBTPltHeaderSize + s.pltIndex * pltEntrySize;
 592   write64le(buf, va);
 593 }
 594
 595 void IntelIBT::writePlt(uint8_t *buf, const Symbol &sym,
 596                         uint64_t pltEntryAddr) const {
 597   const uint8_t Inst[] = {
 598       0xf3, 0x0f, 0x1e, 0xfa,       // endbr64
 599       0xff, 0x25, 0,    0,    0, 0, // jmpq *got(%rip)
 600       0x66, 0x0f, 0x1f, 0x44, 0, 0, // nop
 601   };
 602   memcpy(buf, Inst, sizeof(Inst));
 603   write32le(buf + 6, sym.getGotPltVA() - pltEntryAddr - 10);
 604 }
 605
 606 void IntelIBT::writeIBTPlt(uint8_t *buf, size_t numEntries) const {
 607   writePltHeader(buf);
 608   buf += IBTPltHeaderSize;
 609
 610   const uint8_t inst[] = {
 611       0xf3, 0x0f, 0x1e, 0xfa,    // endbr64
 612       0x68, 0,    0,    0,    0, // pushq <relocation index>
 613       0xe9, 0,    0,    0,    0, // jmpq plt[0]
 614       0x66, 0x90,                // nop
 615   };
 616
 617   for (size_t i = 0; i < numEntries; ++i) {
 618     memcpy(buf, inst, sizeof(inst));
 619     write32le(buf + 5, i);
 620     write32le(buf + 10, -pltHeaderSize - sizeof(inst) * i - 30);
 621     buf += sizeof(inst);
 622   }
 623 }
 624
// These nonstandard PLT entries are to mitigate Spectre v2 security
 626 // vulnerability. In order to mitigate Spectre v2, we want to avoid indirect
 627 // branch instructions such as `jmp *GOTPLT(%rip)`. So, in the following PLT
 628 // entries, we use a CALL followed by MOV and RET to do the same thing as an
 629 // indirect jump. That instruction sequence is so-called "retpoline".
 630 //
 631 // We have two types of retpoline PLTs as a size optimization. If `-z now`
 632 // is specified, all dynamic symbols are resolved at load-time. Thus, when
 633 // that option is given, we can omit code for symbol lazy resolution.
 634 namespace {
 635 class Retpoline : public X86_64 {
 636 public:
 637   Retpoline();
 638   void writeGotPlt(uint8_t *buf, const Symbol &s) const override;
 639   void writePltHeader(uint8_t *buf) const override;
 640   void writePlt(uint8_t *buf, const Symbol &sym,
 641                 uint64_t pltEntryAddr) const override;
 642 };
 643
 644 class RetpolineZNow : public X86_64 {
 645 public:
 646   RetpolineZNow();
 647   void writeGotPlt(uint8_t *buf, const Symbol &s) const override {}
 648   void writePltHeader(uint8_t *buf) const override;
 649   void writePlt(uint8_t *buf, const Symbol &sym,
 650                 uint64_t pltEntryAddr) const override;
 651 };
 652 } // namespace
 653
 654 Retpoline::Retpoline() {
 655   pltHeaderSize = 48;
 656   pltEntrySize = 32;
 657   ipltEntrySize = 32;
 658 }
 659
 660 void Retpoline::writeGotPlt(uint8_t *buf, const Symbol &s) const {
 661   write64le(buf, s.getPltVA() + 17);
 662 }
 663
 664 void Retpoline::writePltHeader(uint8_t *buf) const {
 665   const uint8_t insn[] = {
 666       0xff, 0x35, 0,    0,    0,    0,          // 0:    pushq GOTPLT+8(%rip)
 667       0x4c, 0x8b, 0x1d, 0,    0,    0,    0,    // 6:    mov GOTPLT+16(%rip), %r11
 668       0xe8, 0x0e, 0x00, 0x00, 0x00,             // d:    callq next
 669       0xf3, 0x90,                               // 12: loop: pause
 670       0x0f, 0xae, 0xe8,                         // 14:   lfence
 671       0xeb, 0xf9,                               // 17:   jmp loop
 672       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 19:   int3; .align 16
 673       0x4c, 0x89, 0x1c, 0x24,                   // 20: next: mov %r11, (%rsp)
 674       0xc3,                                     // 24:   ret
 675       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 25:   int3; padding
 676       0xcc, 0xcc, 0xcc, 0xcc,                   // 2c:   int3; padding
 677   };
 678   memcpy(buf, insn, sizeof(insn));
 679
 680   uint64_t gotPlt = in.gotPlt->getVA();
 681   uint64_t plt = in.plt->getVA();
 682   write32le(buf + 2, gotPlt - plt - 6 + 8);
 683   write32le(buf + 9, gotPlt - plt - 13 + 16);
 684 }
 685
 686 void Retpoline::writePlt(uint8_t *buf, const Symbol &sym,
 687                          uint64_t pltEntryAddr) const {
 688   const uint8_t insn[] = {
 689       0x4c, 0x8b, 0x1d, 0, 0, 0, 0, // 0:  mov foo@GOTPLT(%rip), %r11
 690       0xe8, 0,    0,    0,    0,    // 7:  callq plt+0x20
 691       0xe9, 0,    0,    0,    0,    // c:  jmp plt+0x12
 692       0x68, 0,    0,    0,    0,    // 11: pushq <relocation index>
 693       0xe9, 0,    0,    0,    0,    // 16: jmp plt+0
 694       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1b: int3; padding
 695   };
 696   memcpy(buf, insn, sizeof(insn));
 697
 698   uint64_t off = pltEntryAddr - in.plt->getVA();
 699
 700   write32le(buf + 3, sym.getGotPltVA() - pltEntryAddr - 7);
 701   write32le(buf + 8, -off - 12 + 32);
 702   write32le(buf + 13, -off - 17 + 18);
 703   write32le(buf + 18, sym.pltIndex);
 704   write32le(buf + 23, -off - 27);
 705 }
 706
 707 RetpolineZNow::RetpolineZNow() {
 708   pltHeaderSize = 32;
 709   pltEntrySize = 16;
 710   ipltEntrySize = 16;
 711 }
 712
 713 void RetpolineZNow::writePltHeader(uint8_t *buf) const {
 714   const uint8_t insn[] = {
 715       0xe8, 0x0b, 0x00, 0x00, 0x00, // 0:    call next
 716       0xf3, 0x90,                   // 5:  loop: pause
 717       0x0f, 0xae, 0xe8,             // 7:    lfence
 718       0xeb, 0xf9,                   // a:    jmp loop
 719       0xcc, 0xcc, 0xcc, 0xcc,       // c:    int3; .align 16
 720       0x4c, 0x89, 0x1c, 0x24,       // 10: next: mov %r11, (%rsp)
 721       0xc3,                         // 14:   ret
 722       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 15:   int3; padding
 723       0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // 1a:   int3; padding
 724       0xcc,                         // 1f:   int3; padding
 725   };
 726   memcpy(buf, insn, sizeof(insn));
 727 }
 728
 729 void RetpolineZNow::writePlt(uint8_t *buf, const Symbol &sym,
 730                              uint64_t pltEntryAddr) const {
 731   const uint8_t insn[] = {
 732       0x4c, 0x8b, 0x1d, 0,    0, 0, 0, // mov foo@GOTPLT(%rip), %r11
 733       0xe9, 0,    0,    0,    0,       // jmp plt+0
 734       0xcc, 0xcc, 0xcc, 0xcc,          // int3; padding
 735   };
 736   memcpy(buf, insn, sizeof(insn));
 737
 738   write32le(buf + 3, sym.getGotPltVA() - pltEntryAddr - 7);
 739   write32le(buf + 8, in.plt->getVA() - pltEntryAddr - 12);
 740 }
 741
 742 static TargetInfo *getTargetInfo() {
 743   if (config->zRetpolineplt) {
 744     if (config->zNow) {
 745       static RetpolineZNow t;
 746       return &t;
 747     }
 748     static Retpoline t;
 749     return &t;
 750   }
 751
 752   if (config->andFeatures & GNU_PROPERTY_X86_FEATURE_1_IBT) {
 753     static IntelIBT t;
 754     return &t;
 755   }
 756
 757   static X86_64 t;
 758   return &t;
 759 }
 760
 761 TargetInfo *getX86_64TargetInfo() { return getTargetInfo(); }
 762
 763 } // namespace elf
 764 } // namespace lld