2 * Copyright (c) 1989, 1992, 1993
3 * The Regents of the University of California. All rights reserved.
5 * This code is derived from software developed by the Computer Systems
6 * Engineering group at Lawrence Berkeley Laboratory under DARPA contract
7 * BG 91-66 and contributed to Berkeley.
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
37 #include <sys/param.h>
38 #include <sys/fnv_hash.h>
43 #include <sys/linker.h>
65 #include "kvm_private.h"
68 * Routines private to libkvm.
71 /* from src/lib/libc/gen/nlist.c */
72 int __fdnlist(int, struct nlist *);
75 * Report an error using printf style arguments. "program" is kd->program
76 * on hard errors, and 0 on soft errors, so that under sun error emulation,
77 * only hard errors are printed out (otherwise, programs like gdb will
78 * generate tons of error messages when trying to access bogus pointers).
/*
 * _kvm_err: format a message from fmt and the variadic arguments.
 * With a non-NULL program the text goes to stderr, prefixed by the
 * program name and a colon and terminated by a newline.  The message is
 * also formatted into kd->errbuf, bounded by sizeof(kd->errbuf).
 * NOTE(review): the line joining the stderr branch and the errbuf write
 * is not visible in this view; presumably the errbuf write is the
 * program == NULL alternative - confirm.
 */
81 _kvm_err(kvm_t *kd, const char *program, const char *fmt, ...)
86 if (program != NULL) {
87 (void)fprintf(stderr, "%s: ", program);
88 (void)vfprintf(stderr, fmt, ap);
89 (void)fputc('\n', stderr);
91 (void)vsnprintf(kd->errbuf,
92 sizeof(kd->errbuf), fmt, ap);
/*
 * _kvm_syserr: report an error together with the text for the current
 * errno.  With a non-NULL program, stderr receives the program name, the
 * formatted message, then a colon and strerror(errno).  The errbuf path
 * formats the message into kd->errbuf and appends a colon-separated
 * suffix at offset n, bounded by the space left in the buffer.
 * NOTE(review): the assignment of n and the final snprintf() argument
 * are on lines not visible here; presumably n is the length of the
 * formatted message and the argument is strerror(errno).
 */
98 _kvm_syserr(kvm_t *kd, const char *program, const char *fmt, ...)
104 if (program != NULL) {
105 (void)fprintf(stderr, "%s: ", program);
106 (void)vfprintf(stderr, fmt, ap);
107 (void)fprintf(stderr, ": %s\n", strerror(errno));
109 char *cp = kd->errbuf;
111 (void)vsnprintf(cp, sizeof(kd->errbuf), fmt, ap);
113 (void)snprintf(&cp[n], sizeof(kd->errbuf) - n, ": %s",
/*
 * _kvm_malloc: allocate n zero-filled bytes with calloc().  A failed
 * allocation is reported through _kvm_err() using kd->program.
 * NOTE(review): the return statement is not visible here; presumably the
 * allocated pointer (NULL on failure) is returned.
 */
120 _kvm_malloc(kvm_t *kd, size_t n)
124 if ((p = calloc(n, sizeof(char))) == NULL)
125 _kvm_err(kd, kd->program, "can't allocate %zu bytes: %s",
/*
 * _kvm_probe_elf_kernel: test the kernel ELF header held in kd->nlehdr
 * against the requested class and machine.  The expression is true only
 * when the EI_CLASS byte and e_machine both match and e_type has the
 * expected value: ET_DYN for EM_PPC and EM_PPC64, ET_EXEC for any other
 * machine.
 */
131 _kvm_probe_elf_kernel(kvm_t *kd, int class, int machine)
134 return (kd->nlehdr.e_ident[EI_CLASS] == class &&
135 ((machine == EM_PPC || machine == EM_PPC64) ?
136 kd->nlehdr.e_type == ET_DYN : kd->nlehdr.e_type == ET_EXEC) &&
137 kd->nlehdr.e_machine == machine);
/*
 * _kvm_is_minidump: read the first 8 bytes of kd->pmfd at offset 0 and
 * compare them with the 8-byte minidump signature.
 * NOTE(review): several lines between the signature and this test, and
 * the values returned for a match and a mismatch, are not visible in
 * this view - check the full file before relying on this summary.
 */
141 _kvm_is_minidump(kvm_t *kd)
147 if (pread(kd->pmfd, &minihdr, 8, 0) == 8 &&
148 memcmp(&minihdr, "minidump", 8) == 0)
154 * The powerpc backend has a hack to strip a leading kerneldump
155 * header from the core before treating it as an ELF header.
157 * We can add that here if we can get a change to libelf to support
158 * an initial offset into the file. Alternatively we could patch
159 * savecore to extract cores from a regular file instead.
/*
 * _kvm_read_core_phdrs: open kd->pmfd with libelf and validate it as a
 * core file that matches the kernel ELF header in kd->nlehdr, then copy
 * its program headers into a calloc()ed array of phnum entries.  Every
 * failed step is reported through _kvm_err(), either with the libelf
 * error text or with a fixed message.
 * NOTE(review): the stores through phnump and phdrp, the cleanup of elf
 * and the return values are on lines not visible here.
 */
162 _kvm_read_core_phdrs(kvm_t *kd, size_t *phnump, GElf_Phdr **phdrp)
169 elf = elf_begin(kd->pmfd, ELF_C_READ, NULL);
171 _kvm_err(kd, kd->program, "%s", elf_errmsg(0));
/* The descriptor must be an ELF object of the same class as the kernel. */
174 if (elf_kind(elf) != ELF_K_ELF) {
175 _kvm_err(kd, kd->program, "invalid core");
178 if (gelf_getclass(elf) != kd->nlehdr.e_ident[EI_CLASS]) {
179 _kvm_err(kd, kd->program, "invalid core");
182 if (gelf_getehdr(elf, &ehdr) == NULL) {
183 _kvm_err(kd, kd->program, "%s", elf_errmsg(0));
/* It must be a core file for the same machine as the kernel. */
186 if (ehdr.e_type != ET_CORE) {
187 _kvm_err(kd, kd->program, "invalid core");
190 if (ehdr.e_machine != kd->nlehdr.e_machine) {
191 _kvm_err(kd, kd->program, "invalid core");
195 if (elf_getphdrnum(elf, &phnum) == -1) {
196 _kvm_err(kd, kd->program, "%s", elf_errmsg(0));
/* One zeroed GElf_Phdr slot per program header, filled in below. */
200 phdr = calloc(phnum, sizeof(*phdr));
202 _kvm_err(kd, kd->program, "failed to allocate phdrs");
206 for (i = 0; i < phnum; i++) {
207 if (gelf_getphdr(elf, i, &phdr[i]) == NULL) {
209 _kvm_err(kd, kd->program, "%s", elf_errmsg(0));
224 * Transform v such that only bits [bit0, bitN) may be set. Generates a
225 * bitmask covering the number of bits, then shifts so +bit0+ is the first.
/*
 * bitmask_range: return v with only bits [bit0, bitN) kept.  The mask
 * is built as (bitN - bit0) one bits shifted left by bit0.  The
 * full-width request (bit0 == 0 and bitN == BITS_IN(v)) is tested
 * first, presumably so the shift never has to cover the whole width of
 * the type; its result line is not visible here.
 */
228 bitmask_range(uint64_t v, uint64_t bit0, uint64_t bitN)
230 if (bit0 == 0 && bitN == BITS_IN(v))
233 return (v & (((1ULL << (bitN - bit0)) - 1ULL) << bit0));
237 * Returns the number of bits in a given byte array range starting at a
238 * given base, from bit0 to bitN. bit0 may be non-zero in the case of
239 * counting backwards from bitN.
/*
 * popcount_bytes: count the set bits in the range [bit0, bitN) of the
 * 64-bit words at addr.  res starts as the number of bits still to be
 * examined.  When bit0 is not word aligned, the bits up to the next word
 * boundary (or bitN, whichever comes first) are counted first and
 * subtracted from res.  The remaining bits are counted at most
 * BITS_IN(*addr) at a time, starting from bit 0 of a word.
 * NOTE(review): the loop around the second count and the advancing of
 * addr are on lines not visible here.
 */
242 popcount_bytes(uint64_t *addr, uint32_t bit0, uint32_t bitN)
244 uint32_t res = bitN - bit0;
248 /* Align to 64-bit boundary on the left side if needed. */
249 if ((bit0 % BITS_IN(*addr)) != 0) {
250 bound = MIN(bitN, roundup2(bit0, BITS_IN(*addr)));
251 count += __bitcount64(bitmask_range(*addr, bit0, bound));
252 res -= (bound - bit0);
257 bound = MIN(res, BITS_IN(*addr));
258 count += __bitcount64(bitmask_range(*addr, 0, bound));
/*
 * _kvm_pmap_get: return a pointer to entry idx of the in-memory page
 * map, where every entry is len bytes wide.  The byte offset idx * len
 * is compared against kd->pt_sparse_off before the pointer into
 * kd->page_map is formed.
 * NOTE(review): the result of the out-of-range branch is not visible
 * here.
 */
267 _kvm_pmap_get(kvm_t *kd, u_long idx, size_t len)
269 uintptr_t off = idx * len;
271 if ((off_t)off >= kd->pt_sparse_off)
273 return (void *)((uintptr_t)kd->page_map + off);
/*
 * _kvm_map_get: translate physical address pa into an address inside
 * one of the in-memory maps.  The offset comes from _kvm_pt_find().
 * Offsets at or beyond kd->pt_sparse_off are resolved relative to
 * kd->sparse_map; smaller offsets are resolved relative to
 * kd->page_map.
 * NOTE(review): the handling of a failed _kvm_pt_find() and the return
 * statement are on lines not visible here.
 */
277 _kvm_map_get(kvm_t *kd, u_long pa, unsigned int page_size)
282 off = _kvm_pt_find(kd, pa, page_size);
286 addr = (uintptr_t)kd->page_map + off;
287 if (off >= kd->pt_sparse_off)
288 addr = (uintptr_t)kd->sparse_map + (off - kd->pt_sparse_off);
/*
 * _kvm_pt_init: build the lookup state for a minidump page bitmap.
 *
 *  - kd->dump_avail is mmap()ed read-only from kd->pmfd at
 *    dump_avail_off when dump_avail_size is non-zero.  Otherwise a
 *    zeroed four-word array is allocated and its second word is set to
 *    map_len * 8 * page_size, converted with _kvm32toh() or _kvm64toh()
 *    according to word_size.
 *  - The bitmap, map_len bytes at map_off, is read into kd->pt_map; a
 *    short or failed read is reported as an error.
 *  - kd->pt_popcounts receives pc_bins counters.  Filling starts at
 *    index 1 and each new bin is seeded with the value of the previous
 *    one, so the counters are running totals.
 *  - kd->sparse_map is a read-only private mmap() of kd->pmfd at
 *    sparse_off, sized as the final popcount times page_size.
 *
 * NOTE(review): no failure check is visible after the mmap() of
 * dump_avail or after the calloc() of the implied array.  The last
 * parameter (word_size) and all return statements are on lines not
 * visible here.
 */
293 _kvm_pt_init(kvm_t *kd, size_t dump_avail_size, off_t dump_avail_off,
294 size_t map_len, off_t map_off, off_t sparse_off, int page_size,
298 uint32_t *popcount_bin;
299 int bin_popcounts = 0;
300 uint64_t pc_bins, res;
303 kd->dump_avail_size = dump_avail_size;
304 if (dump_avail_size > 0) {
305 kd->dump_avail = mmap(NULL, kd->dump_avail_size, PROT_READ,
306 MAP_PRIVATE, kd->pmfd, dump_avail_off);
309 * Older version minidumps don't provide dump_avail[],
310 * so the bitmap is fully populated from 0 to
311 * last_pa. Create an implied dump_avail that
314 kd->dump_avail = calloc(4, word_size);
315 if (word_size == sizeof(uint32_t)) {
316 ((uint32_t *)kd->dump_avail)[1] = _kvm32toh(kd,
317 map_len * 8 * page_size);
319 kd->dump_avail[1] = _kvm64toh(kd,
320 map_len * 8 * page_size);
325 * Map the bitmap specified by the arguments.
327 kd->pt_map = _kvm_malloc(kd, map_len);
328 if (kd->pt_map == NULL) {
329 _kvm_err(kd, kd->program, "cannot allocate %zu bytes for bitmap",
333 rd = pread(kd->pmfd, kd->pt_map, map_len, map_off);
334 if (rd < 0 || rd != (ssize_t)map_len) {
335 _kvm_err(kd, kd->program, "cannot read %zu bytes for bitmap",
339 kd->pt_map_size = map_len;
342 * Generate a popcount cache for every POPCOUNT_BITS in the bitmap,
343 * so lookups only have to calculate the number of bits set between
344 * a cache point and their bit. This reduces lookups to O(1),
345 * without significantly increasing memory requirements.
347 * Round up the number of bins so that 'upper half' lookups work for
348 * the final bin, if needed. The first popcount is 0, since no bits
349 * precede bit 0, so add 1 for that also. Without this, extra work
350 * would be needed to handle the first PTEs in _kvm_pt_find().
354 pc_bins = 1 + (res * NBBY + POPCOUNT_BITS / 2) / POPCOUNT_BITS;
355 kd->pt_popcounts = calloc(pc_bins, sizeof(uint32_t));
356 if (kd->pt_popcounts == NULL) {
357 _kvm_err(kd, kd->program, "cannot allocate popcount bins");
361 for (popcount_bin = &kd->pt_popcounts[1]; res > 0;
362 addr++, res -= sizeof(*addr)) {
363 *popcount_bin += popcount_bytes(addr, 0,
364 MIN(res * NBBY, BITS_IN(*addr)));
365 if (++bin_popcounts == POPCOUNTS_IN(*addr)) {
367 *popcount_bin = *(popcount_bin - 1);
372 assert(pc_bins * sizeof(*popcount_bin) ==
373 ((uintptr_t)popcount_bin - (uintptr_t)kd->pt_popcounts));
375 kd->pt_sparse_off = sparse_off;
376 kd->pt_sparse_size = (uint64_t)*popcount_bin * page_size;
377 kd->pt_page_size = page_size;
378 kd->pt_word_size = word_size;
381 * Map the sparse page array. This is useful for performing point
382 * lookups of specific pages, e.g. for kvm_walk_pages. Generally,
383 * this is much larger than is reasonable to read in up front, so
384 * mmap it in instead.
386 kd->sparse_map = mmap(NULL, kd->pt_sparse_size, PROT_READ,
387 MAP_PRIVATE, kd->pmfd, kd->pt_sparse_off);
388 if (kd->sparse_map == MAP_FAILED) {
389 _kvm_err(kd, kd->program, "cannot map %" PRIu64
390 " bytes from fd %d offset %jd for sparse map: %s",
391 kd->pt_sparse_size, kd->pmfd,
392 (intmax_t)kd->pt_sparse_off, strerror(errno));
/*
 * _kvm_pmap_init: load the page map of a minidump into memory.  The
 * size and file offset are recorded in kd, a buffer of pmap_size bytes
 * is obtained from _kvm_malloc(), and exactly pmap_size bytes are read
 * from kd->pmfd at pmap_off.  An allocation failure or a short read is
 * reported through _kvm_err().
 * NOTE(review): the return statements are on lines not visible here.
 */
399 _kvm_pmap_init(kvm_t *kd, uint32_t pmap_size, off_t pmap_off)
401 ssize_t exp_len = pmap_size;
403 kd->page_map_size = pmap_size;
404 kd->page_map_off = pmap_off;
405 kd->page_map = _kvm_malloc(kd, pmap_size);
406 if (kd->page_map == NULL) {
407 _kvm_err(kd, kd->program, "cannot allocate %u bytes "
408 "for page map", pmap_size);
/* pmap_size is unsigned, so it is printed with %u in both messages. */
411 if (pread(kd->pmfd, kd->page_map, pmap_size, pmap_off) != exp_len) {
412 _kvm_err(kd, kd->program, "cannot read %u bytes from "
413 "offset %jd for page map", pmap_size, (intmax_t)pmap_off);
/*
 * dump_avail_n: fetch word i of kd->dump_avail in host byte order.  The
 * array is read as 32-bit words when kd->pt_word_size equals
 * sizeof(uint32_t) and as 64-bit words otherwise; the value is
 * converted with _kvm32toh() or _kvm64toh() respectively.
 */
static inline uint64_t
420 dump_avail_n(kvm_t *kd, long i)
424 if (kd->pt_word_size == sizeof(uint32_t)) {
425 d32 = (uint32_t *)kd->dump_avail;
426 return (_kvm32toh(kd, d32[i]));
428 return (_kvm64toh(kd, kd->dump_avail[i]));
/*
 * _kvm_pa_bit_id: map physical address pa to its bit index in the page
 * bitmap.  dump_avail is walked two words at a time, as (start, end)
 * pairs, until an end word of zero.  A range whose end is at or below
 * pa adds its size in pages to adj.  The visible return computes the
 * page distance of pa from the start of the current range plus adj,
 * presumably for the first range whose end lies above pa (the joining
 * line is not visible).  _KVM_BIT_ID_INVALID is returned when the walk
 * finishes without a result.
 */
432 _kvm_pa_bit_id(kvm_t *kd, uint64_t pa, unsigned int page_size)
438 for (i = 0; dump_avail_n(kd, i + 1) != 0; i += 2) {
439 if (pa >= dump_avail_n(kd, i + 1)) {
440 adj += howmany(dump_avail_n(kd, i + 1), page_size) -
441 dump_avail_n(kd, i) / page_size;
443 return (pa / page_size -
444 dump_avail_n(kd, i) / page_size + adj);
447 return (_KVM_BIT_ID_INVALID);
/*
 * _kvm_bit_id_pa: map a bitmap bit index back to a physical address.
 * dump_avail is walked as (start, end) pairs until an end word of zero,
 * and sz is the size of each range in pages.  The visible return starts
 * from the start of a range rounded down to page_size.
 * _KVM_PA_INVALID is returned when the walk finishes without a result.
 * NOTE(review): the test that selects the range and the second operand
 * of the address sum are on lines not visible here.
 */
451 _kvm_bit_id_pa(kvm_t *kd, uint64_t bit_id, unsigned int page_size)
456 for (i = 0; dump_avail_n(kd, i + 1) != 0; i += 2) {
457 sz = howmany(dump_avail_n(kd, i + 1), page_size) -
458 dump_avail_n(kd, i) / page_size;
460 return (rounddown2(dump_avail_n(kd, i), page_size) +
465 return (_KVM_PA_INVALID);
469 * Find the offset for the given physical page address; returns -1 otherwise.
471 * A page's offset is represented by the sparse page base offset plus the
472 * number of bits set before its bit multiplied by page size. This means
473 * that if a page exists in the dump, it's necessary to know how many pages
474 * in the dump precede it. Reduce this O(n) counting to O(1) by caching the
475 * number of bits set at POPCOUNT_BITS intervals.
477 * Then to find the number of pages before the requested address, simply
478 * index into the cache and count the number of bits set between that cache
479 * bin and the page's bit. Halve the number of bytes that have to be
480 * checked by also counting down from the next higher bin if it's closer.
/*
 * _kvm_pt_find: compute the core-file offset of the page holding pa.
 *
 * The bit index of the page comes from _kvm_pa_bit_id().  The lookup is
 * rejected when that index is invalid, lies beyond the bitmap
 * (kd->pt_map_size * NBBY bits), or its bit is clear in kd->pt_map.
 *
 * count is the number of dumped pages that precede the page.  For a bit
 * in the lower half of its POPCOUNT_BITS bin it is the cached total
 * kd->pt_popcounts[popcount_id] plus a popcount from the start of the
 * bin up to the bit.  For the upper half a cached total is taken and a
 * popcount over the bits from the page bit up to bitN is subtracted,
 * with bitN clamped to the size of the bitmap.
 *
 * The result is kd->pt_sparse_off + count * page_size.  A count at or
 * beyond kd->pt_sparse_size / page_size is treated as missing.
 * NOTE(review): the rejection results, the first popcount_bytes()
 * argument of the upper-half case and any adjustment of popcount_id
 * there are on lines not visible here.
 */
483 _kvm_pt_find(kvm_t *kd, uint64_t pa, unsigned int page_size)
485 uint64_t *bitmap = kd->pt_map;
486 uint64_t pte_bit_id = _kvm_pa_bit_id(kd, pa, page_size);
487 uint64_t pte_u64 = pte_bit_id / BITS_IN(*bitmap);
488 uint64_t popcount_id = pte_bit_id / POPCOUNT_BITS;
489 uint64_t pte_mask = 1ULL << (pte_bit_id % BITS_IN(*bitmap));
493 /* Check whether the page address requested is in the dump. */
494 if (pte_bit_id == _KVM_BIT_ID_INVALID ||
495 pte_bit_id >= (kd->pt_map_size * NBBY) ||
496 (bitmap[pte_u64] & pte_mask) == 0)
500 * Add/sub popcounts from the bitmap until the PTE's bit is reached.
501 * For bits that are in the upper half between the calculated
502 * popcount id and the next one, use the next one and subtract to
503 * minimize the number of popcounts required.
505 if ((pte_bit_id % POPCOUNT_BITS) < (POPCOUNT_BITS / 2)) {
506 count = kd->pt_popcounts[popcount_id] + popcount_bytes(
507 bitmap + popcount_id * POPCOUNTS_IN(*bitmap),
508 0, pte_bit_id - popcount_id * POPCOUNT_BITS);
511 * Counting in reverse is trickier, since we must avoid
512 * reading from bytes that are not in range, and invert.
514 uint64_t pte_u64_bit_off = pte_u64 * BITS_IN(*bitmap);
517 bitN = MIN(popcount_id * POPCOUNT_BITS,
518 kd->pt_map_size * BITS_IN(uint8_t));
519 count = kd->pt_popcounts[popcount_id] - popcount_bytes(
521 pte_bit_id - pte_u64_bit_off, bitN - pte_u64_bit_off);
525 * This can only happen if the core is truncated. Treat these
526 * entries as if they don't exist, since their backing doesn't.
528 if (count >= (kd->pt_sparse_size / page_size))
531 return (kd->pt_sparse_off + (uint64_t)count * page_size);
/*
 * kvm_fdnlist: resolve the symbols named in list, which ends at the
 * first entry with a NULL or empty n_name.
 *
 * Without a kd->resolve_symbol callback, the names are copied into a
 * temporary struct nlist array with one extra zeroed terminator entry,
 * looked up with __fdnlist() on kd->nlfd, and the resulting n_type and
 * n_value are copied back entry by entry.  If the temporary array cannot
 * be allocated, the error is reported and -1 is returned.
 *
 * With a callback, each name is passed to kd->resolve_symbol(); a
 * resolved entry stores the returned address and is marked
 * N_DATA | N_EXT.
 * NOTE(review): the handling of a failed callback and the final return
 * are on lines not visible here.
 */
535 kvm_fdnlist(kvm_t *kd, struct kvm_nlist *list)
540 if (kd->resolve_symbol == NULL) {
544 for (count = 0; list[count].n_name != NULL &&
545 list[count].n_name[0] != '\0'; count++)
547 nl = calloc(count + 1, sizeof(*nl));
/* Do not write through a NULL array; fail hard like __fdnlist() does. */
if (nl == NULL) {
_kvm_err(kd, kd->program, "cannot allocate nlist");
return (-1);
}
548 for (i = 0; i < count; i++)
549 nl[i].n_name = list[i].n_name;
550 nfail = __fdnlist(kd->nlfd, nl);
551 for (i = 0; i < count; i++) {
552 list[i].n_type = nl[i].n_type;
553 list[i].n_value = nl[i].n_value;
560 while (list->n_name != NULL && list->n_name[0] != '\0') {
561 error = kd->resolve_symbol(list->n_name, &addr);
567 list->n_value = addr;
568 list->n_type = N_DATA | N_EXT;
576 * Walk the list of unresolved symbols, generate a new list and prefix the
577 * symbol names, try again, and merge back what we could resolve.
/*
 * kvm_fdnlist_prefix: retry the still-unresolved (N_UNDF) entries of nl
 * under a name prefix and merge back whatever resolves.
 *
 * A single malloc()ed buffer holds a reduced nlist followed by the name
 * strings.  Each string is written as the prefixed name, a NUL, then
 * the original name; when the prefix is non-empty a leading underscore
 * of the original name is dropped from the prefixed copy.  The reduced
 * list is passed to kvm_fdnlist().  If that succeeds and fewer symbols
 * remain unresolved than missing, both lists are walked in parallel:
 * each freshly resolved entry is matched to its nl entry by comparing
 * the original name stored after the NUL, and n_type and n_value are
 * copied over.  n_value is stored either as returned by validate_fn or
 * unchanged.
 * NOTE(review): the condition choosing between validate_fn and the raw
 * value, the release of the buffer and the return value are on lines
 * not visible here.
 */
580 kvm_fdnlist_prefix(kvm_t *kd, struct kvm_nlist *nl, int missing,
581 const char *prefix, kvaddr_t (*validate_fn)(kvm_t *, kvaddr_t))
583 struct kvm_nlist *n, *np, *p;
587 int slen, unresolved;
590 * Calculate the space we need to malloc for nlist and names.
591 * We are going to store the name twice for later lookups: once
592 * with the prefix and once the unmodified name delmited by \0.
596 for (p = nl; p->n_name && p->n_name[0]; ++p) {
597 if (p->n_type != N_UNDF)
599 len += sizeof(struct kvm_nlist) + strlen(prefix) +
600 2 * (strlen(p->n_name) + 1);
605 /* Add space for the terminating nlist entry. */
606 len += sizeof(struct kvm_nlist);
609 /* Alloc one chunk for (nlist, [names]) and setup pointers. */
610 n = np = malloc(len);
614 cp = ce = (char *)np;
615 cp += unresolved * sizeof(struct kvm_nlist);
618 /* Generate shortened nlist with special prefix. */
620 for (p = nl; p->n_name && p->n_name[0]; ++p) {
621 if (p->n_type != N_UNDF)
624 /* Save the new\0orig. name so we can later match it again. */
625 slen = snprintf(cp, ce - cp, "%s%s%c%s", prefix,
626 (prefix[0] != '\0' && p->n_name[0] == '_') ?
627 (p->n_name + 1) : p->n_name, '\0', p->n_name);
628 if (slen < 0 || slen >= ce - cp)
636 /* Do lookup on the reduced list. */
638 unresolved = kvm_fdnlist(kd, np);
640 /* Check if we could resolve further symbols and update the list. */
641 if (unresolved >= 0 && unresolved < missing) {
642 /* Find the first freshly resolved entry. */
643 for (; np->n_name && np->n_name[0]; np++)
644 if (np->n_type != N_UNDF)
647 * The lists are both in the same order,
648 * so we can walk them in parallel.
650 for (p = nl; np->n_name && np->n_name[0] &&
651 p->n_name && p->n_name[0]; ++p) {
652 if (p->n_type != N_UNDF)
654 /* Skip expanded name and compare to orig. one. */
655 ccp = np->n_name + strlen(np->n_name) + 1;
656 if (strcmp(ccp, p->n_name) != 0)
658 /* Update nlist with new, translated results. */
659 p->n_type = np->n_type;
661 p->n_value = (*validate_fn)(kd, np->n_value);
663 p->n_value = np->n_value;
665 /* Find next freshly resolved entry. */
666 for (np++; np->n_name && np->n_name[0]; np++)
667 if (np->n_type != N_UNDF)
671 /* We could assert missing = unresolved here. */
/*
 * _kvm_nlist: look up every entry of nl, which ends at the first entry
 * with a NULL or empty n_name.
 *
 * One path hands the list to kvm_fdnlist().  A result of zero or less
 * is final.  Otherwise the remaining symbols are retried through
 * kvm_fdnlist_prefix(), first with VNET_SYMPREFIX when
 * _kvm_vnet_initialized() is true, then with DPCPU_SYMPREFIX when
 * symbols are still missing and _kvm_dpcpu_initialized() is true.
 *
 * The other path resolves each N_UNDF entry with kldsym(2).  The
 * symbol name is built in symname from the current prefix and the entry
 * name, with a leading underscore of the entry name dropped when the
 * prefix is non-empty.  A value found under the vnet or dpcpu prefix is
 * passed through _kvm_vnet_validaddr() or _kvm_dpcpu_validaddr().
 * While entries remain unresolved, the pass is repeated once for each
 * of the two prefixes.
 *
 * The number of unresolved entries is computed at the end and
 * _kvm_syserr() is called for that case.
 * NOTE(review): the condition selecting between the two paths, the
 * retry jumps and the return statements are on lines not visible here.
 */
678 _kvm_nlist(kvm_t *kd, struct kvm_nlist *nl, int initialize)
682 struct kld_sym_lookup lookup;
684 const char *prefix = "";
685 char symname[1024]; /* XXX-BZ symbol name length limit? */
686 int tried_vnet, tried_dpcpu;
689 * If we can't use the kld symbol lookup, revert to the
693 error = kvm_fdnlist(kd, nl);
694 if (error <= 0) /* Hard error or success. */
697 if (_kvm_vnet_initialized(kd, initialize))
698 error = kvm_fdnlist_prefix(kd, nl, error,
699 VNET_SYMPREFIX, _kvm_vnet_validaddr);
701 if (error > 0 && _kvm_dpcpu_initialized(kd, initialize))
702 error = kvm_fdnlist_prefix(kd, nl, error,
703 DPCPU_SYMPREFIX, _kvm_dpcpu_validaddr);
709 * We can use the kld lookup syscall. Go through each nlist entry
710 * and look it up with a kldsym(2) syscall.
716 for (p = nl; p->n_name && p->n_name[0]; ++p) {
717 if (p->n_type != N_UNDF)
720 lookup.version = sizeof(lookup);
724 error = snprintf(symname, sizeof(symname), "%s%s", prefix,
725 (prefix[0] != '\0' && p->n_name[0] == '_') ?
726 (p->n_name + 1) : p->n_name);
727 if (error < 0 || error >= (int)sizeof(symname))
729 lookup.symname = symname;
730 if (lookup.symname[0] == '_')
733 if (kldsym(0, KLDSYM_LOOKUP, &lookup) != -1) {
735 if (_kvm_vnet_initialized(kd, initialize) &&
736 strcmp(prefix, VNET_SYMPREFIX) == 0)
738 _kvm_vnet_validaddr(kd, lookup.symvalue);
739 else if (_kvm_dpcpu_initialized(kd, initialize) &&
740 strcmp(prefix, DPCPU_SYMPREFIX) == 0)
742 _kvm_dpcpu_validaddr(kd, lookup.symvalue);
744 p->n_value = lookup.symvalue;
751 * Check the number of entries that weren't found. If they exist,
752 * try again with a prefix for virtualized or DPCPU symbol names.
754 error = ((p - nl) - nvalid);
755 if (error && _kvm_vnet_initialized(kd, initialize) && !tried_vnet) {
757 prefix = VNET_SYMPREFIX;
760 if (error && _kvm_dpcpu_initialized(kd, initialize) && !tried_dpcpu) {
762 prefix = DPCPU_SYMPREFIX;
767 * Return the number of entries that weren't found. If they exist,
768 * also fill internal error buffer.
770 error = ((p - nl) - nvalid);
772 _kvm_syserr(kd, kd->program, "kvm_nlist");
/*
 * _kvm_bitmap_init: allocate a zero-filled map of bitmapsize elements
 * for bm and record that size in bm->size.
 * NOTE(review): the use of idx, the allocation failure check and the
 * return values are on lines not visible here.
 */
777 _kvm_bitmap_init(struct kvm_bitmap *bm, u_long bitmapsize, u_long *idx)
781 bm->map = calloc(bitmapsize, sizeof *bm->map);
784 bm->size = bitmapsize;
/*
 * _kvm_bitmap_set: set bit bm_index in the map.  Bits are packed eight
 * per byte: the byte is bm_index / 8 and the bit within it is
 * bm_index % 8.  An index whose byte lies at or beyond bm->size is
 * ignored.
 */
789 _kvm_bitmap_set(struct kvm_bitmap *bm, u_long bm_index)
791 uint8_t *byte = &bm->map[bm_index / 8];
793 if (bm_index / 8 < bm->size)
794 *byte |= (1UL << (bm_index % 8));
/*
 * _kvm_bitmap_next: advance *idx to the next bit index selected by the
 * scan below and report whether one was found.  The map holds
 * bm->size * CHAR_BIT bits; the result is non-zero while *idx is still
 * inside that range.  An *idx of ULONG_MAX is the start-of-iteration
 * value.
 *
 * Bit i lives in byte i / CHAR_BIT under mask 1 << (i % CHAR_BIT), the
 * same layout _kvm_bitmap_set() writes, so every access stays inside
 * the bm->size bytes of the map.
 * NOTE(review): the scan stops at the first bit that is clear, as the
 * original test did - confirm that this is the intended polarity.
 */
798 _kvm_bitmap_next(struct kvm_bitmap *bm, u_long *idx)
800 u_long first_invalid = bm->size * CHAR_BIT;
802 if (*idx == ULONG_MAX)
807 /* Find the next valid idx. */
808 for (; *idx < first_invalid; (*idx)++) {
809 unsigned int mask = 1U << (*idx % CHAR_BIT);
810 if ((bm->map[*idx / CHAR_BIT] & mask) == 0)
814 return (*idx < first_invalid);
/*
 * _kvm_bitmap_deinit: counterpart of _kvm_bitmap_init() for bm.
 * NOTE(review): the body is not visible in this view; presumably it
 * releases bm->map - confirm against the full file.
 */
818 _kvm_bitmap_deinit(struct kvm_bitmap *bm)
825 _kvm_visit_cb(kvm_t *kd, kvm_walk_pages_cb_t *cb, void *arg, u_long pa,
826 u_long kmap_vaddr, u_long dmap_vaddr, vm_prot_t prot, size_t len,
827 unsigned int page_size)
829 unsigned int pgsz = page_size ? page_size : len;
830 struct kvm_page p = {
831 .kp_version = LIBKVM_WALK_PAGES_VERSION,
833 .kp_kmap_vaddr = kmap_vaddr,
834 .kp_dmap_vaddr = dmap_vaddr,
836 .kp_offset = _kvm_pt_find(kd, pa, pgsz),