2 * Copyright (c) 2006 Kip Macy <kmacy@FreeBSD.org>
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_kstack_pages.h"
32 #include "opt_msgbuf.h"
34 #include "opt_trap_trace.h"
36 #include <sys/param.h>
37 #include <sys/kernel.h>
41 #include <sys/msgbuf.h>
42 #include <sys/mutex.h>
45 #include <sys/sched.h>
46 #include <sys/sysctl.h>
47 #include <sys/systm.h>
48 #include <sys/vmmeter.h>
50 #include <dev/ofw/openfirm.h>
53 #include <vm/vm_page.h>
54 #include <vm/vm_param.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_extern.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_pager.h>
61 #include <vm/vm_phys.h>
64 #include <machine/cpu.h>
65 #include <machine/frame.h>
66 #include <machine/instr.h>
67 #include <machine/md_var.h>
68 #include <machine/metadata.h>
69 #include <machine/ofw_mem.h>
70 #include <machine/mmu.h>
71 #include <machine/smp.h>
72 #include <machine/tlb.h>
73 #include <machine/tte.h>
74 #include <machine/tte_hash.h>
75 #include <machine/pcb.h>
76 #include <machine/pstate.h>
77 #include <machine/tsb.h>
79 #include <machine/hypervisorvar.h>
80 #include <machine/hv_api.h>
/* Prototype only; the definition is not in the visible part of this file. */
83 void trap_trace_report(int);
/* Fallback value used when PMAP_SHPGPERPROC was not defined earlier. */
89 #ifndef PMAP_SHPGPERPROC
90 #define PMAP_SHPGPERPROC 200
94 * Virtual and physical address of message buffer.
/*
 * msgbuf_phys is filled from pmap_bootstrap_alloc(MSGBUF_SIZE) in
 * pmap_bootstrap and msgbufp is its TLB_PHYS_TO_DIRECT() view.
 */
96 struct msgbuf *msgbufp;
97 vm_paddr_t msgbuf_phys;
100 * Map of physical memory regions.
/*
 * phys_avail is consumed as (start, end) pairs and the walkers in this file
 * stop at the first pair whose end (or start) is zero.
 */
102 vm_paddr_t phys_avail[128];
103 vm_paddr_t phys_avail_tmp[128];
/* Scratch copy of the /memory available property read in pmap_bootstrap. */
104 static struct ofw_mem_region mra[128];
/* Copy of the /virtual-memory translations property read in pmap_bootstrap. */
105 static struct ofw_map translations[128];
/* Number of entries in translations, recorded by pmap_bootstrap. */
106 static int translations_size;
/* Filled from the /memory reg property at the end of pmap_bootstrap. */
109 struct ofw_mem_region sparc64_memreg[128];
/* Assigned in pmap_bootstrap from pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU). */
112 extern vm_paddr_t mmu_fault_status_area;
115 * First and last available kernel virtual addresses.
117 vm_offset_t virtual_avail;
118 vm_offset_t virtual_end;
119 vm_offset_t kernel_vm_end;
120 vm_offset_t vm_max_kernel_address;
/*
 * NOTE(review): this guard repeats the identical PMAP_SHPGPERPROC fallback
 * that already appears earlier in the file, so it looks redundant.
 */
122 #ifndef PMAP_SHPGPERPROC
123 #define PMAP_SHPGPERPROC 200
126 * Data for the pv entry allocation mechanism
128 static uma_zone_t pvzone;
129 static struct vm_object pvzone_obj;
130 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
/* Non-zero enables the phys_avail range printouts in pmap_bootstrap. */
132 static int pmap_debug_range = 1;
/* Non-zero lets pmap_bootstrap use 256MB pages for aligned direct mappings. */
133 static int use_256M_pages = 1;
/* Spin mutex taken around every push and pop of ctx_stack. */
135 static struct mtx pmap_ctx_lock;
/* Stack of free context numbers; ctx_stack_top is the next free slot index. */
136 static uint16_t ctx_stack[PMAP_CONTEXT_MAX];
137 static int ctx_stack_top;
/*
 * Bookkeeping for the permanent 4MB kernel mappings installed by
 * pmap_bootstrap: count, total bytes and the physical address of each one.
 */
139 static int permanent_mappings = 0;
140 static uint64_t nucleus_memory;
141 static uint64_t nucleus_mappings[4];
145 struct pmap kernel_pmap_store;
/* TSB descriptors handed to the hypervisor with hv_mmu_tsb_ctx0(). */
147 hv_tsb_info_t kernel_td[MAX_TSB_INFO];
150 * This should be determined at boot time
151 * with tiny TLBs it doesn't make sense to try and selectively
152 * invalidate more than this
/*
 * Tuning limits; neither constant is referenced in the visible part of this
 * file, so their consumers could not be checked here.
 */
154 #define MAX_INVALIDATES 32
155 #define MAX_TSB_CLEARS 128
158 * Allocate physical memory for use in pmap_bootstrap.
160 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
163 * If a user pmap is processed with pmap_remove and the
164 * resident count drops to 0, there are no more pages to remove, so we
/* True for a non-kernel pmap whose resident page count has reached zero. */
167 #define PMAP_REMOVE_DONE(pm) \
168 ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
171 * Kernel MMU interface
/* The pmap of the vmspace owned by the process of the current thread. */
173 #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace)
/*
 * Debug printf wrapper gated on pmap_debug.  Three variants are visible; the
 * preprocessor conditionals that select between them are not visible in this
 * listing.  NOTE(review): the second variant appears to be the body of a
 * macro whose define line is missing here; it panics when the current user
 * pmap has a non-zero context but this CPU is not in its pm_active mask.
 */
176 #define KDPRINTF if (pmap_debug) printf
178 if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \
179 panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n", \
180 PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \
181 if (pmap_debug) printf
/* Variant that expands to nothing. */
186 #define KDPRINTF(...)
/* Forward declarations for the pv entry and mapping helpers defined below. */
190 static void free_pv_entry(pv_entry_t pv);
191 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
193 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
194 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
195 static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va);
196 static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot);
197 static void pmap_tsb_reset(pmap_t pmap);
198 static void pmap_tsb_resize(pmap_t pmap);
199 static void pmap_tte_hash_resize(pmap_t pmap);
/* Non-static; presumably called from outside this file - TODO confirm. */
201 void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap);
/* Only the first member is visible in this listing. */
203 struct tsb_resize_info {
204 uint64_t tri_tsbscratch;
209 * Quick sort callout for comparing memory regions.
/* Comparators passed to qsort() in pmap_bootstrap. */
211 static int mr_cmp(const void *a, const void *b);
212 static int om_cmp(const void *a, const void *b);
/*
 * qsort() comparator for struct ofw_mem_region that orders entries by
 * mr_start.  The returned values are on lines not visible in this listing.
 * Note that the local pointer named mra shadows the file-scope mra array.
 */
214 mr_cmp(const void *a, const void *b)
216 const struct ofw_mem_region *mra;
217 const struct ofw_mem_region *mrb;
221 if (mra->mr_start < mrb->mr_start)
223 else if (mra->mr_start > mrb->mr_start)
/*
 * qsort() comparator for struct ofw_map that orders entries by om_start.
 * The returned values are on lines not visible in this listing.
 */
229 om_cmp(const void *a, const void *b)
231 const struct ofw_map *oma;
232 const struct ofw_map *omb;
236 if (oma->om_start < omb->om_start)
238 else if (oma->om_start > omb->om_start)
/*
 * Return context number ctx to the free stack by pushing it onto ctx_stack
 * under the pmap_ctx_lock spin mutex.
 *
 * NOTE(review): the overrun KASSERT runs after the store and after the lock
 * has been dropped.  It therefore reads ctx_stack_top unlocked and cannot
 * prevent a store past the end of ctx_stack; it can only report afterwards.
 * Consider checking before the store while still holding the lock.
 */
245 free_context(uint16_t ctx)
247 mtx_lock_spin(&pmap_ctx_lock);
248 ctx_stack[ctx_stack_top++] = ctx;
249 mtx_unlock_spin(&pmap_ctx_lock);
251 KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX,
252 ("context stack overrun - system error"));
/*
 * Pop a context number off ctx_stack under the pmap_ctx_lock spin mutex.
 * The line carrying the function name is not visible in this listing;
 * presumably this is the allocation counterpart of free_context - TODO
 * confirm.
 *
 * NOTE(review): the underrun KASSERT runs after the pre-decrement read and
 * outside the lock, so an empty stack would already have been indexed at -1
 * before the assertion is evaluated.  As written it also fires when the last
 * remaining entry is handed out, because ctx_stack_top is then zero.
 */
255 static __inline uint16_t
260 mtx_lock_spin(&pmap_ctx_lock);
261 ctx = ctx_stack[--ctx_stack_top];
262 mtx_unlock_spin(&pmap_ctx_lock);
264 KASSERT(ctx_stack_top > 0,
265 ("context stack underrun - need to implement context stealing"));
/*
 * Release pv back to the pvzone UMA zone.  Any counter maintenance is on
 * lines not visible in this listing.
 */
271 free_pv_entry(pv_entry_t pv)
274 uma_zfree(pvzone, pv);
278 * get a new pv_entry, allocating a block from the system
/*
 * Allocate a pv entry for a mapping in locked_pmap.
 *
 * The caller must hold the lock of locked_pmap and vm_page_queue_mtx; both
 * are asserted.  The fast path is a non-sleeping uma_zalloc() from pvzone.
 * When that fails the routine reclaims entries by tearing down existing
 * mappings, scanning the inactive page queue first and the active queue
 * second, and panics if nothing could be reclaimed from either.
 *
 * Several statements (continue, unlock, reuse of the reclaimed entry) are on
 * lines not visible in this listing, so only the visible steps are annotated.
 */
282 get_pv_entry(pmap_t locked_pmap)
/* Rate limit for the warning below: at most one message per 60 seconds. */
284 static const struct timeval printinterval = { 60, 0 };
285 static struct timeval lastprint;
286 struct vpgqueues *vpq;
289 pv_entry_t allocated_pv, next_pv, pv;
293 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
294 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
295 allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
296 if (allocated_pv != NULL) {
298 if (pv_entry_count > pv_entry_high_water)
301 return (allocated_pv);
305 * Reclaim pv entries: At first, destroy mappings to inactive
306 * pages. After that, if a pv entry is still needed, destroy
307 * mappings to active pages.
309 if (ratecheck(&lastprint, &printinterval))
310 printf("Approaching the limit on PV entries, "
311 "increase the vm.pmap.shpgperproc tunable.\n");
313 vpq = &vm_page_queues[PQ_INACTIVE];
315 TAILQ_FOREACH(m, &vpq->pl, pageq) {
/* Held or busy pages are tested here; presumably they are skipped. */
316 if (m->hold_count || m->busy)
318 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
321 /* Avoid deadlock and lock recursion. */
/*
 * Lock ordering is by pmap address: a pmap above locked_pmap is handled by
 * the first branch, any other foreign pmap is only try-locked.
 */
322 if (pmap > locked_pmap)
324 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
326 pmap->pm_stats.resident_count--;
328 tte_data = tte_hash_delete(pmap->pm_hash, va);
330 KASSERT((tte_data & VTD_WIRED) == 0,
331 ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data));
/* Carry the referenced bit of the destroyed mapping over to the page. */
332 if (tte_data & VTD_REF)
333 vm_page_flag_set(m, PG_REFERENCED);
334 if (tte_data & VTD_W) {
335 KASSERT((tte_data & VTD_SW_W),
336 ("get_pv_entry: modified page not writable: va: %lx, tte: %lx",
341 pmap_invalidate_page(pmap, va, TRUE);
/* Unlink the entry from both the per-pmap and the per-page lists. */
342 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
343 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
344 if (TAILQ_EMPTY(&m->md.pv_list))
345 vm_page_flag_clear(m, PG_WRITEABLE);
346 m->md.pv_list_count--;
348 if (pmap != locked_pmap)
350 if (allocated_pv == NULL)
/* Nothing reclaimed yet: fall back to the active queue, then give up. */
356 if (allocated_pv == NULL) {
357 if (vpq == &vm_page_queues[PQ_INACTIVE]) {
358 vpq = &vm_page_queues[PQ_ACTIVE];
361 panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
363 return (allocated_pv);
367 * Allocate a physical page of memory directly from the phys_avail map.
368 * Can only be called from pmap_bootstrap before avail start and end are
/*
 * Carve size bytes, rounded up to a whole page, off the front of the first
 * phys_avail range that is large enough.  The chosen range is shrunk in
 * place, the memory is passed to pmap_scrub_pages(), and the routine panics
 * when no range can satisfy the request.  The assignment of pa and the
 * return statement are on lines not visible in this listing.
 */
372 pmap_bootstrap_alloc(vm_size_t size)
377 size = round_page(size);
/* phys_avail holds (start, end) pairs terminated by a zero end address. */
379 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
380 if (phys_avail[i + 1] - phys_avail[i] < size)
383 phys_avail[i] += size;
384 pmap_scrub_pages(pa, size);
387 panic("pmap_bootstrap_alloc");
391 * Activate a user pmap. The pmap must be activated before its address space
392 * can be accessed in any way.
/*
 * Make the pmap of the process owning td the current pmap on this CPU.
 *
 * The visible steps are: move this CPU from the active mask of the previous
 * pmap to the new one, load the hash and TSB scratchpad values, reset the
 * TSB miss counters, publish the pmap through PCPU curpmap, register the
 * TSB with the hypervisor for non-zero contexts, and write the context
 * number to the MMU context register.
 */
395 pmap_activate(struct thread *td)
397 pmap_t pmap, oldpmap;
401 pmap = vmspace_pmap(td->td_proc->p_vmspace);
402 oldpmap = PCPU_GET(curpmap);
/*
 * Two variants of the active-mask update follow: an atomic one keyed on the
 * per-CPU mask and a plain one that uses bit 0.  NOTE(review): presumably
 * these are the two arms of a preprocessor conditional whose directive lines
 * are not visible in this listing - TODO confirm.
 */
404 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
405 atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask));
406 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
408 oldpmap->pm_active &= ~1;
409 pmap->pm_active |= 1;
410 pmap->pm_tlbactive |= 1;
413 pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
414 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
415 pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
417 PCPU_SET(curpmap, pmap);
/* Only non-zero contexts register a TSB through hv_mmu_tsb_ctxnon0(). */
418 if (pmap->pm_context != 0)
419 if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK)
420 panic("failed to set TSB 0x%lx - context == %ld\n",
421 pmap->pm_tsb_ra, pmap->pm_context);
422 stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context);
/*
 * Takes a VM object, a candidate virtual address and a size.  The body and
 * return type are not visible in this listing, so the behavior is not
 * documented here.
 */
428 pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size)
434 * Increase the starting virtual address of the given mapping if a
435 * different alignment might result in more superpage mappings.
/*
 * Superpage alignment hook; addr is passed by pointer so it can be adjusted
 * in place.  The body is not visible in this listing, so whether this port
 * adjusts anything could not be verified.
 */
438 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
439 vm_offset_t *addr, vm_size_t size)
444 * Bootstrap the system enough to run with virtual memory.
/*
 * Bring up kernel virtual memory.  ekva is the end of the loaded kernel in
 * virtual space; virtual_avail starts at ekva rounded up to a 4MB boundary.
 *
 * Visible stages, in order:
 *   1. Read and sort the PROM translations and install permanent 4MB
 *      mappings for the ones that fall in the kernel range.
 *   2. Read and sort /memory available, trim each region to 4MB alignment,
 *      apply the hw.physmem and hw.physmemstart tunables and build
 *      real_phys_avail.
 *   3. Derive phys_avail from real_phys_avail with the nucleus memory cut
 *      out and ranges split at 4GB boundaries, then rotate phys_avail so the
 *      range after the nucleus comes first.
 *   4. Size kernel VA, allocate the kernel hash table, the 8K and 4M TSBs,
 *      the MMU fault status areas, the message buffer and the thread0 stack.
 *   5. Enter the PROM and direct mappings into the TSBs, hand the TSB
 *      descriptors to the hypervisor, read /memory reg, and finally populate
 *      the kernel TTE hash.
 *
 * Many lines (declarations, braces, continue statements, preprocessor
 * conditionals) are not visible in this listing.
 */
447 pmap_bootstrap(vm_offset_t ekva)
451 vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start;
452 vm_size_t physsz, virtsz, kernel_hash_shift;
453 ihandle_t pmem, vmem;
455 uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable;
456 vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds;
/* Stage 1: fetch the PROM translations; sz is the property size in bytes. */
459 if ((vmem = OF_finddevice("/virtual-memory")) == -1)
460 panic("pmap_bootstrap: finddevice /virtual-memory");
461 if ((sz = OF_getproplen(vmem, "translations")) == -1)
462 panic("pmap_bootstrap: getproplen translations");
463 if (sizeof(translations) < sz)
464 panic("pmap_bootstrap: translations too small");
465 bzero(translations, sz);
466 if (OF_getprop(vmem, "translations", translations, sz) == -1)
467 panic("pmap_bootstrap: getprop /virtual-memory/translations");
/* From here on sz counts entries rather than bytes. */
468 sz /= sizeof(*translations);
469 translations_size = sz;
470 nucleus_memory_start = 0;
471 CTR0(KTR_PMAP, "pmap_bootstrap: translations");
472 qsort(translations, sz, sizeof (*translations), om_cmp);
474 for (i = 0; i < sz; i++) {
475 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n",
476 translations[i].om_size, translations[i].om_start,
477 translations[i].om_tte);
/* Only translations that start within KERNBASE + 3 * 4MB are made permanent. */
478 if ((translations[i].om_start >= KERNBASE) &&
479 (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) {
480 for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) {
481 KDPRINTF("mapping permanent translation\n");
482 pa = TTE_GET_PA(translations[i].om_tte) + j;
483 va = translations[i].om_start + j;
484 error = hv_mmu_map_perm_addr(va, KCONTEXT,
485 pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB);
487 panic("map_perm_addr returned error=%ld", error);
/* Track the lowest physical address backing the nucleus. */
489 if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start))
490 nucleus_memory_start = pa;
491 printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa);
/*
 * NOTE(review): nucleus_mappings has four slots and no bound check on
 * permanent_mappings is visible here - confirm the range test above limits
 * the number of 4MB pieces to four.
 */
492 nucleus_mappings[permanent_mappings++] = pa;
493 nucleus_memory += PAGE_SIZE_4M;
495 mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M);
502 * Find out what physical memory is available from the prom and
503 * initialize the phys_avail array. This must be done before
504 * pmap_bootstrap_alloc is called.
/* Stage 2: sz is reused for the size of the /memory available property. */
506 if ((pmem = OF_finddevice("/memory")) == -1)
507 panic("pmap_bootstrap: finddevice /memory");
508 if ((sz = OF_getproplen(pmem, "available")) == -1)
509 panic("pmap_bootstrap: getproplen /memory/available");
510 if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */
511 panic("pmap_bootstrap: phys_avail too small");
512 if (sizeof(mra) < sz)
513 panic("pmap_bootstrap: mra too small");
515 if (OF_getprop(pmem, "available", mra, sz) == -1)
516 panic("pmap_bootstrap: getprop /memory/available");
519 CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
/*
 * NOTE(review): qsort and the loop below use sz as an element count; the
 * conversion from bytes is presumably on a line not visible here - confirm.
 */
521 qsort(mra, sz, sizeof (*mra), mr_cmp);
522 physmemstart_tunable = physmem_tunable = physmem = physsz = 0;
524 if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) {
525 KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable);
527 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) {
528 physmem = atop(physmem_tunable);
529 KDPRINTF("desired physmem=0x%lx\n", physmem_tunable);
/* With both tunables set, physmem_tunable becomes an absolute upper bound. */
531 if ((physmem_tunable != 0) && (physmemstart_tunable != 0))
532 physmem_tunable += physmemstart_tunable;
/* Zero fill supplies the terminating zero pairs the later walkers rely on. */
534 bzero(real_phys_avail, sizeof(real_phys_avail));
535 bzero(tmp_phys_avail, sizeof(tmp_phys_avail));
537 for (i = 0, j = 0; i < sz; i++) {
539 KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size);
540 if (mra[i].mr_size < PAGE_SIZE_4M)
/* Round the start up and the size down so the region is 4MB aligned. */
543 if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) {
544 uint64_t newstart, roundup;
545 newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M);
546 roundup = newstart - mra[i].mr_start;
547 size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M;
548 mra[i].mr_start = newstart;
549 if (size < PAGE_SIZE_4M)
551 mra[i].mr_size = size;
553 real_phys_avail[j] = mra[i].mr_start;
/* Clip the region that crosses the hw.physmem limit. */
554 if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) {
555 mra[i].mr_size = physmem_tunable - physsz;
556 physsz = physmem_tunable;
557 real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
560 physsz += mra[i].mr_size;
561 real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
564 physmem = btoc(physsz - physmemstart_tunable);
567 * This is needed for versions of OFW that would allocate us memory
568 * and then forget to remove it from the available ranges ...
569 * as well as for compensating for the above move of nucleus pages
/* Stage 3: bounds starts at 4GB and marks the next split point. */
571 for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) {
572 vm_paddr_t start = real_phys_avail[i];
573 uint64_t end = real_phys_avail[i + 1];
574 CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end);
575 KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end);
577 * Is kernel memory at the beginning of range?
579 if (nucleus_memory_start == start) {
580 start += nucleus_memory;
583 * Is kernel memory at the end of range?
585 if (nucleus_memory_start == (end - nucleus_memory))
586 end -= nucleus_memory;
588 if (physmemstart_tunable != 0 &&
589 (end < physmemstart_tunable))
592 if (physmemstart_tunable != 0 &&
593 ((start < physmemstart_tunable))) {
594 start = physmemstart_tunable;
598 * Is kernel memory in the middle somewhere?
600 if ((nucleus_memory_start > start) &&
601 (nucleus_memory_start < end)) {
602 phys_avail[j] = start;
603 phys_avail[j+1] = nucleus_memory_start;
604 start = nucleus_memory_start + nucleus_memory;
608 * Break phys_avail up on 4GB boundaries to try
609 * to work around PCI-e allocation bug
610 * we rely on the fact that kernel memory is allocated
611 * from the first 4GB of physical memory
613 while (bounds < start)
616 while (bounds < end) {
617 phys_avail[j] = start;
618 phys_avail[j + 1] = bounds;
623 phys_avail[j] = start;
624 phys_avail[j + 1] = end;
629 * Merge nucleus memory in to real_phys_avail
632 for (i = 0; real_phys_avail[i] != 0; i += 2) {
633 if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory)
634 real_phys_avail[i] -= nucleus_memory;
636 if (real_phys_avail[i + 1] == nucleus_memory_start)
637 real_phys_avail[i + 1] += nucleus_memory;
/* Coalesce this range with the next one when they are now adjacent. */
639 if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) {
640 real_phys_avail[i + 1] = real_phys_avail[i + 3];
641 for (k = i + 2; real_phys_avail[k] != 0; k += 2) {
642 real_phys_avail[k] = real_phys_avail[k + 2];
643 real_phys_avail[k + 1] = real_phys_avail[k + 3];
647 for (i = 0; phys_avail[i] != 0; i += 2)
648 if (pmap_debug_range || pmap_debug)
649 printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
650 i, phys_avail[i], i+1, phys_avail[i+1]);
653 * Shuffle the memory range containing the 256MB page with
654 * nucleus_memory to the beginning of the phys_avail array
655 * so that physical memory from that page is preferentially
/* j ends up at the first range that starts above the nucleus. */
658 for (j = 0; phys_avail[j] != 0; j += 2)
659 if (nucleus_memory_start < phys_avail[j])
662 * Don't shuffle unless we have a full 256M page in the range
663 * our kernel malloc appears to be horribly brittle
665 if ((phys_avail[j + 1] - phys_avail[j]) <
666 (PAGE_SIZE_256M - nucleus_memory))
669 for (i = j, k = 0; phys_avail[i] != 0; k++, i++)
670 tmp_phys_avail[k] = phys_avail[i];
671 for (i = 0; i < j; i++)
672 tmp_phys_avail[k + i] = phys_avail[i];
673 for (i = 0; i < 128; i++)
674 phys_avail[i] = tmp_phys_avail[i];
677 for (i = 0; real_phys_avail[i] != 0; i += 2)
678 if (pmap_debug_range || pmap_debug)
679 printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n",
680 i, real_phys_avail[i], i+1, real_phys_avail[i+1]);
682 for (i = 0; phys_avail[i] != 0; i += 2)
683 if (pmap_debug_range || pmap_debug)
684 printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
685 i, phys_avail[i], i+1, phys_avail[i+1]);
687 * Calculate the size of kernel virtual memory, and the size and mask
688 * for the kernel tsb.
690 virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
691 vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
694 * Set the start and end of kva. The kernel is loaded at the first
695 * available 4 meg super page, so round up to the end of the page.
697 virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
698 virtual_end = vm_max_kernel_address;
699 kernel_vm_end = vm_max_kernel_address;
702 * Allocate and map a 4MB page for the kernel hashtable
706 kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */
708 kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */
711 kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT)));
712 if (kernel_hash_pa & PAGE_MASK_4M)
713 panic("pmap_bootstrap: hashtable pa unaligned\n");
715 * Set up TSB descriptors for the hypervisor
719 tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
721 /* avoid alignment complaints from the hypervisor */
722 tsb_8k_size = PAGE_SIZE_4M;
725 tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size);
726 if (tsb_8k_pa & PAGE_MASK_4M)
727 panic("pmap_bootstrap: tsb unaligned\n");
728 KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa);
730 tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3;
731 tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size);
733 kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K;
734 kernel_td[TSB8K_INDEX].hti_assoc = 1;
735 kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT);
736 kernel_td[TSB8K_INDEX].hti_ctx_index = 0;
737 kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K;
738 kernel_td[TSB8K_INDEX].hti_rsvd = 0;
739 kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa;
742 * Initialize kernel's private TSB from 8K page TSB
745 kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K;
746 kernel_pmap->pm_tsb.hti_assoc = 1;
747 kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT);
748 kernel_pmap->pm_tsb.hti_ctx_index = 0;
749 kernel_pmap->pm_tsb.hti_pgszs = TSB8K;
750 kernel_pmap->pm_tsb.hti_rsvd = 0;
751 kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa;
753 kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb);
754 tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb);
757 * Initialize kernel TSB for 4M pages
758 * currently (not by design) used for permanent mappings
762 KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size);
763 kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M;
764 kernel_td[TSB4M_INDEX].hti_assoc = 1;
765 kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT);
766 kernel_td[TSB4M_INDEX].hti_ctx_index = 0;
767 kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M;
768 kernel_td[TSB4M_INDEX].hti_rsvd = 0;
769 kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa;
771 * allocate MMU fault status areas for all CPUS
773 mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
776 * Allocate and map the message buffer.
778 msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
779 msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
782 * Allocate a kernel stack with guard page for thread0 and map it into
785 pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE);
787 virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
788 kstack0 = virtual_avail;
789 virtual_avail += KSTACK_PAGES * PAGE_SIZE;
/*
 * The loop reads kstack0_phys; presumably it was set from pa on a line not
 * visible in this listing - TODO confirm.
 */
790 for (i = 0; i < KSTACK_PAGES; i++) {
791 pa = kstack0_phys + i * PAGE_SIZE;
792 va = kstack0 + i * PAGE_SIZE;
793 tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va,
794 pa | TTE_KERNEL | VTD_8K, 0);
797 * Calculate the last available physical address.
/* The loop leaves i at the last non-empty pair, which Maxmem then uses. */
799 for (i = 0; phys_avail[i + 2] != 0; i += 2)
800 KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
801 i, phys_avail[i], i+1, phys_avail[i+1]);
802 KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
803 i, phys_avail[i], i+1, phys_avail[i+1]);
805 Maxmem = sparc64_btop(phys_avail[i + 1]);
808 * Add the prom mappings to the kernel tsb.
/*
 * NOTE(review): this loop over translations is bounded by sz, but sz was
 * last assigned from the /memory available property above.  The matching
 * hash loop further down uses translations_size instead - confirm which
 * bound is intended here.
 */
810 for (i = 0; i < sz; i++) {
812 "translation: start=%#lx size=%#lx tte=%#lx",
813 translations[i].om_start, translations[i].om_size,
814 translations[i].om_tte);
815 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n",
816 translations[i].om_size, translations[i].om_start,
817 translations[i].om_tte);
819 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
820 translations[i].om_start > VM_MAX_PROM_ADDRESS)
823 for (off = 0; off < translations[i].om_size;
825 va = translations[i].om_start + off;
826 pa = TTE_GET_PA(translations[i].om_tte) + off;
827 tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
828 tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa |
829 TTE_KERNEL | VTD_8K, 0);
833 if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO,
834 vtophys((vm_offset_t)kernel_td))) != H_EOK)
835 panic("failed to set ctx0 TSBs error: %ld", error);
838 mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td));
841 * setup direct mappings
/*
 * Walk every real_phys_avail range and enter direct-map TTEs into the 4M
 * TSB, switching to a 256MB page size when pa is 256MB aligned and the whole
 * 256MB fits inside the range.
 */
844 for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) {
845 vm_paddr_t tag_pa = 0, next_pa = 0;
846 uint64_t size_bits = VTD_4M;
847 while (pa < real_phys_avail[i + 1]) {
848 if (use_256M_pages &&
849 (pa & PAGE_MASK_256M) == 0 &&
850 ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) {
852 size_bits = VTD_256M;
853 next_pa = pa + PAGE_SIZE_256M;
854 } else if (next_pa <= pa) {
858 tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa));
859 tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa),
860 TLB_PHYS_TO_DIRECT(pa),
861 tag_pa | TTE_KERNEL | size_bits, 0);
867 * Get the available physical memory ranges from /memory/reg. These
868 * are only used for kernel dumps, but it may not be wise to do prom
869 * calls in that situation.
871 if ((sz = OF_getproplen(pmem, "reg")) == -1)
872 panic("pmap_bootstrap: getproplen /memory/reg");
873 if (sizeof(sparc64_memreg) < sz)
874 panic("pmap_bootstrap: sparc64_memreg too small");
875 if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
876 panic("pmap_bootstrap: getprop /memory/reg");
877 sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
881 pm->pm_tlbactive = ~0;
883 PMAP_LOCK_INIT(kernel_pmap);
885 TAILQ_INIT(&kernel_pmap->pm_pvlist);
888 * This could happen earlier - but I put it here to avoid
889 * attempts to do updates until they're legal
891 pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift,
892 pmap_bootstrap_alloc(PAGE_SIZE));
893 pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash);
895 for (i = 0; i < translations_size; i++) {
896 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n",
897 translations[i].om_size, translations[i].om_start,
898 translations[i].om_tte);
900 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
901 translations[i].om_start > VM_MAX_PROM_ADDRESS) {
902 KDPRINTF("skipping\n");
905 for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) {
906 va = translations[i].om_start + off;
907 pa = TTE_GET_PA(translations[i].om_tte) + off;
908 tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
910 KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n",
911 translations[i].om_size, translations[i].om_start,
912 translations[i].om_tte);
914 for (i = 0; i < KSTACK_PAGES; i++) {
915 pa = kstack0_phys + i * PAGE_SIZE;
916 va = kstack0 + i * PAGE_SIZE;
917 tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
920 * Add direct mappings to hash
924 /* hash only supports 8k pages */
925 for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M)
926 tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa),
927 pa | TTE_KERNEL | VTD_4M);
932 printf("pmap_bootstrap done\n");
938 * Routine: pmap_change_wiring
939 * Function: Change the wiring attribute for a map/virtual-address
942 * The mapping must already exist in the pmap.
/*
 * Set or clear the wired attribute of the mapping at va in pmap.  The
 * current state is read with tte_get_virt_bit(); the wired_count statistic
 * and the VTD_WIRED bit are only touched when the requested state differs.
 * Any locking is on lines not visible in this listing.
 */
945 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
949 iswired = tte_get_virt_bit(pmap, va, VTD_WIRED);
951 if (wired && !iswired) {
952 pmap->pm_stats.wired_count++;
953 tte_set_virt_bit(pmap, va, VTD_WIRED);
954 } else if (!wired && iswired) {
955 pmap->pm_stats.wired_count--;
956 tte_clear_virt_bit(pmap, va, VTD_WIRED);
/* Clear the VTD_W bit for page m through tte_clear_phys_bit(). */
962 pmap_clear_modify(vm_page_t m)
964 KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m));
965 tte_clear_phys_bit(m, VTD_W);
/* Clear the VTD_REF bit for page m through tte_clear_phys_bit(). */
969 pmap_clear_reference(vm_page_t m)
971 KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m));
972 tte_clear_phys_bit(m, VTD_REF);
/*
 * Copy managed mappings for the range [src_addr, src_addr + len) from
 * src_pmap into dst_pmap, one PAGE_SIZE step at a time.
 *
 * Only source TTEs with VTD_MANAGED set are considered, and only where the
 * destination hash has no entry for the same address.  The copied TTE has
 * the VTD_W, VTD_REF and VTD_WIRED bits cleared, the destination resident
 * count is bumped and a pv entry is inserted.  The routine bails out early
 * when free pages are below the reserve or pv entries are above the high
 * water mark.
 *
 * The two pmaps are locked in an order chosen by comparing their addresses;
 * the lock calls themselves are on lines not visible in this listing.
 * NOTE(review): dst_addr is not referenced in any visible line; the same
 * address is used for lookup in both pmaps.
 */
976 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
977 vm_size_t len, vm_offset_t src_addr)
979 vm_offset_t addr, end_addr;
981 end_addr = src_addr + len;
983 * Don't let optional prefaulting of pages make us go
984 * way below the low water mark of free pages or way
985 * above high water mark of used pv entries.
987 if (cnt.v_free_count < cnt.v_free_reserved ||
988 pv_entry_count > pv_entry_high_water)
992 vm_page_lock_queues();
993 if (dst_pmap < src_pmap) {
1000 for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) {
1004 tte_data = tte_hash_lookup(src_pmap->pm_hash, addr);
1006 if ((tte_data & VTD_MANAGED) != 0) {
1007 if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) {
1008 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1010 tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED));
1011 dst_pmap->pm_stats.resident_count++;
1012 pmap_insert_entry(dst_pmap, addr, m);
1016 vm_page_unlock_queues();
1017 PMAP_UNLOCK(src_pmap);
1018 PMAP_UNLOCK(dst_pmap);
/*
 * Copy PAGE_SIZE bytes from page src to page dst.  Both pages are addressed
 * through their TLB_PHYS_TO_DIRECT() views and the copy is done with
 * novbcopy().
 */
1022 pmap_copy_page(vm_page_t src, vm_page_t dst)
1024 vm_paddr_t srcpa, dstpa;
1025 srcpa = VM_PAGE_TO_PHYS(src);
1026 dstpa = VM_PAGE_TO_PHYS(dst);
1028 novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE);
/*
 * Accounting helper for a new mapping of page m at va.  It bumps the wired
 * count (the test on wired is on a line not visible in this listing) and,
 * for pages that are neither fictitious nor unmanaged, inserts a pv entry
 * and sets VTD_MANAGED in the caller-owned *tte_data.
 */
1033 static __inline void
1034 pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired)
1038 pmap->pm_stats.wired_count++;
1040 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1041 pmap_insert_entry(pmap, va, m);
1042 *tte_data |= VTD_MANAGED;
1047 * Map the given physical page at the specified virtual address in the
1048 * target pmap with the protection requested. If specified the page
1049 * will be wired down.
/*
 * Install a mapping of page m at va in pmap with protection prot, optionally
 * wired.
 *
 * The existing TTE for va is removed from the hash first and three cases are
 * distinguished from it: no previous mapping, a mapping to a different
 * physical page, and a mapping to the same page where only protection or
 * wiring changes.  The new TTE is then built from prot and wired, the old
 * translation is invalidated when the TTE changed, and the new TTE is
 * written to both the TTE hash and the TSB.  The hash is resized on demand.
 *
 * NOTE(review): the checks on VTD_X and VTD_SW_W near the end are applied to
 * the physical addresses pa and opa rather than to the TTE values - confirm
 * that this is intended.
 * NOTE(review): the TSB resize call is disabled by a constant 0 in its
 * condition.
 * NOTE(review): om is only assigned on a visible line in the changed-page
 * case; its value in the same-page case depends on lines not visible in this
 * listing.
 */
1052 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
1053 vm_prot_t prot, boolean_t wired)
1056 uint64_t tte_data, otte_data;
1060 if (pmap->pm_context)
1061 DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va,
1062 VM_PAGE_TO_PHYS(m), prot);
1066 vm_page_lock_queues();
/* The new TTE starts out as the bare physical address of m. */
1069 tte_data = pa = VM_PAGE_TO_PHYS(m);
/* Remove any old entry up front; otte_data carries its previous contents. */
1070 otte_data = tte_hash_delete(pmap->pm_hash, va);
1071 opa = TTE_GET_PA(otte_data);
1075 * This is a new mapping
1077 pmap->pm_stats.resident_count++;
1078 pmap_add_tte(pmap, va, m, &tte_data, wired);
1080 } else if (pa != opa) {
1082 * Mapping has changed, handle validating new mapping.
1085 if (otte_data & VTD_WIRED)
1086 pmap->pm_stats.wired_count--;
1088 if (otte_data & VTD_MANAGED) {
1089 om = PHYS_TO_VM_PAGE(opa);
1090 pmap_remove_entry(pmap, om, va);
1093 pmap_add_tte(pmap, va, m, &tte_data, wired);
1095 } else /* (pa == opa) */ {
1097 * Mapping has not changed, must be protection or wiring change.
1101 * Wiring change, just update stats. We don't worry about
1102 * wiring PT pages as they remain resident as long as there
1103 * are valid mappings in them. Hence, if a user page is wired,
1104 * the PT page will be also.
1106 if (wired && ((otte_data & VTD_WIRED) == 0))
1107 pmap->pm_stats.wired_count++;
1108 else if (!wired && (otte_data & VTD_WIRED))
1109 pmap->pm_stats.wired_count--;
1112 * We might be turning off write access to the page,
1113 * so we go ahead and sense modify status.
1115 if (otte_data & VTD_MANAGED) {
1117 tte_data |= VTD_MANAGED;
1122 * Now validate mapping with desired protection/wiring.
1124 if ((prot & VM_PROT_WRITE) != 0) {
1125 tte_data |= VTD_SW_W;
1126 vm_page_flag_set(m, PG_WRITEABLE);
1128 if ((prot & VM_PROT_EXECUTE) != 0)
1131 tte_data |= VTD_WIRED;
1132 if (pmap == kernel_pmap)
1136 if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
1137 if (otte_data & VTD_V) {
1138 if (otte_data & VTD_REF) {
1139 if (otte_data & VTD_MANAGED)
1140 vm_page_flag_set(om, PG_REFERENCED);
1141 if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X)))
1144 if (otte_data & VTD_W) {
1145 if (otte_data & VTD_MANAGED)
1147 if ((pa & VTD_SW_W) != 0)
1151 pmap_invalidate_page(pmap, va, TRUE);
1156 tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF);
1158 * XXX this needs to be locked for the threaded / kernel case
1160 tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF,
1163 if (tte_hash_needs_resize(pmap->pm_hash))
1164 pmap_tte_hash_resize(pmap);
1167 * 512 is an arbitrary number of tsb misses
1169 if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512)
1170 pmap_tsb_resize(pmap);
1172 vm_page_unlock_queues();
1178 * Maps a sequence of resident pages belonging to the same object.
1179 * The sequence begins with the given page m_start. This page is
1180 * mapped at the given virtual address start. Each subsequent page is
1181 * mapped at a virtual address that is offset from start by the same
1182 * amount as the page is offset from m_start within the object. The
1183 * last page in the sequence is the page with the largest offset from
1184 * m_start that can be mapped at a virtual address less than the given
1185 * virtual address end. Not every virtual page between start and end
1186 * is mapped; only those for which a resident page exists with the
1187 * corresponding offset from m_start are mapped.
/*
 * Map the run of resident pages that starts at m_start into [start, end).
 * The object lock of m_start must be held (asserted).  Pages are visited in
 * object list order; each one is entered with pmap_enter_quick_locked() at
 * start plus its page index offset from m_start, and the walk stops at the
 * first page whose offset is psize pages or more.  Initialisation of m and
 * the pmap locking are on lines not visible in this listing.
 */
1190 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
1191 vm_page_t m_start, vm_prot_t prot)
1194 vm_pindex_t diff, psize;
1196 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
1197 psize = atop(end - start);
1200 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1201 pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot);
1202 m = TAILQ_NEXT(m, listq);
/*
 * Wrapper that forwards to pmap_enter_quick_locked().  The callee asserts
 * that the pmap lock is held, so the lock and unlock calls are presumably on
 * lines not visible in this listing - TODO confirm.
 */
1208 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1211 pmap_enter_quick_locked(pmap, va, m, prot);
/*
 * Enter a mapping of page m at va while the pmap lock is already held
 * (asserted).  An existing hash entry for va is tested first; what happens
 * then is on a line not visible in this listing (presumably an early
 * return).  Otherwise a TTE is built from the physical address of m, a pv
 * entry is added for managed pages, the resident count is bumped and the TTE
 * is inserted into the hash with TTE_MINFLAGS.  Unlike pmap_enter, no TSB
 * update and no VTD_REF bit appear in the visible lines.
 */
1216 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1221 if (pmap->pm_context)
1222 KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n",
1223 pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot);
1225 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1226 if (tte_hash_lookup(pmap->pm_hash, va))
1229 tte_data = VM_PAGE_TO_PHYS(m);
1231 * Enter on the PV list if part of our managed memory. Note that we
1232 * raise IPL while manipulating pv_table since pmap_enter can be
1233 * called at interrupt time.
1235 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1236 pmap_insert_entry(pmap, va, m);
1237 tte_data |= VTD_MANAGED;
1240 pmap->pm_stats.resident_count++;
1242 if ((prot & VM_PROT_EXECUTE) != 0)
1245 tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS);
1249 * Extract the physical page address associated with the given
1250 * map/virtual_address pair.
/*
 * pmap_extract: look va up in the TTE hash of the pmap and form the physical
 * address from the page address stored in the entry plus the offset bits of
 * va selected by the page mask of that entry.
 * NOTE(review): no check for a missing entry (tte_data == 0) is visible
 * between the lookup and the address computation -- confirm.
 */
1253 pmap_extract(pmap_t pmap, vm_offset_t va)
1258 tte_data = tte_hash_lookup(pmap->pm_hash, va);
1259 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1265 * Atomically extract and hold the physical page with the given
1266 * pmap and virtual address pair if that mapping permits the given
/*
 * pmap_extract_and_hold: with the page queues locked, look va up in the TTE
 * hash and resolve the vm_page behind it when the mapping permits the
 * requested access.
 * NOTE(review): the step that actually holds the page is not visible in
 * these lines -- confirm.
 */
1270 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1276 vm_page_lock_queues();
1278 tte_data = tte_hash_lookup(pmap->pm_hash, va);
/*
 * Accept the mapping if it exists and either it is software-writable or the
 * caller did not ask for write access.
 */
1279 if (tte_data != 0 &&
1280 ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) {
1281 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1284 vm_page_unlock_queues();
/*
 * pmap_alloc_zeroed_contig_pages: try each phys_avail[] range in turn for
 * npages physically contiguous pages with the requested alignment, then
 * visit every page of the run and produce the direct-mapped address of the
 * first page.
 */
1291 pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment)
/* phys_avail[] holds (start, end) pairs and is terminated by a zero end. */
1299 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1300 m = vm_phys_alloc_contig(npages, phys_avail[i],
1301 phys_avail[i + 1], alignment, (1UL<<34));
1306 printf("vm_phys_alloc_contig failed - waiting to retry\n");
1311 for (i = 0, tm = m; i < npages; i++, tm++) {
/* NOTE(review): presumably pages lacking PG_ZERO are zeroed here -- confirm. */
1313 if ((tm->flags & PG_ZERO) == 0)
1316 ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
/*
 * pmap_free_contig_pages: convert the direct-mapped pointer back to its
 * first vm_page and walk npages consecutive page structures, dropping the
 * global wired-page count by one for each.
 * NOTE(review): the per-page release call is not visible in these lines.
 */
1322 pmap_free_contig_pages(void *ptr, int npages)
1327 m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr));
1328 for (i = 0; i < npages; i++, m++) {
1330 atomic_subtract_int(&cnt.v_wire_count, 1);
/*
 * pmap_growkernel(addr).
 * NOTE(review): none of the statements that follow this header refer to
 * addr -- confirm where this function body ends.
 */
1336 pmap_growkernel(vm_offset_t addr)
/*
 * Setup of the context-number stack, the context spin lock and the pv-entry
 * zone.
 * NOTE(review): presumably the body of the pmap initialisation routine; its
 * header line is not present directly above -- confirm.
 */
1345 /* allocate pv_entry zones */
1346 int shpgperproc = PMAP_SHPGPERPROC;
/*
 * Fill slots 1 .. PMAP_CONTEXT_MAX - 1 with their own index; the loop leaves
 * ctx_stack_top equal to PMAP_CONTEXT_MAX.
 */
1348 for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++)
1349 ctx_stack[ctx_stack_top] = ctx_stack_top;
1351 mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN);
1354 * Initialize the address space (zone) for the pv entries. Set a
1355 * high water mark so that the system can recover from excessive
1356 * numbers of pv entries.
1358 pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
1359 NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
/* Both the per-process share and the total can be overridden by tunables. */
1360 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1361 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
1362 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
/* High-water mark is nine tenths of the maximum (integer arithmetic). */
1363 pv_entry_high_water = 9 * (pv_entry_max / 10);
1364 uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
1371 * Create a pv entry for page at pa for
/*
 * pmap_insert_entry: obtain a pv entry from get_pv_entry() and append it to
 * both the pv list of the pmap and the pv list of page m, incrementing the
 * pv count of the page.  The pmap lock and the page-queues mutex must both
 * be held (asserted after the entry has been obtained).
 */
1375 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1379 KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1380 pv = get_pv_entry(pmap);
1384 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1385 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1386 TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1387 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1388 m->md.pv_list_count++;
/* Latched to 1 once the trap-trace dump below has been started. */
1392 static int trap_trace_report_done;
/*
 * pmap_ipi: cross-call func with (arg1, arg2) on the other CPUs on which
 * the pmap may have TLB entries, and spin until every targeted CPU has
 * acknowledged.  The value returned at the end is active_total, the
 * accumulated mask of CPUs that were signalled.
 *
 * Must be entered with the processor interrupt level at 14; anything else
 * panics.
 */
1397 pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2)
1400 int i, cpu_count, retried;
1402 cpumask_t cpumask, active, curactive;
1403 cpumask_t active_total, ackmask;
1411 cpumask = PCPU_GET(cpumask);
1412 cpulist = PCPU_GET(cpulist);
1415 if (rdpr(pil) != 14)
1416 panic("pil %ld != 14", rdpr(pil));
1418 #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED
1419 /* by definition cpumask should have curcpu's bit set */
/*
 * NOTE(review): (1 << curcpu) is an int shift; if curcpu can reach 31 this
 * shifts into the sign bit.  The same pattern is used with i and j below.
 */
1420 if (cpumask != (1 << curcpu))
1421 panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n",
1422 cpumask, (1 << curcpu));
/* No other CPU has this pmap TLB-active. */
1426 if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0)
/*
 * Targets: for a non-zero context the CPUs recorded in pm_tlbactive minus
 * this one; the other assignment below uses every other CPU.
 */
1429 if (pmap->pm_context != 0)
1430 active_total = active = (pmap->pm_tlbactive & ~cpumask);
1433 active_total = active = PCPU_GET(other_cpus);
/* Translate the target mask into a list of CPU ids for cpu_ipi_selected(). */
1440 for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) {
1441 if ((cpus & 0x1) == 0)
1444 curactive |= (1 << i);
1445 cpulist[cpu_count] = (uint16_t)i;
1450 cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1,
1451 (uint64_t)arg2, (uint64_t *)&ackmask);
/* Wait until the acknowledged set equals the set that was signalled. */
1453 while (ackmask != curactive) {
1461 printf("cpu with cpumask=0x%x appears to not be responding to ipis\n",
1462 curactive & ~ackmask);
/* Dump per-CPU diagnostics for the unresponsive CPUs, once only. */
1465 if (!trap_trace_report_done) {
1466 trap_trace_report_done = 1;
1467 for (j = 0; j < MAXCPU; j++)
1468 if (((1 << j) & curactive & ~ackmask) != 0) {
1469 struct pcpu *pc = pcpu_find(j);
1470 printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n",
1471 pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3],
1472 pc->pad[4], pc->pad[5], pc->pad[6]);
1473 trap_trace_report(j);
1478 hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state);
1479 printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state);
1481 printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n"
1482 "and then I'm going to panic\n");
1488 panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive);
/*
 * Pick up CPUs that became TLB-active for this pmap while the cross-call was
 * in flight and have not been signalled yet.
 */
1492 active_total |= curactive;
1493 if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) {
1494 printf("pmap_ipi: retrying");
1498 return (active_total);
/*
 * pmap_invalidate_page: drop the translation for one page.  When cleartsb
 * is TRUE the TSB entry for va is cleared first; the page is then flushed
 * from the local TLB with invlpg() and tl_invlpg is cross-called on the
 * other CPUs through pmap_ipi() with va and the context as arguments.
 */
1503 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb)
1506 if (cleartsb == TRUE)
1507 tsb_clear_tte(&pmap->pm_tsb, va);
1509 DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va);
1511 invlpg(va, pmap->pm_context);
1513 pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context);
/*
 * pmap_invalidate_range: flush the translations for [sva, eva) in the given
 * pmap, locally and -- through pmap_ipi() -- on the other CPUs.
 *
 *   pmap      address space whose translations are flushed
 *   sva, eva  start (inclusive) and end (exclusive) virtual addresses;
 *             sva must be below eva (KASSERT)
 *   cleartsb  when TRUE the matching TSB entries are cleared first
 *
 * A range of exactly one page is handed to pmap_invalidate_page().  Ranges
 * shorter than 64 pages are flushed page by page; for longer ranges of a
 * pmap with a non-zero context the whole context is flushed instead.
 */
1519 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb)
1521 vm_offset_t tva, invlrngva;
1526 if ((eva - sva) == PAGE_SIZE) {
1527 pmap_invalidate_page(pmap, sva, cleartsb);
1532 KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva));
1534 if (cleartsb == TRUE)
1535 tsb_clear_range(&pmap->pm_tsb, sva, eva);
/*
 * Compare the length of the range, eva - sva.  The operands used to be
 * reversed; since sva < eva, sva - eva is negative (or, for an unsigned
 * type, wraps to a huge value), so it never measured the range length and
 * the per-page path was selected incorrectly.
 */
1538 if ((eva - sva) < PAGE_SIZE*64) {
1539 for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
1540 invlpg(tva, pmap->pm_context);
1542 } else if (pmap->pm_context) {
1544 invlctx(pmap->pm_context);
/* Cross-call argument: start address with the page count in the low bits. */
1551 invlrngva = sva | ((eva - sva) >> PAGE_SHIFT);
1552 active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva);
/*
 * CPUs that were signalled but on which the pmap is not currently active are
 * removed from pm_tlbactive.
 */
1553 active &= ~pmap->pm_active;
1554 atomic_clear_int(&pmap->pm_tlbactive, active);
/*
 * pmap_invalidate_all: discard every cached translation of a user pmap:
 * clear its TSB, flush its context from the local TLB, cross-call
 * tl_invlctx on the other CPUs, then reset pm_tlbactive to pm_active.
 * Must not be called on kernel_pmap (KASSERT).
 * NOTE(review): pm_tlbactive is written with a plain store here while
 * pmap_invalidate_range() updates it with atomic_clear_int() -- confirm
 * that this is safe.
 */
1560 pmap_invalidate_all(pmap_t pmap)
1563 KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap"));
1565 tsb_clear(&pmap->pm_tsb);
1568 invlctx(pmap->pm_context);
1570 pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0);
1571 pmap->pm_tlbactive = pmap->pm_active;
/*
 * pmap_is_modified: return what tte_get_phys_bit() reports for the VTD_W
 * bit of page m.
 */
1577 pmap_is_modified(vm_page_t m)
1580 return (tte_get_phys_bit(m, VTD_W));
/*
 * pmap_is_prefaultable: true when the TTE hash of the pmap holds no entry
 * for va (the lookup yields 0).
 */
1585 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
1587 return (tte_hash_lookup(pmap->pm_hash, va) == 0);
1591 * Extract the physical page address associated with the given kernel virtual
/*
 * pmap_kextract: translate a kernel virtual address by trying, in order,
 * the nucleus mapping table, the TSB, and the TTE hash of kernel_pmap.  The
 * later sources are consulted only while pa is still 0.
 */
1596 pmap_kextract(vm_offset_t va)
/*
 * NOTE(review): the lower bound is strict, so va == KERNBASE is not
 * resolved through nucleus_mappings[] -- confirm this is intended.
 */
1602 if (va > KERNBASE && va < KERNBASE + nucleus_memory) {
1604 offset = va - KERNBASE;
/* One table slot per 4MB (offset >> 22); keep the offset within that page. */
1605 pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M);
1607 if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0)
1608 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1610 if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0)
1611 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1617 * Map a range of physical addresses into kernel virtual address space.
1619 * The value passed in *virt is a suggested virtual address for the mapping.
1620 * Architectures which can support a direct-mapped physical to virtual region
1621 * can return the appropriate address within that region, leaving '*virt'
/*
 * pmap_map: return the direct-mapped virtual address of the physical
 * address start.  The return statement does not use virt, end or prot.
 */
1625 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1627 return TLB_PHYS_TO_DIRECT(start);
/*
 * pmap_mincore(pmap, addr).
 * NOTE(review): no statement of the body is visible after this header --
 * confirm what it returns.
 */
1631 pmap_mincore(pmap_t pmap, vm_offset_t addr)
/*
 * pmap_object_init_pt: the only visible statement prints the name of the
 * function; none of the arguments is used by it.
 */
1637 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
1638 vm_pindex_t index, vm_size_t size)
1640 printf("pmap_object_init_pt\n");
1645 * Returns true if the pmap's pv is one of the first
1646 * 16 pvs linked to from this page. This count may
1647 * be changed upwards or downwards in the future; it
1648 * is only necessary that true be returned for a small
1649 * subset of pmaps for proper page aging.
/*
 * pmap_page_exists_quick: scan the pv list of page m for an entry that
 * belongs to pmap.  Fictitious pages are special-cased before the scan.
 * The page-queues mutex must be held (asserted).
 */
1652 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
1657 if (m->flags & PG_FICTITIOUS)
1660 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1661 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1662 if (pv->pv_pmap == pmap) {
1673 * Initialize a vm_page's machine-dependent fields.
/*
 * pmap_page_init: start page m with an empty pv list and a pv count of
 * zero.
 */
1676 pmap_page_init(vm_page_t m)
1679 TAILQ_INIT(&m->md.pv_list);
1680 m->md.pv_list_count = 0;
1683 * Lower the permission for all mappings to a given page.
/*
 * pmap_remove_write: clear the VTD_SW_W and VTD_W bits for page m through
 * tte_clear_phys_bit() and drop its PG_WRITEABLE flag.  The page-queues
 * mutex must be held (asserted).
 */
1686 pmap_remove_write(vm_page_t m)
/* NOTE(review): presumably returns early when the page is not writeable. */
1688 if ((m->flags & PG_WRITEABLE) == 0)
1690 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1691 tte_clear_phys_bit(m, VTD_SW_W|VTD_W);
1692 vm_page_flag_clear(m, PG_WRITEABLE);
1695 * Initialize the pmap associated with process 0.
/*
 * pmap_pinit0: set up a pmap that uses context 0 and shares the TSB real
 * address and the TTE hash of kernel_pmap.  Both CPU masks are set to all
 * ones, the pmap becomes curpmap of the executing CPU, and its pv list and
 * statistics start out empty.
 */
1698 pmap_pinit0(pmap_t pmap)
1700 PMAP_LOCK_INIT(pmap);
1701 pmap->pm_active = pmap->pm_tlbactive = ~0;
1702 pmap->pm_context = 0;
1703 pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra;
1704 pmap->pm_hash = kernel_pmap->pm_hash;
1706 PCPU_SET(curpmap, pmap);
1708 TAILQ_INIT(&pmap->pm_pvlist);
1709 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1713 * Initialize a preallocated and zeroed pmap structure, such as one in a
1714 * vmspace structure.
/*
 * pmap_pinit: give the pmap a context from get_context(), create its TTE
 * hash and initial TSB, and reset its counters, CPU masks, retired-TSB
 * slots, pv list, lock and statistics.
 */
1717 pmap_pinit(pmap_t pmap)
1721 pmap->pm_context = get_context();
/* Physical address of the TSB descriptor embedded in the pmap itself. */
1722 pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb);
/* Hash and TSB creation both run with the page queues locked. */
1724 vm_page_lock_queues();
1725 pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch);
1726 tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT);
1727 vm_page_unlock_queues();
1728 pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
1729 pmap->pm_active = pmap->pm_tlbactive = 0;
/* No retired TSBs yet. */
1730 for (i = 0; i < TSB_MAX_RESIZE; i++)
1731 pmap->pm_old_tsb_ra[i] = 0;
1733 TAILQ_INIT(&pmap->pm_pvlist);
1734 PMAP_LOCK_INIT(pmap);
1735 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1740 * Set the physical protection on the specified range of this map as requested.
/*
 * pmap_protect: reduce the permissions of the mappings in [sva, eva).
 * Without read permission the range is removed outright.  Otherwise the
 * TTE bits matching the permissions that are not granted are cleared page
 * by page, the referenced/modified state of managed pages is passed on to
 * their vm_page, and the range is invalidated at the end.
 */
1743 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1750 DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot);
1752 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1753 pmap_remove(pmap, sva, eva);
/*
 * Write and execute are both granted.
 * NOTE(review): presumably nothing needs clearing and the function returns.
 */
1757 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
1758 (VM_PROT_WRITE|VM_PROT_EXECUTE))
1761 clearbits = anychanged = 0;
/* Losing write permission clears both the hardware and the software bit. */
1763 if ((prot & VM_PROT_WRITE) == 0)
1764 clearbits |= (VTD_W|VTD_SW_W);
1765 if ((prot & VM_PROT_EXECUTE) == 0)
1768 vm_page_lock_queues();
1770 for (tva = sva; tva < eva; tva += PAGE_SIZE) {
/* otte_data is the TTE as it was before the bits were cleared. */
1774 if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva,
1778 * XXX technically we should do a shootdown if it
1779 * was referenced and was executable - but is not now
/* Only a previously written mapping marks the range as changed. */
1781 if (!anychanged && (otte_data & VTD_W))
1784 if (otte_data & VTD_MANAGED) {
1787 if (otte_data & VTD_REF) {
1788 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1789 vm_page_flag_set(m, PG_REFERENCED);
1791 if (otte_data & VTD_W) {
1792 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1798 vm_page_unlock_queues();
/* NOTE(review): presumably guarded by anychanged -- confirm. */
1800 pmap_invalidate_range(pmap, sva, eva, TRUE);
1805 * Map a list of wired pages into kernel virtual address space. This is
1806 * intended for temporary mappings which do not need page modification or
1807 * references recorded. Existing mappings in the region are overwritten.
/*
 * pmap_qenter: install count kernel mappings starting at sva, one 8K kernel
 * TTE per page, directly in the TTE hash of kernel_pmap.  The previous TTEs
 * are OR-ed together in otte; if any of them had VTD_REF set, the range
 * written so far, [sva, va), is invalidated without clearing the TSB.
 */
1810 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
1817 while (count-- > 0) {
1818 otte |= tte_hash_update(kernel_pmap->pm_hash, va,
1819 VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K);
1823 if ((otte & VTD_REF) != 0)
1824 pmap_invalidate_range(kernel_pmap, sva, va, FALSE);
1828 * Remove page mappings from kernel virtual address space. Intended for
1829 * temporary mappings entered by pmap_qenter.
/*
 * pmap_qremove: delete count kernel mappings starting at sva from the TTE
 * hash of kernel_pmap.  The deleted TTEs are OR-ed together in otte; if any
 * of them had VTD_REF set, [sva, va) is invalidated and its TSB entries are
 * cleared as well.
 */
1832 pmap_qremove(vm_offset_t sva, int count)
1840 while (count-- > 0) {
1841 otte |= tte_hash_delete(kernel_pmap->pm_hash, va);
1844 if ((otte & VTD_REF) != 0)
1845 pmap_invalidate_range(kernel_pmap, sva, va, TRUE);
1849 * Release any resources held by the given physical map.
1850 * Called when a pmap initialized by pmap_pinit is being released.
1851 * Should only be called if the map contains no valid mappings.
/*
 * pmap_release: tear down a pmap that no longer has resident mappings
 * (asserted): deinitialise its TSB, destroy its TTE hash, return its
 * context number and destroy its lock.
 */
1854 pmap_release(pmap_t pmap)
1856 KASSERT(pmap->pm_stats.resident_count == 0,
1857 ("pmap_release: pmap resident count %ld != 0",
1858 pmap->pm_stats.resident_count));
1860 tsb_deinit(&pmap->pm_tsb);
1861 tte_hash_destroy(pmap->pm_hash);
1862 free_context(pmap->pm_context);
1863 PMAP_LOCK_DESTROY(pmap);
1867 * Remove the given range of addresses from the specified map.
/*
 * pmap_remove: delete the mappings in [start, end) page by page from the
 * TTE hash, pass each deleted TTE to pmap_remove_tte() for bookkeeping, and
 * invalidate the range afterwards.  The loop runs with the page queues
 * locked.
 */
1870 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1876 * Perform an unsynchronized read. This is, however, safe.
1878 if (pmap->pm_stats.resident_count == 0)
1881 DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n",
1884 vm_page_lock_queues();
1886 for (tva = start; tva < end; tva += PAGE_SIZE) {
/* A zero TTE means nothing was mapped at tva. */
1887 if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0)
1889 pmap_remove_tte(pmap, tte_data, tva);
/*
 * NOTE(review): presumably records that a referenced or written mapping was
 * removed, so that the invalidation below can be made conditional.
 */
1890 if (tte_data & (VTD_REF|VTD_W))
1893 vm_page_unlock_queues();
1895 pmap_invalidate_range(pmap, start, end, TRUE);
1900 * Routine: pmap_remove_all
1902 * Removes this physical page from
1903 * all physical maps in which it resides.
1904 * Reflects back modify bits to the pager.
1907 * Original versions of this routine were very
1908 * inefficient because they iteratively called
1909 * pmap_remove (slow...)
/*
 * pmap_remove_all: remove every mapping of page m.  Each pv entry on the
 * page is processed with its pmap locked: the TTE is deleted from the hash,
 * the counters of the pmap are adjusted, the referenced state is passed on
 * to the page, the translation is invalidated and the pv entry is unlinked
 * from both lists.  Finally PG_WRITEABLE is cleared on the page.  The
 * page-queues mutex must be held (asserted).
 */
1913 pmap_remove_all(vm_page_t m)
1917 DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m));
1919 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
/* Always take the head; the loop ends once the pv list is empty. */
1920 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1921 PMAP_LOCK(pv->pv_pmap);
1922 pv->pv_pmap->pm_stats.resident_count--;
1924 tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va);
1926 if (tte_data & VTD_WIRED)
1927 pv->pv_pmap->pm_stats.wired_count--;
1928 if (tte_data & VTD_REF)
1929 vm_page_flag_set(m, PG_REFERENCED);
1932 * Update the vm_page_t clean and reference bits.
/* A hardware-written TTE must also carry the software write bit. */
1934 if (tte_data & VTD_W) {
1935 KASSERT((tte_data & VTD_SW_W),
1936 ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx",
1937 pv->pv_va, tte_data));
1941 pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE);
1942 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1943 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1944 m->md.pv_list_count--;
1945 PMAP_UNLOCK(pv->pv_pmap);
1948 vm_page_flag_clear(m, PG_WRITEABLE);
/*
 * pmap_remove_entry: find the pv entry for (pmap, va) and unlink it from
 * the pv list of page m and from the pv list of the pmap.  PG_WRITEABLE is
 * cleared on the page once its pv list is empty.  The pmap lock and the
 * page-queues mutex must be held (asserted).
 */
1952 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1955 if (pmap != kernel_pmap)
1956 DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1957 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1958 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
/*
 * Search the list of the page when it has fewer entries than the pmap has
 * resident pages.
 * NOTE(review): the second loop is presumably the else branch.
 */
1959 if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1960 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1961 if (pmap == pv->pv_pmap && va == pv->pv_va)
/* On the list of the pmap itself, matching on va alone is enough. */
1965 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1966 if (va == pv->pv_va)
1970 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m)));
1971 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1972 m->md.pv_list_count--;
1973 if (TAILQ_EMPTY(&m->md.pv_list))
1974 vm_page_flag_clear(m, PG_WRITEABLE);
1975 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
/*
 * pmap_remove_pages: walk the pv list of the pmap, deleting the TTE of each
 * entry from the hash, adjusting the counters and unlinking the pv entry
 * from both lists.  Afterwards the TTE hash and the TSB are reset and, with
 * the page queues unlocked again, the whole pmap is invalidated.
 */
1981 pmap_remove_pages(pmap_t pmap)
1988 DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context);
1989 vm_page_lock_queues();
/* npv is fetched before pv is unlinked, so the walk survives the removal. */
1991 for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
1992 tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va);
1994 if (tte_data == 0) {
1995 printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va);
/*
 * NOTE(review): the decrement that follows the panic() call cannot execute;
 * either it is dead code or wired pages were meant to be handled here.
 */
1998 if (tte_data & VTD_WIRED) {
1999 panic("wired page in process not handled correctly");
2000 pmap->pm_stats.wired_count--;
2002 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2004 pmap->pm_stats.resident_count--;
2006 if (tte_data & VTD_W) {
2010 npv = TAILQ_NEXT(pv, pv_plist);
2011 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2013 m->md.pv_list_count--;
2014 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2015 if (TAILQ_EMPTY(&m->md.pv_list))
2016 vm_page_flag_clear(m, PG_WRITEABLE);
/* tte_hash_reset() hands back the hash to use from now on. */
2020 pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch);
2022 pmap_tsb_reset(pmap);
2024 vm_page_unlock_queues();
2025 pmap_invalidate_all(pmap);
/*
 * pmap_tsb_reset: release the TSBs recorded in pm_old_tsb_ra[].  Slots 1
 * and up are freed directly.  If slot 0 is set, the pmap switches back to
 * the TSB recorded there, the TSB that was current until then is freed, and
 * the scratchpad value is rebuilt for the initial size.
 */
2030 pmap_tsb_reset(pmap_t pmap)
/* Slot i is freed as 2^(TSB_INIT_SHIFT + i) pages; stop at the first empty slot. */
2034 for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) {
2035 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]),
2036 (1 << (TSB_INIT_SHIFT + i)));
2037 pmap->pm_old_tsb_ra[i] = 0;
2039 if (pmap->pm_old_tsb_ra[0] != 0) {
/* Address and size of the current TSB, captured before the descriptor is rewritten. */
2040 vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra;
2041 int size = tsb_size(&pmap->pm_tsb);
2042 pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT));
2043 pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0];
2044 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size);
2045 pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT;
2046 pmap->pm_old_tsb_ra[0] = 0;
/*
 * pmap_scrub_pages: ask hv_mem_scrub() to scrub size bytes starting at pa
 * and reduce size by the number of bytes it reports as zeroed.
 * NOTE(review): the value returned by hv_mem_scrub() is discarded on the
 * line below; if this runs in a loop until size reaches zero, a call that
 * zeroes nothing would never make progress -- confirm.
 */
2051 pmap_scrub_pages(vm_paddr_t pa, int64_t size)
2053 uint64_t bytes_zeroed;
2055 hv_mem_scrub(pa, size, &bytes_zeroed);
2057 size -= bytes_zeroed;
/*
 * pmap_remove_tte: bookkeeping for a TTE that has already been taken out of
 * the hash: adjust the wired and resident counts and, for managed pages,
 * pass the referenced state on to the vm_page and drop the matching pv
 * entry.  The page-queues mutex and the pmap lock must be held (asserted).
 */
2062 pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va)
2067 if (pmap != kernel_pmap)
2068 DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data));
2070 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2071 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2072 if (tte_data & VTD_WIRED)
2073 pmap->pm_stats.wired_count--;
2075 pmap->pm_stats.resident_count--;
2077 if (tte_data & VTD_MANAGED) {
2078 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2079 if (tte_data & VTD_W) {
2082 if (tte_data & VTD_REF)
2083 vm_page_flag_set(m, PG_REFERENCED);
2084 pmap_remove_entry(pmap, m, va);
2088 /* resize the tsb if the number of capacity misses is greater than 1/4 of
/*
 * pmap_tsb_resize: for the current pmap (asserted), double the TSB when it
 * is still below its maximum size and the capacity misses exceed half of
 * all recorded misses.  Both miss counters are zeroed at the end.
 * NOTE(review): the comment preceding this function speaks of 1/4 of the
 * misses, while the test below compares against miss_count >> 1, that is
 * one half -- confirm which is intended.
 */
2092 pmap_tsb_resize(pmap_t pmap)
2094 uint32_t miss_count;
2095 uint32_t cap_miss_count;
2096 struct tsb_resize_info info;
2097 hv_tsb_info_t hvtsb;
2098 uint64_t tsbscratch;
2100 KASSERT(pmap == curthread_pmap, ("operating on non-current pmap"));
2101 miss_count = pmap->pm_tsb_miss_count;
2102 cap_miss_count = pmap->pm_tsb_cap_miss_count;
2103 int npages_shift = tsb_page_shift(pmap);
2105 if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) &&
2106 cap_miss_count > (miss_count >> 1)) {
2107 DPRINTF("resizing tsb for proc=%s pid=%d\n",
2108 curthread->td_proc->p_comm, curthread->td_proc->p_pid);
/* Remember the outgoing TSB in the slot that matches its size step. */
2109 pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra;
2111 /* double TSB size */
2112 tsb_init(&hvtsb, &tsbscratch, npages_shift + 1);
/*
 * NOTE(review): two install sequences follow; they look like alternative
 * branches, and only the first one notifies other CPUs through pmap_ipi().
 * The condition selecting between them is not visible here.
 */
2116 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t));
2117 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2119 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2120 panic("failed to set TSB 0x%lx - context == %ld\n",
2121 pmap->pm_tsb_ra, pmap->pm_context);
/* The other CPUs receive the physical address of this on-stack structure. */
2122 info.tri_tsbscratch = pmap->pm_tsbscratch;
2123 info.tri_tsb_ra = pmap->pm_tsb_ra;
2124 pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info));
2125 pmap->pm_tlbactive = pmap->pm_active;
2128 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb));
2129 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2130 panic("failed to set TSB 0x%lx - context == %ld\n",
2131 pmap->pm_tsb_ra, pmap->pm_context);
2132 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2135 pmap->pm_tsb_miss_count = 0;
2136 pmap->pm_tsb_cap_miss_count = 0;
/*
 * pmap_tte_hash_resize: replace the TTE hash of the pmap with the one
 * returned by tte_hash_resize(), notify other CPUs when the current process
 * has more than one thread, refresh the hash scratchpad value and destroy
 * the old hash.
 */
2140 pmap_tte_hash_resize(pmap_t pmap)
2142 tte_hash_t old_th = pmap->pm_hash;
2144 pmap->pm_hash = tte_hash_resize(pmap->pm_hash);
/*
 * NOTE(review): pm_hashscratch is passed to the other CPUs before it is
 * recomputed two lines further down -- confirm that the value sent already
 * describes the new hash.
 */
2146 if (curthread->td_proc->p_numthreads != 1)
2147 pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch);
2149 pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
2151 tte_hash_destroy(old_th);
2155 * pmap_ts_referenced:
2157 * Return a count of reference bits for a page, clearing those bits.
2158 * It is not necessary for every reference bit to be cleared, but it
2159 * is necessary that 0 only be returned when there are truly no
2160 * reference bits set.
2162 * XXX: The exact number of bits to check and clear is a matter that
2163 * should be tested and standardized at some point in the future for
2164 * optimal aging of shared pages.
/*
 * pmap_ts_referenced: walk the pv list of page m, clearing VTD_REF in the
 * TTE of each visited mapping and invalidating the translation of every
 * mapping whose bit was set.  Fictitious pages are special-cased first.
 * The page-queues mutex must be held (asserted).
 */
2168 pmap_ts_referenced(vm_page_t m)
2172 pv_entry_t pv, pvf, pvn;
2177 if (m->flags & PG_FICTITIOUS)
2180 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2181 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
/* Remember the successor, then move the visited entry to the tail. */
2186 pvn = TAILQ_NEXT(pv, pv_list);
2188 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2190 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
/* otte_data is the TTE as it was before VTD_REF was cleared. */
2194 otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF);
2195 if ((otte_data & VTD_REF) != 0) {
2196 pmap_invalidate_page(pmap, pv->pv_va, TRUE);
/*
 * Stop at the end of the list or on reaching pvf again.
 * NOTE(review): pvf is presumably the entry the walk started with.
 */
2206 } while ((pv = pvn) != NULL && pv != pvf);
/*
 * pmap_zero_page: clear PAGE_SIZE bytes of page m with hwblkclr(), going
 * through the direct-mapped address of the page.
 */
2212 pmap_zero_page(vm_page_t m)
2214 hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
/*
 * pmap_zero_page_area: zero size bytes at offset off within page m.  A
 * request covering the whole page goes through hwblkclr(); the bzero()
 * call handles a sub-range.
 * NOTE(review): the bzero() line is presumably the else branch.
 */
2218 pmap_zero_page_area(vm_page_t m, int off, int size)
2223 pa = VM_PAGE_TO_PHYS(m);
2224 va = TLB_PHYS_TO_DIRECT(pa);
2225 if (off == 0 && size == PAGE_SIZE)
2226 hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2228 bzero((char *)(va + off), size);
/*
 * pmap_zero_page_idle: clear PAGE_SIZE bytes of page m with hwblkclr(),
 * using the same call as pmap_zero_page().
 */
2233 pmap_zero_page_idle(vm_page_t m)
2235 hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
/*
 * pmap_set_ctx_panic: panic with a message that reports the context of the
 * pmap, the TSB real address passed in, the TSB scratchpad value of the
 * pmap and the error code.
 */
2239 pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap)
2241 panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx",
2242 pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error);