/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
 *	Manages physical address maps.
 *
 *	In addition to hardware address maps, this
 *	module is called upon to provide software-use-only
 *	maps which may or may not be stored in the same
 *	form as hardware maps.  These pseudo-maps are
 *	used to store intermediate results from copy
 *	operations to and from address spaces.
 *
 *	Since the information managed by this module is
 *	also stored by the logical address mapping module,
 *	this module may throw away valid virtual-to-physical
 *	mappings at almost any time.  However, invalidations
 *	of virtual-to-physical mappings must be done as
 *	requested.
 *
 *	In order to cope with hardware architectures which
 *	make virtual-to-physical map invalidates expensive,
 *	this module may delay invalidation or reduced-protection
 *	operations until such time as they are actually
 *	necessary.  This module is given full information as
 *	to which processors are currently using which maps,
 *	and as to when physical maps must be made correct.
 */
107 #include "opt_pmap.h"
108 #include "opt_msgbuf.h"
110 #include "opt_xbox.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#ifdef SMP
#include <sys/smp.h>
#endif

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/uma.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <machine/specialreg.h>
#ifdef SMP
#include <machine/smp.h>
#endif

#ifdef XBOX
#include <machine/xbox.h>
#endif
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#if !defined(PMAP_DIAGNOSTIC)
#define PMAP_INLINE __inline
#else
#define PMAP_INLINE
#endif

#ifdef PV_STATS
#define PV_STAT(x)	do { x ; } while (0)
#else
#define PV_STAT(x)	do { } while (0)
#endif
/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
#define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
#define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
#define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
#define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)

#define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
    atomic_clear_int((u_int *)(pte), PG_W))
#define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
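
/*
 * Illustrative sketch, not part of the original source: how the macros
 * above decompose a 32-bit virtual address on non-PAE i386, where
 * PDRSHIFT is 22 and one page table maps NBPDR (4MB) of address space.
 * The helper below is hypothetical and compiled out.
 */
#if 0
static void
pmap_example_decompose(vm_offset_t va)
{
	unsigned pdi = va >> PDRSHIFT;			/* page directory index */
	unsigned pti = (va >> PAGE_SHIFT) & (NPTEPG - 1); /* page table index */

	printf("va %#x: pde %u, pte %u, offset %#x\n",
	    (u_int)va, pdi, pti, (u_int)(va & PAGE_MASK));
}
#endif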
struct pmap kernel_pmap_store;
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_paddr_t avail_end;	/* PA of last available physical page */
vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
int pgeflag = 0;		/* PG_G or-in */
int pseflag = 0;		/* PG_PS or-in */

static int nkpt;
vm_offset_t kernel_vm_end;
extern u_int32_t KERNend;

#ifdef PAE
static uma_zone_t pdptzone;
#endif

/*
 * Data for the pv entry allocation mechanism
 */
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
static int shpgperproc = PMAP_SHPGPERPROC;
/*
 * All those kernel PT submaps that BSD is so fond of
 */
struct sysmaps {
	struct	mtx lock;
	pt_entry_t *CMAP1;
	pt_entry_t *CMAP2;
	caddr_t	CADDR1;
	caddr_t	CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
pt_entry_t *CMAP1 = 0;
static pt_entry_t *CMAP3;
caddr_t CADDR1 = 0, ptvmmap = 0;
static caddr_t CADDR3;
struct msgbuf *msgbufp = 0;

/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

#ifdef SMP
extern pt_entry_t *SMPpt;
#endif
static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
	   &PMAP1changedcpu, 0,
	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
	   &PMAP1changed, 0,
	   "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
	   &PMAP1unchanged, 0,
	   "Number of times pmap_pte_quick didn't change PMAP1");
static struct mtx PMAP2mutex;
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
static void	pmap_clear_ptes(vm_page_t m, int bit);

static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
    vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m);

static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif

CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
/*
 * Move the kernel virtual free pointer to the next
 * 4MB.  This is used to help improve performance
 * by using a large (4MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

#ifndef DISABLE_PSE
	if (cpu_feature & CPUID_PSE)
		newaddr = (addr + PDRMASK) & ~PDRMASK;
#endif
	return newaddr;
}
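
/*
 * Worked example, not in the original source: with 4MB superpages,
 * PDRMASK is 0x3fffff, so an address such as 0xc0531000 rounds up to
 * the next 4MB boundary:
 *
 *	(0xc0531000 + 0x3fffff) & ~0x3fffff == 0xc0800000
 */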
/*
 *	Bootstrap the system enough to run with virtual memory.
 *
 *	On the i386 this is called after mapping has already been enabled
 *	and just syncs the pmap module with what has already been done.
 *	[We can't call it easily with mapping off since the kernel is not
 *	mapped with PA == VA, hence we would have to relocate every address
 *	from the linked base (virtual) address "KERNBASE" to the actual
 *	(physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
{
	vm_offset_t va;
	pt_entry_t *pte, *unused;
	struct sysmaps *sysmaps;
	int i;

	/*
	 * XXX The calculation of virtual_avail is wrong.  It's NKPT*PAGE_SIZE
	 * too large.  It should instead be correctly calculated in locore.s
	 * and not based on 'first' (which is a physical address, not a
	 * virtual address, for the start of unused physical memory).  The
	 * kernel page tables are NOT double mapped and thus should not be
	 * included in this calculation.
	 */
	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
	virtual_avail = pmap_kmem_choose(virtual_avail);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/*
	 * Initialize the kernel pmap (which is statically allocated).
	 */
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
#ifdef PAE
	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
#endif
	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	LIST_INIT(&allpmaps);
	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define	SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 * CMAP3 is used for the idle process page zeroing.
	 */
	for (i = 0; i < MAXCPU; i++) {
		sysmaps = &sysmaps_pcpu[i];
		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
	}
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP3, CADDR3, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)

	/*
	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, unused, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))

	/*
	 * ptemap is used for pmap_pte_quick
	 */
	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);

	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);

	virtual_avail = va;
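
	/*
	 * Illustrative expansion, not part of the original source: the CMAP3
	 * line above expands to roughly
	 *
	 *	CADDR3 = (caddr_t)va; va += 1 * PAGE_SIZE; CMAP3 = pte; pte += 1;
	 *
	 * i.e. each SYSMAP() invocation hands out one or more reserved VA
	 * pages together with a pointer to their (not yet valid) PTEs.
	 */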
#ifdef XBOX
	/* FIXME: This is gross, but needed for the XBOX.  Since we are at
	 * such an early stage, we cannot yet neatly map video memory ... :-(
	 * Better fixes are very welcome! */
	if (!arch_i386_is_xbox)
#endif
		for (i = 0; i < NKPT; i++)
			PTD[i] = 0;

	/* Turn on PG_G on kernel page(s) */
	pmap_set_pg();
}

/*
 * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 */
void
pmap_set_pg(void)
{
	pd_entry_t pdir;
	pt_entry_t *pte;
	vm_offset_t va, endva;
	int i;

	if (pgeflag == 0)
		return;

	i = KERNLOAD/NBPDR;
	endva = KERNBASE + KERNend;

	if (pseflag) {
		va = KERNBASE + KERNLOAD;
		while (va < endva) {
			pdir = kernel_pmap->pm_pdir[KPTDI + i];
			pdir |= pgeflag;
			kernel_pmap->pm_pdir[KPTDI + i] = PTD[KPTDI + i] = pdir;
			invltlb();	/* Play it safe, invltlb() every time */
			i++;
			va += NBPDR;
		}
	} else {
		va = (vm_offset_t)btext;
		while (va < endva) {
			pte = vtopte(va);
			if (*pte)
				*pte |= pgeflag;
			invltlb();	/* Play it safe, invltlb() every time */
			va += PAGE_SIZE;
		}
	}
}
/*
 *	Initialize a vm_page's machine-dependent fields.
 */
void
pmap_page_init(vm_page_t m)
{

	TAILQ_INIT(&m->md.pv_list);
	m->md.pv_list_count = 0;
}
#ifdef PAE
static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");

static void *
pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{
	*flags = UMA_SLAB_PRIV;
	return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
	    1, 0));
}
#endif
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(void)
{

	/*
	 * Initialize the address space (zone) for the pv entries.  Set a
	 * high water mark so that the system can recover from excessive
	 * numbers of pv entries.
	 */
	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	pv_entry_high_water = 9 * (pv_entry_max / 10);

#ifdef PAE
	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
#endif
}
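
/*
 * Worked example, not from the original source: with the default
 * shpgperproc of 200, maxproc of 2000, and 1,000,000 physical pages,
 * pv_entry_max = 200 * 2000 + 1000000 = 1,400,000, and the high-water
 * mark is 9 * (1400000 / 10) = 1,260,000 entries.
 */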
SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

static int
pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
	if (error == 0 && req->newptr) {
		shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
		pv_entry_high_water = 9 * (pv_entry_max / 10);
	}
	return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW,
    &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");

static int
pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
	if (error == 0 && req->newptr) {
		pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
		pv_entry_high_water = 9 * (pv_entry_max / 10);
	}
	return (error);
}
SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW,
    &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
/***************************************************
 * Low level helper routines.....
 ***************************************************/

#ifdef SMP
/*
 * For SMP, these functions have to use the IPI mechanism for coherence.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
	u_int cpumask;
	u_int other_cpus;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_ipi_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invlpg(va);
		smp_invlpg(va);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invlpg(va);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg(pmap->pm_active & other_cpus, va);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_ipi_mtx);
	else
		critical_exit();
}
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	u_int cpumask;
	u_int other_cpus;
	vm_offset_t addr;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_ipi_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
		smp_invlpg_range(sva, eva);
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			for (addr = sva; addr < eva; addr += PAGE_SIZE)
				invlpg(addr);
		if (pmap->pm_active & other_cpus)
			smp_masked_invlpg_range(pmap->pm_active & other_cpus,
			    sva, eva);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_ipi_mtx);
	else
		critical_exit();
}
void
pmap_invalidate_all(pmap_t pmap)
{
	u_int cpumask;
	u_int other_cpus;

	if (smp_started) {
		if (!(read_eflags() & PSL_I))
			panic("%s: interrupts disabled", __func__);
		mtx_lock_spin(&smp_ipi_mtx);
	} else
		critical_enter();
	/*
	 * We need to disable interrupt preemption but MUST NOT have
	 * interrupts disabled here.
	 * XXX we may need to hold schedlock to get a coherent pm_active
	 * XXX critical sections disable interrupts again
	 */
	if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
		invltlb();
		smp_invltlb();
	} else {
		cpumask = PCPU_GET(cpumask);
		other_cpus = PCPU_GET(other_cpus);
		if (pmap->pm_active & cpumask)
			invltlb();
		if (pmap->pm_active & other_cpus)
			smp_masked_invltlb(pmap->pm_active & other_cpus);
	}
	if (smp_started)
		mtx_unlock_spin(&smp_ipi_mtx);
	else
		critical_exit();
}
#else /* !SMP */
/*
 * Normal, non-SMP, 486+ invalidation functions.
 * We inline these within pmap.c for speed.
 */
PMAP_INLINE void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invlpg(va);
}

PMAP_INLINE void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t addr;

	if (pmap == kernel_pmap || pmap->pm_active)
		for (addr = sva; addr < eva; addr += PAGE_SIZE)
			invlpg(addr);
}

PMAP_INLINE void
pmap_invalidate_all(pmap_t pmap)
{

	if (pmap == kernel_pmap || pmap->pm_active)
		invltlb();
}
#endif /* !SMP */
/*
 * Are we current address space or kernel?  N.B. We return FALSE when
 * a pmap's page table is in use because a kernel thread is borrowing
 * it.  The borrowed page table can change spontaneously, making any
 * dependence on its continued use subject to a race condition.
 */
static __inline int
pmap_is_current(pmap_t pmap)
{

	return (pmap == kernel_pmap ||
	    (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
	    (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
}
/*
 * If the given pmap is not the current or kernel pmap, the returned pte must
 * be released by passing it to pmap_pte_release().
 */
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_lock(&PMAP2mutex);
		newpf = *pde & PG_FRAME;
		if ((*PMAP2 & PG_FRAME) != newpf) {
			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
			pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
		}
		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (NULL);
}

/*
 * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 * pointer to be invalid.
 */
static void
pmap_pte_release(pt_entry_t *pte)
{

	if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
		mtx_unlock(&PMAP2mutex);
}
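
/*
 * Usage sketch, not part of the original source: a caller examining a
 * possibly foreign pmap must pair pmap_pte() with pmap_pte_release():
 *
 *	pt_entry_t *pte;
 *
 *	PMAP_LOCK(pmap);
 *	pte = pmap_pte(pmap, va);
 *	if (pte != NULL) {
 *		... examine *pte ...
 *		pmap_pte_release(pte);
 *	}
 *	PMAP_UNLOCK(pmap);
 */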
static __inline void
invlcaddr(void *caddr)
{

	invlpg((u_int)caddr);
}
/*
 * Super fast pmap_pte routine best used when scanning
 * the pv lists.  This eliminates many coarse-grained
 * invltlb calls.  Note that many of the pv list
 * scans are across different pmaps.  It is very wasteful
 * to do an entire invltlb for checking a single mapping.
 *
 * If the given pmap is not the current pmap, vm_page_queue_mtx
 * must be held and curthread pinned to a CPU.
 */
static pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t newpf;
	pd_entry_t *pde;

	pde = pmap_pde(pmap, va);
	if (*pde & PG_PS)
		return (pde);
	if (*pde != 0) {
		/* are we current address space or kernel? */
		if (pmap_is_current(pmap))
			return (vtopte(va));
		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
		newpf = *pde & PG_FRAME;
		if ((*PMAP1 & PG_FRAME) != newpf) {
			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
#ifdef SMP
			PMAP1cpu = PCPU_GET(cpuid);
#endif
			invlcaddr(PADDR1);
			PMAP1changed++;
		} else
#ifdef SMP
		if (PMAP1cpu != PCPU_GET(cpuid)) {
			PMAP1cpu = PCPU_GET(cpuid);
			invlcaddr(PADDR1);
			PMAP1changedcpu++;
		} else
#endif
			PMAP1unchanged++;
		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
	}
	return (0);
}
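
/*
 * Usage sketch, not part of the original source: pmap_pte_quick() maps a
 * foreign page table through the per-CPU PMAP1 window, so the caller must
 * be pinned and hold the page queues lock; no release call is needed:
 *
 *	sched_pin();
 *	vm_page_lock_queues();
 *	PMAP_LOCK(pmap);
 *	pte = pmap_pte_quick(pmap, va);
 *	... use *pte ...
 *	PMAP_UNLOCK(pmap);
 *	vm_page_unlock_queues();
 *	sched_unpin();
 */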
/*
 *	Routine:	pmap_extract
 *	Function:
 *		Extract the physical page address associated
 *		with the given map/virtual_address pair.
 */
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
{
	vm_paddr_t rtval;
	pt_entry_t *pte;
	pd_entry_t pde;

	rtval = 0;
	PMAP_LOCK(pmap);
	pde = pmap->pm_pdir[va >> PDRSHIFT];
	if (pde != 0) {
		if ((pde & PG_PS) != 0) {
			rtval = (pde & ~PDRMASK) | (va & PDRMASK);
			PMAP_UNLOCK(pmap);
			return rtval;
		}
		pte = pmap_pte(pmap, va);
		rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
		pmap_pte_release(pte);
	}
	PMAP_UNLOCK(pmap);
	return (rtval);
}
/*
 *	Routine:	pmap_extract_and_hold
 *	Function:
 *		Atomically extract and hold the physical page
 *		with the given pmap and virtual address pair
 *		if that mapping permits the given protection.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
	pd_entry_t pde;
	pt_entry_t pte;
	vm_page_t m;

	m = NULL;
	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	pde = *pmap_pde(pmap, va);
	if (pde != 0) {
		if (pde & PG_PS) {
			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
				m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
				    (va & PDRMASK));
				vm_page_hold(m);
			}
		} else {
			sched_pin();
			pte = *pmap_pte_quick(pmap, va);
			if (pte != 0 &&
			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
				vm_page_hold(m);
			}
			sched_unpin();
		}
	}
	PMAP_UNLOCK(pmap);
	vm_page_unlock_queues();
	return (m);
}
/***************************************************
 * Low level mapping routines.....
 ***************************************************/

/*
 * Add a wired page to the kva.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kenter(vm_offset_t va, vm_paddr_t pa)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_store(pte, pa | PG_RW | PG_V | pgeflag);
}

/*
 * Remove a page from the kernel pagetables.
 * Note: not SMP coherent.
 */
PMAP_INLINE void
pmap_kremove(vm_offset_t va)
{
	pt_entry_t *pte;

	pte = vtopte(va);
	pte_clear(pte);
}
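
/*
 * Usage sketch, not from the original source: pmap_kenter()/pmap_kremove()
 * leave TLB coherence to the caller, so an SMP-safe temporary mapping is
 *
 *	pmap_kenter(va, pa);
 *	pmap_invalidate_page(kernel_pmap, va);
 *	... access (void *)va ...
 *	pmap_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 */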
/*
 *	Used to map a range of physical addresses into kernel
 *	virtual address space.
 *
 *	The value passed in '*virt' is a suggested virtual address for
 *	the mapping. Architectures which can support a direct-mapped
 *	physical to virtual region can return the appropriate address
 *	within that region, leaving '*virt' unchanged. Other
 *	architectures should map the pages starting at '*virt' and
 *	update '*virt' with the first usable address after the mapped
 *	region.
 */
vm_offset_t
pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
{
	vm_offset_t va, sva;

	va = sva = *virt;
	while (start < end) {
		pmap_kenter(va, start);
		va += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
	*virt = va;
	return (sva);
}
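
/*
 * Example, not from the original source: mapping a physical range during
 * boot and advancing the caller's VA cursor (addresses are made up):
 *
 *	vm_offset_t va = virtual_avail;
 *	vm_offset_t sva = pmap_map(&va, 0xa0000, 0xa0000 + 4 * PAGE_SIZE,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	virtual_avail = va;
 */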
/*
 * Add a list of wired pages to the kva.  This routine is only used for
 * temporary kernel mappings that do not need to have page modification
 * or references recorded.  Note that old mappings are simply written
 * over.  The page *must* be wired.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
		va += PAGE_SIZE;
		m++;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}

/*
 * This routine tears out page mappings from the
 * kernel -- it is meant only for temporary mappings.
 * Note: SMP coherent.  Uses a ranged shootdown IPI.
 */
void
pmap_qremove(vm_offset_t sva, int count)
{
	vm_offset_t va;

	va = sva;
	while (count-- > 0) {
		pmap_kremove(va);
		va += PAGE_SIZE;
	}
	pmap_invalidate_range(kernel_pmap, sva, va);
}
/***************************************************
 * Page table page management routines.....
 ***************************************************/

/*
 * This routine unholds page table pages, and if the hold count
 * drops to zero, then it decrements the wire count.
 */
static PMAP_INLINE int
pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{

	--m->wire_count;
	if (m->wire_count == 0)
		return _pmap_unwire_pte_hold(pmap, m);
	else
		return 0;
}

static int
_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
{
	vm_offset_t pteva;

	/*
	 * unmap the page table page
	 */
	pmap->pm_pdir[m->pindex] = 0;
	--pmap->pm_stats.resident_count;

	/*
	 * Do an invltlb to make the invalidated mapping
	 * take effect immediately.
	 */
	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
	pmap_invalidate_page(pmap, pteva);

	vm_page_free_zero(m);
	atomic_subtract_int(&cnt.v_wire_count, 1);
	return 1;
}

/*
 * After removing a page table entry, this routine is used to
 * conditionally free the page, and manage the hold/wire counts.
 */
static int
pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t ptepde;
	vm_page_t mpte;

	if (va >= VM_MAXUSER_ADDRESS)
		return 0;
	ptepde = *pmap_pde(pmap, va);
	mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
	return pmap_unwire_pte_hold(pmap, mpte);
}
/*
 * Initialize the pmap for process 0, which uses the statically allocated
 * boot page directory.
 */
void
pmap_pinit0(pmap_t pmap)
{

	PMAP_LOCK_INIT(pmap);
	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
#ifdef PAE
	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
#endif
	pmap->pm_active = 0;
	PCPU_SET(curpmap, pmap);
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
}
/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	vm_paddr_t pa;
	static int color;
	int i;

	PMAP_LOCK_INIT(pmap);

	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
	 */
	if (pmap->pm_pdir == NULL) {
		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
		    NBPTD);
#ifdef PAE
		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
		KASSERT(((vm_offset_t)pmap->pm_pdpt &
		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
		    ("pmap_pinit: pdpt misaligned"));
		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
		    ("pmap_pinit: pdpt above 4g"));
#endif
	}

	/*
	 * allocate the page directory page(s)
	 */
	for (i = 0; i < NPGPTD;) {
		m = vm_page_alloc(NULL, color++,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_ZERO);
		if (m == NULL)
			VM_WAIT;
		else
			ptdpg[i++] = m;
	}

	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);

	for (i = 0; i < NPGPTD; i++) {
		if ((ptdpg[i]->flags & PG_ZERO) == 0)
			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
	}

	mtx_lock_spin(&allpmaps_lock);
	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);
	/* Wire in kernel global address entries. */
	/* XXX copies current process, does not fill in MPPTDI */
	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
#ifdef SMP
	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
#endif

	/* install self-referential address mapping entry(s) */
	for (i = 0; i < NPGPTD; i++) {
		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
#ifdef PAE
		pmap->pm_pdpt[i] = pa | PG_V;
#endif
	}

	pmap->pm_active = 0;
	TAILQ_INIT(&pmap->pm_pvchunk);
	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}
/*
 * this routine is called if the page table page is not
 * mapped correctly.
 */
static vm_page_t
_pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
{
	vm_paddr_t ptepa;
	vm_page_t m;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

	/*
	 * Allocate a page table page.
	 */
	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
		if (flags & M_WAITOK) {
			PMAP_UNLOCK(pmap);
			vm_page_unlock_queues();
			VM_WAIT;
			vm_page_lock_queues();
			PMAP_LOCK(pmap);
		}

		/*
		 * Indicate the need to retry.  While waiting, the page table
		 * page may have been allocated.
		 */
		return (NULL);
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);

	/*
	 * Map the pagetable page into the process address space, if
	 * it isn't already there.
	 */

	pmap->pm_stats.resident_count++;

	ptepa = VM_PAGE_TO_PHYS(m);
	pmap->pm_pdir[ptepindex] =
		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);

	return m;
}
static vm_page_t
pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
{
	unsigned ptepindex;
	pd_entry_t ptepa;
	vm_page_t m;

	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));

retry:
	/*
	 * Calculate pagetable page index
	 */
	ptepindex = va >> PDRSHIFT;

	/*
	 * Get the page directory entry
	 */
	ptepa = pmap->pm_pdir[ptepindex];

	/*
	 * This supports switching from a 4MB page to a
	 * normal 4K page.
	 */
	if (ptepa & PG_PS) {
		pmap->pm_pdir[ptepindex] = 0;
		ptepa = 0;
		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
		pmap_invalidate_all(kernel_pmap);
	}

	/*
	 * If the page table page is mapped, we just increment the
	 * hold count, and activate it.
	 */
	if (ptepa) {
		m = PHYS_TO_VM_PAGE(ptepa);
		m->wire_count++;
	} else {
		/*
		 * Here if the pte page isn't mapped, or if it has
		 * been deallocated.
		 */
		m = _pmap_allocpte(pmap, ptepindex, flags);
		if (m == NULL && (flags & M_WAITOK))
			goto retry;
	}
	return (m);
}
/***************************************************
 * Pmap allocation/deallocation routines.
 ***************************************************/

#ifdef SMP
/*
 * Deal with a SMP shootdown of other users of the pmap that we are
 * trying to dispose of.  This can be a bit hairy.
 */
static u_int *lazymask;
static u_int lazyptd;
static volatile u_int lazywait;

void pmap_lazyfix_action(void);

void
pmap_lazyfix_action(void)
{
	u_int mymask = PCPU_GET(cpumask);

#ifdef COUNT_IPIS
	*ipi_lazypmap_counts[PCPU_GET(cpuid)]++;
#endif
	if (rcr3() == lazyptd)
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
	atomic_clear_int(lazymask, mymask);
	atomic_store_rel_int(&lazywait, 1);
}

static void
pmap_lazyfix_self(u_int mymask)
{

	if (rcr3() == lazyptd)
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
	atomic_clear_int(lazymask, mymask);
}

static void
pmap_lazyfix(pmap_t pmap)
{
	u_int mymask;
	u_int mask;
	u_int spins;

	while ((mask = pmap->pm_active) != 0) {
		spins = 50000000;
		mask = mask & -mask;	/* Find least significant set bit. */
		mtx_lock_spin(&smp_ipi_mtx);
#ifdef PAE
		lazyptd = vtophys(pmap->pm_pdpt);
#else
		lazyptd = vtophys(pmap->pm_pdir);
#endif
		mymask = PCPU_GET(cpumask);
		if (mask == mymask) {
			lazymask = &pmap->pm_active;
			pmap_lazyfix_self(mymask);
		} else {
			atomic_store_rel_int((u_int *)&lazymask,
			    (u_int)&pmap->pm_active);
			atomic_store_rel_int(&lazywait, 0);
			ipi_selected(mask, IPI_LAZYPMAP);
			while (lazywait == 0) {
				ia32_pause();
				if (--spins == 0)
					break;
			}
		}
		mtx_unlock_spin(&smp_ipi_mtx);
		if (spins == 0)
			printf("pmap_lazyfix: spun for 50000000\n");
	}
}

#else	/* SMP */
/*
 * Cleaning up on uniprocessor is easy.  For various reasons, we're
 * unlikely to have to even execute this code, including the fact
 * that the cleanup is deferred until the parent does a wait(2), which
 * means that another userland process has run.
 */
static void
pmap_lazyfix(pmap_t pmap)
{
	u_int cr3;

	cr3 = vtophys(pmap->pm_pdir);
	if (cr3 == rcr3()) {
		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
		pmap->pm_active &= ~(PCPU_GET(cpumask));
	}
}
#endif	/* SMP */
/*
 * Release any resources held by the given physical map.
 * Called when a pmap initialized by pmap_pinit is being released.
 * Should only be called if the map contains no valid mappings.
 */
void
pmap_release(pmap_t pmap)
{
	vm_page_t m, ptdpg[NPGPTD];
	int i;

	KASSERT(pmap->pm_stats.resident_count == 0,
	    ("pmap_release: pmap resident count %ld != 0",
	    pmap->pm_stats.resident_count));

	pmap_lazyfix(pmap);
	mtx_lock_spin(&allpmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mtx_unlock_spin(&allpmaps_lock);

	for (i = 0; i < NPGPTD; i++)
		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);

	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
	    sizeof(*pmap->pm_pdir));
#ifdef SMP
	pmap->pm_pdir[MPPTDI] = 0;
#endif

	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);

	vm_page_lock_queues();
	for (i = 0; i < NPGPTD; i++) {
		m = ptdpg[i];
#ifdef PAE
		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
		    ("pmap_release: got wrong ptd page"));
#endif
		m->wire_count--;
		atomic_subtract_int(&cnt.v_wire_count, 1);
		vm_page_free_zero(m);
	}
	vm_page_unlock_queues();
	PMAP_LOCK_DESTROY(pmap);
}
static int
kvm_size(SYSCTL_HANDLER_ARGS)
{
	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;

	return sysctl_handle_long(oidp, &ksize, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_size, "IU", "Size of KVM");

static int
kvm_free(SYSCTL_HANDLER_ARGS)
{
	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;

	return sysctl_handle_long(oidp, &kfree, 0, req);
}
SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
    0, 0, kvm_free, "IU", "Amount of KVM free");
/*
 * grow the number of kernel page table entries, if needed
 */
void
pmap_growkernel(vm_offset_t addr)
{
	struct pmap *pmap;
	vm_paddr_t ptppaddr;
	vm_page_t nkpg;
	pd_entry_t newpdir;
	pd_entry_t *pde;

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	if (kernel_vm_end == 0) {
		kernel_vm_end = KERNBASE;
		nkpt = 0;
		while (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
			nkpt++;
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
		}
	}
	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
	if (addr - 1 >= kernel_map->max_offset)
		addr = kernel_map->max_offset;
	while (kernel_vm_end < addr) {
		if (pdir_pde(PTD, kernel_vm_end)) {
			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
				kernel_vm_end = kernel_map->max_offset;
				break;
			}
			continue;
		}

		/*
		 * This index is bogus, but out of the way
		 */
		nkpg = vm_page_alloc(NULL, nkpt,
		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
		if (!nkpg)
			panic("pmap_growkernel: no memory to grow kernel");

		nkpt++;

		pmap_zero_page(nkpg);
		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
		pdir_pde(PTD, kernel_vm_end) = newpdir;

		mtx_lock_spin(&allpmaps_lock);
		LIST_FOREACH(pmap, &allpmaps, pm_list) {
			pde = pmap_pde(pmap, kernel_vm_end);
			pde_store(pde, newpdir);
		}
		mtx_unlock_spin(&allpmaps_lock);
		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
			kernel_vm_end = kernel_map->max_offset;
			break;
		}
	}
}
/***************************************************
 * page management routines.
 ***************************************************/

CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
CTASSERT(_NPCM == 11);

static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{

	return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
}

#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)

#define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
#define	PC_FREE10	0x0000fffful	/* Free values for index 10 */

static uint32_t pc_freemask[_NPCM] = {
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
	PC_FREE0_9, PC_FREE10
};
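
/*
 * Worked arithmetic, not in the original source: a pv_chunk occupies one
 * 4KB page; after its header there is room for _NPCPV == 336 pv entries,
 * tracked by _NPCM == 11 32-bit bitmap words.  Ten words are fully used
 * and only 16 bits of the eleventh are (10 * 32 + 16 == 336), which is
 * why PC_FREE10 is 0xffff.
 */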
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
	"Current number of pv entries");
#ifdef PV_STATS
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
	"Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
	"Total number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
	"Total number of pv entry chunks freed");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
	"Number of times a chunk page allocation failed");

static long pv_entry_frees, pv_entry_allocs;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
	"Total number of pv entries freed");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
	"Total number of pv entries allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
	"Current number of spare pv entries");

static int pmap_collect_inactive, pmap_collect_active;

SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
	"Number of times pmap_collect has been called on the inactive queue");
SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
	"Number of times pmap_collect has been called on the active queue");
#endif
/*
 * We are in a serious low memory condition.  Resort to
 * drastic measures to free some pages so we can allocate
 * another pv entry chunk.  This is normally called to
 * unmap inactive pages, and if necessary, active pages.
 */
static void
pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
{
	pmap_t pmap;
	pt_entry_t *pte, tpte;
	pv_entry_t next_pv, pv;
	vm_offset_t va;
	vm_page_t m;

	sched_pin();
	TAILQ_FOREACH(m, &vpq->pl, pageq) {
		if (m->hold_count || m->busy || (m->flags & PG_BUSY))
			continue;
		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
			va = pv->pv_va;
			pmap = PV_PMAP(pv);
			/* Avoid deadlock and lock recursion. */
			if (pmap > locked_pmap)
				PMAP_LOCK(pmap);
			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
				continue;
			pmap->pm_stats.resident_count--;
			pte = pmap_pte_quick(pmap, va);
			tpte = pte_load_clear(pte);
			KASSERT((tpte & PG_W) == 0,
			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
			if (tpte & PG_A)
				vm_page_flag_set(m, PG_REFERENCED);
			if (tpte & PG_M) {
				KASSERT((tpte & PG_RW),
	("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
				    va, (uintmax_t)tpte));
				vm_page_dirty(m);
			}
			pmap_invalidate_page(pmap, va);
			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
			if (TAILQ_EMPTY(&m->md.pv_list))
				vm_page_flag_clear(m, PG_WRITEABLE);
			m->md.pv_list_count--;
			pmap_unuse_pt(pmap, va);
			if (pmap != locked_pmap)
				PMAP_UNLOCK(pmap);
			free_pv_entry(locked_pmap, pv);
		}
	}
	sched_unpin();
}
/*
 * free the pv_entry back to the free list
 */
static void
free_pv_entry(pmap_t pmap, pv_entry_t pv)
{
	vm_page_t m;
	struct pv_chunk *pc;
	int idx, field, bit;

	PV_STAT(pv_entry_frees++);
	PV_STAT(pv_entry_spare++);
	pv_entry_count--;
	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	field = idx / 32;
	bit = idx % 32;
	pc->pc_map[field] |= 1ul << bit;
	/* move to head of list */
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	for (idx = 0; idx < _NPCM; idx++)
		if (pc->pc_map[idx] != pc_freemask[idx])
			return;
	PV_STAT(pv_entry_spare -= _NPCPV);
	PV_STAT(pc_chunk_count--);
	PV_STAT(pc_chunk_frees++);
	/* entire chunk is free, return it */
	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
	pmap_qremove((vm_offset_t)pc, 1);
	vm_page_lock_queues();
	vm_page_unwire(m, 0);
	vm_page_free(m);
	vm_page_unlock_queues();
	kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
}
/*
 * get a new pv_entry, allocating a block from the system
 * when needed.
 */
static pv_entry_t
get_pv_entry(pmap_t pmap, int try)
{
	static const struct timeval printinterval = { 60, 0 };
	static struct timeval lastprint;
	static vm_pindex_t colour;
	int bit, field;
	pv_entry_t pv;
	struct pv_chunk *pc;
	vm_page_t m;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	PV_STAT(pv_entry_allocs++);
	pv_entry_count++;
	if (pv_entry_count > pv_entry_high_water)
		pagedaemon_wakeup();
	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
	if (pc != NULL) {
		for (field = 0; field < _NPCM; field++) {
			if (pc->pc_map[field]) {
				bit = bsfl(pc->pc_map[field]);
				break;
			}
		}
		if (field < _NPCM) {
			pv = &pc->pc_pventry[field * 32 + bit];
			pc->pc_map[field] &= ~(1ul << bit);
			/* If this was the last item, move it to tail */
			for (field = 0; field < _NPCM; field++)
				if (pc->pc_map[field] != 0) {
					PV_STAT(pv_entry_spare--);
					return (pv);	/* not full, return */
				}
			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
			PV_STAT(pv_entry_spare--);
			return (pv);
		}
	}
	/* No free items, allocate another chunk */
	pc = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE);
	if (pc == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			return (NULL);
		}
		panic("get_pv_entry: out of kvm for pv entry chunk!");
	}
	m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		if (try) {
			pv_entry_count--;
			PV_STAT(pc_chunk_tryfail++);
			kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
			return (NULL);
		}
		/*
		 * Reclaim pv entries: At first, destroy mappings to inactive
		 * pages.  After that, if a pv chunk entry is still needed,
		 * destroy mappings to active pages.
		 */
		if (ratecheck(&lastprint, &printinterval))
			printf("Approaching the limit on PV entries, consider "
			    "increasing sysctl vm.pmap.shpgperproc or "
			    "vm.pmap.pv_entry_max\n");
		PV_STAT(pmap_collect_inactive++);
		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
		m = vm_page_alloc(NULL, colour,
		    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
		if (m == NULL) {
			PV_STAT(pmap_collect_active++);
			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
			m = vm_page_alloc(NULL, colour,
			    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
			if (m == NULL)
				panic("get_pv_entry: increase vm.pmap.shpgperproc");
		}
	}
	colour++;
	PV_STAT(pc_chunk_count++);
	PV_STAT(pc_chunk_allocs++);
	pmap_qenter((vm_offset_t)pc, &m, 1);
	pc->pc_pmap = pmap;
	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
	for (field = 1; field < _NPCM; field++)
		pc->pc_map[field] = pc_freemask[field];
	pv = &pc->pc_pventry[0];
	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
	PV_STAT(pv_entry_spare += _NPCPV - 1);
	return (pv);
}
static void
pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (pmap == PV_PMAP(pv) && va == pv->pv_va)
			break;
	}
	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count--;
	if (TAILQ_EMPTY(&m->md.pv_list))
		vm_page_flag_clear(m, PG_WRITEABLE);
	free_pv_entry(pmap, pv);
}
/*
 * Create a pv entry for page at pa for
 * (pmap, va).
 */
static void
pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	pv = get_pv_entry(pmap, FALSE);
	pv->pv_va = va;
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	m->md.pv_list_count++;
}
/*
 * Conditionally create a pv entry.
 */
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
	pv_entry_t pv;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	if (pv_entry_count < pv_entry_high_water &&
	    (pv = get_pv_entry(pmap, TRUE)) != NULL) {
		pv->pv_va = va;
		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
		m->md.pv_list_count++;
		return (TRUE);
	} else
		return (FALSE);
}
/*
 * pmap_remove_pte: do the things to unmap a page in a process
 */
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
{
	pt_entry_t oldpte;
	vm_page_t m;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	oldpte = pte_load_clear(ptq);
	if (oldpte & PG_W)
		pmap->pm_stats.wired_count -= 1;
	/*
	 * Machines that don't support invlpg, also don't support
	 * PG_G.
	 */
	if (oldpte & PG_G)
		pmap_invalidate_page(kernel_pmap, va);
	pmap->pm_stats.resident_count -= 1;
	if (oldpte & PG_MANAGED) {
		m = PHYS_TO_VM_PAGE(oldpte);
		if (oldpte & PG_M) {
			KASSERT((oldpte & PG_RW),
	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
			    va, (uintmax_t)oldpte));
			vm_page_dirty(m);
		}
		if (oldpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);
		pmap_remove_entry(pmap, m, va);
	}
	return (pmap_unuse_pt(pmap, va));
}
/*
 * Remove a single page from a process address space
 */
static void
pmap_remove_page(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte;

	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
		return;
	pmap_remove_pte(pmap, pte, va);
	pmap_invalidate_page(pmap, va);
}
/*
 *	Remove the given range of addresses from the specified map.
 *
 *	It is assumed that the start and end are properly
 *	rounded to the page size.
 */
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	int anyvalid;

	/*
	 * Perform an unsynchronized read.  This is, however, safe.
	 */
	if (pmap->pm_stats.resident_count == 0)
		return;

	anyvalid = 0;

	vm_page_lock_queues();
	sched_pin();
	PMAP_LOCK(pmap);

	/*
	 * special handling of removing one page.  a very
	 * common operation and easy to short circuit some
	 * code.
	 */
	if ((sva + PAGE_SIZE == eva) &&
	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
		pmap_remove_page(pmap, sva);
		goto out;
	}

	for (; sva < eva; sva = pdnxt) {
		unsigned pdirindex;

		/*
		 * Calculate index for next page table.
		 */
		pdnxt = (sva + NBPDR) & ~PDRMASK;
		if (pmap->pm_stats.resident_count == 0)
			break;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			pmap->pm_pdir[pdirindex] = 0;
			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
			anyvalid = 1;
			continue;
		}

		/*
		 * Limit our scan to either the end of the va represented
		 * by the current page table page, or to the end of the
		 * range being removed.
		 */
		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			if (*pte == 0)
				continue;

			/*
			 * The TLB entry for a PG_G mapping is invalidated
			 * by pmap_remove_pte().
			 */
			if ((*pte & PG_G) == 0)
				anyvalid = 1;
			if (pmap_remove_pte(pmap, pte, sva))
				break;
		}
	}
out:
	sched_unpin();
	vm_page_unlock_queues();
	if (anyvalid)
		pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
}
/*
 *	Routine:	pmap_remove_all
 *	Function:
 *		Removes this physical page from
 *		all physical maps in which it resides.
 *		Reflects back modify bits to the pager.
 *
 *	Notes:
 *		Original versions of this routine were very
 *		inefficient because they iteratively called
 *		pmap_remove (slow...)
 */
void
pmap_remove_all(vm_page_t m)
{
	pv_entry_t pv;
	pmap_t pmap;
	pt_entry_t *pte, tpte;

#if defined(PMAP_DIAGNOSTIC)
	/*
	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
	 */
	if (m->flags & PG_FICTITIOUS) {
		panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
		    VM_PAGE_TO_PHYS(m));
	}
#endif
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	sched_pin();
	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
		pmap = PV_PMAP(pv);
		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count--;
		pte = pmap_pte_quick(pmap, pv->pv_va);
		tpte = pte_load_clear(pte);
		if (tpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (tpte & PG_A)
			vm_page_flag_set(m, PG_REFERENCED);

		/*
		 * Update the vm_page_t clean and reference bits.
		 */
		if (tpte & PG_M) {
			KASSERT((tpte & PG_RW),
	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
			    pv->pv_va, (uintmax_t)tpte));
			vm_page_dirty(m);
		}
		pmap_invalidate_page(pmap, pv->pv_va);
		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
		m->md.pv_list_count--;
		pmap_unuse_pt(pmap, pv->pv_va);
		free_pv_entry(pmap, pv);
		PMAP_UNLOCK(pmap);
	}
	vm_page_flag_clear(m, PG_WRITEABLE);
	sched_unpin();
}
/*
 *	Set the physical protection on the
 *	specified range of this map as requested.
 */
void
pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
{
	vm_offset_t pdnxt;
	pd_entry_t ptpaddr;
	pt_entry_t *pte;
	int anychanged;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}

	if (prot & VM_PROT_WRITE)
		return;

	anychanged = 0;

	vm_page_lock_queues();
	sched_pin();
	PMAP_LOCK(pmap);
	for (; sva < eva; sva = pdnxt) {
		unsigned obits, pbits, pdirindex;

		pdnxt = (sva + NBPDR) & ~PDRMASK;

		pdirindex = sva >> PDRSHIFT;
		ptpaddr = pmap->pm_pdir[pdirindex];

		/*
		 * Weed out invalid mappings. Note: we assume that the page
		 * directory table is always allocated, and in kernel virtual.
		 */
		if (ptpaddr == 0)
			continue;

		/*
		 * Check for large page.
		 */
		if ((ptpaddr & PG_PS) != 0) {
			pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
			anychanged = 1;
			continue;
		}

		if (pdnxt > eva)
			pdnxt = eva;

		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
		    sva += PAGE_SIZE) {
			vm_page_t m;

retry:
			/*
			 * Regardless of whether a pte is 32 or 64 bits in
			 * size, PG_RW, PG_A, and PG_M are among the least
			 * significant 32 bits.
			 */
			obits = pbits = *(u_int *)pte;
			if (pbits & PG_MANAGED) {
				m = NULL;
				if (pbits & PG_A) {
					m = PHYS_TO_VM_PAGE(*pte);
					vm_page_flag_set(m, PG_REFERENCED);
					pbits &= ~PG_A;
				}
				if ((pbits & PG_M) != 0) {
					if (m == NULL)
						m = PHYS_TO_VM_PAGE(*pte);
					vm_page_dirty(m);
				}
			}

			pbits &= ~(PG_RW | PG_M);

			if (pbits != obits) {
				if (!atomic_cmpset_int((u_int *)pte, obits,
				    pbits))
					goto retry;
				if (obits & PG_G)
					pmap_invalidate_page(pmap, sva);
				else
					anychanged = 1;
			}
		}
	}
	sched_unpin();
	vm_page_unlock_queues();
	if (anychanged)
		pmap_invalidate_all(pmap);
	PMAP_UNLOCK(pmap);
}
/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    boolean_t wired)
{
	vm_paddr_t pa;
	pt_entry_t *pte;
	vm_paddr_t opa;
	pt_entry_t origpte, newpte;
	vm_page_t mpte, om;
	boolean_t invlva;

	va &= ~PAGE_MASK;
#ifdef PMAP_DIAGNOSTIC
	if (va > VM_MAX_KERNEL_ADDRESS)
		panic("pmap_enter: toobig");
	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
#endif

	mpte = NULL;

	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	sched_pin();

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		mpte = pmap_allocpte(pmap, va, M_WAITOK);
	}
#if 0 && defined(PMAP_DIAGNOSTIC)
	else {
		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
		origpte = *pdeaddr;
		if ((origpte & PG_V) == 0) {
			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
				pmap->pm_pdir[PTDPTDI], origpte, va);
		}
	}
#endif

	pte = pmap_pte_quick(pmap, va);

	/*
	 * Page Directory table entry not valid, we need a new PT page
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
	}

	pa = VM_PAGE_TO_PHYS(m);
	om = NULL;
	origpte = *pte;
	opa = origpte & PG_FRAME;

	if (origpte & PG_PS) {
		/*
		 * Yes, I know this will truncate upper address bits for PAE,
		 * but I'm actually more interested in the lower bits
		 */
		printf("pmap_enter: va %p, pte %p, origpte %p\n",
		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
		panic("pmap_enter: attempted pmap_enter on 4MB page");
	}

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->wire_count--;

		/*
		 * We might be turning off write access to the page,
		 * so we go ahead and sense modify status.
		 */
		if (origpte & PG_MANAGED) {
			om = m;
			pa |= PG_MANAGED;
		}
		goto validate;
	}
	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (origpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (origpte & PG_MANAGED) {
			om = PHYS_TO_VM_PAGE(opa);
			pmap_remove_entry(pmap, om, va);
		}
		if (mpte != NULL) {
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			    " va: 0x%x", va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
		    ("pmap_enter: managed mapping within the clean submap"));
		pmap_insert_entry(pmap, va, m);
		pa |= PG_MANAGED;
	}

	/*
	 * Increment counters
	 */
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | PG_V);
	if ((prot & VM_PROT_WRITE) != 0)
		newpte |= PG_RW;
	if (wired)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	/*
	 * if the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		if (origpte & PG_V) {
			invlva = FALSE;
			origpte = pte_load_store(pte, newpte | PG_A);
			if (origpte & PG_A) {
				if (origpte & PG_MANAGED)
					vm_page_flag_set(om, PG_REFERENCED);
				if (opa != VM_PAGE_TO_PHYS(m))
					invlva = TRUE;
			}
			if (origpte & PG_M) {
				KASSERT((origpte & PG_RW),
	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
				    va, (uintmax_t)origpte));
				if ((origpte & PG_MANAGED) != 0)
					vm_page_dirty(om);
				if ((prot & VM_PROT_WRITE) == 0)
					invlva = TRUE;
			}
			if (invlva)
				pmap_invalidate_page(pmap, va);
		} else
			pte_store(pte, newpte | PG_A);
	}
	sched_unpin();
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
}
/*
 * this code makes some *MAJOR* assumptions:
 * 1. Current pmap & pmap exists.
 * 2. Not wired.
 * 3. Read access.
 * 4. No page table pages.
 * but is *MUCH* faster than pmap_enter...
 */
vm_page_t
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    vm_page_t mpte)
{
	pt_entry_t *pte;
	vm_paddr_t pa;

	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
	    ("pmap_enter_quick: managed mapping within the clean submap"));
	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
	PMAP_LOCK(pmap);

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		unsigned ptepindex;
		pd_entry_t ptepa;

		/*
		 * Calculate pagetable page index
		 */
		ptepindex = va >> PDRSHIFT;
		if (mpte && (mpte->pindex == ptepindex)) {
			mpte->wire_count++;
		} else {
retry:
			/*
			 * Get the page directory entry
			 */
			ptepa = pmap->pm_pdir[ptepindex];

			/*
			 * If the page table page is mapped, we just increment
			 * the hold count, and activate it.
			 */
			if (ptepa) {
				if (ptepa & PG_PS)
					panic("pmap_enter_quick: unexpected mapping into 4MB page");
				mpte = PHYS_TO_VM_PAGE(ptepa);
				mpte->wire_count++;
			} else {
				mpte = _pmap_allocpte(pmap, ptepindex,
				    M_NOWAIT);
				if (mpte == NULL) {
					PMAP_UNLOCK(pmap);
					vm_page_busy(m);
					vm_page_unlock_queues();
					VM_OBJECT_UNLOCK(m->object);
					VM_WAIT;
					VM_OBJECT_LOCK(m->object);
					vm_page_lock_queues();
					vm_page_wakeup(m);
					PMAP_LOCK(pmap);
					goto retry;
				}
			}
		}
	} else {
		mpte = NULL;
	}

	/*
	 * This call to vtopte makes the assumption that we are
	 * entering the page into the current pmap.  In order to support
	 * quick entry into any pmap, one would likely use pmap_pte_quick.
	 * But that isn't as quick as vtopte.
	 */
	pte = vtopte(va);
	if (*pte) {
		if (mpte != NULL) {
			pmap_unwire_pte_hold(pmap, mpte);
			mpte = NULL;
		}
		goto out;
	}

	/*
	 * Enter on the PV list if part of our managed memory. Note that we
	 * raise IPL while manipulating pv_table since pmap_enter can be
	 * called at interrupt time.
	 */
	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
		pmap_insert_entry(pmap, va, m);

	/*
	 * Increment counters
	 */
	pmap->pm_stats.resident_count++;

	pa = VM_PAGE_TO_PHYS(m);

	/*
	 * Now validate mapping with RO protection
	 */
	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
		pte_store(pte, pa | PG_V | PG_U);
	else
		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
out:
	PMAP_UNLOCK(pmap);
	return (mpte);
}
/*
 * Make a temporary mapping for a physical address.  This is only intended
 * to be used for panic dumps.
 */
void *
pmap_kenter_temporary(vm_paddr_t pa, int i)
{
	vm_offset_t va;

	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
	pmap_kenter(va, pa);
	invlpg(va);
	return ((void *)crashdumpmap);
}
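
/*
 * Usage sketch, not from the original source: a dump routine can map one
 * physical page at a time through the crashdump window:
 *
 *	char *p = pmap_kenter_temporary(pa, 0);
 *	... copy PAGE_SIZE bytes starting at p ...
 */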
/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
		    vm_object_t object, vm_pindex_t pindex,
		    vm_size_t size)
{
	vm_page_t p;

	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
	KASSERT(object->type == OBJT_DEVICE,
	    ("pmap_object_init_pt: non-device object"));
	if (pseflag &&
	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
		int i;
		vm_page_t m[1];
		unsigned int ptepindex;
		int npdes;
		pd_entry_t ptepa;

		PMAP_LOCK(pmap);
		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
			goto out;
		PMAP_UNLOCK(pmap);
retry:
		p = vm_page_lookup(object, pindex);
		if (p != NULL) {
			vm_page_lock_queues();
			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
				goto retry;
		} else {
			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
			if (p == NULL)
				return;
			m[0] = p;

			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
				vm_page_lock_queues();
				vm_page_free(p);
				vm_page_unlock_queues();
				return;
			}

			p = vm_page_lookup(object, pindex);
			vm_page_lock_queues();
			vm_page_wakeup(p);
		}
		vm_page_unlock_queues();

		ptepa = VM_PAGE_TO_PHYS(p);
		if (ptepa & (NBPDR - 1))
			return;

		p->valid = VM_PAGE_BITS_ALL;

		PMAP_LOCK(pmap);
		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
		npdes = size >> PDRSHIFT;
		for(i = 0; i < npdes; i++) {
			pde_store(&pmap->pm_pdir[ptepindex],
			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
			ptepa += NBPDR;
			ptepindex += 1;
		}
		pmap_invalidate_all(pmap);
out:
		PMAP_UNLOCK(pmap);
	}
}
/*
 *	Routine:	pmap_change_wiring
 *	Function:	Change the wiring attribute for a map/virtual-address
 *			pair.
 *	In/out conditions:
 *			The mapping must already exist in the pmap.
 */
void
pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
{
	pt_entry_t *pte;

	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va);

	if (wired && !pmap_pte_w(pte))
		pmap->pm_stats.wired_count++;
	else if (!wired && pmap_pte_w(pte))
		pmap->pm_stats.wired_count--;

	/*
	 * Wiring is not a hardware characteristic so there is no need to
	 * invalidate TLB.
	 */
	pmap_pte_set_w(pte, wired);
	pmap_pte_release(pte);
	PMAP_UNLOCK(pmap);
}
2509 * Copy the range specified by src_addr/len
2510 * from the source map to the range dst_addr/len
2511 * in the destination map.
2513 * This routine is only advisory and need not do anything.
2517 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2518 vm_offset_t src_addr)
2521 vm_offset_t end_addr = src_addr + len;
2524 if (dst_addr != src_addr)
2527 if (!pmap_is_current(src_pmap))
2530 vm_page_lock_queues();
2531 if (dst_pmap < src_pmap) {
2532 PMAP_LOCK(dst_pmap);
2533 PMAP_LOCK(src_pmap);
2535 PMAP_LOCK(src_pmap);
2536 PMAP_LOCK(dst_pmap);
2539 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2540 pt_entry_t *src_pte, *dst_pte;
2541 vm_page_t dstmpte, srcmpte;
2542 pd_entry_t srcptepaddr;
2545 if (addr >= UPT_MIN_ADDRESS)
2546 panic("pmap_copy: invalid to pmap_copy page tables");
2548 pdnxt = (addr + NBPDR) & ~PDRMASK;
2549 ptepindex = addr >> PDRSHIFT;
2551 srcptepaddr = src_pmap->pm_pdir[ptepindex];
2552 if (srcptepaddr == 0)
2553 continue;
2555 if (srcptepaddr & PG_PS) {
2556 if (dst_pmap->pm_pdir[ptepindex] == 0) {
2557 dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2558 dst_pmap->pm_stats.resident_count +=
2559 NBPDR / PAGE_SIZE;
2564 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2565 if (srcmpte->wire_count == 0)
2566 panic("pmap_copy: source page table page is unused");
2568 if (pdnxt > end_addr)
2569 pdnxt = end_addr;
2571 src_pte = vtopte(addr);
2572 while (addr < pdnxt) {
2576 * We only virtual-copy managed pages.
2578 if ((ptetemp & PG_MANAGED) != 0) {
2580 * We have to check after allocpte for the
2581 * pte still being around... allocpte can
2582 * block.
2584 dstmpte = pmap_allocpte(dst_pmap, addr,
2585 M_NOWAIT);
2586 if (dstmpte == NULL)
2587 break;
2588 dst_pte = pmap_pte_quick(dst_pmap, addr);
2589 if (*dst_pte == 0 &&
2590 pmap_try_insert_pv_entry(dst_pmap, addr,
2591 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2593 * Clear the modified and
2594 * accessed (referenced) bits
2595 * during the copy.
2597 *dst_pte = ptetemp & ~(PG_M | PG_A);
2598 dst_pmap->pm_stats.resident_count++;
2600 pmap_unwire_pte_hold(dst_pmap, dstmpte);
2601 if (dstmpte->wire_count >= srcmpte->wire_count)
2609 vm_page_unlock_queues();
2610 PMAP_UNLOCK(src_pmap);
2611 PMAP_UNLOCK(dst_pmap);
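/*
 * Editor's note -- illustrative, not part of the original source:
 * pmap_copy() takes the two pmap locks in address order so that two
 * threads copying between the same pmaps in opposite directions cannot
 * deadlock.  The idiom in isolation:
 */
#if 0
static void
lock_pmap_pair(pmap_t a, pmap_t b)
{
	/* Always acquire the lower-addressed pmap's lock first. */
	if (a < b) {
		PMAP_LOCK(a);
		PMAP_LOCK(b);
	} else {
		PMAP_LOCK(b);
		PMAP_LOCK(a);
	}
}
#endif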
2614 static __inline void
2615 pagezero(void *page)
2617 #if defined(I686_CPU)
2618 if (cpu_class == CPUCLASS_686) {
2619 #if defined(CPU_ENABLE_SSE)
2620 if (cpu_feature & CPUID_SSE2)
2621 sse2_pagezero(page);
2622 else
2623 #endif
2624 i686_pagezero(page);
2625 } else
2626 #endif
2627 bzero(page, PAGE_SIZE);
2631 * pmap_zero_page zeros the specified hardware page by mapping
2632 * the page into KVM and using bzero to clear its contents.
2635 pmap_zero_page(vm_page_t m)
2637 struct sysmaps *sysmaps;
2639 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2640 mtx_lock(&sysmaps->lock);
2641 if (*sysmaps->CMAP2)
2642 panic("pmap_zero_page: CMAP2 busy");
2643 sched_pin();
2644 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2645 invlcaddr(sysmaps->CADDR2);
2646 pagezero(sysmaps->CADDR2);
2647 *sysmaps->CMAP2 = 0;
2648 sched_unpin();
2649 mtx_unlock(&sysmaps->lock);
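/*
 * Editor's note -- illustrative, not part of the original source: the
 * CMAP2/CADDR2 pair is a per-CPU scratch pte and the kernel virtual
 * address it maps.  Presetting PG_A and PG_M spares the MMU the
 * read-modify-write of the pte it would otherwise perform on first
 * access.  The borrow/use/return sequence in outline:
 */
#if 0
	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
	invlcaddr(sysmaps->CADDR2);	/* flush a possibly stale TLB entry */
	pagezero(sysmaps->CADDR2);	/* access the page through the window */
	*sysmaps->CMAP2 = 0;		/* return the window */
#endif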
2653 * pmap_zero_page_area zeros the specified area within a hardware
2654 * page by mapping the page into KVM and using bzero to clear the
2655 * requested portion of its contents.
2656 * off and size may not cover an area beyond a single hardware page.
2659 pmap_zero_page_area(vm_page_t m, int off, int size)
2661 struct sysmaps *sysmaps;
2663 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2664 mtx_lock(&sysmaps->lock);
2665 if (*sysmaps->CMAP2)
2666 panic("pmap_zero_page_area: CMAP2 busy");
2667 sched_pin();
2668 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2669 invlcaddr(sysmaps->CADDR2);
2670 if (off == 0 && size == PAGE_SIZE)
2671 pagezero(sysmaps->CADDR2);
2672 else
2673 bzero((char *)sysmaps->CADDR2 + off, size);
2674 *sysmaps->CMAP2 = 0;
2675 sched_unpin();
2676 mtx_unlock(&sysmaps->lock);
2680 * pmap_zero_page_idle zeros the specified hardware page by mapping
2681 * the page into KVM and using bzero to clear its contents. This
2682 * is intended to be called from the vm_pagezero process only and
2683 * outside of Giant.
2686 pmap_zero_page_idle(vm_page_t m)
2689 if (*CMAP3)
2690 panic("pmap_zero_page_idle: CMAP3 busy");
2691 sched_pin();
2692 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2700 * pmap_copy_page copies the specified (machine independent)
2701 * page by mapping the page into virtual memory and using
2702 * bcopy to copy the page, one machine dependent page at a time.
2706 pmap_copy_page(vm_page_t src, vm_page_t dst)
2708 struct sysmaps *sysmaps;
2710 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2711 mtx_lock(&sysmaps->lock);
2712 if (*sysmaps->CMAP1)
2713 panic("pmap_copy_page: CMAP1 busy");
2714 if (*sysmaps->CMAP2)
2715 panic("pmap_copy_page: CMAP2 busy");
2716 sched_pin();
2717 invlpg((u_int)sysmaps->CADDR1);
2718 invlpg((u_int)sysmaps->CADDR2);
2719 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2720 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2721 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2722 *sysmaps->CMAP1 = 0;
2723 *sysmaps->CMAP2 = 0;
2724 sched_unpin();
2725 mtx_unlock(&sysmaps->lock);
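/*
 * Editor's note -- illustrative, not part of the original source: in
 * pmap_copy_page() the source window (CMAP1) is deliberately mapped
 * without PG_RW or PG_M, since that page is only read, while the
 * destination window (CMAP2) is writable with PG_M preset:
 */
#if 0
	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;	/* read-only */
	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) |
	    PG_A | PG_M;					/* writable */
#endif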
2729 * Returns true if the pmap's pv is one of the first
2730 * 16 pvs linked to from this page. This count may
2731 * be changed upwards or downwards in the future; it
2732 * is only necessary that true be returned for a small
2733 * subset of pmaps for proper page aging.
2736 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2743 if (m->flags & PG_FICTITIOUS)
2744 return FALSE;
2746 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2747 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2748 if (PV_PMAP(pv) == pmap) {
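/*
 * Editor's note -- illustrative sketch, not part of the original source:
 * the bounded scan the comment above describes, reconstructed under the
 * assumption of a 16-entry limit (the counter lines were lost from this
 * excerpt):
 */
#if 0
	int loops = 0;

	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		if (PV_PMAP(pv) == pmap)
			return (TRUE);
		if (++loops >= 16)
			break;
	}
	return (FALSE);
#endif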
2759 * Remove all pages from the specified address space;
2760 * this aids process exit speeds. Also, this code is
2761 * special cased for the current process only, but
2762 * can have the more generic (and slightly slower)
2763 * mode enabled. This is much faster than pmap_remove
2764 * in the case of running down an entire address space.
2767 pmap_remove_pages(pmap_t pmap)
2769 pt_entry_t *pte, tpte;
2772 struct pv_chunk *pc, *npc;
2775 uint32_t inuse, bitmask;
2778 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2779 printf("warning: pmap_remove_pages called with non-current pmap\n");
2782 vm_page_lock_queues();
2785 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
2787 for (field = 0; field < _NPCM; field++) {
2788 inuse = (~(pc->pc_map[field])) & pc_freemask[field];
2789 while (inuse != 0) {
2790 bit = bsfl(inuse);
2791 bitmask = 1UL << bit;
2792 idx = field * 32 + bit;
2793 pv = &pc->pc_pventry[idx];
2796 pte = vtopte(pv->pv_va);
2801 "TPTE at %p IS ZERO @ VA %08x\n",
2807 * We cannot remove wired pages from a process' mapping at this time
2814 m = PHYS_TO_VM_PAGE(tpte);
2815 KASSERT(m->phys_addr == (tpte & PG_FRAME),
2816 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2817 m, (uintmax_t)m->phys_addr,
2820 KASSERT(m < &vm_page_array[vm_page_array_size],
2821 ("pmap_remove_pages: bad tpte %#jx",
2824 pmap->pm_stats.resident_count--;
2829 * Update the vm_page_t clean/reference bits.
2835 PV_STAT(pv_entry_frees++);
2836 PV_STAT(pv_entry_spare++);
2838 pc->pc_map[field] |= bitmask;
2839 m->md.pv_list_count--;
2840 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2841 if (TAILQ_EMPTY(&m->md.pv_list))
2842 vm_page_flag_clear(m, PG_WRITEABLE);
2844 pmap_unuse_pt(pmap, pv->pv_va);
2848 PV_STAT(pv_entry_spare -= _NPCPV);
2849 PV_STAT(pc_chunk_count--);
2850 PV_STAT(pc_chunk_frees++);
2851 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2852 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
2853 pmap_qremove((vm_offset_t)pc, 1);
2854 vm_page_lock_queues();
2855 vm_page_unwire(m, 0);
2856 vm_page_free(m);
2857 vm_page_unlock_queues();
2858 kmem_free(kernel_map, (vm_offset_t)pc, PAGE_SIZE);
2862 pmap_invalidate_all(pmap);
2864 vm_page_unlock_queues();
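/*
 * Editor's note -- illustrative, not part of the original source: the
 * chunk walk above visits every allocated pv entry by scanning the
 * inverse of the chunk's free bitmap.  The inner bit loop in isolation
 * (bsfl() yields the index of the lowest set bit):
 */
#if 0
	inuse = ~pc->pc_map[field] & pc_freemask[field];
	while (inuse != 0) {
		bit = bsfl(inuse);		/* lowest allocated slot */
		bitmask = 1UL << bit;
		idx = field * 32 + bit;		/* pv entry index in chunk */
		/* ... process pc->pc_pventry[idx] ... */
		inuse &= ~bitmask;
	}
#endif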
2870 * Return whether or not the specified physical page was modified
2871 * in any physical maps.
2874 pmap_is_modified(vm_page_t m)
2882 if (m->flags & PG_FICTITIOUS)
2883 return (rv);
2886 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2887 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2890 pte = pmap_pte_quick(pmap, pv->pv_va);
2891 rv = (*pte & PG_M) != 0;
2901 * pmap_is_prefaultable:
2903 * Return whether or not the specified virtual address is eligible
2904 * for prefault.
2907 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2914 if (*pmap_pde(pmap, addr)) {
2923 * Clear the given bit in each of the given page's ptes. The bit is
2924 * expressed as a 32-bit mask. Consequently, if the pte is 64 bits in
2925 * size, only a bit within the least significant 32 can be cleared.
2927 static __inline void
2928 pmap_clear_ptes(vm_page_t m, int bit)
2930 register pv_entry_t pv;
2932 pt_entry_t pbits, *pte;
2934 if ((m->flags & PG_FICTITIOUS) ||
2935 (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2939 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2941 * Loop over all current mappings, setting/clearing as appropriate.
2942 * If setting RO, do we need to clear the VAC?
2944 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2947 pte = pmap_pte_quick(pmap, pv->pv_va);
2953 * Regardless of whether a pte is 32 or 64 bits
2954 * in size, PG_RW and PG_M are among the least
2955 * significant 32 bits.
2957 if (!atomic_cmpset_int((u_int *)pte, pbits,
2958 pbits & ~(PG_RW | PG_M)))
2964 atomic_clear_int((u_int *)pte, bit);
2966 pmap_invalidate_page(pmap, pv->pv_va);
2971 vm_page_flag_clear(m, PG_WRITEABLE);
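/*
 * Editor's note -- illustrative, not part of the original source:
 * clearing PG_RW must fold PG_M into the same atomic pte update, or a
 * concurrent write could dirty the page after the downgrade; other bits
 * need only a simple atomic clear.  A sketch of the two cases:
 */
#if 0
retry:
	pbits = *pte;
	if (bit == PG_RW && (pbits & PG_RW)) {
		if (!atomic_cmpset_int((u_int *)pte, pbits,
		    pbits & ~(PG_RW | PG_M)))
			goto retry;		/* pte changed underneath us */
		if (pbits & PG_M)
			vm_page_dirty(m);	/* preserve the modified state */
	} else
		atomic_clear_int((u_int *)pte, bit);
#endif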
2976 * pmap_page_protect:
2978 * Lower the permission for all mappings to a given page.
2981 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2983 if ((prot & VM_PROT_WRITE) == 0) {
2984 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2985 pmap_clear_ptes(m, PG_RW);
2993 * pmap_ts_referenced:
2995 * Return a count of reference bits for a page, clearing those bits.
2996 * It is not necessary for every reference bit to be cleared, but it
2997 * is necessary that 0 only be returned when there are truly no
2998 * reference bits set.
3000 * XXX: The exact number of bits to check and clear is a matter that
3001 * should be tested and standardized at some point in the future for
3002 * optimal aging of shared pages.
3005 pmap_ts_referenced(vm_page_t m)
3007 register pv_entry_t pv, pvf, pvn;
3013 if (m->flags & PG_FICTITIOUS)
3017 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3018 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3023 pvn = TAILQ_NEXT(pv, pv_list);
3025 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3027 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3031 pte = pmap_pte_quick(pmap, pv->pv_va);
3033 if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
3034 atomic_clear_int((u_int *)pte, PG_A);
3035 pmap_invalidate_page(pmap, pv->pv_va);
3044 } while ((pv = pvn) != NULL && pv != pvf);
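/*
 * Editor's note -- illustrative, not part of the original source: each
 * visited pv is rotated to the tail of the page's pv list, so successive
 * pmap_ts_referenced() calls start with mappings not examined last time,
 * giving cheap round-robin aging of widely shared pages:
 */
#if 0
	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
#endif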
3052 * Clear the modify bits on the specified physical page.
3055 pmap_clear_modify(vm_page_t m)
3057 pmap_clear_ptes(m, PG_M);
3061 * pmap_clear_reference:
3063 * Clear the reference bit on the specified physical page.
3066 pmap_clear_reference(vm_page_t m)
3068 pmap_clear_ptes(m, PG_A);
3072 * Miscellaneous support routines follow
3076 * Map a set of physical memory pages into the kernel virtual
3077 * address space. Return a pointer to where it is mapped. This
3078 * routine is intended to be used for mapping device memory,
3079 * NOT real memory.
3082 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3086 vm_offset_t va, tmpva, offset;
3088 offset = pa & PAGE_MASK;
3089 size = roundup(offset + size, PAGE_SIZE);
3090 pa = pa & PG_FRAME;
3092 if (pa < KERNLOAD && pa + size <= KERNLOAD)
3093 va = KERNBASE + pa;
3094 else
3095 va = kmem_alloc_nofault(kernel_map, size);
3096 if (!va)
3097 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3099 for (tmpva = va; size > 0; ) {
3100 pmap_kenter(tmpva, pa);
3101 size -= PAGE_SIZE;
3102 tmpva += PAGE_SIZE;
3103 pa += PAGE_SIZE;
3104 }
3105 pmap_invalidate_range(kernel_pmap, va, tmpva);
3106 return ((void *)(va + offset));
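/*
 * Editor's note -- illustrative sketch, not part of the original source:
 * a driver attach routine might use pmap_mapdev() like this.  The
 * physical address and register layout are hypothetical.
 */
#if 0
	volatile uint32_t *regs;

	regs = pmap_mapdev(0xfeb00000, PAGE_SIZE);	/* hypothetical BAR */
	regs[0] = 1;					/* poke a register */
	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
#endif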
3110 pmap_unmapdev(vm_offset_t va, vm_size_t size)
3114 vm_offset_t base, offset, tmpva;
3116 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
3117 return;
3118 base = va & PG_FRAME;
3119 offset = va & PAGE_MASK;
3120 size = roundup(offset + size, PAGE_SIZE);
3121 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
3122 pmap_kremove(tmpva);
3123 pmap_invalidate_range(kernel_pmap, va, tmpva);
3124 kmem_free(kernel_map, base, size);
3128 * Perform the pmap work for mincore().
3131 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3135 pt_entry_t *ptep, pte;
3140 ptep = pmap_pte(pmap, addr);
3141 pte = (ptep != NULL) ? *ptep : 0;
3142 pmap_pte_release(ptep);
3148 val = MINCORE_INCORE;
3149 if ((pte & PG_MANAGED) == 0)
3150 return (val);
3152 pa = pte & PG_FRAME;
3154 m = PHYS_TO_VM_PAGE(pa);
3160 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3163 * Modified by someone else
3165 vm_page_lock_queues();
3166 if (m->dirty || pmap_is_modified(m))
3167 val |= MINCORE_MODIFIED_OTHER;
3168 vm_page_unlock_queues();
3174 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3177 * Referenced by someone else
3179 vm_page_lock_queues();
3180 if ((m->flags & PG_REFERENCED) ||
3181 pmap_ts_referenced(m)) {
3182 val |= MINCORE_REFERENCED_OTHER;
3183 vm_page_flag_set(m, PG_REFERENCED);
3185 vm_page_unlock_queues();
3192 pmap_activate(struct thread *td)
3194 pmap_t pmap, oldpmap;
3198 pmap = vmspace_pmap(td->td_proc->p_vmspace);
3199 oldpmap = PCPU_GET(curpmap);
3200 #if defined(SMP)
3201 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3202 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3203 #else
3204 oldpmap->pm_active &= ~1;
3205 pmap->pm_active |= 1;
3206 #endif
3207 #ifdef PAE
3208 cr3 = vtophys(pmap->pm_pdpt);
3209 #else
3210 cr3 = vtophys(pmap->pm_pdir);
3211 #endif
3213 * pmap_activate is for the current thread on the current CPU.
3215 td->td_pcb->pcb_cr3 = cr3;
3216 load_cr3(cr3);
3217 PCPU_SET(curpmap, pmap);
3222 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3225 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3226 return addr;
3227 }
3229 addr = (addr + PDRMASK) & ~PDRMASK;
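/*
 * Editor's note -- worked example, not part of the original source: the
 * expression above rounds addr up to the next 4MB boundary (PDRMASK ==
 * NBPDR - 1 == 0x3fffff):
 *
 *	addr = 0x00012345
 *	(0x00012345 + 0x3fffff) & ~0x3fffff
 *	    = 0x00412344 & 0xffc00000 = 0x00400000 (4MB)
 *
 * so large device mappings start superpage-aligned when possible.
 */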
3234 #if defined(PMAP_DEBUG)
3235 pmap_pid_dump(int pid)
3242 sx_slock(&allproc_lock);
3243 LIST_FOREACH(p, &allproc, p_list) {
3244 if (p->p_pid != pid)
3245 continue;
3250 pmap = vmspace_pmap(p->p_vmspace);
3251 for (i = 0; i < NPDEPTD; i++) {
3254 vm_offset_t base = i << PDRSHIFT;
3256 pde = &pmap->pm_pdir[i];
3257 if (pde && pmap_pde_v(pde)) {
3258 for (j = 0; j < NPTEPG; j++) {
3259 vm_offset_t va = base + (j << PAGE_SHIFT);
3260 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3265 sx_sunlock(&allproc_lock);
3268 pte = pmap_pte(pmap, va);
3269 if (pte && pmap_pte_v(pte)) {
3273 m = PHYS_TO_VM_PAGE(pa);
3274 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3275 va, pa, m->hold_count, m->wire_count, m->flags);
3290 sx_sunlock(&allproc_lock);
3297 static void pads(pmap_t pm);
3298 void pmap_pvdump(vm_offset_t pa);
3300 /* print address space of pmap */
3309 if (pm == kernel_pmap)
3310 return;
3311 for (i = 0; i < NPDEPTD; i++)
3312 if (pm->pm_pdir[i])
3313 for (j = 0; j < NPTEPG; j++) {
3314 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3315 if (pm == kernel_pmap && va < KERNBASE)
3316 continue;
3317 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3318 continue;
3319 ptep = pmap_pte(pm, va);
3320 if (pmap_pte_v(ptep))
3321 printf("%x:%x ", va, *ptep);
3334 printf("pa %x", pa);
3335 m = PHYS_TO_VM_PAGE(pa);
3336 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3338 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);