sys/i386/i386/pmap.c

   1 /*-
   2  * Copyright (c) 1991 Regents of the University of California.
   3  * All rights reserved.
   4  * Copyright (c) 1994 John S. Dyson
   5  * All rights reserved.
   6  * Copyright (c) 1994 David Greenman
   7  * All rights reserved.
   8  * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
   9  * All rights reserved.
  10  *
  11  * This code is derived from software contributed to Berkeley by
  12  * the Systems Programming Group of the University of Utah Computer
  13  * Science Department and William Jolitz of UUNET Technologies Inc.
  14  *
  15  * Redistribution and use in source and binary forms, with or without
  16  * modification, are permitted provided that the following conditions
  17  * are met:
  18  * 1. Redistributions of source code must retain the above copyright
  19  *    notice, this list of conditions and the following disclaimer.
  20  * 2. Redistributions in binary form must reproduce the above copyright
  21  *    notice, this list of conditions and the following disclaimer in the
  22  *    documentation and/or other materials provided with the distribution.
  23  * 3. All advertising materials mentioning features or use of this software
  24  *    must display the following acknowledgement:
  25  *      This product includes software developed by the University of
  26  *      California, Berkeley and its contributors.
  27  * 4. Neither the name of the University nor the names of its contributors
  28  *    may be used to endorse or promote products derived from this software
  29  *    without specific prior written permission.
  30  *
  31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  41  * SUCH DAMAGE.
  42  *
  43  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  44  */
  45 /*-
  46  * Copyright (c) 2003 Networks Associates Technology, Inc.
  47  * All rights reserved.
  48  *
  49  * This software was developed for the FreeBSD Project by Jake Burkholder,
  50  * Safeport Network Services, and Network Associates Laboratories, the
  51  * Security Research Division of Network Associates, Inc. under
  52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  53  * CHATS research program.
  54  *
  55  * Redistribution and use in source and binary forms, with or without
  56  * modification, are permitted provided that the following conditions
  57  * are met:
  58  * 1. Redistributions of source code must retain the above copyright
  59  *    notice, this list of conditions and the following disclaimer.
  60  * 2. Redistributions in binary form must reproduce the above copyright
  61  *    notice, this list of conditions and the following disclaimer in the
  62  *    documentation and/or other materials provided with the distribution.
  63  *
  64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  74  * SUCH DAMAGE.
  75  */
  76
  77 #include <sys/cdefs.h>
  78 __FBSDID("$FreeBSD$");
  79
  80 /*
  81  *      Manages physical address maps.
  82  *
  83  *      In addition to hardware address maps, this
  84  *      module is called upon to provide software-use-only
  85  *      maps which may or may not be stored in the same
  86  *      form as hardware maps.  These pseudo-maps are
  87  *      used to store intermediate results from copy
  88  *      operations to and from address spaces.
  89  *
  90  *      Since the information managed by this module is
  91  *      also stored by the logical address mapping module,
  92  *      this module may throw away valid virtual-to-physical
  93  *      mappings at almost any time.  However, invalidations
  94  *      of virtual-to-physical mappings must be done as
  95  *      requested.
  96  *
  97  *      In order to cope with hardware architectures which
  98  *      make virtual-to-physical map invalidates expensive,
  99  *      this module may delay invalidate or reduced protection
 100  *      operations until such time as they are actually
 101  *      necessary.  This module is given full information as
 102  *      to which processors are currently using which maps,
 103  *      and to when physical maps must be made correct.
 104  */
 105
 106 #include "opt_cpu.h"
 107 #include "opt_pmap.h"
 108 #include "opt_msgbuf.h"
 109 #include "opt_smp.h"
 110 #include "opt_xbox.h"
 111
 112 #include <sys/param.h>
 113 #include <sys/systm.h>
 114 #include <sys/kernel.h>
 115 #include <sys/lock.h>
 116 #include <sys/malloc.h>
 117 #include <sys/mman.h>
 118 #include <sys/msgbuf.h>
 119 #include <sys/mutex.h>
 120 #include <sys/proc.h>
 121 #include <sys/sx.h>
 122 #include <sys/vmmeter.h>
 123 #include <sys/sched.h>
 124 #include <sys/sysctl.h>
 125 #ifdef SMP
 126 #include <sys/smp.h>
 127 #endif
 128
 129 #include <vm/vm.h>
 130 #include <vm/vm_param.h>
 131 #include <vm/vm_kern.h>
 132 #include <vm/vm_page.h>
 133 #include <vm/vm_map.h>
 134 #include <vm/vm_object.h>
 135 #include <vm/vm_extern.h>
 136 #include <vm/vm_pageout.h>
 137 #include <vm/vm_pager.h>
 138 #include <vm/uma.h>
 139
 140 #include <machine/cpu.h>
 141 #include <machine/cputypes.h>
 142 #include <machine/md_var.h>
 143 #include <machine/pcb.h>
 144 #include <machine/specialreg.h>
 145 #ifdef SMP
 146 #include <machine/smp.h>
 147 #endif
 148
 149 #ifdef XBOX
 150 #include <machine/xbox.h>
 151 #endif
 152
/*
 * CPU_ENABLE_SSE is defined only when the kernel is built with I686_CPU
 * and CPU_DISABLE_SSE has not been requested.
 */
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif

/*
 * Default per-process multiplier used by pmap_init() when it sizes
 * pv_entry_max.  A build may predefine PMAP_SHPGPERPROC to override it.
 */
#ifndef PMAP_SHPGPERPROC
#define PMAP_SHPGPERPROC 200
#endif

/* A DIAGNOSTIC kernel also turns on the pmap-specific diagnostics. */
#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

/*
 * Functions declared PMAP_INLINE are inlined in normal builds and left
 * as ordinary functions when PMAP_DIAGNOSTIC is defined.
 */
#if !defined(PMAP_DIAGNOSTIC)
#define PMAP_INLINE __inline
#else
#define PMAP_INLINE
#endif
 170
 171 /*
 172  * Get PDEs and PTEs for user/kernel address space
 173  */
 174 #define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 175 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 176
 177 #define pmap_pde_v(pte)         ((*(int *)pte & PG_V) != 0)
 178 #define pmap_pte_w(pte)         ((*(int *)pte & PG_W) != 0)
 179 #define pmap_pte_m(pte)         ((*(int *)pte & PG_M) != 0)
 180 #define pmap_pte_u(pte)         ((*(int *)pte & PG_A) != 0)
 181 #define pmap_pte_v(pte)         ((*(int *)pte & PG_V) != 0)
 182
 183 #define pmap_pte_set_w(pte, v)  ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
 184     atomic_clear_int((u_int *)(pte), PG_W))
 185 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 186
/* Statically allocated pmap; presumably the storage behind kernel_pmap. */
struct pmap kernel_pmap_store;
/*
 * List of pmaps.  pmap_bootstrap() initializes it and inserts the kernel
 * pmap while holding allpmaps_lock, which is a spin mutex.
 */
LIST_HEAD(pmaplist, pmap);
static struct pmaplist allpmaps;
static struct mtx allpmaps_lock;

vm_paddr_t avail_end;   /* PA of last available physical page */
vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
int pgeflag = 0;                /* PG_G or-in */
int pseflag = 0;                /* PG_PS or-in */

static int nkpt;                /* set to NKPT by pmap_bootstrap() */
vm_offset_t kernel_vm_end;
/* Added to KERNBASE by pmap_set_pg() to find the end of the kernel image. */
extern u_int32_t KERNend;

#ifdef PAE
/* UMA zone for PDPTs; created in pmap_init(). */
static uma_zone_t pdptzone;
#endif
 205
/*
 * Data for the pv entry allocation mechanism.
 *
 * pmap_init() creates pvzone, backs it with pvzone_obj, computes
 * pv_entry_max (overridable through the "vm.pmap.pv_entries" tunable)
 * and sets pv_entry_high_water to nine tenths of pv_entry_max.
 */
static uma_zone_t pvzone;
static struct vm_object pvzone_obj;
static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;

/*
 * All those kernel PT submaps that BSD is so fond of.
 *
 * One struct sysmaps exists per possible CPU.  pmap_bootstrap() gives
 * each one two single-page VA slots (CADDR1/CADDR2) together with
 * pointers to the PTEs that map them (CMAP1/CMAP2), plus a mutex.
 */
struct sysmaps {
        struct  mtx lock;
        pt_entry_t *CMAP1;      /* PTE mapping CADDR1 */
        pt_entry_t *CMAP2;      /* PTE mapping CADDR2 */
        caddr_t CADDR1;
        caddr_t CADDR2;
};
static struct sysmaps sysmaps_pcpu[MAXCPU];
/* Global one-page slots (PTE pointer / VA pairs) set up by pmap_bootstrap(). */
pt_entry_t *CMAP1 = 0;
static pt_entry_t *CMAP3;
caddr_t CADDR1 = 0, ptvmmap = 0;
static caddr_t CADDR3;
/* VA reserved by pmap_bootstrap() for the system message buffer. */
struct msgbuf *msgbufp = 0;
 229
/*
 * Crashdump maps.  pmap_bootstrap() reserves MAXDUMPPGS pages of VA here.
 */
static caddr_t crashdumpmap;

#ifdef SMP
extern pt_entry_t *SMPpt;
#endif
/*
 * PMAP1/PADDR1 and PMAP2/PADDR2 are PTE-pointer / VA pairs used as
 * windows onto another pmap's page table page: PMAP1 by pmap_pte_quick()
 * and PMAP2 (under PMAP2mutex) by pmap_pte().
 */
static pt_entry_t *PMAP1 = 0, *PMAP2;
static pt_entry_t *PADDR1 = 0, *PADDR2;
#ifdef SMP
/* CPU id recorded by pmap_pte_quick() when it last (re)validated PADDR1. */
static int PMAP1cpu;
static int PMAP1changedcpu;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
           &PMAP1changedcpu, 0,
           "Number of times pmap_pte_quick changed CPU with same PMAP1");
#endif
/* Read-only debug counters maintained by pmap_pte_quick(). */
static int PMAP1changed;
SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
           &PMAP1changed, 0,
           "Number of times pmap_pte_quick changed PMAP1");
static int PMAP1unchanged;
SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
           &PMAP1unchanged, 0,
           "Number of times pmap_pte_quick didn't change PMAP1");
/* Held from pmap_pte() until pmap_pte_release() while PADDR2 is in use. */
static struct mtx PMAP2mutex;
 256
/* Forward declarations of file-local helpers. */

/* pv entry management */
static PMAP_INLINE void free_pv_entry(pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t locked_pmap);
static void     pmap_clear_ptes(vm_page_t m, int bit);

/* mapping removal and pv list maintenance */
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
                                        vm_offset_t va);
static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);

/* page table page allocation and release */
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);

static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
/* PTE lookup helpers (defined below) */
static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
#ifdef PAE
static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
#endif

/* The shift constants must agree with the actual entry sizes. */
CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 281
 282 /*
 283  * Move the kernel virtual free pointer to the next
 284  * 4MB.  This is used to help improve performance
 285  * by using a large (4MB) page for much of the kernel
 286  * (.text, .data, .bss)
 287  */
 288 static vm_offset_t
 289 pmap_kmem_choose(vm_offset_t addr)
 290 {
 291         vm_offset_t newaddr = addr;
 292
 293 #ifndef DISABLE_PSE
 294         if (cpu_feature & CPUID_PSE)
 295                 newaddr = (addr + PDRMASK) & ~PDRMASK;
 296 #endif
 297         return newaddr;
 298 }
 299
 300 /*
 301  *      Bootstrap the system enough to run with virtual memory.
 302  *
 303  *      On the i386 this is called after mapping has already been enabled
 304  *      and just syncs the pmap module with what has already been done.
 305  *      [We can't call it easily with mapping off since the kernel is not
 306  *      mapped with PA == VA, hence we would have to relocate every address
 307  *      from the linked base (virtual) address "KERNBASE" to the actual
 308  *      (physical) address starting relative to 0]
 309  */
 310 void
 311 pmap_bootstrap(firstaddr, loadaddr)
 312         vm_paddr_t firstaddr;
 313         vm_paddr_t loadaddr;
 314 {
 315         vm_offset_t va;
 316         pt_entry_t *pte, *unused;
 317         struct sysmaps *sysmaps;
 318         int i;
 319
 320         /*
 321          * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
 322          * large. It should instead be correctly calculated in locore.s and
 323          * not based on 'first' (which is a physical address, not a virtual
 324          * address, for the start of unused physical memory). The kernel
 325          * page tables are NOT double mapped and thus should not be included
 326          * in this calculation.
 327          */
 328         virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
 329         virtual_avail = pmap_kmem_choose(virtual_avail);
 330
 331         virtual_end = VM_MAX_KERNEL_ADDRESS;
 332
 333         /*
 334          * Initialize the kernel pmap (which is statically allocated).
 335          */
 336         PMAP_LOCK_INIT(kernel_pmap);
 337         kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
 338 #ifdef PAE
 339         kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 340 #endif
 341         kernel_pmap->pm_active = -1;    /* don't allow deactivation */
 342         TAILQ_INIT(&kernel_pmap->pm_pvlist);
 343         LIST_INIT(&allpmaps);
 344         mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 345         mtx_lock_spin(&allpmaps_lock);
 346         LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
 347         mtx_unlock_spin(&allpmaps_lock);
 348         nkpt = NKPT;
 349
 350         /*
 351          * Reserve some special page table entries/VA space for temporary
 352          * mapping of pages.
 353          */
 354 #define SYSMAP(c, p, v, n)      \
 355         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 356
 357         va = virtual_avail;
 358         pte = vtopte(va);
 359
 360         /*
 361          * CMAP1/CMAP2 are used for zeroing and copying pages.
 362          * CMAP3 is used for the idle process page zeroing.
 363          */
 364         for (i = 0; i < MAXCPU; i++) {
 365                 sysmaps = &sysmaps_pcpu[i];
 366                 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
 367                 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
 368                 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
 369         }
 370         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 371         SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 372         *CMAP3 = 0;
 373
 374         /*
 375          * Crashdump maps.
 376          */
 377         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 378
 379         /*
 380          * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 381          */
 382         SYSMAP(caddr_t, unused, ptvmmap, 1)
 383
 384         /*
 385          * msgbufp is used to map the system message buffer.
 386          */
 387         SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
 388
 389         /*
 390          * ptemap is used for pmap_pte_quick
 391          */
 392         SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
 393         SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
 394
 395         mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 396
 397         virtual_avail = va;
 398
 399         *CMAP1 = 0;
 400
 401 #ifdef XBOX
 402         /* FIXME: This is gross, but needed for the XBOX. Since we are in such
 403          * an early stadium, we cannot yet neatly map video memory ... :-(
 404          * Better fixes are very welcome! */
 405         if (!arch_i386_is_xbox)
 406 #endif
 407         for (i = 0; i < NKPT; i++)
 408                 PTD[i] = 0;
 409
 410         /* Turn on PG_G on kernel page(s) */
 411         pmap_set_pg();
 412 }
 413
 414 /*
 415  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 416  */
 417 void
 418 pmap_set_pg(void)
 419 {
 420         pd_entry_t pdir;
 421         pt_entry_t *pte;
 422         vm_offset_t va, endva;
 423         int i;
 424
 425         if (pgeflag == 0)
 426                 return;
 427
 428         i = KERNLOAD/NBPDR;
 429         endva = KERNBASE + KERNend;
 430
 431         if (pseflag) {
 432                 va = KERNBASE + KERNLOAD;
 433                 while (va  < endva) {
 434                         pdir = kernel_pmap->pm_pdir[KPTDI+i];
 435                         pdir |= pgeflag;
 436                         kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
 437                         invltlb();      /* Play it safe, invltlb() every time */
 438                         i++;
 439                         va += NBPDR;
 440                 }
 441         } else {
 442                 va = (vm_offset_t)btext;
 443                 while (va < endva) {
 444                         pte = vtopte(va);
 445                         if (*pte)
 446                                 *pte |= pgeflag;
 447                         invltlb();      /* Play it safe, invltlb() every time */
 448                         va += PAGE_SIZE;
 449                 }
 450         }
 451 }
 452
 453 /*
 454  * Initialize a vm_page's machine-dependent fields.
 455  */
 456 void
 457 pmap_page_init(vm_page_t m)
 458 {
 459
 460         TAILQ_INIT(&m->md.pv_list);
 461         m->md.pv_list_count = 0;
 462 }
 463
 464 #ifdef PAE
 465
 466 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
 467
 468 static void *
 469 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 470 {
 471         *flags = UMA_SLAB_PRIV;
 472         return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
 473             1, 0));
 474 }
 475 #endif
 476
 477 /*
 478  *      Initialize the pmap module.
 479  *      Called by vm_init, to initialize any structures that the pmap
 480  *      system needs to map virtual memory.
 481  */
 482 void
 483 pmap_init(void)
 484 {
 485         int shpgperproc = PMAP_SHPGPERPROC;
 486
 487         /*
 488          * Initialize the address space (zone) for the pv entries.  Set a
 489          * high water mark so that the system can recover from excessive
 490          * numbers of pv entries.
 491          */
 492         pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
 493             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 494         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 495         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 496         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 497         pv_entry_high_water = 9 * (pv_entry_max / 10);
 498         uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
 499
 500 #ifdef PAE
 501         pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 502             NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 503             UMA_ZONE_VM | UMA_ZONE_NOFREE);
 504         uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 505 #endif
 506 }
 507
 508
 509 /***************************************************
 510  * Low level helper routines.....
 511  ***************************************************/
 512
 513
 514 /*
 515  * this routine defines the region(s) of memory that should
 516  * not be tested for the modified bit.
 517  */
 518 static PMAP_INLINE int
 519 pmap_track_modified(vm_offset_t va)
 520 {
 521         if ((va < kmi.clean_sva) || (va >= kmi.clean_eva))
 522                 return 1;
 523         else
 524                 return 0;
 525 }
 526
 527 #ifdef SMP
 528 /*
 529  * For SMP, these functions have to use the IPI mechanism for coherence.
 530  */
 531 void
 532 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 533 {
 534         u_int cpumask;
 535         u_int other_cpus;
 536
 537         if (smp_started) {
 538                 if (!(read_eflags() & PSL_I))
 539                         panic("%s: interrupts disabled", __func__);
 540                 mtx_lock_spin(&smp_ipi_mtx);
 541         } else
 542                 critical_enter();
 543         /*
 544          * We need to disable interrupt preemption but MUST NOT have
 545          * interrupts disabled here.
 546          * XXX we may need to hold schedlock to get a coherent pm_active
 547          * XXX critical sections disable interrupts again
 548          */
 549         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 550                 invlpg(va);
 551                 smp_invlpg(va);
 552         } else {
 553                 cpumask = PCPU_GET(cpumask);
 554                 other_cpus = PCPU_GET(other_cpus);
 555                 if (pmap->pm_active & cpumask)
 556                         invlpg(va);
 557                 if (pmap->pm_active & other_cpus)
 558                         smp_masked_invlpg(pmap->pm_active & other_cpus, va);
 559         }
 560         if (smp_started)
 561                 mtx_unlock_spin(&smp_ipi_mtx);
 562         else
 563                 critical_exit();
 564 }
 565
 566 void
 567 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 568 {
 569         u_int cpumask;
 570         u_int other_cpus;
 571         vm_offset_t addr;
 572
 573         if (smp_started) {
 574                 if (!(read_eflags() & PSL_I))
 575                         panic("%s: interrupts disabled", __func__);
 576                 mtx_lock_spin(&smp_ipi_mtx);
 577         } else
 578                 critical_enter();
 579         /*
 580          * We need to disable interrupt preemption but MUST NOT have
 581          * interrupts disabled here.
 582          * XXX we may need to hold schedlock to get a coherent pm_active
 583          * XXX critical sections disable interrupts again
 584          */
 585         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 586                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 587                         invlpg(addr);
 588                 smp_invlpg_range(sva, eva);
 589         } else {
 590                 cpumask = PCPU_GET(cpumask);
 591                 other_cpus = PCPU_GET(other_cpus);
 592                 if (pmap->pm_active & cpumask)
 593                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 594                                 invlpg(addr);
 595                 if (pmap->pm_active & other_cpus)
 596                         smp_masked_invlpg_range(pmap->pm_active & other_cpus,
 597                             sva, eva);
 598         }
 599         if (smp_started)
 600                 mtx_unlock_spin(&smp_ipi_mtx);
 601         else
 602                 critical_exit();
 603 }
 604
 605 void
 606 pmap_invalidate_all(pmap_t pmap)
 607 {
 608         u_int cpumask;
 609         u_int other_cpus;
 610
 611         if (smp_started) {
 612                 if (!(read_eflags() & PSL_I))
 613                         panic("%s: interrupts disabled", __func__);
 614                 mtx_lock_spin(&smp_ipi_mtx);
 615         } else
 616                 critical_enter();
 617         /*
 618          * We need to disable interrupt preemption but MUST NOT have
 619          * interrupts disabled here.
 620          * XXX we may need to hold schedlock to get a coherent pm_active
 621          * XXX critical sections disable interrupts again
 622          */
 623         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 624                 invltlb();
 625                 smp_invltlb();
 626         } else {
 627                 cpumask = PCPU_GET(cpumask);
 628                 other_cpus = PCPU_GET(other_cpus);
 629                 if (pmap->pm_active & cpumask)
 630                         invltlb();
 631                 if (pmap->pm_active & other_cpus)
 632                         smp_masked_invltlb(pmap->pm_active & other_cpus);
 633         }
 634         if (smp_started)
 635                 mtx_unlock_spin(&smp_ipi_mtx);
 636         else
 637                 critical_exit();
 638 }
 639 #else /* !SMP */
 640 /*
 641  * Normal, non-SMP, 486+ invalidation functions.
 642  * We inline these within pmap.c for speed.
 643  */
 644 PMAP_INLINE void
 645 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 646 {
 647
 648         if (pmap == kernel_pmap || pmap->pm_active)
 649                 invlpg(va);
 650 }
 651
 652 PMAP_INLINE void
 653 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 654 {
 655         vm_offset_t addr;
 656
 657         if (pmap == kernel_pmap || pmap->pm_active)
 658                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 659                         invlpg(addr);
 660 }
 661
 662 PMAP_INLINE void
 663 pmap_invalidate_all(pmap_t pmap)
 664 {
 665
 666         if (pmap == kernel_pmap || pmap->pm_active)
 667                 invltlb();
 668 }
 669 #endif /* !SMP */
 670
 671 /*
 672  * Are we current address space or kernel?  N.B. We return FALSE when
 673  * a pmap's page table is in use because a kernel thread is borrowing
 674  * it.  The borrowed page table can change spontaneously, making any
 675  * dependence on its continued use subject to a race condition.
 676  */
 677 static __inline int
 678 pmap_is_current(pmap_t pmap)
 679 {
 680
 681         return (pmap == kernel_pmap ||
 682                 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
 683             (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
 684 }
 685
 686 /*
 687  * If the given pmap is not the current or kernel pmap, the returned pte must
 688  * be released by passing it to pmap_pte_release().
 689  */
 690 pt_entry_t *
 691 pmap_pte(pmap_t pmap, vm_offset_t va)
 692 {
 693         pd_entry_t newpf;
 694         pd_entry_t *pde;
 695
 696         pde = pmap_pde(pmap, va);
 697         if (*pde & PG_PS)
 698                 return (pde);
 699         if (*pde != 0) {
 700                 /* are we current address space or kernel? */
 701                 if (pmap_is_current(pmap))
 702                         return (vtopte(va));
 703                 mtx_lock(&PMAP2mutex);
 704                 newpf = *pde & PG_FRAME;
 705                 if ((*PMAP2 & PG_FRAME) != newpf) {
 706                         *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 707                         pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 708                 }
 709                 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 710         }
 711         return (0);
 712 }
 713
 714 /*
 715  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 716  * being NULL.
 717  */
 718 static __inline void
 719 pmap_pte_release(pt_entry_t *pte)
 720 {
 721
 722         if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
 723                 mtx_unlock(&PMAP2mutex);
 724 }
 725
 726 static __inline void
 727 invlcaddr(void *caddr)
 728 {
 729
 730         invlpg((u_int)caddr);
 731 }
 732
 733 /*
 734  * Super fast pmap_pte routine best used when scanning
 735  * the pv lists.  This eliminates many coarse-grained
 736  * invltlb calls.  Note that many of the pv list
 737  * scans are across different pmaps.  It is very wasteful
 738  * to do an entire invltlb for checking a single mapping.
 739  *
 740  * If the given pmap is not the current pmap, vm_page_queue_mtx
 741  * must be held and curthread pinned to a CPU.
 742  */
 743 static pt_entry_t *
 744 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 745 {
 746         pd_entry_t newpf;
 747         pd_entry_t *pde;
 748
 749         pde = pmap_pde(pmap, va);
 750         if (*pde & PG_PS)
 751                 return (pde);
 752         if (*pde != 0) {
 753                 /* are we current address space or kernel? */
 754                 if (pmap_is_current(pmap))
 755                         return (vtopte(va));
 756                 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 757                 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 758                 newpf = *pde & PG_FRAME;
 759                 if ((*PMAP1 & PG_FRAME) != newpf) {
 760                         *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 761 #ifdef SMP
 762                         PMAP1cpu = PCPU_GET(cpuid);
 763 #endif
 764                         invlcaddr(PADDR1);
 765                         PMAP1changed++;
 766                 } else
 767 #ifdef SMP
 768                 if (PMAP1cpu != PCPU_GET(cpuid)) {
 769                         PMAP1cpu = PCPU_GET(cpuid);
 770                         invlcaddr(PADDR1);
 771                         PMAP1changedcpu++;
 772                 } else
 773 #endif
 774                         PMAP1unchanged++;
 775                 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 776         }
 777         return (0);
 778 }
 779
 780 /*
 781  *      Routine:        pmap_extract
 782  *      Function:
 783  *              Extract the physical page address associated
 784  *              with the given map/virtual_address pair.
 785  */
 786 vm_paddr_t
 787 pmap_extract(pmap_t pmap, vm_offset_t va)
 788 {
 789         vm_paddr_t rtval;
 790         pt_entry_t *pte;
 791         pd_entry_t pde;
 792
 793         rtval = 0;
 794         PMAP_LOCK(pmap);
 795         pde = pmap->pm_pdir[va >> PDRSHIFT];
 796         if (pde != 0) {
 797                 if ((pde & PG_PS) != 0) {
 798                         rtval = (pde & ~PDRMASK) | (va & PDRMASK);
 799                         PMAP_UNLOCK(pmap);
 800                         return rtval;
 801                 }
 802                 pte = pmap_pte(pmap, va);
 803                 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
 804                 pmap_pte_release(pte);
 805         }
 806         PMAP_UNLOCK(pmap);
 807         return (rtval);
 808 }
 809
 810 /*
 811  *      Routine:        pmap_extract_and_hold
 812  *      Function:
 813  *              Atomically extract and hold the physical page
 814  *              with the given pmap and virtual address pair
 815  *              if that mapping permits the given protection.
 816  */
 817 vm_page_t
 818 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 819 {
 820         pd_entry_t pde;
 821         pt_entry_t pte;
 822         vm_page_t m;
 823
 824         m = NULL;
 825         vm_page_lock_queues();
 826         PMAP_LOCK(pmap);
 827         pde = *pmap_pde(pmap, va);
 828         if (pde != 0) {
 829                 if (pde & PG_PS) {
 830                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 831                                 m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
 832                                     (va & PDRMASK));
 833                                 vm_page_hold(m);
 834                         }
 835                 } else {
 836                         sched_pin();
 837                         pte = *pmap_pte_quick(pmap, va);
 838                         if (pte != 0 &&
 839                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 840                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 841                                 vm_page_hold(m);
 842                         }
 843                         sched_unpin();
 844                 }
 845         }
 846         vm_page_unlock_queues();
 847         PMAP_UNLOCK(pmap);
 848         return (m);
 849 }
 850
 851 /***************************************************
 852  * Low level mapping routines.....
 853  ***************************************************/
 854
 855 /*
 856  * Add a wired page to the kva.
 857  * Note: not SMP coherent.
 858  */
 859 PMAP_INLINE void
 860 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 861 {
 862         pt_entry_t *pte;
 863
 864         pte = vtopte(va);
 865         pte_store(pte, pa | PG_RW | PG_V | pgeflag);
 866 }
 867
 868 /*
 869  * Remove a page from the kernel pagetables.
 870  * Note: not SMP coherent.
 871  */
 872 PMAP_INLINE void
 873 pmap_kremove(vm_offset_t va)
 874 {
 875         pt_entry_t *pte;
 876
 877         pte = vtopte(va);
 878         pte_clear(pte);
 879 }
 880
 881 /*
 882  *      Used to map a range of physical addresses into kernel
 883  *      virtual address space.
 884  *
 885  *      The value passed in '*virt' is a suggested virtual address for
 886  *      the mapping. Architectures which can support a direct-mapped
 887  *      physical to virtual region can return the appropriate address
 888  *      within that region, leaving '*virt' unchanged. Other
 889  *      architectures should map the pages starting at '*virt' and
 890  *      update '*virt' with the first usable address after the mapped
 891  *      region.
 892  */
 893 vm_offset_t
 894 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 895 {
 896         vm_offset_t va, sva;
 897
 898         va = sva = *virt;
 899         while (start < end) {
 900                 pmap_kenter(va, start);
 901                 va += PAGE_SIZE;
 902                 start += PAGE_SIZE;
 903         }
 904         pmap_invalidate_range(kernel_pmap, sva, va);
 905         *virt = va;
 906         return (sva);
 907 }
 908
 909
 910 /*
 911  * Add a list of wired pages to the kva
 912  * this routine is only used for temporary
 913  * kernel mappings that do not need to have
 914  * page modification or references recorded.
 915  * Note that old mappings are simply written
 916  * over.  The page *must* be wired.
 917  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 918  */
 919 void
 920 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
 921 {
 922         vm_offset_t va;
 923
 924         va = sva;
 925         while (count-- > 0) {
 926                 pmap_kenter(va, VM_PAGE_TO_PHYS(*m));
 927                 va += PAGE_SIZE;
 928                 m++;
 929         }
 930         pmap_invalidate_range(kernel_pmap, sva, va);
 931 }
 932
 933 /*
 934  * This routine tears out page mappings from the
 935  * kernel -- it is meant only for temporary mappings.
 936  * Note: SMP coherent.  Uses a ranged shootdown IPI.
 937  */
 938 void
 939 pmap_qremove(vm_offset_t sva, int count)
 940 {
 941         vm_offset_t va;
 942
 943         va = sva;
 944         while (count-- > 0) {
 945                 pmap_kremove(va);
 946                 va += PAGE_SIZE;
 947         }
 948         pmap_invalidate_range(kernel_pmap, sva, va);
 949 }
 950
 951 /***************************************************
 952  * Page table page management routines.....
 953  ***************************************************/
 954
 955 /*
 956  * This routine unholds page table pages, and if the hold count
 957  * drops to zero, then it decrements the wire count.
 958  */
 959 static PMAP_INLINE int
 960 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
 961 {
 962
 963         --m->wire_count;
 964         if (m->wire_count == 0)
 965                 return _pmap_unwire_pte_hold(pmap, m);
 966         else
 967                 return 0;
 968 }
 969
 970 static int
 971 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
 972 {
 973         vm_offset_t pteva;
 974
 975         /*
 976          * unmap the page table page
 977          */
 978         pmap->pm_pdir[m->pindex] = 0;
 979         --pmap->pm_stats.resident_count;
 980
 981         /*
 982          * Do an invltlb to make the invalidated mapping
 983          * take effect immediately.
 984          */
 985         pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
 986         pmap_invalidate_page(pmap, pteva);
 987
 988         vm_page_free_zero(m);
 989         atomic_subtract_int(&cnt.v_wire_count, 1);
 990         return 1;
 991 }
 992
 993 /*
 994  * After removing a page table entry, this routine is used to
 995  * conditionally free the page, and manage the hold/wire counts.
 996  */
 997 static int
 998 pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
 999 {
1000         pd_entry_t ptepde;
1001         vm_page_t mpte;
1002
1003         if (va >= VM_MAXUSER_ADDRESS)
1004                 return 0;
1005         ptepde = *pmap_pde(pmap, va);
1006         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1007         return pmap_unwire_pte_hold(pmap, mpte);
1008 }
1009
1010 void
1011 pmap_pinit0(pmap)
1012         struct pmap *pmap;
1013 {
1014
1015         PMAP_LOCK_INIT(pmap);
1016         pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1017 #ifdef PAE
1018         pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1019 #endif
1020         pmap->pm_active = 0;
1021         PCPU_SET(curpmap, pmap);
1022         TAILQ_INIT(&pmap->pm_pvlist);
1023         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1024         mtx_lock_spin(&allpmaps_lock);
1025         LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1026         mtx_unlock_spin(&allpmaps_lock);
1027 }
1028
1029 /*
1030  * Initialize a preallocated and zeroed pmap structure,
1031  * such as one in a vmspace structure.
1032  */
1033 void
1034 pmap_pinit(pmap)
1035         register struct pmap *pmap;
1036 {
1037         vm_page_t m, ptdpg[NPGPTD];
1038         vm_paddr_t pa;
1039         static int color;
1040         int i;
1041
1042         PMAP_LOCK_INIT(pmap);
1043
1044         /*
1045          * No need to allocate page table space yet but we do need a valid
1046          * page directory table.
1047          */
1048         if (pmap->pm_pdir == NULL) {
1049                 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1050                     NBPTD);
1051 #ifdef PAE
1052                 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1053                 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1054                     ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1055                     ("pmap_pinit: pdpt misaligned"));
1056                 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1057                     ("pmap_pinit: pdpt above 4g"));
1058 #endif
1059         }
1060
1061         /*
1062          * allocate the page directory page(s)
1063          */
1064         for (i = 0; i < NPGPTD;) {
1065                 m = vm_page_alloc(NULL, color++,
1066                     VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1067                     VM_ALLOC_ZERO);
1068                 if (m == NULL)
1069                         VM_WAIT;
1070                 else {
1071                         ptdpg[i++] = m;
1072                 }
1073         }
1074
1075         pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1076
1077         for (i = 0; i < NPGPTD; i++) {
1078                 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1079                         bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1080         }
1081
1082         mtx_lock_spin(&allpmaps_lock);
1083         LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1084         mtx_unlock_spin(&allpmaps_lock);
1085         /* Wire in kernel global address entries. */
1086         /* XXX copies current process, does not fill in MPPTDI */
1087         bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1088 #ifdef SMP
1089         pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1090 #endif
1091
1092         /* install self-referential address mapping entry(s) */
1093         for (i = 0; i < NPGPTD; i++) {
1094                 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1095                 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1096 #ifdef PAE
1097                 pmap->pm_pdpt[i] = pa | PG_V;
1098 #endif
1099         }
1100
1101         pmap->pm_active = 0;
1102         TAILQ_INIT(&pmap->pm_pvlist);
1103         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1104 }
1105
1106 /*
1107  * this routine is called if the page table page is not
1108  * mapped correctly.
1109  */
1110 static vm_page_t
1111 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1112 {
1113         vm_paddr_t ptepa;
1114         vm_page_t m;
1115
1116         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1117             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1118             ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1119
1120         /*
1121          * Allocate a page table page.
1122          */
1123         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1124             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1125                 if (flags & M_WAITOK) {
1126                         PMAP_UNLOCK(pmap);
1127                         vm_page_unlock_queues();
1128                         VM_WAIT;
1129                         vm_page_lock_queues();
1130                         PMAP_LOCK(pmap);
1131                 }
1132
1133                 /*
1134                  * Indicate the need to retry.  While waiting, the page table
1135                  * page may have been allocated.
1136                  */
1137                 return (NULL);
1138         }
1139         if ((m->flags & PG_ZERO) == 0)
1140                 pmap_zero_page(m);
1141
1142         /*
1143          * Map the pagetable page into the process address space, if
1144          * it isn't already there.
1145          */
1146
1147         pmap->pm_stats.resident_count++;
1148
1149         ptepa = VM_PAGE_TO_PHYS(m);
1150         pmap->pm_pdir[ptepindex] =
1151                 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1152
1153         return m;
1154 }
1155
1156 static vm_page_t
1157 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1158 {
1159         unsigned ptepindex;
1160         pd_entry_t ptepa;
1161         vm_page_t m;
1162
1163         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1164             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1165             ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1166
1167         /*
1168          * Calculate pagetable page index
1169          */
1170         ptepindex = va >> PDRSHIFT;
1171 retry:
1172         /*
1173          * Get the page directory entry
1174          */
1175         ptepa = pmap->pm_pdir[ptepindex];
1176
1177         /*
1178          * This supports switching from a 4MB page to a
1179          * normal 4K page.
1180          */
1181         if (ptepa & PG_PS) {
1182                 pmap->pm_pdir[ptepindex] = 0;
1183                 ptepa = 0;
1184                 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1185                 pmap_invalidate_all(kernel_pmap);
1186         }
1187
1188         /*
1189          * If the page table page is mapped, we just increment the
1190          * hold count, and activate it.
1191          */
1192         if (ptepa) {
1193                 m = PHYS_TO_VM_PAGE(ptepa);
1194                 m->wire_count++;
1195         } else {
1196                 /*
1197                  * Here if the pte page isn't mapped, or if it has
1198                  * been deallocated.
1199                  */
1200                 m = _pmap_allocpte(pmap, ptepindex, flags);
1201                 if (m == NULL && (flags & M_WAITOK))
1202                         goto retry;
1203         }
1204         return (m);
1205 }
1206
1207
1208 /***************************************************
1209 * Pmap allocation/deallocation routines.
1210  ***************************************************/
1211
1212 #ifdef SMP
1213 /*
1214  * Deal with a SMP shootdown of other users of the pmap that we are
1215  * trying to dispose of.  This can be a bit hairy.
1216  */
1217 static u_int *lazymask;
1218 static u_int lazyptd;
1219 static volatile u_int lazywait;
1220
1221 void pmap_lazyfix_action(void);
1222
1223 void
1224 pmap_lazyfix_action(void)
1225 {
1226         u_int mymask = PCPU_GET(cpumask);
1227
1228 #ifdef COUNT_IPIS
1229         *ipi_lazypmap_counts[PCPU_GET(cpuid)]++;
1230 #endif
1231         if (rcr3() == lazyptd)
1232                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1233         atomic_clear_int(lazymask, mymask);
1234         atomic_store_rel_int(&lazywait, 1);
1235 }
1236
1237 static void
1238 pmap_lazyfix_self(u_int mymask)
1239 {
1240
1241         if (rcr3() == lazyptd)
1242                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1243         atomic_clear_int(lazymask, mymask);
1244 }
1245
1246
1247 static void
1248 pmap_lazyfix(pmap_t pmap)
1249 {
1250         u_int mymask;
1251         u_int mask;
1252         register u_int spins;
1253
1254         while ((mask = pmap->pm_active) != 0) {
1255                 spins = 50000000;
1256                 mask = mask & -mask;    /* Find least significant set bit */
1257                 mtx_lock_spin(&smp_ipi_mtx);
1258 #ifdef PAE
1259                 lazyptd = vtophys(pmap->pm_pdpt);
1260 #else
1261                 lazyptd = vtophys(pmap->pm_pdir);
1262 #endif
1263                 mymask = PCPU_GET(cpumask);
1264                 if (mask == mymask) {
1265                         lazymask = &pmap->pm_active;
1266                         pmap_lazyfix_self(mymask);
1267                 } else {
1268                         atomic_store_rel_int((u_int *)&lazymask,
1269                             (u_int)&pmap->pm_active);
1270                         atomic_store_rel_int(&lazywait, 0);
1271                         ipi_selected(mask, IPI_LAZYPMAP);
1272                         while (lazywait == 0) {
1273                                 ia32_pause();
1274                                 if (--spins == 0)
1275                                         break;
1276                         }
1277                 }
1278                 mtx_unlock_spin(&smp_ipi_mtx);
1279                 if (spins == 0)
1280                         printf("pmap_lazyfix: spun for 50000000\n");
1281         }
1282 }
1283
1284 #else   /* SMP */
1285
1286 /*
1287  * Cleaning up on uniprocessor is easy.  For various reasons, we're
1288  * unlikely to have to even execute this code, including the fact
1289  * that the cleanup is deferred until the parent does a wait(2), which
1290  * means that another userland process has run.
1291  */
1292 static void
1293 pmap_lazyfix(pmap_t pmap)
1294 {
1295         u_int cr3;
1296
1297         cr3 = vtophys(pmap->pm_pdir);
1298         if (cr3 == rcr3()) {
1299                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1300                 pmap->pm_active &= ~(PCPU_GET(cpumask));
1301         }
1302 }
1303 #endif  /* SMP */
1304
1305 /*
1306  * Release any resources held by the given physical map.
1307  * Called when a pmap initialized by pmap_pinit is being released.
1308  * Should only be called if the map contains no valid mappings.
1309  */
1310 void
1311 pmap_release(pmap_t pmap)
1312 {
1313         vm_page_t m, ptdpg[NPGPTD];
1314         int i;
1315
1316         KASSERT(pmap->pm_stats.resident_count == 0,
1317             ("pmap_release: pmap resident count %ld != 0",
1318             pmap->pm_stats.resident_count));
1319
1320         pmap_lazyfix(pmap);
1321         mtx_lock_spin(&allpmaps_lock);
1322         LIST_REMOVE(pmap, pm_list);
1323         mtx_unlock_spin(&allpmaps_lock);
1324
1325         for (i = 0; i < NPGPTD; i++)
1326                 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1327
1328         bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1329             sizeof(*pmap->pm_pdir));
1330 #ifdef SMP
1331         pmap->pm_pdir[MPPTDI] = 0;
1332 #endif
1333
1334         pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1335
1336         vm_page_lock_queues();
1337         for (i = 0; i < NPGPTD; i++) {
1338                 m = ptdpg[i];
1339 #ifdef PAE
1340                 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1341                     ("pmap_release: got wrong ptd page"));
1342 #endif
1343                 m->wire_count--;
1344                 atomic_subtract_int(&cnt.v_wire_count, 1);
1345                 vm_page_free_zero(m);
1346         }
1347         vm_page_unlock_queues();
1348         PMAP_LOCK_DESTROY(pmap);
1349 }
1350 \f
1351 static int
1352 kvm_size(SYSCTL_HANDLER_ARGS)
1353 {
1354         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1355
1356         return sysctl_handle_long(oidp, &ksize, 0, req);
1357 }
1358 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1359     0, 0, kvm_size, "IU", "Size of KVM");
1360
1361 static int
1362 kvm_free(SYSCTL_HANDLER_ARGS)
1363 {
1364         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1365
1366         return sysctl_handle_long(oidp, &kfree, 0, req);
1367 }
1368 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1369     0, 0, kvm_free, "IU", "Amount of KVM free");
1370
1371 /*
1372  * grow the number of kernel page table entries, if needed
1373  */
1374 void
1375 pmap_growkernel(vm_offset_t addr)
1376 {
1377         struct pmap *pmap;
1378         vm_paddr_t ptppaddr;
1379         vm_page_t nkpg;
1380         pd_entry_t newpdir;
1381         pt_entry_t *pde;
1382
1383         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1384         if (kernel_vm_end == 0) {
1385                 kernel_vm_end = KERNBASE;
1386                 nkpt = 0;
1387                 while (pdir_pde(PTD, kernel_vm_end)) {
1388                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1389                         nkpt++;
1390                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1391                                 kernel_vm_end = kernel_map->max_offset;
1392                                 break;
1393                         }
1394                 }
1395         }
1396         addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1397         if (addr - 1 >= kernel_map->max_offset)
1398                 addr = kernel_map->max_offset;
1399         while (kernel_vm_end < addr) {
1400                 if (pdir_pde(PTD, kernel_vm_end)) {
1401                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1402                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1403                                 kernel_vm_end = kernel_map->max_offset;
1404                                 break;
1405                         }
1406                         continue;
1407                 }
1408
1409                 /*
1410                  * This index is bogus, but out of the way
1411                  */
1412                 nkpg = vm_page_alloc(NULL, nkpt,
1413                     VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1414                 if (!nkpg)
1415                         panic("pmap_growkernel: no memory to grow kernel");
1416
1417                 nkpt++;
1418
1419                 pmap_zero_page(nkpg);
1420                 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1421                 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1422                 pdir_pde(PTD, kernel_vm_end) = newpdir;
1423
1424                 mtx_lock_spin(&allpmaps_lock);
1425                 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1426                         pde = pmap_pde(pmap, kernel_vm_end);
1427                         pde_store(pde, newpdir);
1428                 }
1429                 mtx_unlock_spin(&allpmaps_lock);
1430                 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1431                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1432                         kernel_vm_end = kernel_map->max_offset;
1433                         break;
1434                 }
1435         }
1436 }
1437
1438
1439 /***************************************************
1440  * page management routines.
1441  ***************************************************/
1442
1443 /*
1444  * free the pv_entry back to the free list
1445  */
1446 static PMAP_INLINE void
1447 free_pv_entry(pv_entry_t pv)
1448 {
1449         pv_entry_count--;
1450         uma_zfree(pvzone, pv);
1451 }
1452
1453 /*
1454  * get a new pv_entry, allocating a block from the system
1455  * when needed.
1456  */
1457 static pv_entry_t
1458 get_pv_entry(pmap_t locked_pmap)
1459 {
1460         static const struct timeval printinterval = { 60, 0 };
1461         static struct timeval lastprint;
1462         struct vpgqueues *vpq;
1463         pmap_t pmap;
1464         pt_entry_t *pte, tpte;
1465         pv_entry_t allocated_pv, next_pv, pv;
1466         vm_offset_t va;
1467         vm_page_t m;
1468
1469         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1470         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1471         allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
1472         if (allocated_pv != NULL) {
1473                 pv_entry_count++;
1474                 if (pv_entry_count > pv_entry_high_water)
1475                         pagedaemon_wakeup();
1476                 else
1477                         return (allocated_pv);
1478         }
1479
1480         /*
1481          * Reclaim pv entries: At first, destroy mappings to inactive
1482          * pages.  After that, if a pv entry is still needed, destroy
1483          * mappings to active pages.
1484          */
1485         if (ratecheck(&lastprint, &printinterval))
1486                 printf("Approaching the limit on PV entries, "
1487                     "increase the vm.pmap.shpgperproc tunable.\n");
1488         vpq = &vm_page_queues[PQ_INACTIVE];
1489 retry:
1490         sched_pin();
1491         TAILQ_FOREACH(m, &vpq->pl, pageq) {
1492                 if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1493                         continue;
1494                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1495                         va = pv->pv_va;
1496                         pmap = pv->pv_pmap;
1497                         /* Avoid deadlock and lock recursion. */
1498                         if (pmap > locked_pmap)
1499                                 PMAP_LOCK(pmap);
1500                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1501                                 continue;
1502                         pmap->pm_stats.resident_count--;
1503                         pte = pmap_pte_quick(pmap, va);
1504                         tpte = pte_load_clear(pte);
1505                         KASSERT((tpte & PG_W) == 0,
1506                             ("get_pv_entry: wired pte %#jx", (uintmax_t)tpte));
1507                         if (tpte & PG_A)
1508                                 vm_page_flag_set(m, PG_REFERENCED);
1509                         if (tpte & PG_M) {
1510                                 KASSERT((tpte & PG_RW),
1511         ("get_pv_entry: modified page not writable: va: %#x, pte: %#jx",
1512                                     va, (uintmax_t)tpte));
1513                                 if (pmap_track_modified(va))
1514                                         vm_page_dirty(m);
1515                         }
1516                         pmap_invalidate_page(pmap, va);
1517                         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1518                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1519                         if (TAILQ_EMPTY(&m->md.pv_list))
1520                                 vm_page_flag_clear(m, PG_WRITEABLE);
1521                         m->md.pv_list_count--;
1522                         pmap_unuse_pt(pmap, va);
1523                         if (pmap != locked_pmap)
1524                                 PMAP_UNLOCK(pmap);
1525                         if (allocated_pv == NULL)
1526                                 allocated_pv = pv;
1527                         else
1528                                 free_pv_entry(pv);
1529                 }
1530         }
1531         sched_unpin();
1532         if (allocated_pv == NULL) {
1533                 if (vpq == &vm_page_queues[PQ_INACTIVE]) {
1534                         vpq = &vm_page_queues[PQ_ACTIVE];
1535                         goto retry;
1536                 }
1537                 panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
1538         }
1539         return (allocated_pv);
1540 }
1541
1542 static void
1543 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1544 {
1545         pv_entry_t pv;
1546
1547         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1548         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1549         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1550                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1551                         if (pmap == pv->pv_pmap && va == pv->pv_va)
1552                                 break;
1553                 }
1554         } else {
1555                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1556                         if (va == pv->pv_va)
1557                                 break;
1558                 }
1559         }
1560         KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1561         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1562         m->md.pv_list_count--;
1563         if (TAILQ_EMPTY(&m->md.pv_list))
1564                 vm_page_flag_clear(m, PG_WRITEABLE);
1565         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1566         free_pv_entry(pv);
1567 }
1568
1569 /*
1570  * Create a pv entry for page at pa for
1571  * (pmap, va).
1572  */
1573 static void
1574 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1575 {
1576         pv_entry_t pv;
1577
1578         pv = get_pv_entry(pmap);
1579         pv->pv_va = va;
1580         pv->pv_pmap = pmap;
1581
1582         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1583         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1584         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1585         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1586         m->md.pv_list_count++;
1587 }
1588
1589 /*
1590  * pmap_remove_pte: do the things to unmap a page in a process
1591  */
1592 static int
1593 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1594 {
1595         pt_entry_t oldpte;
1596         vm_page_t m;
1597
1598         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1599         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1600         oldpte = pte_load_clear(ptq);
1601         if (oldpte & PG_W)
1602                 pmap->pm_stats.wired_count -= 1;
1603         /*
1604          * Machines that don't support invlpg, also don't support
1605          * PG_G.
1606          */
1607         if (oldpte & PG_G)
1608                 pmap_invalidate_page(kernel_pmap, va);
1609         pmap->pm_stats.resident_count -= 1;
1610         if (oldpte & PG_MANAGED) {
1611                 m = PHYS_TO_VM_PAGE(oldpte);
1612                 if (oldpte & PG_M) {
1613                         KASSERT((oldpte & PG_RW),
1614         ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
1615                             va, (uintmax_t)oldpte));
1616                         if (pmap_track_modified(va))
1617                                 vm_page_dirty(m);
1618                 }
1619                 if (oldpte & PG_A)
1620                         vm_page_flag_set(m, PG_REFERENCED);
1621                 pmap_remove_entry(pmap, m, va);
1622         }
1623         return (pmap_unuse_pt(pmap, va));
1624 }
1625
1626 /*
1627  * Remove a single page from a process address space
1628  */
1629 static void
1630 pmap_remove_page(pmap_t pmap, vm_offset_t va)
1631 {
1632         pt_entry_t *pte;
1633
1634         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1635         KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
1636         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1637         if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
1638                 return;
1639         pmap_remove_pte(pmap, pte, va);
1640         pmap_invalidate_page(pmap, va);
1641 }
1642
1643 /*
1644  *      Remove the given range of addresses from the specified map.
1645  *
1646  *      It is assumed that the start and end are properly
1647  *      rounded to the page size.
1648  */
1649 void
1650 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1651 {
1652         vm_offset_t pdnxt;
1653         pd_entry_t ptpaddr;
1654         pt_entry_t *pte;
1655         int anyvalid;
1656
1657         /*
1658          * Perform an unsynchronized read.  This is, however, safe.
1659          */
1660         if (pmap->pm_stats.resident_count == 0)
1661                 return;
1662
1663         anyvalid = 0;
1664
1665         vm_page_lock_queues();
1666         sched_pin();
1667         PMAP_LOCK(pmap);
1668
1669         /*
1670          * special handling of removing one page.  a very
1671          * common operation and easy to short circuit some
1672          * code.
1673          */
1674         if ((sva + PAGE_SIZE == eva) &&
1675             ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
1676                 pmap_remove_page(pmap, sva);
1677                 goto out;
1678         }
1679
1680         for (; sva < eva; sva = pdnxt) {
1681                 unsigned pdirindex;
1682
1683                 /*
1684                  * Calculate index for next page table.
1685                  */
1686                 pdnxt = (sva + NBPDR) & ~PDRMASK;
1687                 if (pmap->pm_stats.resident_count == 0)
1688                         break;
1689
1690                 pdirindex = sva >> PDRSHIFT;
1691                 ptpaddr = pmap->pm_pdir[pdirindex];
1692
1693                 /*
1694                  * Weed out invalid mappings. Note: we assume that the page
1695                  * directory table is always allocated, and in kernel virtual.
1696                  */
1697                 if (ptpaddr == 0)
1698                         continue;
1699
1700                 /*
1701                  * Check for large page.
1702                  */
1703                 if ((ptpaddr & PG_PS) != 0) {
1704                         pmap->pm_pdir[pdirindex] = 0;
1705                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1706                         anyvalid = 1;
1707                         continue;
1708                 }
1709
1710                 /*
1711                  * Limit our scan to either the end of the va represented
1712                  * by the current page table page, or to the end of the
1713                  * range being removed.
1714                  */
1715                 if (pdnxt > eva)
1716                         pdnxt = eva;
1717
1718                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1719                     sva += PAGE_SIZE) {
1720                         if (*pte == 0)
1721                                 continue;
1722
1723                         /*
1724                          * The TLB entry for a PG_G mapping is invalidated
1725                          * by pmap_remove_pte().
1726                          */
1727                         if ((*pte & PG_G) == 0)
1728                                 anyvalid = 1;
1729                         if (pmap_remove_pte(pmap, pte, sva))
1730                                 break;
1731                 }
1732         }
1733 out:
1734         sched_unpin();
1735         vm_page_unlock_queues();
1736         if (anyvalid)
1737                 pmap_invalidate_all(pmap);
1738         PMAP_UNLOCK(pmap);
1739 }
1740
1741 /*
1742  *      Routine:        pmap_remove_all
1743  *      Function:
1744  *              Removes this physical page from
1745  *              all physical maps in which it resides.
1746  *              Reflects back modify bits to the pager.
1747  *
1748  *      Notes:
1749  *              Original versions of this routine were very
1750  *              inefficient because they iteratively called
1751  *              pmap_remove (slow...)
1752  */
1753
1754 void
1755 pmap_remove_all(vm_page_t m)
1756 {
1757         register pv_entry_t pv;
1758         pt_entry_t *pte, tpte;
1759
1760 #if defined(PMAP_DIAGNOSTIC)
1761         /*
1762          * XXX This makes pmap_remove_all() illegal for non-managed pages!
1763          */
1764         if (m->flags & PG_FICTITIOUS) {
1765                 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
1766                     VM_PAGE_TO_PHYS(m));
1767         }
1768 #endif
1769         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1770         sched_pin();
1771         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1772                 PMAP_LOCK(pv->pv_pmap);
1773                 pv->pv_pmap->pm_stats.resident_count--;
1774                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
1775                 tpte = pte_load_clear(pte);
1776                 if (tpte & PG_W)
1777                         pv->pv_pmap->pm_stats.wired_count--;
1778                 if (tpte & PG_A)
1779                         vm_page_flag_set(m, PG_REFERENCED);
1780
1781                 /*
1782                  * Update the vm_page_t clean and reference bits.
1783                  */
1784                 if (tpte & PG_M) {
1785                         KASSERT((tpte & PG_RW),
1786         ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
1787                             pv->pv_va, (uintmax_t)tpte));
1788                         if (pmap_track_modified(pv->pv_va))
1789                                 vm_page_dirty(m);
1790                 }
1791                 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
1792                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1793                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1794                 m->md.pv_list_count--;
1795                 pmap_unuse_pt(pv->pv_pmap, pv->pv_va);
1796                 PMAP_UNLOCK(pv->pv_pmap);
1797                 free_pv_entry(pv);
1798         }
1799         vm_page_flag_clear(m, PG_WRITEABLE);
1800         sched_unpin();
1801 }
1802
1803 /*
1804  *      Set the physical protection on the
1805  *      specified range of this map as requested.
1806  */
1807 void
1808 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1809 {
1810         vm_offset_t pdnxt;
1811         pd_entry_t ptpaddr;
1812         pt_entry_t *pte;
1813         int anychanged;
1814
1815         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1816                 pmap_remove(pmap, sva, eva);
1817                 return;
1818         }
1819
1820         if (prot & VM_PROT_WRITE)
1821                 return;
1822
1823         anychanged = 0;
1824
1825         vm_page_lock_queues();
1826         sched_pin();
1827         PMAP_LOCK(pmap);
1828         for (; sva < eva; sva = pdnxt) {
1829                 unsigned obits, pbits, pdirindex;
1830
1831                 pdnxt = (sva + NBPDR) & ~PDRMASK;
1832
1833                 pdirindex = sva >> PDRSHIFT;
1834                 ptpaddr = pmap->pm_pdir[pdirindex];
1835
1836                 /*
1837                  * Weed out invalid mappings. Note: we assume that the page
1838                  * directory table is always allocated, and in kernel virtual.
1839                  */
1840                 if (ptpaddr == 0)
1841                         continue;
1842
1843                 /*
1844                  * Check for large page.
1845                  */
1846                 if ((ptpaddr & PG_PS) != 0) {
1847                         pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
1848                         anychanged = 1;
1849                         continue;
1850                 }
1851
1852                 if (pdnxt > eva)
1853                         pdnxt = eva;
1854
1855                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
1856                     sva += PAGE_SIZE) {
1857                         vm_page_t m;
1858
1859 retry:
1860                         /*
1861                          * Regardless of whether a pte is 32 or 64 bits in
1862                          * size, PG_RW, PG_A, and PG_M are among the least
1863                          * significant 32 bits.
1864                          */
1865                         obits = pbits = *(u_int *)pte;
1866                         if (pbits & PG_MANAGED) {
1867                                 m = NULL;
1868                                 if (pbits & PG_A) {
1869                                         m = PHYS_TO_VM_PAGE(*pte);
1870                                         vm_page_flag_set(m, PG_REFERENCED);
1871                                         pbits &= ~PG_A;
1872                                 }
1873                                 if ((pbits & PG_M) != 0 &&
1874                                     pmap_track_modified(sva)) {
1875                                         if (m == NULL)
1876                                                 m = PHYS_TO_VM_PAGE(*pte);
1877                                         vm_page_dirty(m);
1878                                 }
1879                         }
1880
1881                         pbits &= ~(PG_RW | PG_M);
1882
1883                         if (pbits != obits) {
1884                                 if (!atomic_cmpset_int((u_int *)pte, obits,
1885                                     pbits))
1886                                         goto retry;
1887                                 if (obits & PG_G)
1888                                         pmap_invalidate_page(pmap, sva);
1889                                 else
1890                                         anychanged = 1;
1891                         }
1892                 }
1893         }
1894         sched_unpin();
1895         vm_page_unlock_queues();
1896         if (anychanged)
1897                 pmap_invalidate_all(pmap);
1898         PMAP_UNLOCK(pmap);
1899 }
1900
/*
 *	Insert the given physical page (p) at
 *	the specified virtual address (v) in the
 *	target physical map with the protection requested.
 *
 *	If specified, the page will be wired down, meaning
 *	that the related pte can not be reclaimed.
 *
 *	NB:  This is the only routine which MAY NOT lazy-evaluate
 *	or lose information.  That is, this routine must actually
 *	insert this page into the given map NOW.
 *
 *	Parameters:
 *		pmap	target physical map
 *		va	virtual address to map (truncated to a page
 *			boundary with PG_FRAME)
 *		m	page to map at va
 *		prot	requested protection; only VM_PROT_WRITE is
 *			examined here (it selects PG_RW)
 *		wired	whether the mapping is to be marked PG_W
 *
 *	The page queues lock and the pmap lock are taken here and are
 *	held until the end of the routine.
 */
void
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	   boolean_t wired)
{
	vm_paddr_t pa;
	register pt_entry_t *pte;
	vm_paddr_t opa;
	pt_entry_t origpte, newpte;
	vm_page_t mpte, om;
	boolean_t invlva;

	va &= PG_FRAME;
#ifdef PMAP_DIAGNOSTIC
	if (va > VM_MAX_KERNEL_ADDRESS)
		panic("pmap_enter: toobig");
	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
#endif

	mpte = NULL;

	vm_page_lock_queues();
	PMAP_LOCK(pmap);
	sched_pin();

	/*
	 * In the case that a page table page is not
	 * resident, we are creating it here.
	 */
	if (va < VM_MAXUSER_ADDRESS) {
		mpte = pmap_allocpte(pmap, va, M_WAITOK);
	}
#if 0 && defined(PMAP_DIAGNOSTIC)
	else {
		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
		origpte = *pdeaddr;
		if ((origpte & PG_V) == 0) {
			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
				pmap->pm_pdir[PTDPTDI], origpte, va);
		}
	}
#endif

	pte = pmap_pte_quick(pmap, va);

	/*
	 * No pte could be located for va, i.e. the page directory entry
	 * is not valid.  That is fatal at this point.
	 */
	if (pte == NULL) {
		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
	}

	pa = VM_PAGE_TO_PHYS(m);
	om = NULL;
	origpte = *pte;
	opa = origpte & PG_FRAME;

	if (origpte & PG_PS) {
		/*
		 * Yes, I know this will truncate upper address bits for PAE,
		 * but I'm actually more interested in the lower bits
		 */
		printf("pmap_enter: va %p, pte %p, origpte %p\n",
		    (void *)va, (void *)pte, (void *)(uintptr_t)origpte);
		panic("pmap_enter: attempted pmap_enter on 4MB page");
	}

	/*
	 * Mapping has not changed, must be protection or wiring change.
	 */
	if (origpte && (opa == pa)) {
		/*
		 * Wiring change, just update stats. We don't worry about
		 * wiring PT pages as they remain resident as long as there
		 * are valid mappings in them. Hence, if a user page is wired,
		 * the PT page will be also.
		 */
		if (wired && ((origpte & PG_W) == 0))
			pmap->pm_stats.wired_count++;
		else if (!wired && (origpte & PG_W))
			pmap->pm_stats.wired_count--;

		/*
		 * Remove extra pte reference
		 */
		if (mpte)
			mpte->wire_count--;

		/*
		 * We might be turning off write access to the page,
		 * so we go ahead and sense modify status.
		 */
		if (origpte & PG_MANAGED) {
			/* Same page: the "old" page is m itself. */
			om = m;
			pa |= PG_MANAGED;
		}
		goto validate;
	}
	/*
	 * Mapping has changed, invalidate old range and fall through to
	 * handle validating new mapping.
	 */
	if (opa) {
		if (origpte & PG_W)
			pmap->pm_stats.wired_count--;
		if (origpte & PG_MANAGED) {
			om = PHYS_TO_VM_PAGE(opa);
			pmap_remove_entry(pmap, om, va);
		}
		if (mpte != NULL) {
			/*
			 * The slot was already occupied, so drop the extra
			 * page table page reference taken above.
			 */
			mpte->wire_count--;
			KASSERT(mpte->wire_count > 0,
			    ("pmap_enter: missing reference to page table page,"
			     " va: 0x%x", va));
		}
	} else
		pmap->pm_stats.resident_count++;

	/*
	 * Enter on the PV list if part of our managed memory.
	 */
	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
		pmap_insert_entry(pmap, va, m);
		pa |= PG_MANAGED;
	}

	/*
	 * Increment counters
	 */
	if (wired)
		pmap->pm_stats.wired_count++;

validate:
	/*
	 * Now validate mapping with desired protection/wiring.
	 */
	newpte = (pt_entry_t)(pa | PG_V);
	if ((prot & VM_PROT_WRITE) != 0)
		newpte |= PG_RW;
	if (wired)
		newpte |= PG_W;
	if (va < VM_MAXUSER_ADDRESS)
		newpte |= PG_U;
	if (pmap == kernel_pmap)
		newpte |= pgeflag;

	/*
	 * if the mapping or permission bits are different, we need
	 * to update the pte.
	 */
	if ((origpte & ~(PG_M|PG_A)) != newpte) {
		if (origpte & PG_V) {
			invlva = FALSE;
			/*
			 * Replace the pte and re-sample the old value so
			 * that the PG_A/PG_M tests below see its state at
			 * the moment of replacement.
			 */
			origpte = pte_load_store(pte, newpte | PG_A);
			if (origpte & PG_A) {
				if (origpte & PG_MANAGED)
					vm_page_flag_set(om, PG_REFERENCED);
				/* A different frame was mapped and used. */
				if (opa != VM_PAGE_TO_PHYS(m))
					invlva = TRUE;
			}
			if (origpte & PG_M) {
				KASSERT((origpte & PG_RW),
	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
				    va, (uintmax_t)origpte));
				if ((origpte & PG_MANAGED) &&
				    pmap_track_modified(va))
					vm_page_dirty(om);
				/* Write access is being taken away. */
				if ((prot & VM_PROT_WRITE) == 0)
					invlva = TRUE;
			}
			if (invlva)
				pmap_invalidate_page(pmap, va);
		} else
			pte_store(pte, newpte | PG_A);
	}
	sched_unpin();
	vm_page_unlock_queues();
	PMAP_UNLOCK(pmap);
}
2093
2094 /*
2095  * this code makes some *MAJOR* assumptions:
2096  * 1. Current pmap & pmap exists.
2097  * 2. Not wired.
2098  * 3. Read access.
2099  * 4. No page table pages.
2100  * but is *MUCH* faster than pmap_enter...
2101  */
2102
2103 vm_page_t
2104 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2105     vm_page_t mpte)
2106 {
2107         pt_entry_t *pte;
2108         vm_paddr_t pa;
2109
2110         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2111         VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
2112         PMAP_LOCK(pmap);
2113
2114         /*
2115          * In the case that a page table page is not
2116          * resident, we are creating it here.
2117          */
2118         if (va < VM_MAXUSER_ADDRESS) {
2119                 unsigned ptepindex;
2120                 pd_entry_t ptepa;
2121
2122                 /*
2123                  * Calculate pagetable page index
2124                  */
2125                 ptepindex = va >> PDRSHIFT;
2126                 if (mpte && (mpte->pindex == ptepindex)) {
2127                         mpte->wire_count++;
2128                 } else {
2129 retry:
2130                         /*
2131                          * Get the page directory entry
2132                          */
2133                         ptepa = pmap->pm_pdir[ptepindex];
2134
2135                         /*
2136                          * If the page table page is mapped, we just increment
2137                          * the hold count, and activate it.
2138                          */
2139                         if (ptepa) {
2140                                 if (ptepa & PG_PS)
2141                                         panic("pmap_enter_quick: unexpected mapping into 4MB page");
2142                                 mpte = PHYS_TO_VM_PAGE(ptepa);
2143                                 mpte->wire_count++;
2144                         } else {
2145                                 mpte = _pmap_allocpte(pmap, ptepindex,
2146                                     M_NOWAIT);
2147                                 if (mpte == NULL) {
2148                                         PMAP_UNLOCK(pmap);
2149                                         vm_page_busy(m);
2150                                         vm_page_unlock_queues();
2151                                         VM_OBJECT_UNLOCK(m->object);
2152                                         VM_WAIT;
2153                                         VM_OBJECT_LOCK(m->object);
2154                                         vm_page_lock_queues();
2155                                         vm_page_wakeup(m);
2156                                         PMAP_LOCK(pmap);
2157                                         goto retry;
2158                                 }
2159                         }
2160                 }
2161         } else {
2162                 mpte = NULL;
2163         }
2164
2165         /*
2166          * This call to vtopte makes the assumption that we are
2167          * entering the page into the current pmap.  In order to support
2168          * quick entry into any pmap, one would likely use pmap_pte_quick.
2169          * But that isn't as quick as vtopte.
2170          */
2171         pte = vtopte(va);
2172         if (*pte) {
2173                 if (mpte != NULL) {
2174                         pmap_unwire_pte_hold(pmap, mpte);
2175                         mpte = NULL;
2176                 }
2177                 goto out;
2178         }
2179
2180         /*
2181          * Enter on the PV list if part of our managed memory. Note that we
2182          * raise IPL while manipulating pv_table since pmap_enter can be
2183          * called at interrupt time.
2184          */
2185         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0)
2186                 pmap_insert_entry(pmap, va, m);
2187
2188         /*
2189          * Increment counters
2190          */
2191         pmap->pm_stats.resident_count++;
2192
2193         pa = VM_PAGE_TO_PHYS(m);
2194
2195         /*
2196          * Now validate mapping with RO protection
2197          */
2198         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2199                 pte_store(pte, pa | PG_V | PG_U);
2200         else
2201                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2202 out:
2203         PMAP_UNLOCK(pmap);
2204         return mpte;
2205 }
2206
2207 /*
2208  * Make a temporary mapping for a physical address.  This is only intended
2209  * to be used for panic dumps.
2210  */
2211 void *
2212 pmap_kenter_temporary(vm_paddr_t pa, int i)
2213 {
2214         vm_offset_t va;
2215
2216         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2217         pmap_kenter(va, pa);
2218         invlpg(va);
2219         return ((void *)crashdumpmap);
2220 }
2221
2222 /*
2223  * This code maps large physical mmap regions into the
2224  * processor address space.  Note that some shortcuts
2225  * are taken, but the code works.
2226  */
2227 void
2228 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2229                     vm_object_t object, vm_pindex_t pindex,
2230                     vm_size_t size)
2231 {
2232         vm_page_t p;
2233
2234         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2235         KASSERT(object->type == OBJT_DEVICE,
2236             ("pmap_object_init_pt: non-device object"));
2237         if (pseflag &&
2238             ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2239                 int i;
2240                 vm_page_t m[1];
2241                 unsigned int ptepindex;
2242                 int npdes;
2243                 pd_entry_t ptepa;
2244
2245                 PMAP_LOCK(pmap);
2246                 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2247                         goto out;
2248                 PMAP_UNLOCK(pmap);
2249 retry:
2250                 p = vm_page_lookup(object, pindex);
2251                 if (p != NULL) {
2252                         vm_page_lock_queues();
2253                         if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2254                                 goto retry;
2255                 } else {
2256                         p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2257                         if (p == NULL)
2258                                 return;
2259                         m[0] = p;
2260
2261                         if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2262                                 vm_page_lock_queues();
2263                                 vm_page_free(p);
2264                                 vm_page_unlock_queues();
2265                                 return;
2266                         }
2267
2268                         p = vm_page_lookup(object, pindex);
2269                         vm_page_lock_queues();
2270                         vm_page_wakeup(p);
2271                 }
2272                 vm_page_unlock_queues();
2273
2274                 ptepa = VM_PAGE_TO_PHYS(p);
2275                 if (ptepa & (NBPDR - 1))
2276                         return;
2277
2278                 p->valid = VM_PAGE_BITS_ALL;
2279
2280                 PMAP_LOCK(pmap);
2281                 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2282                 npdes = size >> PDRSHIFT;
2283                 for(i = 0; i < npdes; i++) {
2284                         pde_store(&pmap->pm_pdir[ptepindex],
2285                             ptepa | PG_U | PG_RW | PG_V | PG_PS);
2286                         ptepa += NBPDR;
2287                         ptepindex += 1;
2288                 }
2289                 pmap_invalidate_all(pmap);
2290 out:
2291                 PMAP_UNLOCK(pmap);
2292         }
2293 }
2294
2295 /*
2296  *      Routine:        pmap_change_wiring
2297  *      Function:       Change the wiring attribute for a map/virtual-address
2298  *                      pair.
2299  *      In/out conditions:
2300  *                      The mapping must already exist in the pmap.
2301  */
2302 void
2303 pmap_change_wiring(pmap, va, wired)
2304         register pmap_t pmap;
2305         vm_offset_t va;
2306         boolean_t wired;
2307 {
2308         register pt_entry_t *pte;
2309
2310         PMAP_LOCK(pmap);
2311         pte = pmap_pte(pmap, va);
2312
2313         if (wired && !pmap_pte_w(pte))
2314                 pmap->pm_stats.wired_count++;
2315         else if (!wired && pmap_pte_w(pte))
2316                 pmap->pm_stats.wired_count--;
2317
2318         /*
2319          * Wiring is not a hardware characteristic so there is no need to
2320          * invalidate TLB.
2321          */
2322         pmap_pte_set_w(pte, wired);
2323         pmap_pte_release(pte);
2324         PMAP_UNLOCK(pmap);
2325 }
2326
2327
2328
2329 /*
2330  *      Copy the range specified by src_addr/len
2331  *      from the source map to the range dst_addr/len
2332  *      in the destination map.
2333  *
2334  *      This routine is only advisory and need not do anything.
2335  */
2336
2337 void
2338 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2339           vm_offset_t src_addr)
2340 {
2341         vm_offset_t addr;
2342         vm_offset_t end_addr = src_addr + len;
2343         vm_offset_t pdnxt;
2344         vm_page_t m;
2345
2346         if (dst_addr != src_addr)
2347                 return;
2348
2349         if (!pmap_is_current(src_pmap))
2350                 return;
2351
2352         vm_page_lock_queues();
2353         if (dst_pmap < src_pmap) {
2354                 PMAP_LOCK(dst_pmap);
2355                 PMAP_LOCK(src_pmap);
2356         } else {
2357                 PMAP_LOCK(src_pmap);
2358                 PMAP_LOCK(dst_pmap);
2359         }
2360         sched_pin();
2361         for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2362                 pt_entry_t *src_pte, *dst_pte;
2363                 vm_page_t dstmpte, srcmpte;
2364                 pd_entry_t srcptepaddr;
2365                 unsigned ptepindex;
2366
2367                 if (addr >= UPT_MIN_ADDRESS)
2368                         panic("pmap_copy: invalid to pmap_copy page tables");
2369
2370                 /*
2371                  * Don't let optional prefaulting of pages make us go
2372                  * way below the low water mark of free pages or way
2373                  * above high water mark of used pv entries.
2374                  */
2375                 if (cnt.v_free_count < cnt.v_free_reserved ||
2376                     pv_entry_count > pv_entry_high_water)
2377                         break;
2378
2379                 pdnxt = (addr + NBPDR) & ~PDRMASK;
2380                 ptepindex = addr >> PDRSHIFT;
2381
2382                 srcptepaddr = src_pmap->pm_pdir[ptepindex];
2383                 if (srcptepaddr == 0)
2384                         continue;
2385
2386                 if (srcptepaddr & PG_PS) {
2387                         if (dst_pmap->pm_pdir[ptepindex] == 0) {
2388                                 dst_pmap->pm_pdir[ptepindex] = srcptepaddr;
2389                                 dst_pmap->pm_stats.resident_count +=
2390                                     NBPDR / PAGE_SIZE;
2391                         }
2392                         continue;
2393                 }
2394
2395                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2396                 if (srcmpte->wire_count == 0)
2397                         panic("pmap_copy: source page table page is unused");
2398
2399                 if (pdnxt > end_addr)
2400                         pdnxt = end_addr;
2401
2402                 src_pte = vtopte(addr);
2403                 while (addr < pdnxt) {
2404                         pt_entry_t ptetemp;
2405                         ptetemp = *src_pte;
2406                         /*
2407                          * we only virtual copy managed pages
2408                          */
2409                         if ((ptetemp & PG_MANAGED) != 0) {
2410                                 /*
2411                                  * We have to check after allocpte for the
2412                                  * pte still being around...  allocpte can
2413                                  * block.
2414                                  */
2415                                 dstmpte = pmap_allocpte(dst_pmap, addr,
2416                                     M_NOWAIT);
2417                                 if (dstmpte == NULL)
2418                                         break;
2419                                 dst_pte = pmap_pte_quick(dst_pmap, addr);
2420                                 if (*dst_pte == 0) {
2421                                         /*
2422                                          * Clear the modified and
2423                                          * accessed (referenced) bits
2424                                          * during the copy.
2425                                          */
2426                                         m = PHYS_TO_VM_PAGE(ptetemp);
2427                                         *dst_pte = ptetemp & ~(PG_M | PG_A);
2428                                         dst_pmap->pm_stats.resident_count++;
2429                                         pmap_insert_entry(dst_pmap, addr, m);
2430                                 } else
2431                                         pmap_unwire_pte_hold(dst_pmap, dstmpte);
2432                                 if (dstmpte->wire_count >= srcmpte->wire_count)
2433                                         break;
2434                         }
2435                         addr += PAGE_SIZE;
2436                         src_pte++;
2437                 }
2438         }
2439         sched_unpin();
2440         vm_page_unlock_queues();
2441         PMAP_UNLOCK(src_pmap);
2442         PMAP_UNLOCK(dst_pmap);
2443 }
2444
2445 static __inline void
2446 pagezero(void *page)
2447 {
2448 #if defined(I686_CPU)
2449         if (cpu_class == CPUCLASS_686) {
2450 #if defined(CPU_ENABLE_SSE)
2451                 if (cpu_feature & CPUID_SSE2)
2452                         sse2_pagezero(page);
2453                 else
2454 #endif
2455                         i686_pagezero(page);
2456         } else
2457 #endif
2458                 bzero(page, PAGE_SIZE);
2459 }
2460
2461 /*
2462  *      pmap_zero_page zeros the specified hardware page by mapping
2463  *      the page into KVM and using bzero to clear its contents.
2464  */
2465 void
2466 pmap_zero_page(vm_page_t m)
2467 {
2468         struct sysmaps *sysmaps;
2469
2470         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2471         mtx_lock(&sysmaps->lock);
2472         if (*sysmaps->CMAP2)
2473                 panic("pmap_zero_page: CMAP2 busy");
2474         sched_pin();
2475         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2476         invlcaddr(sysmaps->CADDR2);
2477         pagezero(sysmaps->CADDR2);
2478         *sysmaps->CMAP2 = 0;
2479         sched_unpin();
2480         mtx_unlock(&sysmaps->lock);
2481 }
2482
2483 /*
2484  *      pmap_zero_page_area zeros the specified hardware page by mapping
2485  *      the page into KVM and using bzero to clear its contents.
2486  *
2487  *      off and size may not cover an area beyond a single hardware page.
2488  */
2489 void
2490 pmap_zero_page_area(vm_page_t m, int off, int size)
2491 {
2492         struct sysmaps *sysmaps;
2493
2494         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2495         mtx_lock(&sysmaps->lock);
2496         if (*sysmaps->CMAP2)
2497                 panic("pmap_zero_page: CMAP2 busy");
2498         sched_pin();
2499         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2500         invlcaddr(sysmaps->CADDR2);
2501         if (off == 0 && size == PAGE_SIZE)
2502                 pagezero(sysmaps->CADDR2);
2503         else
2504                 bzero((char *)sysmaps->CADDR2 + off, size);
2505         *sysmaps->CMAP2 = 0;
2506         sched_unpin();
2507         mtx_unlock(&sysmaps->lock);
2508 }
2509
2510 /*
2511  *      pmap_zero_page_idle zeros the specified hardware page by mapping
2512  *      the page into KVM and using bzero to clear its contents.  This
2513  *      is intended to be called from the vm_pagezero process only and
2514  *      outside of Giant.
2515  */
2516 void
2517 pmap_zero_page_idle(vm_page_t m)
2518 {
2519
2520         if (*CMAP3)
2521                 panic("pmap_zero_page: CMAP3 busy");
2522         sched_pin();
2523         *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2524         invlcaddr(CADDR3);
2525         pagezero(CADDR3);
2526         *CMAP3 = 0;
2527         sched_unpin();
2528 }
2529
2530 /*
2531  *      pmap_copy_page copies the specified (machine independent)
2532  *      page by mapping the page into virtual memory and using
2533  *      bcopy to copy the page, one machine dependent page at a
2534  *      time.
2535  */
2536 void
2537 pmap_copy_page(vm_page_t src, vm_page_t dst)
2538 {
2539         struct sysmaps *sysmaps;
2540
2541         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2542         mtx_lock(&sysmaps->lock);
2543         if (*sysmaps->CMAP1)
2544                 panic("pmap_copy_page: CMAP1 busy");
2545         if (*sysmaps->CMAP2)
2546                 panic("pmap_copy_page: CMAP2 busy");
2547         sched_pin();
2548         invlpg((u_int)sysmaps->CADDR1);
2549         invlpg((u_int)sysmaps->CADDR2);
2550         *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2551         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2552         bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2553         *sysmaps->CMAP1 = 0;
2554         *sysmaps->CMAP2 = 0;
2555         sched_unpin();
2556         mtx_unlock(&sysmaps->lock);
2557 }
2558
2559 /*
2560  * Returns true if the pmap's pv is one of the first
2561  * 16 pvs linked to from this page.  This count may
2562  * be changed upwards or downwards in the future; it
2563  * is only necessary that true be returned for a small
2564  * subset of pmaps for proper page aging.
2565  */
2566 boolean_t
2567 pmap_page_exists_quick(pmap, m)
2568         pmap_t pmap;
2569         vm_page_t m;
2570 {
2571         pv_entry_t pv;
2572         int loops = 0;
2573
2574         if (m->flags & PG_FICTITIOUS)
2575                 return FALSE;
2576
2577         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2578         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2579                 if (pv->pv_pmap == pmap) {
2580                         return TRUE;
2581                 }
2582                 loops++;
2583                 if (loops >= 16)
2584                         break;
2585         }
2586         return (FALSE);
2587 }
2588
2589 #define PMAP_REMOVE_PAGES_CURPROC_ONLY
2590 /*
2591  * Remove all pages from specified address space
2592  * this aids process exit speeds.  Also, this code
2593  * is special cased for current process only, but
2594  * can have the more generic (and slightly slower)
2595  * mode enabled.  This is much faster than pmap_remove
2596  * in the case of running down an entire address space.
2597  */
2598 void
2599 pmap_remove_pages(pmap, sva, eva)
2600         pmap_t pmap;
2601         vm_offset_t sva, eva;
2602 {
2603         pt_entry_t *pte, tpte;
2604         vm_page_t m;
2605         pv_entry_t pv, npv;
2606
2607 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2608         if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
2609                 printf("warning: pmap_remove_pages called with non-current pmap\n");
2610                 return;
2611         }
2612 #endif
2613         vm_page_lock_queues();
2614         PMAP_LOCK(pmap);
2615         sched_pin();
2616         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2617
2618                 if (pv->pv_va >= eva || pv->pv_va < sva) {
2619                         npv = TAILQ_NEXT(pv, pv_plist);
2620                         continue;
2621                 }
2622
2623 #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY
2624                 pte = vtopte(pv->pv_va);
2625 #else
2626                 pte = pmap_pte_quick(pmap, pv->pv_va);
2627 #endif
2628                 tpte = *pte;
2629
2630                 if (tpte == 0) {
2631                         printf("TPTE at %p  IS ZERO @ VA %08x\n",
2632                                                         pte, pv->pv_va);
2633                         panic("bad pte");
2634                 }
2635
2636 /*
2637  * We cannot remove wired pages from a process' mapping at this time
2638  */
2639                 if (tpte & PG_W) {
2640                         npv = TAILQ_NEXT(pv, pv_plist);
2641                         continue;
2642                 }
2643
2644                 m = PHYS_TO_VM_PAGE(tpte);
2645                 KASSERT(m->phys_addr == (tpte & PG_FRAME),
2646                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
2647                     m, (uintmax_t)m->phys_addr, (uintmax_t)tpte));
2648
2649                 KASSERT(m < &vm_page_array[vm_page_array_size],
2650                         ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte));
2651
2652                 pmap->pm_stats.resident_count--;
2653
2654                 pte_clear(pte);
2655
2656                 /*
2657                  * Update the vm_page_t clean and reference bits.
2658                  */
2659                 if (tpte & PG_M) {
2660                         vm_page_dirty(m);
2661                 }
2662
2663                 npv = TAILQ_NEXT(pv, pv_plist);
2664                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2665
2666                 m->md.pv_list_count--;
2667                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2668                 if (TAILQ_EMPTY(&m->md.pv_list))
2669                         vm_page_flag_clear(m, PG_WRITEABLE);
2670
2671                 pmap_unuse_pt(pmap, pv->pv_va);
2672                 free_pv_entry(pv);
2673         }
2674         sched_unpin();
2675         pmap_invalidate_all(pmap);
2676         PMAP_UNLOCK(pmap);
2677         vm_page_unlock_queues();
2678 }
2679
2680 /*
2681  *      pmap_is_modified:
2682  *
2683  *      Return whether or not the specified physical page was modified
2684  *      in any physical maps.
2685  */
2686 boolean_t
2687 pmap_is_modified(vm_page_t m)
2688 {
2689         pv_entry_t pv;
2690         pt_entry_t *pte;
2691         boolean_t rv;
2692
2693         rv = FALSE;
2694         if (m->flags & PG_FICTITIOUS)
2695                 return (rv);
2696
2697         sched_pin();
2698         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2699         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2700                 /*
2701                  * if the bit being tested is the modified bit, then
2702                  * mark clean_map and ptes as never
2703                  * modified.
2704                  */
2705                 if (!pmap_track_modified(pv->pv_va))
2706                         continue;
2707                 PMAP_LOCK(pv->pv_pmap);
2708                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2709                 rv = (*pte & PG_M) != 0;
2710                 PMAP_UNLOCK(pv->pv_pmap);
2711                 if (rv)
2712                         break;
2713         }
2714         sched_unpin();
2715         return (rv);
2716 }
2717
2718 /*
2719  *      pmap_is_prefaultable:
2720  *
2721  *      Return whether or not the specified virtual address is elgible
2722  *      for prefault.
2723  */
2724 boolean_t
2725 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
2726 {
2727         pt_entry_t *pte;
2728         boolean_t rv;
2729
2730         rv = FALSE;
2731         PMAP_LOCK(pmap);
2732         if (*pmap_pde(pmap, addr)) {
2733                 pte = vtopte(addr);
2734                 rv = *pte == 0;
2735         }
2736         PMAP_UNLOCK(pmap);
2737         return (rv);
2738 }
2739
2740 /*
2741  *      Clear the given bit in each of the given page's ptes.  The bit is
2742  *      expressed as a 32-bit mask.  Consequently, if the pte is 64 bits in
2743  *      size, only a bit within the least significant 32 can be cleared.
2744  */
2745 static __inline void
2746 pmap_clear_ptes(vm_page_t m, int bit)
2747 {
2748         register pv_entry_t pv;
2749         pt_entry_t pbits, *pte;
2750
2751         if ((m->flags & PG_FICTITIOUS) ||
2752             (bit == PG_RW && (m->flags & PG_WRITEABLE) == 0))
2753                 return;
2754
2755         sched_pin();
2756         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2757         /*
2758          * Loop over all current mappings setting/clearing as appropos If
2759          * setting RO do we need to clear the VAC?
2760          */
2761         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2762                 /*
2763                  * don't write protect pager mappings
2764                  */
2765                 if (bit == PG_RW) {
2766                         if (!pmap_track_modified(pv->pv_va))
2767                                 continue;
2768                 }
2769
2770                 PMAP_LOCK(pv->pv_pmap);
2771                 pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2772 retry:
2773                 pbits = *pte;
2774                 if (pbits & bit) {
2775                         if (bit == PG_RW) {
2776                                 /*
2777                                  * Regardless of whether a pte is 32 or 64 bits
2778                                  * in size, PG_RW and PG_M are among the least
2779                                  * significant 32 bits.
2780                                  */
2781                                 if (!atomic_cmpset_int((u_int *)pte, pbits,
2782                                     pbits & ~(PG_RW | PG_M)))
2783                                         goto retry;
2784                                 if (pbits & PG_M) {
2785                                         vm_page_dirty(m);
2786                                 }
2787                         } else {
2788                                 atomic_clear_int((u_int *)pte, bit);
2789                         }
2790                         pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2791                 }
2792                 PMAP_UNLOCK(pv->pv_pmap);
2793         }
2794         if (bit == PG_RW)
2795                 vm_page_flag_clear(m, PG_WRITEABLE);
2796         sched_unpin();
2797 }
2798
2799 /*
2800  *      pmap_page_protect:
2801  *
2802  *      Lower the permission for all mappings to a given page.
2803  */
2804 void
2805 pmap_page_protect(vm_page_t m, vm_prot_t prot)
2806 {
2807         if ((prot & VM_PROT_WRITE) == 0) {
2808                 if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) {
2809                         pmap_clear_ptes(m, PG_RW);
2810                 } else {
2811                         pmap_remove_all(m);
2812                 }
2813         }
2814 }
2815
2816 /*
2817  *      pmap_ts_referenced:
2818  *
2819  *      Return a count of reference bits for a page, clearing those bits.
2820  *      It is not necessary for every reference bit to be cleared, but it
2821  *      is necessary that 0 only be returned when there are truly no
2822  *      reference bits set.
2823  *
2824  *      XXX: The exact number of bits to check and clear is a matter that
2825  *      should be tested and standardized at some point in the future for
2826  *      optimal aging of shared pages.
2827  */
2828 int
2829 pmap_ts_referenced(vm_page_t m)
2830 {
2831         register pv_entry_t pv, pvf, pvn;
2832         pt_entry_t *pte;
2833         pt_entry_t v;
2834         int rtval = 0;
2835
2836         if (m->flags & PG_FICTITIOUS)
2837                 return (rtval);
2838
2839         sched_pin();
2840         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2841         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2842
2843                 pvf = pv;
2844
2845                 do {
2846                         pvn = TAILQ_NEXT(pv, pv_list);
2847
2848                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2849
2850                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2851
2852                         if (!pmap_track_modified(pv->pv_va))
2853                                 continue;
2854
2855                         PMAP_LOCK(pv->pv_pmap);
2856                         pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
2857
2858                         if (pte && ((v = pte_load(pte)) & PG_A) != 0) {
2859                                 atomic_clear_int((u_int *)pte, PG_A);
2860                                 pmap_invalidate_page(pv->pv_pmap, pv->pv_va);
2861
2862                                 rtval++;
2863                                 if (rtval > 4) {
2864                                         PMAP_UNLOCK(pv->pv_pmap);
2865                                         break;
2866                                 }
2867                         }
2868                         PMAP_UNLOCK(pv->pv_pmap);
2869                 } while ((pv = pvn) != NULL && pv != pvf);
2870         }
2871         sched_unpin();
2872
2873         return (rtval);
2874 }
2875
2876 /*
2877  *      Clear the modify bits on the specified physical page.
2878  */
2879 void
2880 pmap_clear_modify(vm_page_t m)
2881 {
2882         pmap_clear_ptes(m, PG_M);
2883 }
2884
2885 /*
2886  *      pmap_clear_reference:
2887  *
2888  *      Clear the reference bit on the specified physical page.
2889  */
2890 void
2891 pmap_clear_reference(vm_page_t m)
2892 {
2893         pmap_clear_ptes(m, PG_A);
2894 }
2895
2896 /*
2897  * Miscellaneous support routines follow
2898  */
2899
2900 /*
2901  * Map a set of physical memory pages into the kernel virtual
2902  * address space. Return a pointer to where it is mapped. This
2903  * routine is intended to be used for mapping device memory,
2904  * NOT real memory.
2905  */
2906 void *
2907 pmap_mapdev(pa, size)
2908         vm_paddr_t pa;
2909         vm_size_t size;
2910 {
2911         vm_offset_t va, tmpva, offset;
2912
2913         offset = pa & PAGE_MASK;
2914         size = roundup(offset + size, PAGE_SIZE);
2915         pa = pa & PG_FRAME;
2916
2917         if (pa < KERNLOAD && pa + size <= KERNLOAD)
2918                 va = KERNBASE + pa;
2919         else
2920                 va = kmem_alloc_nofault(kernel_map, size);
2921         if (!va)
2922                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
2923
2924         for (tmpva = va; size > 0; ) {
2925                 pmap_kenter(tmpva, pa);
2926                 size -= PAGE_SIZE;
2927                 tmpva += PAGE_SIZE;
2928                 pa += PAGE_SIZE;
2929         }
2930         pmap_invalidate_range(kernel_pmap, va, tmpva);
2931         return ((void *)(va + offset));
2932 }
2933
2934 void
2935 pmap_unmapdev(va, size)
2936         vm_offset_t va;
2937         vm_size_t size;
2938 {
2939         vm_offset_t base, offset, tmpva;
2940
2941         if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
2942                 return;
2943         base = va & PG_FRAME;
2944         offset = va & PAGE_MASK;
2945         size = roundup(offset + size, PAGE_SIZE);
2946         for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
2947                 pmap_kremove(tmpva);
2948         pmap_invalidate_range(kernel_pmap, va, tmpva);
2949         kmem_free(kernel_map, base, size);
2950 }
2951
2952 /*
2953  * perform the pmap work for mincore
2954  */
2955 int
2956 pmap_mincore(pmap, addr)
2957         pmap_t pmap;
2958         vm_offset_t addr;
2959 {
2960         pt_entry_t *ptep, pte;
2961         vm_page_t m;
2962         int val = 0;
2963
2964         PMAP_LOCK(pmap);
2965         ptep = pmap_pte(pmap, addr);
2966         pte = (ptep != NULL) ? *ptep : 0;
2967         pmap_pte_release(ptep);
2968         PMAP_UNLOCK(pmap);
2969
2970         if (pte != 0) {
2971                 vm_paddr_t pa;
2972
2973                 val = MINCORE_INCORE;
2974                 if ((pte & PG_MANAGED) == 0)
2975                         return val;
2976
2977                 pa = pte & PG_FRAME;
2978
2979                 m = PHYS_TO_VM_PAGE(pa);
2980
2981                 /*
2982                  * Modified by us
2983                  */
2984                 if (pte & PG_M)
2985                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
2986                 else {
2987                         /*
2988                          * Modified by someone else
2989                          */
2990                         vm_page_lock_queues();
2991                         if (m->dirty || pmap_is_modified(m))
2992                                 val |= MINCORE_MODIFIED_OTHER;
2993                         vm_page_unlock_queues();
2994                 }
2995                 /*
2996                  * Referenced by us
2997                  */
2998                 if (pte & PG_A)
2999                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3000                 else {
3001                         /*
3002                          * Referenced by someone else
3003                          */
3004                         vm_page_lock_queues();
3005                         if ((m->flags & PG_REFERENCED) ||
3006                             pmap_ts_referenced(m)) {
3007                                 val |= MINCORE_REFERENCED_OTHER;
3008                                 vm_page_flag_set(m, PG_REFERENCED);
3009                         }
3010                         vm_page_unlock_queues();
3011                 }
3012         }
3013         return val;
3014 }
3015
3016 void
3017 pmap_activate(struct thread *td)
3018 {
3019         pmap_t  pmap, oldpmap;
3020         u_int32_t  cr3;
3021
3022         critical_enter();
3023         pmap = vmspace_pmap(td->td_proc->p_vmspace);
3024         oldpmap = PCPU_GET(curpmap);
3025 #if defined(SMP)
3026         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3027         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3028 #else
3029         oldpmap->pm_active &= ~1;
3030         pmap->pm_active |= 1;
3031 #endif
3032 #ifdef PAE
3033         cr3 = vtophys(pmap->pm_pdpt);
3034 #else
3035         cr3 = vtophys(pmap->pm_pdir);
3036 #endif
3037         /*
3038          * pmap_activate is for the current thread on the current cpu
3039          */
3040         td->td_pcb->pcb_cr3 = cr3;
3041         load_cr3(cr3);
3042         PCPU_SET(curpmap, pmap);
3043         critical_exit();
3044 }
3045
3046 vm_offset_t
3047 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3048 {
3049
3050         if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3051                 return addr;
3052         }
3053
3054         addr = (addr + PDRMASK) & ~PDRMASK;
3055         return addr;
3056 }
3057
3058
#if defined(PMAP_DEBUG)
/*
 *	pmap_pid_dump:
 *
 *	Debugging aid.  Print, two per line, every valid pte below
 *	VM_MIN_KERNEL_ADDRESS in the address space of the process with
 *	the given pid, together with the hold count, wire count and
 *	flags of the page it maps.  Returns the number of ptes printed
 *	(0 if no such process exists).
 */
int
pmap_pid_dump(int pid)
{
	pmap_t pmap;
	struct proc *p;
	pd_entry_t *pde;
	pt_entry_t *pte, pa;
	vm_offset_t base, va;
	vm_page_t m;
	int npte = 0;
	int index;
	int i, j;
	int valid;

	sx_slock(&allproc_lock);
	LIST_FOREACH(p, &allproc, p_list) {
		if (p->p_pid != pid || p->p_vmspace == NULL)
			continue;

		index = 0;
		pmap = vmspace_pmap(p->p_vmspace);
		for (i = 0; i < NPDEPTD; i++) {
			pde = &pmap->pm_pdir[i];
			if (!pmap_pde_v(pde))
				continue;
			/* Shift as an address type, not as a signed int. */
			base = (vm_offset_t)i << PDRSHIFT;
			for (j = 0; j < NPTEPG; j++) {
				va = base + ((vm_offset_t)j << PAGE_SHIFT);
				/* Stop at the first kernel address. */
				if (va >= (vm_offset_t)VM_MIN_KERNEL_ADDRESS) {
					if (index)
						printf("\n");
					sx_sunlock(&allproc_lock);
					return (npte);
				}
				/*
				 * Copy the pte out and release the lookup,
				 * mirroring the pmap_pte()/pmap_pte_release()
				 * pairing used by pmap_mincore().
				 */
				pa = 0;
				pte = pmap_pte(pmap, va);
				valid = (pte != NULL && pmap_pte_v(pte));
				if (valid)
					pa = *pte;
				pmap_pte_release(pte);
				if (!valid)
					continue;

				m = PHYS_TO_VM_PAGE(pa);
				/* A pte may be 64 bits wide; print it via uintmax_t. */
				printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
				    va, (uintmax_t)pa, m->hold_count,
				    m->wire_count, m->flags);
				npte++;
				/* Two entries per output line. */
				if (++index >= 2) {
					index = 0;
					printf("\n");
				} else {
					printf(" ");
				}
			}
		}
	}
	sx_sunlock(&allproc_lock);
	return (npte);
}
#endif
3119
#if defined(DEBUG)

static void	pads(pmap_t pm);
void		pmap_pvdump(vm_paddr_t pa);

/*
 * Print the address space of a pmap as "va:pte" pairs, one for every
 * valid pte at or below UPT_MAX_ADDRESS.  Nothing is printed for the
 * kernel pmap.
 */
static void
pads(pmap_t pm)
{
	int i, j;
	vm_offset_t va;
	pt_entry_t *ptep, pte;

	if (pm == kernel_pmap)
		return;
	for (i = 0; i < NPDEPTD; i++) {
		if (pm->pm_pdir[i] == 0)
			continue;
		for (j = 0; j < NPTEPG; j++) {
			va = ((vm_offset_t)i << PDRSHIFT) +
			    ((vm_offset_t)j << PAGE_SHIFT);
			if (va > UPT_MAX_ADDRESS)
				continue;
			/*
			 * Copy the pte out and release the lookup,
			 * mirroring the pmap_pte()/pmap_pte_release()
			 * pairing used by pmap_mincore().
			 */
			ptep = pmap_pte(pm, va);
			pte = (ptep != NULL) ? *ptep : 0;
			pmap_pte_release(ptep);
			if ((pte & PG_V) != 0)
				printf("%x:%jx ", va, (uintmax_t)pte);
		}
	}
}

/*
 * Print the pv list of the page at physical address "pa": for each
 * entry the owning pmap and virtual address, followed by a dump of
 * that pmap's address space.
 */
void
pmap_pvdump(vm_paddr_t pa)
{
	pv_entry_t pv;
	vm_page_t m;

	/* A physical address may be 64 bits wide; print it via uintmax_t. */
	printf("pa %jx", (uintmax_t)pa);
	m = PHYS_TO_VM_PAGE(pa);
	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
		printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
		pads(pv->pv_pmap);
	}
	printf(" ");
}
#endif