sys/i386/i386/pmap.c

   1 /*-
   2  * Copyright (c) 1991 Regents of the University of California.
   3  * All rights reserved.
   4  * Copyright (c) 1994 John S. Dyson
   5  * All rights reserved.
   6  * Copyright (c) 1994 David Greenman
   7  * All rights reserved.
   8  * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
   9  * All rights reserved.
  10  *
  11  * This code is derived from software contributed to Berkeley by
  12  * the Systems Programming Group of the University of Utah Computer
  13  * Science Department and William Jolitz of UUNET Technologies Inc.
  14  *
  15  * Redistribution and use in source and binary forms, with or without
  16  * modification, are permitted provided that the following conditions
  17  * are met:
  18  * 1. Redistributions of source code must retain the above copyright
  19  *    notice, this list of conditions and the following disclaimer.
  20  * 2. Redistributions in binary form must reproduce the above copyright
  21  *    notice, this list of conditions and the following disclaimer in the
  22  *    documentation and/or other materials provided with the distribution.
  23  * 3. All advertising materials mentioning features or use of this software
  24  *    must display the following acknowledgement:
  25  *      This product includes software developed by the University of
  26  *      California, Berkeley and its contributors.
  27  * 4. Neither the name of the University nor the names of its contributors
  28  *    may be used to endorse or promote products derived from this software
  29  *    without specific prior written permission.
  30  *
  31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  41  * SUCH DAMAGE.
  42  *
  43  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  44  */
  45 /*-
  46  * Copyright (c) 2003 Networks Associates Technology, Inc.
  47  * All rights reserved.
  48  *
  49  * This software was developed for the FreeBSD Project by Jake Burkholder,
  50  * Safeport Network Services, and Network Associates Laboratories, the
  51  * Security Research Division of Network Associates, Inc. under
  52  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  53  * CHATS research program.
  54  *
  55  * Redistribution and use in source and binary forms, with or without
  56  * modification, are permitted provided that the following conditions
  57  * are met:
  58  * 1. Redistributions of source code must retain the above copyright
  59  *    notice, this list of conditions and the following disclaimer.
  60  * 2. Redistributions in binary form must reproduce the above copyright
  61  *    notice, this list of conditions and the following disclaimer in the
  62  *    documentation and/or other materials provided with the distribution.
  63  *
  64  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  65  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  66  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  67  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  68  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  69  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  70  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  71  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  72  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  73  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  74  * SUCH DAMAGE.
  75  */
  76
  77 #include <sys/cdefs.h>
  78 __FBSDID("$FreeBSD$");
  79
  80 /*
  81  *      Manages physical address maps.
  82  *
  83  *      In addition to hardware address maps, this
  84  *      module is called upon to provide software-use-only
  85  *      maps which may or may not be stored in the same
  86  *      form as hardware maps.  These pseudo-maps are
  87  *      used to store intermediate results from copy
  88  *      operations to and from address spaces.
  89  *
  90  *      Since the information managed by this module is
  91  *      also stored by the logical address mapping module,
  92  *      this module may throw away valid virtual-to-physical
  93  *      mappings at almost any time.  However, invalidations
  94  *      of virtual-to-physical mappings must be done as
  95  *      requested.
  96  *
  97  *      In order to cope with hardware architectures which
  98  *      make virtual-to-physical map invalidates expensive,
  99  *      this module may delay invalidate or reduced protection
 100  *      operations until such time as they are actually
 101  *      necessary.  This module is given full information as
 102  *      to which processors are currently using which maps,
 103  *      and to when physical maps must be made correct.
 104  */
 105
 106 #include "opt_cpu.h"
 107 #include "opt_pmap.h"
 108 #include "opt_msgbuf.h"
 109 #include "opt_smp.h"
 110 #include "opt_xbox.h"
 111
 112 #include <sys/param.h>
 113 #include <sys/systm.h>
 114 #include <sys/kernel.h>
 115 #include <sys/lock.h>
 116 #include <sys/malloc.h>
 117 #include <sys/mman.h>
 118 #include <sys/msgbuf.h>
 119 #include <sys/mutex.h>
 120 #include <sys/proc.h>
 121 #include <sys/sx.h>
 122 #include <sys/vmmeter.h>
 123 #include <sys/sched.h>
 124 #include <sys/sysctl.h>
 125 #ifdef SMP
 126 #include <sys/smp.h>
 127 #endif
 128
 129 #include <vm/vm.h>
 130 #include <vm/vm_param.h>
 131 #include <vm/vm_kern.h>
 132 #include <vm/vm_page.h>
 133 #include <vm/vm_map.h>
 134 #include <vm/vm_object.h>
 135 #include <vm/vm_extern.h>
 136 #include <vm/vm_pageout.h>
 137 #include <vm/vm_pager.h>
 138 #include <vm/uma.h>
 139
 140 #include <machine/cpu.h>
 141 #include <machine/cputypes.h>
 142 #include <machine/md_var.h>
 143 #include <machine/pcb.h>
 144 #include <machine/specialreg.h>
 145 #ifdef SMP
 146 #include <machine/smp.h>
 147 #endif
 148
 149 #ifdef XBOX
 150 #include <machine/xbox.h>
 151 #endif
 152
 153 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 154 #define CPU_ENABLE_SSE
 155 #endif
 156
 157 #ifndef PMAP_SHPGPERPROC
 158 #define PMAP_SHPGPERPROC 200
 159 #endif
 160
 161 #if defined(DIAGNOSTIC)
 162 #define PMAP_DIAGNOSTIC
 163 #endif
 164
 165 #if !defined(PMAP_DIAGNOSTIC)
 166 #define PMAP_INLINE __inline
 167 #else
 168 #define PMAP_INLINE
 169 #endif
 170
 171 #define PV_STATS
 172 #ifdef PV_STATS
 173 #define PV_STAT(x)      do { x ; } while (0)
 174 #else
 175 #define PV_STAT(x)      do { } while (0)
 176 #endif
 177
 178 /*
 179  * Get PDEs and PTEs for user/kernel address space
 180  */
 181 #define pmap_pde(m, v)  (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 182 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 183
 184 #define pmap_pde_v(pte)         ((*(int *)pte & PG_V) != 0)
 185 #define pmap_pte_w(pte)         ((*(int *)pte & PG_W) != 0)
 186 #define pmap_pte_m(pte)         ((*(int *)pte & PG_M) != 0)
 187 #define pmap_pte_u(pte)         ((*(int *)pte & PG_A) != 0)
 188 #define pmap_pte_v(pte)         ((*(int *)pte & PG_V) != 0)
 189
 190 #define pmap_pte_set_w(pte, v)  ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
 191     atomic_clear_int((u_int *)(pte), PG_W))
 192 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 193
 194 struct pmap kernel_pmap_store;
 195 LIST_HEAD(pmaplist, pmap);
 196 static struct pmaplist allpmaps;
 197 static struct mtx allpmaps_lock;
 198
 199 vm_paddr_t avail_end;   /* PA of last available physical page */
 200 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
 201 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
 202 int pgeflag = 0;                /* PG_G or-in */
 203 int pseflag = 0;                /* PG_PS or-in */
 204
 205 static int nkpt;
 206 vm_offset_t kernel_vm_end;
 207 extern u_int32_t KERNend;
 208
 209 #ifdef PAE
 210 static uma_zone_t pdptzone;
 211 #endif
 212
 213 /*
 214  * Data for the pv entry allocation mechanism
 215  */
 216 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 217 static int shpgperproc = PMAP_SHPGPERPROC;
 218
 219 struct pv_chunk *pv_chunkbase;          /* KVA block for pv_chunks */
 220 int pv_maxchunks;                       /* How many chunks we have KVA for */
 221 vm_offset_t pv_vafree;                  /* freelist stored in the PTE */
 222
 223 /*
 224  * All those kernel PT submaps that BSD is so fond of
 225  */
 226 struct sysmaps {
 227         struct  mtx lock;
 228         pt_entry_t *CMAP1;
 229         pt_entry_t *CMAP2;
 230         caddr_t CADDR1;
 231         caddr_t CADDR2;
 232 };
 233 static struct sysmaps sysmaps_pcpu[MAXCPU];
 234 pt_entry_t *CMAP1 = 0;
 235 static pt_entry_t *CMAP3;
 236 caddr_t CADDR1 = 0, ptvmmap = 0;
 237 static caddr_t CADDR3;
 238 struct msgbuf *msgbufp = 0;
 239
 240 /*
 241  * Crashdump maps.
 242  */
 243 static caddr_t crashdumpmap;
 244
 245 #ifdef SMP
 246 extern pt_entry_t *SMPpt;
 247 #endif
 248 static pt_entry_t *PMAP1 = 0, *PMAP2;
 249 static pt_entry_t *PADDR1 = 0, *PADDR2;
 250 #ifdef SMP
 251 static int PMAP1cpu;
 252 static int PMAP1changedcpu;
 253 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
 254            &PMAP1changedcpu, 0,
 255            "Number of times pmap_pte_quick changed CPU with same PMAP1");
 256 #endif
 257 static int PMAP1changed;
 258 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
 259            &PMAP1changed, 0,
 260            "Number of times pmap_pte_quick changed PMAP1");
 261 static int PMAP1unchanged;
 262 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
 263            &PMAP1unchanged, 0,
 264            "Number of times pmap_pte_quick didn't change PMAP1");
 265 static struct mtx PMAP2mutex;
 266
 267 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
 268 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
 269
 270 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
 271     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 272 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva);
 273 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va);
 274 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 275                                         vm_offset_t va);
 276 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 277 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 278     vm_page_t m);
 279
 280 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
 281
 282 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
 283 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m);
 284 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 285 static void pmap_pte_release(pt_entry_t *pte);
 286 static int pmap_unuse_pt(pmap_t, vm_offset_t);
 287 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 288 #ifdef PAE
 289 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
 290 #endif
 291
 292 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 293 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 294
 295 /*
 296  * Move the kernel virtual free pointer to the next
 297  * 4MB.  This is used to help improve performance
 298  * by using a large (4MB) page for much of the kernel
 299  * (.text, .data, .bss)
 300  */
 301 static vm_offset_t
 302 pmap_kmem_choose(vm_offset_t addr)
 303 {
 304         vm_offset_t newaddr = addr;
 305
 306 #ifndef DISABLE_PSE
 307         if (cpu_feature & CPUID_PSE)
 308                 newaddr = (addr + PDRMASK) & ~PDRMASK;
 309 #endif
 310         return newaddr;
 311 }
 312
 313 /*
 314  *      Bootstrap the system enough to run with virtual memory.
 315  *
 316  *      On the i386 this is called after mapping has already been enabled
 317  *      and just syncs the pmap module with what has already been done.
 318  *      [We can't call it easily with mapping off since the kernel is not
 319  *      mapped with PA == VA, hence we would have to relocate every address
 320  *      from the linked base (virtual) address "KERNBASE" to the actual
 321  *      (physical) address starting relative to 0]
 322  */
 323 void
 324 pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
 325 {
 326         vm_offset_t va;
 327         pt_entry_t *pte, *unused;
 328         struct sysmaps *sysmaps;
 329         int i;
 330
 331         /*
 332          * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
 333          * large. It should instead be correctly calculated in locore.s and
 334          * not based on 'first' (which is a physical address, not a virtual
 335          * address, for the start of unused physical memory). The kernel
 336          * page tables are NOT double mapped and thus should not be included
 337          * in this calculation.
 338          */
 339         virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
 340         virtual_avail = pmap_kmem_choose(virtual_avail);
 341
 342         virtual_end = VM_MAX_KERNEL_ADDRESS;
 343
 344         /*
 345          * Initialize the kernel pmap (which is statically allocated).
 346          */
 347         PMAP_LOCK_INIT(kernel_pmap);
 348         kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
 349 #ifdef PAE
 350         kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 351 #endif
 352         kernel_pmap->pm_active = -1;    /* don't allow deactivation */
 353         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 354         LIST_INIT(&allpmaps);
 355         mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 356         mtx_lock_spin(&allpmaps_lock);
 357         LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
 358         mtx_unlock_spin(&allpmaps_lock);
 359         nkpt = NKPT;
 360
 361         /*
 362          * Reserve some special page table entries/VA space for temporary
 363          * mapping of pages.
 364          */
 365 #define SYSMAP(c, p, v, n)      \
 366         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 367
 368         va = virtual_avail;
 369         pte = vtopte(va);
 370
 371         /*
 372          * CMAP1/CMAP2 are used for zeroing and copying pages.
 373          * CMAP3 is used for the idle process page zeroing.
 374          */
 375         for (i = 0; i < MAXCPU; i++) {
 376                 sysmaps = &sysmaps_pcpu[i];
 377                 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
 378                 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
 379                 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
 380         }
 381         SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 382         SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 383         *CMAP3 = 0;
 384
 385         /*
 386          * Crashdump maps.
 387          */
 388         SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 389
 390         /*
 391          * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 392          */
 393         SYSMAP(caddr_t, unused, ptvmmap, 1)
 394
 395         /*
 396          * msgbufp is used to map the system message buffer.
 397          */
 398         SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
 399
 400         /*
 401          * ptemap is used for pmap_pte_quick
 402          */
 403         SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
 404         SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
 405
 406         mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 407
 408         virtual_avail = va;
 409
 410         *CMAP1 = 0;
 411
 412 #ifdef XBOX
 413         /* FIXME: This is gross, but needed for the XBOX. Since we are in such
 414          * an early stadium, we cannot yet neatly map video memory ... :-(
 415          * Better fixes are very welcome! */
 416         if (!arch_i386_is_xbox)
 417 #endif
 418         for (i = 0; i < NKPT; i++)
 419                 PTD[i] = 0;
 420
 421         /* Initialize the PAT MSR if present. */
 422         pmap_init_pat();
 423
 424         /* Turn on PG_G on kernel page(s) */
 425         pmap_set_pg();
 426 }
 427
 428 /*
 429  * Setup the PAT MSR.
 430  */
 431 void
 432 pmap_init_pat(void)
 433 {
 434         uint64_t pat_msr;
 435
 436         /* Bail if this CPU doesn't implement PAT. */
 437         if (!(cpu_feature & CPUID_PAT))
 438                 return;
 439
 440 #ifdef PAT_WORKS
 441         /*
 442          * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
 443          * Program 4 and 5 as WP and WC.
 444          * Leave 6 and 7 as UC and UC-.
 445          */
 446         pat_msr = rdmsr(MSR_PAT);
 447         pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
 448         pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
 449             PAT_VALUE(5, PAT_WRITE_COMBINING);
 450 #else
 451         /*
 452          * Due to some Intel errata, we can only safely use the lower 4
 453          * PAT entries.  Thus, just replace PAT Index 2 with WC instead
 454          * of UC-.
 455          *
 456          *   Intel Pentium III Processor Specification Update
 457          * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
 458          * or Mode C Paging)
 459          *
 460          *   Intel Pentium IV  Processor Specification Update
 461          * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
 462          */
 463         pat_msr = rdmsr(MSR_PAT);
 464         pat_msr &= ~PAT_MASK(2);
 465         pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
 466 #endif
 467         wrmsr(MSR_PAT, pat_msr);
 468 }
 469
 470 /*
 471  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
 472  */
 473 void
 474 pmap_set_pg(void)
 475 {
 476         pd_entry_t pdir;
 477         pt_entry_t *pte;
 478         vm_offset_t va, endva;
 479         int i;
 480
 481         if (pgeflag == 0)
 482                 return;
 483
 484         i = KERNLOAD/NBPDR;
 485         endva = KERNBASE + KERNend;
 486
 487         if (pseflag) {
 488                 va = KERNBASE + KERNLOAD;
 489                 while (va  < endva) {
 490                         pdir = kernel_pmap->pm_pdir[KPTDI+i];
 491                         pdir |= pgeflag;
 492                         kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
 493                         invltlb();      /* Play it safe, invltlb() every time */
 494                         i++;
 495                         va += NBPDR;
 496                 }
 497         } else {
 498                 va = (vm_offset_t)btext;
 499                 while (va < endva) {
 500                         pte = vtopte(va);
 501                         if (*pte)
 502                                 *pte |= pgeflag;
 503                         invltlb();      /* Play it safe, invltlb() every time */
 504                         va += PAGE_SIZE;
 505                 }
 506         }
 507 }
 508
 509 /*
 510  * Initialize a vm_page's machine-dependent fields.
 511  */
 512 void
 513 pmap_page_init(vm_page_t m)
 514 {
 515
 516         TAILQ_INIT(&m->md.pv_list);
 517         m->md.pv_list_count = 0;
 518 }
 519
 520 #ifdef PAE
 521
 522 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
 523
 524 static void *
 525 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 526 {
 527         *flags = UMA_SLAB_PRIV;
 528         return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
 529             1, 0));
 530 }
 531 #endif
 532
 533 /*
 534  * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
 535  * Requirements:
 536  *  - Must deal with pages in order to ensure that none of the PG_* bits
 537  *    are ever set, PG_V in particular.
 538  *  - Assumes we can write to ptes without pte_store() atomic ops, even
 539  *    on PAE systems.  This should be ok.
 540  *  - Assumes nothing will ever test these addresses for 0 to indicate
 541  *    no mapping instead of correctly checking PG_V.
 542  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
 543  * Because PG_V is never set, there can be no mappings to invalidate.
 544  */
 545 static vm_offset_t
 546 pmap_ptelist_alloc(vm_offset_t *head)
 547 {
 548         pt_entry_t *pte;
 549         vm_offset_t va;
 550
 551         va = *head;
 552         if (va == 0)
 553                 return (va);    /* Out of memory */
 554         pte = vtopte(va);
 555         *head = *pte;
 556         if (*head & PG_V)
 557                 panic("pmap_ptelist_alloc: va with PG_V set!");
 558         *pte = 0;
 559         return (va);
 560 }
 561
 562 static void
 563 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
 564 {
 565         pt_entry_t *pte;
 566
 567         if (va & PG_V)
 568                 panic("pmap_ptelist_free: freeing va with PG_V set!");
 569         pte = vtopte(va);
 570         *pte = *head;           /* virtual! PG_V is 0 though */
 571         *head = va;
 572 }
 573
 574 static void
 575 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
 576 {
 577         int i;
 578         vm_offset_t va;
 579
 580         *head = 0;
 581         for (i = npages - 1; i >= 0; i--) {
 582                 va = (vm_offset_t)base + i * PAGE_SIZE;
 583                 pmap_ptelist_free(head, va);
 584         }
 585 }
 586
 587
 588 /*
 589  *      Initialize the pmap module.
 590  *      Called by vm_init, to initialize any structures that the pmap
 591  *      system needs to map virtual memory.
 592  */
 593 void
 594 pmap_init(void)
 595 {
 596
 597         /*
 598          * Initialize the address space (zone) for the pv entries.  Set a
 599          * high water mark so that the system can recover from excessive
 600          * numbers of pv entries.
 601          */
 602         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 603         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 604         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 605         pv_entry_max = roundup(pv_entry_max, _NPCPV);
 606         pv_entry_high_water = 9 * (pv_entry_max / 10);
 607
 608         pv_maxchunks = pv_entry_max / _NPCPV;
 609         pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
 610             PAGE_SIZE * pv_maxchunks);
 611         if (pv_chunkbase == NULL)
 612                 panic("pmap_init: not enough kvm for pv chunks");
 613         pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 614 #ifdef PAE
 615         pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 616             NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 617             UMA_ZONE_VM | UMA_ZONE_NOFREE);
 618         uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 619 #endif
 620 }
 621
 622
 623 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 624 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
 625         "Max number of PV entries");
 626 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
 627         "Page share factor per proc");
 628
 629 /***************************************************
 630  * Low level helper routines.....
 631  ***************************************************/
 632
 633 /*
 634  * Determine the appropriate bits to set in a PTE or PDE for a specified
 635  * caching mode.
 636  */
 637 static int
 638 pmap_cache_bits(int mode, boolean_t is_pde)
 639 {
 640         int pat_flag, pat_index, cache_bits;
 641
 642         /* The PAT bit is different for PTE's and PDE's. */
 643         pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 644
 645         /* If we don't support PAT, map extended modes to older ones. */
 646         if (!(cpu_feature & CPUID_PAT)) {
 647                 switch (mode) {
 648                 case PAT_UNCACHEABLE:
 649                 case PAT_WRITE_THROUGH:
 650                 case PAT_WRITE_BACK:
 651                         break;
 652                 case PAT_UNCACHED:
 653                 case PAT_WRITE_COMBINING:
 654                 case PAT_WRITE_PROTECTED:
 655                         mode = PAT_UNCACHEABLE;
 656                         break;
 657                 }
 658         }
 659
 660         /* Map the caching mode to a PAT index. */
 661         switch (mode) {
 662 #ifdef PAT_WORKS
 663         case PAT_UNCACHEABLE:
 664                 pat_index = 3;
 665                 break;
 666         case PAT_WRITE_THROUGH:
 667                 pat_index = 1;
 668                 break;
 669         case PAT_WRITE_BACK:
 670                 pat_index = 0;
 671                 break;
 672         case PAT_UNCACHED:
 673                 pat_index = 2;
 674                 break;
 675         case PAT_WRITE_COMBINING:
 676                 pat_index = 5;
 677                 break;
 678         case PAT_WRITE_PROTECTED:
 679                 pat_index = 4;
 680                 break;
 681 #else
 682         case PAT_UNCACHED:
 683         case PAT_UNCACHEABLE:
 684         case PAT_WRITE_PROTECTED:
 685                 pat_index = 3;
 686                 break;
 687         case PAT_WRITE_THROUGH:
 688                 pat_index = 1;
 689                 break;
 690         case PAT_WRITE_BACK:
 691                 pat_index = 0;
 692                 break;
 693         case PAT_WRITE_COMBINING:
 694                 pat_index = 2;
 695                 break;
 696 #endif
 697         default:
 698                 panic("Unknown caching mode %d\n", mode);
 699         }
 700
 701         /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
 702         cache_bits = 0;
 703         if (pat_index & 0x4)
 704                 cache_bits |= pat_flag;
 705         if (pat_index & 0x2)
 706                 cache_bits |= PG_NC_PCD;
 707         if (pat_index & 0x1)
 708                 cache_bits |= PG_NC_PWT;
 709         return (cache_bits);
 710 }
 711 #ifdef SMP
 712 /*
 713  * For SMP, these functions have to use the IPI mechanism for coherence.
 714  */
 715 void
 716 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 717 {
 718         u_int cpumask;
 719         u_int other_cpus;
 720
 721         if (smp_started) {
 722                 if (!(read_eflags() & PSL_I))
 723                         panic("%s: interrupts disabled", __func__);
 724                 mtx_lock_spin(&smp_ipi_mtx);
 725         } else
 726                 critical_enter();
 727         /*
 728          * We need to disable interrupt preemption but MUST NOT have
 729          * interrupts disabled here.
 730          * XXX we may need to hold schedlock to get a coherent pm_active
 731          * XXX critical sections disable interrupts again
 732          */
 733         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 734                 invlpg(va);
 735                 smp_invlpg(va);
 736         } else {
 737                 cpumask = PCPU_GET(cpumask);
 738                 other_cpus = PCPU_GET(other_cpus);
 739                 if (pmap->pm_active & cpumask)
 740                         invlpg(va);
 741                 if (pmap->pm_active & other_cpus)
 742                         smp_masked_invlpg(pmap->pm_active & other_cpus, va);
 743         }
 744         if (smp_started)
 745                 mtx_unlock_spin(&smp_ipi_mtx);
 746         else
 747                 critical_exit();
 748 }
 749
 750 void
 751 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 752 {
 753         u_int cpumask;
 754         u_int other_cpus;
 755         vm_offset_t addr;
 756
 757         if (smp_started) {
 758                 if (!(read_eflags() & PSL_I))
 759                         panic("%s: interrupts disabled", __func__);
 760                 mtx_lock_spin(&smp_ipi_mtx);
 761         } else
 762                 critical_enter();
 763         /*
 764          * We need to disable interrupt preemption but MUST NOT have
 765          * interrupts disabled here.
 766          * XXX we may need to hold schedlock to get a coherent pm_active
 767          * XXX critical sections disable interrupts again
 768          */
 769         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 770                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 771                         invlpg(addr);
 772                 smp_invlpg_range(sva, eva);
 773         } else {
 774                 cpumask = PCPU_GET(cpumask);
 775                 other_cpus = PCPU_GET(other_cpus);
 776                 if (pmap->pm_active & cpumask)
 777                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
 778                                 invlpg(addr);
 779                 if (pmap->pm_active & other_cpus)
 780                         smp_masked_invlpg_range(pmap->pm_active & other_cpus,
 781                             sva, eva);
 782         }
 783         if (smp_started)
 784                 mtx_unlock_spin(&smp_ipi_mtx);
 785         else
 786                 critical_exit();
 787 }
 788
 789 void
 790 pmap_invalidate_all(pmap_t pmap)
 791 {
 792         u_int cpumask;
 793         u_int other_cpus;
 794
 795         if (smp_started) {
 796                 if (!(read_eflags() & PSL_I))
 797                         panic("%s: interrupts disabled", __func__);
 798                 mtx_lock_spin(&smp_ipi_mtx);
 799         } else
 800                 critical_enter();
 801         /*
 802          * We need to disable interrupt preemption but MUST NOT have
 803          * interrupts disabled here.
 804          * XXX we may need to hold schedlock to get a coherent pm_active
 805          * XXX critical sections disable interrupts again
 806          */
 807         if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
 808                 invltlb();
 809                 smp_invltlb();
 810         } else {
 811                 cpumask = PCPU_GET(cpumask);
 812                 other_cpus = PCPU_GET(other_cpus);
 813                 if (pmap->pm_active & cpumask)
 814                         invltlb();
 815                 if (pmap->pm_active & other_cpus)
 816                         smp_masked_invltlb(pmap->pm_active & other_cpus);
 817         }
 818         if (smp_started)
 819                 mtx_unlock_spin(&smp_ipi_mtx);
 820         else
 821                 critical_exit();
 822 }
 823
 824 void
 825 pmap_invalidate_cache(void)
 826 {
 827
 828         if (smp_started) {
 829                 if (!(read_eflags() & PSL_I))
 830                         panic("%s: interrupts disabled", __func__);
 831                 mtx_lock_spin(&smp_ipi_mtx);
 832         } else
 833                 critical_enter();
 834         /*
 835          * We need to disable interrupt preemption but MUST NOT have
 836          * interrupts disabled here.
 837          * XXX we may need to hold schedlock to get a coherent pm_active
 838          * XXX critical sections disable interrupts again
 839          */
 840         wbinvd();
 841         smp_cache_flush();
 842         if (smp_started)
 843                 mtx_unlock_spin(&smp_ipi_mtx);
 844         else
 845                 critical_exit();
 846 }
 847 #else /* !SMP */
 848 /*
 849  * Normal, non-SMP, 486+ invalidation functions.
 850  * We inline these within pmap.c for speed.
 851  */
 852 PMAP_INLINE void
 853 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 854 {
 855
 856         if (pmap == kernel_pmap || pmap->pm_active)
 857                 invlpg(va);
 858 }
 859
 860 PMAP_INLINE void
 861 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 862 {
 863         vm_offset_t addr;
 864
 865         if (pmap == kernel_pmap || pmap->pm_active)
 866                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
 867                         invlpg(addr);
 868 }
 869
 870 PMAP_INLINE void
 871 pmap_invalidate_all(pmap_t pmap)
 872 {
 873
 874         if (pmap == kernel_pmap || pmap->pm_active)
 875                 invltlb();
 876 }
 877
 878 PMAP_INLINE void
 879 pmap_invalidate_cache(void)
 880 {
 881
 882         wbinvd();
 883 }
 884 #endif /* !SMP */
 885
 886 /*
 887  * Are we current address space or kernel?  N.B. We return FALSE when
 888  * a pmap's page table is in use because a kernel thread is borrowing
 889  * it.  The borrowed page table can change spontaneously, making any
 890  * dependence on its continued use subject to a race condition.
 891  */
 892 static __inline int
 893 pmap_is_current(pmap_t pmap)
 894 {
 895
 896         return (pmap == kernel_pmap ||
 897                 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
 898             (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
 899 }
 900
 901 /*
 902  * If the given pmap is not the current or kernel pmap, the returned pte must
 903  * be released by passing it to pmap_pte_release().
 904  */
 905 pt_entry_t *
 906 pmap_pte(pmap_t pmap, vm_offset_t va)
 907 {
 908         pd_entry_t newpf;
 909         pd_entry_t *pde;
 910
 911         pde = pmap_pde(pmap, va);
 912         if (*pde & PG_PS)
 913                 return (pde);
 914         if (*pde != 0) {
 915                 /* are we current address space or kernel? */
 916                 if (pmap_is_current(pmap))
 917                         return (vtopte(va));
 918                 mtx_lock(&PMAP2mutex);
 919                 newpf = *pde & PG_FRAME;
 920                 if ((*PMAP2 & PG_FRAME) != newpf) {
 921                         *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 922                         pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 923                 }
 924                 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 925         }
 926         return (0);
 927 }
 928
 929 /*
 930  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
 931  * being NULL.
 932  */
 933 static __inline void
 934 pmap_pte_release(pt_entry_t *pte)
 935 {
 936
 937         if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
 938                 mtx_unlock(&PMAP2mutex);
 939 }
 940
 941 static __inline void
 942 invlcaddr(void *caddr)
 943 {
 944
 945         invlpg((u_int)caddr);
 946 }
 947
 948 /*
 949  * Super fast pmap_pte routine best used when scanning
 950  * the pv lists.  This eliminates many coarse-grained
 951  * invltlb calls.  Note that many of the pv list
 952  * scans are across different pmaps.  It is very wasteful
 953  * to do an entire invltlb for checking a single mapping.
 954  *
 955  * If the given pmap is not the current pmap, vm_page_queue_mtx
 956  * must be held and curthread pinned to a CPU.
 957  */
 958 static pt_entry_t *
 959 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 960 {
 961         pd_entry_t newpf;
 962         pd_entry_t *pde;
 963
 964         pde = pmap_pde(pmap, va);
 965         if (*pde & PG_PS)
 966                 return (pde);
 967         if (*pde != 0) {
 968                 /* are we current address space or kernel? */
 969                 if (pmap_is_current(pmap))
 970                         return (vtopte(va));
 971                 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 972                 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 973                 newpf = *pde & PG_FRAME;
 974                 if ((*PMAP1 & PG_FRAME) != newpf) {
 975                         *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 976 #ifdef SMP
 977                         PMAP1cpu = PCPU_GET(cpuid);
 978 #endif
 979                         invlcaddr(PADDR1);
 980                         PMAP1changed++;
 981                 } else
 982 #ifdef SMP
 983                 if (PMAP1cpu != PCPU_GET(cpuid)) {
 984                         PMAP1cpu = PCPU_GET(cpuid);
 985                         invlcaddr(PADDR1);
 986                         PMAP1changedcpu++;
 987                 } else
 988 #endif
 989                         PMAP1unchanged++;
 990                 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 991         }
 992         return (0);
 993 }
 994
 995 /*
 996  *      Routine:        pmap_extract
 997  *      Function:
 998  *              Extract the physical page address associated
 999  *              with the given map/virtual_address pair.
1000  */
1001 vm_paddr_t
1002 pmap_extract(pmap_t pmap, vm_offset_t va)
1003 {
1004         vm_paddr_t rtval;
1005         pt_entry_t *pte;
1006         pd_entry_t pde;
1007
1008         rtval = 0;
1009         PMAP_LOCK(pmap);
1010         pde = pmap->pm_pdir[va >> PDRSHIFT];
1011         if (pde != 0) {
1012                 if ((pde & PG_PS) != 0) {
1013                         rtval = (pde & ~PDRMASK) | (va & PDRMASK);
1014                         PMAP_UNLOCK(pmap);
1015                         return rtval;
1016                 }
1017                 pte = pmap_pte(pmap, va);
1018                 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
1019                 pmap_pte_release(pte);
1020         }
1021         PMAP_UNLOCK(pmap);
1022         return (rtval);
1023 }
1024
1025 /*
1026  *      Routine:        pmap_extract_and_hold
1027  *      Function:
1028  *              Atomically extract and hold the physical page
1029  *              with the given pmap and virtual address pair
1030  *              if that mapping permits the given protection.
1031  */
1032 vm_page_t
1033 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1034 {
1035         pd_entry_t pde;
1036         pt_entry_t pte;
1037         vm_page_t m;
1038
1039         m = NULL;
1040         vm_page_lock_queues();
1041         PMAP_LOCK(pmap);
1042         pde = *pmap_pde(pmap, va);
1043         if (pde != 0) {
1044                 if (pde & PG_PS) {
1045                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1046                                 m = PHYS_TO_VM_PAGE((pde & ~PDRMASK) |
1047                                     (va & PDRMASK));
1048                                 vm_page_hold(m);
1049                         }
1050                 } else {
1051                         sched_pin();
1052                         pte = *pmap_pte_quick(pmap, va);
1053                         if (pte != 0 &&
1054                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1055                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1056                                 vm_page_hold(m);
1057                         }
1058                         sched_unpin();
1059                 }
1060         }
1061         vm_page_unlock_queues();
1062         PMAP_UNLOCK(pmap);
1063         return (m);
1064 }
1065
1066 /***************************************************
1067  * Low level mapping routines.....
1068  ***************************************************/
1069
1070 /*
1071  * Add a wired page to the kva.
1072  * Note: not SMP coherent.
1073  */
1074 PMAP_INLINE void
1075 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1076 {
1077         pt_entry_t *pte;
1078
1079         pte = vtopte(va);
1080         pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1081 }
1082
1083 PMAP_INLINE void
1084 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1085 {
1086         pt_entry_t *pte;
1087
1088         pte = vtopte(va);
1089         pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1090 }
1091
1092 /*
1093  * Remove a page from the kernel pagetables.
1094  * Note: not SMP coherent.
1095  */
1096 PMAP_INLINE void
1097 pmap_kremove(vm_offset_t va)
1098 {
1099         pt_entry_t *pte;
1100
1101         pte = vtopte(va);
1102         pte_clear(pte);
1103 }
1104
1105 /*
1106  *      Used to map a range of physical addresses into kernel
1107  *      virtual address space.
1108  *
1109  *      The value passed in '*virt' is a suggested virtual address for
1110  *      the mapping. Architectures which can support a direct-mapped
1111  *      physical to virtual region can return the appropriate address
1112  *      within that region, leaving '*virt' unchanged. Other
1113  *      architectures should map the pages starting at '*virt' and
1114  *      update '*virt' with the first usable address after the mapped
1115  *      region.
1116  */
1117 vm_offset_t
1118 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1119 {
1120         vm_offset_t va, sva;
1121
1122         va = sva = *virt;
1123         while (start < end) {
1124                 pmap_kenter(va, start);
1125                 va += PAGE_SIZE;
1126                 start += PAGE_SIZE;
1127         }
1128         pmap_invalidate_range(kernel_pmap, sva, va);
1129         *virt = va;
1130         return (sva);
1131 }
1132
1133
1134 /*
1135  * Add a list of wired pages to the kva
1136  * this routine is only used for temporary
1137  * kernel mappings that do not need to have
1138  * page modification or references recorded.
1139  * Note that old mappings are simply written
1140  * over.  The page *must* be wired.
1141  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1142  */
1143 void
1144 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1145 {
1146         pt_entry_t *endpte, oldpte, *pte;
1147
1148         oldpte = 0;
1149         pte = vtopte(sva);
1150         endpte = pte + count;
1151         while (pte < endpte) {
1152                 oldpte |= *pte;
1153                 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
1154                 pte++;
1155                 ma++;
1156         }
1157         if ((oldpte & PG_V) != 0)
1158                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
1159                     PAGE_SIZE);
1160 }
1161
1162 /*
1163  * This routine tears out page mappings from the
1164  * kernel -- it is meant only for temporary mappings.
1165  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1166  */
1167 void
1168 pmap_qremove(vm_offset_t sva, int count)
1169 {
1170         vm_offset_t va;
1171
1172         va = sva;
1173         while (count-- > 0) {
1174                 pmap_kremove(va);
1175                 va += PAGE_SIZE;
1176         }
1177         pmap_invalidate_range(kernel_pmap, sva, va);
1178 }
1179
1180 /***************************************************
1181  * Page table page management routines.....
1182  ***************************************************/
1183
1184 /*
1185  * This routine unholds page table pages, and if the hold count
1186  * drops to zero, then it decrements the wire count.
1187  */
1188 static PMAP_INLINE int
1189 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1190 {
1191
1192         --m->wire_count;
1193         if (m->wire_count == 0)
1194                 return _pmap_unwire_pte_hold(pmap, m);
1195         else
1196                 return 0;
1197 }
1198
1199 static int
1200 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m)
1201 {
1202         vm_offset_t pteva;
1203
1204         /*
1205          * unmap the page table page
1206          */
1207         pmap->pm_pdir[m->pindex] = 0;
1208         --pmap->pm_stats.resident_count;
1209
1210         /*
1211          * Do an invltlb to make the invalidated mapping
1212          * take effect immediately.
1213          */
1214         pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1215         pmap_invalidate_page(pmap, pteva);
1216
1217         vm_page_free_zero(m);
1218         atomic_subtract_int(&cnt.v_wire_count, 1);
1219         return 1;
1220 }
1221
1222 /*
1223  * After removing a page table entry, this routine is used to
1224  * conditionally free the page, and manage the hold/wire counts.
1225  */
1226 static int
1227 pmap_unuse_pt(pmap_t pmap, vm_offset_t va)
1228 {
1229         pd_entry_t ptepde;
1230         vm_page_t mpte;
1231
1232         if (va >= VM_MAXUSER_ADDRESS)
1233                 return 0;
1234         ptepde = *pmap_pde(pmap, va);
1235         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1236         return pmap_unwire_pte_hold(pmap, mpte);
1237 }
1238
1239 void
1240 pmap_pinit0(pmap_t pmap)
1241 {
1242
1243         PMAP_LOCK_INIT(pmap);
1244         pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1245 #ifdef PAE
1246         pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1247 #endif
1248         pmap->pm_active = 0;
1249         PCPU_SET(curpmap, pmap);
1250         TAILQ_INIT(&pmap->pm_pvchunk);
1251         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1252         mtx_lock_spin(&allpmaps_lock);
1253         LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1254         mtx_unlock_spin(&allpmaps_lock);
1255 }
1256
1257 /*
1258  * Initialize a preallocated and zeroed pmap structure,
1259  * such as one in a vmspace structure.
1260  */
1261 void
1262 pmap_pinit(pmap_t pmap)
1263 {
1264         vm_page_t m, ptdpg[NPGPTD];
1265         vm_paddr_t pa;
1266         static int color;
1267         int i;
1268
1269         PMAP_LOCK_INIT(pmap);
1270
1271         /*
1272          * No need to allocate page table space yet but we do need a valid
1273          * page directory table.
1274          */
1275         if (pmap->pm_pdir == NULL) {
1276                 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1277                     NBPTD);
1278 #ifdef PAE
1279                 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1280                 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1281                     ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1282                     ("pmap_pinit: pdpt misaligned"));
1283                 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1284                     ("pmap_pinit: pdpt above 4g"));
1285 #endif
1286         }
1287
1288         /*
1289          * allocate the page directory page(s)
1290          */
1291         for (i = 0; i < NPGPTD;) {
1292                 m = vm_page_alloc(NULL, color++,
1293                     VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1294                     VM_ALLOC_ZERO);
1295                 if (m == NULL)
1296                         VM_WAIT;
1297                 else {
1298                         ptdpg[i++] = m;
1299                 }
1300         }
1301
1302         pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1303
1304         for (i = 0; i < NPGPTD; i++) {
1305                 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1306                         bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1307         }
1308
1309         mtx_lock_spin(&allpmaps_lock);
1310         LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1311         mtx_unlock_spin(&allpmaps_lock);
1312         /* Wire in kernel global address entries. */
1313         /* XXX copies current process, does not fill in MPPTDI */
1314         bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1315 #ifdef SMP
1316         pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1317 #endif
1318
1319         /* install self-referential address mapping entry(s) */
1320         for (i = 0; i < NPGPTD; i++) {
1321                 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1322                 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1323 #ifdef PAE
1324                 pmap->pm_pdpt[i] = pa | PG_V;
1325 #endif
1326         }
1327
1328         pmap->pm_active = 0;
1329         TAILQ_INIT(&pmap->pm_pvchunk);
1330         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1331 }
1332
1333 /*
1334  * this routine is called if the page table page is not
1335  * mapped correctly.
1336  */
1337 static vm_page_t
1338 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1339 {
1340         vm_paddr_t ptepa;
1341         vm_page_t m;
1342
1343         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1344             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1345             ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1346
1347         /*
1348          * Allocate a page table page.
1349          */
1350         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1351             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1352                 if (flags & M_WAITOK) {
1353                         PMAP_UNLOCK(pmap);
1354                         vm_page_unlock_queues();
1355                         VM_WAIT;
1356                         vm_page_lock_queues();
1357                         PMAP_LOCK(pmap);
1358                 }
1359
1360                 /*
1361                  * Indicate the need to retry.  While waiting, the page table
1362                  * page may have been allocated.
1363                  */
1364                 return (NULL);
1365         }
1366         if ((m->flags & PG_ZERO) == 0)
1367                 pmap_zero_page(m);
1368
1369         /*
1370          * Map the pagetable page into the process address space, if
1371          * it isn't already there.
1372          */
1373
1374         pmap->pm_stats.resident_count++;
1375
1376         ptepa = VM_PAGE_TO_PHYS(m);
1377         pmap->pm_pdir[ptepindex] =
1378                 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1379
1380         return m;
1381 }
1382
1383 static vm_page_t
1384 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1385 {
1386         unsigned ptepindex;
1387         pd_entry_t ptepa;
1388         vm_page_t m;
1389
1390         KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1391             (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1392             ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1393
1394         /*
1395          * Calculate pagetable page index
1396          */
1397         ptepindex = va >> PDRSHIFT;
1398 retry:
1399         /*
1400          * Get the page directory entry
1401          */
1402         ptepa = pmap->pm_pdir[ptepindex];
1403
1404         /*
1405          * This supports switching from a 4MB page to a
1406          * normal 4K page.
1407          */
1408         if (ptepa & PG_PS) {
1409                 pmap->pm_pdir[ptepindex] = 0;
1410                 ptepa = 0;
1411                 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1412                 pmap_invalidate_all(kernel_pmap);
1413         }
1414
1415         /*
1416          * If the page table page is mapped, we just increment the
1417          * hold count, and activate it.
1418          */
1419         if (ptepa) {
1420                 m = PHYS_TO_VM_PAGE(ptepa);
1421                 m->wire_count++;
1422         } else {
1423                 /*
1424                  * Here if the pte page isn't mapped, or if it has
1425                  * been deallocated.
1426                  */
1427                 m = _pmap_allocpte(pmap, ptepindex, flags);
1428                 if (m == NULL && (flags & M_WAITOK))
1429                         goto retry;
1430         }
1431         return (m);
1432 }
1433
1434
1435 /***************************************************
1436 * Pmap allocation/deallocation routines.
1437  ***************************************************/
1438
1439 #ifdef SMP
1440 /*
1441  * Deal with a SMP shootdown of other users of the pmap that we are
1442  * trying to dispose of.  This can be a bit hairy.
1443  */
1444 static u_int *lazymask;
1445 static u_int lazyptd;
1446 static volatile u_int lazywait;
1447
1448 void pmap_lazyfix_action(void);
1449
1450 void
1451 pmap_lazyfix_action(void)
1452 {
1453         u_int mymask = PCPU_GET(cpumask);
1454
1455 #ifdef COUNT_IPIS
1456         *ipi_lazypmap_counts[PCPU_GET(cpuid)]++;
1457 #endif
1458         if (rcr3() == lazyptd)
1459                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1460         atomic_clear_int(lazymask, mymask);
1461         atomic_store_rel_int(&lazywait, 1);
1462 }
1463
1464 static void
1465 pmap_lazyfix_self(u_int mymask)
1466 {
1467
1468         if (rcr3() == lazyptd)
1469                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1470         atomic_clear_int(lazymask, mymask);
1471 }
1472
1473
1474 static void
1475 pmap_lazyfix(pmap_t pmap)
1476 {
1477         u_int mymask;
1478         u_int mask;
1479         u_int spins;
1480
1481         while ((mask = pmap->pm_active) != 0) {
1482                 spins = 50000000;
1483                 mask = mask & -mask;    /* Find least significant set bit */
1484                 mtx_lock_spin(&smp_ipi_mtx);
1485 #ifdef PAE
1486                 lazyptd = vtophys(pmap->pm_pdpt);
1487 #else
1488                 lazyptd = vtophys(pmap->pm_pdir);
1489 #endif
1490                 mymask = PCPU_GET(cpumask);
1491                 if (mask == mymask) {
1492                         lazymask = &pmap->pm_active;
1493                         pmap_lazyfix_self(mymask);
1494                 } else {
1495                         atomic_store_rel_int((u_int *)&lazymask,
1496                             (u_int)&pmap->pm_active);
1497                         atomic_store_rel_int(&lazywait, 0);
1498                         ipi_selected(mask, IPI_LAZYPMAP);
1499                         while (lazywait == 0) {
1500                                 ia32_pause();
1501                                 if (--spins == 0)
1502                                         break;
1503                         }
1504                 }
1505                 mtx_unlock_spin(&smp_ipi_mtx);
1506                 if (spins == 0)
1507                         printf("pmap_lazyfix: spun for 50000000\n");
1508         }
1509 }
1510
1511 #else   /* SMP */
1512
1513 /*
1514  * Cleaning up on uniprocessor is easy.  For various reasons, we're
1515  * unlikely to have to even execute this code, including the fact
1516  * that the cleanup is deferred until the parent does a wait(2), which
1517  * means that another userland process has run.
1518  */
1519 static void
1520 pmap_lazyfix(pmap_t pmap)
1521 {
1522         u_int cr3;
1523
1524         cr3 = vtophys(pmap->pm_pdir);
1525         if (cr3 == rcr3()) {
1526                 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1527                 pmap->pm_active &= ~(PCPU_GET(cpumask));
1528         }
1529 }
1530 #endif  /* SMP */
1531
1532 /*
1533  * Release any resources held by the given physical map.
1534  * Called when a pmap initialized by pmap_pinit is being released.
1535  * Should only be called if the map contains no valid mappings.
1536  */
1537 void
1538 pmap_release(pmap_t pmap)
1539 {
1540         vm_page_t m, ptdpg[NPGPTD];
1541         int i;
1542
1543         KASSERT(pmap->pm_stats.resident_count == 0,
1544             ("pmap_release: pmap resident count %ld != 0",
1545             pmap->pm_stats.resident_count));
1546
1547         pmap_lazyfix(pmap);
1548         mtx_lock_spin(&allpmaps_lock);
1549         LIST_REMOVE(pmap, pm_list);
1550         mtx_unlock_spin(&allpmaps_lock);
1551
1552         for (i = 0; i < NPGPTD; i++)
1553                 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i]);
1554
1555         bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1556             sizeof(*pmap->pm_pdir));
1557 #ifdef SMP
1558         pmap->pm_pdir[MPPTDI] = 0;
1559 #endif
1560
1561         pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1562
1563         vm_page_lock_queues();
1564         for (i = 0; i < NPGPTD; i++) {
1565                 m = ptdpg[i];
1566 #ifdef PAE
1567                 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1568                     ("pmap_release: got wrong ptd page"));
1569 #endif
1570                 m->wire_count--;
1571                 atomic_subtract_int(&cnt.v_wire_count, 1);
1572                 vm_page_free_zero(m);
1573         }
1574         vm_page_unlock_queues();
1575         PMAP_LOCK_DESTROY(pmap);
1576 }
1577 \f
1578 static int
1579 kvm_size(SYSCTL_HANDLER_ARGS)
1580 {
1581         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1582
1583         return sysctl_handle_long(oidp, &ksize, 0, req);
1584 }
1585 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1586     0, 0, kvm_size, "IU", "Size of KVM");
1587
1588 static int
1589 kvm_free(SYSCTL_HANDLER_ARGS)
1590 {
1591         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1592
1593         return sysctl_handle_long(oidp, &kfree, 0, req);
1594 }
1595 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1596     0, 0, kvm_free, "IU", "Amount of KVM free");
1597
1598 /*
1599  * grow the number of kernel page table entries, if needed
1600  */
1601 void
1602 pmap_growkernel(vm_offset_t addr)
1603 {
1604         struct pmap *pmap;
1605         vm_paddr_t ptppaddr;
1606         vm_page_t nkpg;
1607         pd_entry_t newpdir;
1608         pt_entry_t *pde;
1609
1610         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1611         if (kernel_vm_end == 0) {
1612                 kernel_vm_end = KERNBASE;
1613                 nkpt = 0;
1614                 while (pdir_pde(PTD, kernel_vm_end)) {
1615                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1616                         nkpt++;
1617                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1618                                 kernel_vm_end = kernel_map->max_offset;
1619                                 break;
1620                         }
1621                 }
1622         }
1623         addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1624         if (addr - 1 >= kernel_map->max_offset)
1625                 addr = kernel_map->max_offset;
1626         while (kernel_vm_end < addr) {
1627                 if (pdir_pde(PTD, kernel_vm_end)) {
1628                         kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1629                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1630                                 kernel_vm_end = kernel_map->max_offset;
1631                                 break;
1632                         }
1633                         continue;
1634                 }
1635
1636                 /*
1637                  * This index is bogus, but out of the way
1638                  */
1639                 nkpg = vm_page_alloc(NULL, nkpt,
1640                     VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1641                 if (!nkpg)
1642                         panic("pmap_growkernel: no memory to grow kernel");
1643
1644                 nkpt++;
1645
1646                 pmap_zero_page(nkpg);
1647                 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1648                 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1649                 pdir_pde(PTD, kernel_vm_end) = newpdir;
1650
1651                 mtx_lock_spin(&allpmaps_lock);
1652                 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1653                         pde = pmap_pde(pmap, kernel_vm_end);
1654                         pde_store(pde, newpdir);
1655                 }
1656                 mtx_unlock_spin(&allpmaps_lock);
1657                 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1658                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1659                         kernel_vm_end = kernel_map->max_offset;
1660                         break;
1661                 }
1662         }
1663 }
1664
1665
1666 /***************************************************
1667  * page management routines.
1668  ***************************************************/
1669
1670 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1671 CTASSERT(_NPCM == 11);
1672
1673 static __inline struct pv_chunk *
1674 pv_to_chunk(pv_entry_t pv)
1675 {
1676
1677         return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1678 }
1679
1680 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1681
1682 #define PC_FREE0_9      0xfffffffful    /* Free values for index 0 through 9 */
1683 #define PC_FREE10       0x0000fffful    /* Free values for index 10 */
1684
1685 static uint32_t pc_freemask[11] = {
1686         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1687         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1688         PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1689         PC_FREE0_9, PC_FREE10
1690 };
1691
1692 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1693         "Current number of pv entries");
1694
1695 #ifdef PV_STATS
1696 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1697
1698 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1699         "Current number of pv entry chunks");
1700 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1701         "Current number of pv entry chunks allocated");
1702 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1703         "Current number of pv entry chunks frees");
1704 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1705         "Number of times tried to get a chunk page but failed.");
1706
1707 static long pv_entry_frees, pv_entry_allocs;
1708 static int pv_entry_spare;
1709
1710 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1711         "Current number of pv entry frees");
1712 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1713         "Current number of pv entry allocs");
1714 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1715         "Current number of spare pv entries");
1716
1717 static int pmap_collect_inactive, pmap_collect_active;
1718
1719 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1720         "Current number times pmap_collect called on inactive queue");
1721 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1722         "Current number times pmap_collect called on active queue");
1723 #endif
1724
1725 /*
1726  * We are in a serious low memory condition.  Resort to
1727  * drastic measures to free some pages so we can allocate
1728  * another pv entry chunk.  This is normally called to
1729  * unmap inactive pages, and if necessary, active pages.
1730  */
1731 static void
1732 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1733 {
1734         pmap_t pmap;
1735         pt_entry_t *pte, tpte;
1736         pv_entry_t next_pv, pv;
1737         vm_offset_t va;
1738         vm_page_t m;
1739
1740         sched_pin();
1741         TAILQ_FOREACH(m, &vpq->pl, pageq) {
1742                 if (m->hold_count || m->busy || (m->flags & PG_BUSY))
1743                         continue;
1744                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1745                         va = pv->pv_va;
1746                         pmap = PV_PMAP(pv);
1747                         /* Avoid deadlock and lock recursion. */
1748                         if (pmap > locked_pmap)
1749                                 PMAP_LOCK(pmap);
1750                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1751                                 continue;
1752                         pmap->pm_stats.resident_count--;
1753                         pte = pmap_pte_quick(pmap, va);
1754                         tpte = pte_load_clear(pte);
1755                         KASSERT((tpte & PG_W) == 0,
1756                             ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
1757                         if (tpte & PG_A)
1758                                 vm_page_flag_set(m, PG_REFERENCED);
1759                         if (tpte & PG_M) {
1760                                 KASSERT((tpte & PG_RW),
1761         ("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
1762                                     va, (uintmax_t)tpte));
1763                                 vm_page_dirty(m);
1764                         }
1765                         pmap_invalidate_page(pmap, va);
1766                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1767                         if (TAILQ_EMPTY(&m->md.pv_list))
1768                                 vm_page_flag_clear(m, PG_WRITEABLE);
1769                         m->md.pv_list_count--;
1770                         pmap_unuse_pt(pmap, va);
1771                         free_pv_entry(pmap, pv);
1772                         if (pmap != locked_pmap)
1773                                 PMAP_UNLOCK(pmap);
1774                 }
1775         }
1776         sched_unpin();
1777 }
1778
1779
1780 /*
1781  * free the pv_entry back to the free list
1782  */
1783 static void
1784 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1785 {
1786         vm_page_t m;
1787         struct pv_chunk *pc;
1788         int idx, field, bit;
1789
1790         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1791         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1792         PV_STAT(pv_entry_frees++);
1793         PV_STAT(pv_entry_spare++);
1794         pv_entry_count--;
1795         pc = pv_to_chunk(pv);
1796         idx = pv - &pc->pc_pventry[0];
1797         field = idx / 32;
1798         bit = idx % 32;
1799         pc->pc_map[field] |= 1ul << bit;
1800         /* move to head of list */
1801         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1802         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1803         for (idx = 0; idx < _NPCM; idx++)
1804                 if (pc->pc_map[idx] != pc_freemask[idx])
1805                         return;
1806         PV_STAT(pv_entry_spare -= _NPCPV);
1807         PV_STAT(pc_chunk_count--);
1808         PV_STAT(pc_chunk_frees++);
1809         /* entire chunk is free, return it */
1810         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1811         m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
1812         pmap_qremove((vm_offset_t)pc, 1);
1813         vm_page_unwire(m, 0);
1814         vm_page_free(m);
1815         pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1816 }
1817
1818 /*
1819  * get a new pv_entry, allocating a block from the system
1820  * when needed.
1821  */
1822 static pv_entry_t
1823 get_pv_entry(pmap_t pmap, int try)
1824 {
1825         static const struct timeval printinterval = { 60, 0 };
1826         static struct timeval lastprint;
1827         static vm_pindex_t colour;
1828         int bit, field, page_req;
1829         pv_entry_t pv;
1830         struct pv_chunk *pc;
1831         vm_page_t m;
1832
1833         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1834         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1835         PV_STAT(pv_entry_allocs++);
1836         pv_entry_count++;
1837         if (pv_entry_count > pv_entry_high_water)
1838                 pagedaemon_wakeup();
1839         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1840         if (pc != NULL) {
1841                 for (field = 0; field < _NPCM; field++) {
1842                         if (pc->pc_map[field]) {
1843                                 bit = bsfl(pc->pc_map[field]);
1844                                 break;
1845                         }
1846                 }
1847                 if (field < _NPCM) {
1848                         pv = &pc->pc_pventry[field * 32 + bit];
1849                         pc->pc_map[field] &= ~(1ul << bit);
1850                         /* If this was the last item, move it to tail */
1851                         for (field = 0; field < _NPCM; field++)
1852                                 if (pc->pc_map[field] != 0) {
1853                                         PV_STAT(pv_entry_spare--);
1854                                         return (pv);    /* not full, return */
1855                                 }
1856                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1857                         TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1858                         PV_STAT(pv_entry_spare--);
1859                         return (pv);
1860                 }
1861         }
1862         pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1863         page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM;
1864         m = vm_page_alloc(NULL, colour, page_req |
1865             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1866         if (m == NULL || pc == NULL) {
1867                 if (try) {
1868                         pv_entry_count--;
1869                         PV_STAT(pc_chunk_tryfail++);
1870                         if (m) {
1871                                 vm_page_lock_queues();
1872                                 vm_page_unwire(m, 0);
1873                                 vm_page_free(m);
1874                                 vm_page_unlock_queues();
1875                         }
1876                         if (pc)
1877                                 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1878                         return (NULL);
1879                 }
1880                 /*
1881                  * Reclaim pv entries: At first, destroy mappings to
1882                  * inactive pages.  After that, if a pv chunk entry
1883                  * is still needed, destroy mappings to active pages.
1884                  */
1885                 if (ratecheck(&lastprint, &printinterval))
1886                         printf("Approaching the limit on PV entries, "
1887                             "consider increasing tunables "
1888                             "vm.pmap.shpgperproc or "
1889                             "vm.pmap.pv_entry_max\n");
1890                 PV_STAT(pmap_collect_inactive++);
1891                 pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
1892                 if (m == NULL)
1893                         m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM |
1894                             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1895                 if (pc == NULL)
1896                         pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1897                 if (m == NULL || pc == NULL) {
1898                         PV_STAT(pmap_collect_active++);
1899                         pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
1900                         if (m == NULL)
1901                                 m = vm_page_alloc(NULL, colour,
1902                                     VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
1903                                     VM_ALLOC_WIRED);
1904                         if (pc == NULL)
1905                                 pc = (struct pv_chunk *)
1906                                     pmap_ptelist_alloc(&pv_vafree);
1907                         if (m == NULL || pc == NULL)
1908                                 panic("get_pv_entry: increase vm.pmap.shpgperproc");
1909                 }
1910         }
1911         PV_STAT(pc_chunk_count++);
1912         PV_STAT(pc_chunk_allocs++);
1913         colour++;
1914         pmap_qenter((vm_offset_t)pc, &m, 1);
1915         pc->pc_pmap = pmap;
1916         pc->pc_map[0] = pc_freemask[0] & ~1ul;  /* preallocated bit 0 */
1917         for (field = 1; field < _NPCM; field++)
1918                 pc->pc_map[field] = pc_freemask[field];
1919         pv = &pc->pc_pventry[0];
1920         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1921         PV_STAT(pv_entry_spare += _NPCPV - 1);
1922         return (pv);
1923 }
1924
1925 static void
1926 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1927 {
1928         pv_entry_t pv;
1929
1930         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1931         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1932         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1933                 if (pmap == PV_PMAP(pv) && va == pv->pv_va)
1934                         break;
1935         }
1936         KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1937         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1938         m->md.pv_list_count--;
1939         if (TAILQ_EMPTY(&m->md.pv_list))
1940                 vm_page_flag_clear(m, PG_WRITEABLE);
1941         free_pv_entry(pmap, pv);
1942 }
1943
1944 /*
1945  * Create a pv entry for page at pa for
1946  * (pmap, va).
1947  */
1948 static void
1949 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1950 {
1951         pv_entry_t pv;
1952
1953         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1954         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1955         pv = get_pv_entry(pmap, FALSE);
1956         pv->pv_va = va;
1957         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1958         m->md.pv_list_count++;
1959 }
1960
1961 /*
1962  * Conditionally create a pv entry.
1963  */
1964 static boolean_t
1965 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1966 {
1967         pv_entry_t pv;
1968
1969         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1970         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1971         if (pv_entry_count < pv_entry_high_water &&
1972             (pv = get_pv_entry(pmap, TRUE)) != NULL) {
1973                 pv->pv_va = va;
1974                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1975                 m->md.pv_list_count++;
1976                 return (TRUE);
1977         } else
1978                 return (FALSE);
1979 }
1980
1981 /*
1982  * pmap_remove_pte: do the things to unmap a page in a process
1983  */
1984 static int
1985 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va)
1986 {
1987         pt_entry_t oldpte;
1988         vm_page_t m;
1989
1990         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1991         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1992         oldpte = pte_load_clear(ptq);
1993         if (oldpte & PG_W)
1994                 pmap->pm_stats.wired_count -= 1;
1995         /*
1996          * Machines that don't support invlpg, also don't support
1997          * PG_G.
1998          */
1999         if (oldpte & PG_G)
2000                 pmap_invalidate_page(kernel_pmap, va);
2001         pmap->pm_stats.resident_count -= 1;
2002         if (oldpte & PG_MANAGED) {
2003                 m = PHYS_TO_VM_PAGE(oldpte);
2004                 if (oldpte & PG_M) {
2005                         KASSERT((oldpte & PG_RW),
2006         ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
2007                             va, (uintmax_t)oldpte));
2008                         vm_page_dirty(m);
2009                 }
2010                 if (oldpte & PG_A)
2011                         vm_page_flag_set(m, PG_REFERENCED);
2012                 pmap_remove_entry(pmap, m, va);
2013         }
2014         return (pmap_unuse_pt(pmap, va));
2015 }
2016
2017 /*
2018  * Remove a single page from a process address space
2019  */
2020 static void
2021 pmap_remove_page(pmap_t pmap, vm_offset_t va)
2022 {
2023         pt_entry_t *pte;
2024
2025         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2026         KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2027         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2028         if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2029                 return;
2030         pmap_remove_pte(pmap, pte, va);
2031         pmap_invalidate_page(pmap, va);
2032 }
2033
2034 /*
2035  *      Remove the given range of addresses from the specified map.
2036  *
2037  *      It is assumed that the start and end are properly
2038  *      rounded to the page size.
2039  */
2040 void
2041 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2042 {
2043         vm_offset_t pdnxt;
2044         pd_entry_t ptpaddr;
2045         pt_entry_t *pte;
2046         int anyvalid;
2047
2048         /*
2049          * Perform an unsynchronized read.  This is, however, safe.
2050          */
2051         if (pmap->pm_stats.resident_count == 0)
2052                 return;
2053
2054         anyvalid = 0;
2055
2056         vm_page_lock_queues();
2057         sched_pin();
2058         PMAP_LOCK(pmap);
2059
2060         /*
2061          * special handling of removing one page.  a very
2062          * common operation and easy to short circuit some
2063          * code.
2064          */
2065         if ((sva + PAGE_SIZE == eva) &&
2066             ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2067                 pmap_remove_page(pmap, sva);
2068                 goto out;
2069         }
2070
2071         for (; sva < eva; sva = pdnxt) {
2072                 unsigned pdirindex;
2073
2074                 /*
2075                  * Calculate index for next page table.
2076                  */
2077                 pdnxt = (sva + NBPDR) & ~PDRMASK;
2078                 if (pmap->pm_stats.resident_count == 0)
2079                         break;
2080
2081                 pdirindex = sva >> PDRSHIFT;
2082                 ptpaddr = pmap->pm_pdir[pdirindex];
2083
2084                 /*
2085                  * Weed out invalid mappings. Note: we assume that the page
2086                  * directory table is always allocated, and in kernel virtual.
2087                  */
2088                 if (ptpaddr == 0)
2089                         continue;
2090
2091                 /*
2092                  * Check for large page.
2093                  */
2094                 if ((ptpaddr & PG_PS) != 0) {
2095                         pmap->pm_pdir[pdirindex] = 0;
2096                         pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2097                         anyvalid = 1;
2098                         continue;
2099                 }
2100
2101                 /*
2102                  * Limit our scan to either the end of the va represented
2103                  * by the current page table page, or to the end of the
2104                  * range being removed.
2105                  */
2106                 if (pdnxt > eva)
2107                         pdnxt = eva;
2108
2109                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2110                     sva += PAGE_SIZE) {
2111                         if (*pte == 0)
2112                                 continue;
2113
2114                         /*
2115                          * The TLB entry for a PG_G mapping is invalidated
2116                          * by pmap_remove_pte().
2117                          */
2118                         if ((*pte & PG_G) == 0)
2119                                 anyvalid = 1;
2120                         if (pmap_remove_pte(pmap, pte, sva))
2121                                 break;
2122                 }
2123         }
2124 out:
2125         sched_unpin();
2126         vm_page_unlock_queues();
2127         if (anyvalid)
2128                 pmap_invalidate_all(pmap);
2129         PMAP_UNLOCK(pmap);
2130 }
2131
2132 /*
2133  *      Routine:        pmap_remove_all
2134  *      Function:
2135  *              Removes this physical page from
2136  *              all physical maps in which it resides.
2137  *              Reflects back modify bits to the pager.
2138  *
2139  *      Notes:
2140  *              Original versions of this routine were very
2141  *              inefficient because they iteratively called
2142  *              pmap_remove (slow...)
2143  */
2144
2145 void
2146 pmap_remove_all(vm_page_t m)
2147 {
2148         pv_entry_t pv;
2149         pmap_t pmap;
2150         pt_entry_t *pte, tpte;
2151
2152 #if defined(PMAP_DIAGNOSTIC)
2153         /*
2154          * XXX This makes pmap_remove_all() illegal for non-managed pages!
2155          */
2156         if (m->flags & PG_FICTITIOUS) {
2157                 panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x",
2158                     VM_PAGE_TO_PHYS(m));
2159         }
2160 #endif
2161         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2162         sched_pin();
2163         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2164                 pmap = PV_PMAP(pv);
2165                 PMAP_LOCK(pmap);
2166                 pmap->pm_stats.resident_count--;
2167                 pte = pmap_pte_quick(pmap, pv->pv_va);
2168                 tpte = pte_load_clear(pte);
2169                 if (tpte & PG_W)
2170                         pmap->pm_stats.wired_count--;
2171                 if (tpte & PG_A)
2172                         vm_page_flag_set(m, PG_REFERENCED);
2173
2174                 /*
2175                  * Update the vm_page_t clean and reference bits.
2176                  */
2177                 if (tpte & PG_M) {
2178                         KASSERT((tpte & PG_RW),
2179         ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
2180                             pv->pv_va, (uintmax_t)tpte));
2181                         vm_page_dirty(m);
2182                 }
2183                 pmap_invalidate_page(pmap, pv->pv_va);
2184                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2185                 m->md.pv_list_count--;
2186                 pmap_unuse_pt(pmap, pv->pv_va);
2187                 free_pv_entry(pmap, pv);
2188                 PMAP_UNLOCK(pmap);
2189         }
2190         vm_page_flag_clear(m, PG_WRITEABLE);
2191         sched_unpin();
2192 }
2193
2194 /*
2195  *      Set the physical protection on the
2196  *      specified range of this map as requested.
2197  */
2198 void
2199 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2200 {
2201         vm_offset_t pdnxt;
2202         pd_entry_t ptpaddr;
2203         pt_entry_t *pte;
2204         int anychanged;
2205
2206         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2207                 pmap_remove(pmap, sva, eva);
2208                 return;
2209         }
2210
2211         if (prot & VM_PROT_WRITE)
2212                 return;
2213
2214         anychanged = 0;
2215
2216         vm_page_lock_queues();
2217         sched_pin();
2218         PMAP_LOCK(pmap);
2219         for (; sva < eva; sva = pdnxt) {
2220                 unsigned obits, pbits, pdirindex;
2221
2222                 pdnxt = (sva + NBPDR) & ~PDRMASK;
2223
2224                 pdirindex = sva >> PDRSHIFT;
2225                 ptpaddr = pmap->pm_pdir[pdirindex];
2226
2227                 /*
2228                  * Weed out invalid mappings. Note: we assume that the page
2229                  * directory table is always allocated, and in kernel virtual.
2230                  */
2231                 if (ptpaddr == 0)
2232                         continue;
2233
2234                 /*
2235                  * Check for large page.
2236                  */
2237                 if ((ptpaddr & PG_PS) != 0) {
2238                         pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2239                         anychanged = 1;
2240                         continue;
2241                 }
2242
2243                 if (pdnxt > eva)
2244                         pdnxt = eva;
2245
2246                 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2247                     sva += PAGE_SIZE) {
2248                         vm_page_t m;
2249
2250 retry:
2251                         /*
2252                          * Regardless of whether a pte is 32 or 64 bits in
2253                          * size, PG_RW, PG_A, and PG_M are among the least
2254                          * significant 32 bits.
2255                          */
2256                         obits = pbits = *(u_int *)pte;
2257                         if (pbits & PG_MANAGED) {
2258                                 m = NULL;
2259                                 if (pbits & PG_A) {
2260                                         m = PHYS_TO_VM_PAGE(*pte);
2261                                         vm_page_flag_set(m, PG_REFERENCED);
2262                                         pbits &= ~PG_A;
2263                                 }
2264                                 if ((pbits & PG_M) != 0) {
2265                                         if (m == NULL)
2266                                                 m = PHYS_TO_VM_PAGE(*pte);
2267                                         vm_page_dirty(m);
2268                                 }
2269                         }
2270
2271                         pbits &= ~(PG_RW | PG_M);
2272
2273                         if (pbits != obits) {
2274                                 if (!atomic_cmpset_int((u_int *)pte, obits,
2275                                     pbits))
2276                                         goto retry;
2277                                 if (obits & PG_G)
2278                                         pmap_invalidate_page(pmap, sva);
2279                                 else
2280                                         anychanged = 1;
2281                         }
2282                 }
2283         }
2284         sched_unpin();
2285         vm_page_unlock_queues();
2286         if (anychanged)
2287                 pmap_invalidate_all(pmap);
2288         PMAP_UNLOCK(pmap);
2289 }
2290
2291 /*
2292  *      Insert the given physical page (p) at
2293  *      the specified virtual address (v) in the
2294  *      target physical map with the protection requested.
2295  *
2296  *      If specified, the page will be wired down, meaning
2297  *      that the related pte can not be reclaimed.
2298  *
2299  *      NB:  This is the only routine which MAY NOT lazy-evaluate
2300  *      or lose information.  That is, this routine must actually
2301  *      insert this page into the given map NOW.
2302  */
2303 void
2304 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2305            boolean_t wired)
2306 {
2307         vm_paddr_t pa;
2308         pd_entry_t *pde;
2309         pt_entry_t *pte;
2310         vm_paddr_t opa;
2311         pt_entry_t origpte, newpte;
2312         vm_page_t mpte, om;
2313         boolean_t invlva;
2314
2315         va &= PG_FRAME;
2316 #ifdef PMAP_DIAGNOSTIC
2317         if (va > VM_MAX_KERNEL_ADDRESS)
2318                 panic("pmap_enter: toobig");
2319         if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2320                 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2321 #endif
2322
2323         mpte = NULL;
2324
2325         vm_page_lock_queues();
2326         PMAP_LOCK(pmap);
2327         sched_pin();
2328
2329         /*
2330          * In the case that a page table page is not
2331          * resident, we are creating it here.
2332          */
2333         if (va < VM_MAXUSER_ADDRESS) {
2334                 mpte = pmap_allocpte(pmap, va, M_WAITOK);
2335         }
2336 #if 0 && defined(PMAP_DIAGNOSTIC)
2337         else {
2338                 pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2339                 origpte = *pdeaddr;
2340                 if ((origpte & PG_V) == 0) {
2341                         panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2342                                 pmap->pm_pdir[PTDPTDI], origpte, va);
2343                 }
2344         }
2345 #endif
2346
2347         pde = pmap_pde(pmap, va);
2348         if ((*pde & PG_PS) != 0)
2349                 panic("pmap_enter: attempted pmap_enter on 4MB page");
2350         pte = pmap_pte_quick(pmap, va);
2351
2352         /*
2353          * Page Directory table entry not valid, we need a new PT page
2354          */
2355         if (pte == NULL) {
2356                 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
2357                         (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
2358         }
2359
2360         pa = VM_PAGE_TO_PHYS(m);
2361         om = NULL;
2362         origpte = *pte;
2363         opa = origpte & PG_FRAME;
2364
2365         /*
2366          * Mapping has not changed, must be protection or wiring change.
2367          */
2368         if (origpte && (opa == pa)) {
2369                 /*
2370                  * Wiring change, just update stats. We don't worry about
2371                  * wiring PT pages as they remain resident as long as there
2372                  * are valid mappings in them. Hence, if a user page is wired,
2373                  * the PT page will be also.
2374                  */
2375                 if (wired && ((origpte & PG_W) == 0))
2376                         pmap->pm_stats.wired_count++;
2377                 else if (!wired && (origpte & PG_W))
2378                         pmap->pm_stats.wired_count--;
2379
2380                 /*
2381                  * Remove extra pte reference
2382                  */
2383                 if (mpte)
2384                         mpte->wire_count--;
2385
2386                 /*
2387                  * We might be turning off write access to the page,
2388                  * so we go ahead and sense modify status.
2389                  */
2390                 if (origpte & PG_MANAGED) {
2391                         om = m;
2392                         pa |= PG_MANAGED;
2393                 }
2394                 goto validate;
2395         }
2396         /*
2397          * Mapping has changed, invalidate old range and fall through to
2398          * handle validating new mapping.
2399          */
2400         if (opa) {
2401                 if (origpte & PG_W)
2402                         pmap->pm_stats.wired_count--;
2403                 if (origpte & PG_MANAGED) {
2404                         om = PHYS_TO_VM_PAGE(opa);
2405                         pmap_remove_entry(pmap, om, va);
2406                 }
2407                 if (mpte != NULL) {
2408                         mpte->wire_count--;
2409                         KASSERT(mpte->wire_count > 0,
2410                             ("pmap_enter: missing reference to page table page,"
2411                              " va: 0x%x", va));
2412                 }
2413         } else
2414                 pmap->pm_stats.resident_count++;
2415
2416         /*
2417          * Enter on the PV list if part of our managed memory.
2418          */
2419         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2420                 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2421                     ("pmap_enter: managed mapping within the clean submap"));
2422                 pmap_insert_entry(pmap, va, m);
2423                 pa |= PG_MANAGED;
2424         }
2425
2426         /*
2427          * Increment counters
2428          */
2429         if (wired)
2430                 pmap->pm_stats.wired_count++;
2431
2432 validate:
2433         /*
2434          * Now validate mapping with desired protection/wiring.
2435          */
2436         newpte = (pt_entry_t)(pa | PG_V);
2437         if ((prot & VM_PROT_WRITE) != 0)
2438                 newpte |= PG_RW;
2439         if (wired)
2440                 newpte |= PG_W;
2441         if (va < VM_MAXUSER_ADDRESS)
2442                 newpte |= PG_U;
2443         if (pmap == kernel_pmap)
2444                 newpte |= pgeflag;
2445
2446         /*
2447          * if the mapping or permission bits are different, we need
2448          * to update the pte.
2449          */
2450         if ((origpte & ~(PG_M|PG_A)) != newpte) {
2451                 if (origpte & PG_V) {
2452                         invlva = FALSE;
2453                         origpte = pte_load_store(pte, newpte | PG_A);
2454                         if (origpte & PG_A) {
2455                                 if (origpte & PG_MANAGED)
2456                                         vm_page_flag_set(om, PG_REFERENCED);
2457                                 if (opa != VM_PAGE_TO_PHYS(m))
2458                                         invlva = TRUE;
2459                         }
2460                         if (origpte & PG_M) {
2461                                 KASSERT((origpte & PG_RW),
2462         ("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
2463                                     va, (uintmax_t)origpte));
2464                                 if ((origpte & PG_MANAGED) != 0)
2465                                         vm_page_dirty(om);
2466                                 if ((prot & VM_PROT_WRITE) == 0)
2467                                         invlva = TRUE;
2468                         }
2469                         if (invlva)
2470                                 pmap_invalidate_page(pmap, va);
2471                 } else
2472                         pte_store(pte, newpte | PG_A);
2473         }
2474         sched_unpin();
2475         vm_page_unlock_queues();
2476         PMAP_UNLOCK(pmap);
2477 }
2478
2479 /*
2480  * Maps a sequence of resident pages belonging to the same object.
2481  * The sequence begins with the given page m_start.  This page is
2482  * mapped at the given virtual address start.  Each subsequent page is
2483  * mapped at a virtual address that is offset from start by the same
2484  * amount as the page is offset from m_start within the object.  The
2485  * last page in the sequence is the page with the largest offset from
2486  * m_start that can be mapped at a virtual address less than the given
2487  * virtual address end.  Not every virtual page between start and end
2488  * is mapped; only those for which a resident page exists with the
2489  * corresponding offset from m_start are mapped.
2490  */
2491 void
2492 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2493     vm_page_t m_start, vm_prot_t prot)
2494 {
2495         vm_page_t m, mpte;
2496         vm_pindex_t diff, psize;
2497
2498         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
2499         psize = atop(end - start);
2500         mpte = NULL;
2501         m = m_start;
2502         PMAP_LOCK(pmap);
2503         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2504                 mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
2505                     prot, mpte);
2506                 m = TAILQ_NEXT(m, listq);
2507         }
2508         PMAP_UNLOCK(pmap);
2509 }
2510
2511 /*
2512  * this code makes some *MAJOR* assumptions:
2513  * 1. Current pmap & pmap exists.
2514  * 2. Not wired.
2515  * 3. Read access.
2516  * 4. No page table pages.
2517  * but is *MUCH* faster than pmap_enter...
2518  */
2519
2520 void
2521 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2522 {
2523
2524         PMAP_LOCK(pmap);
2525         (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
2526         PMAP_UNLOCK(pmap);
2527 }
2528
2529 static vm_page_t
2530 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2531     vm_prot_t prot, vm_page_t mpte)
2532 {
2533         pt_entry_t *pte;
2534         vm_paddr_t pa;
2535
2536         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2537             (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
2538             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2539         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2540         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2541
2542         /*
2543          * In the case that a page table page is not
2544          * resident, we are creating it here.
2545          */
2546         if (va < VM_MAXUSER_ADDRESS) {
2547                 unsigned ptepindex;
2548                 pd_entry_t ptepa;
2549
2550                 /*
2551                  * Calculate pagetable page index
2552                  */
2553                 ptepindex = va >> PDRSHIFT;
2554                 if (mpte && (mpte->pindex == ptepindex)) {
2555                         mpte->wire_count++;
2556                 } else {
2557                         /*
2558                          * Get the page directory entry
2559                          */
2560                         ptepa = pmap->pm_pdir[ptepindex];
2561
2562                         /*
2563                          * If the page table page is mapped, we just increment
2564                          * the hold count, and activate it.
2565                          */
2566                         if (ptepa) {
2567                                 if (ptepa & PG_PS)
2568                                         panic("pmap_enter_quick: unexpected mapping into 4MB page");
2569                                 mpte = PHYS_TO_VM_PAGE(ptepa);
2570                                 mpte->wire_count++;
2571                         } else {
2572                                 mpte = _pmap_allocpte(pmap, ptepindex,
2573                                     M_NOWAIT);
2574                                 if (mpte == NULL)
2575                                         return (mpte);
2576                         }
2577                 }
2578         } else {
2579                 mpte = NULL;
2580         }
2581
2582         /*
2583          * This call to vtopte makes the assumption that we are
2584          * entering the page into the current pmap.  In order to support
2585          * quick entry into any pmap, one would likely use pmap_pte_quick.
2586          * But that isn't as quick as vtopte.
2587          */
2588         pte = vtopte(va);
2589         if (*pte) {
2590                 if (mpte != NULL) {
2591                         pmap_unwire_pte_hold(pmap, mpte);
2592                         mpte = NULL;
2593                 }
2594                 return (mpte);
2595         }
2596
2597         /*
2598          * Enter on the PV list if part of our managed memory.
2599          */
2600         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
2601             !pmap_try_insert_pv_entry(pmap, va, m)) {
2602                 if (mpte != NULL) {
2603                         pmap_unwire_pte_hold(pmap, mpte);
2604                         mpte = NULL;
2605                 }
2606                 return (mpte);
2607         }
2608
2609         /*
2610          * Increment counters
2611          */
2612         pmap->pm_stats.resident_count++;
2613
2614         pa = VM_PAGE_TO_PHYS(m);
2615
2616         /*
2617          * Now validate mapping with RO protection
2618          */
2619         if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2620                 pte_store(pte, pa | PG_V | PG_U);
2621         else
2622                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2623         return mpte;
2624 }
2625
2626 /*
2627  * Make a temporary mapping for a physical address.  This is only intended
2628  * to be used for panic dumps.
2629  */
2630 void *
2631 pmap_kenter_temporary(vm_paddr_t pa, int i)
2632 {
2633         vm_offset_t va;
2634
2635         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2636         pmap_kenter(va, pa);
2637         invlpg(va);
2638         return ((void *)crashdumpmap);
2639 }
2640
2641 /*
2642  * This code maps large physical mmap regions into the
2643  * processor address space.  Note that some shortcuts
2644  * are taken, but the code works.
2645  */
2646 void
2647 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2648                     vm_object_t object, vm_pindex_t pindex,
2649                     vm_size_t size)
2650 {
2651         vm_page_t p;
2652
2653         VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2654         KASSERT(object->type == OBJT_DEVICE,
2655             ("pmap_object_init_pt: non-device object"));
2656         if (pseflag &&
2657             ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2658                 int i;
2659                 vm_page_t m[1];
2660                 unsigned int ptepindex;
2661                 int npdes;
2662                 pd_entry_t ptepa;
2663
2664                 PMAP_LOCK(pmap);
2665                 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2666                         goto out;
2667                 PMAP_UNLOCK(pmap);
2668 retry:
2669                 p = vm_page_lookup(object, pindex);
2670                 if (p != NULL) {
2671                         if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2672                                 goto retry;
2673                 } else {
2674                         p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2675                         if (p == NULL)
2676                                 return;
2677                         m[0] = p;
2678
2679                         if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2680                                 vm_page_lock_queues();
2681                                 vm_page_free(p);
2682                                 vm_page_unlock_queues();
2683                                 return;
2684                         }
2685
2686                         p = vm_page_lookup(object, pindex);
2687                         vm_page_lock_queues();
2688                         vm_page_wakeup(p);
2689                         vm_page_unlock_queues();
2690                 }
2691
2692                 ptepa = VM_PAGE_TO_PHYS(p);
2693                 if (ptepa & (NBPDR - 1))
2694                         return;
2695
2696                 p->valid = VM_PAGE_BITS_ALL;
2697
2698                 PMAP_LOCK(pmap);
2699                 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2700                 npdes = size >> PDRSHIFT;
2701                 for(i = 0; i < npdes; i++) {
2702                         pde_store(&pmap->pm_pdir[ptepindex],
2703                             ptepa | PG_U | PG_RW | PG_V | PG_PS);
2704                         ptepa += NBPDR;
2705                         ptepindex += 1;
2706                 }
2707                 pmap_invalidate_all(pmap);
2708 out:
2709                 PMAP_UNLOCK(pmap);
2710         }
2711 }
2712
2713 /*
2714  *      Routine:        pmap_change_wiring
2715  *      Function:       Change the wiring attribute for a map/virtual-address
2716  *                      pair.
2717  *      In/out conditions:
2718  *                      The mapping must already exist in the pmap.
2719  */
2720 void
2721 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2722 {
2723         pt_entry_t *pte;
2724
2725         PMAP_LOCK(pmap);
2726         pte = pmap_pte(pmap, va);
2727
2728         if (wired && !pmap_pte_w(pte))
2729                 pmap->pm_stats.wired_count++;
2730         else if (!wired && pmap_pte_w(pte))
2731                 pmap->pm_stats.wired_count--;
2732
2733         /*
2734          * Wiring is not a hardware characteristic so there is no need to
2735          * invalidate TLB.
2736          */
2737         pmap_pte_set_w(pte, wired);
2738         pmap_pte_release(pte);
2739         PMAP_UNLOCK(pmap);
2740 }
2741
2742
2743
2744 /*
2745  *      Copy the range specified by src_addr/len
2746  *      from the source map to the range dst_addr/len
2747  *      in the destination map.
2748  *
2749  *      This routine is only advisory and need not do anything.
2750  */
2751
2752 void
2753 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2754           vm_offset_t src_addr)
2755 {
2756         vm_offset_t addr;
2757         vm_offset_t end_addr = src_addr + len;
2758         vm_offset_t pdnxt;
2759
2760         if (dst_addr != src_addr)
2761                 return;
2762
2763         if (!pmap_is_current(src_pmap))
2764                 return;
2765
2766         vm_page_lock_queues();
2767         if (dst_pmap < src_pmap) {
2768                 PMAP_LOCK(dst_pmap);
2769                 PMAP_LOCK(src_pmap);
2770         } else {
2771                 PMAP_LOCK(src_pmap);
2772                 PMAP_LOCK(dst_pmap);
2773         }
2774         sched_pin();
2775         for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2776                 pt_entry_t *src_pte, *dst_pte;
2777                 vm_page_t dstmpte, srcmpte;
2778                 pd_entry_t srcptepaddr;
2779                 unsigned ptepindex;
2780
2781                 if (addr >= UPT_MIN_ADDRESS)
2782                         panic("pmap_copy: invalid to pmap_copy page tables");
2783
2784                 pdnxt = (addr + NBPDR) & ~PDRMASK;
2785                 ptepindex = addr >> PDRSHIFT;
2786
2787                 srcptepaddr = src_pmap->pm_pdir[ptepindex];
2788                 if (srcptepaddr == 0)
2789                         continue;
2790
2791                 if (srcptepaddr & PG_PS) {
2792                         if (dst_pmap->pm_pdir[ptepindex] == 0) {
2793                                 dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
2794                                     ~PG_W;
2795                                 dst_pmap->pm_stats.resident_count +=
2796                                     NBPDR / PAGE_SIZE;
2797                         }
2798                         continue;
2799                 }
2800
2801                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2802                 if (srcmpte->wire_count == 0)
2803                         panic("pmap_copy: source page table page is unused");
2804
2805                 if (pdnxt > end_addr)
2806                         pdnxt = end_addr;
2807
2808                 src_pte = vtopte(addr);
2809                 while (addr < pdnxt) {
2810                         pt_entry_t ptetemp;
2811                         ptetemp = *src_pte;
2812                         /*
2813                          * we only virtual copy managed pages
2814                          */
2815                         if ((ptetemp & PG_MANAGED) != 0) {
2816                                 dstmpte = pmap_allocpte(dst_pmap, addr,
2817                                     M_NOWAIT);
2818                                 if (dstmpte == NULL)
2819                                         break;
2820                                 dst_pte = pmap_pte_quick(dst_pmap, addr);
2821                                 if (*dst_pte == 0 &&
2822                                     pmap_try_insert_pv_entry(dst_pmap, addr,
2823                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2824                                         /*
2825                                          * Clear the wired, modified, and
2826                                          * accessed (referenced) bits
2827                                          * during the copy.
2828                                          */
2829                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
2830                                             PG_A);
2831                                         dst_pmap->pm_stats.resident_count++;
2832                                 } else
2833                                         pmap_unwire_pte_hold(dst_pmap, dstmpte);
2834                                 if (dstmpte->wire_count >= srcmpte->wire_count)
2835                                         break;
2836                         }
2837                         addr += PAGE_SIZE;
2838                         src_pte++;
2839                 }
2840         }
2841         sched_unpin();
2842         vm_page_unlock_queues();
2843         PMAP_UNLOCK(src_pmap);
2844         PMAP_UNLOCK(dst_pmap);
2845 }
2846
2847 static __inline void
2848 pagezero(void *page)
2849 {
2850 #if defined(I686_CPU)
2851         if (cpu_class == CPUCLASS_686) {
2852 #if defined(CPU_ENABLE_SSE)
2853                 if (cpu_feature & CPUID_SSE2)
2854                         sse2_pagezero(page);
2855                 else
2856 #endif
2857                         i686_pagezero(page);
2858         } else
2859 #endif
2860                 bzero(page, PAGE_SIZE);
2861 }
2862
2863 /*
2864  *      pmap_zero_page zeros the specified hardware page by mapping
2865  *      the page into KVM and using bzero to clear its contents.
2866  */
2867 void
2868 pmap_zero_page(vm_page_t m)
2869 {
2870         struct sysmaps *sysmaps;
2871
2872         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2873         mtx_lock(&sysmaps->lock);
2874         if (*sysmaps->CMAP2)
2875                 panic("pmap_zero_page: CMAP2 busy");
2876         sched_pin();
2877         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2878         invlcaddr(sysmaps->CADDR2);
2879         pagezero(sysmaps->CADDR2);
2880         *sysmaps->CMAP2 = 0;
2881         sched_unpin();
2882         mtx_unlock(&sysmaps->lock);
2883 }
2884
2885 /*
2886  *      pmap_zero_page_area zeros the specified hardware page by mapping
2887  *      the page into KVM and using bzero to clear its contents.
2888  *
2889  *      off and size may not cover an area beyond a single hardware page.
2890  */
2891 void
2892 pmap_zero_page_area(vm_page_t m, int off, int size)
2893 {
2894         struct sysmaps *sysmaps;
2895
2896         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2897         mtx_lock(&sysmaps->lock);
2898         if (*sysmaps->CMAP2)
2899                 panic("pmap_zero_page: CMAP2 busy");
2900         sched_pin();
2901         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2902         invlcaddr(sysmaps->CADDR2);
2903         if (off == 0 && size == PAGE_SIZE)
2904                 pagezero(sysmaps->CADDR2);
2905         else
2906                 bzero((char *)sysmaps->CADDR2 + off, size);
2907         *sysmaps->CMAP2 = 0;
2908         sched_unpin();
2909         mtx_unlock(&sysmaps->lock);
2910 }
2911
2912 /*
2913  *      pmap_zero_page_idle zeros the specified hardware page by mapping
2914  *      the page into KVM and using bzero to clear its contents.  This
2915  *      is intended to be called from the vm_pagezero process only and
2916  *      outside of Giant.
2917  */
2918 void
2919 pmap_zero_page_idle(vm_page_t m)
2920 {
2921
2922         if (*CMAP3)
2923                 panic("pmap_zero_page: CMAP3 busy");
2924         sched_pin();
2925         *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2926         invlcaddr(CADDR3);
2927         pagezero(CADDR3);
2928         *CMAP3 = 0;
2929         sched_unpin();
2930 }
2931
2932 /*
2933  *      pmap_copy_page copies the specified (machine independent)
2934  *      page by mapping the page into virtual memory and using
2935  *      bcopy to copy the page, one machine dependent page at a
2936  *      time.
2937  */
2938 void
2939 pmap_copy_page(vm_page_t src, vm_page_t dst)
2940 {
2941         struct sysmaps *sysmaps;
2942
2943         sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2944         mtx_lock(&sysmaps->lock);
2945         if (*sysmaps->CMAP1)
2946                 panic("pmap_copy_page: CMAP1 busy");
2947         if (*sysmaps->CMAP2)
2948                 panic("pmap_copy_page: CMAP2 busy");
2949         sched_pin();
2950         invlpg((u_int)sysmaps->CADDR1);
2951         invlpg((u_int)sysmaps->CADDR2);
2952         *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
2953         *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
2954         bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
2955         *sysmaps->CMAP1 = 0;
2956         *sysmaps->CMAP2 = 0;
2957         sched_unpin();
2958         mtx_unlock(&sysmaps->lock);
2959 }
2960
2961 /*
2962  * Returns true if the pmap's pv is one of the first
2963  * 16 pvs linked to from this page.  This count may
2964  * be changed upwards or downwards in the future; it
2965  * is only necessary that true be returned for a small
2966  * subset of pmaps for proper page aging.
2967  */
2968 boolean_t
2969 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
2970 {
2971         pv_entry_t pv;
2972         int loops = 0;
2973
2974         if (m->flags & PG_FICTITIOUS)
2975                 return FALSE;
2976
2977         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2978         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
2979                 if (PV_PMAP(pv) == pmap) {
2980                         return TRUE;
2981                 }
2982                 loops++;
2983                 if (loops >= 16)
2984                         break;
2985         }
2986         return (FALSE);
2987 }
2988
2989 /*
2990  * Remove all pages from specified address space
2991  * this aids process exit speeds.  Also, this code
2992  * is special cased for current process only, but
2993  * can have the more generic (and slightly slower)
2994  * mode enabled.  This is much faster than pmap_remove
2995  * in the case of running down an entire address space.
2996  */
2997 void
2998 pmap_remove_pages(pmap_t pmap)
2999 {
3000         pt_entry_t *pte, tpte;
3001         vm_page_t m;
3002         pv_entry_t pv;
3003         struct pv_chunk *pc, *npc;
3004         int field, idx;
3005         int32_t bit;
3006         uint32_t inuse, bitmask;
3007         int allfree;
3008
3009         if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3010                 printf("warning: pmap_remove_pages called with non-current pmap\n");
3011                 return;
3012         }
3013         vm_page_lock_queues();
3014         PMAP_LOCK(pmap);
3015         sched_pin();
3016         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3017                 allfree = 1;
3018                 for (field = 0; field < _NPCM; field++) {
3019                         inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3020                         while (inuse != 0) {
3021                                 bit = bsfl(inuse);
3022                                 bitmask = 1UL << bit;
3023                                 idx = field * 32 + bit;
3024                                 pv = &pc->pc_pventry[idx];
3025                                 inuse &= ~bitmask;
3026
3027                                 pte = vtopte(pv->pv_va);
3028                                 tpte = *pte;
3029
3030                                 if (tpte == 0) {
3031                                         printf(
3032                                             "TPTE at %p  IS ZERO @ VA %08x\n",
3033                                             pte, pv->pv_va);
3034                                         panic("bad pte");
3035                                 }
3036
3037 /*
3038  * We cannot remove wired pages from a process' mapping at this time
3039  */
3040                                 if (tpte & PG_W) {
3041                                         allfree = 0;
3042                                         continue;
3043                                 }
3044
3045                                 m = PHYS_TO_VM_PAGE(tpte);
3046                                 KASSERT(m->phys_addr == (tpte & PG_FRAME),
3047                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3048                                     m, (uintmax_t)m->phys_addr,
3049                                     (uintmax_t)tpte));
3050
3051                                 KASSERT(m < &vm_page_array[vm_page_array_size],
3052                                         ("pmap_remove_pages: bad tpte %#jx",
3053                                         (uintmax_t)tpte));
3054
3055                                 pmap->pm_stats.resident_count--;
3056
3057                                 pte_clear(pte);
3058
3059                                 /*
3060                                  * Update the vm_page_t clean/reference bits.
3061                                  */
3062                                 if (tpte & PG_M)
3063                                         vm_page_dirty(m);
3064
3065                                 /* Mark free */
3066                                 PV_STAT(pv_entry_frees++);
3067                                 PV_STAT(pv_entry_spare++);
3068                                 pv_entry_count--;
3069                                 pc->pc_map[field] |= bitmask;
3070                                 m->md.pv_list_count--;
3071                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3072                                 if (TAILQ_EMPTY(&m->md.pv_list))
3073                                         vm_page_flag_clear(m, PG_WRITEABLE);
3074
3075                                 pmap_unuse_pt(pmap, pv->pv_va);
3076                         }
3077                 }
3078                 if (allfree) {
3079                         PV_STAT(pv_entry_spare -= _NPCPV);
3080                         PV_STAT(pc_chunk_count--);
3081                         PV_STAT(pc_chunk_frees++);
3082                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3083                         m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
3084                         pmap_qremove((vm_offset_t)pc, 1);
3085                         vm_page_unwire(m, 0);
3086                         vm_page_free(m);
3087                         pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
3088                 }
3089         }
3090         sched_unpin();
3091         vm_page_unlock_queues();
3092         pmap_invalidate_all(pmap);
3093         PMAP_UNLOCK(pmap);
3094 }
3095
3096 /*
3097  *      pmap_is_modified:
3098  *
3099  *      Return whether or not the specified physical page was modified
3100  *      in any physical maps.
3101  */
3102 boolean_t
3103 pmap_is_modified(vm_page_t m)
3104 {
3105         pv_entry_t pv;
3106         pt_entry_t *pte;
3107         pmap_t pmap;
3108         boolean_t rv;
3109
3110         rv = FALSE;
3111         if (m->flags & PG_FICTITIOUS)
3112                 return (rv);
3113
3114         sched_pin();
3115         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3116         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3117                 pmap = PV_PMAP(pv);
3118                 PMAP_LOCK(pmap);
3119                 pte = pmap_pte_quick(pmap, pv->pv_va);
3120                 rv = (*pte & PG_M) != 0;
3121                 PMAP_UNLOCK(pmap);
3122                 if (rv)
3123                         break;
3124         }
3125         sched_unpin();
3126         return (rv);
3127 }
3128
3129 /*
3130  *      pmap_is_prefaultable:
3131  *
3132  *      Return whether or not the specified virtual address is elgible
3133  *      for prefault.
3134  */
3135 boolean_t
3136 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3137 {
3138         pt_entry_t *pte;
3139         boolean_t rv;
3140
3141         rv = FALSE;
3142         PMAP_LOCK(pmap);
3143         if (*pmap_pde(pmap, addr)) {
3144                 pte = vtopte(addr);
3145                 rv = *pte == 0;
3146         }
3147         PMAP_UNLOCK(pmap);
3148         return (rv);
3149 }
3150
3151 /*
3152  * Clear the write and modified bits in each of the given page's mappings.
3153  */
3154 void
3155 pmap_remove_write(vm_page_t m)
3156 {
3157         pv_entry_t pv;
3158         pmap_t pmap;
3159         pt_entry_t oldpte, *pte;
3160
3161         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3162         if ((m->flags & PG_FICTITIOUS) != 0 ||
3163             (m->flags & PG_WRITEABLE) == 0)
3164                 return;
3165         sched_pin();
3166         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3167                 pmap = PV_PMAP(pv);
3168                 PMAP_LOCK(pmap);
3169                 pte = pmap_pte_quick(pmap, pv->pv_va);
3170 retry:
3171                 oldpte = *pte;
3172                 if ((oldpte & PG_RW) != 0) {
3173                         /*
3174                          * Regardless of whether a pte is 32 or 64 bits
3175                          * in size, PG_RW and PG_M are among the least
3176                          * significant 32 bits.
3177                          */
3178                         if (!atomic_cmpset_int((u_int *)pte, oldpte,
3179                             oldpte & ~(PG_RW | PG_M)))
3180                                 goto retry;
3181                         if ((oldpte & PG_M) != 0)
3182                                 vm_page_dirty(m);
3183                         pmap_invalidate_page(pmap, pv->pv_va);
3184                 }
3185                 PMAP_UNLOCK(pmap);
3186         }
3187         vm_page_flag_clear(m, PG_WRITEABLE);
3188         sched_unpin();
3189 }
3190
3191 /*
3192  *      pmap_ts_referenced:
3193  *
3194  *      Return a count of reference bits for a page, clearing those bits.
3195  *      It is not necessary for every reference bit to be cleared, but it
3196  *      is necessary that 0 only be returned when there are truly no
3197  *      reference bits set.
3198  *
3199  *      XXX: The exact number of bits to check and clear is a matter that
3200  *      should be tested and standardized at some point in the future for
3201  *      optimal aging of shared pages.
3202  */
3203 int
3204 pmap_ts_referenced(vm_page_t m)
3205 {
3206         pv_entry_t pv, pvf, pvn;
3207         pmap_t pmap;
3208         pt_entry_t *pte;
3209         int rtval = 0;
3210
3211         if (m->flags & PG_FICTITIOUS)
3212                 return (rtval);
3213         sched_pin();
3214         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3215         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3216                 pvf = pv;
3217                 do {
3218                         pvn = TAILQ_NEXT(pv, pv_list);
3219                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3220                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3221                         pmap = PV_PMAP(pv);
3222                         PMAP_LOCK(pmap);
3223                         pte = pmap_pte_quick(pmap, pv->pv_va);
3224                         if ((*pte & PG_A) != 0) {
3225                                 atomic_clear_int((u_int *)pte, PG_A);
3226                                 pmap_invalidate_page(pmap, pv->pv_va);
3227                                 rtval++;
3228                                 if (rtval > 4)
3229                                         pvn = NULL;
3230                         }
3231                         PMAP_UNLOCK(pmap);
3232                 } while ((pv = pvn) != NULL && pv != pvf);
3233         }
3234         sched_unpin();
3235         return (rtval);
3236 }
3237
3238 /*
3239  *      Clear the modify bits on the specified physical page.
3240  */
3241 void
3242 pmap_clear_modify(vm_page_t m)
3243 {
3244         pv_entry_t pv;
3245         pmap_t pmap;
3246         pt_entry_t *pte;
3247
3248         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3249         if ((m->flags & PG_FICTITIOUS) != 0)
3250                 return;
3251         sched_pin();
3252         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3253                 pmap = PV_PMAP(pv);
3254                 PMAP_LOCK(pmap);
3255                 pte = pmap_pte_quick(pmap, pv->pv_va);
3256                 if ((*pte & PG_M) != 0) {
3257                         /*
3258                          * Regardless of whether a pte is 32 or 64 bits
3259                          * in size, PG_M is among the least significant
3260                          * 32 bits.
3261                          */
3262                         atomic_clear_int((u_int *)pte, PG_M);
3263                         pmap_invalidate_page(pmap, pv->pv_va);
3264                 }
3265                 PMAP_UNLOCK(pmap);
3266         }
3267         sched_unpin();
3268 }
3269
3270 /*
3271  *      pmap_clear_reference:
3272  *
3273  *      Clear the reference bit on the specified physical page.
3274  */
3275 void
3276 pmap_clear_reference(vm_page_t m)
3277 {
3278         pv_entry_t pv;
3279         pmap_t pmap;
3280         pt_entry_t *pte;
3281
3282         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3283         if ((m->flags & PG_FICTITIOUS) != 0)
3284                 return;
3285         sched_pin();
3286         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3287                 pmap = PV_PMAP(pv);
3288                 PMAP_LOCK(pmap);
3289                 pte = pmap_pte_quick(pmap, pv->pv_va);
3290                 if ((*pte & PG_A) != 0) {
3291                         /*
3292                          * Regardless of whether a pte is 32 or 64 bits
3293                          * in size, PG_A is among the least significant
3294                          * 32 bits.
3295                          */
3296                         atomic_clear_int((u_int *)pte, PG_A);
3297                         pmap_invalidate_page(pmap, pv->pv_va);
3298                 }
3299                 PMAP_UNLOCK(pmap);
3300         }
3301         sched_unpin();
3302 }
3303
3304 /*
3305  * Miscellaneous support routines follow
3306  */
3307
3308 /*
3309  * Map a set of physical memory pages into the kernel virtual
3310  * address space. Return a pointer to where it is mapped. This
3311  * routine is intended to be used for mapping device memory,
3312  * NOT real memory.
3313  */
3314 void *
3315 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
3316 {
3317         vm_offset_t va, tmpva, offset;
3318
3319         offset = pa & PAGE_MASK;
3320         size = roundup(offset + size, PAGE_SIZE);
3321         pa = pa & PG_FRAME;
3322
3323         if (pa < KERNLOAD && pa + size <= KERNLOAD)
3324                 va = KERNBASE + pa;
3325         else
3326                 va = kmem_alloc_nofault(kernel_map, size);
3327         if (!va)
3328                 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3329
3330         for (tmpva = va; size > 0; ) {
3331                 pmap_kenter_attr(tmpva, pa, mode);
3332                 size -= PAGE_SIZE;
3333                 tmpva += PAGE_SIZE;
3334                 pa += PAGE_SIZE;
3335         }
3336         pmap_invalidate_range(kernel_pmap, va, tmpva);
3337         pmap_invalidate_cache();
3338         return ((void *)(va + offset));
3339 }
3340
3341 void *
3342 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3343 {
3344
3345         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
3346 }
3347
3348 void *
3349 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
3350 {
3351
3352         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
3353 }
3354
3355 void
3356 pmap_unmapdev(vm_offset_t va, vm_size_t size)
3357 {
3358         vm_offset_t base, offset, tmpva;
3359
3360         if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
3361                 return;
3362         base = va & PG_FRAME;
3363         offset = va & PAGE_MASK;
3364         size = roundup(offset + size, PAGE_SIZE);
3365         for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
3366                 pmap_kremove(tmpva);
3367         pmap_invalidate_range(kernel_pmap, va, tmpva);
3368         kmem_free(kernel_map, base, size);
3369 }
3370
3371 int
3372 pmap_change_attr(va, size, mode)
3373         vm_offset_t va;
3374         vm_size_t size;
3375         int mode;
3376 {
3377         vm_offset_t base, offset, tmpva;
3378         pt_entry_t *pte;
3379         u_int opte, npte;
3380         pd_entry_t *pde;
3381
3382         base = va & PG_FRAME;
3383         offset = va & PAGE_MASK;
3384         size = roundup(offset + size, PAGE_SIZE);
3385
3386         /* Only supported on kernel virtual addresses. */
3387         if (base <= VM_MAXUSER_ADDRESS)
3388                 return (EINVAL);
3389
3390         /* 4MB pages and pages that aren't mapped aren't supported. */
3391         for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
3392                 pde = pmap_pde(kernel_pmap, tmpva);
3393                 if (*pde & PG_PS)
3394                         return (EINVAL);
3395                 if (*pde == 0)
3396                         return (EINVAL);
3397                 pte = vtopte(va);
3398                 if (*pte == 0)
3399                         return (EINVAL);
3400         }
3401
3402         /*
3403          * Ok, all the pages exist and are 4k, so run through them updating
3404          * their cache mode.
3405          */
3406         for (tmpva = base; size > 0; ) {
3407                 pte = vtopte(tmpva);
3408
3409                 /*
3410                  * The cache mode bits are all in the low 32-bits of the
3411                  * PTE, so we can just spin on updating the low 32-bits.
3412                  */
3413                 do {
3414                         opte = *(u_int *)pte;
3415                         npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
3416                         npte |= pmap_cache_bits(mode, 0);
3417                 } while (npte != opte &&
3418                     !atomic_cmpset_int((u_int *)pte, opte, npte));
3419                 tmpva += PAGE_SIZE;
3420                 size -= PAGE_SIZE;
3421         }
3422
3423         /*
3424          * Flush CPU caches to make sure any data isn't cached that shouldn't
3425          * be, etc.
3426          */
3427         pmap_invalidate_range(kernel_pmap, base, tmpva);
3428         pmap_invalidate_cache();
3429         return (0);
3430 }
3431
3432 /*
3433  * perform the pmap work for mincore
3434  */
3435 int
3436 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3437 {
3438         pt_entry_t *ptep, pte;
3439         vm_page_t m;
3440         int val = 0;
3441
3442         PMAP_LOCK(pmap);
3443         ptep = pmap_pte(pmap, addr);
3444         pte = (ptep != NULL) ? *ptep : 0;
3445         pmap_pte_release(ptep);
3446         PMAP_UNLOCK(pmap);
3447
3448         if (pte != 0) {
3449                 vm_paddr_t pa;
3450
3451                 val = MINCORE_INCORE;
3452                 if ((pte & PG_MANAGED) == 0)
3453                         return val;
3454
3455                 pa = pte & PG_FRAME;
3456
3457                 m = PHYS_TO_VM_PAGE(pa);
3458
3459                 /*
3460                  * Modified by us
3461                  */
3462                 if (pte & PG_M)
3463                         val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3464                 else {
3465                         /*
3466                          * Modified by someone else
3467                          */
3468                         vm_page_lock_queues();
3469                         if (m->dirty || pmap_is_modified(m))
3470                                 val |= MINCORE_MODIFIED_OTHER;
3471                         vm_page_unlock_queues();
3472                 }
3473                 /*
3474                  * Referenced by us
3475                  */
3476                 if (pte & PG_A)
3477                         val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3478                 else {
3479                         /*
3480                          * Referenced by someone else
3481                          */
3482                         vm_page_lock_queues();
3483                         if ((m->flags & PG_REFERENCED) ||
3484                             pmap_ts_referenced(m)) {
3485                                 val |= MINCORE_REFERENCED_OTHER;
3486                                 vm_page_flag_set(m, PG_REFERENCED);
3487                         }
3488                         vm_page_unlock_queues();
3489                 }
3490         }
3491         return val;
3492 }
3493
3494 void
3495 pmap_activate(struct thread *td)
3496 {
3497         pmap_t  pmap, oldpmap;
3498         u_int32_t  cr3;
3499
3500         critical_enter();
3501         pmap = vmspace_pmap(td->td_proc->p_vmspace);
3502         oldpmap = PCPU_GET(curpmap);
3503 #if defined(SMP)
3504         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3505         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3506 #else
3507         oldpmap->pm_active &= ~1;
3508         pmap->pm_active |= 1;
3509 #endif
3510 #ifdef PAE
3511         cr3 = vtophys(pmap->pm_pdpt);
3512 #else
3513         cr3 = vtophys(pmap->pm_pdir);
3514 #endif
3515         /*
3516          * pmap_activate is for the current thread on the current cpu
3517          */
3518         td->td_pcb->pcb_cr3 = cr3;
3519         load_cr3(cr3);
3520         PCPU_SET(curpmap, pmap);
3521         critical_exit();
3522 }
3523
3524 vm_offset_t
3525 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3526 {
3527
3528         if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3529                 return addr;
3530         }
3531
3532         addr = (addr + PDRMASK) & ~PDRMASK;
3533         return addr;
3534 }
3535
3536
3537 #if defined(PMAP_DEBUG)
3538 pmap_pid_dump(int pid)
3539 {
3540         pmap_t pmap;
3541         struct proc *p;
3542         int npte = 0;
3543         int index;
3544
3545         sx_slock(&allproc_lock);
3546         LIST_FOREACH(p, &allproc, p_list) {
3547                 if (p->p_pid != pid)
3548                         continue;
3549
3550                 if (p->p_vmspace) {
3551                         int i,j;
3552                         index = 0;
3553                         pmap = vmspace_pmap(p->p_vmspace);
3554                         for (i = 0; i < NPDEPTD; i++) {
3555                                 pd_entry_t *pde;
3556                                 pt_entry_t *pte;
3557                                 vm_offset_t base = i << PDRSHIFT;
3558
3559                                 pde = &pmap->pm_pdir[i];
3560                                 if (pde && pmap_pde_v(pde)) {
3561                                         for (j = 0; j < NPTEPG; j++) {
3562                                                 vm_offset_t va = base + (j << PAGE_SHIFT);
3563                                                 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3564                                                         if (index) {
3565                                                                 index = 0;
3566                                                                 printf("\n");
3567                                                         }
3568                                                         sx_sunlock(&allproc_lock);
3569                                                         return npte;
3570                                                 }
3571                                                 pte = pmap_pte(pmap, va);
3572                                                 if (pte && pmap_pte_v(pte)) {
3573                                                         pt_entry_t pa;
3574                                                         vm_page_t m;
3575                                                         pa = *pte;
3576                                                         m = PHYS_TO_VM_PAGE(pa);
3577                                                         printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3578                                                                 va, pa, m->hold_count, m->wire_count, m->flags);
3579                                                         npte++;
3580                                                         index++;
3581                                                         if (index >= 2) {
3582                                                                 index = 0;
3583                                                                 printf("\n");
3584                                                         } else {
3585                                                                 printf(" ");
3586                                                         }
3587                                                 }
3588                                         }
3589                                 }
3590                         }
3591                 }
3592         }
3593         sx_sunlock(&allproc_lock);
3594         return npte;
3595 }
3596 #endif
3597
3598 #if defined(DEBUG)
3599
3600 static void     pads(pmap_t pm);
3601 void            pmap_pvdump(vm_offset_t pa);
3602
3603 /* print address space of pmap*/
3604 static void
3605 pads(pmap_t pm)
3606 {
3607         int i, j;
3608         vm_paddr_t va;
3609         pt_entry_t *ptep;
3610
3611         if (pm == kernel_pmap)
3612                 return;
3613         for (i = 0; i < NPDEPTD; i++)
3614                 if (pm->pm_pdir[i])
3615                         for (j = 0; j < NPTEPG; j++) {
3616                                 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3617                                 if (pm == kernel_pmap && va < KERNBASE)
3618                                         continue;
3619                                 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3620                                         continue;
3621                                 ptep = pmap_pte(pm, va);
3622                                 if (pmap_pte_v(ptep))
3623                                         printf("%x:%x ", va, *ptep);
3624                         };
3625
3626 }
3627
3628 void
3629 pmap_pvdump(vm_paddr_t pa)
3630 {
3631         pv_entry_t pv;
3632         pmap_t pmap;
3633         vm_page_t m;
3634
3635         printf("pa %x", pa);
3636         m = PHYS_TO_VM_PAGE(pa);
3637         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3638                 pmap = PV_PMAP(pv);
3639                 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
3640                 pads(pmap);
3641         }
3642         printf(" ");
3643 }
3644 #endif