sys/amd64/amd64/pmap.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-4-Clause
   3  *
   4  * Copyright (c) 1991 Regents of the University of California.
   5  * All rights reserved.
   6  * Copyright (c) 1994 John S. Dyson
   7  * All rights reserved.
   8  * Copyright (c) 1994 David Greenman
   9  * All rights reserved.
  10  * Copyright (c) 2003 Peter Wemm
  11  * All rights reserved.
  12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  13  * All rights reserved.
  14  *
  15  * This code is derived from software contributed to Berkeley by
  16  * the Systems Programming Group of the University of Utah Computer
  17  * Science Department and William Jolitz of UUNET Technologies Inc.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  * 3. All advertising materials mentioning features or use of this software
  28  *    must display the following acknowledgement:
  29  *      This product includes software developed by the University of
  30  *      California, Berkeley and its contributors.
  31  * 4. Neither the name of the University nor the names of its contributors
  32  *    may be used to endorse or promote products derived from this software
  33  *    without specific prior written permission.
  34  *
  35  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  36  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  38  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  39  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  40  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  41  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  42  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  43  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  44  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45  * SUCH DAMAGE.
  46  *
  47  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  48  */
  49 /*-
  50  * Copyright (c) 2003 Networks Associates Technology, Inc.
  51  * Copyright (c) 2014-2018 The FreeBSD Foundation
  52  * All rights reserved.
  53  *
  54  * This software was developed for the FreeBSD Project by Jake Burkholder,
  55  * Safeport Network Services, and Network Associates Laboratories, the
  56  * Security Research Division of Network Associates, Inc. under
  57  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  58  * CHATS research program.
  59  *
  60  * Portions of this software were developed by
  61  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  62  * the FreeBSD Foundation.
  63  *
  64  * Redistribution and use in source and binary forms, with or without
  65  * modification, are permitted provided that the following conditions
  66  * are met:
  67  * 1. Redistributions of source code must retain the above copyright
  68  *    notice, this list of conditions and the following disclaimer.
  69  * 2. Redistributions in binary form must reproduce the above copyright
  70  *    notice, this list of conditions and the following disclaimer in the
  71  *    documentation and/or other materials provided with the distribution.
  72  *
  73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  83  * SUCH DAMAGE.
  84  */
  85
  86 #define AMD64_NPT_AWARE
  87
  88 #include <sys/cdefs.h>
  89 __FBSDID("$FreeBSD$");
  90
  91 /*
  92  *      Manages physical address maps.
  93  *
  94  *      Since the information managed by this module is
  95  *      also stored by the logical address mapping module,
  96  *      this module may throw away valid virtual-to-physical
  97  *      mappings at almost any time.  However, invalidations
  98  *      of virtual-to-physical mappings must be done as
  99  *      requested.
 100  *
 101  *      In order to cope with hardware architectures which
 102  *      make virtual-to-physical map invalidates expensive,
 103  *      this module may delay invalidate or reduced protection
 104  *      operations until such time as they are actually
 105  *      necessary.  This module is given full information as
 106  *      to which processors are currently using which maps,
 107  *      and to when physical maps must be made correct.
 108  */
 109
 110 #include "opt_pmap.h"
 111 #include "opt_vm.h"
 112
 113 #include <sys/param.h>
 114 #include <sys/bitstring.h>
 115 #include <sys/bus.h>
 116 #include <sys/systm.h>
 117 #include <sys/kernel.h>
 118 #include <sys/ktr.h>
 119 #include <sys/lock.h>
 120 #include <sys/malloc.h>
 121 #include <sys/mman.h>
 122 #include <sys/mutex.h>
 123 #include <sys/proc.h>
 124 #include <sys/rwlock.h>
 125 #include <sys/sx.h>
 126 #include <sys/turnstile.h>
 127 #include <sys/vmem.h>
 128 #include <sys/vmmeter.h>
 129 #include <sys/sched.h>
 130 #include <sys/sysctl.h>
 131 #include <sys/smp.h>
 132
 133 #include <vm/vm.h>
 134 #include <vm/vm_param.h>
 135 #include <vm/vm_kern.h>
 136 #include <vm/vm_page.h>
 137 #include <vm/vm_map.h>
 138 #include <vm/vm_object.h>
 139 #include <vm/vm_extern.h>
 140 #include <vm/vm_pageout.h>
 141 #include <vm/vm_pager.h>
 142 #include <vm/vm_phys.h>
 143 #include <vm/vm_radix.h>
 144 #include <vm/vm_reserv.h>
 145 #include <vm/uma.h>
 146
 147 #include <machine/intr_machdep.h>
 148 #include <x86/apicvar.h>
 149 #include <x86/ifunc.h>
 150 #include <machine/cpu.h>
 151 #include <machine/cputypes.h>
 152 #include <machine/md_var.h>
 153 #include <machine/pcb.h>
 154 #include <machine/specialreg.h>
 155 #ifdef SMP
 156 #include <machine/smp.h>
 157 #endif
 158 #include <machine/tss.h>
 159
 160 static __inline boolean_t
 161 pmap_type_guest(pmap_t pmap)
 162 {
 163
 164         return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
 165 }
 166
 167 static __inline boolean_t
 168 pmap_emulate_ad_bits(pmap_t pmap)
 169 {
 170
 171         return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
 172 }
 173
 174 static __inline pt_entry_t
 175 pmap_valid_bit(pmap_t pmap)
 176 {
 177         pt_entry_t mask;
 178
 179         switch (pmap->pm_type) {
 180         case PT_X86:
 181         case PT_RVI:
 182                 mask = X86_PG_V;
 183                 break;
 184         case PT_EPT:
 185                 if (pmap_emulate_ad_bits(pmap))
 186                         mask = EPT_PG_EMUL_V;
 187                 else
 188                         mask = EPT_PG_READ;
 189                 break;
 190         default:
 191                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
 192         }
 193
 194         return (mask);
 195 }
 196
 197 static __inline pt_entry_t
 198 pmap_rw_bit(pmap_t pmap)
 199 {
 200         pt_entry_t mask;
 201
 202         switch (pmap->pm_type) {
 203         case PT_X86:
 204         case PT_RVI:
 205                 mask = X86_PG_RW;
 206                 break;
 207         case PT_EPT:
 208                 if (pmap_emulate_ad_bits(pmap))
 209                         mask = EPT_PG_EMUL_RW;
 210                 else
 211                         mask = EPT_PG_WRITE;
 212                 break;
 213         default:
 214                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
 215         }
 216
 217         return (mask);
 218 }
 219
 220 static pt_entry_t pg_g;
 221
 222 static __inline pt_entry_t
 223 pmap_global_bit(pmap_t pmap)
 224 {
 225         pt_entry_t mask;
 226
 227         switch (pmap->pm_type) {
 228         case PT_X86:
 229                 mask = pg_g;
 230                 break;
 231         case PT_RVI:
 232         case PT_EPT:
 233                 mask = 0;
 234                 break;
 235         default:
 236                 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
 237         }
 238
 239         return (mask);
 240 }
 241
 242 static __inline pt_entry_t
 243 pmap_accessed_bit(pmap_t pmap)
 244 {
 245         pt_entry_t mask;
 246
 247         switch (pmap->pm_type) {
 248         case PT_X86:
 249         case PT_RVI:
 250                 mask = X86_PG_A;
 251                 break;
 252         case PT_EPT:
 253                 if (pmap_emulate_ad_bits(pmap))
 254                         mask = EPT_PG_READ;
 255                 else
 256                         mask = EPT_PG_A;
 257                 break;
 258         default:
 259                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
 260         }
 261
 262         return (mask);
 263 }
 264
 265 static __inline pt_entry_t
 266 pmap_modified_bit(pmap_t pmap)
 267 {
 268         pt_entry_t mask;
 269
 270         switch (pmap->pm_type) {
 271         case PT_X86:
 272         case PT_RVI:
 273                 mask = X86_PG_M;
 274                 break;
 275         case PT_EPT:
 276                 if (pmap_emulate_ad_bits(pmap))
 277                         mask = EPT_PG_WRITE;
 278                 else
 279                         mask = EPT_PG_M;
 280                 break;
 281         default:
 282                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
 283         }
 284
 285         return (mask);
 286 }
 287
/*
 * PMAP_INLINE: inlining qualifier used for vtopte().  It expands to
 * nothing when DIAGNOSTIC is defined; otherwise it selects GNU inline
 * semantics, spelled according to whether __GNUC_GNU_INLINE__ is
 * defined.
 */
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
#else
#define PMAP_INLINE     extern inline
#endif
#else
#define PMAP_INLINE
#endif

/*
 * PV_STAT(x): evaluate the statement x only when PV_STATS is defined;
 * otherwise expand to an empty statement.
 */
#ifdef PV_STATS
#define PV_STAT(x)      do { x ; } while (0)
#else
#define PV_STAT(x)      do { } while (0)
#endif

/*
 * pa_index() shifts a physical address right by PDRSHIFT;
 * pa_to_pvh() uses that index to select an element of pv_table[].
 */
#define pa_index(pa)    ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])

/* Number of elements in pv_list_locks[] and pv_invl_gen[]. */
#define NPV_LIST_LOCKS  MAXCPU

/*
 * Map a physical address to its PV list lock: pa_index(pa) modulo
 * NPV_LIST_LOCKS selects the element of pv_list_locks[].
 */
#define PHYS_TO_PV_LIST_LOCK(pa)        \
                        (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])

/*
 * Make *lockp refer to the write-locked PV list lock for "pa".  If
 * *lockp already is that lock nothing happens.  Otherwise the lock
 * currently recorded in *lockp (if any) is write-unlocked, the new lock
 * is stored in *lockp and then write-locked.
 */
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
        struct rwlock **_lockp = (lockp);               \
        struct rwlock *_new_lock;                       \
                                                        \
        _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
        if (_new_lock != *_lockp) {                     \
                if (*_lockp != NULL)                    \
                        rw_wunlock(*_lockp);            \
                *_lockp = _new_lock;                    \
                rw_wlock(*_lockp);                      \
        }                                               \
} while (0)

/* As CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the page's physical address. */
#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
                        CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))

/*
 * Write-unlock the PV list lock recorded in *lockp, if any, and reset
 * *lockp to NULL.
 */
#define RELEASE_PV_LIST_LOCK(lockp)             do {    \
        struct rwlock **_lockp = (lockp);               \
                                                        \
        if (*_lockp != NULL) {                          \
                rw_wunlock(*_lockp);                    \
                *_lockp = NULL;                         \
        }                                               \
} while (0)

/* Map a vm_page to its PV list lock via the page's physical address. */
#define VM_PAGE_TO_PV_LIST_LOCK(m)      \
                        PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 339
/* NOTE(review): presumably the backing storage for the kernel's pmap. */
struct pmap kernel_pmap_store;

vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */

/* Set by nkpt_init(); exported read-only as machdep.nkpt. */
int nkpt;
SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
    "Number of kernel page table pages allocated on bootup");

static int ndmpdp;
vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
pt_entry_t pg_nx;

/* Parent node for the vm.pmap.* sysctls declared in this file. */
static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");

/* Exported read-only as vm.pmap.pat_works. */
static int pat_works = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
    "Is page attribute table fully functional?");

/* Exported as the read-only tunable vm.pmap.pg_ps_enabled. */
static int pg_ps_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pg_ps_enabled, 0, "Are large page mappings enabled?");

#define PAT_INDEX_SIZE  8
static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */

/* Physical addresses of the bootstrap kernel page table pages. */
static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */

/* Physical addresses of the direct map page table pages. */
static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
static int              ndmpdpphys;     /* number of DMPDPphys pages */

static vm_paddr_t       KERNend;        /* phys addr of end of bootstrap data */
 377
/*
 * pmap_mapdev support pre initialization (i.e. console).
 * Each table entry records one mapping as a physical address, a
 * virtual address, a size and a mode.
 */
#define PMAP_PREINIT_MAPPING_COUNT      8
static struct pmap_preinit_mapping {
        vm_paddr_t      pa;
        vm_offset_t     va;
        vm_size_t       sz;
        int             mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;

/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv_list_locks[]
 * elements, but reads are not.
 * pv_list_locks[] and pv_invl_gen[] both have NPV_LIST_LOCKS elements
 * and are indexed by pa_index(pa) % NPV_LIST_LOCKS.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx __exclusive_cache_line pv_chunks_mutex;
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
static struct mtx qframe_mtx;

static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */

/* Exported as the read-only tunable vm.pmap.pcid_enabled. */
int pmap_pcid_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
/* Exported read-only as vm.pmap.invpcid_works. */
int invpcid_works = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
    "Is the invpcid instruction available ?");

/* Exported as the read-only tunable vm.pmap.pti. */
int __read_frequently pti = 0;
SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pti, 0,
    "Page Table Isolation enabled");
/* NOTE(review): presumably state used by the pmap_pti_*() helpers. */
static vm_object_t pti_obj;
static pml4_entry_t *pti_pml4;
static vm_pindex_t pti_pg_idx;
static bool pti_finalized;
 427
 428 static int
 429 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 430 {
 431         int i;
 432         uint64_t res;
 433
 434         res = 0;
 435         CPU_FOREACH(i) {
 436                 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
 437         }
 438         return (sysctl_handle_64(oidp, &res, 0, req));
 439 }
 440 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
 441     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
 442     "Count of saved TLB context on switch");
 443
/*
 * Delayed invalidation (DI) bookkeeping.  pmap_invl_gen_tracker lists
 * the per-thread pmap_invl_gen records of the DI blocks in progress;
 * the list is modified with invl_gen_mtx held.  pmap_invl_gen is the
 * global DI generation number that pmap_delayed_invl_wait() compares
 * against.
 */
static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
static struct mtx invl_gen_mtx;
static u_long pmap_invl_gen = 0;
/* Fake lock object to satisfy turnstiles interface. */
static struct lock_object invl_gen_ts = {
        .lo_name = "invlts",
};
 452
 453 static bool
 454 pmap_not_in_di(void)
 455 {
 456
 457         return (curthread->td_md.md_invl_gen.gen == 0);
 458 }
 459
 460 #define PMAP_ASSERT_NOT_IN_DI() \
 461     KASSERT(pmap_not_in_di(), ("DI already started"))
 462
/*
 * Start a new Delayed Invalidation (DI) block of code, executed by
 * the current thread.  Within a DI block, the current thread may
 * destroy both the page table and PV list entries for a mapping and
 * then release the corresponding PV list lock before ensuring that
 * the mapping is flushed from the TLBs of any processors with the
 * pmap active.
 *
 * The thread's DI record is given a generation number one greater than
 * that of the record at the head of pmap_invl_gen_tracker, or one
 * greater than the global pmap_invl_gen when the list is empty, and is
 * then inserted at the head of the list.  Both steps are performed
 * with invl_gen_mtx held.
 */
static void
pmap_delayed_invl_started(void)
{
        struct pmap_invl_gen *invl_gen;
        u_long currgen;

        invl_gen = &curthread->td_md.md_invl_gen;
        PMAP_ASSERT_NOT_IN_DI();
        mtx_lock(&invl_gen_mtx);
        /* Base the new generation on the most recently started DI. */
        if (LIST_EMPTY(&pmap_invl_gen_tracker))
                currgen = pmap_invl_gen;
        else
                currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
        invl_gen->gen = currgen + 1;
        LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
        mtx_unlock(&invl_gen_mtx);
}
 488
/*
 * Finish the DI block, previously started by the current thread.  All
 * required TLB flushes for the pages marked by
 * pmap_delayed_invl_page() must be finished before this function is
 * called.
 *
 * This function works by bumping the global DI generation number to
 * the generation number of the current thread's DI, unless there is a
 * pending DI that started earlier.  In the latter case, bumping the
 * global DI generation number would incorrectly signal that the
 * earlier DI had finished.  Instead, this function bumps the earlier
 * DI's generation number to match the generation number of the
 * current thread's DI.
 */
static void
pmap_delayed_invl_finished(void)
{
        struct pmap_invl_gen *invl_gen, *next;
        struct turnstile *ts;

        invl_gen = &curthread->td_md.md_invl_gen;
        KASSERT(invl_gen->gen != 0, ("missed invl_started"));
        mtx_lock(&invl_gen_mtx);
        /*
         * New records are inserted at the list head, so the successor
         * of this record, if any, belongs to a DI that started earlier.
         */
        next = LIST_NEXT(invl_gen, link);
        if (next == NULL) {
                /*
                 * No earlier DI is pending: publish this DI's
                 * generation as the global one and wake every thread
                 * blocked in pmap_delayed_invl_wait().  The update is
                 * made with the turnstile chain locked.
                 */
                turnstile_chain_lock(&invl_gen_ts);
                ts = turnstile_lookup(&invl_gen_ts);
                pmap_invl_gen = invl_gen->gen;
                if (ts != NULL) {
                        turnstile_broadcast(ts, TS_SHARED_QUEUE);
                        turnstile_unpend(ts);
                }
                turnstile_chain_unlock(&invl_gen_ts);
        } else {
                /* Hand this DI's generation over to the earlier DI. */
                next->gen = invl_gen->gen;
        }
        LIST_REMOVE(invl_gen, link);
        mtx_unlock(&invl_gen_mtx);
        /* A zero generation marks the thread as not being in a DI block. */
        invl_gen->gen = 0;
}
 529
#ifdef PV_STATS
/*
 * Statistic, compiled in only with PV_STATS: incremented at most once
 * per pmap_delayed_invl_wait() call that finds it has to wait.
 */
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
#endif
 535
 536 static u_long *
 537 pmap_delayed_invl_genp(vm_page_t m)
 538 {
 539
 540         return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
 541 }
 542
/*
 * Ensure that all currently executing DI blocks that need to flush
 * the TLB for the given page m have actually flushed the TLB by the
 * time this function returns.  If the page m has an empty PV list and
 * we call pmap_delayed_invl_wait(), upon its return we know that no
 * CPU has a valid mapping for the page m in either its page table or
 * TLB.
 *
 * This function works by blocking until the global DI generation
 * number catches up with the generation number associated with the
 * given page m and its PV list.  Since this function's callers
 * typically own an object lock and sometimes own a page lock, it
 * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 * processor.
 */
static void
pmap_delayed_invl_wait(vm_page_t m)
{
        struct turnstile *ts;
        u_long *m_gen;
#ifdef PV_STATS
        bool accounted = false;
#endif

        m_gen = pmap_delayed_invl_genp(m);
        /* Both generation numbers are read here without holding a lock. */
        while (*m_gen > pmap_invl_gen) {
#ifdef PV_STATS
                /* Count this call once, however many times it blocks. */
                if (!accounted) {
                        atomic_add_long(&invl_wait, 1);
                        accounted = true;
                }
#endif
                ts = turnstile_trywait(&invl_gen_ts);
                /*
                 * Re-test the condition after turnstile_trywait() and
                 * only block if it still holds; otherwise back out.
                 * NOTE(review): presumably this closes the race with
                 * pmap_delayed_invl_finished(), which updates
                 * pmap_invl_gen with the turnstile chain locked --
                 * confirm against turnstile(9).
                 */
                if (*m_gen > pmap_invl_gen)
                        turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
                else
                        turnstile_cancel(ts);
        }
}
 581
 582 /*
 583  * Mark the page m's PV list as participating in the current thread's
 584  * DI block.  Any threads concurrently using m's PV list to remove or
 585  * restrict all mappings to m will wait for the current thread's DI
 586  * block to complete before proceeding.
 587  *
 588  * The function works by setting the DI generation number for m's PV
 589  * list to at least the DI generation number of the current thread.
 590  * This forces a caller of pmap_delayed_invl_wait() to block until
 591  * current thread calls pmap_delayed_invl_finished().
 592  */
 593 static void
 594 pmap_delayed_invl_page(vm_page_t m)
 595 {
 596         u_long gen, *m_gen;
 597
 598         rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
 599         gen = curthread->td_md.md_invl_gen.gen;
 600         if (gen == 0)
 601                 return;
 602         m_gen = pmap_delayed_invl_genp(m);
 603         if (*m_gen < gen)
 604                 *m_gen = gen;
 605 }
 606
/*
 * Crashdump maps.
 */
static caddr_t crashdumpmap;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
#define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */

/*
 * Forward declarations of file-local functions.
 */

/* PV entry and PV chunk management. */
static void     free_pv_chunk(struct pv_chunk *pc);
static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static int      popcnt_pc_map_pq(uint64_t *map);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void     reserve_pv_entries(pmap_t pmap, int needed,
                    struct rwlock **lockp);
static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
                    struct rwlock **lockp);
static bool     pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
                    u_int flags, struct rwlock **lockp);
#if VM_NRESERVLEVEL > 0
static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
                    struct rwlock **lockp);
#endif
static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
                    vm_offset_t va);

/* Mapping manipulation helpers. */
static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
    vm_offset_t va, struct rwlock **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
    vm_offset_t va);
static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
                    vm_prot_t prot, struct rwlock **lockp);
static int      pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
                    u_int flags, vm_page_t m, struct rwlock **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_cache_range_all(vm_offset_t sva,
    vm_offset_t eva);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
                    pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
    struct rwlock **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
    vm_prot_t prot);
static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
    bool exec);
static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
    struct spglist *free, struct rwlock **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
    pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    struct spglist *free);
static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
                    pd_entry_t *pde, struct spglist *free,
                    struct rwlock **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
    pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);

/* Page table page allocation. */
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
                struct rwlock **lockp);
static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
                struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
                struct rwlock **lockp);

/* Page table page release. */
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 697
 698 /********************/
 699 /* Inline functions */
 700 /********************/
 701
 702 /* Return a non-clipped PD index for a given VA */
 703 static __inline vm_pindex_t
 704 pmap_pde_pindex(vm_offset_t va)
 705 {
 706         return (va >> PDRSHIFT);
 707 }
 708
 709
 710 /* Return a pointer to the PML4 slot that corresponds to a VA */
 711 static __inline pml4_entry_t *
 712 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 713 {
 714
 715         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
 716 }
 717
 718 /* Return a pointer to the PDP slot that corresponds to a VA */
 719 static __inline pdp_entry_t *
 720 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 721 {
 722         pdp_entry_t *pdpe;
 723
 724         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 725         return (&pdpe[pmap_pdpe_index(va)]);
 726 }
 727
 728 /* Return a pointer to the PDP slot that corresponds to a VA */
 729 static __inline pdp_entry_t *
 730 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 731 {
 732         pml4_entry_t *pml4e;
 733         pt_entry_t PG_V;
 734
 735         PG_V = pmap_valid_bit(pmap);
 736         pml4e = pmap_pml4e(pmap, va);
 737         if ((*pml4e & PG_V) == 0)
 738                 return (NULL);
 739         return (pmap_pml4e_to_pdpe(pml4e, va));
 740 }
 741
 742 /* Return a pointer to the PD slot that corresponds to a VA */
 743 static __inline pd_entry_t *
 744 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 745 {
 746         pd_entry_t *pde;
 747
 748         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
 749         return (&pde[pmap_pde_index(va)]);
 750 }
 751
 752 /* Return a pointer to the PD slot that corresponds to a VA */
 753 static __inline pd_entry_t *
 754 pmap_pde(pmap_t pmap, vm_offset_t va)
 755 {
 756         pdp_entry_t *pdpe;
 757         pt_entry_t PG_V;
 758
 759         PG_V = pmap_valid_bit(pmap);
 760         pdpe = pmap_pdpe(pmap, va);
 761         if (pdpe == NULL || (*pdpe & PG_V) == 0)
 762                 return (NULL);
 763         return (pmap_pdpe_to_pde(pdpe, va));
 764 }
 765
 766 /* Return a pointer to the PT slot that corresponds to a VA */
 767 static __inline pt_entry_t *
 768 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 769 {
 770         pt_entry_t *pte;
 771
 772         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 773         return (&pte[pmap_pte_index(va)]);
 774 }
 775
 776 /* Return a pointer to the PT slot that corresponds to a VA */
 777 static __inline pt_entry_t *
 778 pmap_pte(pmap_t pmap, vm_offset_t va)
 779 {
 780         pd_entry_t *pde;
 781         pt_entry_t PG_V;
 782
 783         PG_V = pmap_valid_bit(pmap);
 784         pde = pmap_pde(pmap, va);
 785         if (pde == NULL || (*pde & PG_V) == 0)
 786                 return (NULL);
 787         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
 788                 return ((pt_entry_t *)pde);
 789         return (pmap_pde_to_pte(pde, va));
 790 }
 791
 792 static __inline void
 793 pmap_resident_count_inc(pmap_t pmap, int count)
 794 {
 795
 796         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 797         pmap->pm_stats.resident_count += count;
 798 }
 799
 800 static __inline void
 801 pmap_resident_count_dec(pmap_t pmap, int count)
 802 {
 803
 804         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 805         KASSERT(pmap->pm_stats.resident_count >= count,
 806             ("pmap %p resident count underflow %ld %d", pmap,
 807             pmap->pm_stats.resident_count, count));
 808         pmap->pm_stats.resident_count -= count;
 809 }
 810
 811 PMAP_INLINE pt_entry_t *
 812 vtopte(vm_offset_t va)
 813 {
 814         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 815
 816         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 817
 818         return (PTmap + ((va >> PAGE_SHIFT) & mask));
 819 }
 820
 821 static __inline pd_entry_t *
 822 vtopde(vm_offset_t va)
 823 {
 824         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 825
 826         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 827
 828         return (PDmap + ((va >> PDRSHIFT) & mask));
 829 }
 830
 831 static u_int64_t
 832 allocpages(vm_paddr_t *firstaddr, int n)
 833 {
 834         u_int64_t ret;
 835
 836         ret = *firstaddr;
 837         bzero((void *)ret, n * PAGE_SIZE);
 838         *firstaddr += n * PAGE_SIZE;
 839         return (ret);
 840 }
 841
 842 CTASSERT(powerof2(NDMPML4E));
 843
 844 /* number of kernel PDP slots */
 845 #define NKPDPE(ptpgs)           howmany(ptpgs, NPDEPG)
 846
 847 static void
 848 nkpt_init(vm_paddr_t addr)
 849 {
 850         int pt_pages;
 851
 852 #ifdef NKPT
 853         pt_pages = NKPT;
 854 #else
 855         pt_pages = howmany(addr, 1 << PDRSHIFT);
 856         pt_pages += NKPDPE(pt_pages);
 857
 858         /*
 859          * Add some slop beyond the bare minimum required for bootstrapping
 860          * the kernel.
 861          *
 862          * This is quite important when allocating KVA for kernel modules.
 863          * The modules are required to be linked in the negative 2GB of
 864          * the address space.  If we run out of KVA in this region then
 865          * pmap_growkernel() will need to allocate page table pages to map
 866          * the entire 512GB of KVA space which is an unnecessary tax on
 867          * physical memory.
 868          *
 869          * Secondly, device memory mapped as part of setting up the low-
 870          * level console(s) is taken from KVA, starting at virtual_avail.
 871          * This is because cninit() is called after pmap_bootstrap() but
 872          * before vm_init() and pmap_init(). 20MB for a frame buffer is
 873          * not uncommon.
 874          */
 875         pt_pages += 32;         /* 64MB additional slop. */
 876 #endif
 877         nkpt = pt_pages;
 878 }
 879
 880 /*
 881  * Returns the proper write/execute permission for a physical page that is
 882  * part of the initial boot allocations.
 883  *
 884  * If the page has kernel text, it is marked as read-only. If the page has
 885  * kernel read-only data, it is marked as read-only/not-executable. If the
 886  * page has only read-write data, it is marked as read-write/not-executable.
 887  * If the page is below/above the kernel range, it is marked as read-write.
 888  *
 889  * This function operates on 2M pages, since we map the kernel space that
 890  * way.
 891  *
 892  * Note that this doesn't currently provide any protection for modules.
 893  */
 894 static inline pt_entry_t
 895 bootaddr_rwx(vm_paddr_t pa)
 896 {
 897
 898         /*
 899          * Everything in the same 2M page as the start of the kernel
 900          * should be static. On the other hand, things in the same 2M
 901          * page as the end of the kernel could be read-write/executable,
 902          * as the kernel image is not guaranteed to end on a 2M boundary.
 903          */
 904         if (pa < trunc_2mpage(btext - KERNBASE) ||
 905            pa >= trunc_2mpage(_end - KERNBASE))
 906                 return (X86_PG_RW);
 907         /*
 908          * The linker should ensure that the read-only and read-write
 909          * portions don't share the same 2M page, so this shouldn't
 910          * impact read-only data. However, in any case, any page with
 911          * read-write data needs to be read-write.
 912          */
 913         if (pa >= trunc_2mpage(brwsection - KERNBASE))
 914                 return (X86_PG_RW | pg_nx);
 915         /*
 916          * Mark any 2M page containing kernel text as read-only. Mark
 917          * other pages with read-only data as read-only and not executable.
 918          * (It is likely a small portion of the read-only data section will
 919          * be marked as read-only, but executable. This should be acceptable
 920          * since the read-only protection will keep the data from changing.)
 921          * Note that fixups to the .text section will still work until we
 922          * set CR0.WP.
 923          */
 924         if (pa < round_2mpage(etext - KERNBASE))
 925                 return (0);
 926         return (pg_nx);
 927 }
 928
 929 static void
 930 create_pagetables(vm_paddr_t *firstaddr)
 931 {
 932         int i, j, ndm1g, nkpdpe, nkdmpde;
 933         pt_entry_t *pt_p;
 934         pd_entry_t *pd_p;
 935         pdp_entry_t *pdp_p;
 936         pml4_entry_t *p4_p;
 937         uint64_t DMPDkernphys;
 938
 939         /* Allocate page table pages for the direct map */
 940         ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 941         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
 942                 ndmpdp = 4;
 943         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
 944         if (ndmpdpphys > NDMPML4E) {
 945                 /*
 946                  * Each NDMPML4E allows 512 GB, so limit to that,
 947                  * and then readjust ndmpdp and ndmpdpphys.
 948                  */
 949                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
 950                 Maxmem = atop(NDMPML4E * NBPML4);
 951                 ndmpdpphys = NDMPML4E;
 952                 ndmpdp = NDMPML4E * NPDEPG;
 953         }
 954         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 955         ndm1g = 0;
 956         if ((amd_feature & AMDID_PAGE1GB) != 0) {
 957                 /*
 958                  * Calculate the number of 1G pages that will fully fit in
 959                  * Maxmem.
 960                  */
 961                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
 962
 963                 /*
 964                  * Allocate 2M pages for the kernel. These will be used in
 965                  * place of the first one or more 1G pages from ndm1g.
 966                  */
 967                 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
 968                 DMPDkernphys = allocpages(firstaddr, nkdmpde);
 969         }
 970         if (ndm1g < ndmpdp)
 971                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 972         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 973
 974         /* Allocate pages */
 975         KPML4phys = allocpages(firstaddr, 1);
 976         KPDPphys = allocpages(firstaddr, NKPML4E);
 977
 978         /*
 979          * Allocate the initial number of kernel page table pages required to
 980          * bootstrap.  We defer this until after all memory-size dependent
 981          * allocations are done (e.g. direct map), so that we don't have to
 982          * build in too much slop in our estimate.
 983          *
 984          * Note that when NKPML4E > 1, we have an empty page underneath
 985          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
 986          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 987          */
 988         nkpt_init(*firstaddr);
 989         nkpdpe = NKPDPE(nkpt);
 990
 991         KPTphys = allocpages(firstaddr, nkpt);
 992         KPDphys = allocpages(firstaddr, nkpdpe);
 993
 994         /* Fill in the underlying page table pages */
 995         /* XXX not fully used, underneath 2M pages */
 996         pt_p = (pt_entry_t *)KPTphys;
 997         for (i = 0; ptoa(i) < *firstaddr; i++)
 998                 pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));
 999
1000         /* Now map the page tables at their location within PTmap */
1001         pd_p = (pd_entry_t *)KPDphys;
1002         for (i = 0; i < nkpt; i++)
1003                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1004
1005         /* Map from zero to end of allocations under 2M pages */
1006         /* This replaces some of the KPTphys entries above */
1007         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
1008                 /* Preset PG_M and PG_A because demotion expects it. */
1009                 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1010                     X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
1011
1012         /*
1013          * Because we map the physical blocks in 2M pages, adjust firstaddr
1014          * to record the physical blocks we've actually mapped into kernel
1015          * virtual address space.
1016          */
1017         *firstaddr = round_2mpage(*firstaddr);
1018
1019         /* And connect up the PD to the PDP (leaving room for L4 pages) */
1020         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1021         for (i = 0; i < nkpdpe; i++)
1022                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1023
1024         /*
1025          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
1026          * the end of physical memory is not aligned to a 1GB page boundary,
1027          * then the residual physical memory is mapped with 2MB pages.  Later,
1028          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1029          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1030          * that are partially used.
1031          */
1032         pd_p = (pd_entry_t *)DMPDphys;
1033         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1034                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1035                 /* Preset PG_M and PG_A because demotion expects it. */
1036                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1037                     X86_PG_M | X86_PG_A | pg_nx;
1038         }
1039         pdp_p = (pdp_entry_t *)DMPDPphys;
1040         for (i = 0; i < ndm1g; i++) {
1041                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1042                 /* Preset PG_M and PG_A because demotion expects it. */
1043                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1044                     X86_PG_M | X86_PG_A | pg_nx;
1045         }
1046         for (j = 0; i < ndmpdp; i++, j++) {
1047                 pdp_p[i] = DMPDphys + ptoa(j);
1048                 pdp_p[i] |= X86_PG_RW | X86_PG_V;
1049         }
1050
1051         /*
1052          * Instead of using a 1G page for the memory containing the kernel,
1053          * use 2M pages with appropriate permissions. (If using 1G pages,
1054          * this will partially overwrite the PDPEs above.)
1055          */
1056         if (ndm1g) {
1057                 pd_p = (pd_entry_t *)DMPDkernphys;
1058                 for (i = 0; i < (NPDEPG * nkdmpde); i++)
1059                         pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1060                             X86_PG_M | X86_PG_A | pg_nx |
1061                             bootaddr_rwx(i << PDRSHIFT);
1062                 for (i = 0; i < nkdmpde; i++)
1063                         pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
1064                             X86_PG_V;
1065         }
1066
1067         /* And recursively map PML4 to itself in order to get PTmap */
1068         p4_p = (pml4_entry_t *)KPML4phys;
1069         p4_p[PML4PML4I] = KPML4phys;
1070         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
1071
1072         /* Connect the Direct Map slot(s) up to the PML4. */
1073         for (i = 0; i < ndmpdpphys; i++) {
1074                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
1075                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V;
1076         }
1077
1078         /* Connect the KVA slots up to the PML4 */
1079         for (i = 0; i < NKPML4E; i++) {
1080                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
1081                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
1082         }
1083 }
1084
1085 /*
1086  *      Bootstrap the system enough to run with virtual memory.
1087  *
1088  *      On amd64 this is called after mapping has already been enabled
1089  *      and just syncs the pmap module with what has already been done.
1090  *      [We can't call it easily with mapping off since the kernel is not
1091  *      mapped with PA == VA, hence we would have to relocate every address
1092  *      from the linked base (virtual) address "KERNBASE" to the actual
1093  *      (physical) address starting relative to 0]
1094  */
1095 void
1096 pmap_bootstrap(vm_paddr_t *firstaddr)
1097 {
1098         vm_offset_t va;
1099         pt_entry_t *pte;
1100         uint64_t cr4;
1101         int i;
1102
1103         KERNend = *firstaddr;
1104
1105         if (!pti)
1106                 pg_g = X86_PG_G;
1107
1108         /*
1109          * Create an initial set of page tables to run the kernel in.
1110          */
1111         create_pagetables(firstaddr);
1112
1113         /*
1114          * Add a physical memory segment (vm_phys_seg) corresponding to the
1115          * preallocated kernel page table pages so that vm_page structures
1116          * representing these pages will be created.  The vm_page structures
1117          * are required for promotion of the corresponding kernel virtual
1118          * addresses to superpage mappings.
1119          */
1120         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1121
1122         virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
1123
1124         virtual_end = VM_MAX_KERNEL_ADDRESS;
1125
1126
1127         /*
1128          * Enable PG_G global pages, then switch to the kernel page
1129          * table from the bootstrap page table.  After the switch, it
1130          * is possible to enable SMEP and SMAP since PG_U bits are
1131          * correct now.
1132          */
1133         cr4 = rcr4();
1134         cr4 |= CR4_PGE;
1135         load_cr4(cr4);
1136         load_cr3(KPML4phys);
1137         if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
1138                 cr4 |= CR4_SMEP;
1139         if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
1140                 cr4 |= CR4_SMAP;
1141         load_cr4(cr4);
1142
1143         /*
1144          * Initialize the kernel pmap (which is statically allocated).
1145          */
1146         PMAP_LOCK_INIT(kernel_pmap);
1147         kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
1148         kernel_pmap->pm_cr3 = KPML4phys;
1149         kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
1150         CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
1151         TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1152         kernel_pmap->pm_flags = pmap_flags;
1153
1154         /*
1155          * Initialize the TLB invalidations generation number lock.
1156          */
1157         mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
1158
1159         /*
1160          * Reserve some special page table entries/VA space for temporary
1161          * mapping of pages.
1162          */
1163 #define SYSMAP(c, p, v, n)      \
1164         v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
1165
1166         va = virtual_avail;
1167         pte = vtopte(va);
1168
1169         /*
1170          * Crashdump maps.  The first page is reused as CMAP1 for the
1171          * memory test.
1172          */
1173         SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
1174         CADDR1 = crashdumpmap;
1175
1176         virtual_avail = va;
1177
1178         /*
1179          * Initialize the PAT MSR.
1180          * pmap_init_pat() clears and sets CR4_PGE, which, as a
1181          * side-effect, invalidates stale PG_G TLB entries that might
1182          * have been created in our pre-boot environment.
1183          */
1184         pmap_init_pat();
1185
1186         /* Initialize TLB Context Id. */
1187         if (pmap_pcid_enabled) {
1188                 for (i = 0; i < MAXCPU; i++) {
1189                         kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
1190                         kernel_pmap->pm_pcids[i].pm_gen = 1;
1191                 }
1192
1193                 /*
1194                  * PMAP_PCID_KERN + 1 is used for initialization of
1195                  * proc0 pmap.  The pmap' pcid state might be used by
1196                  * EFIRT entry before first context switch, so it
1197                  * needs to be valid.
1198                  */
1199                 PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
1200                 PCPU_SET(pcid_gen, 1);
1201
1202                 /*
1203                  * pcpu area for APs is zeroed during AP startup.
1204                  * pc_pcid_next and pc_pcid_gen are initialized by AP
1205                  * during pcpu setup.
1206                  */
1207                 load_cr4(rcr4() | CR4_PCIDE);
1208         }
1209 }
1210
1211 /*
1212  * Setup the PAT MSR.
1213  */
1214 void
1215 pmap_init_pat(void)
1216 {
1217         int pat_table[PAT_INDEX_SIZE];
1218         uint64_t pat_msr;
1219         u_long cr0, cr4;
1220         int i;
1221
1222         /* Bail if this CPU doesn't implement PAT. */
1223         if ((cpu_feature & CPUID_PAT) == 0)
1224                 panic("no PAT??");
1225
1226         /* Set default PAT index table. */
1227         for (i = 0; i < PAT_INDEX_SIZE; i++)
1228                 pat_table[i] = -1;
1229         pat_table[PAT_WRITE_BACK] = 0;
1230         pat_table[PAT_WRITE_THROUGH] = 1;
1231         pat_table[PAT_UNCACHEABLE] = 3;
1232         pat_table[PAT_WRITE_COMBINING] = 3;
1233         pat_table[PAT_WRITE_PROTECTED] = 3;
1234         pat_table[PAT_UNCACHED] = 3;
1235
1236         /* Initialize default PAT entries. */
1237         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
1238             PAT_VALUE(1, PAT_WRITE_THROUGH) |
1239             PAT_VALUE(2, PAT_UNCACHED) |
1240             PAT_VALUE(3, PAT_UNCACHEABLE) |
1241             PAT_VALUE(4, PAT_WRITE_BACK) |
1242             PAT_VALUE(5, PAT_WRITE_THROUGH) |
1243             PAT_VALUE(6, PAT_UNCACHED) |
1244             PAT_VALUE(7, PAT_UNCACHEABLE);
1245
1246         if (pat_works) {
1247                 /*
1248                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
1249                  * Program 5 and 6 as WP and WC.
1250                  * Leave 4 and 7 as WB and UC.
1251                  */
1252                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
1253                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
1254                     PAT_VALUE(6, PAT_WRITE_COMBINING);
1255                 pat_table[PAT_UNCACHED] = 2;
1256                 pat_table[PAT_WRITE_PROTECTED] = 5;
1257                 pat_table[PAT_WRITE_COMBINING] = 6;
1258         } else {
1259                 /*
1260                  * Just replace PAT Index 2 with WC instead of UC-.
1261                  */
1262                 pat_msr &= ~PAT_MASK(2);
1263                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
1264                 pat_table[PAT_WRITE_COMBINING] = 2;
1265         }
1266
1267         /* Disable PGE. */
1268         cr4 = rcr4();
1269         load_cr4(cr4 & ~CR4_PGE);
1270
1271         /* Disable caches (CD = 1, NW = 0). */
1272         cr0 = rcr0();
1273         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1274
1275         /* Flushes caches and TLBs. */
1276         wbinvd();
1277         invltlb();
1278
1279         /* Update PAT and index table. */
1280         wrmsr(MSR_PAT, pat_msr);
1281         for (i = 0; i < PAT_INDEX_SIZE; i++)
1282                 pat_index[i] = pat_table[i];
1283
1284         /* Flush caches and TLBs again. */
1285         wbinvd();
1286         invltlb();
1287
1288         /* Restore caches and PGE. */
1289         load_cr0(cr0);
1290         load_cr4(cr4);
1291 }
1292
1293 /*
1294  *      Initialize a vm_page's machine-dependent fields.
1295  */
1296 void
1297 pmap_page_init(vm_page_t m)
1298 {
1299
1300         TAILQ_INIT(&m->md.pv_list);
1301         m->md.pat_mode = PAT_WRITE_BACK;
1302 }
1303
1304 /*
1305  *      Initialize the pmap module.
1306  *      Called by vm_init, to initialize any structures that the pmap
1307  *      system needs to map virtual memory.
1308  */
1309 void
1310 pmap_init(void)
1311 {
1312         struct pmap_preinit_mapping *ppim;
1313         vm_page_t mpte;
1314         vm_size_t s;
1315         int error, i, pv_npg, ret, skz63;
1316
1317         /* L1TF, reserve page @0 unconditionally */
1318         vm_page_blacklist_add(0, bootverbose);
1319
1320         /* Detect bare-metal Skylake Server and Skylake-X. */
1321         if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
1322             CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
1323                 /*
1324                  * Skylake-X errata SKZ63. Processor May Hang When
1325                  * Executing Code In an HLE Transaction Region between
1326                  * 40000000H and 403FFFFFH.
1327                  *
1328                  * Mark the pages in the range as preallocated.  It
1329                  * seems to be impossible to distinguish between
1330                  * Skylake Server and Skylake X.
1331                  */
1332                 skz63 = 1;
1333                 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
1334                 if (skz63 != 0) {
1335                         if (bootverbose)
1336                                 printf("SKZ63: skipping 4M RAM starting "
1337                                     "at physical 1G\n");
1338                         for (i = 0; i < atop(0x400000); i++) {
1339                                 ret = vm_page_blacklist_add(0x40000000 +
1340                                     ptoa(i), FALSE);
1341                                 if (!ret && bootverbose)
1342                                         printf("page at %#lx already used\n",
1343                                             0x40000000 + ptoa(i));
1344                         }
1345                 }
1346         }
1347
1348         /*
1349          * Initialize the vm page array entries for the kernel pmap's
1350          * page table pages.
1351          */
1352         PMAP_LOCK(kernel_pmap);
1353         for (i = 0; i < nkpt; i++) {
1354                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1355                 KASSERT(mpte >= vm_page_array &&
1356                     mpte < &vm_page_array[vm_page_array_size],
1357                     ("pmap_init: page table page is out of range"));
1358                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1359                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1360                 mpte->wire_count = 1;
1361                 if (i << PDRSHIFT < KERNend &&
1362                     pmap_insert_pt_page(kernel_pmap, mpte))
1363                         panic("pmap_init: pmap_insert_pt_page failed");
1364         }
1365         PMAP_UNLOCK(kernel_pmap);
1366         vm_wire_add(nkpt);
1367
1368         /*
1369          * If the kernel is running on a virtual machine, then it must assume
1370          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1371          * be prepared for the hypervisor changing the vendor and family that
1372          * are reported by CPUID.  Consequently, the workaround for AMD Family
1373          * 10h Erratum 383 is enabled if the processor's feature set does not
1374          * include at least one feature that is only supported by older Intel
1375          * or newer AMD processors.
1376          */
1377         if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
1378             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1379             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1380             AMDID2_FMA4)) == 0)
1381                 workaround_erratum383 = 1;
1382
1383         /*
1384          * Are large page mappings enabled?
1385          */
1386         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1387         if (pg_ps_enabled) {
1388                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1389                     ("pmap_init: can't assign to pagesizes[1]"));
1390                 pagesizes[1] = NBPDR;
1391         }
1392
1393         /*
1394          * Initialize the pv chunk list mutex.
1395          */
1396         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1397
1398         /*
1399          * Initialize the pool of pv list locks.
1400          */
1401         for (i = 0; i < NPV_LIST_LOCKS; i++)
1402                 rw_init(&pv_list_locks[i], "pmap pv list");
1403
1404         /*
1405          * Calculate the size of the pv head table for superpages.
1406          */
1407         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1408
1409         /*
1410          * Allocate memory for the pv head table for superpages.
1411          */
1412         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1413         s = round_page(s);
1414         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1415         for (i = 0; i < pv_npg; i++)
1416                 TAILQ_INIT(&pv_table[i].pv_list);
1417         TAILQ_INIT(&pv_dummy.pv_list);
1418
1419         pmap_initialized = 1;
1420         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1421                 ppim = pmap_preinit_mapping + i;
1422                 if (ppim->va == 0)
1423                         continue;
1424                 /* Make the direct map consistent */
1425                 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
1426                         (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1427                             ppim->sz, ppim->mode);
1428                 }
1429                 if (!bootverbose)
1430                         continue;
1431                 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1432                     ppim->pa, ppim->va, ppim->sz, ppim->mode);
1433         }
1434
1435         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
1436         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
1437             (vmem_addr_t *)&qframe);
1438         if (error != 0)
1439                 panic("qframe allocation failed");
1440 }
1441
1442 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
1443     "2MB page mapping counters");
1444
1445 static u_long pmap_pde_demotions;
1446 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
1447     &pmap_pde_demotions, 0, "2MB page demotions");
1448
1449 static u_long pmap_pde_mappings;
1450 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
1451     &pmap_pde_mappings, 0, "2MB page mappings");
1452
1453 static u_long pmap_pde_p_failures;
1454 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
1455     &pmap_pde_p_failures, 0, "2MB page promotion failures");
1456
1457 static u_long pmap_pde_promotions;
1458 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
1459     &pmap_pde_promotions, 0, "2MB page promotions");
1460
1461 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
1462     "1GB page mapping counters");
1463
1464 static u_long pmap_pdpe_demotions;
1465 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
1466     &pmap_pdpe_demotions, 0, "1GB page demotions");
1467
1468 /***************************************************
1469  * Low level helper routines.....
1470  ***************************************************/
1471
1472 static pt_entry_t
1473 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1474 {
1475         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1476
1477         switch (pmap->pm_type) {
1478         case PT_X86:
1479         case PT_RVI:
1480                 /* Verify that both PAT bits are not set at the same time */
1481                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1482                     ("Invalid PAT bits in entry %#lx", entry));
1483
1484                 /* Swap the PAT bits if one of them is set */
1485                 if ((entry & x86_pat_bits) != 0)
1486                         entry ^= x86_pat_bits;
1487                 break;
1488         case PT_EPT:
1489                 /*
1490                  * Nothing to do - the memory attributes are represented
1491                  * the same way for regular pages and superpages.
1492                  */
1493                 break;
1494         default:
1495                 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1496         }
1497
1498         return (entry);
1499 }
1500
1501 boolean_t
1502 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
1503 {
1504
1505         return (mode >= 0 && mode < PAT_INDEX_SIZE &&
1506             pat_index[(int)mode] >= 0);
1507 }
1508
1509 /*
1510  * Determine the appropriate bits to set in a PTE or PDE for a specified
1511  * caching mode.
1512  */
1513 int
1514 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1515 {
1516         int cache_bits, pat_flag, pat_idx;
1517
1518         if (!pmap_is_valid_memattr(pmap, mode))
1519                 panic("Unknown caching mode %d\n", mode);
1520
1521         switch (pmap->pm_type) {
1522         case PT_X86:
1523         case PT_RVI:
1524                 /* The PAT bit is different for PTE's and PDE's. */
1525                 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1526
1527                 /* Map the caching mode to a PAT index. */
1528                 pat_idx = pat_index[mode];
1529
1530                 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1531                 cache_bits = 0;
1532                 if (pat_idx & 0x4)
1533                         cache_bits |= pat_flag;
1534                 if (pat_idx & 0x2)
1535                         cache_bits |= PG_NC_PCD;
1536                 if (pat_idx & 0x1)
1537                         cache_bits |= PG_NC_PWT;
1538                 break;
1539
1540         case PT_EPT:
1541                 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1542                 break;
1543
1544         default:
1545                 panic("unsupported pmap type %d", pmap->pm_type);
1546         }
1547
1548         return (cache_bits);
1549 }
1550
1551 static int
1552 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1553 {
1554         int mask;
1555
1556         switch (pmap->pm_type) {
1557         case PT_X86:
1558         case PT_RVI:
1559                 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1560                 break;
1561         case PT_EPT:
1562                 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1563                 break;
1564         default:
1565                 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1566         }
1567
1568         return (mask);
1569 }
1570
1571 bool
1572 pmap_ps_enabled(pmap_t pmap)
1573 {
1574
1575         return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1576 }
1577
1578 static void
1579 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1580 {
1581
1582         switch (pmap->pm_type) {
1583         case PT_X86:
1584                 break;
1585         case PT_RVI:
1586         case PT_EPT:
1587                 /*
1588                  * XXX
1589                  * This is a little bogus since the generation number is
1590                  * supposed to be bumped up when a region of the address
1591                  * space is invalidated in the page tables.
1592                  *
1593                  * In this case the old PDE entry is valid but yet we want
1594                  * to make sure that any mappings using the old entry are
1595                  * invalidated in the TLB.
1596                  *
1597                  * The reason this works as expected is because we rendezvous
1598                  * "all" host cpus and force any vcpu context to exit as a
1599                  * side-effect.
1600                  */
1601                 atomic_add_acq_long(&pmap->pm_eptgen, 1);
1602                 break;
1603         default:
1604                 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1605         }
1606         pde_store(pde, newpde);
1607 }
1608
1609 /*
1610  * After changing the page size for the specified virtual address in the page
1611  * table, flush the corresponding entries from the processor's TLB.  Only the
1612  * calling processor's TLB is affected.
1613  *
1614  * The calling thread must be pinned to a processor.
1615  */
1616 static void
1617 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1618 {
1619         pt_entry_t PG_G;
1620
1621         if (pmap_type_guest(pmap))
1622                 return;
1623
1624         KASSERT(pmap->pm_type == PT_X86,
1625             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1626
1627         PG_G = pmap_global_bit(pmap);
1628
1629         if ((newpde & PG_PS) == 0)
1630                 /* Demotion: flush a specific 2MB page mapping. */
1631                 invlpg(va);
1632         else if ((newpde & PG_G) == 0)
1633                 /*
1634                  * Promotion: flush every 4KB page mapping from the TLB
1635                  * because there are too many to flush individually.
1636                  */
1637                 invltlb();
1638         else {
1639                 /*
1640                  * Promotion: flush every 4KB page mapping from the TLB,
1641                  * including any global (PG_G) mappings.
1642                  */
1643                 invltlb_glob();
1644         }
1645 }
1646 #ifdef SMP
1647
1648 /*
1649  * For SMP, these functions have to use the IPI mechanism for coherence.
1650  *
1651  * N.B.: Before calling any of the following TLB invalidation functions,
1652  * the calling processor must ensure that all stores updating a non-
1653  * kernel page table are globally performed.  Otherwise, another
1654  * processor could cache an old, pre-update entry without being
1655  * invalidated.  This can happen one of two ways: (1) The pmap becomes
1656  * active on another processor after its pm_active field is checked by
1657  * one of the following functions but before a store updating the page
1658  * table is globally performed. (2) The pmap becomes active on another
1659  * processor before its pm_active field is checked but due to
 * speculative loads one of the following functions still reads the
1661  * pmap as inactive on the other processor.
1662  *
1663  * The kernel page table is exempt because its pm_active field is
1664  * immutable.  The kernel page table is always active on every
1665  * processor.
1666  */
1667
/*
 * Interrupt the cpus that are executing in the guest context.
 * This will force the vcpu to exit and the cached EPT mappings
 * will be invalidated by the host before the next vmresume.
 *
 * The calling thread is pinned for the duration of the operation, and
 * the current cpu is asserted not to be a member of pmap->pm_active.
 */
static __inline void
pmap_invalidate_ept(pmap_t pmap)
{
        int ipinum;

        sched_pin();
        KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
            ("pmap_invalidate_ept: absurd pm_active"));

        /*
         * The TLB mappings associated with a vcpu context are not
         * flushed each time a different vcpu is chosen to execute.
         *
         * This is in contrast with a process's vtop mappings that
         * are flushed from the TLB on each context switch.
         *
         * Therefore we need to do more than just a TLB shootdown on
         * the active cpus in 'pmap->pm_active'. To do this we keep
         * track of the number of invalidations performed on this pmap.
         *
         * Each vcpu keeps a cache of this counter and compares it
         * just before a vmresume. If the counter is out-of-date an
         * invept will be done to flush stale mappings from the TLB.
         */
        atomic_add_acq_long(&pmap->pm_eptgen, 1);

        /*
         * Force the vcpu to exit and trap back into the hypervisor.
         * The IPI number to send is taken from the PMAP_NESTED_IPIMASK
         * bits of pm_flags.
         */
        ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
        ipi_selected(pmap->pm_active, ipinum);
        sched_unpin();
}
1706
/*
 * SMP version: invalidate the TLB entry for the page at "va" in "pmap".
 *
 * Guest pmaps are handed to pmap_invalidate_ept().  Otherwise the local
 * TLB entry is flushed when the pmap is the kernel pmap or the pmap
 * current on this cpu, and smp_masked_invlpg() is called with all_cpus
 * (kernel pmap) or pmap->pm_active (any other pmap) as the target mask.
 * The thread is pinned for the duration.
 */
void
pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
{
        cpuset_t *mask;
        struct invpcid_descr d;
        uint64_t kcr3, ucr3;
        uint32_t pcid;
        u_int cpuid, i;

        if (pmap_type_guest(pmap)) {
                pmap_invalidate_ept(pmap);
                return;
        }

        KASSERT(pmap->pm_type == PT_X86,
            ("pmap_invalidate_page: invalid type %d", pmap->pm_type));

        sched_pin();
        if (pmap == kernel_pmap) {
                invlpg(va);
                mask = &all_cpus;
        } else {
                cpuid = PCPU_GET(cpuid);
                if (pmap == PCPU_GET(curpmap)) {
                        invlpg(va);
                        /*
                         * The pmap has a separate user page table
                         * (pm_ucr3 is set), so the entry tagged with
                         * the user PCID must be flushed as well.
                         */
                        if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
                                /*
                                 * Disable context switching. pm_pcid
                                 * is recalculated on switch, which
                                 * might make us use wrong pcid below.
                                 */
                                critical_enter();
                                pcid = pmap->pm_pcids[cpuid].pm_pcid;

                                if (invpcid_works) {
                                        d.pcid = pcid | PMAP_PCID_USER_PT;
                                        d.pad = 0;
                                        d.addr = va;
                                        invpcid(&d, INVPCID_ADDR);
                                } else {
                                        kcr3 = pmap->pm_cr3 | pcid |
                                            CR3_PCID_SAVE;
                                        ucr3 = pmap->pm_ucr3 | pcid |
                                            PMAP_PCID_USER_PT | CR3_PCID_SAVE;
                                        pmap_pti_pcid_invlpg(ucr3, kcr3, va);
                                }
                                critical_exit();
                        }
                } else if (pmap_pcid_enabled)
                        /*
                         * Not the current pmap on this cpu: zero this
                         * cpu's pm_gen instead of flushing now.
                         * NOTE(review): presumably this makes the next
                         * activation flush the PCID -- confirm against
                         * pmap_activate_sw().
                         */
                        pmap->pm_pcids[cpuid].pm_gen = 0;
                if (pmap_pcid_enabled) {
                        /* Zero pm_gen for every other cpu as well. */
                        CPU_FOREACH(i) {
                                if (cpuid != i)
                                        pmap->pm_pcids[i].pm_gen = 0;
                        }

                        /*
                         * The fence is between stores to pm_gen and the read of
                         * the pm_active mask.  We need to ensure that it is
                         * impossible for us to miss the bit update in pm_active
                         * and simultaneously observe a non-zero pm_gen in
                         * pmap_activate_sw(), otherwise TLB update is missed.
                         * Without the fence, IA32 allows such an outcome.
                         * Note that pm_active is updated by a locked operation,
                         * which provides the reciprocal fence.
                         */
                        atomic_thread_fence_seq_cst();
                }
                mask = &pmap->pm_active;
        }
        smp_masked_invlpg(*mask, va, pmap);
        sched_unpin();
}
1780
/*
 * Size in bytes (4096 pages of PAGE_SIZE) at or above which the SMP
 * pmap_invalidate_range() stops invalidating page by page and calls
 * pmap_invalidate_all() instead.
 *
 * 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB
 */
#define PMAP_INVLPG_THRESHOLD   (4 * 1024 * PAGE_SIZE)
1783
/*
 * SMP version: invalidate the TLB entries for the pages in [sva, eva)
 * of "pmap".
 *
 * Ranges of PMAP_INVLPG_THRESHOLD bytes or more are delegated to
 * pmap_invalidate_all(), and guest pmaps to pmap_invalidate_ept().
 * Otherwise local entries are flushed one page at a time when the pmap
 * is the kernel pmap or the pmap current on this cpu, and
 * smp_masked_invlpg_range() is called with all_cpus (kernel pmap) or
 * pmap->pm_active (any other pmap) as the target mask.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        cpuset_t *mask;
        struct invpcid_descr d;
        vm_offset_t addr;
        uint64_t kcr3, ucr3;
        uint32_t pcid;
        u_int cpuid, i;

        /* Too many pages to flush individually. */
        if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
                pmap_invalidate_all(pmap);
                return;
        }

        if (pmap_type_guest(pmap)) {
                pmap_invalidate_ept(pmap);
                return;
        }

        KASSERT(pmap->pm_type == PT_X86,
            ("pmap_invalidate_range: invalid type %d", pmap->pm_type));

        sched_pin();
        cpuid = PCPU_GET(cpuid);
        if (pmap == kernel_pmap) {
                for (addr = sva; addr < eva; addr += PAGE_SIZE)
                        invlpg(addr);
                mask = &all_cpus;
        } else {
                if (pmap == PCPU_GET(curpmap)) {
                        for (addr = sva; addr < eva; addr += PAGE_SIZE)
                                invlpg(addr);
                        /*
                         * The pmap has a separate user page table
                         * (pm_ucr3 is set): flush the same range under
                         * the user PCID, with context switching
                         * disabled while pm_pcid is in use.
                         */
                        if (pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) {
                                critical_enter();
                                pcid = pmap->pm_pcids[cpuid].pm_pcid;
                                if (invpcid_works) {
                                        d.pcid = pcid | PMAP_PCID_USER_PT;
                                        d.pad = 0;
                                        d.addr = sva;
                                        for (; d.addr < eva; d.addr +=
                                            PAGE_SIZE)
                                                invpcid(&d, INVPCID_ADDR);
                                } else {
                                        kcr3 = pmap->pm_cr3 | pcid |
                                            CR3_PCID_SAVE;
                                        ucr3 = pmap->pm_ucr3 | pcid |
                                            PMAP_PCID_USER_PT | CR3_PCID_SAVE;
                                        pmap_pti_pcid_invlrng(ucr3, kcr3, sva,
                                            eva);
                                }
                                critical_exit();
                        }
                } else if (pmap_pcid_enabled) {
                        /* Not current here: zero this cpu's pm_gen. */
                        pmap->pm_pcids[cpuid].pm_gen = 0;
                }
                if (pmap_pcid_enabled) {
                        /* Zero pm_gen for every other cpu as well. */
                        CPU_FOREACH(i) {
                                if (cpuid != i)
                                        pmap->pm_pcids[i].pm_gen = 0;
                        }
                        /* See the comment in pmap_invalidate_page(). */
                        atomic_thread_fence_seq_cst();
                }
                mask = &pmap->pm_active;
        }
        smp_masked_invlpg_range(*mask, sva, eva, pmap);
        sched_unpin();
}
1853
/*
 * SMP version: invalidate all TLB entries belonging to "pmap".
 *
 * Guest pmaps are handed to pmap_invalidate_ept().  For the kernel pmap
 * the local TLB is flushed including global entries; for the pmap
 * current on this cpu only that pmap's entries are flushed (by PCID when
 * PCID is enabled, otherwise with invltlb()).  smp_masked_invltlb() is
 * then called with all_cpus (kernel pmap) or pmap->pm_active (any other
 * pmap) as the target mask.
 */
void
pmap_invalidate_all(pmap_t pmap)
{
        cpuset_t *mask;
        struct invpcid_descr d;
        uint64_t kcr3, ucr3;
        uint32_t pcid;
        u_int cpuid, i;

        if (pmap_type_guest(pmap)) {
                pmap_invalidate_ept(pmap);
                return;
        }

        KASSERT(pmap->pm_type == PT_X86,
            ("pmap_invalidate_all: invalid type %d", pmap->pm_type));

        sched_pin();
        if (pmap == kernel_pmap) {
                if (pmap_pcid_enabled && invpcid_works) {
                        bzero(&d, sizeof(d));
                        invpcid(&d, INVPCID_CTXGLOB);
                } else {
                        invltlb_glob();
                }
                mask = &all_cpus;
        } else {
                cpuid = PCPU_GET(cpuid);
                if (pmap == PCPU_GET(curpmap)) {
                        if (pmap_pcid_enabled) {
                                /*
                                 * Context switching is disabled while
                                 * pm_pcid is in use; see the comment in
                                 * pmap_invalidate_page().
                                 */
                                critical_enter();
                                pcid = pmap->pm_pcids[cpuid].pm_pcid;
                                if (invpcid_works) {
                                        d.pcid = pcid;
                                        d.pad = 0;
                                        d.addr = 0;
                                        invpcid(&d, INVPCID_CTX);
                                        /* Also flush the user PCID, if any. */
                                        if (pmap->pm_ucr3 != PMAP_NO_CR3) {
                                                d.pcid |= PMAP_PCID_USER_PT;
                                                invpcid(&d, INVPCID_CTX);
                                        }
                                } else {
                                        /*
                                         * Unlike the page and range
                                         * variants, CR3_PCID_SAVE is not
                                         * set in these values.
                                         */
                                        kcr3 = pmap->pm_cr3 | pcid;
                                        ucr3 = pmap->pm_ucr3;
                                        if (ucr3 != PMAP_NO_CR3) {
                                                ucr3 |= pcid | PMAP_PCID_USER_PT;
                                                pmap_pti_pcid_invalidate(ucr3,
                                                    kcr3);
                                        } else {
                                                load_cr3(kcr3);
                                        }
                                }
                                critical_exit();
                        } else {
                                invltlb();
                        }
                } else if (pmap_pcid_enabled) {
                        /* Not current here: zero this cpu's pm_gen. */
                        pmap->pm_pcids[cpuid].pm_gen = 0;
                }
                if (pmap_pcid_enabled) {
                        /* Zero pm_gen for every other cpu as well. */
                        CPU_FOREACH(i) {
                                if (cpuid != i)
                                        pmap->pm_pcids[i].pm_gen = 0;
                        }
                        /* See the comment in pmap_invalidate_page(). */
                        atomic_thread_fence_seq_cst();
                }
                mask = &pmap->pm_active;
        }
        smp_masked_invltlb(*mask, pmap);
        sched_unpin();
}
1926
/*
 * SMP version: write back and invalidate the cache with wbinvd() on the
 * calling cpu, then call smp_cache_flush() (presumably to do the same on
 * the other cpus -- confirm against its definition).  The thread is
 * pinned so that both steps run on the same cpu.
 */
void
pmap_invalidate_cache(void)
{

        sched_pin();
        wbinvd();
        smp_cache_flush();
        sched_unpin();
}
1936
/*
 * Argument block that the SMP pmap_update_pde() passes through
 * smp_rendezvous_cpus() to pmap_update_pde_action() and
 * pmap_update_pde_teardown().
 */
struct pde_action {
        cpuset_t invalidate;    /* processors that invalidate their TLB */
        pmap_t pmap;            /* pmap that owns the PDE */
        vm_offset_t va;         /* virtual address mapped by the PDE */
        pd_entry_t *pde;        /* PDE to be rewritten */
        pd_entry_t newpde;      /* value to store into *pde */
        u_int store;            /* processor that updates the PDE */
};
1945
1946 static void
1947 pmap_update_pde_action(void *arg)
1948 {
1949         struct pde_action *act = arg;
1950
1951         if (act->store == PCPU_GET(cpuid))
1952                 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
1953 }
1954
1955 static void
1956 pmap_update_pde_teardown(void *arg)
1957 {
1958         struct pde_action *act = arg;
1959
1960         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
1961                 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
1962 }
1963
1964 /*
1965  * Change the page size for the specified virtual address in a way that
1966  * prevents any possibility of the TLB ever having two entries that map the
1967  * same virtual address using different page sizes.  This is the recommended
1968  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
1969  * machine check exception for a TLB state that is improperly diagnosed as a
1970  * hardware error.
1971  */
1972 static void
1973 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
1974 {
1975         struct pde_action act;
1976         cpuset_t active, other_cpus;
1977         u_int cpuid;
1978
1979         sched_pin();
1980         cpuid = PCPU_GET(cpuid);
1981         other_cpus = all_cpus;
1982         CPU_CLR(cpuid, &other_cpus);
1983         if (pmap == kernel_pmap || pmap_type_guest(pmap))
1984                 active = all_cpus;
1985         else {
1986                 active = pmap->pm_active;
1987         }
1988         if (CPU_OVERLAP(&active, &other_cpus)) {
1989                 act.store = cpuid;
1990                 act.invalidate = active;
1991                 act.va = va;
1992                 act.pmap = pmap;
1993                 act.pde = pde;
1994                 act.newpde = newpde;
1995                 CPU_SET(cpuid, &active);
1996                 smp_rendezvous_cpus(active,
1997                     smp_no_rendezvous_barrier, pmap_update_pde_action,
1998                     pmap_update_pde_teardown, &act);
1999         } else {
2000                 pmap_update_pde_store(pmap, pde, newpde);
2001                 if (CPU_ISSET(cpuid, &active))
2002                         pmap_update_pde_invalidate(pmap, va, newpde);
2003         }
2004         sched_unpin();
2005 }
2006 #else /* !SMP */
2007 /*
2008  * Normal, non-SMP, invalidation functions.
2009  */
2010 void
2011 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
2012 {
2013         struct invpcid_descr d;
2014         uint64_t kcr3, ucr3;
2015         uint32_t pcid;
2016
2017         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2018                 pmap->pm_eptgen++;
2019                 return;
2020         }
2021         KASSERT(pmap->pm_type == PT_X86,
2022             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2023
2024         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2025                 invlpg(va);
2026                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2027                     pmap->pm_ucr3 != PMAP_NO_CR3) {
2028                         critical_enter();
2029                         pcid = pmap->pm_pcids[0].pm_pcid;
2030                         if (invpcid_works) {
2031                                 d.pcid = pcid | PMAP_PCID_USER_PT;
2032                                 d.pad = 0;
2033                                 d.addr = va;
2034                                 invpcid(&d, INVPCID_ADDR);
2035                         } else {
2036                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2037                                 ucr3 = pmap->pm_ucr3 | pcid |
2038                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2039                                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
2040                         }
2041                         critical_exit();
2042                 }
2043         } else if (pmap_pcid_enabled)
2044                 pmap->pm_pcids[0].pm_gen = 0;
2045 }
2046
2047 void
2048 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2049 {
2050         struct invpcid_descr d;
2051         vm_offset_t addr;
2052         uint64_t kcr3, ucr3;
2053
2054         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2055                 pmap->pm_eptgen++;
2056                 return;
2057         }
2058         KASSERT(pmap->pm_type == PT_X86,
2059             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2060
2061         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2062                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
2063                         invlpg(addr);
2064                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2065                     pmap->pm_ucr3 != PMAP_NO_CR3) {
2066                         critical_enter();
2067                         if (invpcid_works) {
2068                                 d.pcid = pmap->pm_pcids[0].pm_pcid |
2069                                     PMAP_PCID_USER_PT;
2070                                 d.pad = 0;
2071                                 d.addr = sva;
2072                                 for (; d.addr < eva; d.addr += PAGE_SIZE)
2073                                         invpcid(&d, INVPCID_ADDR);
2074                         } else {
2075                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
2076                                     pm_pcid | CR3_PCID_SAVE;
2077                                 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
2078                                     pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2079                                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
2080                         }
2081                         critical_exit();
2082                 }
2083         } else if (pmap_pcid_enabled) {
2084                 pmap->pm_pcids[0].pm_gen = 0;
2085         }
2086 }
2087
2088 void
2089 pmap_invalidate_all(pmap_t pmap)
2090 {
2091         struct invpcid_descr d;
2092         uint64_t kcr3, ucr3;
2093
2094         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2095                 pmap->pm_eptgen++;
2096                 return;
2097         }
2098         KASSERT(pmap->pm_type == PT_X86,
2099             ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
2100
2101         if (pmap == kernel_pmap) {
2102                 if (pmap_pcid_enabled && invpcid_works) {
2103                         bzero(&d, sizeof(d));
2104                         invpcid(&d, INVPCID_CTXGLOB);
2105                 } else {
2106                         invltlb_glob();
2107                 }
2108         } else if (pmap == PCPU_GET(curpmap)) {
2109                 if (pmap_pcid_enabled) {
2110                         critical_enter();
2111                         if (invpcid_works) {
2112                                 d.pcid = pmap->pm_pcids[0].pm_pcid;
2113                                 d.pad = 0;
2114                                 d.addr = 0;
2115                                 invpcid(&d, INVPCID_CTX);
2116                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2117                                         d.pcid |= PMAP_PCID_USER_PT;
2118                                         invpcid(&d, INVPCID_CTX);
2119                                 }
2120                         } else {
2121                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
2122                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2123                                         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
2124                                             0].pm_pcid | PMAP_PCID_USER_PT;
2125                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
2126                                 } else
2127                                         load_cr3(kcr3);
2128                         }
2129                         critical_exit();
2130                 } else {
2131                         invltlb();
2132                 }
2133         } else if (pmap_pcid_enabled) {
2134                 pmap->pm_pcids[0].pm_gen = 0;
2135         }
2136 }
2137
/*
 * Non-SMP version: write back and invalidate the cache of the (only)
 * processor with wbinvd().
 */
PMAP_INLINE void
pmap_invalidate_cache(void)
{

        wbinvd();
}
2144
2145 static void
2146 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2147 {
2148
2149         pmap_update_pde_store(pmap, pde, newpde);
2150         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
2151                 pmap_update_pde_invalidate(pmap, va, newpde);
2152         else
2153                 pmap->pm_pcids[0].pm_gen = 0;
2154 }
2155 #endif /* !SMP */
2156
2157 static void
2158 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
2159 {
2160
2161         /*
2162          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
2163          * by a promotion that did not invalidate the 512 4KB page mappings
2164          * that might exist in the TLB.  Consequently, at this point, the TLB
2165          * may hold both 4KB and 2MB page mappings for the address range [va,
2166          * va + NBPDR).  Therefore, the entire range must be invalidated here.
2167          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
2168          * 4KB page mappings for the address range [va, va + NBPDR), and so a
2169          * single INVLPG suffices to invalidate the 2MB page mapping from the
2170          * TLB.
2171          */
2172         if ((pde & PG_PROMOTED) != 0)
2173                 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
2174         else
2175                 pmap_invalidate_page(pmap, va);
2176 }
2177
/*
 * Resolver for pmap_invalidate_cache_range(sva, eva).  The implementation
 * is picked from the cpu feature bits, in this order:
 *  - CPUID_SS (self-snoop): pmap_invalidate_cache_range_selfsnoop(), which
 *    only asserts the alignment of the range;
 *  - CPUID_CLFSH: pmap_force_invalidate_cache_range(), which flushes by
 *    cache line;
 *  - otherwise: pmap_invalidate_cache_range_all(), which invalidates the
 *    whole cache.
 */
DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
    (vm_offset_t sva, vm_offset_t eva), static)
{

        if ((cpu_feature & CPUID_SS) != 0)
                return (pmap_invalidate_cache_range_selfsnoop);
        if ((cpu_feature & CPUID_CLFSH) != 0)
                return (pmap_force_invalidate_cache_range);
        return (pmap_invalidate_cache_range_all);
}
2188
/*
 * Size in bytes (2MB) at or above which the cache-range routines in this
 * file call pmap_invalidate_cache() instead of flushing line by line.
 */
#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
2190
/*
 * Assert (via KASSERT) that both ends of the range [sva, eva) are
 * page-aligned.
 */
static void
pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
{

        KASSERT((sva & PAGE_MASK) == 0,
            ("pmap_invalidate_cache_range: sva not page-aligned"));
        KASSERT((eva & PAGE_MASK) == 0,
            ("pmap_invalidate_cache_range: eva not page-aligned"));
}
2200
/*
 * pmap_invalidate_cache_range() implementation selected when the cpu
 * reports CPUID_SS (self-snoop).  No flush is issued; only the alignment
 * of the range is asserted.
 */
static void
pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
{

        pmap_invalidate_cache_range_check_align(sva, eva);
}
2207
2208 void
2209 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
2210 {
2211
2212         sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
2213         if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) {
2214                 /*
2215                  * The supplied range is bigger than 2MB.
2216                  * Globally invalidate cache.
2217                  */
2218                 pmap_invalidate_cache();
2219                 return;
2220         }
2221
2222         /*
2223          * XXX: Some CPUs fault, hang, or trash the local APIC
2224          * registers if we use CLFLUSH on the local APIC range.  The
2225          * local APIC is always uncached, so we don't need to flush
2226          * for that range anyway.
2227          */
2228         if (pmap_kextract(sva) == lapic_paddr)
2229                 return;
2230
2231         if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
2232                 /*
2233                  * Do per-cache line flush.  Use the sfence
2234                  * instruction to insure that previous stores are
2235                  * included in the write-back.  The processor
2236                  * propagates flush to other processors in the cache
2237                  * coherence domain.
2238                  */
2239                 sfence();
2240                 for (; sva < eva; sva += cpu_clflush_line_size)
2241                         clflushopt(sva);
2242                 sfence();
2243         } else {
2244                 /*
2245                  * Writes are ordered by CLFLUSH on Intel CPUs.
2246                  */
2247                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2248                         mfence();
2249                 for (; sva < eva; sva += cpu_clflush_line_size)
2250                         clflush(sva);
2251                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2252                         mfence();
2253         }
2254 }
2255
/*
 * pmap_invalidate_cache_range() implementation selected when the cpu
 * reports neither CPUID_SS nor CPUID_CLFSH.  After asserting the
 * alignment of the range, the whole cache is invalidated.
 */
static void
pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
{

        pmap_invalidate_cache_range_check_align(sva, eva);
        pmap_invalidate_cache();
}
2263
2264 /*
2265  * Remove the specified set of pages from the data and instruction caches.
2266  *
2267  * In contrast to pmap_invalidate_cache_range(), this function does not
2268  * rely on the CPU's self-snoop feature, because it is intended for use
2269  * when moving pages into a different cache domain.
2270  */
2271 void
2272 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
2273 {
2274         vm_offset_t daddr, eva;
2275         int i;
2276         bool useclflushopt;
2277
2278         useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
2279         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
2280             ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
2281                 pmap_invalidate_cache();
2282         else {
2283                 if (useclflushopt)
2284                         sfence();
2285                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2286                         mfence();
2287                 for (i = 0; i < count; i++) {
2288                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
2289                         eva = daddr + PAGE_SIZE;
2290                         for (; daddr < eva; daddr += cpu_clflush_line_size) {
2291                                 if (useclflushopt)
2292                                         clflushopt(daddr);
2293                                 else
2294                                         clflush(daddr);
2295                         }
2296                 }
2297                 if (useclflushopt)
2298                         sfence();
2299                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2300                         mfence();
2301         }
2302 }
2303
2304 /*
2305  *      Routine:        pmap_extract
2306  *      Function:
2307  *              Extract the physical page address associated
2308  *              with the given map/virtual_address pair.
2309  */
2310 vm_paddr_t
2311 pmap_extract(pmap_t pmap, vm_offset_t va)
2312 {
2313         pdp_entry_t *pdpe;
2314         pd_entry_t *pde;
2315         pt_entry_t *pte, PG_V;
2316         vm_paddr_t pa;
2317
2318         pa = 0;
2319         PG_V = pmap_valid_bit(pmap);
2320         PMAP_LOCK(pmap);
2321         pdpe = pmap_pdpe(pmap, va);
2322         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2323                 if ((*pdpe & PG_PS) != 0)
2324                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
2325                 else {
2326                         pde = pmap_pdpe_to_pde(pdpe, va);
2327                         if ((*pde & PG_V) != 0) {
2328                                 if ((*pde & PG_PS) != 0) {
2329                                         pa = (*pde & PG_PS_FRAME) |
2330                                             (va & PDRMASK);
2331                                 } else {
2332                                         pte = pmap_pde_to_pte(pde, va);
2333                                         pa = (*pte & PG_FRAME) |
2334                                             (va & PAGE_MASK);
2335                                 }
2336                         }
2337                 }
2338         }
2339         PMAP_UNLOCK(pmap);
2340         return (pa);
2341 }
2342
/*
 *      Routine:        pmap_extract_and_hold
 *      Function:
 *              Atomically extract and hold the physical page
 *              with the given pmap and virtual address pair
 *              if that mapping permits the given protection.
 *
 *      Returns the held page, or NULL when there is no PDE for
 *      the address, the PTE is not valid, or write access is
 *      requested (VM_PROT_WRITE in prot) but the mapping is not
 *      writable.
 */
vm_page_t
pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
{
        pd_entry_t pde, *pdep;
        pt_entry_t pte, PG_RW, PG_V;
        vm_paddr_t pa;
        vm_page_t m;

        pa = 0;
        m = NULL;
        PG_RW = pmap_rw_bit(pmap);
        PG_V = pmap_valid_bit(pmap);
        PMAP_LOCK(pmap);
retry:
        pdep = pmap_pde(pmap, va);
        if (pdep != NULL && (pde = *pdep)) {
                if (pde & PG_PS) {
                        /* 2MB page mapping: check writability on the PDE. */
                        if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
                                /*
                                 * A non-zero return restarts the lookup.
                                 * NOTE(review): presumably the pmap lock
                                 * was dropped while the page lock was
                                 * acquired -- confirm against
                                 * vm_page_pa_tryrelock().
                                 */
                                if (vm_page_pa_tryrelock(pmap, (pde &
                                    PG_PS_FRAME) | (va & PDRMASK), &pa))
                                        goto retry;
                                m = PHYS_TO_VM_PAGE(pa);
                        }
                } else {
                        /* 4KB page mapping: the PTE must also be valid. */
                        pte = *pmap_pde_to_pte(pdep, va);
                        if ((pte & PG_V) &&
                            ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
                                if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
                                    &pa))
                                        goto retry;
                                m = PHYS_TO_VM_PAGE(pa);
                        }
                }
                if (m != NULL)
                        vm_page_hold(m);
        }
        /* Release the page lock, if one was taken for pa. */
        PA_UNLOCK_COND(pa);
        PMAP_UNLOCK(pmap);
        return (m);
}
2390
2391 vm_paddr_t
2392 pmap_kextract(vm_offset_t va)
2393 {
2394         pd_entry_t pde;
2395         vm_paddr_t pa;
2396
2397         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2398                 pa = DMAP_TO_PHYS(va);
2399         } else {
2400                 pde = *vtopde(va);
2401                 if (pde & PG_PS) {
2402                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2403                 } else {
2404                         /*
2405                          * Beware of a concurrent promotion that changes the
2406                          * PDE at this point!  For example, vtopte() must not
2407                          * be used to access the PTE because it would use the
2408                          * new PDE.  It is, however, safe to use the old PDE
2409                          * because the page table page is preserved by the
2410                          * promotion.
2411                          */
2412                         pa = *pmap_pde_to_pte(&pde, va);
2413                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2414                 }
2415         }
2416         return (pa);
2417 }
2418
2419 /***************************************************
2420  * Low level mapping routines.....
2421  ***************************************************/
2422
2423 /*
2424  * Add a wired page to the kva.
2425  * Note: not SMP coherent.
2426  */
2427 PMAP_INLINE void
2428 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2429 {
2430         pt_entry_t *pte;
2431
2432         pte = vtopte(va);
2433         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
2434 }
2435
2436 static __inline void
2437 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2438 {
2439         pt_entry_t *pte;
2440         int cache_bits;
2441
2442         pte = vtopte(va);
2443         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2444         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
2445 }
2446
2447 /*
2448  * Remove a page from the kernel pagetables.
2449  * Note: not SMP coherent.
2450  */
2451 PMAP_INLINE void
2452 pmap_kremove(vm_offset_t va)
2453 {
2454         pt_entry_t *pte;
2455
2456         pte = vtopte(va);
2457         pte_clear(pte);
2458 }
2459
2460 /*
2461  *      Used to map a range of physical addresses into kernel
2462  *      virtual address space.
2463  *
2464  *      The value passed in '*virt' is a suggested virtual address for
2465  *      the mapping. Architectures which can support a direct-mapped
2466  *      physical to virtual region can return the appropriate address
2467  *      within that region, leaving '*virt' unchanged. Other
2468  *      architectures should map the pages starting at '*virt' and
2469  *      update '*virt' with the first usable address after the mapped
2470  *      region.
2471  */
2472 vm_offset_t
2473 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2474 {
2475         return PHYS_TO_DMAP(start);
2476 }
2477
2478
2479 /*
2480  * Add a list of wired pages to the kva
2481  * this routine is only used for temporary
2482  * kernel mappings that do not need to have
2483  * page modification or references recorded.
2484  * Note that old mappings are simply written
2485  * over.  The page *must* be wired.
2486  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2487  */
2488 void
2489 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2490 {
2491         pt_entry_t *endpte, oldpte, pa, *pte;
2492         vm_page_t m;
2493         int cache_bits;
2494
2495         oldpte = 0;
2496         pte = vtopte(sva);
2497         endpte = pte + count;
2498         while (pte < endpte) {
2499                 m = *ma++;
2500                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2501                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2502                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2503                         oldpte |= *pte;
2504                         pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
2505                 }
2506                 pte++;
2507         }
2508         if (__predict_false((oldpte & X86_PG_V) != 0))
2509                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
2510                     PAGE_SIZE);
2511 }
2512
2513 /*
2514  * This routine tears out page mappings from the
2515  * kernel -- it is meant only for temporary mappings.
2516  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2517  */
2518 void
2519 pmap_qremove(vm_offset_t sva, int count)
2520 {
2521         vm_offset_t va;
2522
2523         va = sva;
2524         while (count-- > 0) {
2525                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2526                 pmap_kremove(va);
2527                 va += PAGE_SIZE;
2528         }
2529         pmap_invalidate_range(kernel_pmap, sva, va);
2530 }
2531
2532 /***************************************************
2533  * Page table page management routines.....
2534  ***************************************************/
2535 /*
2536  * Schedule the specified unused page table page to be freed.  Specifically,
2537  * add the page to the specified list of pages that will be released to the
2538  * physical memory manager after the TLB has been updated.
2539  */
2540 static __inline void
2541 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2542     boolean_t set_PG_ZERO)
2543 {
2544
2545         if (set_PG_ZERO)
2546                 m->flags |= PG_ZERO;
2547         else
2548                 m->flags &= ~PG_ZERO;
2549         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2550 }
2551
2552 /*
2553  * Inserts the specified page table page into the specified pmap's collection
2554  * of idle page table pages.  Each of a pmap's page table pages is responsible
2555  * for mapping a distinct range of virtual addresses.  The pmap's collection is
2556  * ordered by this virtual address range.
2557  */
2558 static __inline int
2559 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2560 {
2561
2562         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2563         return (vm_radix_insert(&pmap->pm_root, mpte));
2564 }
2565
2566 /*
2567  * Removes the page table page mapping the specified virtual address from the
2568  * specified pmap's collection of idle page table pages, and returns it.
2569  * Otherwise, returns NULL if there is no page table page corresponding to the
2570  * specified virtual address.
2571  */
2572 static __inline vm_page_t
2573 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2574 {
2575
2576         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2577         return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
2578 }
2579
2580 /*
2581  * Decrements a page table page's wire count, which is used to record the
2582  * number of valid page table entries within the page.  If the wire count
2583  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2584  * page table page was unmapped and FALSE otherwise.
2585  */
2586 static inline boolean_t
2587 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2588 {
2589
2590         --m->wire_count;
2591         if (m->wire_count == 0) {
2592                 _pmap_unwire_ptp(pmap, va, m, free);
2593                 return (TRUE);
2594         } else
2595                 return (FALSE);
2596 }
2597
2598 static void
2599 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2600 {
2601
2602         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2603         /*
2604          * unmap the page table page
2605          */
2606         if (m->pindex >= (NUPDE + NUPDPE)) {
2607                 /* PDP page */
2608                 pml4_entry_t *pml4;
2609                 pml4 = pmap_pml4e(pmap, va);
2610                 *pml4 = 0;
2611                 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
2612                         pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
2613                         *pml4 = 0;
2614                 }
2615         } else if (m->pindex >= NUPDE) {
2616                 /* PD page */
2617                 pdp_entry_t *pdp;
2618                 pdp = pmap_pdpe(pmap, va);
2619                 *pdp = 0;
2620         } else {
2621                 /* PTE page */
2622                 pd_entry_t *pd;
2623                 pd = pmap_pde(pmap, va);
2624                 *pd = 0;
2625         }
2626         pmap_resident_count_dec(pmap, 1);
2627         if (m->pindex < NUPDE) {
2628                 /* We just released a PT, unhold the matching PD */
2629                 vm_page_t pdpg;
2630
2631                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2632                 pmap_unwire_ptp(pmap, va, pdpg, free);
2633         }
2634         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2635                 /* We just released a PD, unhold the matching PDP */
2636                 vm_page_t pdppg;
2637
2638                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2639                 pmap_unwire_ptp(pmap, va, pdppg, free);
2640         }
2641
2642         /*
2643          * Put page on a list so that it is released after
2644          * *ALL* TLB shootdown is done
2645          */
2646         pmap_add_delayed_free_list(m, free, TRUE);
2647 }
2648
2649 /*
2650  * After removing a page table entry, this routine is used to
2651  * conditionally free the page, and manage the hold/wire counts.
2652  */
2653 static int
2654 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2655     struct spglist *free)
2656 {
2657         vm_page_t mpte;
2658
2659         if (va >= VM_MAXUSER_ADDRESS)
2660                 return (0);
2661         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2662         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2663         return (pmap_unwire_ptp(pmap, va, mpte, free));
2664 }
2665
2666 void
2667 pmap_pinit0(pmap_t pmap)
2668 {
2669         int i;
2670
2671         PMAP_LOCK_INIT(pmap);
2672         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2673         pmap->pm_pml4u = NULL;
2674         pmap->pm_cr3 = KPML4phys;
2675         /* hack to keep pmap_pti_pcid_invalidate() alive */
2676         pmap->pm_ucr3 = PMAP_NO_CR3;
2677         pmap->pm_root.rt_root = 0;
2678         CPU_ZERO(&pmap->pm_active);
2679         TAILQ_INIT(&pmap->pm_pvchunk);
2680         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2681         pmap->pm_flags = pmap_flags;
2682         CPU_FOREACH(i) {
2683                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
2684                 pmap->pm_pcids[i].pm_gen = 1;
2685         }
2686         pmap_activate_boot(pmap);
2687 }
2688
2689 void
2690 pmap_pinit_pml4(vm_page_t pml4pg)
2691 {
2692         pml4_entry_t *pm_pml4;
2693         int i;
2694
2695         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2696
2697         /* Wire in kernel global address entries. */
2698         for (i = 0; i < NKPML4E; i++) {
2699                 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
2700                     X86_PG_V;
2701         }
2702         for (i = 0; i < ndmpdpphys; i++) {
2703                 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
2704                     X86_PG_V;
2705         }
2706
2707         /* install self-referential address mapping entry(s) */
2708         pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
2709             X86_PG_A | X86_PG_M;
2710 }
2711
2712 static void
2713 pmap_pinit_pml4_pti(vm_page_t pml4pg)
2714 {
2715         pml4_entry_t *pm_pml4;
2716         int i;
2717
2718         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2719         for (i = 0; i < NPML4EPG; i++)
2720                 pm_pml4[i] = pti_pml4[i];
2721 }
2722
2723 /*
2724  * Initialize a preallocated and zeroed pmap structure,
2725  * such as one in a vmspace structure.
2726  */
2727 int
2728 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2729 {
2730         vm_page_t pml4pg, pml4pgu;
2731         vm_paddr_t pml4phys;
2732         int i;
2733
2734         /*
2735          * allocate the page directory page
2736          */
2737         pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2738             VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
2739
2740         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2741         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2742         CPU_FOREACH(i) {
2743                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2744                 pmap->pm_pcids[i].pm_gen = 0;
2745         }
2746         pmap->pm_cr3 = PMAP_NO_CR3;     /* initialize to an invalid value */
2747         pmap->pm_ucr3 = PMAP_NO_CR3;
2748         pmap->pm_pml4u = NULL;
2749
2750         pmap->pm_type = pm_type;
2751         if ((pml4pg->flags & PG_ZERO) == 0)
2752                 pagezero(pmap->pm_pml4);
2753
2754         /*
2755          * Do not install the host kernel mappings in the nested page
2756          * tables. These mappings are meaningless in the guest physical
2757          * address space.
2758          * Install minimal kernel mappings in PTI case.
2759          */
2760         if (pm_type == PT_X86) {
2761                 pmap->pm_cr3 = pml4phys;
2762                 pmap_pinit_pml4(pml4pg);
2763                 if (pti) {
2764                         pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2765                             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
2766                         pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
2767                             VM_PAGE_TO_PHYS(pml4pgu));
2768                         pmap_pinit_pml4_pti(pml4pgu);
2769                         pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
2770                 }
2771         }
2772
2773         pmap->pm_root.rt_root = 0;
2774         CPU_ZERO(&pmap->pm_active);
2775         TAILQ_INIT(&pmap->pm_pvchunk);
2776         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2777         pmap->pm_flags = flags;
2778         pmap->pm_eptgen = 0;
2779
2780         return (1);
2781 }
2782
2783 int
2784 pmap_pinit(pmap_t pmap)
2785 {
2786
2787         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2788 }
2789
2790 /*
2791  * This routine is called if the desired page table page does not exist.
2792  *
2793  * If page table page allocation fails, this routine may sleep before
2794  * returning NULL.  It sleeps only if a lock pointer was given.
2795  *
2796  * Note: If a page allocation fails at page table level two or three,
2797  * one or two pages may be held during the wait, only to be released
2798  * afterwards.  This conservative approach is easily argued to avoid
2799  * race conditions.
2800  */
2801 static vm_page_t
2802 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2803 {
2804         vm_page_t m, pdppg, pdpg;
2805         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2806
2807         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2808
2809         PG_A = pmap_accessed_bit(pmap);
2810         PG_M = pmap_modified_bit(pmap);
2811         PG_V = pmap_valid_bit(pmap);
2812         PG_RW = pmap_rw_bit(pmap);
2813
2814         /*
2815          * Allocate a page table page.
2816          */
2817         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2818             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2819                 if (lockp != NULL) {
2820                         RELEASE_PV_LIST_LOCK(lockp);
2821                         PMAP_UNLOCK(pmap);
2822                         PMAP_ASSERT_NOT_IN_DI();
2823                         vm_wait(NULL);
2824                         PMAP_LOCK(pmap);
2825                 }
2826
2827                 /*
2828                  * Indicate the need to retry.  While waiting, the page table
2829                  * page may have been allocated.
2830                  */
2831                 return (NULL);
2832         }
2833         if ((m->flags & PG_ZERO) == 0)
2834                 pmap_zero_page(m);
2835
2836         /*
2837          * Map the pagetable page into the process address space, if
2838          * it isn't already there.
2839          */
2840
2841         if (ptepindex >= (NUPDE + NUPDPE)) {
2842                 pml4_entry_t *pml4, *pml4u;
2843                 vm_pindex_t pml4index;
2844
2845                 /* Wire up a new PDPE page */
2846                 pml4index = ptepindex - (NUPDE + NUPDPE);
2847                 pml4 = &pmap->pm_pml4[pml4index];
2848                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2849                 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
2850                         /*
2851                          * PTI: Make all user-space mappings in the
2852                          * kernel-mode page table no-execute so that
2853                          * we detect any programming errors that leave
2854                          * the kernel-mode page table active on return
2855                          * to user space.
2856                          */
2857                         if (pmap->pm_ucr3 != PMAP_NO_CR3)
2858                                 *pml4 |= pg_nx;
2859
2860                         pml4u = &pmap->pm_pml4u[pml4index];
2861                         *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
2862                             PG_A | PG_M;
2863                 }
2864
2865         } else if (ptepindex >= NUPDE) {
2866                 vm_pindex_t pml4index;
2867                 vm_pindex_t pdpindex;
2868                 pml4_entry_t *pml4;
2869                 pdp_entry_t *pdp;
2870
2871                 /* Wire up a new PDE page */
2872                 pdpindex = ptepindex - NUPDE;
2873                 pml4index = pdpindex >> NPML4EPGSHIFT;
2874
2875                 pml4 = &pmap->pm_pml4[pml4index];
2876                 if ((*pml4 & PG_V) == 0) {
2877                         /* Have to allocate a new pdp, recurse */
2878                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2879                             lockp) == NULL) {
2880                                 vm_page_unwire_noq(m);
2881                                 vm_page_free_zero(m);
2882                                 return (NULL);
2883                         }
2884                 } else {
2885                         /* Add reference to pdp page */
2886                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2887                         pdppg->wire_count++;
2888                 }
2889                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2890
2891                 /* Now find the pdp page */
2892                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2893                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2894
2895         } else {
2896                 vm_pindex_t pml4index;
2897                 vm_pindex_t pdpindex;
2898                 pml4_entry_t *pml4;
2899                 pdp_entry_t *pdp;
2900                 pd_entry_t *pd;
2901
2902                 /* Wire up a new PTE page */
2903                 pdpindex = ptepindex >> NPDPEPGSHIFT;
2904                 pml4index = pdpindex >> NPML4EPGSHIFT;
2905
2906                 /* First, find the pdp and check that its valid. */
2907                 pml4 = &pmap->pm_pml4[pml4index];
2908                 if ((*pml4 & PG_V) == 0) {
2909                         /* Have to allocate a new pd, recurse */
2910                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2911                             lockp) == NULL) {
2912                                 vm_page_unwire_noq(m);
2913                                 vm_page_free_zero(m);
2914                                 return (NULL);
2915                         }
2916                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2917                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2918                 } else {
2919                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2920                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2921                         if ((*pdp & PG_V) == 0) {
2922                                 /* Have to allocate a new pd, recurse */
2923                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
2924                                     lockp) == NULL) {
2925                                         vm_page_unwire_noq(m);
2926                                         vm_page_free_zero(m);
2927                                         return (NULL);
2928                                 }
2929                         } else {
2930                                 /* Add reference to the pd page */
2931                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
2932                                 pdpg->wire_count++;
2933                         }
2934                 }
2935                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
2936
2937                 /* Now we know where the page directory page is */
2938                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
2939                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2940         }
2941
2942         pmap_resident_count_inc(pmap, 1);
2943
2944         return (m);
2945 }
2946
2947 static vm_page_t
2948 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2949 {
2950         vm_pindex_t pdpindex, ptepindex;
2951         pdp_entry_t *pdpe, PG_V;
2952         vm_page_t pdpg;
2953
2954         PG_V = pmap_valid_bit(pmap);
2955
2956 retry:
2957         pdpe = pmap_pdpe(pmap, va);
2958         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2959                 /* Add a reference to the pd page. */
2960                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
2961                 pdpg->wire_count++;
2962         } else {
2963                 /* Allocate a pd page. */
2964                 ptepindex = pmap_pde_pindex(va);
2965                 pdpindex = ptepindex >> NPDPEPGSHIFT;
2966                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
2967                 if (pdpg == NULL && lockp != NULL)
2968                         goto retry;
2969         }
2970         return (pdpg);
2971 }
2972
2973 static vm_page_t
2974 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2975 {
2976         vm_pindex_t ptepindex;
2977         pd_entry_t *pd, PG_V;
2978         vm_page_t m;
2979
2980         PG_V = pmap_valid_bit(pmap);
2981
2982         /*
2983          * Calculate pagetable page index
2984          */
2985         ptepindex = pmap_pde_pindex(va);
2986 retry:
2987         /*
2988          * Get the page directory entry
2989          */
2990         pd = pmap_pde(pmap, va);
2991
2992         /*
2993          * This supports switching from a 2MB page to a
2994          * normal 4K page.
2995          */
2996         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
2997                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
2998                         /*
2999                          * Invalidation of the 2MB page mapping may have caused
3000                          * the deallocation of the underlying PD page.
3001                          */
3002                         pd = NULL;
3003                 }
3004         }
3005
3006         /*
3007          * If the page table page is mapped, we just increment the
3008          * hold count, and activate it.
3009          */
3010         if (pd != NULL && (*pd & PG_V) != 0) {
3011                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
3012                 m->wire_count++;
3013         } else {
3014                 /*
3015                  * Here if the pte page isn't mapped, or if it has been
3016                  * deallocated.
3017                  */
3018                 m = _pmap_allocpte(pmap, ptepindex, lockp);
3019                 if (m == NULL && lockp != NULL)
3020                         goto retry;
3021         }
3022         return (m);
3023 }
3024
3025
3026 /***************************************************
3027  * Pmap allocation/deallocation routines.
3028  ***************************************************/
3029
3030 /*
3031  * Release any resources held by the given physical map.
3032  * Called when a pmap initialized by pmap_pinit is being released.
3033  * Should only be called if the map contains no valid mappings.
3034  */
3035 void
3036 pmap_release(pmap_t pmap)
3037 {
3038         vm_page_t m;
3039         int i;
3040
3041         KASSERT(pmap->pm_stats.resident_count == 0,
3042             ("pmap_release: pmap resident count %ld != 0",
3043             pmap->pm_stats.resident_count));
3044         KASSERT(vm_radix_is_empty(&pmap->pm_root),
3045             ("pmap_release: pmap has reserved page table page(s)"));
3046         KASSERT(CPU_EMPTY(&pmap->pm_active),
3047             ("releasing active pmap %p", pmap));
3048
3049         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
3050
3051         for (i = 0; i < NKPML4E; i++)   /* KVA */
3052                 pmap->pm_pml4[KPML4BASE + i] = 0;
3053         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
3054                 pmap->pm_pml4[DMPML4I + i] = 0;
3055         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
3056
3057         vm_page_unwire_noq(m);
3058         vm_page_free_zero(m);
3059
3060         if (pmap->pm_pml4u != NULL) {
3061                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
3062                 vm_page_unwire_noq(m);
3063                 vm_page_free(m);
3064         }
3065 }
3066
3067 static int
3068 kvm_size(SYSCTL_HANDLER_ARGS)
3069 {
3070         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3071
3072         return sysctl_handle_long(oidp, &ksize, 0, req);
3073 }
3074 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
3075     0, 0, kvm_size, "LU", "Size of KVM");
3076
3077 static int
3078 kvm_free(SYSCTL_HANDLER_ARGS)
3079 {
3080         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3081
3082         return sysctl_handle_long(oidp, &kfree, 0, req);
3083 }
3084 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
3085     0, 0, kvm_free, "LU", "Amount of KVM free");
3086
3087 /*
3088  * grow the number of kernel page table entries, if needed
3089  */
3090 void
3091 pmap_growkernel(vm_offset_t addr)
3092 {
3093         vm_paddr_t paddr;
3094         vm_page_t nkpg;
3095         pd_entry_t *pde, newpdir;
3096         pdp_entry_t *pdpe;
3097
3098         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3099
3100         /*
3101          * Return if "addr" is within the range of kernel page table pages
3102          * that were preallocated during pmap bootstrap.  Moreover, leave
3103          * "kernel_vm_end" and the kernel page table as they were.
3104          *
3105          * The correctness of this action is based on the following
3106          * argument: vm_map_insert() allocates contiguous ranges of the
3107          * kernel virtual address space.  It calls this function if a range
3108          * ends after "kernel_vm_end".  If the kernel is mapped between
3109          * "kernel_vm_end" and "addr", then the range cannot begin at
3110          * "kernel_vm_end".  In fact, its beginning address cannot be less
3111          * than the kernel.  Thus, there is no immediate need to allocate
3112          * any new kernel page table pages between "kernel_vm_end" and
3113          * "KERNBASE".
3114          */
3115         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
3116                 return;
3117
3118         addr = roundup2(addr, NBPDR);
3119         if (addr - 1 >= vm_map_max(kernel_map))
3120                 addr = vm_map_max(kernel_map);
3121         while (kernel_vm_end < addr) {
3122                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
3123                 if ((*pdpe & X86_PG_V) == 0) {
3124                         /* We need a new PDP entry */
3125                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
3126                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3127                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3128                         if (nkpg == NULL)
3129                                 panic("pmap_growkernel: no memory to grow kernel");
3130                         if ((nkpg->flags & PG_ZERO) == 0)
3131                                 pmap_zero_page(nkpg);
3132                         paddr = VM_PAGE_TO_PHYS(nkpg);
3133                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
3134                             X86_PG_A | X86_PG_M);
3135                         continue; /* try again */
3136                 }
3137                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
3138                 if ((*pde & X86_PG_V) != 0) {
3139                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3140                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3141                                 kernel_vm_end = vm_map_max(kernel_map);
3142                                 break;
3143                         }
3144                         continue;
3145                 }
3146
3147                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
3148                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3149                     VM_ALLOC_ZERO);
3150                 if (nkpg == NULL)
3151                         panic("pmap_growkernel: no memory to grow kernel");
3152                 if ((nkpg->flags & PG_ZERO) == 0)
3153                         pmap_zero_page(nkpg);
3154                 paddr = VM_PAGE_TO_PHYS(nkpg);
3155                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
3156                 pde_store(pde, newpdir);
3157
3158                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3159                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3160                         kernel_vm_end = vm_map_max(kernel_map);
3161                         break;
3162                 }
3163         }
3164 }
3165
3166
3167 /***************************************************
3168  * page management routines.
3169  ***************************************************/
3170
3171 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
3172 CTASSERT(_NPCM == 3);
3173 CTASSERT(_NPCPV == 168);
3174
3175 static __inline struct pv_chunk *
3176 pv_to_chunk(pv_entry_t pv)
3177 {
3178
3179         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
3180 }
3181
3182 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
3183
3184 #define PC_FREE0        0xfffffffffffffffful
3185 #define PC_FREE1        0xfffffffffffffffful
3186 #define PC_FREE2        0x000000fffffffffful
3187
3188 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
3189
#ifdef PV_STATS
/*
 * Optional pv chunk and pv entry statistics, exported read-only under
 * the vm.pmap sysctl node.
 */

/* pv chunk counters. */
static int pc_chunk_count;
static int pc_chunk_allocs;
static int pc_chunk_frees;
static int pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD,
    &pc_chunk_count, 0,
    "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD,
    &pc_chunk_allocs, 0,
    "Current number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD,
    &pc_chunk_frees, 0,
    "Current number of pv entry chunks frees");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD,
    &pc_chunk_tryfail, 0,
    "Number of times tried to get a chunk page but failed.");

/* pv entry counters. */
static long pv_entry_frees;
static long pv_entry_allocs;
static long pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD,
    &pv_entry_frees, 0,
    "Current number of pv entry frees");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD,
    &pv_entry_allocs, 0,
    "Current number of pv entry allocs");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD,
    &pv_entry_count, 0,
    "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD,
    &pv_entry_spare, 0,
    "Current number of spare pv entries");
#endif
3214
3215 static void
3216 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
3217 {
3218
3219         if (pmap == NULL)
3220                 return;
3221         pmap_invalidate_all(pmap);
3222         if (pmap != locked_pmap)
3223                 PMAP_UNLOCK(pmap);
3224         if (start_di)
3225                 pmap_delayed_invl_finished();
3226 }
3227
3228 /*
3229  * We are in a serious low memory condition.  Resort to
3230  * drastic measures to free some pages so we can allocate
3231  * another pv entry chunk.
3232  *
3233  * Returns NULL if PV entries were reclaimed from the specified pmap.
3234  *
3235  * We do not, however, unmap 2mpages because subsequent accesses will
3236  * allocate per-page pv entries until repromotion occurs, thereby
3237  * exacerbating the shortage of free pv entries.
3238  */
3239 static vm_page_t
3240 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3241 {
3242         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3243         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3244         struct md_page *pvh;
3245         pd_entry_t *pde;
3246         pmap_t next_pmap, pmap;
3247         pt_entry_t *pte, tpte;
3248         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3249         pv_entry_t pv;
3250         vm_offset_t va;
3251         vm_page_t m, m_pc;
3252         struct spglist free;
3253         uint64_t inuse;
3254         int bit, field, freed;
3255         bool start_di;
3256         static int active_reclaims = 0;
3257
3258         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3259         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3260         pmap = NULL;
3261         m_pc = NULL;
3262         PG_G = PG_A = PG_M = PG_RW = 0;
3263         SLIST_INIT(&free);
3264         bzero(&pc_marker_b, sizeof(pc_marker_b));
3265         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3266         pc_marker = (struct pv_chunk *)&pc_marker_b;
3267         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3268
3269         /*
3270          * A delayed invalidation block should already be active if
3271          * pmap_advise() or pmap_remove() called this function by way
3272          * of pmap_demote_pde_locked().
3273          */
3274         start_di = pmap_not_in_di();
3275
3276         mtx_lock(&pv_chunks_mutex);
3277         active_reclaims++;
3278         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
3279         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
3280         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3281             SLIST_EMPTY(&free)) {
3282                 next_pmap = pc->pc_pmap;
3283                 if (next_pmap == NULL) {
3284                         /*
3285                          * The next chunk is a marker.  However, it is
3286                          * not our marker, so active_reclaims must be
3287                          * > 1.  Consequently, the next_chunk code
3288                          * will not rotate the pv_chunks list.
3289                          */
3290                         goto next_chunk;
3291                 }
3292                 mtx_unlock(&pv_chunks_mutex);
3293
3294                 /*
3295                  * A pv_chunk can only be removed from the pc_lru list
3296                  * when both pc_chunks_mutex is owned and the
3297                  * corresponding pmap is locked.
3298                  */
3299                 if (pmap != next_pmap) {
3300                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
3301                             start_di);
3302                         pmap = next_pmap;
3303                         /* Avoid deadlock and lock recursion. */
3304                         if (pmap > locked_pmap) {
3305                                 RELEASE_PV_LIST_LOCK(lockp);
3306                                 PMAP_LOCK(pmap);
3307                                 if (start_di)
3308                                         pmap_delayed_invl_started();
3309                                 mtx_lock(&pv_chunks_mutex);
3310                                 continue;
3311                         } else if (pmap != locked_pmap) {
3312                                 if (PMAP_TRYLOCK(pmap)) {
3313                                         if (start_di)
3314                                                 pmap_delayed_invl_started();
3315                                         mtx_lock(&pv_chunks_mutex);
3316                                         continue;
3317                                 } else {
3318                                         pmap = NULL; /* pmap is not locked */
3319                                         mtx_lock(&pv_chunks_mutex);
3320                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
3321                                         if (pc == NULL ||
3322                                             pc->pc_pmap != next_pmap)
3323                                                 continue;
3324                                         goto next_chunk;
3325                                 }
3326                         } else if (start_di)
3327                                 pmap_delayed_invl_started();
3328                         PG_G = pmap_global_bit(pmap);
3329                         PG_A = pmap_accessed_bit(pmap);
3330                         PG_M = pmap_modified_bit(pmap);
3331                         PG_RW = pmap_rw_bit(pmap);
3332                 }
3333
3334                 /*
3335                  * Destroy every non-wired, 4 KB page mapping in the chunk.
3336                  */
3337                 freed = 0;
3338                 for (field = 0; field < _NPCM; field++) {
3339                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3340                             inuse != 0; inuse &= ~(1UL << bit)) {
3341                                 bit = bsfq(inuse);
3342                                 pv = &pc->pc_pventry[field * 64 + bit];
3343                                 va = pv->pv_va;
3344                                 pde = pmap_pde(pmap, va);
3345                                 if ((*pde & PG_PS) != 0)
3346                                         continue;
3347                                 pte = pmap_pde_to_pte(pde, va);
3348                                 if ((*pte & PG_W) != 0)
3349                                         continue;
3350                                 tpte = pte_load_clear(pte);
3351                                 if ((tpte & PG_G) != 0)
3352                                         pmap_invalidate_page(pmap, va);
3353                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3354                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3355                                         vm_page_dirty(m);
3356                                 if ((tpte & PG_A) != 0)
3357                                         vm_page_aflag_set(m, PGA_REFERENCED);
3358                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3359                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3360                                 m->md.pv_gen++;
3361                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
3362                                     (m->flags & PG_FICTITIOUS) == 0) {
3363                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3364                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
3365                                                 vm_page_aflag_clear(m,
3366                                                     PGA_WRITEABLE);
3367                                         }
3368                                 }
3369                                 pmap_delayed_invl_page(m);
3370                                 pc->pc_map[field] |= 1UL << bit;
3371                                 pmap_unuse_pt(pmap, va, *pde, &free);
3372                                 freed++;
3373                         }
3374                 }
3375                 if (freed == 0) {
3376                         mtx_lock(&pv_chunks_mutex);
3377                         goto next_chunk;
3378                 }
3379                 /* Every freed mapping is for a 4 KB page. */
3380                 pmap_resident_count_dec(pmap, freed);
3381                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3382                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3383                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3384                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3385                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
3386                     pc->pc_map[2] == PC_FREE2) {
3387                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3388                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3389                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3390                         /* Entire chunk is free; return it. */
3391                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3392                         dump_drop_page(m_pc->phys_addr);
3393                         mtx_lock(&pv_chunks_mutex);
3394                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3395                         break;
3396                 }
3397                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3398                 mtx_lock(&pv_chunks_mutex);
3399                 /* One freed pv entry in locked_pmap is sufficient. */
3400                 if (pmap == locked_pmap)
3401                         break;
3402 next_chunk:
3403                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3404                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
3405                 if (active_reclaims == 1 && pmap != NULL) {
3406                         /*
3407                          * Rotate the pv chunks list so that we do not
3408                          * scan the same pv chunks that could not be
3409                          * freed (because they contained a wired
3410                          * and/or superpage mapping) on every
3411                          * invocation of reclaim_pv_chunk().
3412                          */
3413                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
3414                                 MPASS(pc->pc_pmap != NULL);
3415                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3416                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3417                         }
3418                 }
3419         }
3420         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3421         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
3422         active_reclaims--;
3423         mtx_unlock(&pv_chunks_mutex);
3424         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
3425         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3426                 m_pc = SLIST_FIRST(&free);
3427                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3428                 /* Recycle a freed page table page. */
3429                 m_pc->wire_count = 1;
3430         }
3431         vm_page_free_pages_toq(&free, true);
3432         return (m_pc);
3433 }
3434
3435 /*
3436  * free the pv_entry back to the free list
3437  */
3438 static void
3439 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3440 {
3441         struct pv_chunk *pc;
3442         int idx, field, bit;
3443
3444         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3445         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3446         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3447         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3448         pc = pv_to_chunk(pv);
3449         idx = pv - &pc->pc_pventry[0];
3450         field = idx / 64;
3451         bit = idx % 64;
3452         pc->pc_map[field] |= 1ul << bit;
3453         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
3454             pc->pc_map[2] != PC_FREE2) {
3455                 /* 98% of the time, pc is already at the head of the list. */
3456                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3457                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3458                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3459                 }
3460                 return;
3461         }
3462         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3463         free_pv_chunk(pc);
3464 }
3465
3466 static void
3467 free_pv_chunk(struct pv_chunk *pc)
3468 {
3469         vm_page_t m;
3470
3471         mtx_lock(&pv_chunks_mutex);
3472         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3473         mtx_unlock(&pv_chunks_mutex);
3474         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3475         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3476         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3477         /* entire chunk is free, return it */
3478         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3479         dump_drop_page(m->phys_addr);
3480         vm_page_unwire(m, PQ_NONE);
3481         vm_page_free(m);
3482 }
3483
3484 /*
3485  * Returns a new PV entry, allocating a new PV chunk from the system when
3486  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3487  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3488  * returned.
3489  *
3490  * The given PV list lock may be released.
3491  */
3492 static pv_entry_t
3493 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3494 {
3495         int bit, field;
3496         pv_entry_t pv;
3497         struct pv_chunk *pc;
3498         vm_page_t m;
3499
3500         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3501         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3502 retry:
3503         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3504         if (pc != NULL) {
3505                 for (field = 0; field < _NPCM; field++) {
3506                         if (pc->pc_map[field]) {
3507                                 bit = bsfq(pc->pc_map[field]);
3508                                 break;
3509                         }
3510                 }
3511                 if (field < _NPCM) {
3512                         pv = &pc->pc_pventry[field * 64 + bit];
3513                         pc->pc_map[field] &= ~(1ul << bit);
3514                         /* If this was the last item, move it to tail */
3515                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3516                             pc->pc_map[2] == 0) {
3517                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3518                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3519                                     pc_list);
3520                         }
3521                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
3522                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3523                         return (pv);
3524                 }
3525         }
3526         /* No free items, allocate another chunk */
3527         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3528             VM_ALLOC_WIRED);
3529         if (m == NULL) {
3530                 if (lockp == NULL) {
3531                         PV_STAT(pc_chunk_tryfail++);
3532                         return (NULL);
3533                 }
3534                 m = reclaim_pv_chunk(pmap, lockp);
3535                 if (m == NULL)
3536                         goto retry;
3537         }
3538         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3539         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3540         dump_add_page(m->phys_addr);
3541         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3542         pc->pc_pmap = pmap;
3543         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
3544         pc->pc_map[1] = PC_FREE1;
3545         pc->pc_map[2] = PC_FREE2;
3546         mtx_lock(&pv_chunks_mutex);
3547         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3548         mtx_unlock(&pv_chunks_mutex);
3549         pv = &pc->pc_pventry[0];
3550         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3551         PV_STAT(atomic_add_long(&pv_entry_count, 1));
3552         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3553         return (pv);
3554 }
3555
3556 /*
3557  * Returns the number of one bits within the given PV chunk map.
3558  *
3559  * The erratas for Intel processors state that "POPCNT Instruction May
3560  * Take Longer to Execute Than Expected".  It is believed that the
3561  * issue is the spurious dependency on the destination register.
3562  * Provide a hint to the register rename logic that the destination
3563  * value is overwritten, by clearing it, as suggested in the
3564  * optimization manual.  It should be cheap for unaffected processors
3565  * as well.
3566  *
3567  * Reference numbers for erratas are
3568  * 4th Gen Core: HSD146
3569  * 5th Gen Core: BDM85
3570  * 6th Gen Core: SKL029
3571  */
3572 static int
3573 popcnt_pc_map_pq(uint64_t *map)
3574 {
3575         u_long result, tmp;
3576
3577         __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
3578             "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
3579             "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
3580             : "=&r" (result), "=&r" (tmp)
3581             : "m" (map[0]), "m" (map[1]), "m" (map[2]));
3582         return (result);
3583 }
3584
3585 /*
3586  * Ensure that the number of spare PV entries in the specified pmap meets or
3587  * exceeds the given count, "needed".
3588  *
3589  * The given PV list lock may be released.
3590  */
3591 static void
3592 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3593 {
3594         struct pch new_tail;
3595         struct pv_chunk *pc;
3596         vm_page_t m;
3597         int avail, free;
3598         bool reclaimed;
3599
3600         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3601         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3602
3603         /*
3604          * Newly allocated PV chunks must be stored in a private list until
3605          * the required number of PV chunks have been allocated.  Otherwise,
3606          * reclaim_pv_chunk() could recycle one of these chunks.  In
3607          * contrast, these chunks must be added to the pmap upon allocation.
3608          */
3609         TAILQ_INIT(&new_tail);
3610 retry:
3611         avail = 0;
3612         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3613 #ifndef __POPCNT__
3614                 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
3615                         bit_count((bitstr_t *)pc->pc_map, 0,
3616                             sizeof(pc->pc_map) * NBBY, &free);
3617                 else
3618 #endif
3619                 free = popcnt_pc_map_pq(pc->pc_map);
3620                 if (free == 0)
3621                         break;
3622                 avail += free;
3623                 if (avail >= needed)
3624                         break;
3625         }
3626         for (reclaimed = false; avail < needed; avail += _NPCPV) {
3627                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3628                     VM_ALLOC_WIRED);
3629                 if (m == NULL) {
3630                         m = reclaim_pv_chunk(pmap, lockp);
3631                         if (m == NULL)
3632                                 goto retry;
3633                         reclaimed = true;
3634                 }
3635                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3636                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3637                 dump_add_page(m->phys_addr);
3638                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3639                 pc->pc_pmap = pmap;
3640                 pc->pc_map[0] = PC_FREE0;
3641                 pc->pc_map[1] = PC_FREE1;
3642                 pc->pc_map[2] = PC_FREE2;
3643                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3644                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3645                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3646
3647                 /*
3648                  * The reclaim might have freed a chunk from the current pmap.
3649                  * If that chunk contained available entries, we need to
3650                  * re-count the number of available entries.
3651                  */
3652                 if (reclaimed)
3653                         goto retry;
3654         }
3655         if (!TAILQ_EMPTY(&new_tail)) {
3656                 mtx_lock(&pv_chunks_mutex);
3657                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3658                 mtx_unlock(&pv_chunks_mutex);
3659         }
3660 }
3661
3662 /*
3663  * First find and then remove the pv entry for the specified pmap and virtual
3664  * address from the specified pv list.  Returns the pv entry if found and NULL
3665  * otherwise.  This operation can be performed on pv lists for either 4KB or
3666  * 2MB page mappings.
3667  */
3668 static __inline pv_entry_t
3669 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3670 {
3671         pv_entry_t pv;
3672
3673         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3674                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3675                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3676                         pvh->pv_gen++;
3677                         break;
3678                 }
3679         }
3680         return (pv);
3681 }
3682
3683 /*
3684  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3685  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3686  * entries for each of the 4KB page mappings.
3687  */
3688 static void
3689 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3690     struct rwlock **lockp)
3691 {
3692         struct md_page *pvh;
3693         struct pv_chunk *pc;
3694         pv_entry_t pv;
3695         vm_offset_t va_last;
3696         vm_page_t m;
3697         int bit, field;
3698
3699         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3700         KASSERT((pa & PDRMASK) == 0,
3701             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3702         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3703
3704         /*
3705          * Transfer the 2mpage's pv entry for this mapping to the first
3706          * page's pv list.  Once this transfer begins, the pv list lock
3707          * must not be released until the last pv entry is reinstantiated.
3708          */
3709         pvh = pa_to_pvh(pa);
3710         va = trunc_2mpage(va);
3711         pv = pmap_pvh_remove(pvh, pmap, va);
3712         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3713         m = PHYS_TO_VM_PAGE(pa);
3714         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3715         m->md.pv_gen++;
3716         /* Instantiate the remaining NPTEPG - 1 pv entries. */
3717         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3718         va_last = va + NBPDR - PAGE_SIZE;
3719         for (;;) {
3720                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3721                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3722                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3723                 for (field = 0; field < _NPCM; field++) {
3724                         while (pc->pc_map[field]) {
3725                                 bit = bsfq(pc->pc_map[field]);
3726                                 pc->pc_map[field] &= ~(1ul << bit);
3727                                 pv = &pc->pc_pventry[field * 64 + bit];
3728                                 va += PAGE_SIZE;
3729                                 pv->pv_va = va;
3730                                 m++;
3731                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3732                             ("pmap_pv_demote_pde: page %p is not managed", m));
3733                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3734                                 m->md.pv_gen++;
3735                                 if (va == va_last)
3736                                         goto out;
3737                         }
3738                 }
3739                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3740                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3741         }
3742 out:
3743         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3744                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3745                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3746         }
3747         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3748         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3749 }
3750
#if VM_NRESERVLEVEL > 0
/*
 * Replaces the pv entries of 512 4KB page mappings that have been
 * promoted by a single pv entry for the resulting 2MB page mapping.  The
 * first 4KB page's pv entry is reused for the 2MB page mapping; the other
 * NPTEPG - 1 entries are freed.
 */
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
	struct md_page *pvh;
	pv_entry_t pv;
	vm_offset_t va_last;
	vm_page_t m;

	KASSERT((pa & PDRMASK) == 0,
	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

	/*
	 * Transfer the first page's pv entry for this mapping to the 2mpage's
	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
	 * a transfer avoids the possibility that get_pv_entry() calls
	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
	 * mappings that is being promoted.
	 */
	m = PHYS_TO_VM_PAGE(pa);
	va = trunc_2mpage(va);
	pv = pmap_pvh_remove(&m->md, pmap, va);
	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
	pvh = pa_to_pvh(pa);
	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
	pvh->pv_gen++;

	/* Free the remaining NPTEPG - 1 pv entries. */
	va_last = va + NBPDR - PAGE_SIZE;
	for (;;) {
		m++;
		va += PAGE_SIZE;
		pmap_pvh_free(&m->md, pmap, va);
		if (va >= va_last)
			break;
	}
}
#endif /* VM_NRESERVLEVEL > 0 */
3793
3794 /*
3795  * First find and then destroy the pv entry for the specified pmap and virtual
3796  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3797  * page mappings.
3798  */
3799 static void
3800 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3801 {
3802         pv_entry_t pv;
3803
3804         pv = pmap_pvh_remove(pvh, pmap, va);
3805         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3806         free_pv_entry(pmap, pv);
3807 }
3808
3809 /*
3810  * Conditionally create the PV entry for a 4KB page mapping if the required
3811  * memory can be allocated without resorting to reclamation.
3812  */
3813 static boolean_t
3814 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3815     struct rwlock **lockp)
3816 {
3817         pv_entry_t pv;
3818
3819         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3820         /* Pass NULL instead of the lock pointer to disable reclamation. */
3821         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3822                 pv->pv_va = va;
3823                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3824                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3825                 m->md.pv_gen++;
3826                 return (TRUE);
3827         } else
3828                 return (FALSE);
3829 }
3830
3831 /*
3832  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3833  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3834  * false if the PV entry cannot be allocated without resorting to reclamation.
3835  */
3836 static bool
3837 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
3838     struct rwlock **lockp)
3839 {
3840         struct md_page *pvh;
3841         pv_entry_t pv;
3842         vm_paddr_t pa;
3843
3844         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3845         /* Pass NULL instead of the lock pointer to disable reclamation. */
3846         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3847             NULL : lockp)) == NULL)
3848                 return (false);
3849         pv->pv_va = va;
3850         pa = pde & PG_PS_FRAME;
3851         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3852         pvh = pa_to_pvh(pa);
3853         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3854         pvh->pv_gen++;
3855         return (true);
3856 }
3857
3858 /*
3859  * Fills a page table page with mappings to consecutive physical pages.
3860  */
3861 static void
3862 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3863 {
3864         pt_entry_t *pte;
3865
3866         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3867                 *pte = newpte;
3868                 newpte += PAGE_SIZE;
3869         }
3870 }
3871
3872 /*
3873  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3874  * mapping is invalidated.
3875  */
3876 static boolean_t
3877 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3878 {
3879         struct rwlock *lock;
3880         boolean_t rv;
3881
3882         lock = NULL;
3883         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3884         if (lock != NULL)
3885                 rw_wunlock(lock);
3886         return (rv);
3887 }
3888
3889 static boolean_t
3890 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3891     struct rwlock **lockp)
3892 {
3893         pd_entry_t newpde, oldpde;
3894         pt_entry_t *firstpte, newpte;
3895         pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3896         vm_paddr_t mptepa;
3897         vm_page_t mpte;
3898         struct spglist free;
3899         vm_offset_t sva;
3900         int PG_PTE_CACHE;
3901
             /*
              * Replaces the 2MB mapping stored in "*pde", which covers "va",
              * with a pointer to a page table page that holds the equivalent
              * 4KB mappings.  The pmap lock must be held (asserted below).
              * "lockp" is passed on to the PV helpers and pmap_remove_pde().
              *
              * Returns TRUE on success.  Returns FALSE if the mapping was
              * never accessed or no page table page could be obtained; in
              * that case the 2MB mapping has been removed.
              *
              * NOTE(review): PG_PTE_CACHE is never referenced by name below;
              * presumably the PG_PTE_PROMOTE macro expands to it -- confirm.
              */
3902         PG_G = pmap_global_bit(pmap);
3903         PG_A = pmap_accessed_bit(pmap);
3904         PG_M = pmap_modified_bit(pmap);
3905         PG_RW = pmap_rw_bit(pmap);
3906         PG_V = pmap_valid_bit(pmap);
3907         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
3908
3909         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3910         oldpde = *pde;
3911         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
3912             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
             /*
              * Ask pmap_remove_pt_page() for an existing page table page.
              * Because "||" short-circuits, "mpte" is not assigned when PG_A
              * is clear; the identical PG_A test in the inner "if" then takes
              * the failure path, so "mpte" is never read unassigned.
              */
3913         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
3914             NULL) {
3915                 KASSERT((oldpde & PG_W) == 0,
3916                     ("pmap_demote_pde: page table page for a wired mapping"
3917                     " is missing"));
3918
3919                 /*
3920                  * Invalidate the 2MB page mapping and return "failure" if the
3921                  * mapping was never accessed or the allocation of the new
3922                  * page table page fails.  If the 2MB page mapping belongs to
3923                  * the direct map region of the kernel's address space, then
3924                  * the page allocation request specifies the highest possible
3925                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
3926                  * normal.  Page table pages are preallocated for every other
3927                  * part of the kernel address space, so the direct map region
3928                  * is the only part of the kernel address space that must be
3929                  * handled here.
3930                  */
3931                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
3932                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
3933                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
3934                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
                             /*
                              * pmap_remove_pde() itself invalidates the TLB
                              * for a PG_G mapping, so only the non-global
                              * case needs an explicit invalidation here.
                              */
3935                         SLIST_INIT(&free);
3936                         sva = trunc_2mpage(va);
3937                         pmap_remove_pde(pmap, pde, sva, &free, lockp);
3938                         if ((oldpde & PG_G) == 0)
3939                                 pmap_invalidate_pde_page(pmap, sva, oldpde);
3940                         vm_page_free_pages_toq(&free, true);
3941                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
3942                             " in pmap %p", va, pmap);
3943                         return (FALSE);
3944                 }
                     /*
                      * A newly allocated page table page is added to the
                      * resident count for user addresses only.
                      */
3945                 if (va < VM_MAXUSER_ADDRESS)
3946                         pmap_resident_count_inc(pmap, 1);
3947         }
             /*
              * The new PDE points at the page table page and always carries
              * PG_M, PG_A, PG_RW and PG_V; only PG_U is inherited from the
              * old PDE.  The template PTE is the old PDE without PG_PS.
              * NOTE(review): pmap_swap_pat() presumably moves the PAT bit to
              * the position used by 4KB entries -- confirm.
              */
3948         mptepa = VM_PAGE_TO_PHYS(mpte);
3949         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
3950         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
3951         KASSERT((oldpde & PG_A) != 0,
3952             ("pmap_demote_pde: oldpde is missing PG_A"));
3953         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
3954             ("pmap_demote_pde: oldpde is missing PG_M"));
3955         newpte = oldpde & ~PG_PS;
3956         newpte = pmap_swap_pat(pmap, newpte);
3957
3958         /*
3959          * If the page table page is new, initialize it.
              * A wire count of 1 is what identifies the page as new here;
              * the count is then raised to NPTEPG, one per PTE filled in.
3960          */
3961         if (mpte->wire_count == 1) {
3962                 mpte->wire_count = NPTEPG;
3963                 pmap_fill_ptp(firstpte, newpte);
3964         }
3965         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
3966             ("pmap_demote_pde: firstpte and newpte map different physical"
3967             " addresses"));
3968
3969         /*
3970          * If the mapping has changed attributes, update the page table
3971          * entries.
3972          */
3973         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
3974                 pmap_fill_ptp(firstpte, newpte);
3975
3976         /*
3977          * The spare PV entries must be reserved prior to demoting the
3978          * mapping, that is, prior to changing the PDE.  Otherwise, the state
3979          * of the PDE and the PV lists will be inconsistent, which can result
3980          * in reclaim_pv_chunk() attempting to remove a PV entry from the
3981          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
3982          * PV entry for the 2MB page mapping that is being demoted.
3983          */
3984         if ((oldpde & PG_MANAGED) != 0)
3985                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
3986
3987         /*
3988          * Demote the mapping.  This pmap is locked.  The old PDE has
3989          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
3990          * set.  Thus, there is no danger of a race with another
3991          * processor changing the setting of PG_A and/or PG_M between
3992          * the read above and the store below.
3993          */
3994         if (workaround_erratum383)
3995                 pmap_update_pde(pmap, va, pde, newpde);
3996         else
3997                 pde_store(pde, newpde);
3998
3999         /*
4000          * Invalidate a stale recursive mapping of the page table page.
              * This is done only for addresses at or above
              * VM_MAXUSER_ADDRESS.
4001          */
4002         if (va >= VM_MAXUSER_ADDRESS)
4003                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4004
4005         /*
4006          * Demote the PV entry.
4007          */
4008         if ((oldpde & PG_MANAGED) != 0)
4009                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
4010
4011         atomic_add_long(&pmap_pde_demotions, 1);
4012         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
4013             " in pmap %p", va, pmap);
4014         return (TRUE);
4015 }
4016
4017 /*
4018  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
      *
      * Only valid for kernel_pmap (asserted) with the pmap lock held.
      * The PDE is not left empty: it is rewritten to point at the page
      * table page returned by pmap_remove_pt_page(), after that page has
      * been zero-filled, so every PTE in it is 0.  Panics if no page
      * table page is found for "va".
4019  */
4020 static void
4021 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4022 {
4023         pd_entry_t newpde;
4024         vm_paddr_t mptepa;
4025         vm_page_t mpte;
4026
4027         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4028         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4029         mpte = pmap_remove_pt_page(pmap, va);
4030         if (mpte == NULL)
4031                 panic("pmap_remove_kernel_pde: Missing pt page.");
4032
             /*
              * The fixed X86_PG_* constants are used here rather than the
              * per-pmap bit lookups used elsewhere in this file.
              */
4033         mptepa = VM_PAGE_TO_PHYS(mpte);
4034         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
4035
4036         /*
4037          * Initialize the page table page.
4038          */
4039         pagezero((void *)PHYS_TO_DMAP(mptepa));
4040
4041         /*
4042          * Demote the mapping.
4043          */
4044         if (workaround_erratum383)
4045                 pmap_update_pde(pmap, va, pde, newpde);
4046         else
4047                 pde_store(pde, newpde);
4048
4049         /*
4050          * Invalidate a stale recursive mapping of the page table page.
4051          */
4052         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4053 }
4054
4055 /*
4056  * pmap_remove_pde: do the things to unmap a superpage in a process
      *
      * Clears the 2MB mapping at "*pdq", which maps the 2MB-aligned address
      * "sva" (asserted), and adjusts the wired and resident counts.  For a
      * managed mapping the PV entry is freed and the dirty and referenced
      * state is passed on to each 4KB vm_page.  The pmap lock must be held.
      *
      * Only a PG_G mapping has its TLB entry invalidated here; for other
      * mappings the invalidation is left to the caller.
      *
      * Returns the result of pmap_unuse_pt(), which is called with the PDP
      * entry for "sva".
4057  */
4058 static int
4059 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
4060     struct spglist *free, struct rwlock **lockp)
4061 {
4062         struct md_page *pvh;
4063         pd_entry_t oldpde;
4064         vm_offset_t eva, va;
4065         vm_page_t m, mpte;
4066         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
4067
4068         PG_G = pmap_global_bit(pmap);
4069         PG_A = pmap_accessed_bit(pmap);
4070         PG_M = pmap_modified_bit(pmap);
4071         PG_RW = pmap_rw_bit(pmap);
4072
4073         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4074         KASSERT((sva & PDRMASK) == 0,
4075             ("pmap_remove_pde: sva is not 2mpage aligned"));
             /*
              * Everything below works from the value that pte_load_clear()
              * returned.  One 2MB mapping accounts for NBPDR / PAGE_SIZE
              * pages in both the wired and the resident counts.
              */
4076         oldpde = pte_load_clear(pdq);
4077         if (oldpde & PG_W)
4078                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
4079         if ((oldpde & PG_G) != 0)
4080                 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4081         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
4082         if (oldpde & PG_MANAGED) {
4083                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
4084                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
4085                 pmap_pvh_free(pvh, pmap, sva);
4086                 eva = sva + NBPDR;
                     /*
                      * Walk the 4KB pages that made up the superpage.  A page
                      * is dirtied only if the mapping was both writable and
                      * modified.  PGA_WRITEABLE is cleared once neither the
                      * page's own PV list nor the superpage PV list has any
                      * entry left.
                      */
4087                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4088                     va < eva; va += PAGE_SIZE, m++) {
4089                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4090                                 vm_page_dirty(m);
4091                         if (oldpde & PG_A)
4092                                 vm_page_aflag_set(m, PGA_REFERENCED);
4093                         if (TAILQ_EMPTY(&m->md.pv_list) &&
4094                             TAILQ_EMPTY(&pvh->pv_list))
4095                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4096                         pmap_delayed_invl_page(m);
4097                 }
4098         }
             /*
              * The kernel pmap gets its PDE rewritten by
              * pmap_remove_kernel_pde().  For any other pmap, a page table
              * page found for "sva" is uncounted, has its wire count reset
              * to 0 and is passed to pmap_add_delayed_free_list() along with
              * "free".
              */
4099         if (pmap == kernel_pmap) {
4100                 pmap_remove_kernel_pde(pmap, pdq, sva);
4101         } else {
4102                 mpte = pmap_remove_pt_page(pmap, sva);
4103                 if (mpte != NULL) {
4104                         pmap_resident_count_dec(pmap, 1);
4105                         KASSERT(mpte->wire_count == NPTEPG,
4106                             ("pmap_remove_pde: pte page wire count error"));
4107                         mpte->wire_count = 0;
4108                         pmap_add_delayed_free_list(mpte, free, FALSE);
4109                 }
4110         }
4111         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
4112 }
4113
4114 /*
4115  * pmap_remove_pte: do the things to unmap a page in a process
      *
      * Clears the 4KB mapping at "*ptq" for address "va" and decrements the
      * wired (if PG_W was set) and resident counts.  For a managed page,
      * the dirty and referenced state is recorded on the vm_page and the
      * PV entry is freed.  The pmap lock must be held.  No TLB invalidation
      * is issued here.
      *
      * Returns the result of pmap_unuse_pt(), which is called with
      * "ptepde", the caller's copy of the PDE that covers "va".
4116  */
4117 static int
4118 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
4119     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
4120 {
4121         struct md_page *pvh;
4122         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
4123         vm_page_t m;
4124
4125         PG_A = pmap_accessed_bit(pmap);
4126         PG_M = pmap_modified_bit(pmap);
4127         PG_RW = pmap_rw_bit(pmap);
4128
4129         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4130         oldpte = pte_load_clear(ptq);
4131         if (oldpte & PG_W)
4132                 pmap->pm_stats.wired_count -= 1;
4133         pmap_resident_count_dec(pmap, 1);
4134         if (oldpte & PG_MANAGED) {
4135                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
                     /* Dirty only if the mapping was writable and modified. */
4136                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4137                         vm_page_dirty(m);
4138                 if (oldpte & PG_A)
4139                         vm_page_aflag_set(m, PGA_REFERENCED);
4140                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4141                 pmap_pvh_free(&m->md, pmap, va);
                     /*
                      * Clear PGA_WRITEABLE once no PV entry remains on either
                      * the page's own list or the superpage list.  The
                      * superpage list is not consulted for PG_FICTITIOUS
                      * pages, so their flag is left alone here.
                      */
4142                 if (TAILQ_EMPTY(&m->md.pv_list) &&
4143                     (m->flags & PG_FICTITIOUS) == 0) {
4144                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4145                         if (TAILQ_EMPTY(&pvh->pv_list))
4146                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4147                 }
4148                 pmap_delayed_invl_page(m);
4149         }
4150         return (pmap_unuse_pt(pmap, va, ptepde, free));
4151 }
4152
4153 /*
4154  * Remove a single page from a process address space
      *
      * "pde" is the page directory entry covering "va".  Nothing is done
      * if either the PDE or the PTE lacks the valid bit.  Otherwise the
      * PTE is removed with pmap_remove_pte(), any PV list lock acquired
      * along the way is released, and the TLB entry for "va" is
      * invalidated.  The pmap lock must be held.
4155  */
4156 static void
4157 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
4158     struct spglist *free)
4159 {
4160         struct rwlock *lock;
4161         pt_entry_t *pte, PG_V;
4162
4163         PG_V = pmap_valid_bit(pmap);
4164         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4165         if ((*pde & PG_V) == 0)
4166                 return;
4167         pte = pmap_pde_to_pte(pde, va);
4168         if ((*pte & PG_V) == 0)
4169                 return;
             /* The return value of pmap_remove_pte() is not used here. */
4170         lock = NULL;
4171         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
4172         if (lock != NULL)
4173                 rw_wunlock(lock);
4174         pmap_invalidate_page(pmap, va);
4175 }
4176
4177 /*
4178  * Removes the specified range of addresses from the page table page.
      *
      * "pde" is the page directory entry covering [sva, eva).  Returns true
      * if at least one removed mapping lacked PG_G, in which case the
      * caller is expected to invalidate the TLB.  Runs of PG_G mappings are
      * invalidated here with pmap_invalidate_range().
4179  */
4180 static bool
4181 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4182     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
4183 {
4184         pt_entry_t PG_G, *pte;
4185         vm_offset_t va;
4186         bool anyvalid;
4187
4188         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4189         PG_G = pmap_global_bit(pmap);
4190         anyvalid = false;
             /*
              * "va" is the start of the current run of PG_G mappings whose
              * invalidation is still pending; "va == eva" means that no run
              * is open.
              */
4191         va = eva;
4192         for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
4193             sva += PAGE_SIZE) {
4194                 if (*pte == 0) {
                             /* An empty PTE ends any open run: flush it. */
4195                         if (va != eva) {
4196                                 pmap_invalidate_range(pmap, va, sva);
4197                                 va = eva;
4198                         }
4199                         continue;
4200                 }
4201                 if ((*pte & PG_G) == 0)
4202                         anyvalid = true;
4203                 else if (va == eva)
4204                         va = sva;
                     /*
                      * A nonzero result from pmap_remove_pte() stops the
                      * scan.  "sva" is advanced first so that the final
                      * range invalidation below covers the page that was
                      * just removed.
                      * NOTE(review): presumably nonzero means that the page
                      * table page was freed, so "pte" may no longer be
                      * dereferenced -- confirm against pmap_unuse_pt().
                      */
4205                 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
4206                         sva += PAGE_SIZE;
4207                         break;
4208                 }
4209         }
4210         if (va != eva)
4211                 pmap_invalidate_range(pmap, va, sva);
4212         return (anyvalid);
4213 }
4214
4215 /*
4216  *      Remove the given range of addresses from the specified map.
4217  *
4218  *      It is assumed that the start and end are properly
4219  *      rounded to the page size.
      *
      *      The pmap lock is acquired and released here.  Page table
      *      pages collected in the local "free" list are released only
      *      after the pmap lock has been dropped.
4220  */
4221 void
4222 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4223 {
4224         struct rwlock *lock;
4225         vm_offset_t va_next;
4226         pml4_entry_t *pml4e;
4227         pdp_entry_t *pdpe;
4228         pd_entry_t ptpaddr, *pde;
4229         pt_entry_t PG_G, PG_V;
4230         struct spglist free;
4231         int anyvalid;
4232
4233         PG_G = pmap_global_bit(pmap);
4234         PG_V = pmap_valid_bit(pmap);
4235
4236         /*
4237          * Perform an unsynchronized read.  This is, however, safe.
4238          */
4239         if (pmap->pm_stats.resident_count == 0)
4240                 return;
4241
4242         anyvalid = 0;
4243         SLIST_INIT(&free);
4244
4245         pmap_delayed_invl_started();
4246         PMAP_LOCK(pmap);
4247
4248         /*
4249          * special handling of removing one page.  a very
4250          * common operation and easy to short circuit some
4251          * code.
              *
              * "lock" is still uninitialized on this path; that is harmless
              * because "goto out" jumps past the only place it is read.
              * "anyvalid" stays 0 because pmap_remove_page() invalidates the
              * page itself.
4252          */
4253         if (sva + PAGE_SIZE == eva) {
4254                 pde = pmap_pde(pmap, sva);
4255                 if (pde && (*pde & PG_PS) == 0) {
4256                         pmap_remove_page(pmap, sva, pde, &free);
4257                         goto out;
4258                 }
4259         }
4260
4261         lock = NULL;
4262         for (; sva < eva; sva = va_next) {
4263
                     /* Stop early once the pmap has nothing left mapped. */
4264                 if (pmap->pm_stats.resident_count == 0)
4265                         break;
4266
                     /*
                      * Skip a whole PML4 or PDP slot when its entry is not
                      * valid.  "va_next < sva" means that the address
                      * arithmetic wrapped around, so the scan ends at "eva".
                      */
4267                 pml4e = pmap_pml4e(pmap, sva);
4268                 if ((*pml4e & PG_V) == 0) {
4269                         va_next = (sva + NBPML4) & ~PML4MASK;
4270                         if (va_next < sva)
4271                                 va_next = eva;
4272                         continue;
4273                 }
4274
4275                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4276                 if ((*pdpe & PG_V) == 0) {
4277                         va_next = (sva + NBPDP) & ~PDPMASK;
4278                         if (va_next < sva)
4279                                 va_next = eva;
4280                         continue;
4281                 }
4282
4283                 /*
4284                  * Calculate index for next page table.
4285                  */
4286                 va_next = (sva + NBPDR) & ~PDRMASK;
4287                 if (va_next < sva)
4288                         va_next = eva;
4289
4290                 pde = pmap_pdpe_to_pde(pdpe, sva);
4291                 ptpaddr = *pde;
4292
4293                 /*
4294                  * Weed out invalid mappings.
4295                  */
4296                 if (ptpaddr == 0)
4297                         continue;
4298
4299                 /*
4300                  * Check for large page.
4301                  */
4302                 if ((ptpaddr & PG_PS) != 0) {
4303                         /*
4304                          * Are we removing the entire large page?  If not,
4305                          * demote the mapping and fall through.
                              *
                              * The test holds only when "sva" is 2MB aligned
                              * and the range reaches the end of the 2MB page.
4306                          */
4307                         if (sva + NBPDR == va_next && eva >= va_next) {
4308                                 /*
4309                                  * The TLB entry for a PG_G mapping is
4310                                  * invalidated by pmap_remove_pde().
4311                                  */
4312                                 if ((ptpaddr & PG_G) == 0)
4313                                         anyvalid = 1;
4314                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
4315                                 continue;
4316                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
4317                             &lock)) {
4318                                 /* The large page mapping was destroyed. */
4319                                 continue;
4320                         } else
4321                                 ptpaddr = *pde;
4322                 }
4323
4324                 /*
4325                  * Limit our scan to either the end of the va represented
4326                  * by the current page table page, or to the end of the
4327                  * range being removed.
4328                  */
4329                 if (va_next > eva)
4330                         va_next = eva;
4331
4332                 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
4333                         anyvalid = 1;
4334         }
4335         if (lock != NULL)
4336                 rw_wunlock(lock);
4337 out:
             /*
              * One full invalidation covers every non-PG_G mapping removed
              * by the loop above.
              */
4338         if (anyvalid)
4339                 pmap_invalidate_all(pmap);
4340         PMAP_UNLOCK(pmap);
4341         pmap_delayed_invl_finished();
4342         vm_page_free_pages_toq(&free, true);
4343 }
4344
4345 /*
4346  *      Routine:        pmap_remove_all
4347  *      Function:
4348  *              Removes this physical page from
4349  *              all physical maps in which it resides.
4350  *              Reflects back modify bits to the pager.
4351  *
4352  *      Notes:
4353  *              Original versions of this routine were very
4354  *              inefficient because they iteratively called
4355  *              pmap_remove (slow...)
      *
      *              The page must be managed (asserted).  Work is done
      *              in two passes under the page's PV list lock: first
      *              every 2MB mapping that includes the page is demoted,
      *              then every 4KB mapping of the page is destroyed.
4356  */
4357
4358 void
4359 pmap_remove_all(vm_page_t m)
4360 {
4361         struct md_page *pvh;
4362         pv_entry_t pv;
4363         pmap_t pmap;
4364         struct rwlock *lock;
4365         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
4366         pd_entry_t *pde;
4367         vm_offset_t va;
4368         struct spglist free;
4369         int pvh_gen, md_gen;
4370
4371         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4372             ("pmap_remove_all: page %p is not managed", m));
4373         SLIST_INIT(&free);
4374         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
             /* Fictitious pages use the shared pv_dummy list head instead. */
4375         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4376             pa_to_pvh(VM_PAGE_TO_PHYS(m));
4377 retry:
4378         rw_wlock(lock);
             /*
              * First pass: demote every 2MB mapping on the superpage PV
              * list.  The result of the demotion is deliberately ignored.
              * NOTE(review): presumably either outcome takes the entry off
              * pvh->pv_list (success through pmap_pv_demote_pde(), failure
              * through pmap_remove_pde()), which is what lets this loop
              * terminate -- confirm.
              */
4379         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4380                 pmap = PV_PMAP(pv);
                     /*
                      * If PMAP_TRYLOCK() fails, remember the list generation,
                      * drop the PV list lock, take the pmap lock with
                      * PMAP_LOCK(), and retake the PV list lock.  A changed
                      * generation means the list was modified in between, so
                      * "pv" cannot be trusted and the scan starts over.
                      */
4381                 if (!PMAP_TRYLOCK(pmap)) {
4382                         pvh_gen = pvh->pv_gen;
4383                         rw_wunlock(lock);
4384                         PMAP_LOCK(pmap);
4385                         rw_wlock(lock);
4386                         if (pvh_gen != pvh->pv_gen) {
4387                                 rw_wunlock(lock);
4388                                 PMAP_UNLOCK(pmap);
4389                                 goto retry;
4390                         }
4391                 }
4392                 va = pv->pv_va;
4393                 pde = pmap_pde(pmap, va);
4394                 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
4395                 PMAP_UNLOCK(pmap);
4396         }
             /*
              * Second pass: destroy each 4KB mapping on the page's own PV
              * list.  Both generation counters are checked here, because a
              * change to either list while the PV list lock was dropped
              * forces a restart from "retry".
              */
4397         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4398                 pmap = PV_PMAP(pv);
4399                 if (!PMAP_TRYLOCK(pmap)) {
4400                         pvh_gen = pvh->pv_gen;
4401                         md_gen = m->md.pv_gen;
4402                         rw_wunlock(lock);
4403                         PMAP_LOCK(pmap);
4404                         rw_wlock(lock);
4405                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4406                                 rw_wunlock(lock);
4407                                 PMAP_UNLOCK(pmap);
4408                                 goto retry;
4409                         }
4410                 }
                     /* The bit layout is looked up per pmap, so per entry. */
4411                 PG_A = pmap_accessed_bit(pmap);
4412                 PG_M = pmap_modified_bit(pmap);
4413                 PG_RW = pmap_rw_bit(pmap);
4414                 pmap_resident_count_dec(pmap, 1);
4415                 pde = pmap_pde(pmap, pv->pv_va);
4416                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
4417                     " a 2mpage in page %p's pv list", m));
4418                 pte = pmap_pde_to_pte(pde, pv->pv_va);
4419                 tpte = pte_load_clear(pte);
4420                 if (tpte & PG_W)
4421                         pmap->pm_stats.wired_count--;
4422                 if (tpte & PG_A)
4423                         vm_page_aflag_set(m, PGA_REFERENCED);
4424
4425                 /*
4426                  * Update the vm_page_t clean and reference bits.
4427                  */
4428                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4429                         vm_page_dirty(m);
                     /*
                      * The TLB entry is invalidated before the PV entry is
                      * unlinked and freed; bumping pv_gen lets the waiters
                      * above notice the list change.
                      */
4430                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
4431                 pmap_invalidate_page(pmap, pv->pv_va);
4432                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4433                 m->md.pv_gen++;
4434                 free_pv_entry(pmap, pv);
4435                 PMAP_UNLOCK(pmap);
4436         }
             /*
              * No mapping of the page remains, so it is no longer writeable.
              * The collected page table pages are freed only after the PV
              * list lock has been released.
              */
4437         vm_page_aflag_clear(m, PGA_WRITEABLE);
4438         rw_wunlock(lock);
4439         pmap_delayed_invl_wait(m);
4440         vm_page_free_pages_toq(&free, true);
4441 }
4442
4443 /*
4444  * pmap_protect_pde: do the things to protect a 2mpage in a process
      *
      * Applies "prot" to the 2MB mapping at "*pde", which maps the
      * 2MB-aligned address "sva" (asserted).  PG_RW and PG_M are cleared
      * when VM_PROT_WRITE is absent, and pg_nx is set when VM_PROT_EXECUTE
      * is absent.  The pmap lock must be held.
      *
      * Returns TRUE when the PDE was changed and the mapping is not PG_G,
      * meaning the caller still has to invalidate the TLB.  A changed PG_G
      * mapping is invalidated here and FALSE is returned.
4445  */
4446 static boolean_t
4447 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
4448 {
4449         pd_entry_t newpde, oldpde;
4450         vm_offset_t eva, va;
4451         vm_page_t m;
4452         boolean_t anychanged;
4453         pt_entry_t PG_G, PG_M, PG_RW;
4454
4455         PG_G = pmap_global_bit(pmap);
4456         PG_M = pmap_modified_bit(pmap);
4457         PG_RW = pmap_rw_bit(pmap);
4458
4459         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4460         KASSERT((sva & PDRMASK) == 0,
4461             ("pmap_protect_pde: sva is not 2mpage aligned"));
4462         anychanged = FALSE;
4463 retry:
             /*
              * Re-read the PDE on every attempt, because the compare-and-set
              * below fails if the entry changed after this read.
              *
              * NOTE(review): the pages are marked dirty whenever the old PDE
              * is managed, writable and modified, even if "prot" keeps write
              * access; after a failed compare-and-set this loop over the
              * pages runs again.
              */
4464         oldpde = newpde = *pde;
4465         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4466             (PG_MANAGED | PG_M | PG_RW)) {
4467                 eva = sva + NBPDR;
4468                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4469                     va < eva; va += PAGE_SIZE, m++)
4470                         vm_page_dirty(m);
4471         }
4472         if ((prot & VM_PROT_WRITE) == 0)
4473                 newpde &= ~(PG_RW | PG_M);
4474         if ((prot & VM_PROT_EXECUTE) == 0)
4475                 newpde |= pg_nx;
4476         if (newpde != oldpde) {
4477                 /*
4478                  * As an optimization to future operations on this PDE, clear
4479                  * PG_PROMOTED.  The impending invalidation will remove any
4480                  * lingering 4KB page mappings from the TLB.
4481                  */
4482                 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
4483                         goto retry;
4484                 if ((oldpde & PG_G) != 0)
4485                         pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4486                 else
4487                         anychanged = TRUE;
4488         }
4489         return (anychanged);
4490 }
4491
4492 /*
4493  *      Set the physical protection on the
4494  *      specified range of this map as requested.
      *
      *      VM_PROT_NONE is handled by removing the range with
      *      pmap_remove().  A request that includes both write and
      *      execute permission returns without touching any entry,
      *      since this function only ever takes permissions away.
4495  */
4496 void
4497 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4498 {
4499         vm_offset_t va_next;
4500         pml4_entry_t *pml4e;
4501         pdp_entry_t *pdpe;
4502         pd_entry_t ptpaddr, *pde;
4503         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
4504         boolean_t anychanged;
4505
4506         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4507         if (prot == VM_PROT_NONE) {
4508                 pmap_remove(pmap, sva, eva);
4509                 return;
4510         }
4511
4512         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4513             (VM_PROT_WRITE|VM_PROT_EXECUTE))
4514                 return;
4515
4516         PG_G = pmap_global_bit(pmap);
4517         PG_M = pmap_modified_bit(pmap);
4518         PG_V = pmap_valid_bit(pmap);
4519         PG_RW = pmap_rw_bit(pmap);
4520         anychanged = FALSE;
4521
4522         /*
4523          * Although this function delays and batches the invalidation
4524          * of stale TLB entries, it does not need to call
4525          * pmap_delayed_invl_started() and
4526          * pmap_delayed_invl_finished(), because it does not
4527          * ordinarily destroy mappings.  Stale TLB entries from
4528          * protection-only changes need only be invalidated before the
4529          * pmap lock is released, because protection-only changes do
4530          * not destroy PV entries.  Even operations that iterate over
4531          * a physical page's PV list of mappings, like
4532          * pmap_remove_write(), acquire the pmap lock for each
4533          * mapping.  Consequently, for protection-only changes, the
4534          * pmap lock suffices to synchronize both page table and TLB
4535          * updates.
4536          *
4537          * This function only destroys a mapping if pmap_demote_pde()
4538          * fails.  In that case, stale TLB entries are immediately
4539          * invalidated.
4540          */
4541
4542         PMAP_LOCK(pmap);
4543         for (; sva < eva; sva = va_next) {
4544
                     /*
                      * Skip a whole PML4 or PDP slot when its entry is not
                      * valid.  "va_next < sva" means that the address
                      * arithmetic wrapped around, so the scan ends at "eva".
                      */
4545                 pml4e = pmap_pml4e(pmap, sva);
4546                 if ((*pml4e & PG_V) == 0) {
4547                         va_next = (sva + NBPML4) & ~PML4MASK;
4548                         if (va_next < sva)
4549                                 va_next = eva;
4550                         continue;
4551                 }
4552
4553                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4554                 if ((*pdpe & PG_V) == 0) {
4555                         va_next = (sva + NBPDP) & ~PDPMASK;
4556                         if (va_next < sva)
4557                                 va_next = eva;
4558                         continue;
4559                 }
4560
4561                 va_next = (sva + NBPDR) & ~PDRMASK;
4562                 if (va_next < sva)
4563                         va_next = eva;
4564
4565                 pde = pmap_pdpe_to_pde(pdpe, sva);
4566                 ptpaddr = *pde;
4567
4568                 /*
4569                  * Weed out invalid mappings.
4570                  */
4571                 if (ptpaddr == 0)
4572                         continue;
4573
4574                 /*
4575                  * Check for large page.
4576                  */
4577                 if ((ptpaddr & PG_PS) != 0) {
4578                         /*
4579                          * Are we protecting the entire large page?  If not,
4580                          * demote the mapping and fall through.
4581                          */
4582                         if (sva + NBPDR == va_next && eva >= va_next) {
4583                                 /*
4584                                  * The TLB entry for a PG_G mapping is
4585                                  * invalidated by pmap_protect_pde().
4586                                  */
4587                                 if (pmap_protect_pde(pmap, pde, sva, prot))
4588                                         anychanged = TRUE;
4589                                 continue;
4590                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
4591                                 /*
4592                                  * The large page mapping was destroyed.
4593                                  */
4594                                 continue;
4595                         }
4596                 }
4597
                     /* Do not scan past the end of the requested range. */
4598                 if (va_next > eva)
4599                         va_next = eva;
4600
4601                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4602                     sva += PAGE_SIZE) {
4603                         pt_entry_t obits, pbits;
4604                         vm_page_t m;
4605
4606 retry:
                             /*
                              * Re-read the PTE on every attempt; the
                              * compare-and-set below fails if the entry
                              * changed after this read.
                              */
4607                         obits = pbits = *pte;
4608                         if ((pbits & PG_V) == 0)
4609                                 continue;
4610
4611                         if ((prot & VM_PROT_WRITE) == 0) {
                                     /*
                                      * Record the modification on the
                                      * vm_page before PG_M is discarded.
                                      */
4612                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4613                                     (PG_MANAGED | PG_M | PG_RW)) {
4614                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4615                                         vm_page_dirty(m);
4616                                 }
4617                                 pbits &= ~(PG_RW | PG_M);
4618                         }
4619                         if ((prot & VM_PROT_EXECUTE) == 0)
4620                                 pbits |= pg_nx;
4621
                             /*
                              * A changed PG_G mapping is invalidated right
                              * away; all other changes are covered by the
                              * single pmap_invalidate_all() after the loop.
                              */
4622                         if (pbits != obits) {
4623                                 if (!atomic_cmpset_long(pte, obits, pbits))
4624                                         goto retry;
4625                                 if (obits & PG_G)
4626                                         pmap_invalidate_page(pmap, sva);
4627                                 else
4628                                         anychanged = TRUE;
4629                         }
4630                 }
4631         }
4632         if (anychanged)
4633                 pmap_invalidate_all(pmap);
4634         PMAP_UNLOCK(pmap);
4635 }
4636
4637 #if VM_NRESERVLEVEL > 0
4638 /*
4639  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4640  * single page table page (PTP) to a single 2MB page mapping.  For promotion
4641  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4642  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4643  * identical characteristics.
4644  */
4645 static void
4646 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4647     struct rwlock **lockp)
4648 {
4649         pd_entry_t newpde;
4650         pt_entry_t *firstpte, oldpte, pa, *pte;
4651         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4652         vm_page_t mpte;
4653         int PG_PTE_CACHE;
4654
4655         PG_A = pmap_accessed_bit(pmap);
4656         PG_G = pmap_global_bit(pmap);
4657         PG_M = pmap_modified_bit(pmap);
4658         PG_V = pmap_valid_bit(pmap);
4659         PG_RW = pmap_rw_bit(pmap);
4660         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4661
4662         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4663
4664         /*
4665          * Examine the first PTE in the specified PTP.  Abort if this PTE is
4666          * either invalid, unused, or does not map the first 4KB physical page
4667          * within a 2MB page.
4668          */
4669         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
4670 setpde:
4671         newpde = *firstpte;
4672         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4673                 atomic_add_long(&pmap_pde_p_failures, 1);
4674                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4675                     " in pmap %p", va, pmap);
4676                 return;
4677         }
4678         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4679                 /*
4680                  * When PG_M is already clear, PG_RW can be cleared without
4681                  * a TLB invalidation.
4682                  */
4683                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4684                         goto setpde;
4685                 newpde &= ~PG_RW;
4686         }
4687
4688         /*
4689          * Examine each of the other PTEs in the specified PTP.  Abort if this
4690          * PTE maps an unexpected 4KB physical page or does not have identical
4691          * characteristics to the first PTE.
4692          */
4693         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4694         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
4695 setpte:
4696                 oldpte = *pte;
4697                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4698                         atomic_add_long(&pmap_pde_p_failures, 1);
4699                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4700                             " in pmap %p", va, pmap);
4701                         return;
4702                 }
4703                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4704                         /*
4705                          * When PG_M is already clear, PG_RW can be cleared
4706                          * without a TLB invalidation.
4707                          */
4708                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4709                                 goto setpte;
4710                         oldpte &= ~PG_RW;
4711                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4712                             " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4713                             (va & ~PDRMASK), pmap);
4714                 }
4715                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4716                         atomic_add_long(&pmap_pde_p_failures, 1);
4717                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4718                             " in pmap %p", va, pmap);
4719                         return;
4720                 }
4721                 pa -= PAGE_SIZE;
4722         }
4723
4724         /*
4725          * Save the page table page in its current state until the PDE
4726          * mapping the superpage is demoted by pmap_demote_pde() or
4727          * destroyed by pmap_remove_pde().
4728          */
4729         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4730         KASSERT(mpte >= vm_page_array &&
4731             mpte < &vm_page_array[vm_page_array_size],
4732             ("pmap_promote_pde: page table page is out of range"));
4733         KASSERT(mpte->pindex == pmap_pde_pindex(va),
4734             ("pmap_promote_pde: page table page's pindex is wrong"));
4735         if (pmap_insert_pt_page(pmap, mpte)) {
4736                 atomic_add_long(&pmap_pde_p_failures, 1);
4737                 CTR2(KTR_PMAP,
4738                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4739                     pmap);
4740                 return;
4741         }
4742
4743         /*
4744          * Promote the pv entries.
4745          */
4746         if ((newpde & PG_MANAGED) != 0)
4747                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4748
4749         /*
4750          * Propagate the PAT index to its proper position.
4751          */
4752         newpde = pmap_swap_pat(pmap, newpde);
4753
4754         /*
4755          * Map the superpage.
4756          */
4757         if (workaround_erratum383)
4758                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4759         else
4760                 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
4761
4762         atomic_add_long(&pmap_pde_promotions, 1);
4763         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4764             " in pmap %p", va, pmap);
4765 }
4766 #endif /* VM_NRESERVLEVEL > 0 */
4767
4768 /*
4769  *      Insert the given physical page (p) at
4770  *      the specified virtual address (v) in the
4771  *      target physical map with the protection requested.
4772  *
4773  *      If specified, the page will be wired down, meaning
4774  *      that the related pte can not be reclaimed.
4775  *
4776  *      NB:  This is the only routine which MAY NOT lazy-evaluate
4777  *      or lose information.  That is, this routine must actually
4778  *      insert this page into the given map NOW.
4779  *
4780  *      When destroying both a page table and PV entry, this function
4781  *      performs the TLB invalidation before releasing the PV list
4782  *      lock, so we do not need pmap_delayed_invl_page() calls here.
4783  */
4784 int
4785 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4786     u_int flags, int8_t psind)
4787 {
4788         struct rwlock *lock;
4789         pd_entry_t *pde;
4790         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4791         pt_entry_t newpte, origpte;
4792         pv_entry_t pv;
4793         vm_paddr_t opa, pa;
4794         vm_page_t mpte, om;
4795         int rv;
4796         boolean_t nosleep;
4797
4798         PG_A = pmap_accessed_bit(pmap);
4799         PG_G = pmap_global_bit(pmap);
4800         PG_M = pmap_modified_bit(pmap);
4801         PG_V = pmap_valid_bit(pmap);
4802         PG_RW = pmap_rw_bit(pmap);
4803
4804         va = trunc_page(va);
4805         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4806         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4807             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4808             va));
4809         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4810             va >= kmi.clean_eva,
4811             ("pmap_enter: managed mapping within the clean submap"));
4812         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4813                 VM_OBJECT_ASSERT_LOCKED(m->object);
4814         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
4815             ("pmap_enter: flags %u has reserved bits set", flags));
4816         pa = VM_PAGE_TO_PHYS(m);
4817         newpte = (pt_entry_t)(pa | PG_A | PG_V);
4818         if ((flags & VM_PROT_WRITE) != 0)
4819                 newpte |= PG_M;
4820         if ((prot & VM_PROT_WRITE) != 0)
4821                 newpte |= PG_RW;
4822         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4823             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4824         if ((prot & VM_PROT_EXECUTE) == 0)
4825                 newpte |= pg_nx;
4826         if ((flags & PMAP_ENTER_WIRED) != 0)
4827                 newpte |= PG_W;
4828         if (va < VM_MAXUSER_ADDRESS)
4829                 newpte |= PG_U;
4830         if (pmap == kernel_pmap)
4831                 newpte |= PG_G;
4832         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
4833
4834         /*
4835          * Set modified bit gratuitously for writeable mappings if
4836          * the page is unmanaged. We do not want to take a fault
4837          * to do the dirty bit accounting for these mappings.
4838          */
4839         if ((m->oflags & VPO_UNMANAGED) != 0) {
4840                 if ((newpte & PG_RW) != 0)
4841                         newpte |= PG_M;
4842         } else
4843                 newpte |= PG_MANAGED;
4844
4845         lock = NULL;
4846         PMAP_LOCK(pmap);
4847         if (psind == 1) {
4848                 /* Assert the required virtual and physical alignment. */
4849                 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
4850                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
4851                 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
4852                 goto out;
4853         }
4854         mpte = NULL;
4855
4856         /*
4857          * In the case that a page table page is not
4858          * resident, we are creating it here.
4859          */
4860 retry:
4861         pde = pmap_pde(pmap, va);
4862         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4863             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4864                 pte = pmap_pde_to_pte(pde, va);
4865                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4866                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4867                         mpte->wire_count++;
4868                 }
4869         } else if (va < VM_MAXUSER_ADDRESS) {
4870                 /*
4871                  * Here if the pte page isn't mapped, or if it has been
4872                  * deallocated.
4873                  */
4874                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4875                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4876                     nosleep ? NULL : &lock);
4877                 if (mpte == NULL && nosleep) {
4878                         rv = KERN_RESOURCE_SHORTAGE;
4879                         goto out;
4880                 }
4881                 goto retry;
4882         } else
4883                 panic("pmap_enter: invalid page directory va=%#lx", va);
4884
4885         origpte = *pte;
4886         pv = NULL;
4887
4888         /*
4889          * Is the specified virtual address already mapped?
4890          */
4891         if ((origpte & PG_V) != 0) {
4892                 /*
4893                  * Wiring change, just update stats. We don't worry about
4894                  * wiring PT pages as they remain resident as long as there
4895                  * are valid mappings in them. Hence, if a user page is wired,
4896                  * the PT page will be also.
4897                  */
4898                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
4899                         pmap->pm_stats.wired_count++;
4900                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
4901                         pmap->pm_stats.wired_count--;
4902
4903                 /*
4904                  * Remove the extra PT page reference.
4905                  */
4906                 if (mpte != NULL) {
4907                         mpte->wire_count--;
4908                         KASSERT(mpte->wire_count > 0,
4909                             ("pmap_enter: missing reference to page table page,"
4910                              " va: 0x%lx", va));
4911                 }
4912
4913                 /*
4914                  * Has the physical page changed?
4915                  */
4916                 opa = origpte & PG_FRAME;
4917                 if (opa == pa) {
4918                         /*
4919                          * No, might be a protection or wiring change.
4920                          */
4921                         if ((origpte & PG_MANAGED) != 0 &&
4922                             (newpte & PG_RW) != 0)
4923                                 vm_page_aflag_set(m, PGA_WRITEABLE);
4924                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
4925                                 goto unchanged;
4926                         goto validate;
4927                 }
4928
4929                 /*
4930                  * The physical page has changed.  Temporarily invalidate
4931                  * the mapping.  This ensures that all threads sharing the
4932                  * pmap keep a consistent view of the mapping, which is
4933                  * necessary for the correct handling of COW faults.  It
4934                  * also permits reuse of the old mapping's PV entry,
4935                  * avoiding an allocation.
4936                  *
4937                  * For consistency, handle unmanaged mappings the same way.
4938                  */
4939                 origpte = pte_load_clear(pte);
4940                 KASSERT((origpte & PG_FRAME) == opa,
4941                     ("pmap_enter: unexpected pa update for %#lx", va));
4942                 if ((origpte & PG_MANAGED) != 0) {
4943                         om = PHYS_TO_VM_PAGE(opa);
4944
4945                         /*
4946                          * The pmap lock is sufficient to synchronize with
4947                          * concurrent calls to pmap_page_test_mappings() and
4948                          * pmap_ts_referenced().
4949                          */
4950                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4951                                 vm_page_dirty(om);
4952                         if ((origpte & PG_A) != 0)
4953                                 vm_page_aflag_set(om, PGA_REFERENCED);
4954                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4955                         pv = pmap_pvh_remove(&om->md, pmap, va);
4956                         if ((newpte & PG_MANAGED) == 0)
4957                                 free_pv_entry(pmap, pv);
4958                         if ((om->aflags & PGA_WRITEABLE) != 0 &&
4959                             TAILQ_EMPTY(&om->md.pv_list) &&
4960                             ((om->flags & PG_FICTITIOUS) != 0 ||
4961                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
4962                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
4963                 }
4964                 if ((origpte & PG_A) != 0)
4965                         pmap_invalidate_page(pmap, va);
4966                 origpte = 0;
4967         } else {
4968                 /*
4969                  * Increment the counters.
4970                  */
4971                 if ((newpte & PG_W) != 0)
4972                         pmap->pm_stats.wired_count++;
4973                 pmap_resident_count_inc(pmap, 1);
4974         }
4975
4976         /*
4977          * Enter on the PV list if part of our managed memory.
4978          */
4979         if ((newpte & PG_MANAGED) != 0) {
4980                 if (pv == NULL) {
4981                         pv = get_pv_entry(pmap, &lock);
4982                         pv->pv_va = va;
4983                 }
4984                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4985                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4986                 m->md.pv_gen++;
4987                 if ((newpte & PG_RW) != 0)
4988                         vm_page_aflag_set(m, PGA_WRITEABLE);
4989         }
4990
4991         /*
4992          * Update the PTE.
4993          */
4994         if ((origpte & PG_V) != 0) {
4995 validate:
4996                 origpte = pte_load_store(pte, newpte);
4997                 KASSERT((origpte & PG_FRAME) == pa,
4998                     ("pmap_enter: unexpected pa update for %#lx", va));
4999                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
5000                     (PG_M | PG_RW)) {
5001                         if ((origpte & PG_MANAGED) != 0)
5002                                 vm_page_dirty(m);
5003
5004                         /*
5005                          * Although the PTE may still have PG_RW set, TLB
5006                          * invalidation may nonetheless be required because
5007                          * the PTE no longer has PG_M set.
5008                          */
5009                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
5010                         /*
5011                          * This PTE change does not require TLB invalidation.
5012                          */
5013                         goto unchanged;
5014                 }
5015                 if ((origpte & PG_A) != 0)
5016                         pmap_invalidate_page(pmap, va);
5017         } else
5018                 pte_store(pte, newpte);
5019
5020 unchanged:
5021
5022 #if VM_NRESERVLEVEL > 0
5023         /*
5024          * If both the page table page and the reservation are fully
5025          * populated, then attempt promotion.
5026          */
5027         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
5028             pmap_ps_enabled(pmap) &&
5029             (m->flags & PG_FICTITIOUS) == 0 &&
5030             vm_reserv_level_iffullpop(m) == 0)
5031                 pmap_promote_pde(pmap, pde, va, &lock);
5032 #endif
5033
5034         rv = KERN_SUCCESS;
5035 out:
5036         if (lock != NULL)
5037                 rw_wunlock(lock);
5038         PMAP_UNLOCK(pmap);
5039         return (rv);
5040 }
5041
5042 /*
5043  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
5044  * if successful.  Returns false if (1) a page table page cannot be allocated
5045  * without sleeping, (2) a mapping already exists at the specified virtual
5046  * address, or (3) a PV entry cannot be allocated without reclaiming another
5047  * PV entry.
5048  */
5049 static bool
5050 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5051     struct rwlock **lockp)
5052 {
5053         pd_entry_t newpde;
5054         pt_entry_t PG_V;
5055
5056         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5057         PG_V = pmap_valid_bit(pmap);
5058         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
5059             PG_PS | PG_V;
5060         if ((m->oflags & VPO_UNMANAGED) == 0)
5061                 newpde |= PG_MANAGED;
5062         if ((prot & VM_PROT_EXECUTE) == 0)
5063                 newpde |= pg_nx;
5064         if (va < VM_MAXUSER_ADDRESS)
5065                 newpde |= PG_U;
5066         return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
5067             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
5068             KERN_SUCCESS);
5069 }
5070
5071 /*
5072  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
5073  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
5074  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
5075  * a mapping already exists at the specified virtual address.  Returns
5076  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
5077  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
5078  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
5079  *
5080  * The parameter "m" is only used when creating a managed, writeable mapping.
5081  */
5082 static int
5083 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
5084     vm_page_t m, struct rwlock **lockp)
5085 {
5086         struct spglist free;
5087         pd_entry_t oldpde, *pde;
5088         pt_entry_t PG_G, PG_RW, PG_V;
5089         vm_page_t mt, pdpg;
5090
5091         PG_G = pmap_global_bit(pmap);
5092         PG_RW = pmap_rw_bit(pmap);
5093         KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
5094             ("pmap_enter_pde: newpde is missing PG_M"));
5095         PG_V = pmap_valid_bit(pmap);
5096         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5097
5098         if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
5099             NULL : lockp)) == NULL) {
5100                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5101                     " in pmap %p", va, pmap);
5102                 return (KERN_RESOURCE_SHORTAGE);
5103         }
5104         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5105         pde = &pde[pmap_pde_index(va)];
5106         oldpde = *pde;
5107         if ((oldpde & PG_V) != 0) {
5108                 KASSERT(pdpg->wire_count > 1,
5109                     ("pmap_enter_pde: pdpg's wire count is too low"));
5110                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5111                         pdpg->wire_count--;
5112                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5113                             " in pmap %p", va, pmap);
5114                         return (KERN_FAILURE);
5115                 }
5116                 /* Break the existing mapping(s). */
5117                 SLIST_INIT(&free);
5118                 if ((oldpde & PG_PS) != 0) {
5119                         /*
5120                          * The reference to the PD page that was acquired by
5121                          * pmap_allocpde() ensures that it won't be freed.
5122                          * However, if the PDE resulted from a promotion, then
5123                          * a reserved PT page could be freed.
5124                          */
5125                         (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
5126                         if ((oldpde & PG_G) == 0)
5127                                 pmap_invalidate_pde_page(pmap, va, oldpde);
5128                 } else {
5129                         pmap_delayed_invl_started();
5130                         if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
5131                             lockp))
5132                                pmap_invalidate_all(pmap);
5133                         pmap_delayed_invl_finished();
5134                 }
5135                 vm_page_free_pages_toq(&free, true);
5136                 if (va >= VM_MAXUSER_ADDRESS) {
5137                         mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5138                         if (pmap_insert_pt_page(pmap, mt)) {
5139                                 /*
5140                                  * XXX Currently, this can't happen because
5141                                  * we do not perform pmap_enter(psind == 1)
5142                                  * on the kernel pmap.
5143                                  */
5144                                 panic("pmap_enter_pde: trie insert failed");
5145                         }
5146                 } else
5147                         KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
5148                             pde));
5149         }
5150         if ((newpde & PG_MANAGED) != 0) {
5151                 /*
5152                  * Abort this mapping if its PV entry could not be created.
5153                  */
5154                 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
5155                         SLIST_INIT(&free);
5156                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
5157                                 /*
5158                                  * Although "va" is not mapped, paging-
5159                                  * structure caches could nonetheless have
5160                                  * entries that refer to the freed page table
5161                                  * pages.  Invalidate those entries.
5162                                  */
5163                                 pmap_invalidate_page(pmap, va);
5164                                 vm_page_free_pages_toq(&free, true);
5165                         }
5166                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5167                             " in pmap %p", va, pmap);
5168                         return (KERN_RESOURCE_SHORTAGE);
5169                 }
5170                 if ((newpde & PG_RW) != 0) {
5171                         for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5172                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
5173                 }
5174         }
5175
5176         /*
5177          * Increment counters.
5178          */
5179         if ((newpde & PG_W) != 0)
5180                 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
5181         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5182
5183         /*
5184          * Map the superpage.  (This is not a promoted mapping; there will not
5185          * be any lingering 4KB page mappings in the TLB.)
5186          */
5187         pde_store(pde, newpde);
5188
5189         atomic_add_long(&pmap_pde_mappings, 1);
5190         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
5191             " in pmap %p", va, pmap);
5192         return (KERN_SUCCESS);
5193 }
5194
5195 /*
5196  * Maps a sequence of resident pages belonging to the same object.
5197  * The sequence begins with the given page m_start.  This page is
5198  * mapped at the given virtual address start.  Each subsequent page is
5199  * mapped at a virtual address that is offset from start by the same
5200  * amount as the page is offset from m_start within the object.  The
5201  * last page in the sequence is the page with the largest offset from
5202  * m_start that can be mapped at a virtual address less than the given
5203  * virtual address end.  Not every virtual page between start and end
5204  * is mapped; only those for which a resident page exists with the
5205  * corresponding offset from m_start are mapped.
5206  */
5207 void
5208 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5209     vm_page_t m_start, vm_prot_t prot)
5210 {
5211         struct rwlock *lock;
5212         vm_offset_t va;
5213         vm_page_t m, mpte;
5214         vm_pindex_t diff, psize;
5215
5216         VM_OBJECT_ASSERT_LOCKED(m_start->object);
5217
5218         psize = atop(end - start);
5219         mpte = NULL;
5220         m = m_start;
5221         lock = NULL;
5222         PMAP_LOCK(pmap);
5223         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5224                 va = start + ptoa(diff);
5225                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
5226                     m->psind == 1 && pmap_ps_enabled(pmap) &&
5227                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
5228                         m = &m[NBPDR / PAGE_SIZE - 1];
5229                 else
5230                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
5231                             mpte, &lock);
5232                 m = TAILQ_NEXT(m, listq);
5233         }
5234         if (lock != NULL)
5235                 rw_wunlock(lock);
5236         PMAP_UNLOCK(pmap);
5237 }
5238
5239 /*
5240  * this code makes some *MAJOR* assumptions:
5241  * 1. Current pmap & pmap exists.
5242  * 2. Not wired.
5243  * 3. Read access.
5244  * 4. No page table pages.
5245  * but is *MUCH* faster than pmap_enter...
5246  */
5247
5248 void
5249 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5250 {
5251         struct rwlock *lock;
5252
5253         lock = NULL;
5254         PMAP_LOCK(pmap);
5255         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5256         if (lock != NULL)
5257                 rw_wunlock(lock);
5258         PMAP_UNLOCK(pmap);
5259 }
5260
5261 static vm_page_t
5262 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5263     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5264 {
5265         struct spglist free;
5266         pt_entry_t *pte, PG_V;
5267         vm_paddr_t pa;
5268
5269         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
5270             (m->oflags & VPO_UNMANAGED) != 0,
5271             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5272         PG_V = pmap_valid_bit(pmap);
5273         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5274
5275         /*
5276          * In the case that a page table page is not
5277          * resident, we are creating it here.
5278          */
5279         if (va < VM_MAXUSER_ADDRESS) {
5280                 vm_pindex_t ptepindex;
5281                 pd_entry_t *ptepa;
5282
5283                 /*
5284                  * Calculate pagetable page index
5285                  */
5286                 ptepindex = pmap_pde_pindex(va);
5287                 if (mpte && (mpte->pindex == ptepindex)) {
5288                         mpte->wire_count++;
5289                 } else {
5290                         /*
5291                          * Get the page directory entry
5292                          */
5293                         ptepa = pmap_pde(pmap, va);
5294
5295                         /*
5296                          * If the page table page is mapped, we just increment
5297                          * the hold count, and activate it.  Otherwise, we
5298                          * attempt to allocate a page table page.  If this
5299                          * attempt fails, we don't retry.  Instead, we give up.
5300                          */
5301                         if (ptepa && (*ptepa & PG_V) != 0) {
5302                                 if (*ptepa & PG_PS)
5303                                         return (NULL);
5304                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
5305                                 mpte->wire_count++;
5306                         } else {
5307                                 /*
5308                                  * Pass NULL instead of the PV list lock
5309                                  * pointer, because we don't intend to sleep.
5310                                  */
5311                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
5312                                 if (mpte == NULL)
5313                                         return (mpte);
5314                         }
5315                 }
5316                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5317                 pte = &pte[pmap_pte_index(va)];
5318         } else {
5319                 mpte = NULL;
5320                 pte = vtopte(va);
5321         }
5322         if (*pte) {
5323                 if (mpte != NULL) {
5324                         mpte->wire_count--;
5325                         mpte = NULL;
5326                 }
5327                 return (mpte);
5328         }
5329
5330         /*
5331          * Enter on the PV list if part of our managed memory.
5332          */
5333         if ((m->oflags & VPO_UNMANAGED) == 0 &&
5334             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5335                 if (mpte != NULL) {
5336                         SLIST_INIT(&free);
5337                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
5338                                 /*
5339                                  * Although "va" is not mapped, paging-
5340                                  * structure caches could nonetheless have
5341                                  * entries that refer to the freed page table
5342                                  * pages.  Invalidate those entries.
5343                                  */
5344                                 pmap_invalidate_page(pmap, va);
5345                                 vm_page_free_pages_toq(&free, true);
5346                         }
5347                         mpte = NULL;
5348                 }
5349                 return (mpte);
5350         }
5351
5352         /*
5353          * Increment counters
5354          */
5355         pmap_resident_count_inc(pmap, 1);
5356
5357         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
5358         if ((prot & VM_PROT_EXECUTE) == 0)
5359                 pa |= pg_nx;
5360
5361         /*
5362          * Now validate mapping with RO protection
5363          */
5364         if ((m->oflags & VPO_UNMANAGED) != 0)
5365                 pte_store(pte, pa | PG_V | PG_U);
5366         else
5367                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
5368         return (mpte);
5369 }
5370
5371 /*
5372  * Make a temporary mapping for a physical address.  This is only intended
5373  * to be used for panic dumps.
5374  */
5375 void *
5376 pmap_kenter_temporary(vm_paddr_t pa, int i)
5377 {
5378         vm_offset_t va;
5379
5380         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
5381         pmap_kenter(va, pa);
5382         invlpg(va);
5383         return ((void *)crashdumpmap);
5384 }
5385
5386 /*
5387  * This code maps large physical mmap regions into the
5388  * processor address space.  Note that some shortcuts
5389  * are taken, but the code works.
5390  */
5391 void
5392 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5393     vm_pindex_t pindex, vm_size_t size)
5394 {
5395         pd_entry_t *pde;
5396         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5397         vm_paddr_t pa, ptepa;
5398         vm_page_t p, pdpg;
5399         int pat_mode;
5400
5401         PG_A = pmap_accessed_bit(pmap);
5402         PG_M = pmap_modified_bit(pmap);
5403         PG_V = pmap_valid_bit(pmap);
5404         PG_RW = pmap_rw_bit(pmap);
5405
5406         VM_OBJECT_ASSERT_WLOCKED(object);
5407         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5408             ("pmap_object_init_pt: non-device object"));
5409         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
5410                 if (!pmap_ps_enabled(pmap))
5411                         return;
5412                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
5413                         return;
5414                 p = vm_page_lookup(object, pindex);
5415                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
5416                     ("pmap_object_init_pt: invalid page %p", p));
5417                 pat_mode = p->md.pat_mode;
5418
5419                 /*
5420                  * Abort the mapping if the first page is not physically
5421                  * aligned to a 2MB page boundary.
5422                  */
5423                 ptepa = VM_PAGE_TO_PHYS(p);
5424                 if (ptepa & (NBPDR - 1))
5425                         return;
5426
5427                 /*
5428                  * Skip the first page.  Abort the mapping if the rest of
5429                  * the pages are not physically contiguous or have differing
5430                  * memory attributes.
5431                  */
5432                 p = TAILQ_NEXT(p, listq);
5433                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
5434                     pa += PAGE_SIZE) {
5435                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
5436                             ("pmap_object_init_pt: invalid page %p", p));
5437                         if (pa != VM_PAGE_TO_PHYS(p) ||
5438                             pat_mode != p->md.pat_mode)
5439                                 return;
5440                         p = TAILQ_NEXT(p, listq);
5441                 }
5442
5443                 /*
5444                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
5445                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
5446                  * will not affect the termination of this loop.
5447                  */
5448                 PMAP_LOCK(pmap);
5449                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
5450                     pa < ptepa + size; pa += NBPDR) {
5451                         pdpg = pmap_allocpde(pmap, addr, NULL);
5452                         if (pdpg == NULL) {
5453                                 /*
5454                                  * The creation of mappings below is only an
5455                                  * optimization.  If a page directory page
5456                                  * cannot be allocated without blocking,
5457                                  * continue on to the next mapping rather than
5458                                  * blocking.
5459                                  */
5460                                 addr += NBPDR;
5461                                 continue;
5462                         }
5463                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5464                         pde = &pde[pmap_pde_index(addr)];
5465                         if ((*pde & PG_V) == 0) {
5466                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
5467                                     PG_U | PG_RW | PG_V);
5468                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5469                                 atomic_add_long(&pmap_pde_mappings, 1);
5470                         } else {
5471                                 /* Continue on if the PDE is already valid. */
5472                                 pdpg->wire_count--;
5473                                 KASSERT(pdpg->wire_count > 0,
5474                                     ("pmap_object_init_pt: missing reference "
5475                                     "to page directory page, va: 0x%lx", addr));
5476                         }
5477                         addr += NBPDR;
5478                 }
5479                 PMAP_UNLOCK(pmap);
5480         }
5481 }
5482
5483 /*
5484  *      Clear the wired attribute from the mappings for the specified range of
5485  *      addresses in the given pmap.  Every valid mapping within that range
5486  *      must have the wired attribute set.  In contrast, invalid mappings
5487  *      cannot have the wired attribute set, so they are ignored.
5488  *
5489  *      The wired attribute of the page table entry is not a hardware
5490  *      feature, so there is no need to invalidate any TLB entries.
5491  *      Since pmap_demote_pde() for the wired entry must never fail,
5492  *      pmap_delayed_invl_started()/finished() calls around the
5493  *      function are not needed.
5494  */
5495 void
5496 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5497 {
5498         vm_offset_t va_next;
5499         pml4_entry_t *pml4e;
5500         pdp_entry_t *pdpe;
5501         pd_entry_t *pde;
5502         pt_entry_t *pte, PG_V;
5503
5504         PG_V = pmap_valid_bit(pmap);
5505         PMAP_LOCK(pmap);
5506         for (; sva < eva; sva = va_next) {
5507                 pml4e = pmap_pml4e(pmap, sva);
5508                 if ((*pml4e & PG_V) == 0) {
5509                         va_next = (sva + NBPML4) & ~PML4MASK;
5510                         if (va_next < sva)
5511                                 va_next = eva;
5512                         continue;
5513                 }
5514                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5515                 if ((*pdpe & PG_V) == 0) {
5516                         va_next = (sva + NBPDP) & ~PDPMASK;
5517                         if (va_next < sva)
5518                                 va_next = eva;
5519                         continue;
5520                 }
5521                 va_next = (sva + NBPDR) & ~PDRMASK;
5522                 if (va_next < sva)
5523                         va_next = eva;
5524                 pde = pmap_pdpe_to_pde(pdpe, sva);
5525                 if ((*pde & PG_V) == 0)
5526                         continue;
5527                 if ((*pde & PG_PS) != 0) {
5528                         if ((*pde & PG_W) == 0)
5529                                 panic("pmap_unwire: pde %#jx is missing PG_W",
5530                                     (uintmax_t)*pde);
5531
5532                         /*
5533                          * Are we unwiring the entire large page?  If not,
5534                          * demote the mapping and fall through.
5535                          */
5536                         if (sva + NBPDR == va_next && eva >= va_next) {
5537                                 atomic_clear_long(pde, PG_W);
5538                                 pmap->pm_stats.wired_count -= NBPDR /
5539                                     PAGE_SIZE;
5540                                 continue;
5541                         } else if (!pmap_demote_pde(pmap, pde, sva))
5542                                 panic("pmap_unwire: demotion failed");
5543                 }
5544                 if (va_next > eva)
5545                         va_next = eva;
5546                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5547                     sva += PAGE_SIZE) {
5548                         if ((*pte & PG_V) == 0)
5549                                 continue;
5550                         if ((*pte & PG_W) == 0)
5551                                 panic("pmap_unwire: pte %#jx is missing PG_W",
5552                                     (uintmax_t)*pte);
5553
5554                         /*
5555                          * PG_W must be cleared atomically.  Although the pmap
5556                          * lock synchronizes access to PG_W, another processor
5557                          * could be setting PG_M and/or PG_A concurrently.
5558                          */
5559                         atomic_clear_long(pte, PG_W);
5560                         pmap->pm_stats.wired_count--;
5561                 }
5562         }
5563         PMAP_UNLOCK(pmap);
5564 }
5565
5566 /*
5567  *      Copy the range specified by src_addr/len
5568  *      from the source map to the range dst_addr/len
5569  *      in the destination map.
5570  *
5571  *      This routine is only advisory and need not do anything.
5572  */
5573
5574 void
5575 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5576     vm_offset_t src_addr)
5577 {
5578         struct rwlock *lock;
5579         struct spglist free;
5580         vm_offset_t addr;
5581         vm_offset_t end_addr = src_addr + len;
5582         vm_offset_t va_next;
5583         vm_page_t dst_pdpg, dstmpte, srcmpte;
5584         pt_entry_t PG_A, PG_M, PG_V;
5585
5586         if (dst_addr != src_addr)
5587                 return;
5588
5589         if (dst_pmap->pm_type != src_pmap->pm_type)
5590                 return;
5591
5592         /*
5593          * EPT page table entries that require emulation of A/D bits are
5594          * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
5595          * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
5596          * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
5597          * implementations flag an EPT misconfiguration for exec-only
5598          * mappings we skip this function entirely for emulated pmaps.
5599          */
5600         if (pmap_emulate_ad_bits(dst_pmap))
5601                 return;
5602
5603         lock = NULL;
5604         if (dst_pmap < src_pmap) {
5605                 PMAP_LOCK(dst_pmap);
5606                 PMAP_LOCK(src_pmap);
5607         } else {
5608                 PMAP_LOCK(src_pmap);
5609                 PMAP_LOCK(dst_pmap);
5610         }
5611
5612         PG_A = pmap_accessed_bit(dst_pmap);
5613         PG_M = pmap_modified_bit(dst_pmap);
5614         PG_V = pmap_valid_bit(dst_pmap);
5615
5616         for (addr = src_addr; addr < end_addr; addr = va_next) {
5617                 pt_entry_t *src_pte, *dst_pte;
5618                 pml4_entry_t *pml4e;
5619                 pdp_entry_t *pdpe;
5620                 pd_entry_t srcptepaddr, *pde;
5621
5622                 KASSERT(addr < UPT_MIN_ADDRESS,
5623                     ("pmap_copy: invalid to pmap_copy page tables"));
5624
5625                 pml4e = pmap_pml4e(src_pmap, addr);
5626                 if ((*pml4e & PG_V) == 0) {
5627                         va_next = (addr + NBPML4) & ~PML4MASK;
5628                         if (va_next < addr)
5629                                 va_next = end_addr;
5630                         continue;
5631                 }
5632
5633                 pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
5634                 if ((*pdpe & PG_V) == 0) {
5635                         va_next = (addr + NBPDP) & ~PDPMASK;
5636                         if (va_next < addr)
5637                                 va_next = end_addr;
5638                         continue;
5639                 }
5640
5641                 va_next = (addr + NBPDR) & ~PDRMASK;
5642                 if (va_next < addr)
5643                         va_next = end_addr;
5644
5645                 pde = pmap_pdpe_to_pde(pdpe, addr);
5646                 srcptepaddr = *pde;
5647                 if (srcptepaddr == 0)
5648                         continue;
5649
5650                 if (srcptepaddr & PG_PS) {
5651                         if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
5652                                 continue;
5653                         dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
5654                         if (dst_pdpg == NULL)
5655                                 break;
5656                         pde = (pd_entry_t *)
5657                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
5658                         pde = &pde[pmap_pde_index(addr)];
5659                         if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
5660                             pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
5661                             PMAP_ENTER_NORECLAIM, &lock))) {
5662                                 *pde = srcptepaddr & ~PG_W;
5663                                 pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
5664                                 atomic_add_long(&pmap_pde_mappings, 1);
5665                         } else
5666                                 dst_pdpg->wire_count--;
5667                         continue;
5668                 }
5669
5670                 srcptepaddr &= PG_FRAME;
5671                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5672                 KASSERT(srcmpte->wire_count > 0,
5673                     ("pmap_copy: source page table page is unused"));
5674
5675                 if (va_next > end_addr)
5676                         va_next = end_addr;
5677
5678                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5679                 src_pte = &src_pte[pmap_pte_index(addr)];
5680                 dstmpte = NULL;
5681                 while (addr < va_next) {
5682                         pt_entry_t ptetemp;
5683                         ptetemp = *src_pte;
5684                         /*
5685                          * we only virtual copy managed pages
5686                          */
5687                         if ((ptetemp & PG_MANAGED) != 0) {
5688                                 if (dstmpte != NULL &&
5689                                     dstmpte->pindex == pmap_pde_pindex(addr))
5690                                         dstmpte->wire_count++;
5691                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
5692                                     addr, NULL)) == NULL)
5693                                         goto out;
5694                                 dst_pte = (pt_entry_t *)
5695                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5696                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
5697                                 if (*dst_pte == 0 &&
5698                                     pmap_try_insert_pv_entry(dst_pmap, addr,
5699                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
5700                                     &lock)) {
5701                                         /*
5702                                          * Clear the wired, modified, and
5703                                          * accessed (referenced) bits
5704                                          * during the copy.
5705                                          */
5706                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
5707                                             PG_A);
5708                                         pmap_resident_count_inc(dst_pmap, 1);
5709                                 } else {
5710                                         SLIST_INIT(&free);
5711                                         if (pmap_unwire_ptp(dst_pmap, addr,
5712                                             dstmpte, &free)) {
5713                                                 /*
5714                                                  * Although "addr" is not
5715                                                  * mapped, paging-structure
5716                                                  * caches could nonetheless
5717                                                  * have entries that refer to
5718                                                  * the freed page table pages.
5719                                                  * Invalidate those entries.
5720                                                  */
5721                                                 pmap_invalidate_page(dst_pmap,
5722                                                     addr);
5723                                                 vm_page_free_pages_toq(&free,
5724                                                     true);
5725                                         }
5726                                         goto out;
5727                                 }
5728                                 if (dstmpte->wire_count >= srcmpte->wire_count)
5729                                         break;
5730                         }
5731                         addr += PAGE_SIZE;
5732                         src_pte++;
5733                 }
5734         }
5735 out:
5736         if (lock != NULL)
5737                 rw_wunlock(lock);
5738         PMAP_UNLOCK(src_pmap);
5739         PMAP_UNLOCK(dst_pmap);
5740 }
5741
5742 /*
5743  * Zero the specified hardware page.
5744  */
5745 void
5746 pmap_zero_page(vm_page_t m)
5747 {
5748         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5749
5750         pagezero((void *)va);
5751 }
5752
5753 /*
5754  * Zero an an area within a single hardware page.  off and size must not
5755  * cover an area beyond a single hardware page.
5756  */
5757 void
5758 pmap_zero_page_area(vm_page_t m, int off, int size)
5759 {
5760         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5761
5762         if (off == 0 && size == PAGE_SIZE)
5763                 pagezero((void *)va);
5764         else
5765                 bzero((char *)va + off, size);
5766 }
5767
5768 /*
5769  * Copy 1 specified hardware page to another.
5770  */
5771 void
5772 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5773 {
5774         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5775         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5776
5777         pagecopy((void *)src, (void *)dst);
5778 }
5779
5780 int unmapped_buf_allowed = 1;
5781
5782 void
5783 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5784     vm_offset_t b_offset, int xfersize)
5785 {
5786         void *a_cp, *b_cp;
5787         vm_page_t pages[2];
5788         vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
5789         int cnt;
5790         boolean_t mapped;
5791
5792         while (xfersize > 0) {
5793                 a_pg_offset = a_offset & PAGE_MASK;
5794                 pages[0] = ma[a_offset >> PAGE_SHIFT];
5795                 b_pg_offset = b_offset & PAGE_MASK;
5796                 pages[1] = mb[b_offset >> PAGE_SHIFT];
5797                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5798                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5799                 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
5800                 a_cp = (char *)vaddr[0] + a_pg_offset;
5801                 b_cp = (char *)vaddr[1] + b_pg_offset;
5802                 bcopy(a_cp, b_cp, cnt);
5803                 if (__predict_false(mapped))
5804                         pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
5805                 a_offset += cnt;
5806                 b_offset += cnt;
5807                 xfersize -= cnt;
5808         }
5809 }
5810
5811 /*
5812  * Returns true if the pmap's pv is one of the first
5813  * 16 pvs linked to from this page.  This count may
5814  * be changed upwards or downwards in the future; it
5815  * is only necessary that true be returned for a small
5816  * subset of pmaps for proper page aging.
5817  */
5818 boolean_t
5819 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5820 {
5821         struct md_page *pvh;
5822         struct rwlock *lock;
5823         pv_entry_t pv;
5824         int loops = 0;
5825         boolean_t rv;
5826
5827         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5828             ("pmap_page_exists_quick: page %p is not managed", m));
5829         rv = FALSE;
5830         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5831         rw_rlock(lock);
5832         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5833                 if (PV_PMAP(pv) == pmap) {
5834                         rv = TRUE;
5835                         break;
5836                 }
5837                 loops++;
5838                 if (loops >= 16)
5839                         break;
5840         }
5841         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5842                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5843                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5844                         if (PV_PMAP(pv) == pmap) {
5845                                 rv = TRUE;
5846                                 break;
5847                         }
5848                         loops++;
5849                         if (loops >= 16)
5850                                 break;
5851                 }
5852         }
5853         rw_runlock(lock);
5854         return (rv);
5855 }
5856
5857 /*
5858  *      pmap_page_wired_mappings:
5859  *
5860  *      Return the number of managed mappings to the given physical page
5861  *      that are wired.
5862  */
5863 int
5864 pmap_page_wired_mappings(vm_page_t m)
5865 {
5866         struct rwlock *lock;
5867         struct md_page *pvh;
5868         pmap_t pmap;
5869         pt_entry_t *pte;
5870         pv_entry_t pv;
5871         int count, md_gen, pvh_gen;
5872
5873         if ((m->oflags & VPO_UNMANAGED) != 0)
5874                 return (0);
5875         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5876         rw_rlock(lock);
5877 restart:
5878         count = 0;
5879         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5880                 pmap = PV_PMAP(pv);
5881                 if (!PMAP_TRYLOCK(pmap)) {
5882                         md_gen = m->md.pv_gen;
5883                         rw_runlock(lock);
5884                         PMAP_LOCK(pmap);
5885                         rw_rlock(lock);
5886                         if (md_gen != m->md.pv_gen) {
5887                                 PMAP_UNLOCK(pmap);
5888                                 goto restart;
5889                         }
5890                 }
5891                 pte = pmap_pte(pmap, pv->pv_va);
5892                 if ((*pte & PG_W) != 0)
5893                         count++;
5894                 PMAP_UNLOCK(pmap);
5895         }
5896         if ((m->flags & PG_FICTITIOUS) == 0) {
5897                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5898                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5899                         pmap = PV_PMAP(pv);
5900                         if (!PMAP_TRYLOCK(pmap)) {
5901                                 md_gen = m->md.pv_gen;
5902                                 pvh_gen = pvh->pv_gen;
5903                                 rw_runlock(lock);
5904                                 PMAP_LOCK(pmap);
5905                                 rw_rlock(lock);
5906                                 if (md_gen != m->md.pv_gen ||
5907                                     pvh_gen != pvh->pv_gen) {
5908                                         PMAP_UNLOCK(pmap);
5909                                         goto restart;
5910                                 }
5911                         }
5912                         pte = pmap_pde(pmap, pv->pv_va);
5913                         if ((*pte & PG_W) != 0)
5914                                 count++;
5915                         PMAP_UNLOCK(pmap);
5916                 }
5917         }
5918         rw_runlock(lock);
5919         return (count);
5920 }
5921
5922 /*
5923  * Returns TRUE if the given page is mapped individually or as part of
5924  * a 2mpage.  Otherwise, returns FALSE.
5925  */
5926 boolean_t
5927 pmap_page_is_mapped(vm_page_t m)
5928 {
5929         struct rwlock *lock;
5930         boolean_t rv;
5931
5932         if ((m->oflags & VPO_UNMANAGED) != 0)
5933                 return (FALSE);
5934         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5935         rw_rlock(lock);
5936         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5937             ((m->flags & PG_FICTITIOUS) == 0 &&
5938             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
5939         rw_runlock(lock);
5940         return (rv);
5941 }
5942
5943 /*
5944  * Destroy all managed, non-wired mappings in the given user-space
5945  * pmap.  This pmap cannot be active on any processor besides the
5946  * caller.
5947  *
5948  * This function cannot be applied to the kernel pmap.  Moreover, it
5949  * is not intended for general use.  It is only to be used during
5950  * process termination.  Consequently, it can be implemented in ways
5951  * that make it faster than pmap_remove().  First, it can more quickly
5952  * destroy mappings by iterating over the pmap's collection of PV
5953  * entries, rather than searching the page table.  Second, it doesn't
5954  * have to test and clear the page table entries atomically, because
5955  * no processor is currently accessing the user address space.  In
5956  * particular, a page table entry's dirty bit won't change state once
5957  * this function starts.
5958  *
5959  * Although this function destroys all of the pmap's managed,
5960  * non-wired mappings, it can delay and batch the invalidation of TLB
5961  * entries without calling pmap_delayed_invl_started() and
5962  * pmap_delayed_invl_finished().  Because the pmap is not active on
5963  * any other processor, none of these TLB entries will ever be used
5964  * before their eventual invalidation.  Consequently, there is no need
5965  * for either pmap_remove_all() or pmap_remove_write() to wait for
5966  * that eventual TLB invalidation.
5967  */
5968 void
5969 pmap_remove_pages(pmap_t pmap)
5970 {
5971         pd_entry_t ptepde;
5972         pt_entry_t *pte, tpte;
5973         pt_entry_t PG_M, PG_RW, PG_V;
5974         struct spglist free;
5975         vm_page_t m, mpte, mt;
5976         pv_entry_t pv;
5977         struct md_page *pvh;
5978         struct pv_chunk *pc, *npc;
5979         struct rwlock *lock;
5980         int64_t bit;
5981         uint64_t inuse, bitmask;
5982         int allfree, field, freed, idx;
5983         boolean_t superpage;
5984         vm_paddr_t pa;
5985
5986         /*
5987          * Assert that the given pmap is only active on the current
5988          * CPU.  Unfortunately, we cannot block another CPU from
5989          * activating the pmap while this function is executing.
5990          */
5991         KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
5992 #ifdef INVARIANTS
5993         {
5994                 cpuset_t other_cpus;
5995
5996                 other_cpus = all_cpus;
5997                 critical_enter();
5998                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
5999                 CPU_AND(&other_cpus, &pmap->pm_active);
6000                 critical_exit();
6001                 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
6002         }
6003 #endif
6004
6005         lock = NULL;
6006         PG_M = pmap_modified_bit(pmap);
6007         PG_V = pmap_valid_bit(pmap);
6008         PG_RW = pmap_rw_bit(pmap);
6009
6010         SLIST_INIT(&free);
6011         PMAP_LOCK(pmap);
6012         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6013                 allfree = 1;
6014                 freed = 0;
6015                 for (field = 0; field < _NPCM; field++) {
6016                         inuse = ~pc->pc_map[field] & pc_freemask[field];
6017                         while (inuse != 0) {
6018                                 bit = bsfq(inuse);
6019                                 bitmask = 1UL << bit;
6020                                 idx = field * 64 + bit;
6021                                 pv = &pc->pc_pventry[idx];
6022                                 inuse &= ~bitmask;
6023
6024                                 pte = pmap_pdpe(pmap, pv->pv_va);
6025                                 ptepde = *pte;
6026                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
6027                                 tpte = *pte;
6028                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
6029                                         superpage = FALSE;
6030                                         ptepde = tpte;
6031                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
6032                                             PG_FRAME);
6033                                         pte = &pte[pmap_pte_index(pv->pv_va)];
6034                                         tpte = *pte;
6035                                 } else {
6036                                         /*
6037                                          * Keep track whether 'tpte' is a
6038                                          * superpage explicitly instead of
6039                                          * relying on PG_PS being set.
6040                                          *
6041                                          * This is because PG_PS is numerically
6042                                          * identical to PG_PTE_PAT and thus a
6043                                          * regular page could be mistaken for
6044                                          * a superpage.
6045                                          */
6046                                         superpage = TRUE;
6047                                 }
6048
6049                                 if ((tpte & PG_V) == 0) {
6050                                         panic("bad pte va %lx pte %lx",
6051                                             pv->pv_va, tpte);
6052                                 }
6053
6054 /*
6055  * We cannot remove wired pages from a process' mapping at this time
6056  */
6057                                 if (tpte & PG_W) {
6058                                         allfree = 0;
6059                                         continue;
6060                                 }
6061
6062                                 if (superpage)
6063                                         pa = tpte & PG_PS_FRAME;
6064                                 else
6065                                         pa = tpte & PG_FRAME;
6066
6067                                 m = PHYS_TO_VM_PAGE(pa);
6068                                 KASSERT(m->phys_addr == pa,
6069                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6070                                     m, (uintmax_t)m->phys_addr,
6071                                     (uintmax_t)tpte));
6072
6073                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6074                                     m < &vm_page_array[vm_page_array_size],
6075                                     ("pmap_remove_pages: bad tpte %#jx",
6076                                     (uintmax_t)tpte));
6077
6078                                 pte_clear(pte);
6079
6080                                 /*
6081                                  * Update the vm_page_t clean/reference bits.
6082                                  */
6083                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6084                                         if (superpage) {
6085                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6086                                                         vm_page_dirty(mt);
6087                                         } else
6088                                                 vm_page_dirty(m);
6089                                 }
6090
6091                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6092
6093                                 /* Mark free */
6094                                 pc->pc_map[field] |= bitmask;
6095                                 if (superpage) {
6096                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
6097                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
6098                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6099                                         pvh->pv_gen++;
6100                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
6101                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6102                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
6103                                                             TAILQ_EMPTY(&mt->md.pv_list))
6104                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
6105                                         }
6106                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
6107                                         if (mpte != NULL) {
6108                                                 pmap_resident_count_dec(pmap, 1);
6109                                                 KASSERT(mpte->wire_count == NPTEPG,
6110                                                     ("pmap_remove_pages: pte page wire count error"));
6111                                                 mpte->wire_count = 0;
6112                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
6113                                         }
6114                                 } else {
6115                                         pmap_resident_count_dec(pmap, 1);
6116                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6117                                         m->md.pv_gen++;
6118                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
6119                                             TAILQ_EMPTY(&m->md.pv_list) &&
6120                                             (m->flags & PG_FICTITIOUS) == 0) {
6121                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6122                                                 if (TAILQ_EMPTY(&pvh->pv_list))
6123                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
6124                                         }
6125                                 }
6126                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
6127                                 freed++;
6128                         }
6129                 }
6130                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6131                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6132                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6133                 if (allfree) {
6134                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6135                         free_pv_chunk(pc);
6136                 }
6137         }
6138         if (lock != NULL)
6139                 rw_wunlock(lock);
6140         pmap_invalidate_all(pmap);
6141         PMAP_UNLOCK(pmap);
6142         vm_page_free_pages_toq(&free, true);
6143 }
6144
/*
 * Report whether any mapping of page "m" has the requested status bits set.
 * If "modified" is TRUE, a mapping matches only when both its read/write and
 * modified bits are set; if "accessed" is TRUE, only when both its valid and
 * accessed bits are set.  When both are TRUE, all four bits are required.
 * The mappings on the page's own pv list are looked up with pmap_pte(); for
 * non-fictitious pages, the mappings on the pv list returned by pa_to_pvh()
 * are then looked up with pmap_pde().  Returns TRUE at the first match and
 * FALSE if no mapping matches.
 */
6145 static boolean_t
6146 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
6147 {
6148         struct rwlock *lock;
6149         pv_entry_t pv;
6150         struct md_page *pvh;
6151         pt_entry_t *pte, mask;
6152         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6153         pmap_t pmap;
6154         int md_gen, pvh_gen;
6155         boolean_t rv;
6156
6157         rv = FALSE;
6158         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6159         rw_rlock(lock);
6160 restart:
6161         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6162                 pmap = PV_PMAP(pv);
                     /*
                      * If the pmap lock cannot be taken by the trylock,
                      * release the pv list lock, take the pmap lock, and then
                      * retake the pv list lock.  The pv list may have changed
                      * while it was unlocked; if its generation count
                      * differs, start the scan over.
                      */
6163                 if (!PMAP_TRYLOCK(pmap)) {
6164                         md_gen = m->md.pv_gen;
6165                         rw_runlock(lock);
6166                         PMAP_LOCK(pmap);
6167                         rw_rlock(lock);
6168                         if (md_gen != m->md.pv_gen) {
6169                                 PMAP_UNLOCK(pmap);
6170                                 goto restart;
6171                         }
6172                 }
6173                 pte = pmap_pte(pmap, pv->pv_va);
                     /*
                      * Build the set of bits to test.  The bit positions are
                      * obtained per pmap through the pmap_*_bit() helpers.
                      */
6174                 mask = 0;
6175                 if (modified) {
6176                         PG_M = pmap_modified_bit(pmap);
6177                         PG_RW = pmap_rw_bit(pmap);
6178                         mask |= PG_RW | PG_M;
6179                 }
6180                 if (accessed) {
6181                         PG_A = pmap_accessed_bit(pmap);
6182                         PG_V = pmap_valid_bit(pmap);
6183                         mask |= PG_V | PG_A;
6184                 }
                     /* A match requires every bit in "mask" to be set. */
6185                 rv = (*pte & mask) == mask;
6186                 PMAP_UNLOCK(pmap);
6187                 if (rv)
6188                         goto out;
6189         }
             /*
              * Fictitious pages are skipped here.  For all other pages, also
              * test the mappings recorded on the pa_to_pvh() list; these are
              * examined at the page directory entry level.
              */
6190         if ((m->flags & PG_FICTITIOUS) == 0) {
6191                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6192                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6193                         pmap = PV_PMAP(pv);
                             /*
                              * Same lock dance as above, except that both
                              * generation counts are compared: a change to
                              * either list restarts the whole scan.
                              */
6194                         if (!PMAP_TRYLOCK(pmap)) {
6195                                 md_gen = m->md.pv_gen;
6196                                 pvh_gen = pvh->pv_gen;
6197                                 rw_runlock(lock);
6198                                 PMAP_LOCK(pmap);
6199                                 rw_rlock(lock);
6200                                 if (md_gen != m->md.pv_gen ||
6201                                     pvh_gen != pvh->pv_gen) {
6202                                         PMAP_UNLOCK(pmap);
6203                                         goto restart;
6204                                 }
6205                         }
6206                         pte = pmap_pde(pmap, pv->pv_va);
6207                         mask = 0;
6208                         if (modified) {
6209                                 PG_M = pmap_modified_bit(pmap);
6210                                 PG_RW = pmap_rw_bit(pmap);
6211                                 mask |= PG_RW | PG_M;
6212                         }
6213                         if (accessed) {
6214                                 PG_A = pmap_accessed_bit(pmap);
6215                                 PG_V = pmap_valid_bit(pmap);
6216                                 mask |= PG_V | PG_A;
6217                         }
6218                         rv = (*pte & mask) == mask;
6219                         PMAP_UNLOCK(pmap);
6220                         if (rv)
6221                                 goto out;
6222                 }
6223         }
6224 out:
6225         rw_runlock(lock);
6226         return (rv);
6227 }
6228
6229 /*
6230  *      pmap_is_modified:
6231  *
6232  *      Return whether or not the specified physical page was modified
6233  *      in any physical maps.
      *
      *      The page must be managed and its object must be write locked;
      *      both conditions are asserted below.
6234  */
6235 boolean_t
6236 pmap_is_modified(vm_page_t m)
6237 {
6238
6239         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6240             ("pmap_is_modified: page %p is not managed", m));
6241
6242         /*
6243          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6244          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
6245          * is clear, no PTEs can have PG_M set.
6246          */
6247         VM_OBJECT_ASSERT_WLOCKED(m->object);
6248         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6249                 return (FALSE);
             /*
              * Otherwise, scan the mappings of the page for one that has both
              * the read/write and modified bits set.
              */
6250         return (pmap_page_test_mappings(m, FALSE, TRUE));
6251 }
6252
6253 /*
6254  *      pmap_is_prefaultable:
6255  *
6256  *      Return whether or not the specified virtual address is eligible
6257  *      for prefault.
      *
      *      TRUE is returned only when a page directory entry exists for
      *      "addr", that entry is valid with PG_PS clear, and the page table
      *      entry for "addr" is not valid.  The pmap lock is held while the
      *      entries are read.
6258  */
6259 boolean_t
6260 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6261 {
6262         pd_entry_t *pde;
6263         pt_entry_t *pte, PG_V;
6264         boolean_t rv;
6265
6266         PG_V = pmap_valid_bit(pmap);
6267         rv = FALSE;
6268         PMAP_LOCK(pmap);
6269         pde = pmap_pde(pmap, addr);
             /*
              * Of the two bits tested, exactly PG_V must be set: the entry
              * must be valid and must not have PG_PS set, so that it can be
              * followed to a page table entry.
              */
6270         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
6271                 pte = pmap_pde_to_pte(pde, addr);
                     /* Eligible only if nothing is mapped at "addr" yet. */
6272                 rv = (*pte & PG_V) == 0;
6273         }
6274         PMAP_UNLOCK(pmap);
6275         return (rv);
6276 }
6277
6278 /*
6279  *      pmap_is_referenced:
6280  *
6281  *      Return whether or not the specified physical page was referenced
6282  *      in any physical maps.
      *
      *      The page must be managed; this is asserted below.
6283  */
6284 boolean_t
6285 pmap_is_referenced(vm_page_t m)
6286 {
6287
6288         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6289             ("pmap_is_referenced: page %p is not managed", m));
             /*
              * Scan the mappings of the page for one that has both the valid
              * and accessed bits set.
              */
6290         return (pmap_page_test_mappings(m, TRUE, FALSE));
6291 }
6292
6293 /*
6294  * Clear the write and modified bits in each of the given page's mappings.
      *
      * The page must be managed and its object must be write locked; both
      * conditions are asserted below.  Unless the function returns early
      * because there is nothing to do, PGA_WRITEABLE is cleared from the page
      * before returning.
6295  */
6296 void
6297 pmap_remove_write(vm_page_t m)
6298 {
6299         struct md_page *pvh;
6300         pmap_t pmap;
6301         struct rwlock *lock;
6302         pv_entry_t next_pv, pv;
6303         pd_entry_t *pde;
6304         pt_entry_t oldpte, *pte, PG_M, PG_RW;
6305         vm_offset_t va;
6306         int pvh_gen, md_gen;
6307
6308         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6309             ("pmap_remove_write: page %p is not managed", m));
6310
6311         /*
6312          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6313          * set by another thread while the object is locked.  Thus,
6314          * if PGA_WRITEABLE is clear, no page table entries need updating.
6315          */
6316         VM_OBJECT_ASSERT_WLOCKED(m->object);
6317         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6318                 return;
6319         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6320         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6321             pa_to_pvh(VM_PAGE_TO_PHYS(m));
6322 retry_pv_loop:
6323         rw_wlock(lock);
             /*
              * First pass: for each mapping on the "pvh" list whose page
              * directory entry is writeable, call pmap_demote_pde_locked().
              * The second pass below asserts that it encounters no PG_PS
              * entries.
              */
6324         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6325                 pmap = PV_PMAP(pv);
                     /*
                      * If the trylock fails, drop the pv list lock, take the
                      * pmap lock, and retake the pv list lock.  If the list
                      * changed in the meantime, start over; the pv list lock
                      * is released first because retry_pv_loop reacquires
                      * it.
                      */
6326                 if (!PMAP_TRYLOCK(pmap)) {
6327                         pvh_gen = pvh->pv_gen;
6328                         rw_wunlock(lock);
6329                         PMAP_LOCK(pmap);
6330                         rw_wlock(lock);
6331                         if (pvh_gen != pvh->pv_gen) {
6332                                 PMAP_UNLOCK(pmap);
6333                                 rw_wunlock(lock);
6334                                 goto retry_pv_loop;
6335                         }
6336                 }
6337                 PG_RW = pmap_rw_bit(pmap);
6338                 va = pv->pv_va;
6339                 pde = pmap_pde(pmap, va);
6340                 if ((*pde & PG_RW) != 0)
6341                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
6342                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6343                     ("inconsistent pv lock %p %p for page %p",
6344                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6345                 PMAP_UNLOCK(pmap);
6346         }
             /*
              * Second pass: write protect each page table entry that maps
              * the page, as recorded on the page's own pv list.
              */
6347         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6348                 pmap = PV_PMAP(pv);
6349                 if (!PMAP_TRYLOCK(pmap)) {
6350                         pvh_gen = pvh->pv_gen;
6351                         md_gen = m->md.pv_gen;
6352                         rw_wunlock(lock);
6353                         PMAP_LOCK(pmap);
6354                         rw_wlock(lock);
6355                         if (pvh_gen != pvh->pv_gen ||
6356                             md_gen != m->md.pv_gen) {
6357                                 PMAP_UNLOCK(pmap);
6358                                 rw_wunlock(lock);
6359                                 goto retry_pv_loop;
6360                         }
6361                 }
6362                 PG_M = pmap_modified_bit(pmap);
6363                 PG_RW = pmap_rw_bit(pmap);
6364                 pde = pmap_pde(pmap, pv->pv_va);
6365                 KASSERT((*pde & PG_PS) == 0,
6366                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
6367                     m));
6368                 pte = pmap_pde_to_pte(pde, pv->pv_va);
                     /*
                      * Clear PG_RW and PG_M with a compare-and-set, retrying
                      * if the entry changed after it was read.  If PG_M was
                      * set in the old entry, mark the page dirty so that the
                      * modification is not forgotten, then call
                      * pmap_invalidate_page() for the address.  Entries
                      * without PG_RW are left untouched.
                      */
6369 retry:
6370                 oldpte = *pte;
6371                 if (oldpte & PG_RW) {
6372                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
6373                             ~(PG_RW | PG_M)))
6374                                 goto retry;
6375                         if ((oldpte & PG_M) != 0)
6376                                 vm_page_dirty(m);
6377                         pmap_invalidate_page(pmap, pv->pv_va);
6378                 }
6379                 PMAP_UNLOCK(pmap);
6380         }
6381         rw_wunlock(lock);
6382         vm_page_aflag_clear(m, PGA_WRITEABLE);
6383         pmap_delayed_invl_wait(m);
6384 }
6385
/*
 * Report whether the referenced bit of "pte" may simply be cleared.
 *
 * For a pmap that does not emulate accessed/dirty bits the answer is always
 * TRUE.  Otherwise the pmap is asserted to be of type PT_EPT, and the answer
 * is FALSE if EPT_PG_WRITE is set in "pte", TRUE if EPT_PG_EXECUTE is clear
 * or the pmap has PMAP_SUPPORTS_EXEC_ONLY set, and FALSE in the remaining
 * case.
 */
6386 static __inline boolean_t
6387 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
6388 {
6389
6390         if (!pmap_emulate_ad_bits(pmap))
6391                 return (TRUE);
6392
6393         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
6394
6395         /*
6396          * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
6397          * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared
6398          * if the EPT_PG_WRITE bit is set.
6399          */
6400         if ((pte & EPT_PG_WRITE) != 0)
6401                 return (FALSE);
6402
6403         /*
6404          * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
6405          */
6406         if ((pte & EPT_PG_EXECUTE) == 0 ||
6407             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
6408                 return (TRUE);
6409         else
6410                 return (FALSE);
6411 }
6412
6413 /*
6414  *      pmap_ts_referenced:
6415  *
6416  *      Return a count of reference bits for a page, clearing those bits.
6417  *      It is not necessary for every reference bit to be cleared, but it
6418  *      is necessary that 0 only be returned when there are truly no
6419  *      reference bits set.
6420  *
6421  *      As an optimization, update the page's dirty field if a modified bit is
6422  *      found while counting reference bits.  This opportunistic update can be
6423  *      performed at low cost and can eliminate the need for some future calls
6424  *      to pmap_is_modified().  However, since this function stops after
6425  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6426  *      dirty pages.  Those dirty pages will only be detected by a future call
6427  *      to pmap_is_modified().
6428  *
6429  *      A DI block is not needed within this function, because
6430  *      invalidations are performed before the PV list lock is
6431  *      released.
      *
      *      The returned count is the sum of the reference bits that were
      *      cleared ("cleared") and those that were found set but left in
      *      place ("not_cleared").  The page must be managed; this is
      *      asserted below.
6432  */
6433 int
6434 pmap_ts_referenced(vm_page_t m)
6435 {
6436         struct md_page *pvh;
6437         pv_entry_t pv, pvf;
6438         pmap_t pmap;
6439         struct rwlock *lock;
6440         pd_entry_t oldpde, *pde;
6441         pt_entry_t *pte, PG_A, PG_M, PG_RW;
6442         vm_offset_t va;
6443         vm_paddr_t pa;
6444         int cleared, md_gen, not_cleared, pvh_gen;
6445         struct spglist free;
6446         boolean_t demoted;
6447
6448         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6449             ("pmap_ts_referenced: page %p is not managed", m));
6450         SLIST_INIT(&free);
6451         cleared = 0;
6452         pa = VM_PAGE_TO_PHYS(m);
6453         lock = PHYS_TO_PV_LIST_LOCK(pa);
6454         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
6455         rw_wlock(lock);
6456 retry:
             /*
              * Only "not_cleared" is reset when the scan is retried; bits that
              * were already cleared stay counted in "cleared", which is
              * initialized once above the label.
              */
6457         not_cleared = 0;
             /*
              * First, process the mappings on the "pvh" list, which are
              * examined at the page directory entry level.  "pvf" remembers
              * the entry that was at the head of the list so that the loop
              * can stop once list rotation brings it back to the head.
              */
6458         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6459                 goto small_mappings;
6460         pv = pvf;
6461         do {
                     /*
                      * "pvf" is NULL here only when the entry that it named
                      * was invalidated on a previous iteration; use the
                      * current head of the list as the new marker.
                      */
6462                 if (pvf == NULL)
6463                         pvf = pv;
6464                 pmap = PV_PMAP(pv);
                     /*
                      * If the trylock fails, drop the pv list lock, take the
                      * pmap lock, retake the pv list lock, and restart from
                      * "retry" if the list changed in the meantime.
                      */
6465                 if (!PMAP_TRYLOCK(pmap)) {
6466                         pvh_gen = pvh->pv_gen;
6467                         rw_wunlock(lock);
6468                         PMAP_LOCK(pmap);
6469                         rw_wlock(lock);
6470                         if (pvh_gen != pvh->pv_gen) {
6471                                 PMAP_UNLOCK(pmap);
6472                                 goto retry;
6473                         }
6474                 }
6475                 PG_A = pmap_accessed_bit(pmap);
6476                 PG_M = pmap_modified_bit(pmap);
6477                 PG_RW = pmap_rw_bit(pmap);
6478                 va = pv->pv_va;
6479                 pde = pmap_pde(pmap, pv->pv_va);
6480                 oldpde = *pde;
6481                 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6482                         /*
6483                          * Although "oldpde" is mapping a 2MB page, because
6484                          * this function is called at a 4KB page granularity,
6485                          * we only update the 4KB page under test.
6486                          */
6487                         vm_page_dirty(m);
6488                 }
6489                 if ((oldpde & PG_A) != 0) {
6490                         /*
6491                          * Since this reference bit is shared by 512 4KB
6492                          * pages, it should not be cleared every time it is
6493                          * tested.  Apply a simple "hash" function on the
6494                          * physical page number, the virtual superpage number,
6495                          * and the pmap address to select one 4KB page out of
6496                          * the 512 on which testing the reference bit will
6497                          * result in clearing that reference bit.  This
6498                          * function is designed to avoid the selection of the
6499                          * same 4KB page for every 2MB page mapping.
6500                          *
6501                          * On demotion, a mapping that hasn't been referenced
6502                          * is simply destroyed.  To avoid the possibility of a
6503                          * subsequent page fault on a demoted wired mapping,
6504                          * always leave its reference bit set.  Moreover,
6505                          * since the superpage is wired, the current state of
6506                          * its reference bit won't affect page replacement.
6507                          */
6508                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
6509                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
6510                             (oldpde & PG_W) == 0) {
6511                                 if (safe_to_clear_referenced(pmap, oldpde)) {
6512                                         atomic_clear_long(pde, PG_A);
6513                                         pmap_invalidate_page(pmap, pv->pv_va);
6514                                         demoted = FALSE;
6515                                 } else if (pmap_demote_pde_locked(pmap, pde,
6516                                     pv->pv_va, &lock)) {
6517                                         /*
6518                                          * Remove the mapping to a single page
6519                                          * so that a subsequent access may
6520                                          * repromote.  Since the underlying
6521                                          * page table page is fully populated,
6522                                          * this removal never frees a page
6523                                          * table page.
6524                                          */
6525                                         demoted = TRUE;
                                             /*
                                              * Advance "va" by the offset of
                                              * "m" within the 2MB frame so
                                              * that it addresses the 4KB
                                              * page under test.
                                              */
6526                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
6527                                             PG_PS_FRAME);
6528                                         pte = pmap_pde_to_pte(pde, va);
6529                                         pmap_remove_pte(pmap, pte, va, *pde,
6530                                             NULL, &lock);
6531                                         pmap_invalidate_page(pmap, va);
6532                                 } else
6533                                         demoted = TRUE;
6534
6535                                 if (demoted) {
6536                                         /*
6537                                          * The superpage mapping was removed
6538                                          * entirely and therefore 'pv' is no
6539                                          * longer valid.
6540                                          */
6541                                         if (pvf == pv)
6542                                                 pvf = NULL;
6543                                         pv = NULL;
6544                                 }
6545                                 cleared++;
6546                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6547                                     ("inconsistent pv lock %p %p for page %p",
6548                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6549                         } else
6550                                 not_cleared++;
6551                 }
6552                 PMAP_UNLOCK(pmap);
6553                 /* Rotate the PV list if it has more than one entry. */
6554                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6555                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6556                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6557                         pvh->pv_gen++;
6558                 }
6559                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6560                         goto out;
6561         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6562 small_mappings:
             /*
              * Next, process the mappings on the page's own pv list, using
              * the same scheme of rotating the list until "pvf" returns to
              * the head or the PMAP_TS_REFERENCED_MAX limit is reached.
              */
6563         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6564                 goto out;
6565         pv = pvf;
6566         do {
6567                 if (pvf == NULL)
6568                         pvf = pv;
6569                 pmap = PV_PMAP(pv);
6570                 if (!PMAP_TRYLOCK(pmap)) {
6571                         pvh_gen = pvh->pv_gen;
6572                         md_gen = m->md.pv_gen;
6573                         rw_wunlock(lock);
6574                         PMAP_LOCK(pmap);
6575                         rw_wlock(lock);
6576                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6577                                 PMAP_UNLOCK(pmap);
6578                                 goto retry;
6579                         }
6580                 }
6581                 PG_A = pmap_accessed_bit(pmap);
6582                 PG_M = pmap_modified_bit(pmap);
6583                 PG_RW = pmap_rw_bit(pmap);
6584                 pde = pmap_pde(pmap, pv->pv_va);
6585                 KASSERT((*pde & PG_PS) == 0,
6586                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
6587                     m));
6588                 pte = pmap_pde_to_pte(pde, pv->pv_va);
6589                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6590                         vm_page_dirty(m);
6591                 if ((*pte & PG_A) != 0) {
6592                         if (safe_to_clear_referenced(pmap, *pte)) {
6593                                 atomic_clear_long(pte, PG_A);
6594                                 pmap_invalidate_page(pmap, pv->pv_va);
6595                                 cleared++;
6596                         } else if ((*pte & PG_W) == 0) {
6597                                 /*
6598                                  * Wired pages cannot be paged out so
6599                                  * doing accessed bit emulation for
6600                                  * them is wasted effort. We do the
6601                                  * hard work for unwired pages only.
6602                                  */
6603                                 pmap_remove_pte(pmap, pte, pv->pv_va,
6604                                     *pde, &free, &lock);
6605                                 pmap_invalidate_page(pmap, pv->pv_va);
6606                                 cleared++;
                                     /*
                                      * The mapping was removed, so "pv" must
                                      * not be used again; also drop the
                                      * marker if it named this entry.
                                      */
6607                                 if (pvf == pv)
6608                                         pvf = NULL;
6609                                 pv = NULL;
6610                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6611                                     ("inconsistent pv lock %p %p for page %p",
6612                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6613                         } else
6614                                 not_cleared++;
6615                 }
6616                 PMAP_UNLOCK(pmap);
6617                 /* Rotate the PV list if it has more than one entry. */
6618                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6619                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6620                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6621                         m->md.pv_gen++;
6622                 }
6623         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6624             not_cleared < PMAP_TS_REFERENCED_MAX);
6625 out:
             /*
              * The pages collected on "free" (the list handed to
              * pmap_remove_pte() above) are released only after the pv list
              * lock has been dropped.
              */
6626         rw_wunlock(lock);
6627         vm_page_free_pages_toq(&free, true);
6628         return (cleared + not_cleared);
6629 }
6630
6631 /*
6632  *      Apply the given advice to the specified range of addresses within the
6633  *      given pmap.  Depending on the advice, clear the referenced and/or
6634  *      modified flags in each mapping and set the mapped page's dirty field.
      *
      *      Advice other than MADV_DONTNEED and MADV_FREE is ignored, and
      *      nothing is done for a pmap that requires A/D bit emulation.  Only
      *      mappings with both PG_MANAGED and PG_V set are changed.
6635  */
6636 void
6637 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6638 {
6639         struct rwlock *lock;
6640         pml4_entry_t *pml4e;
6641         pdp_entry_t *pdpe;
6642         pd_entry_t oldpde, *pde;
6643         pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6644         vm_offset_t va, va_next;
6645         vm_page_t m;
6646         boolean_t anychanged;
6647
6648         if (advice != MADV_DONTNEED && advice != MADV_FREE)
6649                 return;
6650
6651         /*
6652          * A/D bit emulation requires an alternate code path when clearing
6653          * the modified and accessed bits below. Since this function is
6654          * advisory in nature we skip it entirely for pmaps that require
6655          * A/D bit emulation.
6656          */
6657         if (pmap_emulate_ad_bits(pmap))
6658                 return;
6659
6660         PG_A = pmap_accessed_bit(pmap);
6661         PG_G = pmap_global_bit(pmap);
6662         PG_M = pmap_modified_bit(pmap);
6663         PG_V = pmap_valid_bit(pmap);
6664         PG_RW = pmap_rw_bit(pmap);
6665         anychanged = FALSE;
6666         pmap_delayed_invl_started();
6667         PMAP_LOCK(pmap);
6668         for (; sva < eva; sva = va_next) {
6669                 pml4e = pmap_pml4e(pmap, sva);
                     /*
                      * When an upper-level entry is not valid, skip ahead to
                      * the next boundary at that level.  If the address
                      * computation wrapped around, use "eva" instead so that
                      * the loop terminates.
                      */
6670                 if ((*pml4e & PG_V) == 0) {
6671                         va_next = (sva + NBPML4) & ~PML4MASK;
6672                         if (va_next < sva)
6673                                 va_next = eva;
6674                         continue;
6675                 }
6676                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6677                 if ((*pdpe & PG_V) == 0) {
6678                         va_next = (sva + NBPDP) & ~PDPMASK;
6679                         if (va_next < sva)
6680                                 va_next = eva;
6681                         continue;
6682                 }
6683                 va_next = (sva + NBPDR) & ~PDRMASK;
6684                 if (va_next < sva)
6685                         va_next = eva;
6686                 pde = pmap_pdpe_to_pde(pdpe, sva);
6687                 oldpde = *pde;
6688                 if ((oldpde & PG_V) == 0)
6689                         continue;
6690                 else if ((oldpde & PG_PS) != 0) {
                             /*
                              * Unmanaged PG_PS mappings are left alone.
                              * Managed ones are demoted first so that the
                              * loop over page table entries below can be
                              * applied to them.
                              */
6691                         if ((oldpde & PG_MANAGED) == 0)
6692                                 continue;
6693                         lock = NULL;
6694                         if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6695                                 if (lock != NULL)
6696                                         rw_wunlock(lock);
6697
6698                                 /*
6699                                  * The large page mapping was destroyed.
6700                                  */
6701                                 continue;
6702                         }
6703
6704                         /*
6705                          * Unless the page mappings are wired, remove the
6706                          * mapping to a single page so that a subsequent
6707                          * access may repromote.  Since the underlying page
6708                          * table page is fully populated, this removal never
6709                          * frees a page table page.
6710                          */
6711                         if ((oldpde & PG_W) == 0) {
6712                                 pte = pmap_pde_to_pte(pde, sva);
6713                                 KASSERT((*pte & PG_V) != 0,
6714                                     ("pmap_advise: invalid PTE"));
6715                                 pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6716                                     &lock);
6717                                 anychanged = TRUE;
6718                         }
6719                         if (lock != NULL)
6720                                 rw_wunlock(lock);
6721                 }
6722                 if (va_next > eva)
6723                         va_next = eva;
                     /*
                      * "va" marks the start of a pending run of changed PG_G
                      * mappings that still needs pmap_invalidate_range();
                      * va == va_next means that no run is pending.
                      */
6724                 va = va_next;
6725                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6726                     sva += PAGE_SIZE) {
6727                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
6728                                 goto maybe_invlrng;
6729                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6730                                 if (advice == MADV_DONTNEED) {
6731                                         /*
6732                                          * Future calls to pmap_is_modified()
6733                                          * can be avoided by making the page
6734                                          * dirty now.
6735                                          */
6736                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6737                                         vm_page_dirty(m);
6738                                 }
6739                                 atomic_clear_long(pte, PG_M | PG_A);
6740                         } else if ((*pte & PG_A) != 0)
6741                                 atomic_clear_long(pte, PG_A);
6742                         else
6743                                 goto maybe_invlrng;
6744
                             /*
                              * This entry was changed.  A PG_G mapping starts
                              * a pending run at "sva" unless one is already
                              * open; any other mapping is covered by the
                              * pmap_invalidate_all() call made at the end
                              * when "anychanged" is set.
                              */
6745                         if ((*pte & PG_G) != 0) {
6746                                 if (va == va_next)
6747                                         va = sva;
6748                         } else
6749                                 anychanged = TRUE;
6750                         continue;
6751 maybe_invlrng:
                             /*
                              * This entry was not changed, which ends any
                              * pending run: invalidate [va, sva) and mark
                              * the run as closed.
                              */
6752                         if (va != va_next) {
6753                                 pmap_invalidate_range(pmap, va, sva);
6754                                 va = va_next;
6755                         }
6756                 }
                     /* Flush a run that extends to the end of this chunk. */
6757                 if (va != va_next)
6758                         pmap_invalidate_range(pmap, va, sva);
6759         }
6760         if (anychanged)
6761                 pmap_invalidate_all(pmap);
6762         PMAP_UNLOCK(pmap);
6763         pmap_delayed_invl_finished();
6764 }
6765
6766 /*
6767  *      Clear the modify bits on the specified physical page.
      *
      *      The page must be managed, its object must be write locked, and
      *      the page must not be exclusive busied; all three conditions are
      *      asserted below.  Nothing is done unless PGA_WRITEABLE is set.
      *
      *      Two pv lists are walked while holding the page's pv list lock:
      *      first the list selected by pa_to_pvh() (or pv_dummy for a
      *      fictitious page), whose entries are processed at the PDE level,
      *      and then the page's own md.pv_list, whose entries are processed
      *      at the PTE level.
6768  */
6769 void
6770 pmap_clear_modify(vm_page_t m)
6771 {
6772         struct md_page *pvh;
6773         pmap_t pmap;
6774         pv_entry_t next_pv, pv;
6775         pd_entry_t oldpde, *pde;
6776         pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6777         struct rwlock *lock;
6778         vm_offset_t va;
6779         int md_gen, pvh_gen;
6780
6781         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6782             ("pmap_clear_modify: page %p is not managed", m));
6783         VM_OBJECT_ASSERT_WLOCKED(m->object);
6784         KASSERT(!vm_page_xbusied(m),
6785             ("pmap_clear_modify: page %p is exclusive busied", m));
6786
6787         /*
6788          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6789          * If the object containing the page is locked and the page is not
6790          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6791          */
6792         if ((m->aflags & PGA_WRITEABLE) == 0)
6793                 return;
6794         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6795             pa_to_pvh(VM_PAGE_TO_PHYS(m));
6796         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6797         rw_wlock(lock);
6798 restart:
             /*
              * First pass: PDE-level mappings.  A PDE with PG_RW set is
              * demoted; if the demotion succeeds and PG_W was clear in the
              * old PDE, the 4KB mapping of this page loses PG_M and PG_RW,
              * the page is marked dirty, and the TLB entry is invalidated.
              */
6799         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6800                 pmap = PV_PMAP(pv);
6801                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * The pmap lock could not be taken without
                              * blocking.  Drop the pv list lock, block on
                              * the pmap lock, retake the pv list lock, and
                              * start over if the list generation changed in
                              * the meantime.
                              */
6802                         pvh_gen = pvh->pv_gen;
6803                         rw_wunlock(lock);
6804                         PMAP_LOCK(pmap);
6805                         rw_wlock(lock);
6806                         if (pvh_gen != pvh->pv_gen) {
6807                                 PMAP_UNLOCK(pmap);
6808                                 goto restart;
6809                         }
6810                 }
6811                 PG_M = pmap_modified_bit(pmap);
6812                 PG_V = pmap_valid_bit(pmap);
6813                 PG_RW = pmap_rw_bit(pmap);
6814                 va = pv->pv_va;
6815                 pde = pmap_pde(pmap, va);
6816                 oldpde = *pde;
6817                 if ((oldpde & PG_RW) != 0) {
6818                         if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6819                                 if ((oldpde & PG_W) == 0) {
6820                                         /*
6821                                          * Write protect the mapping to a
6822                                          * single page so that a subsequent
6823                                          * write access may repromote.
6824                                          */
                                             /*
                                              * Advance "va" by the offset of
                                              * this page within the old 2MB
                                              * frame to reach its 4KB mapping.
                                              */
6825                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
6826                                             PG_PS_FRAME);
6827                                         pte = pmap_pde_to_pte(pde, va);
6828                                         oldpte = *pte;
6829                                         if ((oldpte & PG_V) != 0) {
                                                     /*
                                                      * Reread the PTE and retry
                                                      * until the compare-and-set
                                                      * succeeds.
                                                      */
6830                                                 while (!atomic_cmpset_long(pte,
6831                                                     oldpte,
6832                                                     oldpte & ~(PG_M | PG_RW)))
6833                                                         oldpte = *pte;
6834                                                 vm_page_dirty(m);
6835                                                 pmap_invalidate_page(pmap, va);
6836                                         }
6837                                 }
6838                         }
6839                 }
6840                 PMAP_UNLOCK(pmap);
6841         }
             /*
              * Second pass: the page's own pv list.  PG_M is cleared, and the
              * TLB entry invalidated, only where both PG_M and PG_RW are set.
              * The same lock hand-off as above is used, but here a change in
              * either list's generation forces a restart from the first pass.
              */
6842         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6843                 pmap = PV_PMAP(pv);
6844                 if (!PMAP_TRYLOCK(pmap)) {
6845                         md_gen = m->md.pv_gen;
6846                         pvh_gen = pvh->pv_gen;
6847                         rw_wunlock(lock);
6848                         PMAP_LOCK(pmap);
6849                         rw_wlock(lock);
6850                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6851                                 PMAP_UNLOCK(pmap);
6852                                 goto restart;
6853                         }
6854                 }
6855                 PG_M = pmap_modified_bit(pmap);
6856                 PG_RW = pmap_rw_bit(pmap);
6857                 pde = pmap_pde(pmap, pv->pv_va);
6858                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6859                     " a 2mpage in page %p's pv list", m));
6860                 pte = pmap_pde_to_pte(pde, pv->pv_va);
6861                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6862                         atomic_clear_long(pte, PG_M);
6863                         pmap_invalidate_page(pmap, pv->pv_va);
6864                 }
6865                 PMAP_UNLOCK(pmap);
6866         }
6867         rw_wunlock(lock);
6868 }
6869
6870 /*
6871  * Miscellaneous support routines follow
6872  */
6873
6874 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
6875 static __inline void
6876 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6877 {
6878         u_int opte, npte;
6879
6880         /*
6881          * The cache mode bits are all in the low 32-bits of the
6882          * PTE, so we can just spin on updating the low 32-bits.
6883          */
6884         do {
6885                 opte = *(u_int *)pte;
6886                 npte = opte & ~mask;
6887                 npte |= cache_bits;
6888         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6889 }
6890
6891 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
6892 static __inline void
6893 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6894 {
6895         u_int opde, npde;
6896
6897         /*
6898          * The cache mode bits are all in the low 32-bits of the
6899          * PDE, so we can just spin on updating the low 32-bits.
6900          */
6901         do {
6902                 opde = *(u_int *)pde;
6903                 npde = opde & ~mask;
6904                 npde |= cache_bits;
6905         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
6906 }
6907
6908 /*
6909  * Map a set of physical memory pages into the kernel virtual
6910  * address space. Return a pointer to where it is mapped. This
6911  * routine is intended to be used for mapping device memory,
6912  * NOT real memory.
6913  */
6914 void *
6915 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
6916 {
6917         struct pmap_preinit_mapping *ppim;
6918         vm_offset_t va, offset;
6919         vm_size_t tmpsize;
6920         int i;
6921
6922         offset = pa & PAGE_MASK;
6923         size = round_page(offset + size);
6924         pa = trunc_page(pa);
6925
6926         if (!pmap_initialized) {
6927                 va = 0;
6928                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6929                         ppim = pmap_preinit_mapping + i;
6930                         if (ppim->va == 0) {
6931                                 ppim->pa = pa;
6932                                 ppim->sz = size;
6933                                 ppim->mode = mode;
6934                                 ppim->va = virtual_avail;
6935                                 virtual_avail += size;
6936                                 va = ppim->va;
6937                                 break;
6938                         }
6939                 }
6940                 if (va == 0)
6941                         panic("%s: too many preinit mappings", __func__);
6942         } else {
6943                 /*
6944                  * If we have a preinit mapping, re-use it.
6945                  */
6946                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6947                         ppim = pmap_preinit_mapping + i;
6948                         if (ppim->pa == pa && ppim->sz == size &&
6949                             ppim->mode == mode)
6950                                 return ((void *)(ppim->va + offset));
6951                 }
6952                 /*
6953                  * If the specified range of physical addresses fits within
6954                  * the direct map window, use the direct map.
6955                  */
6956                 if (pa < dmaplimit && pa + size < dmaplimit) {
6957                         va = PHYS_TO_DMAP(pa);
6958                         if (!pmap_change_attr(va, size, mode))
6959                                 return ((void *)(va + offset));
6960                 }
6961                 va = kva_alloc(size);
6962                 if (va == 0)
6963                         panic("%s: Couldn't allocate KVA", __func__);
6964         }
6965         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6966                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
6967         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
6968         pmap_invalidate_cache_range(va, va + tmpsize);
6969         return ((void *)(va + offset));
6970 }
6971
6972 void *
6973 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
6974 {
6975
6976         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
6977 }
6978
6979 void *
6980 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6981 {
6982
6983         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
6984 }
6985
6986 void
6987 pmap_unmapdev(vm_offset_t va, vm_size_t size)
6988 {
6989         struct pmap_preinit_mapping *ppim;
6990         vm_offset_t offset;
6991         int i;
6992
6993         /* If we gave a direct map region in pmap_mapdev, do nothing */
6994         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
6995                 return;
6996         offset = va & PAGE_MASK;
6997         size = round_page(offset + size);
6998         va = trunc_page(va);
6999         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7000                 ppim = pmap_preinit_mapping + i;
7001                 if (ppim->va == va && ppim->sz == size) {
7002                         if (pmap_initialized)
7003                                 return;
7004                         ppim->pa = 0;
7005                         ppim->va = 0;
7006                         ppim->sz = 0;
7007                         ppim->mode = 0;
7008                         if (va + size == virtual_avail)
7009                                 virtual_avail = va;
7010                         return;
7011                 }
7012         }
7013         if (pmap_initialized)
7014                 kva_free(va, size);
7015 }
7016
7017 /*
7018  * Tries to demote a 1GB page mapping.
7019  */
7020 static boolean_t
7021 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
7022 {
7023         pdp_entry_t newpdpe, oldpdpe;
7024         pd_entry_t *firstpde, newpde, *pde;
7025         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7026         vm_paddr_t pdpgpa;
7027         vm_page_t pdpg;
7028
7029         PG_A = pmap_accessed_bit(pmap);
7030         PG_M = pmap_modified_bit(pmap);
7031         PG_V = pmap_valid_bit(pmap);
7032         PG_RW = pmap_rw_bit(pmap);
7033
7034         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7035         oldpdpe = *pdpe;
7036         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
7037             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
7038         if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
7039             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
7040                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
7041                     " in pmap %p", va, pmap);
7042                 return (FALSE);
7043         }
7044         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
7045         firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
7046         newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
7047         KASSERT((oldpdpe & PG_A) != 0,
7048             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
7049         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
7050             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
7051         newpde = oldpdpe;
7052
7053         /*
7054          * Initialize the page directory page.
7055          */
7056         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
7057                 *pde = newpde;
7058                 newpde += NBPDR;
7059         }
7060
7061         /*
7062          * Demote the mapping.
7063          */
7064         *pdpe = newpdpe;
7065
7066         /*
7067          * Invalidate a stale recursive mapping of the page directory page.
7068          */
7069         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
7070
7071         pmap_pdpe_demotions++;
7072         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
7073             " in pmap %p", va, pmap);
7074         return (TRUE);
7075 }
7076
7077 /*
7078  * Sets the memory attribute for the specified page.
7079  */
7080 void
7081 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7082 {
7083
7084         m->md.pat_mode = ma;
7085
7086         /*
7087          * If "m" is a normal page, update its direct mapping.  This update
7088          * can be relied upon to perform any cache operations that are
7089          * required for data coherence.
7090          */
7091         if ((m->flags & PG_FICTITIOUS) == 0 &&
7092             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7093             m->md.pat_mode))
7094                 panic("memory attribute change on the direct map failed");
7095 }
7096
7097 /*
7098  * Changes the specified virtual address range's memory type to that given by
7099  * the parameter "mode".  The specified virtual address range must be
7100  * completely contained within either the direct map or the kernel map.  If
7101  * the virtual address range is contained within the kernel map, then the
7102  * memory type for each of the corresponding ranges of the direct map is also
7103  * changed.  (The corresponding ranges of the direct map are those ranges that
7104  * map the same physical pages as the specified virtual address range.)  These
7105  * changes to the direct map are necessary because Intel describes the
7106  * behavior of their processors as "undefined" if two or more mappings to the
7107  * same physical page have different memory types.
7108  *
7109  * Returns zero if the change completed successfully, and either EINVAL or
7110  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7111  * of the virtual address range was not mapped, and ENOMEM is returned if
7112  * there was insufficient memory available to complete the change.  In the
7113  * latter case, the memory type may have been changed on some part of the
7114  * virtual address range or the direct map.
7115  */
7116 int
7117 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7118 {
7119         int error;
7120
7121         PMAP_LOCK(kernel_pmap);
7122         error = pmap_change_attr_locked(va, size, mode);
7123         PMAP_UNLOCK(kernel_pmap);
7124         return (error);
7125 }
7126
/*
 * Worker for pmap_change_attr(); the kernel pmap lock must be held (this is
 * asserted below).
 *
 * The page-rounded range [base, base + size) is processed in two passes.
 * The first pass checks that every page is mapped and demotes any 1GB or
 * 2MB mapping that does not already have the requested memory type and is
 * only partially covered by the range; it changes no attributes.  The
 * second pass rewrites the cache bits and, for addresses at or above
 * VM_MIN_KERNEL_ADDRESS, recursively applies the same change to the direct
 * map ranges backing the same physical pages.
 *
 * Returns 0 on success, EINVAL if the range starts below DMAP_MIN_ADDRESS
 * or contains an unmapped page, and ENOMEM if a demotion fails.  Errors
 * from the recursive direct map updates are returned as well.
 */
7127 static int
7128 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
7129 {
7130         vm_offset_t base, offset, tmpva;
7131         vm_paddr_t pa_start, pa_end, pa_end1;
7132         pdp_entry_t *pdpe;
7133         pd_entry_t *pde;
7134         pt_entry_t *pte;
7135         int cache_bits_pte, cache_bits_pde, error;
7136         boolean_t changed;
7137
7138         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7139         base = trunc_page(va);
7140         offset = va & PAGE_MASK;
7141         size = round_page(offset + size);
7142
7143         /*
7144          * Only supported on kernel virtual addresses, including the direct
7145          * map but excluding the recursive map.
7146          */
7147         if (base < DMAP_MIN_ADDRESS)
7148                 return (EINVAL);
7149
             /*
              * The same PDE-format cache bits are used below for both 1GB
              * and 2MB mappings; 4KB mappings use the PTE-format bits.
              */
7150         cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
7151         cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
7152         changed = FALSE;
7153
7154         /*
7155          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
7156          * into 4KB pages if required.
7157          */
7158         for (tmpva = base; tmpva < base + size; ) {
7159                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
7160                 if (pdpe == NULL || *pdpe == 0)
7161                         return (EINVAL);
7162                 if (*pdpe & PG_PS) {
7163                         /*
7164                          * If the current 1GB page already has the required
7165                          * memory type, then we need not demote this page. Just
7166                          * increment tmpva to the next 1GB page frame.
7167                          */
7168                         if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
7169                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
7170                                 continue;
7171                         }
7172
7173                         /*
7174                          * If the current offset aligns with a 1GB page frame
7175                          * and there is at least 1GB left within the range, then
7176                          * we need not break down this page into 2MB pages.
7177                          */
7178                         if ((tmpva & PDPMASK) == 0 &&
7179                             tmpva + PDPMASK < base + size) {
7180                                 tmpva += NBPDP;
7181                                 continue;
7182                         }
7183                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
7184                                 return (ENOMEM);
7185                 }
7186                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
7187                 if (*pde == 0)
7188                         return (EINVAL);
7189                 if (*pde & PG_PS) {
7190                         /*
7191                          * If the current 2MB page already has the required
7192                          * memory type, then we need not demote this page. Just
7193                          * increment tmpva to the next 2MB page frame.
7194                          */
7195                         if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
7196                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
7197                                 continue;
7198                         }
7199
7200                         /*
7201                          * If the current offset aligns with a 2MB page frame
7202                          * and there is at least 2MB left within the range, then
7203                          * we need not break down this page into 4KB pages.
7204                          */
7205                         if ((tmpva & PDRMASK) == 0 &&
7206                             tmpva + PDRMASK < base + size) {
7207                                 tmpva += NBPDR;
7208                                 continue;
7209                         }
7210                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
7211                                 return (ENOMEM);
7212                 }
7213                 pte = pmap_pde_to_pte(pde, tmpva);
7214                 if (*pte == 0)
7215                         return (EINVAL);
7216                 tmpva += PAGE_SIZE;
7217         }
7218         error = 0;
7219
7220         /*
7221          * Ok, all the pages exist, so run through them updating their
7222          * cache mode if required.
7223          */
             /*
              * [pa_start, pa_end) accumulates a run of contiguous physical
              * addresses whose direct map mappings must be changed as well;
              * pa_start == pa_end means that no run is open.
              */
7224         pa_start = pa_end = 0;
7225         for (tmpva = base; tmpva < base + size; ) {
7226                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
7227                 if (*pdpe & PG_PS) {
7228                         if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
                                     /* The PDE helper is also used for the PDPE. */
7229                                 pmap_pde_attr(pdpe, cache_bits_pde,
7230                                     X86_PG_PDE_CACHE);
7231                                 changed = TRUE;
7232                         }
                             /*
                              * Only addresses at or above VM_MIN_KERNEL_ADDRESS
                              * whose frame lies below dmaplimit take part in
                              * the run tracking.  A frame that does not extend
                              * the open run ends it: the direct map range for
                              * the finished run is updated recursively and a
                              * new run is started.  The 2MB and 4KB cases
                              * below follow the same scheme.
                              */
7233                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7234                             (*pdpe & PG_PS_FRAME) < dmaplimit) {
7235                                 if (pa_start == pa_end) {
7236                                         /* Start physical address run. */
7237                                         pa_start = *pdpe & PG_PS_FRAME;
7238                                         pa_end = pa_start + NBPDP;
7239                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
7240                                         pa_end += NBPDP;
7241                                 else {
7242                                         /* Run ended, update direct map. */
7243                                         error = pmap_change_attr_locked(
7244                                             PHYS_TO_DMAP(pa_start),
7245                                             pa_end - pa_start, mode);
7246                                         if (error != 0)
7247                                                 break;
7248                                         /* Start physical address run. */
7249                                         pa_start = *pdpe & PG_PS_FRAME;
7250                                         pa_end = pa_start + NBPDP;
7251                                 }
7252                         }
7253                         tmpva = trunc_1gpage(tmpva) + NBPDP;
7254                         continue;
7255                 }
7256                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
7257                 if (*pde & PG_PS) {
7258                         if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
7259                                 pmap_pde_attr(pde, cache_bits_pde,
7260                                     X86_PG_PDE_CACHE);
7261                                 changed = TRUE;
7262                         }
7263                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7264                             (*pde & PG_PS_FRAME) < dmaplimit) {
7265                                 if (pa_start == pa_end) {
7266                                         /* Start physical address run. */
7267                                         pa_start = *pde & PG_PS_FRAME;
7268                                         pa_end = pa_start + NBPDR;
7269                                 } else if (pa_end == (*pde & PG_PS_FRAME))
7270                                         pa_end += NBPDR;
7271                                 else {
7272                                         /* Run ended, update direct map. */
7273                                         error = pmap_change_attr_locked(
7274                                             PHYS_TO_DMAP(pa_start),
7275                                             pa_end - pa_start, mode);
7276                                         if (error != 0)
7277                                                 break;
7278                                         /* Start physical address run. */
7279                                         pa_start = *pde & PG_PS_FRAME;
7280                                         pa_end = pa_start + NBPDR;
7281                                 }
7282                         }
7283                         tmpva = trunc_2mpage(tmpva) + NBPDR;
7284                 } else {
7285                         pte = pmap_pde_to_pte(pde, tmpva);
7286                         if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
7287                                 pmap_pte_attr(pte, cache_bits_pte,
7288                                     X86_PG_PTE_CACHE);
7289                                 changed = TRUE;
7290                         }
7291                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7292                             (*pte & PG_FRAME) < dmaplimit) {
7293                                 if (pa_start == pa_end) {
7294                                         /* Start physical address run. */
7295                                         pa_start = *pte & PG_FRAME;
7296                                         pa_end = pa_start + PAGE_SIZE;
7297                                 } else if (pa_end == (*pte & PG_FRAME))
7298                                         pa_end += PAGE_SIZE;
7299                                 else {
7300                                         /* Run ended, update direct map. */
7301                                         error = pmap_change_attr_locked(
7302                                             PHYS_TO_DMAP(pa_start),
7303                                             pa_end - pa_start, mode);
7304                                         if (error != 0)
7305                                                 break;
7306                                         /* Start physical address run. */
7307                                         pa_start = *pte & PG_FRAME;
7308                                         pa_end = pa_start + PAGE_SIZE;
7309                                 }
7310                         }
7311                         tmpva += PAGE_SIZE;
7312                 }
7313         }
             /*
              * Update the direct map for a run that is still open when the
              * loop ends, clipping the run at dmaplimit.
              */
7314         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
7315                 pa_end1 = MIN(pa_end, dmaplimit);
7316                 if (pa_start != pa_end1)
7317                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
7318                             pa_end1 - pa_start, mode);
7319         }
7320
7321         /*
7322          * Flush CPU caches if required to make sure any data isn't cached that
7323          * shouldn't be, etc.
7324          */
             /*
              * tmpva is where the second pass stopped, so after an early
              * exit only the part that was processed is flushed.
              */
7325         if (changed) {
7326                 pmap_invalidate_range(kernel_pmap, base, tmpva);
7327                 pmap_invalidate_cache_range(base, tmpva);
7328         }
7329         return (error);
7330 }
7331
7332 /*
7333  * Demotes any mapping within the direct map region that covers more than the
7334  * specified range of physical addresses.  This range's size must be a power
7335  * of two and its starting address must be a multiple of its size.  Since the
7336  * demotion does not change any attributes of the mapping, a TLB invalidation
7337  * is not mandatory.  The caller may, however, request a TLB invalidation.
7338  */
7339 void
7340 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
7341 {
7342         pdp_entry_t *pdpe;
7343         pd_entry_t *pde;
7344         vm_offset_t va;
7345         boolean_t changed;
7346
7347         if (len == 0)
7348                 return;
7349         KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
7350         KASSERT((base & (len - 1)) == 0,
7351             ("pmap_demote_DMAP: base is not a multiple of len"));
7352         if (len < NBPDP && base < dmaplimit) {
7353                 va = PHYS_TO_DMAP(base);
7354                 changed = FALSE;
7355                 PMAP_LOCK(kernel_pmap);
7356                 pdpe = pmap_pdpe(kernel_pmap, va);
7357                 if ((*pdpe & X86_PG_V) == 0)
7358                         panic("pmap_demote_DMAP: invalid PDPE");
7359                 if ((*pdpe & PG_PS) != 0) {
7360                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
7361                                 panic("pmap_demote_DMAP: PDPE failed");
7362                         changed = TRUE;
7363                 }
7364                 if (len < NBPDR) {
7365                         pde = pmap_pdpe_to_pde(pdpe, va);
7366                         if ((*pde & X86_PG_V) == 0)
7367                                 panic("pmap_demote_DMAP: invalid PDE");
7368                         if ((*pde & PG_PS) != 0) {
7369                                 if (!pmap_demote_pde(kernel_pmap, pde, va))
7370                                         panic("pmap_demote_DMAP: PDE failed");
7371                                 changed = TRUE;
7372                         }
7373                 }
7374                 if (changed && invalidate)
7375                         pmap_invalidate_page(kernel_pmap, va);
7376                 PMAP_UNLOCK(kernel_pmap);
7377         }
7378 }
7379
7380 /*
7381  * perform the pmap work for mincore
7382  */
7383 int
7384 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
7385 {
7386         pd_entry_t *pdep;
7387         pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
7388         vm_paddr_t pa;
7389         int val;
7390
7391         PG_A = pmap_accessed_bit(pmap);
7392         PG_M = pmap_modified_bit(pmap);
7393         PG_V = pmap_valid_bit(pmap);
7394         PG_RW = pmap_rw_bit(pmap);
7395
7396         PMAP_LOCK(pmap);
7397 retry:
7398         pdep = pmap_pde(pmap, addr);
7399         if (pdep != NULL && (*pdep & PG_V)) {
7400                 if (*pdep & PG_PS) {
7401                         pte = *pdep;
7402                         /* Compute the physical address of the 4KB page. */
7403                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
7404                             PG_FRAME;
7405                         val = MINCORE_SUPER;
7406                 } else {
7407                         pte = *pmap_pde_to_pte(pdep, addr);
7408                         pa = pte & PG_FRAME;
7409                         val = 0;
7410                 }
7411         } else {
7412                 pte = 0;
7413                 pa = 0;
7414                 val = 0;
7415         }
7416         if ((pte & PG_V) != 0) {
7417                 val |= MINCORE_INCORE;
7418                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7419                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7420                 if ((pte & PG_A) != 0)
7421                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7422         }
7423         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7424             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
7425             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
7426                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
7427                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
7428                         goto retry;
7429         } else
7430                 PA_UNLOCK_COND(*locked_pa);
7431         PMAP_UNLOCK(pmap);
7432         return (val);
7433 }
7434
7435 static uint64_t
7436 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
7437 {
7438         uint32_t gen, new_gen, pcid_next;
7439
7440         CRITICAL_ASSERT(curthread);
7441         gen = PCPU_GET(pcid_gen);
7442         if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
7443                 return (pti ? 0 : CR3_PCID_SAVE);
7444         if (pmap->pm_pcids[cpuid].pm_gen == gen)
7445                 return (CR3_PCID_SAVE);
7446         pcid_next = PCPU_GET(pcid_next);
7447         KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
7448             (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
7449             ("cpu %d pcid_next %#x", cpuid, pcid_next));
7450         if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
7451             (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
7452                 new_gen = gen + 1;
7453                 if (new_gen == 0)
7454                         new_gen = 1;
7455                 PCPU_SET(pcid_gen, new_gen);
7456                 pcid_next = PMAP_PCID_KERN + 1;
7457         } else {
7458                 new_gen = gen;
7459         }
7460         pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
7461         pmap->pm_pcids[cpuid].pm_gen = new_gen;
7462         PCPU_SET(pcid_next, pcid_next + 1);
7463         return (0);
7464 }
7465
7466 static uint64_t
7467 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
7468 {
7469         uint64_t cached;
7470
7471         cached = pmap_pcid_alloc(pmap, cpuid);
7472         KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
7473             ("pmap %p cpu %d pcid %#x", pmap, cpuid,
7474             pmap->pm_pcids[cpuid].pm_pcid));
7475         KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
7476             pmap == kernel_pmap,
7477             ("non-kernel pmap pmap %p cpu %d pcid %#x",
7478             pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
7479         return (cached);
7480 }
7481
7482 static void
7483 pmap_activate_sw_pti_post(pmap_t pmap)
7484 {
7485
7486         if (pmap->pm_ucr3 != PMAP_NO_CR3)
7487                 PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
7488                     PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
7489 }
7490
/*
 * Common body of the context-switch activation routines used with both PCID
 * and PTI.  It obtains a PCID for "pmap" on this CPU, loads %cr3 when the
 * current page table root differs from pmap->pm_cr3, records "pmap" as the
 * per-CPU curpmap, and stores the kernel and user %cr3 values, with
 * CR3_PCID_SAVE set, in the per-CPU kcr3 and ucr3 fields.
 *
 * When the PCID was not reported as cached and the pmap has a user page
 * table, the translations tagged with the user PCID are flushed, either with
 * INVPCID ("invpcid_works1" true) or through pmap_pti_pcid_invalidate().
 */
7491 static void inline
7492 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
7493 {
7494         struct invpcid_descr d;
7495         uint64_t cached, cr3, kcr3, ucr3;
7496
7497         cached = pmap_pcid_alloc_checked(pmap, cpuid);
7498         cr3 = rcr3();
             /* Reload %cr3 only if the page table root actually changes. */
7499         if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7500                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
7501         PCPU_SET(curpmap, pmap);
             /* The user value uses the same PCID with PMAP_PCID_USER_PT set. */
7502         kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
7503         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
7504             PMAP_PCID_USER_PT;
7505
7506         if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
7507                 /*
7508                  * Explicitly invalidate translations cached from the
7509                  * user page table.  They are not automatically
7510                  * flushed by reload of cr3 with the kernel page table
7511                  * pointer above.
7512                  *
7513                  * Note that the if() condition is resolved statically
7514                  * by using the function argument instead of
7515                  * runtime-evaluated invpcid_works value.
7516                  */
7517                 if (invpcid_works1) {
7518                         d.pcid = PMAP_PCID_USER_PT |
7519                             pmap->pm_pcids[cpuid].pm_pcid;
7520                         d.pad = 0;
7521                         d.addr = 0;
7522                         invpcid(&d, INVPCID_CTX);
7523                 } else {
7524                         pmap_pti_pcid_invalidate(ucr3, kcr3);
7525                 }
7526         }
7527
7528         PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
7529         PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
             /* Count the activations for which the PCID was still cached. */
7530         if (cached)
7531                 PCPU_INC(pm_save_cnt);
7532 }
7533
7534 static void
7535 pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid)
7536 {
7537
7538         pmap_activate_sw_pcid_pti(pmap, cpuid, true);
7539         pmap_activate_sw_pti_post(pmap);
7540 }
7541
7542 static void
7543 pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid)
7544 {
7545         register_t rflags;
7546
7547         /*
7548          * If the INVPCID instruction is not available,
7549          * invltlb_pcid_handler() is used to handle an invalidate_all
7550          * IPI, which checks for curpmap == smp_tlb_pmap.  The below
7551          * sequence of operations has a window where %CR3 is loaded
7552          * with the new pmap's PML4 address, but the curpmap value has
7553          * not yet been updated.  This causes the invltlb IPI handler,
7554          * which is called between the updates, to execute as a NOP,
7555          * which leaves stale TLB entries.
7556          *
7557          * Note that the most typical use of pmap_activate_sw(), from
7558          * the context switch, is immune to this race, because
7559          * interrupts are disabled (while the thread lock is owned),
7560          * and the IPI happens after curpmap is updated.  Protect
7561          * other callers in a similar way, by disabling interrupts
7562          * around the %cr3 register reload and curpmap assignment.
7563          */
7564         rflags = intr_disable();
7565         pmap_activate_sw_pcid_pti(pmap, cpuid, false);
7566         intr_restore(rflags);
7567         pmap_activate_sw_pti_post(pmap);
7568 }
7569
7570 static void
7571 pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid)
7572 {
7573         uint64_t cached, cr3;
7574
7575         cached = pmap_pcid_alloc_checked(pmap, cpuid);
7576         cr3 = rcr3();
7577         if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7578                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
7579                     cached);
7580         PCPU_SET(curpmap, pmap);
7581         if (cached)
7582                 PCPU_INC(pm_save_cnt);
7583 }
7584
7585 static void
7586 pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid)
7587 {
7588         register_t rflags;
7589
7590         rflags = intr_disable();
7591         pmap_activate_sw_pcid_nopti(pmap, cpuid);
7592         intr_restore(rflags);
7593 }
7594
7595 static void
7596 pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused)
7597 {
7598
7599         load_cr3(pmap->pm_cr3);
7600         PCPU_SET(curpmap, pmap);
7601 }
7602
7603 static void
7604 pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused)
7605 {
7606
7607         pmap_activate_sw_nopcid_nopti(pmap, cpuid);
7608         PCPU_SET(kcr3, pmap->pm_cr3);
7609         PCPU_SET(ucr3, pmap->pm_ucr3);
7610         pmap_activate_sw_pti_post(pmap);
7611 }
7612
7613 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static)
7614 {
7615
7616         if (pmap_pcid_enabled && pti && invpcid_works)
7617                 return (pmap_activate_sw_pcid_invpcid_pti);
7618         else if (pmap_pcid_enabled && pti && !invpcid_works)
7619                 return (pmap_activate_sw_pcid_noinvpcid_pti);
7620         else if (pmap_pcid_enabled && !pti && invpcid_works)
7621                 return (pmap_activate_sw_pcid_nopti);
7622         else if (pmap_pcid_enabled && !pti && !invpcid_works)
7623                 return (pmap_activate_sw_pcid_noinvpcid_nopti);
7624         else if (!pmap_pcid_enabled && pti)
7625                 return (pmap_activate_sw_nopcid_pti);
7626         else /* if (!pmap_pcid_enabled && !pti) */
7627                 return (pmap_activate_sw_nopcid_nopti);
7628 }
7629
7630 void
7631 pmap_activate_sw(struct thread *td)
7632 {
7633         pmap_t oldpmap, pmap;
7634         u_int cpuid;
7635
7636         oldpmap = PCPU_GET(curpmap);
7637         pmap = vmspace_pmap(td->td_proc->p_vmspace);
7638         if (oldpmap == pmap)
7639                 return;
7640         cpuid = PCPU_GET(cpuid);
7641 #ifdef SMP
7642         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7643 #else
7644         CPU_SET(cpuid, &pmap->pm_active);
7645 #endif
7646         pmap_activate_sw_mode(pmap, cpuid);
7647 #ifdef SMP
7648         CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
7649 #else
7650         CPU_CLR(cpuid, &oldpmap->pm_active);
7651 #endif
7652 }
7653
/*
 * Activate the pmap of td's process on the current CPU.  This is
 * pmap_activate_sw() bracketed by a critical section.
 */
void
pmap_activate(struct thread *td)
{

	critical_enter();

	pmap_activate_sw(td);

	critical_exit();
}
7662
7663 void
7664 pmap_activate_boot(pmap_t pmap)
7665 {
7666         uint64_t kcr3;
7667         u_int cpuid;
7668
7669         /*
7670          * kernel_pmap must be never deactivated, and we ensure that
7671          * by never activating it at all.
7672          */
7673         MPASS(pmap != kernel_pmap);
7674
7675         cpuid = PCPU_GET(cpuid);
7676 #ifdef SMP
7677         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7678 #else
7679         CPU_SET(cpuid, &pmap->pm_active);
7680 #endif
7681         PCPU_SET(curpmap, pmap);
7682         if (pti) {
7683                 kcr3 = pmap->pm_cr3;
7684                 if (pmap_pcid_enabled)
7685                         kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
7686         } else {
7687                 kcr3 = PMAP_NO_CR3;
7688         }
7689         PCPU_SET(kcr3, kcr3);
7690         PCPU_SET(ucr3, PMAP_NO_CR3);
7691 }
7692
/*
 * Instruction cache synchronization hook.  The body is empty here: the
 * function performs no operation and none of its arguments (pm, va, sz)
 * are used.
 */
7693 void
7694 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
7695 {
7696 }
7697
7698 /*
7699  *      Increase the starting virtual address of the given mapping if a
7700  *      different alignment might result in more superpage mappings.
7701  */
7702 void
7703 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7704     vm_offset_t *addr, vm_size_t size)
7705 {
7706         vm_offset_t superpage_offset;
7707
7708         if (size < NBPDR)
7709                 return;
7710         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7711                 offset += ptoa(object->pg_color);
7712         superpage_offset = offset & PDRMASK;
7713         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
7714             (*addr & PDRMASK) == superpage_offset)
7715                 return;
7716         if ((*addr & PDRMASK) < superpage_offset)
7717                 *addr = (*addr & ~PDRMASK) + superpage_offset;
7718         else
7719                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
7720 }
7721
#ifdef INVARIANTS
/*
 * Event counters for pmap_emulate_accessed_dirty(), exported read/write
 * under the vm.pmap sysctl node.  They exist only in INVARIANTS kernels.
 */

/* Successful emulations of a write fault on a 4KB mapping. */
static unsigned long num_dirty_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
    &num_dirty_emulations, 0, NULL);

/* Successful emulations of a read fault on a 4KB mapping. */
static unsigned long num_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
    &num_accessed_emulations, 0, NULL);

/* Emulations of a read fault on a superpage mapping. */
static unsigned long num_superpage_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations,
    CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL);

/* Calls to pmap_promote_pde() made from the emulation path. */
static unsigned long ad_emulation_superpage_promotions;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions,
    CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL);
#endif	/* INVARIANTS */
7739
7740 int
7741 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
7742 {
7743         int rv;
7744         struct rwlock *lock;
7745 #if VM_NRESERVLEVEL > 0
7746         vm_page_t m, mpte;
7747 #endif
7748         pd_entry_t *pde;
7749         pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
7750
7751         KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
7752             ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
7753
7754         if (!pmap_emulate_ad_bits(pmap))
7755                 return (-1);
7756
7757         PG_A = pmap_accessed_bit(pmap);
7758         PG_M = pmap_modified_bit(pmap);
7759         PG_V = pmap_valid_bit(pmap);
7760         PG_RW = pmap_rw_bit(pmap);
7761
7762         rv = -1;
7763         lock = NULL;
7764         PMAP_LOCK(pmap);
7765
7766         pde = pmap_pde(pmap, va);
7767         if (pde == NULL || (*pde & PG_V) == 0)
7768                 goto done;
7769
7770         if ((*pde & PG_PS) != 0) {
7771                 if (ftype == VM_PROT_READ) {
7772 #ifdef INVARIANTS
7773                         atomic_add_long(&num_superpage_accessed_emulations, 1);
7774 #endif
7775                         *pde |= PG_A;
7776                         rv = 0;
7777                 }
7778                 goto done;
7779         }
7780
7781         pte = pmap_pde_to_pte(pde, va);
7782         if ((*pte & PG_V) == 0)
7783                 goto done;
7784
7785         if (ftype == VM_PROT_WRITE) {
7786                 if ((*pte & PG_RW) == 0)
7787                         goto done;
7788                 /*
7789                  * Set the modified and accessed bits simultaneously.
7790                  *
7791                  * Intel EPT PTEs that do software emulation of A/D bits map
7792                  * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
7793                  * An EPT misconfiguration is triggered if the PTE is writable
7794                  * but not readable (WR=10). This is avoided by setting PG_A
7795                  * and PG_M simultaneously.
7796                  */
7797                 *pte |= PG_M | PG_A;
7798         } else {
7799                 *pte |= PG_A;
7800         }
7801
7802 #if VM_NRESERVLEVEL > 0
7803         /* try to promote the mapping */
7804         if (va < VM_MAXUSER_ADDRESS)
7805                 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7806         else
7807                 mpte = NULL;
7808
7809         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
7810
7811         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
7812             pmap_ps_enabled(pmap) &&
7813             (m->flags & PG_FICTITIOUS) == 0 &&
7814             vm_reserv_level_iffullpop(m) == 0) {
7815                 pmap_promote_pde(pmap, pde, va, &lock);
7816 #ifdef INVARIANTS
7817                 atomic_add_long(&ad_emulation_superpage_promotions, 1);
7818 #endif
7819         }
7820 #endif
7821
7822 #ifdef INVARIANTS
7823         if (ftype == VM_PROT_WRITE)
7824                 atomic_add_long(&num_dirty_emulations, 1);
7825         else
7826                 atomic_add_long(&num_accessed_emulations, 1);
7827 #endif
7828         rv = 0;         /* success */
7829 done:
7830         if (lock != NULL)
7831                 rw_wunlock(lock);
7832         PMAP_UNLOCK(pmap);
7833         return (rv);
7834 }
7835
7836 void
7837 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
7838 {
7839         pml4_entry_t *pml4;
7840         pdp_entry_t *pdp;
7841         pd_entry_t *pde;
7842         pt_entry_t *pte, PG_V;
7843         int idx;
7844
7845         idx = 0;
7846         PG_V = pmap_valid_bit(pmap);
7847         PMAP_LOCK(pmap);
7848
7849         pml4 = pmap_pml4e(pmap, va);
7850         ptr[idx++] = *pml4;
7851         if ((*pml4 & PG_V) == 0)
7852                 goto done;
7853
7854         pdp = pmap_pml4e_to_pdpe(pml4, va);
7855         ptr[idx++] = *pdp;
7856         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
7857                 goto done;
7858
7859         pde = pmap_pdpe_to_pde(pdp, va);
7860         ptr[idx++] = *pde;
7861         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
7862                 goto done;
7863
7864         pte = pmap_pde_to_pte(pde, va);
7865         ptr[idx++] = *pte;
7866
7867 done:
7868         PMAP_UNLOCK(pmap);
7869         *num = idx;
7870 }
7871
7872 /**
7873  * Get the kernel virtual address of a set of physical pages. If there are
7874  * physical addresses not covered by the DMAP perform a transient mapping
7875  * that will be removed when calling pmap_unmap_io_transient.
7876  *
7877  * \param page        The pages the caller wishes to obtain the virtual
7878  *                    address on the kernel memory map.
7879  * \param vaddr       On return contains the kernel virtual memory address
7880  *                    of the pages passed in the page parameter.
7881  * \param count       Number of pages passed in.
7882  * \param can_fault   TRUE if the thread using the mapped pages can take
7883  *                    page faults, FALSE otherwise.
7884  *
7885  * \returns TRUE if the caller must call pmap_unmap_io_transient when
7886  *          finished or FALSE otherwise.
7887  *
7888  */
7889 boolean_t
7890 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7891     boolean_t can_fault)
7892 {
7893         vm_paddr_t paddr;
7894         boolean_t needs_mapping;
7895         pt_entry_t *pte;
7896         int cache_bits, error __unused, i;
7897
7898         /*
7899          * Allocate any KVA space that we need, this is done in a separate
7900          * loop to prevent calling vmem_alloc while pinned.
7901          */
7902         needs_mapping = FALSE;
7903         for (i = 0; i < count; i++) {
7904                 paddr = VM_PAGE_TO_PHYS(page[i]);
7905                 if (__predict_false(paddr >= dmaplimit)) {
7906                         error = vmem_alloc(kernel_arena, PAGE_SIZE,
7907                             M_BESTFIT | M_WAITOK, &vaddr[i]);
7908                         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7909                         needs_mapping = TRUE;
7910                 } else {
7911                         vaddr[i] = PHYS_TO_DMAP(paddr);
7912                 }
7913         }
7914
7915         /* Exit early if everything is covered by the DMAP */
7916         if (!needs_mapping)
7917                 return (FALSE);
7918
7919         /*
7920          * NB:  The sequence of updating a page table followed by accesses
7921          * to the corresponding pages used in the !DMAP case is subject to
7922          * the situation described in the "AMD64 Architecture Programmer's
7923          * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
7924          * Coherency Considerations".  Therefore, issuing the INVLPG right
7925          * after modifying the PTE bits is crucial.
7926          */
7927         if (!can_fault)
7928                 sched_pin();
7929         for (i = 0; i < count; i++) {
7930                 paddr = VM_PAGE_TO_PHYS(page[i]);
7931                 if (paddr >= dmaplimit) {
7932                         if (can_fault) {
7933                                 /*
7934                                  * Slow path, since we can get page faults
7935                                  * while mappings are active don't pin the
7936                                  * thread to the CPU and instead add a global
7937                                  * mapping visible to all CPUs.
7938                                  */
7939                                 pmap_qenter(vaddr[i], &page[i], 1);
7940                         } else {
7941                                 pte = vtopte(vaddr[i]);
7942                                 cache_bits = pmap_cache_bits(kernel_pmap,
7943                                     page[i]->md.pat_mode, 0);
7944                                 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
7945                                     cache_bits);
7946                                 invlpg(vaddr[i]);
7947                         }
7948                 }
7949         }
7950
7951         return (needs_mapping);
7952 }
7953
7954 void
7955 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7956     boolean_t can_fault)
7957 {
7958         vm_paddr_t paddr;
7959         int i;
7960
7961         if (!can_fault)
7962                 sched_unpin();
7963         for (i = 0; i < count; i++) {
7964                 paddr = VM_PAGE_TO_PHYS(page[i]);
7965                 if (paddr >= dmaplimit) {
7966                         if (can_fault)
7967                                 pmap_qremove(vaddr[i], 1);
7968                         vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
7969                 }
7970         }
7971 }
7972
7973 vm_offset_t
7974 pmap_quick_enter_page(vm_page_t m)
7975 {
7976         vm_paddr_t paddr;
7977
7978         paddr = VM_PAGE_TO_PHYS(m);
7979         if (paddr < dmaplimit)
7980                 return (PHYS_TO_DMAP(paddr));
7981         mtx_lock_spin(&qframe_mtx);
7982         KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
7983         pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
7984             X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
7985         return (qframe);
7986 }
7987
7988 void
7989 pmap_quick_remove_page(vm_offset_t addr)
7990 {
7991
7992         if (addr != qframe)
7993                 return;
7994         pte_store(vtopte(qframe), 0);
7995         invlpg(qframe);
7996         mtx_unlock_spin(&qframe_mtx);
7997 }
7998
7999 static vm_page_t
8000 pmap_pti_alloc_page(void)
8001 {
8002         vm_page_t m;
8003
8004         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8005         m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
8006             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
8007         return (m);
8008 }
8009
8010 static bool
8011 pmap_pti_free_page(vm_page_t m)
8012 {
8013
8014         KASSERT(m->wire_count > 0, ("page %p not wired", m));
8015         if (!vm_page_unwire_noq(m))
8016                 return (false);
8017         vm_page_free_zero(m);
8018         return (true);
8019 }
8020
8021 static void
8022 pmap_pti_init(void)
8023 {
8024         vm_page_t pml4_pg;
8025         pdp_entry_t *pdpe;
8026         vm_offset_t va;
8027         int i;
8028
8029         if (!pti)
8030                 return;
8031         pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
8032         VM_OBJECT_WLOCK(pti_obj);
8033         pml4_pg = pmap_pti_alloc_page();
8034         pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
8035         for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
8036             va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
8037                 pdpe = pmap_pti_pdpe(va);
8038                 pmap_pti_wire_pte(pdpe);
8039         }
8040         pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
8041             (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
8042         pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
8043             sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
8044         pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
8045             sizeof(struct gate_descriptor) * NIDT, false);
8046         pmap_pti_add_kva_locked((vm_offset_t)common_tss,
8047             (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
8048         CPU_FOREACH(i) {
8049                 /* Doublefault stack IST 1 */
8050                 va = common_tss[i].tss_ist1;
8051                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8052                 /* NMI stack IST 2 */
8053                 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
8054                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8055                 /* MC# stack IST 3 */
8056                 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
8057                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8058                 /* DB# stack IST 4 */
8059                 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
8060                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8061         }
8062         pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
8063             (vm_offset_t)etext, true);
8064         pti_finalized = true;
8065         VM_OBJECT_WUNLOCK(pti_obj);
8066 }
8067 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
8068
8069 static pdp_entry_t *
8070 pmap_pti_pdpe(vm_offset_t va)
8071 {
8072         pml4_entry_t *pml4e;
8073         pdp_entry_t *pdpe;
8074         vm_page_t m;
8075         vm_pindex_t pml4_idx;
8076         vm_paddr_t mphys;
8077
8078         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8079
8080         pml4_idx = pmap_pml4e_index(va);
8081         pml4e = &pti_pml4[pml4_idx];
8082         m = NULL;
8083         if (*pml4e == 0) {
8084                 if (pti_finalized)
8085                         panic("pml4 alloc after finalization\n");
8086                 m = pmap_pti_alloc_page();
8087                 if (*pml4e != 0) {
8088                         pmap_pti_free_page(m);
8089                         mphys = *pml4e & ~PAGE_MASK;
8090                 } else {
8091                         mphys = VM_PAGE_TO_PHYS(m);
8092                         *pml4e = mphys | X86_PG_RW | X86_PG_V;
8093                 }
8094         } else {
8095                 mphys = *pml4e & ~PAGE_MASK;
8096         }
8097         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
8098         return (pdpe);
8099 }
8100
8101 static void
8102 pmap_pti_wire_pte(void *pte)
8103 {
8104         vm_page_t m;
8105
8106         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8107         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8108         m->wire_count++;
8109 }
8110
8111 static void
8112 pmap_pti_unwire_pde(void *pde, bool only_ref)
8113 {
8114         vm_page_t m;
8115
8116         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8117         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
8118         MPASS(m->wire_count > 0);
8119         MPASS(only_ref || m->wire_count > 1);
8120         pmap_pti_free_page(m);
8121 }
8122
8123 static void
8124 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
8125 {
8126         vm_page_t m;
8127         pd_entry_t *pde;
8128
8129         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8130         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8131         MPASS(m->wire_count > 0);
8132         if (pmap_pti_free_page(m)) {
8133                 pde = pmap_pti_pde(va);
8134                 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
8135                 *pde = 0;
8136                 pmap_pti_unwire_pde(pde, false);
8137         }
8138 }
8139
8140 static pd_entry_t *
8141 pmap_pti_pde(vm_offset_t va)
8142 {
8143         pdp_entry_t *pdpe;
8144         pd_entry_t *pde;
8145         vm_page_t m;
8146         vm_pindex_t pd_idx;
8147         vm_paddr_t mphys;
8148
8149         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8150
8151         pdpe = pmap_pti_pdpe(va);
8152         if (*pdpe == 0) {
8153                 m = pmap_pti_alloc_page();
8154                 if (*pdpe != 0) {
8155                         pmap_pti_free_page(m);
8156                         MPASS((*pdpe & X86_PG_PS) == 0);
8157                         mphys = *pdpe & ~PAGE_MASK;
8158                 } else {
8159                         mphys =  VM_PAGE_TO_PHYS(m);
8160                         *pdpe = mphys | X86_PG_RW | X86_PG_V;
8161                 }
8162         } else {
8163                 MPASS((*pdpe & X86_PG_PS) == 0);
8164                 mphys = *pdpe & ~PAGE_MASK;
8165         }
8166
8167         pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
8168         pd_idx = pmap_pde_index(va);
8169         pde += pd_idx;
8170         return (pde);
8171 }
8172
8173 static pt_entry_t *
8174 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
8175 {
8176         pd_entry_t *pde;
8177         pt_entry_t *pte;
8178         vm_page_t m;
8179         vm_paddr_t mphys;
8180
8181         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8182
8183         pde = pmap_pti_pde(va);
8184         if (unwire_pde != NULL) {
8185                 *unwire_pde = true;
8186                 pmap_pti_wire_pte(pde);
8187         }
8188         if (*pde == 0) {
8189                 m = pmap_pti_alloc_page();
8190                 if (*pde != 0) {
8191                         pmap_pti_free_page(m);
8192                         MPASS((*pde & X86_PG_PS) == 0);
8193                         mphys = *pde & ~(PAGE_MASK | pg_nx);
8194                 } else {
8195                         mphys = VM_PAGE_TO_PHYS(m);
8196                         *pde = mphys | X86_PG_RW | X86_PG_V;
8197                         if (unwire_pde != NULL)
8198                                 *unwire_pde = false;
8199                 }
8200         } else {
8201                 MPASS((*pde & X86_PG_PS) == 0);
8202                 mphys = *pde & ~(PAGE_MASK | pg_nx);
8203         }
8204
8205         pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
8206         pte += pmap_pte_index(va);
8207
8208         return (pte);
8209 }
8210
8211 static void
8212 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
8213 {
8214         vm_paddr_t pa;
8215         pd_entry_t *pde;
8216         pt_entry_t *pte, ptev;
8217         bool unwire_pde;
8218
8219         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8220
8221         sva = trunc_page(sva);
8222         MPASS(sva > VM_MAXUSER_ADDRESS);
8223         eva = round_page(eva);
8224         MPASS(sva < eva);
8225         for (; sva < eva; sva += PAGE_SIZE) {
8226                 pte = pmap_pti_pte(sva, &unwire_pde);
8227                 pa = pmap_kextract(sva);
8228                 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
8229                     (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
8230                     VM_MEMATTR_DEFAULT, FALSE);
8231                 if (*pte == 0) {
8232                         pte_store(pte, ptev);
8233                         pmap_pti_wire_pte(pte);
8234                 } else {
8235                         KASSERT(!pti_finalized,
8236                             ("pti overlap after fin %#lx %#lx %#lx",
8237                             sva, *pte, ptev));
8238                         KASSERT(*pte == ptev,
8239                             ("pti non-identical pte after fin %#lx %#lx %#lx",
8240                             sva, *pte, ptev));
8241                 }
8242                 if (unwire_pde) {
8243                         pde = pmap_pti_pde(sva);
8244                         pmap_pti_unwire_pde(pde, true);
8245                 }
8246         }
8247 }
8248
8249 void
8250 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
8251 {
8252
8253         if (!pti)
8254                 return;
8255         VM_OBJECT_WLOCK(pti_obj);
8256         pmap_pti_add_kva_locked(sva, eva, exec);
8257         VM_OBJECT_WUNLOCK(pti_obj);
8258 }
8259
8260 void
8261 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
8262 {
8263         pt_entry_t *pte;
8264         vm_offset_t va;
8265
8266         if (!pti)
8267                 return;
8268         sva = rounddown2(sva, PAGE_SIZE);
8269         MPASS(sva > VM_MAXUSER_ADDRESS);
8270         eva = roundup2(eva, PAGE_SIZE);
8271         MPASS(sva < eva);
8272         VM_OBJECT_WLOCK(pti_obj);
8273         for (va = sva; va < eva; va += PAGE_SIZE) {
8274                 pte = pmap_pti_pte(va, NULL);
8275                 KASSERT((*pte & X86_PG_V) != 0,
8276                     ("invalid pte va %#lx pte %#lx pt %#lx", va,
8277                     (u_long)pte, *pte));
8278                 pte_clear(pte);
8279                 pmap_pti_unwire_pte(pte, va);
8280         }
8281         pmap_invalidate_range(kernel_pmap, sva, eva);
8282         VM_OBJECT_WUNLOCK(pti_obj);
8283 }
8284
8285 #include "opt_ddb.h"
8286 #ifdef DDB
8287 #include <sys/kdb.h>
8288 #include <ddb/ddb.h>
8289
8290 DB_SHOW_COMMAND(pte, pmap_print_pte)
8291 {
8292         pmap_t pmap;
8293         pml4_entry_t *pml4;
8294         pdp_entry_t *pdp;
8295         pd_entry_t *pde;
8296         pt_entry_t *pte, PG_V;
8297         vm_offset_t va;
8298
8299         if (!have_addr) {
8300                 db_printf("show pte addr\n");
8301                 return;
8302         }
8303         va = (vm_offset_t)addr;
8304
8305         if (kdb_thread != NULL)
8306                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
8307         else
8308                 pmap = PCPU_GET(curpmap);
8309
8310         PG_V = pmap_valid_bit(pmap);
8311         pml4 = pmap_pml4e(pmap, va);
8312         db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
8313         if ((*pml4 & PG_V) == 0) {
8314                 db_printf("\n");
8315                 return;
8316         }
8317         pdp = pmap_pml4e_to_pdpe(pml4, va);
8318         db_printf(" pdpe %#016lx", *pdp);
8319         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
8320                 db_printf("\n");
8321                 return;
8322         }
8323         pde = pmap_pdpe_to_pde(pdp, va);
8324         db_printf(" pde %#016lx", *pde);
8325         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
8326                 db_printf("\n");
8327                 return;
8328         }
8329         pte = pmap_pde_to_pte(pde, va);
8330         db_printf(" pte %#016lx\n", *pte);
8331 }
8332
8333 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
8334 {
8335         vm_paddr_t a;
8336
8337         if (have_addr) {
8338                 a = (vm_paddr_t)addr;
8339                 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
8340         } else {
8341                 db_printf("show phys2dmap addr\n");
8342         }
8343 }
8344 #endif