sys/amd64/amd64/pmap.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-4-Clause
   3  *
   4  * Copyright (c) 1991 Regents of the University of California.
   5  * All rights reserved.
   6  * Copyright (c) 1994 John S. Dyson
   7  * All rights reserved.
   8  * Copyright (c) 1994 David Greenman
   9  * All rights reserved.
  10  * Copyright (c) 2003 Peter Wemm
  11  * All rights reserved.
  12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  13  * All rights reserved.
  14  *
  15  * This code is derived from software contributed to Berkeley by
  16  * the Systems Programming Group of the University of Utah Computer
  17  * Science Department and William Jolitz of UUNET Technologies Inc.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  * 3. All advertising materials mentioning features or use of this software
  28  *    must display the following acknowledgement:
  29  *      This product includes software developed by the University of
  30  *      California, Berkeley and its contributors.
  31  * 4. Neither the name of the University nor the names of its contributors
  32  *    may be used to endorse or promote products derived from this software
  33  *    without specific prior written permission.
  34  *
  35  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  36  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  38  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  39  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  40  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  41  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  42  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  43  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  44  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  45  * SUCH DAMAGE.
  46  *
  47  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  48  */
  49 /*-
  50  * Copyright (c) 2003 Networks Associates Technology, Inc.
  51  * Copyright (c) 2014-2018 The FreeBSD Foundation
  52  * All rights reserved.
  53  *
  54  * This software was developed for the FreeBSD Project by Jake Burkholder,
  55  * Safeport Network Services, and Network Associates Laboratories, the
  56  * Security Research Division of Network Associates, Inc. under
  57  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  58  * CHATS research program.
  59  *
  60  * Portions of this software were developed by
  61  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  62  * the FreeBSD Foundation.
  63  *
  64  * Redistribution and use in source and binary forms, with or without
  65  * modification, are permitted provided that the following conditions
  66  * are met:
  67  * 1. Redistributions of source code must retain the above copyright
  68  *    notice, this list of conditions and the following disclaimer.
  69  * 2. Redistributions in binary form must reproduce the above copyright
  70  *    notice, this list of conditions and the following disclaimer in the
  71  *    documentation and/or other materials provided with the distribution.
  72  *
  73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  83  * SUCH DAMAGE.
  84  */
  85
  86 #define AMD64_NPT_AWARE
  87
  88 #include <sys/cdefs.h>
  89 __FBSDID("$FreeBSD$");
  90
/*
 *      Manages physical address maps.
 *
 *      Since the information managed by this module is
 *      also stored by the logical address mapping module,
 *      this module may throw away valid virtual-to-physical
 *      mappings at almost any time.  However, invalidations
 *      of virtual-to-physical mappings must be done as
 *      requested.
 *
 *      In order to cope with hardware architectures which
 *      make virtual-to-physical map invalidations expensive,
 *      this module may delay invalidation or protection-reduction
 *      operations until such time as they are actually
 *      necessary.  This module is given full information as
 *      to which processors are currently using which maps,
 *      and as to when physical maps must be made correct.
 */
 109
 110 #include "opt_pmap.h"
 111 #include "opt_vm.h"
 112
 113 #include <sys/param.h>
 114 #include <sys/bitstring.h>
 115 #include <sys/bus.h>
 116 #include <sys/systm.h>
 117 #include <sys/kernel.h>
 118 #include <sys/ktr.h>
 119 #include <sys/lock.h>
 120 #include <sys/malloc.h>
 121 #include <sys/mman.h>
 122 #include <sys/mutex.h>
 123 #include <sys/proc.h>
 124 #include <sys/rwlock.h>
 125 #include <sys/sx.h>
 126 #include <sys/turnstile.h>
 127 #include <sys/vmem.h>
 128 #include <sys/vmmeter.h>
 129 #include <sys/sched.h>
 130 #include <sys/sysctl.h>
 131 #include <sys/smp.h>
 132
 133 #include <vm/vm.h>
 134 #include <vm/vm_param.h>
 135 #include <vm/vm_kern.h>
 136 #include <vm/vm_page.h>
 137 #include <vm/vm_map.h>
 138 #include <vm/vm_object.h>
 139 #include <vm/vm_extern.h>
 140 #include <vm/vm_pageout.h>
 141 #include <vm/vm_pager.h>
 142 #include <vm/vm_phys.h>
 143 #include <vm/vm_radix.h>
 144 #include <vm/vm_reserv.h>
 145 #include <vm/uma.h>
 146
 147 #include <machine/intr_machdep.h>
 148 #include <x86/apicvar.h>
 149 #include <x86/ifunc.h>
 150 #include <machine/cpu.h>
 151 #include <machine/cputypes.h>
 152 #include <machine/md_var.h>
 153 #include <machine/pcb.h>
 154 #include <machine/specialreg.h>
 155 #ifdef SMP
 156 #include <machine/smp.h>
 157 #endif
 158 #include <machine/tss.h>
 159
 160 static __inline boolean_t
 161 pmap_type_guest(pmap_t pmap)
 162 {
 163
 164         return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI));
 165 }
 166
 167 static __inline boolean_t
 168 pmap_emulate_ad_bits(pmap_t pmap)
 169 {
 170
 171         return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0);
 172 }
 173
 174 static __inline pt_entry_t
 175 pmap_valid_bit(pmap_t pmap)
 176 {
 177         pt_entry_t mask;
 178
 179         switch (pmap->pm_type) {
 180         case PT_X86:
 181         case PT_RVI:
 182                 mask = X86_PG_V;
 183                 break;
 184         case PT_EPT:
 185                 if (pmap_emulate_ad_bits(pmap))
 186                         mask = EPT_PG_EMUL_V;
 187                 else
 188                         mask = EPT_PG_READ;
 189                 break;
 190         default:
 191                 panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
 192         }
 193
 194         return (mask);
 195 }
 196
 197 static __inline pt_entry_t
 198 pmap_rw_bit(pmap_t pmap)
 199 {
 200         pt_entry_t mask;
 201
 202         switch (pmap->pm_type) {
 203         case PT_X86:
 204         case PT_RVI:
 205                 mask = X86_PG_RW;
 206                 break;
 207         case PT_EPT:
 208                 if (pmap_emulate_ad_bits(pmap))
 209                         mask = EPT_PG_EMUL_RW;
 210                 else
 211                         mask = EPT_PG_WRITE;
 212                 break;
 213         default:
 214                 panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
 215         }
 216
 217         return (mask);
 218 }
 219
 220 static pt_entry_t pg_g;
 221
 222 static __inline pt_entry_t
 223 pmap_global_bit(pmap_t pmap)
 224 {
 225         pt_entry_t mask;
 226
 227         switch (pmap->pm_type) {
 228         case PT_X86:
 229                 mask = pg_g;
 230                 break;
 231         case PT_RVI:
 232         case PT_EPT:
 233                 mask = 0;
 234                 break;
 235         default:
 236                 panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
 237         }
 238
 239         return (mask);
 240 }
 241
 242 static __inline pt_entry_t
 243 pmap_accessed_bit(pmap_t pmap)
 244 {
 245         pt_entry_t mask;
 246
 247         switch (pmap->pm_type) {
 248         case PT_X86:
 249         case PT_RVI:
 250                 mask = X86_PG_A;
 251                 break;
 252         case PT_EPT:
 253                 if (pmap_emulate_ad_bits(pmap))
 254                         mask = EPT_PG_READ;
 255                 else
 256                         mask = EPT_PG_A;
 257                 break;
 258         default:
 259                 panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
 260         }
 261
 262         return (mask);
 263 }
 264
 265 static __inline pt_entry_t
 266 pmap_modified_bit(pmap_t pmap)
 267 {
 268         pt_entry_t mask;
 269
 270         switch (pmap->pm_type) {
 271         case PT_X86:
 272         case PT_RVI:
 273                 mask = X86_PG_M;
 274                 break;
 275         case PT_EPT:
 276                 if (pmap_emulate_ad_bits(pmap))
 277                         mask = EPT_PG_WRITE;
 278                 else
 279                         mask = EPT_PG_M;
 280                 break;
 281         default:
 282                 panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
 283         }
 284
 285         return (mask);
 286 }
 287
 288 #if !defined(DIAGNOSTIC)
 289 #ifdef __GNUC_GNU_INLINE__
 290 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
 291 #else
 292 #define PMAP_INLINE     extern inline
 293 #endif
 294 #else
 295 #define PMAP_INLINE
 296 #endif
 297
 298 #ifdef PV_STATS
 299 #define PV_STAT(x)      do { x ; } while (0)
 300 #else
 301 #define PV_STAT(x)      do { } while (0)
 302 #endif
 303
 304 #define pa_index(pa)    ((pa) >> PDRSHIFT)
 305 #define pa_to_pvh(pa)   (&pv_table[pa_index(pa)])
 306
 307 #define NPV_LIST_LOCKS  MAXCPU
 308
 309 #define PHYS_TO_PV_LIST_LOCK(pa)        \
 310                         (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
 311
 312 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
 313         struct rwlock **_lockp = (lockp);               \
 314         struct rwlock *_new_lock;                       \
 315                                                         \
 316         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
 317         if (_new_lock != *_lockp) {                     \
 318                 if (*_lockp != NULL)                    \
 319                         rw_wunlock(*_lockp);            \
 320                 *_lockp = _new_lock;                    \
 321                 rw_wlock(*_lockp);                      \
 322         }                                               \
 323 } while (0)
 324
 325 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
 326                         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 327
 328 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
 329         struct rwlock **_lockp = (lockp);               \
 330                                                         \
 331         if (*_lockp != NULL) {                          \
 332                 rw_wunlock(*_lockp);                    \
 333                 *_lockp = NULL;                         \
 334         }                                               \
 335 } while (0)
 336
 337 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
 338                         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 339
 340 struct pmap kernel_pmap_store;
 341
 342 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
 343 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
 344
 345 int nkpt;
 346 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
 347     "Number of kernel page table pages allocated on bootup");
 348
 349 static int ndmpdp;
 350 vm_paddr_t dmaplimit;
 351 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 352 pt_entry_t pg_nx;
 353
 354 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 355
 356 static int pat_works = 1;
 357 SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1,
 358     "Is page attribute table fully functional?");
 359
 360 static int pg_ps_enabled = 1;
 361 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 362     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 363
 364 #define PAT_INDEX_SIZE  8
 365 static int pat_index[PAT_INDEX_SIZE];   /* cache mode to PAT index conversion */
 366
 367 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
 368 static u_int64_t        KPDphys;        /* phys addr of kernel level 2 */
 369 u_int64_t               KPDPphys;       /* phys addr of kernel level 3 */
 370 u_int64_t               KPML4phys;      /* phys addr of kernel level 4 */
 371
 372 static u_int64_t        DMPDphys;       /* phys addr of direct mapped level 2 */
 373 static u_int64_t        DMPDPphys;      /* phys addr of direct mapped level 3 */
 374 static int              ndmpdpphys;     /* number of DMPDPphys pages */
 375
 376 static vm_paddr_t       KERNend;        /* phys addr of end of bootstrap data */
 377
 378 /*
 379  * pmap_mapdev support pre initialization (i.e. console)
 380  */
 381 #define PMAP_PREINIT_MAPPING_COUNT      8
 382 static struct pmap_preinit_mapping {
 383         vm_paddr_t      pa;
 384         vm_offset_t     va;
 385         vm_size_t       sz;
 386         int             mode;
 387 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 388 static int pmap_initialized;
 389
 390 /*
 391  * Data for the pv entry allocation mechanism.
 392  * Updates to pv_invl_gen are protected by the pv_list_locks[]
 393  * elements, but reads are not.
 394  */
 395 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 396 static struct mtx __exclusive_cache_line pv_chunks_mutex;
 397 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
 398 static u_long pv_invl_gen[NPV_LIST_LOCKS];
 399 static struct md_page *pv_table;
 400 static struct md_page pv_dummy;
 401
 402 /*
 403  * All those kernel PT submaps that BSD is so fond of
 404  */
 405 pt_entry_t *CMAP1 = NULL;
 406 caddr_t CADDR1 = 0;
 407 static vm_offset_t qframe = 0;
 408 static struct mtx qframe_mtx;
 409
 410 static int pmap_flags = PMAP_PDE_SUPERPAGE;     /* flags for x86 pmaps */
 411
 412 int pmap_pcid_enabled = 1;
 413 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 414     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
 415 int invpcid_works = 0;
 416 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
 417     "Is the invpcid instruction available ?");
 418
 419 int __read_frequently pti = 0;
 420 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 421     &pti, 0,
 422     "Page Table Isolation enabled");
 423 static vm_object_t pti_obj;
 424 static pml4_entry_t *pti_pml4;
 425 static vm_pindex_t pti_pg_idx;
 426 static bool pti_finalized;
 427
 428 static int
 429 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 430 {
 431         int i;
 432         uint64_t res;
 433
 434         res = 0;
 435         CPU_FOREACH(i) {
 436                 res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
 437         }
 438         return (sysctl_handle_64(oidp, &res, 0, req));
 439 }
 440 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RW |
 441     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
 442     "Count of saved TLB context on switch");
 443
 444 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
 445     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
 446 static struct mtx invl_gen_mtx;
 447 static u_long pmap_invl_gen = 0;
 448 /* Fake lock object to satisfy turnstiles interface. */
 449 static struct lock_object invl_gen_ts = {
 450         .lo_name = "invlts",
 451 };
 452
 453 static bool
 454 pmap_not_in_di(void)
 455 {
 456
 457         return (curthread->td_md.md_invl_gen.gen == 0);
 458 }
 459
 460 #define PMAP_ASSERT_NOT_IN_DI() \
 461     KASSERT(pmap_not_in_di(), ("DI already started"))
 462
 463 /*
 464  * Start a new Delayed Invalidation (DI) block of code, executed by
 465  * the current thread.  Within a DI block, the current thread may
 466  * destroy both the page table and PV list entries for a mapping and
 467  * then release the corresponding PV list lock before ensuring that
 468  * the mapping is flushed from the TLBs of any processors with the
 469  * pmap active.
 470  */
 471 static void
 472 pmap_delayed_invl_started(void)
 473 {
 474         struct pmap_invl_gen *invl_gen;
 475         u_long currgen;
 476
 477         invl_gen = &curthread->td_md.md_invl_gen;
 478         PMAP_ASSERT_NOT_IN_DI();
 479         mtx_lock(&invl_gen_mtx);
 480         if (LIST_EMPTY(&pmap_invl_gen_tracker))
 481                 currgen = pmap_invl_gen;
 482         else
 483                 currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
 484         invl_gen->gen = currgen + 1;
 485         LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
 486         mtx_unlock(&invl_gen_mtx);
 487 }
 488
 489 /*
 490  * Finish the DI block, previously started by the current thread.  All
 491  * required TLB flushes for the pages marked by
 492  * pmap_delayed_invl_page() must be finished before this function is
 493  * called.
 494  *
 495  * This function works by bumping the global DI generation number to
 496  * the generation number of the current thread's DI, unless there is a
 497  * pending DI that started earlier.  In the latter case, bumping the
 498  * global DI generation number would incorrectly signal that the
 499  * earlier DI had finished.  Instead, this function bumps the earlier
 500  * DI's generation number to match the generation number of the
 501  * current thread's DI.
 502  */
 503 static void
 504 pmap_delayed_invl_finished(void)
 505 {
 506         struct pmap_invl_gen *invl_gen, *next;
 507         struct turnstile *ts;
 508
 509         invl_gen = &curthread->td_md.md_invl_gen;
 510         KASSERT(invl_gen->gen != 0, ("missed invl_started"));
 511         mtx_lock(&invl_gen_mtx);
 512         next = LIST_NEXT(invl_gen, link);
 513         if (next == NULL) {
 514                 turnstile_chain_lock(&invl_gen_ts);
 515                 ts = turnstile_lookup(&invl_gen_ts);
 516                 pmap_invl_gen = invl_gen->gen;
 517                 if (ts != NULL) {
 518                         turnstile_broadcast(ts, TS_SHARED_QUEUE);
 519                         turnstile_unpend(ts);
 520                 }
 521                 turnstile_chain_unlock(&invl_gen_ts);
 522         } else {
 523                 next->gen = invl_gen->gen;
 524         }
 525         LIST_REMOVE(invl_gen, link);
 526         mtx_unlock(&invl_gen_mtx);
 527         invl_gen->gen = 0;
 528 }
 529
 530 #ifdef PV_STATS
 531 static long invl_wait;
 532 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
 533     "Number of times DI invalidation blocked pmap_remove_all/write");
 534 #endif
 535
 536 static u_long *
 537 pmap_delayed_invl_genp(vm_page_t m)
 538 {
 539
 540         return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
 541 }
 542
 543 /*
 544  * Ensure that all currently executing DI blocks, that need to flush
 545  * TLB for the given page m, actually flushed the TLB at the time the
 546  * function returned.  If the page m has an empty PV list and we call
 547  * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
 548  * valid mapping for the page m in either its page table or TLB.
 549  *
 550  * This function works by blocking until the global DI generation
 551  * number catches up with the generation number associated with the
 552  * given page m and its PV list.  Since this function's callers
 553  * typically own an object lock and sometimes own a page lock, it
 554  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
 555  * processor.
 556  */
 557 static void
 558 pmap_delayed_invl_wait(vm_page_t m)
 559 {
 560         struct turnstile *ts;
 561         u_long *m_gen;
 562 #ifdef PV_STATS
 563         bool accounted = false;
 564 #endif
 565
 566         m_gen = pmap_delayed_invl_genp(m);
 567         while (*m_gen > pmap_invl_gen) {
 568 #ifdef PV_STATS
 569                 if (!accounted) {
 570                         atomic_add_long(&invl_wait, 1);
 571                         accounted = true;
 572                 }
 573 #endif
 574                 ts = turnstile_trywait(&invl_gen_ts);
 575                 if (*m_gen > pmap_invl_gen)
 576                         turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
 577                 else
 578                         turnstile_cancel(ts);
 579         }
 580 }
 581
 582 /*
 583  * Mark the page m's PV list as participating in the current thread's
 584  * DI block.  Any threads concurrently using m's PV list to remove or
 585  * restrict all mappings to m will wait for the current thread's DI
 586  * block to complete before proceeding.
 587  *
 588  * The function works by setting the DI generation number for m's PV
 589  * list to at least the DI generation number of the current thread.
 590  * This forces a caller of pmap_delayed_invl_wait() to block until
 591  * current thread calls pmap_delayed_invl_finished().
 592  */
 593 static void
 594 pmap_delayed_invl_page(vm_page_t m)
 595 {
 596         u_long gen, *m_gen;
 597
 598         rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
 599         gen = curthread->td_md.md_invl_gen.gen;
 600         if (gen == 0)
 601                 return;
 602         m_gen = pmap_delayed_invl_genp(m);
 603         if (*m_gen < gen)
 604                 *m_gen = gen;
 605 }
 606
 607 /*
 608  * Crashdump maps.
 609  */
 610 static caddr_t crashdumpmap;
 611
 612 /*
 613  * Internal flags for pmap_enter()'s helper functions.
 614  */
 615 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
 616 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
 617
 618 static void     free_pv_chunk(struct pv_chunk *pc);
 619 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
 620 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 621 static int      popcnt_pc_map_pq(uint64_t *map);
 622 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 623 static void     reserve_pv_entries(pmap_t pmap, int needed,
 624                     struct rwlock **lockp);
 625 static void     pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 626                     struct rwlock **lockp);
 627 static bool     pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
 628                     u_int flags, struct rwlock **lockp);
 629 #if VM_NRESERVLEVEL > 0
 630 static void     pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 631                     struct rwlock **lockp);
 632 #endif
 633 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 634 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 635                     vm_offset_t va);
 636
 637 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 638 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 639 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
 640     vm_offset_t va, struct rwlock **lockp);
 641 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
 642     vm_offset_t va);
 643 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 644                     vm_prot_t prot, struct rwlock **lockp);
 645 static int      pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
 646                     u_int flags, vm_page_t m, struct rwlock **lockp);
 647 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
 648     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 649 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 650 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 651 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
 652     vm_offset_t eva);
 653 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
 654     vm_offset_t eva);
 655 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
 656                     pd_entry_t pde);
 657 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 658 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
 659 #if VM_NRESERVLEVEL > 0
 660 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
 661     struct rwlock **lockp);
 662 #endif
 663 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
 664     vm_prot_t prot);
 665 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
 666 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
 667     bool exec);
 668 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 669 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 670 static void pmap_pti_wire_pte(void *pte);
 671 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
 672     struct spglist *free, struct rwlock **lockp);
 673 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
 674     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 675 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 676 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 677     struct spglist *free);
 678 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 679                     pd_entry_t *pde, struct spglist *free,
 680                     struct rwlock **lockp);
 681 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 682     vm_page_t m, struct rwlock **lockp);
 683 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
 684     pd_entry_t newpde);
 685 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 686
 687 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
 688                 struct rwlock **lockp);
 689 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
 690                 struct rwlock **lockp);
 691 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
 692                 struct rwlock **lockp);
 693
 694 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
 695     struct spglist *free);
 696 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 697
 698 /********************/
 699 /* Inline functions */
 700 /********************/
 701
 702 /* Return a non-clipped PD index for a given VA */
 703 static __inline vm_pindex_t
 704 pmap_pde_pindex(vm_offset_t va)
 705 {
 706         return (va >> PDRSHIFT);
 707 }
 708
 709
 710 /* Return a pointer to the PML4 slot that corresponds to a VA */
 711 static __inline pml4_entry_t *
 712 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 713 {
 714
 715         return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
 716 }
 717
 718 /* Return a pointer to the PDP slot that corresponds to a VA */
 719 static __inline pdp_entry_t *
 720 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 721 {
 722         pdp_entry_t *pdpe;
 723
 724         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 725         return (&pdpe[pmap_pdpe_index(va)]);
 726 }
 727
 728 /* Return a pointer to the PDP slot that corresponds to a VA */
 729 static __inline pdp_entry_t *
 730 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 731 {
 732         pml4_entry_t *pml4e;
 733         pt_entry_t PG_V;
 734
 735         PG_V = pmap_valid_bit(pmap);
 736         pml4e = pmap_pml4e(pmap, va);
 737         if ((*pml4e & PG_V) == 0)
 738                 return (NULL);
 739         return (pmap_pml4e_to_pdpe(pml4e, va));
 740 }
 741
 742 /* Return a pointer to the PD slot that corresponds to a VA */
 743 static __inline pd_entry_t *
 744 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 745 {
 746         pd_entry_t *pde;
 747
 748         pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
 749         return (&pde[pmap_pde_index(va)]);
 750 }
 751
 752 /* Return a pointer to the PD slot that corresponds to a VA */
 753 static __inline pd_entry_t *
 754 pmap_pde(pmap_t pmap, vm_offset_t va)
 755 {
 756         pdp_entry_t *pdpe;
 757         pt_entry_t PG_V;
 758
 759         PG_V = pmap_valid_bit(pmap);
 760         pdpe = pmap_pdpe(pmap, va);
 761         if (pdpe == NULL || (*pdpe & PG_V) == 0)
 762                 return (NULL);
 763         return (pmap_pdpe_to_pde(pdpe, va));
 764 }
 765
 766 /* Return a pointer to the PT slot that corresponds to a VA */
 767 static __inline pt_entry_t *
 768 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 769 {
 770         pt_entry_t *pte;
 771
 772         pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 773         return (&pte[pmap_pte_index(va)]);
 774 }
 775
 776 /* Return a pointer to the PT slot that corresponds to a VA */
 777 static __inline pt_entry_t *
 778 pmap_pte(pmap_t pmap, vm_offset_t va)
 779 {
 780         pd_entry_t *pde;
 781         pt_entry_t PG_V;
 782
 783         PG_V = pmap_valid_bit(pmap);
 784         pde = pmap_pde(pmap, va);
 785         if (pde == NULL || (*pde & PG_V) == 0)
 786                 return (NULL);
 787         if ((*pde & PG_PS) != 0)        /* compat with i386 pmap_pte() */
 788                 return ((pt_entry_t *)pde);
 789         return (pmap_pde_to_pte(pde, va));
 790 }
 791
 792 static __inline void
 793 pmap_resident_count_inc(pmap_t pmap, int count)
 794 {
 795
 796         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 797         pmap->pm_stats.resident_count += count;
 798 }
 799
 800 static __inline void
 801 pmap_resident_count_dec(pmap_t pmap, int count)
 802 {
 803
 804         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 805         KASSERT(pmap->pm_stats.resident_count >= count,
 806             ("pmap %p resident count underflow %ld %d", pmap,
 807             pmap->pm_stats.resident_count, count));
 808         pmap->pm_stats.resident_count -= count;
 809 }
 810
 811 PMAP_INLINE pt_entry_t *
 812 vtopte(vm_offset_t va)
 813 {
 814         u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 815
 816         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 817
 818         return (PTmap + ((va >> PAGE_SHIFT) & mask));
 819 }
 820
 821 static __inline pd_entry_t *
 822 vtopde(vm_offset_t va)
 823 {
 824         u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
 825
 826         KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 827
 828         return (PDmap + ((va >> PDRSHIFT) & mask));
 829 }
 830
 831 static u_int64_t
 832 allocpages(vm_paddr_t *firstaddr, int n)
 833 {
 834         u_int64_t ret;
 835
 836         ret = *firstaddr;
 837         bzero((void *)ret, n * PAGE_SIZE);
 838         *firstaddr += n * PAGE_SIZE;
 839         return (ret);
 840 }
 841
/* Compile-time check that NDMPML4E is a power of two. */
CTASSERT(powerof2(NDMPML4E));

/*
 * Number of kernel PDP slots needed to reference "ptpgs" page table
 * pages: one slot per NPDEPG page table pages, rounded up.
 */
#define NKPDPE(ptpgs)           howmany(ptpgs, NPDEPG)
 846
 847 static void
 848 nkpt_init(vm_paddr_t addr)
 849 {
 850         int pt_pages;
 851
 852 #ifdef NKPT
 853         pt_pages = NKPT;
 854 #else
 855         pt_pages = howmany(addr, 1 << PDRSHIFT);
 856         pt_pages += NKPDPE(pt_pages);
 857
 858         /*
 859          * Add some slop beyond the bare minimum required for bootstrapping
 860          * the kernel.
 861          *
 862          * This is quite important when allocating KVA for kernel modules.
 863          * The modules are required to be linked in the negative 2GB of
 864          * the address space.  If we run out of KVA in this region then
 865          * pmap_growkernel() will need to allocate page table pages to map
 866          * the entire 512GB of KVA space which is an unnecessary tax on
 867          * physical memory.
 868          *
 869          * Secondly, device memory mapped as part of setting up the low-
 870          * level console(s) is taken from KVA, starting at virtual_avail.
 871          * This is because cninit() is called after pmap_bootstrap() but
 872          * before vm_init() and pmap_init(). 20MB for a frame buffer is
 873          * not uncommon.
 874          */
 875         pt_pages += 32;         /* 64MB additional slop. */
 876 #endif
 877         nkpt = pt_pages;
 878 }
 879
 880 /*
 881  * Returns the proper write/execute permission for a physical page that is
 882  * part of the initial boot allocations.
 883  *
 884  * If the page has kernel text, it is marked as read-only. If the page has
 885  * kernel read-only data, it is marked as read-only/not-executable. If the
 886  * page has only read-write data, it is marked as read-write/not-executable.
 887  * If the page is below/above the kernel range, it is marked as read-write.
 888  *
 889  * This function operates on 2M pages, since we map the kernel space that
 890  * way.
 891  *
 892  * Note that this doesn't currently provide any protection for modules.
 893  */
 894 static inline pt_entry_t
 895 bootaddr_rwx(vm_paddr_t pa)
 896 {
 897
 898         /*
 899          * Everything in the same 2M page as the start of the kernel
 900          * should be static. On the other hand, things in the same 2M
 901          * page as the end of the kernel could be read-write/executable,
 902          * as the kernel image is not guaranteed to end on a 2M boundary.
 903          */
 904         if (pa < trunc_2mpage(btext - KERNBASE) ||
 905            pa >= trunc_2mpage(_end - KERNBASE))
 906                 return (X86_PG_RW);
 907         /*
 908          * The linker should ensure that the read-only and read-write
 909          * portions don't share the same 2M page, so this shouldn't
 910          * impact read-only data. However, in any case, any page with
 911          * read-write data needs to be read-write.
 912          */
 913         if (pa >= trunc_2mpage(brwsection - KERNBASE))
 914                 return (X86_PG_RW | pg_nx);
 915         /*
 916          * Mark any 2M page containing kernel text as read-only. Mark
 917          * other pages with read-only data as read-only and not executable.
 918          * (It is likely a small portion of the read-only data section will
 919          * be marked as read-only, but executable. This should be acceptable
 920          * since the read-only protection will keep the data from changing.)
 921          * Note that fixups to the .text section will still work until we
 922          * set CR0.WP.
 923          */
 924         if (pa < round_2mpage(etext - KERNBASE))
 925                 return (0);
 926         return (pg_nx);
 927 }
 928
 929 static void
 930 create_pagetables(vm_paddr_t *firstaddr)
 931 {
 932         int i, j, ndm1g, nkpdpe, nkdmpde;
 933         pt_entry_t *pt_p;
 934         pd_entry_t *pd_p;
 935         pdp_entry_t *pdp_p;
 936         pml4_entry_t *p4_p;
 937         uint64_t DMPDkernphys;
 938
 939         /* Allocate page table pages for the direct map */
 940         ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 941         if (ndmpdp < 4)         /* Minimum 4GB of dirmap */
 942                 ndmpdp = 4;
 943         ndmpdpphys = howmany(ndmpdp, NPDPEPG);
 944         if (ndmpdpphys > NDMPML4E) {
 945                 /*
 946                  * Each NDMPML4E allows 512 GB, so limit to that,
 947                  * and then readjust ndmpdp and ndmpdpphys.
 948                  */
 949                 printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
 950                 Maxmem = atop(NDMPML4E * NBPML4);
 951                 ndmpdpphys = NDMPML4E;
 952                 ndmpdp = NDMPML4E * NPDEPG;
 953         }
 954         DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 955         ndm1g = 0;
 956         if ((amd_feature & AMDID_PAGE1GB) != 0) {
 957                 /*
 958                  * Calculate the number of 1G pages that will fully fit in
 959                  * Maxmem.
 960                  */
 961                 ndm1g = ptoa(Maxmem) >> PDPSHIFT;
 962
 963                 /*
 964                  * Allocate 2M pages for the kernel. These will be used in
 965                  * place of the first one or more 1G pages from ndm1g.
 966                  */
 967                 nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
 968                 DMPDkernphys = allocpages(firstaddr, nkdmpde);
 969         }
 970         if (ndm1g < ndmpdp)
 971                 DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 972         dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 973
 974         /* Allocate pages */
 975         KPML4phys = allocpages(firstaddr, 1);
 976         KPDPphys = allocpages(firstaddr, NKPML4E);
 977
 978         /*
 979          * Allocate the initial number of kernel page table pages required to
 980          * bootstrap.  We defer this until after all memory-size dependent
 981          * allocations are done (e.g. direct map), so that we don't have to
 982          * build in too much slop in our estimate.
 983          *
 984          * Note that when NKPML4E > 1, we have an empty page underneath
 985          * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
 986          * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 987          */
 988         nkpt_init(*firstaddr);
 989         nkpdpe = NKPDPE(nkpt);
 990
 991         KPTphys = allocpages(firstaddr, nkpt);
 992         KPDphys = allocpages(firstaddr, nkpdpe);
 993
 994         /* Fill in the underlying page table pages */
 995         /* XXX not fully used, underneath 2M pages */
 996         pt_p = (pt_entry_t *)KPTphys;
 997         for (i = 0; ptoa(i) < *firstaddr; i++)
 998                 pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));
 999
1000         /* Now map the page tables at their location within PTmap */
1001         pd_p = (pd_entry_t *)KPDphys;
1002         for (i = 0; i < nkpt; i++)
1003                 pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1004
1005         /* Map from zero to end of allocations under 2M pages */
1006         /* This replaces some of the KPTphys entries above */
1007         for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
1008                 /* Preset PG_M and PG_A because demotion expects it. */
1009                 pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1010                     X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
1011
1012         /*
1013          * Because we map the physical blocks in 2M pages, adjust firstaddr
1014          * to record the physical blocks we've actually mapped into kernel
1015          * virtual address space.
1016          */
1017         *firstaddr = round_2mpage(*firstaddr);
1018
1019         /* And connect up the PD to the PDP (leaving room for L4 pages) */
1020         pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
1021         for (i = 0; i < nkpdpe; i++)
1022                 pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
1023
1024         /*
1025          * Now, set up the direct map region using 2MB and/or 1GB pages.  If
1026          * the end of physical memory is not aligned to a 1GB page boundary,
1027          * then the residual physical memory is mapped with 2MB pages.  Later,
1028          * if pmap_mapdev{_attr}() uses the direct map for non-write-back
1029          * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
1030          * that are partially used.
1031          */
1032         pd_p = (pd_entry_t *)DMPDphys;
1033         for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
1034                 pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
1035                 /* Preset PG_M and PG_A because demotion expects it. */
1036                 pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1037                     X86_PG_M | X86_PG_A | pg_nx;
1038         }
1039         pdp_p = (pdp_entry_t *)DMPDPphys;
1040         for (i = 0; i < ndm1g; i++) {
1041                 pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
1042                 /* Preset PG_M and PG_A because demotion expects it. */
1043                 pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
1044                     X86_PG_M | X86_PG_A | pg_nx;
1045         }
1046         for (j = 0; i < ndmpdp; i++, j++) {
1047                 pdp_p[i] = DMPDphys + ptoa(j);
1048                 pdp_p[i] |= X86_PG_RW | X86_PG_V;
1049         }
1050
1051         /*
1052          * Instead of using a 1G page for the memory containing the kernel,
1053          * use 2M pages with appropriate permissions. (If using 1G pages,
1054          * this will partially overwrite the PDPEs above.)
1055          */
1056         if (ndm1g) {
1057                 pd_p = (pd_entry_t *)DMPDkernphys;
1058                 for (i = 0; i < (NPDEPG * nkdmpde); i++)
1059                         pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
1060                             X86_PG_M | X86_PG_A | pg_nx |
1061                             bootaddr_rwx(i << PDRSHIFT);
1062                 for (i = 0; i < nkdmpde; i++)
1063                         pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
1064                             X86_PG_V;
1065         }
1066
1067         /* And recursively map PML4 to itself in order to get PTmap */
1068         p4_p = (pml4_entry_t *)KPML4phys;
1069         p4_p[PML4PML4I] = KPML4phys;
1070         p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
1071
1072         /* Connect the Direct Map slot(s) up to the PML4. */
1073         for (i = 0; i < ndmpdpphys; i++) {
1074                 p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
1075                 p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V;
1076         }
1077
1078         /* Connect the KVA slots up to the PML4 */
1079         for (i = 0; i < NKPML4E; i++) {
1080                 p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
1081                 p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
1082         }
1083 }
1084
/*
 *      Bootstrap the system enough to run with virtual memory.
 *
 *      On amd64 this is called after mapping has already been enabled
 *      and just syncs the pmap module with what has already been done.
 *      [We can't call it easily with mapping off since the kernel is not
 *      mapped with PA == VA, hence we would have to relocate every address
 *      from the linked base (virtual) address "KERNBASE" to the actual
 *      (physical) address starting relative to 0]
 *
 *      "firstaddr" points at the first free physical address on entry.
 *      create_pagetables() advances it past the bootstrap page table
 *      allocations and rounds it up to a 2MB boundary; virtual_avail is
 *      then derived from the updated value.
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
        vm_offset_t va;
        pt_entry_t *pte;
        uint64_t cr4;
        int i;

        /* Record the first free address before it is advanced below. */
        KERNend = *firstaddr;

        /* Global page mappings are only used when PTI is disabled. */
        if (!pti)
                pg_g = X86_PG_G;

        /*
         * Create an initial set of page tables to run the kernel in.
         */
        create_pagetables(firstaddr);

        /*
         * Add a physical memory segment (vm_phys_seg) corresponding to the
         * preallocated kernel page table pages so that vm_page structures
         * representing these pages will be created.  The vm_page structures
         * are required for promotion of the corresponding kernel virtual
         * addresses to superpage mappings.
         */
        vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));

        virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;

        virtual_end = VM_MAX_KERNEL_ADDRESS;


        /*
         * Enable PG_G global pages, then switch to the kernel page
         * table from the bootstrap page table.  After the switch, it
         * is possible to enable SMEP and SMAP since PG_U bits are
         * correct now.
         */
        cr4 = rcr4();
        cr4 |= CR4_PGE;
        load_cr4(cr4);
        load_cr3(KPML4phys);
        if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
                cr4 |= CR4_SMEP;
        if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
                cr4 |= CR4_SMAP;
        load_cr4(cr4);

        /*
         * Initialize the kernel pmap (which is statically allocated).
         */
        PMAP_LOCK_INIT(kernel_pmap);
        kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
        kernel_pmap->pm_cr3 = KPML4phys;
        kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
        CPU_FILL(&kernel_pmap->pm_active);      /* don't allow deactivation */
        TAILQ_INIT(&kernel_pmap->pm_pvchunk);
        kernel_pmap->pm_flags = pmap_flags;

        /*
         * Initialize the TLB invalidations generation number lock.
         */
        mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);

        /*
         * Reserve some special page table entries/VA space for temporary
         * mapping of pages.
         *
         * SYSMAP(c, p, v, n) reserves "n" pages: it stores the current
         * "va" (cast to type "c") in "v" and the current "pte" in "p",
         * then advances both cursors by "n" pages/entries.
         */
#define SYSMAP(c, p, v, n)      \
        v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

        va = virtual_avail;
        pte = vtopte(va);

        /*
         * Crashdump maps.  The first page is reused as CMAP1 for the
         * memory test.
         */
        SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
        CADDR1 = crashdumpmap;

        /* The reserved range is no longer available for general KVA. */
        virtual_avail = va;

        /*
         * Initialize the PAT MSR.
         * pmap_init_pat() clears and sets CR4_PGE, which, as a
         * side-effect, invalidates stale PG_G TLB entries that might
         * have been created in our pre-boot environment.
         */
        pmap_init_pat();

        /* Initialize TLB Context Id. */
        if (pmap_pcid_enabled) {
                for (i = 0; i < MAXCPU; i++) {
                        kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
                        kernel_pmap->pm_pcids[i].pm_gen = 1;
                }

                /*
                 * PMAP_PCID_KERN + 1 is used for initialization of
                 * proc0 pmap.  The pmap's pcid state might be used by
                 * EFIRT entry before first context switch, so it
                 * needs to be valid.
                 */
                PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
                PCPU_SET(pcid_gen, 1);

                /*
                 * pcpu area for APs is zeroed during AP startup.
                 * pc_pcid_next and pc_pcid_gen are initialized by AP
                 * during pcpu setup.
                 */
                load_cr4(rcr4() | CR4_PCIDE);
        }
}
1210
1211 /*
1212  * Setup the PAT MSR.
1213  */
1214 void
1215 pmap_init_pat(void)
1216 {
1217         int pat_table[PAT_INDEX_SIZE];
1218         uint64_t pat_msr;
1219         u_long cr0, cr4;
1220         int i;
1221
1222         /* Bail if this CPU doesn't implement PAT. */
1223         if ((cpu_feature & CPUID_PAT) == 0)
1224                 panic("no PAT??");
1225
1226         /* Set default PAT index table. */
1227         for (i = 0; i < PAT_INDEX_SIZE; i++)
1228                 pat_table[i] = -1;
1229         pat_table[PAT_WRITE_BACK] = 0;
1230         pat_table[PAT_WRITE_THROUGH] = 1;
1231         pat_table[PAT_UNCACHEABLE] = 3;
1232         pat_table[PAT_WRITE_COMBINING] = 3;
1233         pat_table[PAT_WRITE_PROTECTED] = 3;
1234         pat_table[PAT_UNCACHED] = 3;
1235
1236         /* Initialize default PAT entries. */
1237         pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
1238             PAT_VALUE(1, PAT_WRITE_THROUGH) |
1239             PAT_VALUE(2, PAT_UNCACHED) |
1240             PAT_VALUE(3, PAT_UNCACHEABLE) |
1241             PAT_VALUE(4, PAT_WRITE_BACK) |
1242             PAT_VALUE(5, PAT_WRITE_THROUGH) |
1243             PAT_VALUE(6, PAT_UNCACHED) |
1244             PAT_VALUE(7, PAT_UNCACHEABLE);
1245
1246         if (pat_works) {
1247                 /*
1248                  * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
1249                  * Program 5 and 6 as WP and WC.
1250                  * Leave 4 and 7 as WB and UC.
1251                  */
1252                 pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
1253                 pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
1254                     PAT_VALUE(6, PAT_WRITE_COMBINING);
1255                 pat_table[PAT_UNCACHED] = 2;
1256                 pat_table[PAT_WRITE_PROTECTED] = 5;
1257                 pat_table[PAT_WRITE_COMBINING] = 6;
1258         } else {
1259                 /*
1260                  * Just replace PAT Index 2 with WC instead of UC-.
1261                  */
1262                 pat_msr &= ~PAT_MASK(2);
1263                 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
1264                 pat_table[PAT_WRITE_COMBINING] = 2;
1265         }
1266
1267         /* Disable PGE. */
1268         cr4 = rcr4();
1269         load_cr4(cr4 & ~CR4_PGE);
1270
1271         /* Disable caches (CD = 1, NW = 0). */
1272         cr0 = rcr0();
1273         load_cr0((cr0 & ~CR0_NW) | CR0_CD);
1274
1275         /* Flushes caches and TLBs. */
1276         wbinvd();
1277         invltlb();
1278
1279         /* Update PAT and index table. */
1280         wrmsr(MSR_PAT, pat_msr);
1281         for (i = 0; i < PAT_INDEX_SIZE; i++)
1282                 pat_index[i] = pat_table[i];
1283
1284         /* Flush caches and TLBs again. */
1285         wbinvd();
1286         invltlb();
1287
1288         /* Restore caches and PGE. */
1289         load_cr0(cr0);
1290         load_cr4(cr4);
1291 }
1292
1293 /*
1294  *      Initialize a vm_page's machine-dependent fields.
1295  */
1296 void
1297 pmap_page_init(vm_page_t m)
1298 {
1299
1300         TAILQ_INIT(&m->md.pv_list);
1301         m->md.pat_mode = PAT_WRITE_BACK;
1302 }
1303
1304 /*
1305  *      Initialize the pmap module.
1306  *      Called by vm_init, to initialize any structures that the pmap
1307  *      system needs to map virtual memory.
1308  */
1309 void
1310 pmap_init(void)
1311 {
1312         struct pmap_preinit_mapping *ppim;
1313         vm_page_t mpte;
1314         vm_size_t s;
1315         int error, i, pv_npg, ret, skz63;
1316
1317         /* L1TF, reserve page @0 unconditionally */
1318         vm_page_blacklist_add(0, bootverbose);
1319
1320         /* Detect bare-metal Skylake Server and Skylake-X. */
1321         if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
1322             CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
1323                 /*
1324                  * Skylake-X errata SKZ63. Processor May Hang When
1325                  * Executing Code In an HLE Transaction Region between
1326                  * 40000000H and 403FFFFFH.
1327                  *
1328                  * Mark the pages in the range as preallocated.  It
1329                  * seems to be impossible to distinguish between
1330                  * Skylake Server and Skylake X.
1331                  */
1332                 skz63 = 1;
1333                 TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
1334                 if (skz63 != 0) {
1335                         if (bootverbose)
1336                                 printf("SKZ63: skipping 4M RAM starting "
1337                                     "at physical 1G\n");
1338                         for (i = 0; i < atop(0x400000); i++) {
1339                                 ret = vm_page_blacklist_add(0x40000000 +
1340                                     ptoa(i), FALSE);
1341                                 if (!ret && bootverbose)
1342                                         printf("page at %#lx already used\n",
1343                                             0x40000000 + ptoa(i));
1344                         }
1345                 }
1346         }
1347
1348         /*
1349          * Initialize the vm page array entries for the kernel pmap's
1350          * page table pages.
1351          */
1352         PMAP_LOCK(kernel_pmap);
1353         for (i = 0; i < nkpt; i++) {
1354                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
1355                 KASSERT(mpte >= vm_page_array &&
1356                     mpte < &vm_page_array[vm_page_array_size],
1357                     ("pmap_init: page table page is out of range"));
1358                 mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
1359                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
1360                 mpte->wire_count = 1;
1361                 if (i << PDRSHIFT < KERNend &&
1362                     pmap_insert_pt_page(kernel_pmap, mpte))
1363                         panic("pmap_init: pmap_insert_pt_page failed");
1364         }
1365         PMAP_UNLOCK(kernel_pmap);
1366         vm_wire_add(nkpt);
1367
1368         /*
1369          * If the kernel is running on a virtual machine, then it must assume
1370          * that MCA is enabled by the hypervisor.  Moreover, the kernel must
1371          * be prepared for the hypervisor changing the vendor and family that
1372          * are reported by CPUID.  Consequently, the workaround for AMD Family
1373          * 10h Erratum 383 is enabled if the processor's feature set does not
1374          * include at least one feature that is only supported by older Intel
1375          * or newer AMD processors.
1376          */
1377         if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
1378             (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
1379             CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
1380             AMDID2_FMA4)) == 0)
1381                 workaround_erratum383 = 1;
1382
1383         /*
1384          * Are large page mappings enabled?
1385          */
1386         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
1387         if (pg_ps_enabled) {
1388                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1389                     ("pmap_init: can't assign to pagesizes[1]"));
1390                 pagesizes[1] = NBPDR;
1391         }
1392
1393         /*
1394          * Initialize the pv chunk list mutex.
1395          */
1396         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1397
1398         /*
1399          * Initialize the pool of pv list locks.
1400          */
1401         for (i = 0; i < NPV_LIST_LOCKS; i++)
1402                 rw_init(&pv_list_locks[i], "pmap pv list");
1403
1404         /*
1405          * Calculate the size of the pv head table for superpages.
1406          */
1407         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
1408
1409         /*
1410          * Allocate memory for the pv head table for superpages.
1411          */
1412         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1413         s = round_page(s);
1414         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1415         for (i = 0; i < pv_npg; i++)
1416                 TAILQ_INIT(&pv_table[i].pv_list);
1417         TAILQ_INIT(&pv_dummy.pv_list);
1418
1419         pmap_initialized = 1;
1420         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
1421                 ppim = pmap_preinit_mapping + i;
1422                 if (ppim->va == 0)
1423                         continue;
1424                 /* Make the direct map consistent */
1425                 if (ppim->pa < dmaplimit && ppim->pa + ppim->sz < dmaplimit) {
1426                         (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
1427                             ppim->sz, ppim->mode);
1428                 }
1429                 if (!bootverbose)
1430                         continue;
1431                 printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
1432                     ppim->pa, ppim->va, ppim->sz, ppim->mode);
1433         }
1434
1435         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
1436         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
1437             (vmem_addr_t *)&qframe);
1438         if (error != 0)
1439                 panic("qframe allocation failed");
1440 }
1441
/* vm.pmap.pde: read-only counters for 2MB page mappings. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
    "2MB page mapping counters");

/* Exported read-only as vm.pmap.pde.demotions. */
static u_long pmap_pde_demotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pde_demotions, 0, "2MB page demotions");

/* Exported read-only as vm.pmap.pde.mappings. */
static u_long pmap_pde_mappings;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
    &pmap_pde_mappings, 0, "2MB page mappings");

/* Exported read-only as vm.pmap.pde.p_failures. */
static u_long pmap_pde_p_failures;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
    &pmap_pde_p_failures, 0, "2MB page promotion failures");

/* Exported read-only as vm.pmap.pde.promotions. */
static u_long pmap_pde_promotions;
SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
    &pmap_pde_promotions, 0, "2MB page promotions");

/* vm.pmap.pdpe: read-only counters for 1GB page mappings. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
    "1GB page mapping counters");

/* Exported read-only as vm.pmap.pdpe.demotions. */
static u_long pmap_pdpe_demotions;
SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
    &pmap_pdpe_demotions, 0, "1GB page demotions");
1467
1468 /***************************************************
1469  * Low level helper routines.....
1470  ***************************************************/
1471
1472 static pt_entry_t
1473 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
1474 {
1475         int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
1476
1477         switch (pmap->pm_type) {
1478         case PT_X86:
1479         case PT_RVI:
1480                 /* Verify that both PAT bits are not set at the same time */
1481                 KASSERT((entry & x86_pat_bits) != x86_pat_bits,
1482                     ("Invalid PAT bits in entry %#lx", entry));
1483
1484                 /* Swap the PAT bits if one of them is set */
1485                 if ((entry & x86_pat_bits) != 0)
1486                         entry ^= x86_pat_bits;
1487                 break;
1488         case PT_EPT:
1489                 /*
1490                  * Nothing to do - the memory attributes are represented
1491                  * the same way for regular pages and superpages.
1492                  */
1493                 break;
1494         default:
1495                 panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type);
1496         }
1497
1498         return (entry);
1499 }
1500
1501 boolean_t
1502 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
1503 {
1504
1505         return (mode >= 0 && mode < PAT_INDEX_SIZE &&
1506             pat_index[(int)mode] >= 0);
1507 }
1508
1509 /*
1510  * Determine the appropriate bits to set in a PTE or PDE for a specified
1511  * caching mode.
1512  */
1513 int
1514 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
1515 {
1516         int cache_bits, pat_flag, pat_idx;
1517
1518         if (!pmap_is_valid_memattr(pmap, mode))
1519                 panic("Unknown caching mode %d\n", mode);
1520
1521         switch (pmap->pm_type) {
1522         case PT_X86:
1523         case PT_RVI:
1524                 /* The PAT bit is different for PTE's and PDE's. */
1525                 pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
1526
1527                 /* Map the caching mode to a PAT index. */
1528                 pat_idx = pat_index[mode];
1529
1530                 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
1531                 cache_bits = 0;
1532                 if (pat_idx & 0x4)
1533                         cache_bits |= pat_flag;
1534                 if (pat_idx & 0x2)
1535                         cache_bits |= PG_NC_PCD;
1536                 if (pat_idx & 0x1)
1537                         cache_bits |= PG_NC_PWT;
1538                 break;
1539
1540         case PT_EPT:
1541                 cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
1542                 break;
1543
1544         default:
1545                 panic("unsupported pmap type %d", pmap->pm_type);
1546         }
1547
1548         return (cache_bits);
1549 }
1550
1551 static int
1552 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
1553 {
1554         int mask;
1555
1556         switch (pmap->pm_type) {
1557         case PT_X86:
1558         case PT_RVI:
1559                 mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE;
1560                 break;
1561         case PT_EPT:
1562                 mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7);
1563                 break;
1564         default:
1565                 panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
1566         }
1567
1568         return (mask);
1569 }
1570
1571 bool
1572 pmap_ps_enabled(pmap_t pmap)
1573 {
1574
1575         return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
1576 }
1577
1578 static void
1579 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
1580 {
1581
1582         switch (pmap->pm_type) {
1583         case PT_X86:
1584                 break;
1585         case PT_RVI:
1586         case PT_EPT:
1587                 /*
1588                  * XXX
1589                  * This is a little bogus since the generation number is
1590                  * supposed to be bumped up when a region of the address
1591                  * space is invalidated in the page tables.
1592                  *
1593                  * In this case the old PDE entry is valid but yet we want
1594                  * to make sure that any mappings using the old entry are
1595                  * invalidated in the TLB.
1596                  *
1597                  * The reason this works as expected is because we rendezvous
1598                  * "all" host cpus and force any vcpu context to exit as a
1599                  * side-effect.
1600                  */
1601                 atomic_add_acq_long(&pmap->pm_eptgen, 1);
1602                 break;
1603         default:
1604                 panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
1605         }
1606         pde_store(pde, newpde);
1607 }
1608
1609 /*
1610  * After changing the page size for the specified virtual address in the page
1611  * table, flush the corresponding entries from the processor's TLB.  Only the
1612  * calling processor's TLB is affected.
1613  *
1614  * The calling thread must be pinned to a processor.
1615  */
1616 static void
1617 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
1618 {
1619         pt_entry_t PG_G;
1620
1621         if (pmap_type_guest(pmap))
1622                 return;
1623
1624         KASSERT(pmap->pm_type == PT_X86,
1625             ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
1626
1627         PG_G = pmap_global_bit(pmap);
1628
1629         if ((newpde & PG_PS) == 0)
1630                 /* Demotion: flush a specific 2MB page mapping. */
1631                 invlpg(va);
1632         else if ((newpde & PG_G) == 0)
1633                 /*
1634                  * Promotion: flush every 4KB page mapping from the TLB
1635                  * because there are too many to flush individually.
1636                  */
1637                 invltlb();
1638         else {
1639                 /*
1640                  * Promotion: flush every 4KB page mapping from the TLB,
1641                  * including any global (PG_G) mappings.
1642                  */
1643                 invltlb_glob();
1644         }
1645 }
1646 #ifdef SMP
1647
1648 /*
1649  * For SMP, these functions have to use the IPI mechanism for coherence.
1650  *
1651  * N.B.: Before calling any of the following TLB invalidation functions,
1652  * the calling processor must ensure that all stores updating a non-
1653  * kernel page table are globally performed.  Otherwise, another
1654  * processor could cache an old, pre-update entry without being
1655  * invalidated.  This can happen one of two ways: (1) The pmap becomes
1656  * active on another processor after its pm_active field is checked by
1657  * one of the following functions but before a store updating the page
1658  * table is globally performed. (2) The pmap becomes active on another
1659  * processor before its pm_active field is checked but due to
1660  * speculative loads one of the following functions still reads the
1661  * pmap as inactive on the other processor.
1662  *
1663  * The kernel page table is exempt because its pm_active field is
1664  * immutable.  The kernel page table is always active on every
1665  * processor.
1666  */
1667
1668 /*
1669  * Interrupt the cpus that are executing in the guest context.
1670  * This will force the vcpu to exit and the cached EPT mappings
1671  * will be invalidated by the host before the next vmresume.
1672  */
1673 static __inline void
1674 pmap_invalidate_ept(pmap_t pmap)
1675 {
1676         int ipinum;
1677
1678         sched_pin();
1679         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1680             ("pmap_invalidate_ept: absurd pm_active"));
1681
1682         /*
1683          * The TLB mappings associated with a vcpu context are not
1684          * flushed each time a different vcpu is chosen to execute.
1685          *
1686          * This is in contrast with a process's vtop mappings that
1687          * are flushed from the TLB on each context switch.
1688          *
1689          * Therefore we need to do more than just a TLB shootdown on
1690          * the active cpus in 'pmap->pm_active'. To do this we keep
1691          * track of the number of invalidations performed on this pmap.
1692          *
1693          * Each vcpu keeps a cache of this counter and compares it
1694          * just before a vmresume. If the counter is out-of-date an
1695          * invept will be done to flush stale mappings from the TLB.
1696          */
1697         atomic_add_acq_long(&pmap->pm_eptgen, 1);
1698
1699         /*
1700          * Force the vcpu to exit and trap back into the hypervisor.
1701          */
1702         ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
1703         ipi_selected(pmap->pm_active, ipinum);
1704         sched_unpin();
1705 }
1706
1707 static inline void
1708 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
1709     const bool invpcid_works1)
1710 {
1711         struct invpcid_descr d;
1712         uint64_t kcr3, ucr3;
1713         uint32_t pcid;
1714         u_int cpuid, i;
1715
1716         cpuid = PCPU_GET(cpuid);
1717         if (pmap == PCPU_GET(curpmap)) {
1718                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1719                         /*
1720                          * Because pm_pcid is recalculated on a
1721                          * context switch, we must disable switching.
1722                          * Otherwise, we might use a stale value
1723                          * below.
1724                          */
1725                         critical_enter();
1726                         pcid = pmap->pm_pcids[cpuid].pm_pcid;
1727                         if (invpcid_works1) {
1728                                 d.pcid = pcid | PMAP_PCID_USER_PT;
1729                                 d.pad = 0;
1730                                 d.addr = va;
1731                                 invpcid(&d, INVPCID_ADDR);
1732                         } else {
1733                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
1734                                 ucr3 = pmap->pm_ucr3 | pcid |
1735                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
1736                                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
1737                         }
1738                         critical_exit();
1739                 }
1740         } else
1741                 pmap->pm_pcids[cpuid].pm_gen = 0;
1742
1743         CPU_FOREACH(i) {
1744                 if (cpuid != i)
1745                         pmap->pm_pcids[i].pm_gen = 0;
1746         }
1747
1748         /*
1749          * The fence is between stores to pm_gen and the read of the
1750          * pm_active mask.  We need to ensure that it is impossible
1751          * for us to miss the bit update in pm_active and
1752          * simultaneously observe a non-zero pm_gen in
1753          * pmap_activate_sw(), otherwise TLB update is missed.
1754          * Without the fence, IA32 allows such an outcome.  Note that
1755          * pm_active is updated by a locked operation, which provides
1756          * the reciprocal fence.
1757          */
1758         atomic_thread_fence_seq_cst();
1759 }
1760
1761 static void
1762 pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
1763 {
1764
1765         pmap_invalidate_page_pcid(pmap, va, true);
1766 }
1767
1768 static void
1769 pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
1770 {
1771
1772         pmap_invalidate_page_pcid(pmap, va, false);
1773 }
1774
1775 static void
1776 pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
1777 {
1778 }
1779
1780 DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t),
1781     static)
1782 {
1783
1784         if (pmap_pcid_enabled)
1785                 return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
1786                     pmap_invalidate_page_pcid_noinvpcid);
1787         return (pmap_invalidate_page_nopcid);
1788 }
1789
1790 void
1791 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
1792 {
1793         cpuset_t *mask;
1794
1795         if (pmap_type_guest(pmap)) {
1796                 pmap_invalidate_ept(pmap);
1797                 return;
1798         }
1799
1800         KASSERT(pmap->pm_type == PT_X86,
1801             ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
1802
1803         sched_pin();
1804         if (pmap == kernel_pmap) {
1805                 invlpg(va);
1806                 mask = &all_cpus;
1807         } else {
1808                 if (pmap == PCPU_GET(curpmap))
1809                         invlpg(va);
1810                 pmap_invalidate_page_mode(pmap, va);
1811                 mask = &pmap->pm_active;
1812         }
1813         smp_masked_invlpg(*mask, va, pmap);
1814         sched_unpin();
1815 }
1816
/*
 * Range size, in bytes, at or above which pmap_invalidate_range() gives up
 * on per-page invalidation and calls pmap_invalidate_all() instead.
 * Equal to 4k PTEs -- chosen to exceed the total size of the Broadwell
 * L2 TLB.
 */
#define PMAP_INVLPG_THRESHOLD   (4 * 1024 * PAGE_SIZE)
1819
1820 static void
1821 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1822     const bool invpcid_works1)
1823 {
1824         struct invpcid_descr d;
1825         uint64_t kcr3, ucr3;
1826         uint32_t pcid;
1827         u_int cpuid, i;
1828
1829         cpuid = PCPU_GET(cpuid);
1830         if (pmap == PCPU_GET(curpmap)) {
1831                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1832                         critical_enter();
1833                         pcid = pmap->pm_pcids[cpuid].pm_pcid;
1834                         if (invpcid_works1) {
1835                                 d.pcid = pcid | PMAP_PCID_USER_PT;
1836                                 d.pad = 0;
1837                                 d.addr = sva;
1838                                 for (; d.addr < eva; d.addr += PAGE_SIZE)
1839                                         invpcid(&d, INVPCID_ADDR);
1840                         } else {
1841                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
1842                                 ucr3 = pmap->pm_ucr3 | pcid |
1843                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
1844                                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
1845                         }
1846                         critical_exit();
1847                 }
1848         } else
1849                 pmap->pm_pcids[cpuid].pm_gen = 0;
1850
1851         CPU_FOREACH(i) {
1852                 if (cpuid != i)
1853                         pmap->pm_pcids[i].pm_gen = 0;
1854         }
1855         /* See the comment in pmap_invalidate_page_pcid(). */
1856         atomic_thread_fence_seq_cst();
1857 }
1858
1859 static void
1860 pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
1861     vm_offset_t eva)
1862 {
1863
1864         pmap_invalidate_range_pcid(pmap, sva, eva, true);
1865 }
1866
1867 static void
1868 pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
1869     vm_offset_t eva)
1870 {
1871
1872         pmap_invalidate_range_pcid(pmap, sva, eva, false);
1873 }
1874
1875 static void
1876 pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1877 {
1878 }
1879
1880 DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
1881     vm_offset_t), static)
1882 {
1883
1884         if (pmap_pcid_enabled)
1885                 return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
1886                     pmap_invalidate_range_pcid_noinvpcid);
1887         return (pmap_invalidate_range_nopcid);
1888 }
1889
1890 void
1891 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
1892 {
1893         cpuset_t *mask;
1894         vm_offset_t addr;
1895
1896         if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
1897                 pmap_invalidate_all(pmap);
1898                 return;
1899         }
1900
1901         if (pmap_type_guest(pmap)) {
1902                 pmap_invalidate_ept(pmap);
1903                 return;
1904         }
1905
1906         KASSERT(pmap->pm_type == PT_X86,
1907             ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
1908
1909         sched_pin();
1910         if (pmap == kernel_pmap) {
1911                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
1912                         invlpg(addr);
1913                 mask = &all_cpus;
1914         } else {
1915                 if (pmap == PCPU_GET(curpmap)) {
1916                         for (addr = sva; addr < eva; addr += PAGE_SIZE)
1917                                 invlpg(addr);
1918                 }
1919                 pmap_invalidate_range_mode(pmap, sva, eva);
1920                 mask = &pmap->pm_active;
1921         }
1922         smp_masked_invlpg_range(*mask, sva, eva, pmap);
1923         sched_unpin();
1924 }
1925
1926 static inline void
1927 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
1928 {
1929         struct invpcid_descr d;
1930         uint64_t kcr3, ucr3;
1931         uint32_t pcid;
1932         u_int cpuid, i;
1933
1934         if (pmap == kernel_pmap) {
1935                 if (invpcid_works1) {
1936                         bzero(&d, sizeof(d));
1937                         invpcid(&d, INVPCID_CTXGLOB);
1938                 } else {
1939                         invltlb_glob();
1940                 }
1941         } else {
1942                 cpuid = PCPU_GET(cpuid);
1943                 if (pmap == PCPU_GET(curpmap)) {
1944                         critical_enter();
1945                         pcid = pmap->pm_pcids[cpuid].pm_pcid;
1946                         if (invpcid_works1) {
1947                                 d.pcid = pcid;
1948                                 d.pad = 0;
1949                                 d.addr = 0;
1950                                 invpcid(&d, INVPCID_CTX);
1951                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
1952                                         d.pcid |= PMAP_PCID_USER_PT;
1953                                         invpcid(&d, INVPCID_CTX);
1954                                 }
1955                         } else {
1956                                 kcr3 = pmap->pm_cr3 | pcid;
1957                                 ucr3 = pmap->pm_ucr3;
1958                                 if (ucr3 != PMAP_NO_CR3) {
1959                                         ucr3 |= pcid | PMAP_PCID_USER_PT;
1960                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
1961                                 } else {
1962                                         load_cr3(kcr3);
1963                                 }
1964                         }
1965                         critical_exit();
1966                 } else
1967                         pmap->pm_pcids[cpuid].pm_gen = 0;
1968         }
1969         CPU_FOREACH(i) {
1970                 if (cpuid != i)
1971                         pmap->pm_pcids[i].pm_gen = 0;
1972         }
1973         /* See the comment in pmap_invalidate_page_pcid(). */
1974         atomic_thread_fence_seq_cst();
1975 }
1976
1977 static void
1978 pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
1979 {
1980
1981         pmap_invalidate_all_pcid(pmap, true);
1982 }
1983
1984 static void
1985 pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
1986 {
1987
1988         pmap_invalidate_all_pcid(pmap, false);
1989 }
1990
1991 static void
1992 pmap_invalidate_all_nopcid(pmap_t pmap)
1993 {
1994
1995         if (pmap == kernel_pmap)
1996                 invltlb_glob();
1997         else if (pmap == PCPU_GET(curpmap))
1998                 invltlb();
1999 }
2000
2001 DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t), static)
2002 {
2003
2004         if (pmap_pcid_enabled)
2005                 return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
2006                     pmap_invalidate_all_pcid_noinvpcid);
2007         return (pmap_invalidate_all_nopcid);
2008 }
2009
2010 void
2011 pmap_invalidate_all(pmap_t pmap)
2012 {
2013         cpuset_t *mask;
2014
2015         if (pmap_type_guest(pmap)) {
2016                 pmap_invalidate_ept(pmap);
2017                 return;
2018         }
2019
2020         KASSERT(pmap->pm_type == PT_X86,
2021             ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
2022
2023         sched_pin();
2024         mask = pmap == kernel_pmap ? &all_cpus : &pmap->pm_active;
2025         pmap_invalidate_all_mode(pmap);
2026         smp_masked_invltlb(*mask, pmap);
2027         sched_unpin();
2028 }
2029
/*
 * Write back and invalidate the caches (SMP version): wbinvd on the
 * calling CPU followed by smp_cache_flush(), with the thread pinned for
 * the duration.
 */
void
pmap_invalidate_cache(void)
{

        sched_pin();
        /* Local CPU first, then the SMP-wide flush. */
        wbinvd();
        smp_cache_flush();
        sched_unpin();
}
2039
2040 struct pde_action {
2041         cpuset_t invalidate;    /* processors that invalidate their TLB */
2042         pmap_t pmap;
2043         vm_offset_t va;
2044         pd_entry_t *pde;
2045         pd_entry_t newpde;
2046         u_int store;            /* processor that updates the PDE */
2047 };
2048
2049 static void
2050 pmap_update_pde_action(void *arg)
2051 {
2052         struct pde_action *act = arg;
2053
2054         if (act->store == PCPU_GET(cpuid))
2055                 pmap_update_pde_store(act->pmap, act->pde, act->newpde);
2056 }
2057
2058 static void
2059 pmap_update_pde_teardown(void *arg)
2060 {
2061         struct pde_action *act = arg;
2062
2063         if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate))
2064                 pmap_update_pde_invalidate(act->pmap, act->va, act->newpde);
2065 }
2066
2067 /*
2068  * Change the page size for the specified virtual address in a way that
2069  * prevents any possibility of the TLB ever having two entries that map the
2070  * same virtual address using different page sizes.  This is the recommended
2071  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
2072  * machine check exception for a TLB state that is improperly diagnosed as a
2073  * hardware error.
2074  */
2075 static void
2076 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2077 {
2078         struct pde_action act;
2079         cpuset_t active, other_cpus;
2080         u_int cpuid;
2081
2082         sched_pin();
2083         cpuid = PCPU_GET(cpuid);
2084         other_cpus = all_cpus;
2085         CPU_CLR(cpuid, &other_cpus);
2086         if (pmap == kernel_pmap || pmap_type_guest(pmap))
2087                 active = all_cpus;
2088         else {
2089                 active = pmap->pm_active;
2090         }
2091         if (CPU_OVERLAP(&active, &other_cpus)) {
2092                 act.store = cpuid;
2093                 act.invalidate = active;
2094                 act.va = va;
2095                 act.pmap = pmap;
2096                 act.pde = pde;
2097                 act.newpde = newpde;
2098                 CPU_SET(cpuid, &active);
2099                 smp_rendezvous_cpus(active,
2100                     smp_no_rendezvous_barrier, pmap_update_pde_action,
2101                     pmap_update_pde_teardown, &act);
2102         } else {
2103                 pmap_update_pde_store(pmap, pde, newpde);
2104                 if (CPU_ISSET(cpuid, &active))
2105                         pmap_update_pde_invalidate(pmap, va, newpde);
2106         }
2107         sched_unpin();
2108 }
2109 #else /* !SMP */
2110 /*
2111  * Normal, non-SMP, invalidation functions.
2112  */
2113 void
2114 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
2115 {
2116         struct invpcid_descr d;
2117         uint64_t kcr3, ucr3;
2118         uint32_t pcid;
2119
2120         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2121                 pmap->pm_eptgen++;
2122                 return;
2123         }
2124         KASSERT(pmap->pm_type == PT_X86,
2125             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2126
2127         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2128                 invlpg(va);
2129                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2130                     pmap->pm_ucr3 != PMAP_NO_CR3) {
2131                         critical_enter();
2132                         pcid = pmap->pm_pcids[0].pm_pcid;
2133                         if (invpcid_works) {
2134                                 d.pcid = pcid | PMAP_PCID_USER_PT;
2135                                 d.pad = 0;
2136                                 d.addr = va;
2137                                 invpcid(&d, INVPCID_ADDR);
2138                         } else {
2139                                 kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
2140                                 ucr3 = pmap->pm_ucr3 | pcid |
2141                                     PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2142                                 pmap_pti_pcid_invlpg(ucr3, kcr3, va);
2143                         }
2144                         critical_exit();
2145                 }
2146         } else if (pmap_pcid_enabled)
2147                 pmap->pm_pcids[0].pm_gen = 0;
2148 }
2149
2150 void
2151 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2152 {
2153         struct invpcid_descr d;
2154         vm_offset_t addr;
2155         uint64_t kcr3, ucr3;
2156
2157         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2158                 pmap->pm_eptgen++;
2159                 return;
2160         }
2161         KASSERT(pmap->pm_type == PT_X86,
2162             ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
2163
2164         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
2165                 for (addr = sva; addr < eva; addr += PAGE_SIZE)
2166                         invlpg(addr);
2167                 if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
2168                     pmap->pm_ucr3 != PMAP_NO_CR3) {
2169                         critical_enter();
2170                         if (invpcid_works) {
2171                                 d.pcid = pmap->pm_pcids[0].pm_pcid |
2172                                     PMAP_PCID_USER_PT;
2173                                 d.pad = 0;
2174                                 d.addr = sva;
2175                                 for (; d.addr < eva; d.addr += PAGE_SIZE)
2176                                         invpcid(&d, INVPCID_ADDR);
2177                         } else {
2178                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
2179                                     pm_pcid | CR3_PCID_SAVE;
2180                                 ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
2181                                     pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
2182                                 pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
2183                         }
2184                         critical_exit();
2185                 }
2186         } else if (pmap_pcid_enabled) {
2187                 pmap->pm_pcids[0].pm_gen = 0;
2188         }
2189 }
2190
2191 void
2192 pmap_invalidate_all(pmap_t pmap)
2193 {
2194         struct invpcid_descr d;
2195         uint64_t kcr3, ucr3;
2196
2197         if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
2198                 pmap->pm_eptgen++;
2199                 return;
2200         }
2201         KASSERT(pmap->pm_type == PT_X86,
2202             ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
2203
2204         if (pmap == kernel_pmap) {
2205                 if (pmap_pcid_enabled && invpcid_works) {
2206                         bzero(&d, sizeof(d));
2207                         invpcid(&d, INVPCID_CTXGLOB);
2208                 } else {
2209                         invltlb_glob();
2210                 }
2211         } else if (pmap == PCPU_GET(curpmap)) {
2212                 if (pmap_pcid_enabled) {
2213                         critical_enter();
2214                         if (invpcid_works) {
2215                                 d.pcid = pmap->pm_pcids[0].pm_pcid;
2216                                 d.pad = 0;
2217                                 d.addr = 0;
2218                                 invpcid(&d, INVPCID_CTX);
2219                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2220                                         d.pcid |= PMAP_PCID_USER_PT;
2221                                         invpcid(&d, INVPCID_CTX);
2222                                 }
2223                         } else {
2224                                 kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
2225                                 if (pmap->pm_ucr3 != PMAP_NO_CR3) {
2226                                         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
2227                                             0].pm_pcid | PMAP_PCID_USER_PT;
2228                                         pmap_pti_pcid_invalidate(ucr3, kcr3);
2229                                 } else
2230                                         load_cr3(kcr3);
2231                         }
2232                         critical_exit();
2233                 } else {
2234                         invltlb();
2235                 }
2236         } else if (pmap_pcid_enabled) {
2237                 pmap->pm_pcids[0].pm_gen = 0;
2238         }
2239 }
2240
2241 PMAP_INLINE void
2242 pmap_invalidate_cache(void)
2243 {
2244
2245         wbinvd();
2246 }
2247
2248 static void
2249 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
2250 {
2251
2252         pmap_update_pde_store(pmap, pde, newpde);
2253         if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
2254                 pmap_update_pde_invalidate(pmap, va, newpde);
2255         else
2256                 pmap->pm_pcids[0].pm_gen = 0;
2257 }
2258 #endif /* !SMP */
2259
2260 static void
2261 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
2262 {
2263
2264         /*
2265          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
2266          * by a promotion that did not invalidate the 512 4KB page mappings
2267          * that might exist in the TLB.  Consequently, at this point, the TLB
2268          * may hold both 4KB and 2MB page mappings for the address range [va,
2269          * va + NBPDR).  Therefore, the entire range must be invalidated here.
2270          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
2271          * 4KB page mappings for the address range [va, va + NBPDR), and so a
2272          * single INVLPG suffices to invalidate the 2MB page mapping from the
2273          * TLB.
2274          */
2275         if ((pde & PG_PROMOTED) != 0)
2276                 pmap_invalidate_range(pmap, va, va + NBPDR - 1);
2277         else
2278                 pmap_invalidate_page(pmap, va);
2279 }
2280
2281 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
2282     (vm_offset_t sva, vm_offset_t eva), static)
2283 {
2284
2285         if ((cpu_feature & CPUID_SS) != 0)
2286                 return (pmap_invalidate_cache_range_selfsnoop);
2287         if ((cpu_feature & CPUID_CLFSH) != 0)
2288                 return (pmap_force_invalidate_cache_range);
2289         return (pmap_invalidate_cache_range_all);
2290 }
2291
/*
 * Size in bytes (2MB) at or above which a full cache invalidation is used
 * instead of flushing the range one cache line at a time.
 */
#define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
2293
2294 static void
2295 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
2296 {
2297
2298         KASSERT((sva & PAGE_MASK) == 0,
2299             ("pmap_invalidate_cache_range: sva not page-aligned"));
2300         KASSERT((eva & PAGE_MASK) == 0,
2301             ("pmap_invalidate_cache_range: eva not page-aligned"));
2302 }
2303
2304 static void
2305 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
2306 {
2307
2308         pmap_invalidate_cache_range_check_align(sva, eva);
2309 }
2310
2311 void
2312 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
2313 {
2314
2315         sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
2316         if (eva - sva >= PMAP_CLFLUSH_THRESHOLD) {
2317                 /*
2318                  * The supplied range is bigger than 2MB.
2319                  * Globally invalidate cache.
2320                  */
2321                 pmap_invalidate_cache();
2322                 return;
2323         }
2324
2325         /*
2326          * XXX: Some CPUs fault, hang, or trash the local APIC
2327          * registers if we use CLFLUSH on the local APIC range.  The
2328          * local APIC is always uncached, so we don't need to flush
2329          * for that range anyway.
2330          */
2331         if (pmap_kextract(sva) == lapic_paddr)
2332                 return;
2333
2334         if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) {
2335                 /*
2336                  * Do per-cache line flush.  Use the sfence
2337                  * instruction to insure that previous stores are
2338                  * included in the write-back.  The processor
2339                  * propagates flush to other processors in the cache
2340                  * coherence domain.
2341                  */
2342                 sfence();
2343                 for (; sva < eva; sva += cpu_clflush_line_size)
2344                         clflushopt(sva);
2345                 sfence();
2346         } else {
2347                 /*
2348                  * Writes are ordered by CLFLUSH on Intel CPUs.
2349                  */
2350                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2351                         mfence();
2352                 for (; sva < eva; sva += cpu_clflush_line_size)
2353                         clflush(sva);
2354                 if (cpu_vendor_id != CPU_VENDOR_INTEL)
2355                         mfence();
2356         }
2357 }
2358
2359 static void
2360 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
2361 {
2362
2363         pmap_invalidate_cache_range_check_align(sva, eva);
2364         pmap_invalidate_cache();
2365 }
2366
2367 /*
2368  * Remove the specified set of pages from the data and instruction caches.
2369  *
2370  * In contrast to pmap_invalidate_cache_range(), this function does not
2371  * rely on the CPU's self-snoop feature, because it is intended for use
2372  * when moving pages into a different cache domain.
2373  */
2374 void
2375 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
2376 {
2377         vm_offset_t daddr, eva;
2378         int i;
2379         bool useclflushopt;
2380
2381         useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
2382         if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
2383             ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt))
2384                 pmap_invalidate_cache();
2385         else {
2386                 if (useclflushopt)
2387                         sfence();
2388                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2389                         mfence();
2390                 for (i = 0; i < count; i++) {
2391                         daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i]));
2392                         eva = daddr + PAGE_SIZE;
2393                         for (; daddr < eva; daddr += cpu_clflush_line_size) {
2394                                 if (useclflushopt)
2395                                         clflushopt(daddr);
2396                                 else
2397                                         clflush(daddr);
2398                         }
2399                 }
2400                 if (useclflushopt)
2401                         sfence();
2402                 else if (cpu_vendor_id != CPU_VENDOR_INTEL)
2403                         mfence();
2404         }
2405 }
2406
2407 /*
2408  *      Routine:        pmap_extract
2409  *      Function:
2410  *              Extract the physical page address associated
2411  *              with the given map/virtual_address pair.
2412  */
2413 vm_paddr_t
2414 pmap_extract(pmap_t pmap, vm_offset_t va)
2415 {
2416         pdp_entry_t *pdpe;
2417         pd_entry_t *pde;
2418         pt_entry_t *pte, PG_V;
2419         vm_paddr_t pa;
2420
2421         pa = 0;
2422         PG_V = pmap_valid_bit(pmap);
2423         PMAP_LOCK(pmap);
2424         pdpe = pmap_pdpe(pmap, va);
2425         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
2426                 if ((*pdpe & PG_PS) != 0)
2427                         pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
2428                 else {
2429                         pde = pmap_pdpe_to_pde(pdpe, va);
2430                         if ((*pde & PG_V) != 0) {
2431                                 if ((*pde & PG_PS) != 0) {
2432                                         pa = (*pde & PG_PS_FRAME) |
2433                                             (va & PDRMASK);
2434                                 } else {
2435                                         pte = pmap_pde_to_pte(pde, va);
2436                                         pa = (*pte & PG_FRAME) |
2437                                             (va & PAGE_MASK);
2438                                 }
2439                         }
2440                 }
2441         }
2442         PMAP_UNLOCK(pmap);
2443         return (pa);
2444 }
2445
2446 /*
2447  *      Routine:        pmap_extract_and_hold
2448  *      Function:
2449  *              Atomically extract and hold the physical page
2450  *              with the given pmap and virtual address pair
2451  *              if that mapping permits the given protection.
2452  */
2453 vm_page_t
2454 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
2455 {
2456         pd_entry_t pde, *pdep;
2457         pt_entry_t pte, PG_RW, PG_V;
2458         vm_paddr_t pa;
2459         vm_page_t m;
2460
2461         pa = 0;
2462         m = NULL;
2463         PG_RW = pmap_rw_bit(pmap);
2464         PG_V = pmap_valid_bit(pmap);
2465         PMAP_LOCK(pmap);
2466 retry:
2467         pdep = pmap_pde(pmap, va);
2468         if (pdep != NULL && (pde = *pdep)) {
2469                 if (pde & PG_PS) {
2470                         if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
2471                                 if (vm_page_pa_tryrelock(pmap, (pde &
2472                                     PG_PS_FRAME) | (va & PDRMASK), &pa))
2473                                         goto retry;
2474                                 m = PHYS_TO_VM_PAGE(pa);
2475                         }
2476                 } else {
2477                         pte = *pmap_pde_to_pte(pdep, va);
2478                         if ((pte & PG_V) &&
2479                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
2480                                 if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
2481                                     &pa))
2482                                         goto retry;
2483                                 m = PHYS_TO_VM_PAGE(pa);
2484                         }
2485                 }
2486                 if (m != NULL)
2487                         vm_page_hold(m);
2488         }
2489         PA_UNLOCK_COND(pa);
2490         PMAP_UNLOCK(pmap);
2491         return (m);
2492 }
2493
2494 vm_paddr_t
2495 pmap_kextract(vm_offset_t va)
2496 {
2497         pd_entry_t pde;
2498         vm_paddr_t pa;
2499
2500         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
2501                 pa = DMAP_TO_PHYS(va);
2502         } else {
2503                 pde = *vtopde(va);
2504                 if (pde & PG_PS) {
2505                         pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
2506                 } else {
2507                         /*
2508                          * Beware of a concurrent promotion that changes the
2509                          * PDE at this point!  For example, vtopte() must not
2510                          * be used to access the PTE because it would use the
2511                          * new PDE.  It is, however, safe to use the old PDE
2512                          * because the page table page is preserved by the
2513                          * promotion.
2514                          */
2515                         pa = *pmap_pde_to_pte(&pde, va);
2516                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
2517                 }
2518         }
2519         return (pa);
2520 }
2521
2522 /***************************************************
2523  * Low level mapping routines.....
2524  ***************************************************/
2525
2526 /*
2527  * Add a wired page to the kva.
2528  * Note: not SMP coherent.
2529  */
2530 PMAP_INLINE void
2531 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
2532 {
2533         pt_entry_t *pte;
2534
2535         pte = vtopte(va);
2536         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g);
2537 }
2538
2539 static __inline void
2540 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
2541 {
2542         pt_entry_t *pte;
2543         int cache_bits;
2544
2545         pte = vtopte(va);
2546         cache_bits = pmap_cache_bits(kernel_pmap, mode, 0);
2547         pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits);
2548 }
2549
2550 /*
2551  * Remove a page from the kernel pagetables.
2552  * Note: not SMP coherent.
2553  */
2554 PMAP_INLINE void
2555 pmap_kremove(vm_offset_t va)
2556 {
2557         pt_entry_t *pte;
2558
2559         pte = vtopte(va);
2560         pte_clear(pte);
2561 }
2562
2563 /*
2564  *      Used to map a range of physical addresses into kernel
2565  *      virtual address space.
2566  *
2567  *      The value passed in '*virt' is a suggested virtual address for
2568  *      the mapping. Architectures which can support a direct-mapped
2569  *      physical to virtual region can return the appropriate address
2570  *      within that region, leaving '*virt' unchanged. Other
2571  *      architectures should map the pages starting at '*virt' and
2572  *      update '*virt' with the first usable address after the mapped
2573  *      region.
2574  */
2575 vm_offset_t
2576 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2577 {
2578         return PHYS_TO_DMAP(start);
2579 }
2580
2581
2582 /*
2583  * Add a list of wired pages to the kva
2584  * this routine is only used for temporary
2585  * kernel mappings that do not need to have
2586  * page modification or references recorded.
2587  * Note that old mappings are simply written
2588  * over.  The page *must* be wired.
2589  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2590  */
2591 void
2592 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2593 {
2594         pt_entry_t *endpte, oldpte, pa, *pte;
2595         vm_page_t m;
2596         int cache_bits;
2597
2598         oldpte = 0;
2599         pte = vtopte(sva);
2600         endpte = pte + count;
2601         while (pte < endpte) {
2602                 m = *ma++;
2603                 cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
2604                 pa = VM_PAGE_TO_PHYS(m) | cache_bits;
2605                 if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) {
2606                         oldpte |= *pte;
2607                         pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
2608                 }
2609                 pte++;
2610         }
2611         if (__predict_false((oldpte & X86_PG_V) != 0))
2612                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
2613                     PAGE_SIZE);
2614 }
2615
2616 /*
2617  * This routine tears out page mappings from the
2618  * kernel -- it is meant only for temporary mappings.
2619  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2620  */
2621 void
2622 pmap_qremove(vm_offset_t sva, int count)
2623 {
2624         vm_offset_t va;
2625
2626         va = sva;
2627         while (count-- > 0) {
2628                 KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
2629                 pmap_kremove(va);
2630                 va += PAGE_SIZE;
2631         }
2632         pmap_invalidate_range(kernel_pmap, sva, va);
2633 }
2634
2635 /***************************************************
2636  * Page table page management routines.....
2637  ***************************************************/
2638 /*
2639  * Schedule the specified unused page table page to be freed.  Specifically,
2640  * add the page to the specified list of pages that will be released to the
2641  * physical memory manager after the TLB has been updated.
2642  */
2643 static __inline void
2644 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2645     boolean_t set_PG_ZERO)
2646 {
2647
2648         if (set_PG_ZERO)
2649                 m->flags |= PG_ZERO;
2650         else
2651                 m->flags &= ~PG_ZERO;
2652         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2653 }
2654
2655 /*
2656  * Inserts the specified page table page into the specified pmap's collection
2657  * of idle page table pages.  Each of a pmap's page table pages is responsible
2658  * for mapping a distinct range of virtual addresses.  The pmap's collection is
2659  * ordered by this virtual address range.
2660  */
2661 static __inline int
2662 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2663 {
2664
2665         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2666         return (vm_radix_insert(&pmap->pm_root, mpte));
2667 }
2668
2669 /*
2670  * Removes the page table page mapping the specified virtual address from the
2671  * specified pmap's collection of idle page table pages, and returns it.
2672  * Otherwise, returns NULL if there is no page table page corresponding to the
2673  * specified virtual address.
2674  */
2675 static __inline vm_page_t
2676 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2677 {
2678
2679         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2680         return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
2681 }
2682
2683 /*
2684  * Decrements a page table page's wire count, which is used to record the
2685  * number of valid page table entries within the page.  If the wire count
2686  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2687  * page table page was unmapped and FALSE otherwise.
2688  */
2689 static inline boolean_t
2690 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2691 {
2692
2693         --m->wire_count;
2694         if (m->wire_count == 0) {
2695                 _pmap_unwire_ptp(pmap, va, m, free);
2696                 return (TRUE);
2697         } else
2698                 return (FALSE);
2699 }
2700
2701 static void
2702 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2703 {
2704
2705         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2706         /*
2707          * unmap the page table page
2708          */
2709         if (m->pindex >= (NUPDE + NUPDPE)) {
2710                 /* PDP page */
2711                 pml4_entry_t *pml4;
2712                 pml4 = pmap_pml4e(pmap, va);
2713                 *pml4 = 0;
2714                 if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
2715                         pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
2716                         *pml4 = 0;
2717                 }
2718         } else if (m->pindex >= NUPDE) {
2719                 /* PD page */
2720                 pdp_entry_t *pdp;
2721                 pdp = pmap_pdpe(pmap, va);
2722                 *pdp = 0;
2723         } else {
2724                 /* PTE page */
2725                 pd_entry_t *pd;
2726                 pd = pmap_pde(pmap, va);
2727                 *pd = 0;
2728         }
2729         pmap_resident_count_dec(pmap, 1);
2730         if (m->pindex < NUPDE) {
2731                 /* We just released a PT, unhold the matching PD */
2732                 vm_page_t pdpg;
2733
2734                 pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
2735                 pmap_unwire_ptp(pmap, va, pdpg, free);
2736         }
2737         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
2738                 /* We just released a PD, unhold the matching PDP */
2739                 vm_page_t pdppg;
2740
2741                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
2742                 pmap_unwire_ptp(pmap, va, pdppg, free);
2743         }
2744
2745         /*
2746          * Put page on a list so that it is released after
2747          * *ALL* TLB shootdown is done
2748          */
2749         pmap_add_delayed_free_list(m, free, TRUE);
2750 }
2751
2752 /*
2753  * After removing a page table entry, this routine is used to
2754  * conditionally free the page, and manage the hold/wire counts.
2755  */
2756 static int
2757 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2758     struct spglist *free)
2759 {
2760         vm_page_t mpte;
2761
2762         if (va >= VM_MAXUSER_ADDRESS)
2763                 return (0);
2764         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2765         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
2766         return (pmap_unwire_ptp(pmap, va, mpte, free));
2767 }
2768
2769 void
2770 pmap_pinit0(pmap_t pmap)
2771 {
2772         int i;
2773
2774         PMAP_LOCK_INIT(pmap);
2775         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
2776         pmap->pm_pml4u = NULL;
2777         pmap->pm_cr3 = KPML4phys;
2778         /* hack to keep pmap_pti_pcid_invalidate() alive */
2779         pmap->pm_ucr3 = PMAP_NO_CR3;
2780         pmap->pm_root.rt_root = 0;
2781         CPU_ZERO(&pmap->pm_active);
2782         TAILQ_INIT(&pmap->pm_pvchunk);
2783         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2784         pmap->pm_flags = pmap_flags;
2785         CPU_FOREACH(i) {
2786                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
2787                 pmap->pm_pcids[i].pm_gen = 1;
2788         }
2789         pmap_activate_boot(pmap);
2790 }
2791
2792 void
2793 pmap_pinit_pml4(vm_page_t pml4pg)
2794 {
2795         pml4_entry_t *pm_pml4;
2796         int i;
2797
2798         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2799
2800         /* Wire in kernel global address entries. */
2801         for (i = 0; i < NKPML4E; i++) {
2802                 pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW |
2803                     X86_PG_V;
2804         }
2805         for (i = 0; i < ndmpdpphys; i++) {
2806                 pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW |
2807                     X86_PG_V;
2808         }
2809
2810         /* install self-referential address mapping entry(s) */
2811         pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
2812             X86_PG_A | X86_PG_M;
2813 }
2814
2815 static void
2816 pmap_pinit_pml4_pti(vm_page_t pml4pg)
2817 {
2818         pml4_entry_t *pm_pml4;
2819         int i;
2820
2821         pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
2822         for (i = 0; i < NPML4EPG; i++)
2823                 pm_pml4[i] = pti_pml4[i];
2824 }
2825
2826 /*
2827  * Initialize a preallocated and zeroed pmap structure,
2828  * such as one in a vmspace structure.
2829  */
2830 int
2831 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
2832 {
2833         vm_page_t pml4pg, pml4pgu;
2834         vm_paddr_t pml4phys;
2835         int i;
2836
2837         /*
2838          * allocate the page directory page
2839          */
2840         pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2841             VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
2842
2843         pml4phys = VM_PAGE_TO_PHYS(pml4pg);
2844         pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
2845         CPU_FOREACH(i) {
2846                 pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
2847                 pmap->pm_pcids[i].pm_gen = 0;
2848         }
2849         pmap->pm_cr3 = PMAP_NO_CR3;     /* initialize to an invalid value */
2850         pmap->pm_ucr3 = PMAP_NO_CR3;
2851         pmap->pm_pml4u = NULL;
2852
2853         pmap->pm_type = pm_type;
2854         if ((pml4pg->flags & PG_ZERO) == 0)
2855                 pagezero(pmap->pm_pml4);
2856
2857         /*
2858          * Do not install the host kernel mappings in the nested page
2859          * tables. These mappings are meaningless in the guest physical
2860          * address space.
2861          * Install minimal kernel mappings in PTI case.
2862          */
2863         if (pm_type == PT_X86) {
2864                 pmap->pm_cr3 = pml4phys;
2865                 pmap_pinit_pml4(pml4pg);
2866                 if (pti) {
2867                         pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2868                             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
2869                         pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
2870                             VM_PAGE_TO_PHYS(pml4pgu));
2871                         pmap_pinit_pml4_pti(pml4pgu);
2872                         pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
2873                 }
2874         }
2875
2876         pmap->pm_root.rt_root = 0;
2877         CPU_ZERO(&pmap->pm_active);
2878         TAILQ_INIT(&pmap->pm_pvchunk);
2879         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
2880         pmap->pm_flags = flags;
2881         pmap->pm_eptgen = 0;
2882
2883         return (1);
2884 }
2885
2886 int
2887 pmap_pinit(pmap_t pmap)
2888 {
2889
2890         return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
2891 }
2892
2893 /*
2894  * This routine is called if the desired page table page does not exist.
2895  *
2896  * If page table page allocation fails, this routine may sleep before
2897  * returning NULL.  It sleeps only if a lock pointer was given.
2898  *
2899  * Note: If a page allocation fails at page table level two or three,
2900  * one or two pages may be held during the wait, only to be released
2901  * afterwards.  This conservative approach is easily argued to avoid
2902  * race conditions.
2903  */
2904 static vm_page_t
2905 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2906 {
2907         vm_page_t m, pdppg, pdpg;
2908         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
2909
2910         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2911
2912         PG_A = pmap_accessed_bit(pmap);
2913         PG_M = pmap_modified_bit(pmap);
2914         PG_V = pmap_valid_bit(pmap);
2915         PG_RW = pmap_rw_bit(pmap);
2916
2917         /*
2918          * Allocate a page table page.
2919          */
2920         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
2921             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2922                 if (lockp != NULL) {
2923                         RELEASE_PV_LIST_LOCK(lockp);
2924                         PMAP_UNLOCK(pmap);
2925                         PMAP_ASSERT_NOT_IN_DI();
2926                         vm_wait(NULL);
2927                         PMAP_LOCK(pmap);
2928                 }
2929
2930                 /*
2931                  * Indicate the need to retry.  While waiting, the page table
2932                  * page may have been allocated.
2933                  */
2934                 return (NULL);
2935         }
2936         if ((m->flags & PG_ZERO) == 0)
2937                 pmap_zero_page(m);
2938
2939         /*
2940          * Map the pagetable page into the process address space, if
2941          * it isn't already there.
2942          */
2943
2944         if (ptepindex >= (NUPDE + NUPDPE)) {
2945                 pml4_entry_t *pml4, *pml4u;
2946                 vm_pindex_t pml4index;
2947
2948                 /* Wire up a new PDPE page */
2949                 pml4index = ptepindex - (NUPDE + NUPDPE);
2950                 pml4 = &pmap->pm_pml4[pml4index];
2951                 *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2952                 if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
2953                         /*
2954                          * PTI: Make all user-space mappings in the
2955                          * kernel-mode page table no-execute so that
2956                          * we detect any programming errors that leave
2957                          * the kernel-mode page table active on return
2958                          * to user space.
2959                          */
2960                         if (pmap->pm_ucr3 != PMAP_NO_CR3)
2961                                 *pml4 |= pg_nx;
2962
2963                         pml4u = &pmap->pm_pml4u[pml4index];
2964                         *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
2965                             PG_A | PG_M;
2966                 }
2967
2968         } else if (ptepindex >= NUPDE) {
2969                 vm_pindex_t pml4index;
2970                 vm_pindex_t pdpindex;
2971                 pml4_entry_t *pml4;
2972                 pdp_entry_t *pdp;
2973
2974                 /* Wire up a new PDE page */
2975                 pdpindex = ptepindex - NUPDE;
2976                 pml4index = pdpindex >> NPML4EPGSHIFT;
2977
2978                 pml4 = &pmap->pm_pml4[pml4index];
2979                 if ((*pml4 & PG_V) == 0) {
2980                         /* Have to allocate a new pdp, recurse */
2981                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
2982                             lockp) == NULL) {
2983                                 vm_page_unwire_noq(m);
2984                                 vm_page_free_zero(m);
2985                                 return (NULL);
2986                         }
2987                 } else {
2988                         /* Add reference to pdp page */
2989                         pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
2990                         pdppg->wire_count++;
2991                 }
2992                 pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
2993
2994                 /* Now find the pdp page */
2995                 pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
2996                 *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
2997
2998         } else {
2999                 vm_pindex_t pml4index;
3000                 vm_pindex_t pdpindex;
3001                 pml4_entry_t *pml4;
3002                 pdp_entry_t *pdp;
3003                 pd_entry_t *pd;
3004
3005                 /* Wire up a new PTE page */
3006                 pdpindex = ptepindex >> NPDPEPGSHIFT;
3007                 pml4index = pdpindex >> NPML4EPGSHIFT;
3008
3009                 /* First, find the pdp and check that its valid. */
3010                 pml4 = &pmap->pm_pml4[pml4index];
3011                 if ((*pml4 & PG_V) == 0) {
3012                         /* Have to allocate a new pd, recurse */
3013                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3014                             lockp) == NULL) {
3015                                 vm_page_unwire_noq(m);
3016                                 vm_page_free_zero(m);
3017                                 return (NULL);
3018                         }
3019                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3020                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3021                 } else {
3022                         pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
3023                         pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
3024                         if ((*pdp & PG_V) == 0) {
3025                                 /* Have to allocate a new pd, recurse */
3026                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
3027                                     lockp) == NULL) {
3028                                         vm_page_unwire_noq(m);
3029                                         vm_page_free_zero(m);
3030                                         return (NULL);
3031                                 }
3032                         } else {
3033                                 /* Add reference to the pd page */
3034                                 pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
3035                                 pdpg->wire_count++;
3036                         }
3037                 }
3038                 pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
3039
3040                 /* Now we know where the page directory page is */
3041                 pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
3042                 *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
3043         }
3044
3045         pmap_resident_count_inc(pmap, 1);
3046
3047         return (m);
3048 }
3049
3050 static vm_page_t
3051 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3052 {
3053         vm_pindex_t pdpindex, ptepindex;
3054         pdp_entry_t *pdpe, PG_V;
3055         vm_page_t pdpg;
3056
3057         PG_V = pmap_valid_bit(pmap);
3058
3059 retry:
3060         pdpe = pmap_pdpe(pmap, va);
3061         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
3062                 /* Add a reference to the pd page. */
3063                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
3064                 pdpg->wire_count++;
3065         } else {
3066                 /* Allocate a pd page. */
3067                 ptepindex = pmap_pde_pindex(va);
3068                 pdpindex = ptepindex >> NPDPEPGSHIFT;
3069                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
3070                 if (pdpg == NULL && lockp != NULL)
3071                         goto retry;
3072         }
3073         return (pdpg);
3074 }
3075
3076 static vm_page_t
3077 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
3078 {
3079         vm_pindex_t ptepindex;
3080         pd_entry_t *pd, PG_V;
3081         vm_page_t m;
3082
3083         PG_V = pmap_valid_bit(pmap);
3084
3085         /*
3086          * Calculate pagetable page index
3087          */
3088         ptepindex = pmap_pde_pindex(va);
3089 retry:
3090         /*
3091          * Get the page directory entry
3092          */
3093         pd = pmap_pde(pmap, va);
3094
3095         /*
3096          * This supports switching from a 2MB page to a
3097          * normal 4K page.
3098          */
3099         if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
3100                 if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) {
3101                         /*
3102                          * Invalidation of the 2MB page mapping may have caused
3103                          * the deallocation of the underlying PD page.
3104                          */
3105                         pd = NULL;
3106                 }
3107         }
3108
3109         /*
3110          * If the page table page is mapped, we just increment the
3111          * hold count, and activate it.
3112          */
3113         if (pd != NULL && (*pd & PG_V) != 0) {
3114                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
3115                 m->wire_count++;
3116         } else {
3117                 /*
3118                  * Here if the pte page isn't mapped, or if it has been
3119                  * deallocated.
3120                  */
3121                 m = _pmap_allocpte(pmap, ptepindex, lockp);
3122                 if (m == NULL && lockp != NULL)
3123                         goto retry;
3124         }
3125         return (m);
3126 }
3127
3128
3129 /***************************************************
3130  * Pmap allocation/deallocation routines.
3131  ***************************************************/
3132
3133 /*
3134  * Release any resources held by the given physical map.
3135  * Called when a pmap initialized by pmap_pinit is being released.
3136  * Should only be called if the map contains no valid mappings.
3137  */
3138 void
3139 pmap_release(pmap_t pmap)
3140 {
3141         vm_page_t m;
3142         int i;
3143
3144         KASSERT(pmap->pm_stats.resident_count == 0,
3145             ("pmap_release: pmap resident count %ld != 0",
3146             pmap->pm_stats.resident_count));
3147         KASSERT(vm_radix_is_empty(&pmap->pm_root),
3148             ("pmap_release: pmap has reserved page table page(s)"));
3149         KASSERT(CPU_EMPTY(&pmap->pm_active),
3150             ("releasing active pmap %p", pmap));
3151
3152         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
3153
3154         for (i = 0; i < NKPML4E; i++)   /* KVA */
3155                 pmap->pm_pml4[KPML4BASE + i] = 0;
3156         for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
3157                 pmap->pm_pml4[DMPML4I + i] = 0;
3158         pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
3159
3160         vm_page_unwire_noq(m);
3161         vm_page_free_zero(m);
3162
3163         if (pmap->pm_pml4u != NULL) {
3164                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
3165                 vm_page_unwire_noq(m);
3166                 vm_page_free(m);
3167         }
3168 }
3169
3170 static int
3171 kvm_size(SYSCTL_HANDLER_ARGS)
3172 {
3173         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
3174
3175         return sysctl_handle_long(oidp, &ksize, 0, req);
3176 }
3177 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
3178     0, 0, kvm_size, "LU", "Size of KVM");
3179
3180 static int
3181 kvm_free(SYSCTL_HANDLER_ARGS)
3182 {
3183         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
3184
3185         return sysctl_handle_long(oidp, &kfree, 0, req);
3186 }
3187 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
3188     0, 0, kvm_free, "LU", "Amount of KVM free");
3189
3190 /*
3191  * grow the number of kernel page table entries, if needed
3192  */
3193 void
3194 pmap_growkernel(vm_offset_t addr)
3195 {
3196         vm_paddr_t paddr;
3197         vm_page_t nkpg;
3198         pd_entry_t *pde, newpdir;
3199         pdp_entry_t *pdpe;
3200
3201         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
3202
3203         /*
3204          * Return if "addr" is within the range of kernel page table pages
3205          * that were preallocated during pmap bootstrap.  Moreover, leave
3206          * "kernel_vm_end" and the kernel page table as they were.
3207          *
3208          * The correctness of this action is based on the following
3209          * argument: vm_map_insert() allocates contiguous ranges of the
3210          * kernel virtual address space.  It calls this function if a range
3211          * ends after "kernel_vm_end".  If the kernel is mapped between
3212          * "kernel_vm_end" and "addr", then the range cannot begin at
3213          * "kernel_vm_end".  In fact, its beginning address cannot be less
3214          * than the kernel.  Thus, there is no immediate need to allocate
3215          * any new kernel page table pages between "kernel_vm_end" and
3216          * "KERNBASE".
3217          */
3218         if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
3219                 return;
3220
3221         addr = roundup2(addr, NBPDR);
3222         if (addr - 1 >= vm_map_max(kernel_map))
3223                 addr = vm_map_max(kernel_map);
3224         while (kernel_vm_end < addr) {
3225                 pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
3226                 if ((*pdpe & X86_PG_V) == 0) {
3227                         /* We need a new PDP entry */
3228                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
3229                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3230                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3231                         if (nkpg == NULL)
3232                                 panic("pmap_growkernel: no memory to grow kernel");
3233                         if ((nkpg->flags & PG_ZERO) == 0)
3234                                 pmap_zero_page(nkpg);
3235                         paddr = VM_PAGE_TO_PHYS(nkpg);
3236                         *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
3237                             X86_PG_A | X86_PG_M);
3238                         continue; /* try again */
3239                 }
3240                 pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
3241                 if ((*pde & X86_PG_V) != 0) {
3242                         kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3243                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3244                                 kernel_vm_end = vm_map_max(kernel_map);
3245                                 break;
3246                         }
3247                         continue;
3248                 }
3249
3250                 nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
3251                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3252                     VM_ALLOC_ZERO);
3253                 if (nkpg == NULL)
3254                         panic("pmap_growkernel: no memory to grow kernel");
3255                 if ((nkpg->flags & PG_ZERO) == 0)
3256                         pmap_zero_page(nkpg);
3257                 paddr = VM_PAGE_TO_PHYS(nkpg);
3258                 newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
3259                 pde_store(pde, newpdir);
3260
3261                 kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
3262                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3263                         kernel_vm_end = vm_map_max(kernel_map);
3264                         break;
3265                 }
3266         }
3267 }
3268
3269
3270 /***************************************************
3271  * page management routines.
3272  ***************************************************/
3273
/*
 * Compile-time checks on the pv chunk layout that the code below relies on:
 * a chunk occupies exactly one page, its free bitmap has _NPCM (3) 64-bit
 * words, and it holds _NPCPV (168) pv entries.
 */
3274 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
3275 CTASSERT(_NPCM == 3);
3276 CTASSERT(_NPCPV == 168);
3277
3278 static __inline struct pv_chunk *
3279 pv_to_chunk(pv_entry_t pv)
3280 {
3281
3282         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
3283 }
3284
/* Yields the pc_pmap field of the pv chunk that contains the given pv entry. */
3285 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
3286
/*
 * Values of the three pc_map[] words when every pv entry in a chunk is free
 * (a set bit marks a free entry).  The words provide 64 + 64 + 40 = 168
 * usable bits, which matches _NPCPV; the upper 24 bits of the last word are
 * never used.
 */
3287 #define PC_FREE0        0xfffffffffffffffful
3288 #define PC_FREE1        0xfffffffffffffffful
3289 #define PC_FREE2        0x000000fffffffffful
3290
/* The same masks, indexed by bitmap word, for use in loops over pc_map[]. */
3291 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
3292
#ifdef PV_STATS
/*
 * Optional pv entry statistics, exported read-only under the vm.pmap sysctl
 * node.  pc_chunk_count, pv_entry_count and pv_entry_spare go up and down
 * with allocation and release; the "allocs", "frees" and "tryfail" counters
 * are only ever incremented, so they are described as totals.
 */
static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;

SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
        "Current number of pv entry chunks");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
        "Total number of pv entry chunks allocated");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
        "Total number of pv entry chunks freed");
SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
        "Number of times tried to get a chunk page but failed.");

static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
static int pv_entry_spare;

SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
        "Total number of pv entries freed");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
        "Total number of pv entries allocated");
SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
        "Current number of pv entries");
SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
        "Current number of spare pv entries");
#endif
3317
3318 static void
3319 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
3320 {
3321
3322         if (pmap == NULL)
3323                 return;
3324         pmap_invalidate_all(pmap);
3325         if (pmap != locked_pmap)
3326                 PMAP_UNLOCK(pmap);
3327         if (start_di)
3328                 pmap_delayed_invl_finished();
3329 }
3330
3331 /*
3332  * We are in a serious low memory condition.  Resort to
3333  * drastic measures to free some pages so we can allocate
3334  * another pv entry chunk.
3335  *
3336  * Returns NULL if PV entries were reclaimed from the specified pmap.
3337  *
3338  * We do not, however, unmap 2mpages because subsequent accesses will
3339  * allocate per-page pv entries until repromotion occurs, thereby
3340  * exacerbating the shortage of free pv entries.
3341  */
3342 static vm_page_t
3343 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3344 {
3345         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3346         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3347         struct md_page *pvh;
3348         pd_entry_t *pde;
3349         pmap_t next_pmap, pmap;
3350         pt_entry_t *pte, tpte;
3351         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
3352         pv_entry_t pv;
3353         vm_offset_t va;
3354         vm_page_t m, m_pc;
3355         struct spglist free;
3356         uint64_t inuse;
3357         int bit, field, freed;
3358         bool start_di;
3359         static int active_reclaims = 0;
3360
3361         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3362         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3363         pmap = NULL;
3364         m_pc = NULL;
3365         PG_G = PG_A = PG_M = PG_RW = 0;
3366         SLIST_INIT(&free);
3367         bzero(&pc_marker_b, sizeof(pc_marker_b));
3368         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3369         pc_marker = (struct pv_chunk *)&pc_marker_b;
3370         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3371
3372         /*
3373          * A delayed invalidation block should already be active if
3374          * pmap_advise() or pmap_remove() called this function by way
3375          * of pmap_demote_pde_locked().
3376          */
3377         start_di = pmap_not_in_di();
3378
3379         mtx_lock(&pv_chunks_mutex);
3380         active_reclaims++;
3381         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
3382         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
3383         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3384             SLIST_EMPTY(&free)) {
3385                 next_pmap = pc->pc_pmap;
3386                 if (next_pmap == NULL) {
3387                         /*
3388                          * The next chunk is a marker.  However, it is
3389                          * not our marker, so active_reclaims must be
3390                          * > 1.  Consequently, the next_chunk code
3391                          * will not rotate the pv_chunks list.
3392                          */
3393                         goto next_chunk;
3394                 }
3395                 mtx_unlock(&pv_chunks_mutex);
3396
3397                 /*
3398                  * A pv_chunk can only be removed from the pc_lru list
3399                  * when both pc_chunks_mutex is owned and the
3400                  * corresponding pmap is locked.
3401                  */
3402                 if (pmap != next_pmap) {
3403                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
3404                             start_di);
3405                         pmap = next_pmap;
3406                         /* Avoid deadlock and lock recursion. */
3407                         if (pmap > locked_pmap) {
3408                                 RELEASE_PV_LIST_LOCK(lockp);
3409                                 PMAP_LOCK(pmap);
3410                                 if (start_di)
3411                                         pmap_delayed_invl_started();
3412                                 mtx_lock(&pv_chunks_mutex);
3413                                 continue;
3414                         } else if (pmap != locked_pmap) {
3415                                 if (PMAP_TRYLOCK(pmap)) {
3416                                         if (start_di)
3417                                                 pmap_delayed_invl_started();
3418                                         mtx_lock(&pv_chunks_mutex);
3419                                         continue;
3420                                 } else {
3421                                         pmap = NULL; /* pmap is not locked */
3422                                         mtx_lock(&pv_chunks_mutex);
3423                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
3424                                         if (pc == NULL ||
3425                                             pc->pc_pmap != next_pmap)
3426                                                 continue;
3427                                         goto next_chunk;
3428                                 }
3429                         } else if (start_di)
3430                                 pmap_delayed_invl_started();
3431                         PG_G = pmap_global_bit(pmap);
3432                         PG_A = pmap_accessed_bit(pmap);
3433                         PG_M = pmap_modified_bit(pmap);
3434                         PG_RW = pmap_rw_bit(pmap);
3435                 }
3436
3437                 /*
3438                  * Destroy every non-wired, 4 KB page mapping in the chunk.
3439                  */
3440                 freed = 0;
3441                 for (field = 0; field < _NPCM; field++) {
3442                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3443                             inuse != 0; inuse &= ~(1UL << bit)) {
3444                                 bit = bsfq(inuse);
3445                                 pv = &pc->pc_pventry[field * 64 + bit];
3446                                 va = pv->pv_va;
3447                                 pde = pmap_pde(pmap, va);
3448                                 if ((*pde & PG_PS) != 0)
3449                                         continue;
3450                                 pte = pmap_pde_to_pte(pde, va);
3451                                 if ((*pte & PG_W) != 0)
3452                                         continue;
3453                                 tpte = pte_load_clear(pte);
3454                                 if ((tpte & PG_G) != 0)
3455                                         pmap_invalidate_page(pmap, va);
3456                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3457                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3458                                         vm_page_dirty(m);
3459                                 if ((tpte & PG_A) != 0)
3460                                         vm_page_aflag_set(m, PGA_REFERENCED);
3461                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3462                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3463                                 m->md.pv_gen++;
3464                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
3465                                     (m->flags & PG_FICTITIOUS) == 0) {
3466                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3467                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
3468                                                 vm_page_aflag_clear(m,
3469                                                     PGA_WRITEABLE);
3470                                         }
3471                                 }
3472                                 pmap_delayed_invl_page(m);
3473                                 pc->pc_map[field] |= 1UL << bit;
3474                                 pmap_unuse_pt(pmap, va, *pde, &free);
3475                                 freed++;
3476                         }
3477                 }
3478                 if (freed == 0) {
3479                         mtx_lock(&pv_chunks_mutex);
3480                         goto next_chunk;
3481                 }
3482                 /* Every freed mapping is for a 4 KB page. */
3483                 pmap_resident_count_dec(pmap, freed);
3484                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3485                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3486                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3487                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3488                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
3489                     pc->pc_map[2] == PC_FREE2) {
3490                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3491                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3492                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3493                         /* Entire chunk is free; return it. */
3494                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3495                         dump_drop_page(m_pc->phys_addr);
3496                         mtx_lock(&pv_chunks_mutex);
3497                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3498                         break;
3499                 }
3500                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3501                 mtx_lock(&pv_chunks_mutex);
3502                 /* One freed pv entry in locked_pmap is sufficient. */
3503                 if (pmap == locked_pmap)
3504                         break;
3505 next_chunk:
3506                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3507                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
3508                 if (active_reclaims == 1 && pmap != NULL) {
3509                         /*
3510                          * Rotate the pv chunks list so that we do not
3511                          * scan the same pv chunks that could not be
3512                          * freed (because they contained a wired
3513                          * and/or superpage mapping) on every
3514                          * invocation of reclaim_pv_chunk().
3515                          */
3516                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
3517                                 MPASS(pc->pc_pmap != NULL);
3518                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3519                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3520                         }
3521                 }
3522         }
3523         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
3524         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
3525         active_reclaims--;
3526         mtx_unlock(&pv_chunks_mutex);
3527         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
3528         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3529                 m_pc = SLIST_FIRST(&free);
3530                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3531                 /* Recycle a freed page table page. */
3532                 m_pc->wire_count = 1;
3533         }
3534         vm_page_free_pages_toq(&free, true);
3535         return (m_pc);
3536 }
3537
3538 /*
3539  * free the pv_entry back to the free list
3540  */
3541 static void
3542 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3543 {
3544         struct pv_chunk *pc;
3545         int idx, field, bit;
3546
3547         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3548         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3549         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3550         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3551         pc = pv_to_chunk(pv);
3552         idx = pv - &pc->pc_pventry[0];
3553         field = idx / 64;
3554         bit = idx % 64;
3555         pc->pc_map[field] |= 1ul << bit;
3556         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
3557             pc->pc_map[2] != PC_FREE2) {
3558                 /* 98% of the time, pc is already at the head of the list. */
3559                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3560                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3561                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3562                 }
3563                 return;
3564         }
3565         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3566         free_pv_chunk(pc);
3567 }
3568
3569 static void
3570 free_pv_chunk(struct pv_chunk *pc)
3571 {
3572         vm_page_t m;
3573
3574         mtx_lock(&pv_chunks_mutex);
3575         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
3576         mtx_unlock(&pv_chunks_mutex);
3577         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3578         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3579         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3580         /* entire chunk is free, return it */
3581         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3582         dump_drop_page(m->phys_addr);
3583         vm_page_unwire(m, PQ_NONE);
3584         vm_page_free(m);
3585 }
3586
3587 /*
3588  * Returns a new PV entry, allocating a new PV chunk from the system when
3589  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3590  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3591  * returned.
3592  *
3593  * The given PV list lock may be released.
3594  */
3595 static pv_entry_t
3596 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3597 {
3598         int bit, field;
3599         pv_entry_t pv;
3600         struct pv_chunk *pc;
3601         vm_page_t m;
3602
3603         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3604         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3605 retry:
3606         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3607         if (pc != NULL) {
3608                 for (field = 0; field < _NPCM; field++) {
3609                         if (pc->pc_map[field]) {
3610                                 bit = bsfq(pc->pc_map[field]);
3611                                 break;
3612                         }
3613                 }
3614                 if (field < _NPCM) {
3615                         pv = &pc->pc_pventry[field * 64 + bit];
3616                         pc->pc_map[field] &= ~(1ul << bit);
3617                         /* If this was the last item, move it to tail */
3618                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
3619                             pc->pc_map[2] == 0) {
3620                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3621                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3622                                     pc_list);
3623                         }
3624                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
3625                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3626                         return (pv);
3627                 }
3628         }
3629         /* No free items, allocate another chunk */
3630         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3631             VM_ALLOC_WIRED);
3632         if (m == NULL) {
3633                 if (lockp == NULL) {
3634                         PV_STAT(pc_chunk_tryfail++);
3635                         return (NULL);
3636                 }
3637                 m = reclaim_pv_chunk(pmap, lockp);
3638                 if (m == NULL)
3639                         goto retry;
3640         }
3641         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3642         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3643         dump_add_page(m->phys_addr);
3644         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3645         pc->pc_pmap = pmap;
3646         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
3647         pc->pc_map[1] = PC_FREE1;
3648         pc->pc_map[2] = PC_FREE2;
3649         mtx_lock(&pv_chunks_mutex);
3650         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
3651         mtx_unlock(&pv_chunks_mutex);
3652         pv = &pc->pc_pventry[0];
3653         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3654         PV_STAT(atomic_add_long(&pv_entry_count, 1));
3655         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3656         return (pv);
3657 }
3658
3659 /*
3660  * Returns the number of one bits within the given PV chunk map.
      *
      * "map" must point to at least three 64-bit words; map[0], map[1] and
      * map[2] are read.  The POPCNT instruction is used unconditionally, so
      * the caller is responsible for only calling this on CPUs that have it.
3661  *
3662  * The errata for Intel processors state that "POPCNT Instruction May
3663  * Take Longer to Execute Than Expected".  It is believed that the
3664  * issue is the spurious dependency on the destination register.
3665  * Provide a hint to the register rename logic that the destination
3666  * value is overwritten, by clearing it, as suggested in the
3667  * optimization manual.  It should be cheap for unaffected processors
3668  * as well.
3669  *
3670  * Reference numbers for the errata are
3671  * 4th Gen Core: HSD146
3672  * 5th Gen Core: BDM85
3673  * 6th Gen Core: SKL029
3674  */
3675 static int
3676 popcnt_pc_map_pq(uint64_t *map)
3677 {
3678         u_long result, tmp;
3679
             /*
              * Each popcntq is preceded by an xorl that clears its destination
              * register.  The count of map[0] goes into "result"; the counts
              * of map[1] and map[2] go through "tmp" and are added to it.
              * Both outputs are early-clobber ("=&r") operands.
              */
3680         __asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
3681             "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
3682             "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
3683             : "=&r" (result), "=&r" (tmp)
3684             : "m" (map[0]), "m" (map[1]), "m" (map[2]));
3685         return (result);
3686 }
3687
3688 /*
3689  * Ensure that the number of spare PV entries in the specified pmap meets or
3690  * exceeds the given count, "needed".
3691  *
      * Spare entries already present in the pmap's chunks are counted first;
      * if they do not suffice, whole new chunks are allocated, falling back to
      * reclaim_pv_chunk() when page allocation fails.
      *
      * The caller must hold the pmap lock, and lockp must not be NULL.
      *
3692  * The given PV list lock may be released.
3693  */
3694 static void
3695 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3696 {
3697         struct pch new_tail;
3698         struct pv_chunk *pc;
3699         vm_page_t m;
3700         int avail, free;
3701         bool reclaimed;
3702
3703         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3704         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3705
3706         /*
3707          * Newly allocated PV chunks must be stored in a private list until
3708          * the required number of PV chunks have been allocated.  Otherwise,
3709          * reclaim_pv_chunk() could recycle one of these chunks.  In
3710          * contrast, these chunks must be added to the pmap upon allocation.
3711          */
3712         TAILQ_INIT(&new_tail);
3713 retry:
             /*
              * Count the free entries in the pmap's chunks, starting at the
              * head of the list.  The scan stops at the first chunk without
              * free entries (get_pv_entry() moves exhausted chunks to the
              * tail) or as soon as enough entries have been found.
              */
3714         avail = 0;
3715         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3716 #ifndef __POPCNT__
                     /*
                      * Not compiled for POPCNT: check at run time and use
                      * bit_count() when the CPU feature bit is clear.
                      */
3717                 if ((cpu_feature2 & CPUID2_POPCNT) == 0)
3718                         bit_count((bitstr_t *)pc->pc_map, 0,
3719                             sizeof(pc->pc_map) * NBBY, &free);
3720                 else
3721 #endif
3722                 free = popcnt_pc_map_pq(pc->pc_map);
3723                 if (free == 0)
3724                         break;
3725                 avail += free;
3726                 if (avail >= needed)
3727                         break;
3728         }
             /*
              * Allocate entirely free chunks until the shortfall is covered;
              * each new chunk contributes _NPCPV spare entries.
              */
3729         for (reclaimed = false; avail < needed; avail += _NPCPV) {
3730                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
3731                     VM_ALLOC_WIRED);
3732                 if (m == NULL) {
3733                         m = reclaim_pv_chunk(pmap, lockp);
                             /*
                              * No page was handed back; recount, since the
                              * reclaim may have freed entries in this pmap.
                              */
3734                         if (m == NULL)
3735                                 goto retry;
3736                         reclaimed = true;
3737                 }
3738                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3739                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3740                 dump_add_page(m->phys_addr);
3741                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3742                 pc->pc_pmap = pmap;
3743                 pc->pc_map[0] = PC_FREE0;
3744                 pc->pc_map[1] = PC_FREE1;
3745                 pc->pc_map[2] = PC_FREE2;
                     /*
                      * Visible to this pmap immediately, but kept off the
                      * global pv_chunks list until the end (see above).
                      */
3746                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3747                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
3748                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3749
3750                 /*
3751                  * The reclaim might have freed a chunk from the current pmap.
3752                  * If that chunk contained available entries, we need to
3753                  * re-count the number of available entries.
3754                  */
3755                 if (reclaimed)
3756                         goto retry;
3757         }
             /* Publish the privately held chunks on the global LRU list. */
3758         if (!TAILQ_EMPTY(&new_tail)) {
3759                 mtx_lock(&pv_chunks_mutex);
3760                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
3761                 mtx_unlock(&pv_chunks_mutex);
3762         }
3763 }
3764
3765 /*
3766  * First find and then remove the pv entry for the specified pmap and virtual
3767  * address from the specified pv list.  Returns the pv entry if found and NULL
3768  * otherwise.  This operation can be performed on pv lists for either 4KB or
3769  * 2MB page mappings.
3770  */
3771 static __inline pv_entry_t
3772 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3773 {
3774         pv_entry_t pv;
3775
3776         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3777                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3778                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3779                         pvh->pv_gen++;
3780                         break;
3781                 }
3782         }
3783         return (pv);
3784 }
3785
3786 /*
3787  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3788  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3789  * entries for each of the 4KB page mappings.
      *
      * "va" may be any address within the 2MB mapping; "pa" must be 2MB
      * aligned.  The caller must hold the pmap lock.  This function never
      * allocates pv entries: the NPTEPG - 1 additional entries are taken
      * from spare entries already present in the pmap's chunks, and the
      * "missing spare" assertion fires if there are none.
      * NOTE(review): presumably the caller reserved them beforehand with
      * reserve_pv_entries() -- confirm against the callers.
3790  */
3791 static void
3792 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3793     struct rwlock **lockp)
3794 {
3795         struct md_page *pvh;
3796         struct pv_chunk *pc;
3797         pv_entry_t pv;
3798         vm_offset_t va_last;
3799         vm_page_t m;
3800         int bit, field;
3801
3802         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3803         KASSERT((pa & PDRMASK) == 0,
3804             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
3805         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3806
3807         /*
3808          * Transfer the 2mpage's pv entry for this mapping to the first
3809          * page's pv list.  Once this transfer begins, the pv list lock
3810          * must not be released until the last pv entry is reinstantiated.
3811          */
3812         pvh = pa_to_pvh(pa);
3813         va = trunc_2mpage(va);
3814         pv = pmap_pvh_remove(pvh, pmap, va);
3815         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
3816         m = PHYS_TO_VM_PAGE(pa);
3817         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3818         m->md.pv_gen++;
3819         /* Instantiate the remaining NPTEPG - 1 pv entries. */
3820         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
3821         va_last = va + NBPDR - PAGE_SIZE;
             /*
              * Drain free entries from the chunk at the head of the pmap's
              * list, advancing va and m by one 4KB page per entry.  A chunk
              * that has been emptied is moved to the tail so that the next
              * chunk becomes the head.
              */
3822         for (;;) {
3823                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3824                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
3825                     pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
3826                 for (field = 0; field < _NPCM; field++) {
3827                         while (pc->pc_map[field]) {
3828                                 bit = bsfq(pc->pc_map[field]);
3829                                 pc->pc_map[field] &= ~(1ul << bit);
3830                                 pv = &pc->pc_pventry[field * 64 + bit];
3831                                 va += PAGE_SIZE;
3832                                 pv->pv_va = va;
3833                                 m++;
3834                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3835                             ("pmap_pv_demote_pde: page %p is not managed", m));
3836                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3837                                 m->md.pv_gen++;
3838                                 if (va == va_last)
3839                                         goto out;
3840                         }
3841                 }
3842                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3843                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3844         }
3845 out:
             /*
              * If the last chunk used was left without free entries, move it
              * to the tail as well.
              */
3846         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
3847                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3848                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3849         }
3850         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
3851         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
3852 }
3853
#if VM_NRESERVLEVEL > 0
/*
 * Called after 512 4KB page mappings have been promoted to one 2MB page
 * mapping: the pv entry of the first 4KB page is moved to the 2MB page's pv
 * list, and the pv entries of the other NPTEPG - 1 pages are freed.
 *
 * "va" may be any address within the 2MB mapping; "pa" must be 2MB aligned.
 */
static void
pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
    struct rwlock **lockp)
{
        struct md_page *pvh;
        pv_entry_t pv;
        vm_offset_t va_last;
        vm_page_t m;

        KASSERT((pa & PDRMASK) == 0,
            ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
        CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);

        /*
         * Reuse the first page's pv entry for the 2mpage rather than
         * allocating one.  Besides being cheaper than get_pv_entry(), this
         * rules out get_pv_entry() calling reclaim_pv_chunk(), which could
         * remove one of the mappings that is being promoted.
         */
        m = PHYS_TO_VM_PAGE(pa);
        va = trunc_2mpage(va);
        pv = pmap_pvh_remove(&m->md, pmap, va);
        KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
        pvh = pa_to_pvh(pa);
        TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
        pvh->pv_gen++;

        /* Release the pv entries of the remaining NPTEPG - 1 pages. */
        va_last = va + NBPDR - PAGE_SIZE;
        for (;;) {
                m++;
                va += PAGE_SIZE;
                pmap_pvh_free(&m->md, pmap, va);
                if (va >= va_last)
                        break;
        }
}
#endif /* VM_NRESERVLEVEL > 0 */
3896
3897 /*
3898  * First find and then destroy the pv entry for the specified pmap and virtual
3899  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3900  * page mappings.
3901  */
3902 static void
3903 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3904 {
3905         pv_entry_t pv;
3906
3907         pv = pmap_pvh_remove(pvh, pmap, va);
3908         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3909         free_pv_entry(pmap, pv);
3910 }
3911
3912 /*
3913  * Conditionally create the PV entry for a 4KB page mapping if the required
3914  * memory can be allocated without resorting to reclamation.
3915  */
3916 static boolean_t
3917 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3918     struct rwlock **lockp)
3919 {
3920         pv_entry_t pv;
3921
3922         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3923         /* Pass NULL instead of the lock pointer to disable reclamation. */
3924         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3925                 pv->pv_va = va;
3926                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3927                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3928                 m->md.pv_gen++;
3929                 return (TRUE);
3930         } else
3931                 return (FALSE);
3932 }
3933
3934 /*
3935  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3936  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3937  * false if the PV entry cannot be allocated without resorting to reclamation.
3938  */
3939 static bool
3940 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
3941     struct rwlock **lockp)
3942 {
3943         struct md_page *pvh;
3944         pv_entry_t pv;
3945         vm_paddr_t pa;
3946
3947         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3948         /* Pass NULL instead of the lock pointer to disable reclamation. */
3949         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3950             NULL : lockp)) == NULL)
3951                 return (false);
3952         pv->pv_va = va;
3953         pa = pde & PG_PS_FRAME;
3954         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3955         pvh = pa_to_pvh(pa);
3956         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3957         pvh->pv_gen++;
3958         return (true);
3959 }
3960
3961 /*
3962  * Fills a page table page with mappings to consecutive physical pages.
3963  */
3964 static void
3965 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
3966 {
3967         pt_entry_t *pte;
3968
3969         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
3970                 *pte = newpte;
3971                 newpte += PAGE_SIZE;
3972         }
3973 }
3974
3975 /*
3976  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
3977  * mapping is invalidated.
3978  */
3979 static boolean_t
3980 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
3981 {
3982         struct rwlock *lock;
3983         boolean_t rv;
3984
3985         lock = NULL;
3986         rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
3987         if (lock != NULL)
3988                 rw_wunlock(lock);
3989         return (rv);
3990 }
3991
3992 static boolean_t
3993 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
3994     struct rwlock **lockp)
3995 {
3996         pd_entry_t newpde, oldpde;
3997         pt_entry_t *firstpte, newpte;
3998         pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
3999         vm_paddr_t mptepa;
4000         vm_page_t mpte;
4001         struct spglist free;
4002         vm_offset_t sva;
4003         int PG_PTE_CACHE;
4004
4005         PG_G = pmap_global_bit(pmap);
4006         PG_A = pmap_accessed_bit(pmap);
4007         PG_M = pmap_modified_bit(pmap);
4008         PG_RW = pmap_rw_bit(pmap);
4009         PG_V = pmap_valid_bit(pmap);
4010         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4011
4012         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4013         oldpde = *pde;
4014         KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
4015             ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
4016         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4017             NULL) {
4018                 KASSERT((oldpde & PG_W) == 0,
4019                     ("pmap_demote_pde: page table page for a wired mapping"
4020                     " is missing"));
4021
4022                 /*
4023                  * Invalidate the 2MB page mapping and return "failure" if the
4024                  * mapping was never accessed or the allocation of the new
4025                  * page table page fails.  If the 2MB page mapping belongs to
4026                  * the direct map region of the kernel's address space, then
4027                  * the page allocation request specifies the highest possible
4028                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4029                  * normal.  Page table pages are preallocated for every other
4030                  * part of the kernel address space, so the direct map region
4031                  * is the only part of the kernel address space that must be
4032                  * handled here.
4033                  */
4034                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
4035                     pmap_pde_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
4036                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4037                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4038                         SLIST_INIT(&free);
4039                         sva = trunc_2mpage(va);
4040                         pmap_remove_pde(pmap, pde, sva, &free, lockp);
4041                         if ((oldpde & PG_G) == 0)
4042                                 pmap_invalidate_pde_page(pmap, sva, oldpde);
4043                         vm_page_free_pages_toq(&free, true);
4044                         CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx"
4045                             " in pmap %p", va, pmap);
4046                         return (FALSE);
4047                 }
4048                 if (va < VM_MAXUSER_ADDRESS)
4049                         pmap_resident_count_inc(pmap, 1);
4050         }
4051         mptepa = VM_PAGE_TO_PHYS(mpte);
4052         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4053         newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
4054         KASSERT((oldpde & PG_A) != 0,
4055             ("pmap_demote_pde: oldpde is missing PG_A"));
4056         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4057             ("pmap_demote_pde: oldpde is missing PG_M"));
4058         newpte = oldpde & ~PG_PS;
4059         newpte = pmap_swap_pat(pmap, newpte);
4060
4061         /*
4062          * If the page table page is new, initialize it.
4063          */
4064         if (mpte->wire_count == 1) {
4065                 mpte->wire_count = NPTEPG;
4066                 pmap_fill_ptp(firstpte, newpte);
4067         }
4068         KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
4069             ("pmap_demote_pde: firstpte and newpte map different physical"
4070             " addresses"));
4071
4072         /*
4073          * If the mapping has changed attributes, update the page table
4074          * entries.
4075          */
4076         if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
4077                 pmap_fill_ptp(firstpte, newpte);
4078
4079         /*
4080          * The spare PV entries must be reserved prior to demoting the
4081          * mapping, that is, prior to changing the PDE.  Otherwise, the state
4082          * of the PDE and the PV lists will be inconsistent, which can result
4083          * in reclaim_pv_chunk() attempting to remove a PV entry from the
4084          * wrong PV list and pmap_pv_demote_pde() failing to find the expected
4085          * PV entry for the 2MB page mapping that is being demoted.
4086          */
4087         if ((oldpde & PG_MANAGED) != 0)
4088                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4089
4090         /*
4091          * Demote the mapping.  This pmap is locked.  The old PDE has
4092          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
4093          * set.  Thus, there is no danger of a race with another
4094          * processor changing the setting of PG_A and/or PG_M between
4095          * the read above and the store below.
4096          */
4097         if (workaround_erratum383)
4098                 pmap_update_pde(pmap, va, pde, newpde);
4099         else
4100                 pde_store(pde, newpde);
4101
4102         /*
4103          * Invalidate a stale recursive mapping of the page table page.
4104          */
4105         if (va >= VM_MAXUSER_ADDRESS)
4106                 pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4107
4108         /*
4109          * Demote the PV entry.
4110          */
4111         if ((oldpde & PG_MANAGED) != 0)
4112                 pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
4113
4114         atomic_add_long(&pmap_pde_demotions, 1);
4115         CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx"
4116             " in pmap %p", va, pmap);
4117         return (TRUE);
4118 }
4119
4120 /*
4121  * pmap_remove_kernel_pde: Remove a kernel superpage mapping.
4122  */
4123 static void
4124 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
4125 {
4126         pd_entry_t newpde;
4127         vm_paddr_t mptepa;
4128         vm_page_t mpte;
4129
4130         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4131         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4132         mpte = pmap_remove_pt_page(pmap, va);
4133         if (mpte == NULL)
4134                 panic("pmap_remove_kernel_pde: Missing pt page.");
4135
4136         mptepa = VM_PAGE_TO_PHYS(mpte);
4137         newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
4138
4139         /*
4140          * Initialize the page table page.
4141          */
4142         pagezero((void *)PHYS_TO_DMAP(mptepa));
4143
4144         /*
4145          * Demote the mapping.
4146          */
4147         if (workaround_erratum383)
4148                 pmap_update_pde(pmap, va, pde, newpde);
4149         else
4150                 pde_store(pde, newpde);
4151
4152         /*
4153          * Invalidate a stale recursive mapping of the page table page.
4154          */
4155         pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
4156 }
4157
4158 /*
4159  * pmap_remove_pde: do the things to unmap a superpage in a process
4160  */
4161 static int
4162 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
4163     struct spglist *free, struct rwlock **lockp)
4164 {
4165         struct md_page *pvh;
4166         pd_entry_t oldpde;
4167         vm_offset_t eva, va;
4168         vm_page_t m, mpte;
4169         pt_entry_t PG_G, PG_A, PG_M, PG_RW;
4170
4171         PG_G = pmap_global_bit(pmap);
4172         PG_A = pmap_accessed_bit(pmap);
4173         PG_M = pmap_modified_bit(pmap);
4174         PG_RW = pmap_rw_bit(pmap);
4175
4176         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4177         KASSERT((sva & PDRMASK) == 0,
4178             ("pmap_remove_pde: sva is not 2mpage aligned"));
4179         oldpde = pte_load_clear(pdq);
4180         if (oldpde & PG_W)
4181                 pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
4182         if ((oldpde & PG_G) != 0)
4183                 pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4184         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
4185         if (oldpde & PG_MANAGED) {
4186                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
4187                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
4188                 pmap_pvh_free(pvh, pmap, sva);
4189                 eva = sva + NBPDR;
4190                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4191                     va < eva; va += PAGE_SIZE, m++) {
4192                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4193                                 vm_page_dirty(m);
4194                         if (oldpde & PG_A)
4195                                 vm_page_aflag_set(m, PGA_REFERENCED);
4196                         if (TAILQ_EMPTY(&m->md.pv_list) &&
4197                             TAILQ_EMPTY(&pvh->pv_list))
4198                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4199                         pmap_delayed_invl_page(m);
4200                 }
4201         }
4202         if (pmap == kernel_pmap) {
4203                 pmap_remove_kernel_pde(pmap, pdq, sva);
4204         } else {
4205                 mpte = pmap_remove_pt_page(pmap, sva);
4206                 if (mpte != NULL) {
4207                         pmap_resident_count_dec(pmap, 1);
4208                         KASSERT(mpte->wire_count == NPTEPG,
4209                             ("pmap_remove_pde: pte page wire count error"));
4210                         mpte->wire_count = 0;
4211                         pmap_add_delayed_free_list(mpte, free, FALSE);
4212                 }
4213         }
4214         return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
4215 }
4216
4217 /*
4218  * pmap_remove_pte: do the things to unmap a page in a process
4219  */
4220 static int
4221 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
4222     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
4223 {
4224         struct md_page *pvh;
4225         pt_entry_t oldpte, PG_A, PG_M, PG_RW;
4226         vm_page_t m;
4227
4228         PG_A = pmap_accessed_bit(pmap);
4229         PG_M = pmap_modified_bit(pmap);
4230         PG_RW = pmap_rw_bit(pmap);
4231
4232         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4233         oldpte = pte_load_clear(ptq);
4234         if (oldpte & PG_W)
4235                 pmap->pm_stats.wired_count -= 1;
4236         pmap_resident_count_dec(pmap, 1);
4237         if (oldpte & PG_MANAGED) {
4238                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
4239                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4240                         vm_page_dirty(m);
4241                 if (oldpte & PG_A)
4242                         vm_page_aflag_set(m, PGA_REFERENCED);
4243                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
4244                 pmap_pvh_free(&m->md, pmap, va);
4245                 if (TAILQ_EMPTY(&m->md.pv_list) &&
4246                     (m->flags & PG_FICTITIOUS) == 0) {
4247                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4248                         if (TAILQ_EMPTY(&pvh->pv_list))
4249                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4250                 }
4251                 pmap_delayed_invl_page(m);
4252         }
4253         return (pmap_unuse_pt(pmap, va, ptepde, free));
4254 }
4255
4256 /*
4257  * Remove a single page from a process address space
4258  */
4259 static void
4260 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
4261     struct spglist *free)
4262 {
4263         struct rwlock *lock;
4264         pt_entry_t *pte, PG_V;
4265
4266         PG_V = pmap_valid_bit(pmap);
4267         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4268         if ((*pde & PG_V) == 0)
4269                 return;
4270         pte = pmap_pde_to_pte(pde, va);
4271         if ((*pte & PG_V) == 0)
4272                 return;
4273         lock = NULL;
4274         pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
4275         if (lock != NULL)
4276                 rw_wunlock(lock);
4277         pmap_invalidate_page(pmap, va);
4278 }
4279
4280 /*
4281  * Removes the specified range of addresses from the page table page.
4282  */
4283 static bool
4284 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4285     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
4286 {
4287         pt_entry_t PG_G, *pte;
4288         vm_offset_t va;
4289         bool anyvalid;
4290
4291         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4292         PG_G = pmap_global_bit(pmap);
4293         anyvalid = false;
4294         va = eva;
4295         for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
4296             sva += PAGE_SIZE) {
4297                 if (*pte == 0) {
4298                         if (va != eva) {
4299                                 pmap_invalidate_range(pmap, va, sva);
4300                                 va = eva;
4301                         }
4302                         continue;
4303                 }
4304                 if ((*pte & PG_G) == 0)
4305                         anyvalid = true;
4306                 else if (va == eva)
4307                         va = sva;
4308                 if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
4309                         sva += PAGE_SIZE;
4310                         break;
4311                 }
4312         }
4313         if (va != eva)
4314                 pmap_invalidate_range(pmap, va, sva);
4315         return (anyvalid);
4316 }
4317
4318 /*
4319  *      Remove the given range of addresses from the specified map.
4320  *
4321  *      It is assumed that the start and end are properly
4322  *      rounded to the page size.
4323  */
4324 void
4325 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4326 {
4327         struct rwlock *lock;
4328         vm_offset_t va_next;
4329         pml4_entry_t *pml4e;
4330         pdp_entry_t *pdpe;
4331         pd_entry_t ptpaddr, *pde;
4332         pt_entry_t PG_G, PG_V;
4333         struct spglist free;
4334         int anyvalid;
4335
4336         PG_G = pmap_global_bit(pmap);
4337         PG_V = pmap_valid_bit(pmap);
4338
4339         /*
4340          * Perform an unsynchronized read.  This is, however, safe.
4341          */
4342         if (pmap->pm_stats.resident_count == 0)
4343                 return;
4344
4345         anyvalid = 0;
4346         SLIST_INIT(&free);
4347
4348         pmap_delayed_invl_started();
4349         PMAP_LOCK(pmap);
4350
4351         /*
4352          * special handling of removing one page.  a very
4353          * common operation and easy to short circuit some
4354          * code.
4355          */
4356         if (sva + PAGE_SIZE == eva) {
4357                 pde = pmap_pde(pmap, sva);
4358                 if (pde && (*pde & PG_PS) == 0) {
4359                         pmap_remove_page(pmap, sva, pde, &free);
4360                         goto out;
4361                 }
4362         }
4363
4364         lock = NULL;
4365         for (; sva < eva; sva = va_next) {
4366
4367                 if (pmap->pm_stats.resident_count == 0)
4368                         break;
4369
4370                 pml4e = pmap_pml4e(pmap, sva);
4371                 if ((*pml4e & PG_V) == 0) {
4372                         va_next = (sva + NBPML4) & ~PML4MASK;
4373                         if (va_next < sva)
4374                                 va_next = eva;
4375                         continue;
4376                 }
4377
4378                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4379                 if ((*pdpe & PG_V) == 0) {
4380                         va_next = (sva + NBPDP) & ~PDPMASK;
4381                         if (va_next < sva)
4382                                 va_next = eva;
4383                         continue;
4384                 }
4385
4386                 /*
4387                  * Calculate index for next page table.
4388                  */
4389                 va_next = (sva + NBPDR) & ~PDRMASK;
4390                 if (va_next < sva)
4391                         va_next = eva;
4392
4393                 pde = pmap_pdpe_to_pde(pdpe, sva);
4394                 ptpaddr = *pde;
4395
4396                 /*
4397                  * Weed out invalid mappings.
4398                  */
4399                 if (ptpaddr == 0)
4400                         continue;
4401
4402                 /*
4403                  * Check for large page.
4404                  */
4405                 if ((ptpaddr & PG_PS) != 0) {
4406                         /*
4407                          * Are we removing the entire large page?  If not,
4408                          * demote the mapping and fall through.
4409                          */
4410                         if (sva + NBPDR == va_next && eva >= va_next) {
4411                                 /*
4412                                  * The TLB entry for a PG_G mapping is
4413                                  * invalidated by pmap_remove_pde().
4414                                  */
4415                                 if ((ptpaddr & PG_G) == 0)
4416                                         anyvalid = 1;
4417                                 pmap_remove_pde(pmap, pde, sva, &free, &lock);
4418                                 continue;
4419                         } else if (!pmap_demote_pde_locked(pmap, pde, sva,
4420                             &lock)) {
4421                                 /* The large page mapping was destroyed. */
4422                                 continue;
4423                         } else
4424                                 ptpaddr = *pde;
4425                 }
4426
4427                 /*
4428                  * Limit our scan to either the end of the va represented
4429                  * by the current page table page, or to the end of the
4430                  * range being removed.
4431                  */
4432                 if (va_next > eva)
4433                         va_next = eva;
4434
4435                 if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
4436                         anyvalid = 1;
4437         }
4438         if (lock != NULL)
4439                 rw_wunlock(lock);
4440 out:
4441         if (anyvalid)
4442                 pmap_invalidate_all(pmap);
4443         PMAP_UNLOCK(pmap);
4444         pmap_delayed_invl_finished();
4445         vm_page_free_pages_toq(&free, true);
4446 }
4447
4448 /*
4449  *      Routine:        pmap_remove_all
4450  *      Function:
4451  *              Removes this physical page from
4452  *              all physical maps in which it resides.
4453  *              Reflects back modify bits to the pager.
4454  *
4455  *      Notes:
4456  *              Original versions of this routine were very
4457  *              inefficient because they iteratively called
4458  *              pmap_remove (slow...)
4459  */
4460
4461 void
4462 pmap_remove_all(vm_page_t m)
4463 {
4464         struct md_page *pvh;
4465         pv_entry_t pv;
4466         pmap_t pmap;
4467         struct rwlock *lock;
4468         pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
4469         pd_entry_t *pde;
4470         vm_offset_t va;
4471         struct spglist free;
4472         int pvh_gen, md_gen;
4473
4474         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4475             ("pmap_remove_all: page %p is not managed", m));
4476         SLIST_INIT(&free);
4477         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4478         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4479             pa_to_pvh(VM_PAGE_TO_PHYS(m));
4480 retry:
4481         rw_wlock(lock);
4482         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4483                 pmap = PV_PMAP(pv);
4484                 if (!PMAP_TRYLOCK(pmap)) {
4485                         pvh_gen = pvh->pv_gen;
4486                         rw_wunlock(lock);
4487                         PMAP_LOCK(pmap);
4488                         rw_wlock(lock);
4489                         if (pvh_gen != pvh->pv_gen) {
4490                                 rw_wunlock(lock);
4491                                 PMAP_UNLOCK(pmap);
4492                                 goto retry;
4493                         }
4494                 }
4495                 va = pv->pv_va;
4496                 pde = pmap_pde(pmap, va);
4497                 (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
4498                 PMAP_UNLOCK(pmap);
4499         }
4500         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4501                 pmap = PV_PMAP(pv);
4502                 if (!PMAP_TRYLOCK(pmap)) {
4503                         pvh_gen = pvh->pv_gen;
4504                         md_gen = m->md.pv_gen;
4505                         rw_wunlock(lock);
4506                         PMAP_LOCK(pmap);
4507                         rw_wlock(lock);
4508                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4509                                 rw_wunlock(lock);
4510                                 PMAP_UNLOCK(pmap);
4511                                 goto retry;
4512                         }
4513                 }
4514                 PG_A = pmap_accessed_bit(pmap);
4515                 PG_M = pmap_modified_bit(pmap);
4516                 PG_RW = pmap_rw_bit(pmap);
4517                 pmap_resident_count_dec(pmap, 1);
4518                 pde = pmap_pde(pmap, pv->pv_va);
4519                 KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
4520                     " a 2mpage in page %p's pv list", m));
4521                 pte = pmap_pde_to_pte(pde, pv->pv_va);
4522                 tpte = pte_load_clear(pte);
4523                 if (tpte & PG_W)
4524                         pmap->pm_stats.wired_count--;
4525                 if (tpte & PG_A)
4526                         vm_page_aflag_set(m, PGA_REFERENCED);
4527
4528                 /*
4529                  * Update the vm_page_t clean and reference bits.
4530                  */
4531                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
4532                         vm_page_dirty(m);
4533                 pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
4534                 pmap_invalidate_page(pmap, pv->pv_va);
4535                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4536                 m->md.pv_gen++;
4537                 free_pv_entry(pmap, pv);
4538                 PMAP_UNLOCK(pmap);
4539         }
4540         vm_page_aflag_clear(m, PGA_WRITEABLE);
4541         rw_wunlock(lock);
4542         pmap_delayed_invl_wait(m);
4543         vm_page_free_pages_toq(&free, true);
4544 }
4545
4546 /*
4547  * pmap_protect_pde: do the things to protect a 2mpage in a process
4548  */
4549 static boolean_t
4550 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
4551 {
4552         pd_entry_t newpde, oldpde;
4553         vm_offset_t eva, va;
4554         vm_page_t m;
4555         boolean_t anychanged;
4556         pt_entry_t PG_G, PG_M, PG_RW;
4557
4558         PG_G = pmap_global_bit(pmap);
4559         PG_M = pmap_modified_bit(pmap);
4560         PG_RW = pmap_rw_bit(pmap);
4561
4562         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4563         KASSERT((sva & PDRMASK) == 0,
4564             ("pmap_protect_pde: sva is not 2mpage aligned"));
4565         anychanged = FALSE;
4566 retry:
4567         oldpde = newpde = *pde;
4568         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4569             (PG_MANAGED | PG_M | PG_RW)) {
4570                 eva = sva + NBPDR;
4571                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4572                     va < eva; va += PAGE_SIZE, m++)
4573                         vm_page_dirty(m);
4574         }
4575         if ((prot & VM_PROT_WRITE) == 0)
4576                 newpde &= ~(PG_RW | PG_M);
4577         if ((prot & VM_PROT_EXECUTE) == 0)
4578                 newpde |= pg_nx;
4579         if (newpde != oldpde) {
4580                 /*
4581                  * As an optimization to future operations on this PDE, clear
4582                  * PG_PROMOTED.  The impending invalidation will remove any
4583                  * lingering 4KB page mappings from the TLB.
4584                  */
4585                 if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
4586                         goto retry;
4587                 if ((oldpde & PG_G) != 0)
4588                         pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
4589                 else
4590                         anychanged = TRUE;
4591         }
4592         return (anychanged);
4593 }
4594
4595 /*
4596  *      Set the physical protection on the
4597  *      specified range of this map as requested.
4598  */
4599 void
4600 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4601 {
4602         vm_offset_t va_next;
4603         pml4_entry_t *pml4e;
4604         pdp_entry_t *pdpe;
4605         pd_entry_t ptpaddr, *pde;
4606         pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
4607         boolean_t anychanged;
4608
4609         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4610         if (prot == VM_PROT_NONE) {
4611                 pmap_remove(pmap, sva, eva);
4612                 return;
4613         }
4614
4615         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4616             (VM_PROT_WRITE|VM_PROT_EXECUTE))
4617                 return;
4618
4619         PG_G = pmap_global_bit(pmap);
4620         PG_M = pmap_modified_bit(pmap);
4621         PG_V = pmap_valid_bit(pmap);
4622         PG_RW = pmap_rw_bit(pmap);
4623         anychanged = FALSE;
4624
4625         /*
4626          * Although this function delays and batches the invalidation
4627          * of stale TLB entries, it does not need to call
4628          * pmap_delayed_invl_started() and
4629          * pmap_delayed_invl_finished(), because it does not
4630          * ordinarily destroy mappings.  Stale TLB entries from
4631          * protection-only changes need only be invalidated before the
4632          * pmap lock is released, because protection-only changes do
4633          * not destroy PV entries.  Even operations that iterate over
4634          * a physical page's PV list of mappings, like
4635          * pmap_remove_write(), acquire the pmap lock for each
4636          * mapping.  Consequently, for protection-only changes, the
4637          * pmap lock suffices to synchronize both page table and TLB
4638          * updates.
4639          *
4640          * This function only destroys a mapping if pmap_demote_pde()
4641          * fails.  In that case, stale TLB entries are immediately
4642          * invalidated.
4643          */
4644
4645         PMAP_LOCK(pmap);
4646         for (; sva < eva; sva = va_next) {
4647
4648                 pml4e = pmap_pml4e(pmap, sva);
4649                 if ((*pml4e & PG_V) == 0) {
4650                         va_next = (sva + NBPML4) & ~PML4MASK;
4651                         if (va_next < sva)
4652                                 va_next = eva;
4653                         continue;
4654                 }
4655
4656                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
4657                 if ((*pdpe & PG_V) == 0) {
4658                         va_next = (sva + NBPDP) & ~PDPMASK;
4659                         if (va_next < sva)
4660                                 va_next = eva;
4661                         continue;
4662                 }
4663
4664                 va_next = (sva + NBPDR) & ~PDRMASK;
4665                 if (va_next < sva)
4666                         va_next = eva;
4667
4668                 pde = pmap_pdpe_to_pde(pdpe, sva);
4669                 ptpaddr = *pde;
4670
4671                 /*
4672                  * Weed out invalid mappings.
4673                  */
4674                 if (ptpaddr == 0)
4675                         continue;
4676
4677                 /*
4678                  * Check for large page.
4679                  */
4680                 if ((ptpaddr & PG_PS) != 0) {
4681                         /*
4682                          * Are we protecting the entire large page?  If not,
4683                          * demote the mapping and fall through.
4684                          */
4685                         if (sva + NBPDR == va_next && eva >= va_next) {
4686                                 /*
4687                                  * The TLB entry for a PG_G mapping is
4688                                  * invalidated by pmap_protect_pde().
4689                                  */
4690                                 if (pmap_protect_pde(pmap, pde, sva, prot))
4691                                         anychanged = TRUE;
4692                                 continue;
4693                         } else if (!pmap_demote_pde(pmap, pde, sva)) {
4694                                 /*
4695                                  * The large page mapping was destroyed.
4696                                  */
4697                                 continue;
4698                         }
4699                 }
4700
4701                 if (va_next > eva)
4702                         va_next = eva;
4703
4704                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
4705                     sva += PAGE_SIZE) {
4706                         pt_entry_t obits, pbits;
4707                         vm_page_t m;
4708
4709 retry:
4710                         obits = pbits = *pte;
4711                         if ((pbits & PG_V) == 0)
4712                                 continue;
4713
4714                         if ((prot & VM_PROT_WRITE) == 0) {
4715                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4716                                     (PG_MANAGED | PG_M | PG_RW)) {
4717                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4718                                         vm_page_dirty(m);
4719                                 }
4720                                 pbits &= ~(PG_RW | PG_M);
4721                         }
4722                         if ((prot & VM_PROT_EXECUTE) == 0)
4723                                 pbits |= pg_nx;
4724
4725                         if (pbits != obits) {
4726                                 if (!atomic_cmpset_long(pte, obits, pbits))
4727                                         goto retry;
4728                                 if (obits & PG_G)
4729                                         pmap_invalidate_page(pmap, sva);
4730                                 else
4731                                         anychanged = TRUE;
4732                         }
4733                 }
4734         }
4735         if (anychanged)
4736                 pmap_invalidate_all(pmap);
4737         PMAP_UNLOCK(pmap);
4738 }
4739
4740 #if VM_NRESERVLEVEL > 0
4741 /*
4742  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4743  * single page table page (PTP) to a single 2MB page mapping.  For promotion
4744  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4745  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4746  * identical characteristics.
      *
      * The caller must hold the pmap lock.  "pde" is the page directory entry
      * that currently references the PTP; on success it is overwritten with
      * the 2MB mapping.  "va" is used for tracing, for checking the PTP's
      * pindex and for promoting the pv entries.  "lockp" is handed through to
      * pmap_pv_promote_pde().
      *
      * Nothing is returned.  Every failure path increments pmap_pde_p_failures
      * and returns before the PDE is written; success increments
      * pmap_pde_promotions.  Note that write permission may already have been
      * removed from clean (PG_M clear) PTEs by the time a failure is detected.
4747  */
4748 static void
4749 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
4750     struct rwlock **lockp)
4751 {
4752         pd_entry_t newpde;
4753         pt_entry_t *firstpte, oldpte, pa, *pte;
4754         pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
4755         vm_page_t mpte;
4756         int PG_PTE_CACHE;
4757
             /*
              * Fetch this pmap's PTE bit encodings into locals.
              * NOTE(review): PG_G and PG_PTE_CACHE are not referenced by name
              * anywhere else in this function; presumably they are consumed by
              * the expansion of PG_PTE_PROMOTE below -- confirm.
              */
4758         PG_A = pmap_accessed_bit(pmap);
4759         PG_G = pmap_global_bit(pmap);
4760         PG_M = pmap_modified_bit(pmap);
4761         PG_V = pmap_valid_bit(pmap);
4762         PG_RW = pmap_rw_bit(pmap);
4763         PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
4764
4765         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4766
4767         /*
4768          * Examine the first PTE in the specified PTP.  Abort if this PTE is
4769          * either invalid, unused, or does not map the first 4KB physical page
4770          * within a 2MB page.
4771          */
4772         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
             /* Re-entered when the compare-and-set on the first PTE loses a race. */
4773 setpde:
4774         newpde = *firstpte;
             /*
              * PG_A and PG_V must both be set, and the frame bits below the 2MB
              * boundary (PG_FRAME & PDRMASK) must all be zero.
              */
4775         if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
4776                 atomic_add_long(&pmap_pde_p_failures, 1);
4777                 CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4778                     " in pmap %p", va, pmap);
4779                 return;
4780         }
4781         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
4782                 /*
4783                  * When PG_M is already clear, PG_RW can be cleared without
4784                  * a TLB invalidation.
4785                  */
4786                 if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
4787                         goto setpde;
4788                 newpde &= ~PG_RW;
4789         }
4790
4791         /*
4792          * Examine each of the other PTEs in the specified PTP.  Abort if this
4793          * PTE maps an unexpected 4KB physical page or does not have identical
4794          * characteristics to the first PTE.
4795          */
             /*
              * "pa" is the expected (frame | PG_A | PG_V) value of the PTE being
              * examined.  It starts at the last 4KB page of the 2MB frame and is
              * stepped down by PAGE_SIZE as the loop walks backwards towards,
              * but not including, firstpte.
              */
4796         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
4797         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
                     /* Re-entered when the compare-and-set on this PTE fails. */
4798 setpte:
4799                 oldpte = *pte;
4800                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
4801                         atomic_add_long(&pmap_pde_p_failures, 1);
4802                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4803                             " in pmap %p", va, pmap);
4804                         return;
4805                 }
4806                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
4807                         /*
4808                          * When PG_M is already clear, PG_RW can be cleared
4809                          * without a TLB invalidation.
4810                          */
4811                         if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
4812                                 goto setpte;
4813                         oldpte &= ~PG_RW;
4814                         CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
4815                             " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
4816                             (va & ~PDRMASK), pmap);
4817                 }
                     /*
                      * The attribute bits selected by PG_PTE_PROMOTE must match
                      * those of the first PTE.
                      */
4818                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
4819                         atomic_add_long(&pmap_pde_p_failures, 1);
4820                         CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
4821                             " in pmap %p", va, pmap);
4822                         return;
4823                 }
4824                 pa -= PAGE_SIZE;
4825         }
4826
4827         /*
4828          * Save the page table page in its current state until the PDE
4829          * mapping the superpage is demoted by pmap_demote_pde() or
4830          * destroyed by pmap_remove_pde().
4831          */
4832         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4833         KASSERT(mpte >= vm_page_array &&
4834             mpte < &vm_page_array[vm_page_array_size],
4835             ("pmap_promote_pde: page table page is out of range"));
4836         KASSERT(mpte->pindex == pmap_pde_pindex(va),
4837             ("pmap_promote_pde: page table page's pindex is wrong"));
             /* A non-zero return from pmap_insert_pt_page() aborts the promotion. */
4838         if (pmap_insert_pt_page(pmap, mpte)) {
4839                 atomic_add_long(&pmap_pde_p_failures, 1);
4840                 CTR2(KTR_PMAP,
4841                     "pmap_promote_pde: failure for va %#lx in pmap %p", va,
4842                     pmap);
4843                 return;
4844         }
4845
4846         /*
4847          * Promote the pv entries.
4848          */
4849         if ((newpde & PG_MANAGED) != 0)
4850                 pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
4851
4852         /*
4853          * Propagate the PAT index to its proper position.
4854          */
4855         newpde = pmap_swap_pat(pmap, newpde);
4856
4857         /*
4858          * Map the superpage.
4859          */
             /*
              * NOTE(review): only the direct pde_store() path adds PG_PROMOTED;
              * the erratum 383 path passes PG_PS | newpde without it.  Confirm
              * that this asymmetry is intended.
              */
4860         if (workaround_erratum383)
4861                 pmap_update_pde(pmap, va, pde, PG_PS | newpde);
4862         else
4863                 pde_store(pde, PG_PROMOTED | PG_PS | newpde);
4864
4865         atomic_add_long(&pmap_pde_promotions, 1);
4866         CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
4867             " in pmap %p", va, pmap);
4868 }
4869 #endif /* VM_NRESERVLEVEL > 0 */
4870
4871 /*
4872  *      Insert the given physical page (p) at
4873  *      the specified virtual address (v) in the
4874  *      target physical map with the protection requested.
4875  *
4876  *      If specified, the page will be wired down, meaning
4877  *      that the related pte can not be reclaimed.
4878  *
4879  *      NB:  This is the only routine which MAY NOT lazy-evaluate
4880  *      or lose information.  That is, this routine must actually
4881  *      insert this page into the given map NOW.
4882  *
4883  *      When destroying both a page table and PV entry, this function
4884  *      performs the TLB invalidation before releasing the PV list
4885  *      lock, so we do not need pmap_delayed_invl_page() calls here.
      *
      *      Arguments:
      *        pmap   target physical map; locked and unlocked here.
      *        va     virtual address; truncated to a page boundary.
      *        m      the page to map.
      *        prot   VM_PROT_WRITE makes the mapping writeable; the absence
      *               of VM_PROT_EXECUTE sets pg_nx.
      *        flags  PMAP_ENTER_WIRED wires the mapping, PMAP_ENTER_NOSLEEP
      *               forbids sleeping for a page table page, and the
      *               VM_PROT_WRITE bit pre-sets the modified bit.  The
      *               PMAP_ENTER_RESERVED bits must be clear.
      *        psind  1 requests a 2MB mapping, which is delegated to
      *               pmap_enter_pde(); otherwise a 4KB mapping is created.
      *
      *      Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
      *      PMAP_ENTER_NOSLEEP was given and no page table page could be
      *      allocated.  For psind == 1 the result of pmap_enter_pde() is
      *      returned unchanged.
4886  */
4887 int
4888 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4889     u_int flags, int8_t psind)
4890 {
4891         struct rwlock *lock;
4892         pd_entry_t *pde;
4893         pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
4894         pt_entry_t newpte, origpte;
4895         pv_entry_t pv;
4896         vm_paddr_t opa, pa;
4897         vm_page_t mpte, om;
4898         int rv;
4899         boolean_t nosleep;
4900
             /* Fetch this pmap's PTE bit encodings into locals. */
4901         PG_A = pmap_accessed_bit(pmap);
4902         PG_G = pmap_global_bit(pmap);
4903         PG_M = pmap_modified_bit(pmap);
4904         PG_V = pmap_valid_bit(pmap);
4905         PG_RW = pmap_rw_bit(pmap);
4906
4907         va = trunc_page(va);
4908         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
4909         KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
4910             ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
4911             va));
4912         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
4913             va >= kmi.clean_eva,
4914             ("pmap_enter: managed mapping within the clean submap"));
4915         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
4916                 VM_OBJECT_ASSERT_LOCKED(m->object);
4917         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
4918             ("pmap_enter: flags %u has reserved bits set", flags));
             /*
              * Build the complete new PTE before the pmap lock is taken.  Note
              * that PG_M is derived from "flags" while PG_RW is derived from
              * "prot"; the assertion below rejects PG_M without PG_RW.
              */
4919         pa = VM_PAGE_TO_PHYS(m);
4920         newpte = (pt_entry_t)(pa | PG_A | PG_V);
4921         if ((flags & VM_PROT_WRITE) != 0)
4922                 newpte |= PG_M;
4923         if ((prot & VM_PROT_WRITE) != 0)
4924                 newpte |= PG_RW;
4925         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
4926             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
4927         if ((prot & VM_PROT_EXECUTE) == 0)
4928                 newpte |= pg_nx;
4929         if ((flags & PMAP_ENTER_WIRED) != 0)
4930                 newpte |= PG_W;
4931         if (va < VM_MAXUSER_ADDRESS)
4932                 newpte |= PG_U;
4933         if (pmap == kernel_pmap)
4934                 newpte |= PG_G;
4935         newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
4936
4937         /*
4938          * Set modified bit gratuitously for writeable mappings if
4939          * the page is unmanaged. We do not want to take a fault
4940          * to do the dirty bit accounting for these mappings.
4941          */
4942         if ((m->oflags & VPO_UNMANAGED) != 0) {
4943                 if ((newpte & PG_RW) != 0)
4944                         newpte |= PG_M;
4945         } else
4946                 newpte |= PG_MANAGED;
4947
             /*
              * "lock" tracks the PV list lock currently held, if any; it is
              * released at "out".
              */
4948         lock = NULL;
4949         PMAP_LOCK(pmap);
4950         if (psind == 1) {
4951                 /* Assert the required virtual and physical alignment. */
4952                 KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
4953                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
4954                 rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
4955                 goto out;
4956         }
4957         mpte = NULL;
4958
4959         /*
4960          * In the case that a page table page is not
4961          * resident, we are creating it here.
4962          */
4963 retry:
4964         pde = pmap_pde(pmap, va);
             /*
              * Proceed when the PDE is valid and either already refers to a
              * page table page or is a 2MB mapping that was demoted
              * successfully.
              */
4965         if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
4966             pmap_demote_pde_locked(pmap, pde, va, &lock))) {
4967                 pte = pmap_pde_to_pte(pde, va);
                     /*
                      * Take an extra reference on the page table page for the
                      * mapping that is about to be created; it is dropped
                      * again below if "va" turns out to be mapped already.
                      * The mpte == NULL test skips this when mpte was set by
                      * _pmap_allocpte() on an earlier pass, which presumably
                      * returns the page with the reference already counted --
                      * confirm.
                      */
4968                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
4969                         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
4970                         mpte->wire_count++;
4971                 }
4972         } else if (va < VM_MAXUSER_ADDRESS) {
4973                 /*
4974                  * Here if the pte page isn't mapped, or if it has been
4975                  * deallocated.
4976                  */
4977                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4978                 mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
4979                     nosleep ? NULL : &lock);
4980                 if (mpte == NULL && nosleep) {
4981                         rv = KERN_RESOURCE_SHORTAGE;
4982                         goto out;
4983                 }
                     /*
                      * Re-evaluate the PDE, both after a successful allocation
                      * and after a NULL return in the sleepable case.
                      */
4984                 goto retry;
4985         } else
4986                 panic("pmap_enter: invalid page directory va=%#lx", va);
4987
4988         origpte = *pte;
             /* "pv" may be set below to a PV entry salvaged from the old mapping. */
4989         pv = NULL;
4990
4991         /*
4992          * Is the specified virtual address already mapped?
4993          */
4994         if ((origpte & PG_V) != 0) {
4995                 /*
4996                  * Wiring change, just update stats. We don't worry about
4997                  * wiring PT pages as they remain resident as long as there
4998                  * are valid mappings in them. Hence, if a user page is wired,
4999                  * the PT page will be also.
5000                  */
5001                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
5002                         pmap->pm_stats.wired_count++;
5003                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
5004                         pmap->pm_stats.wired_count--;
5005
5006                 /*
5007                  * Remove the extra PT page reference.
5008                  */
5009                 if (mpte != NULL) {
5010                         mpte->wire_count--;
5011                         KASSERT(mpte->wire_count > 0,
5012                             ("pmap_enter: missing reference to page table page,"
5013                              " va: 0x%lx", va));
5014                 }
5015
5016                 /*
5017                  * Has the physical page changed?
5018                  */
5019                 opa = origpte & PG_FRAME;
5020                 if (opa == pa) {
5021                         /*
5022                          * No, might be a protection or wiring change.
5023                          */
5024                         if ((origpte & PG_MANAGED) != 0 &&
5025                             (newpte & PG_RW) != 0)
5026                                 vm_page_aflag_set(m, PGA_WRITEABLE);
                             /*
                              * If the old and new PTEs differ at most in PG_M
                              * and PG_A, the PTE is left as it is.
                              */
5027                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
5028                                 goto unchanged;
5029                         goto validate;
5030                 }
5031
5032                 /*
5033                  * The physical page has changed.  Temporarily invalidate
5034                  * the mapping.  This ensures that all threads sharing the
5035                  * pmap keep a consistent view of the mapping, which is
5036                  * necessary for the correct handling of COW faults.  It
5037                  * also permits reuse of the old mapping's PV entry,
5038                  * avoiding an allocation.
5039                  *
5040                  * For consistency, handle unmanaged mappings the same way.
5041                  */
5042                 origpte = pte_load_clear(pte);
5043                 KASSERT((origpte & PG_FRAME) == opa,
5044                     ("pmap_enter: unexpected pa update for %#lx", va));
5045                 if ((origpte & PG_MANAGED) != 0) {
5046                         om = PHYS_TO_VM_PAGE(opa);
5047
5048                         /*
5049                          * The pmap lock is sufficient to synchronize with
5050                          * concurrent calls to pmap_page_test_mappings() and
5051                          * pmap_ts_referenced().
5052                          */
5053                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5054                                 vm_page_dirty(om);
5055                         if ((origpte & PG_A) != 0)
5056                                 vm_page_aflag_set(om, PGA_REFERENCED);
5057                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
5058                         pv = pmap_pvh_remove(&om->md, pmap, va);
                             /*
                              * The old PV entry is kept in "pv" for reuse when
                              * the new mapping is managed; otherwise it is
                              * freed here.
                              */
5059                         if ((newpte & PG_MANAGED) == 0)
5060                                 free_pv_entry(pmap, pv);
                             /*
                              * Clear PGA_WRITEABLE on the old page once neither
                              * its own PV list nor, for non-fictitious pages,
                              * the list returned by pa_to_pvh() has entries.
                              */
5061                         if ((om->aflags & PGA_WRITEABLE) != 0 &&
5062                             TAILQ_EMPTY(&om->md.pv_list) &&
5063                             ((om->flags & PG_FICTITIOUS) != 0 ||
5064                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
5065                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
5066                 }
5067                 if ((origpte & PG_A) != 0)
5068                         pmap_invalidate_page(pmap, va);
                     /* From here on the slot is treated as previously unmapped. */
5069                 origpte = 0;
5070         } else {
5071                 /*
5072                  * Increment the counters.
5073                  */
5074                 if ((newpte & PG_W) != 0)
5075                         pmap->pm_stats.wired_count++;
5076                 pmap_resident_count_inc(pmap, 1);
5077         }
5078
5079         /*
5080          * Enter on the PV list if part of our managed memory.
5081          */
5082         if ((newpte & PG_MANAGED) != 0) {
5083                 if (pv == NULL) {
5084                         pv = get_pv_entry(pmap, &lock);
5085                         pv->pv_va = va;
5086                 }
5087                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
5088                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5089                 m->md.pv_gen++;
5090                 if ((newpte & PG_RW) != 0)
5091                         vm_page_aflag_set(m, PGA_WRITEABLE);
5092         }
5093
5094         /*
5095          * Update the PTE.
5096          */
             /*
              * Every fall-through path arrives here with PG_V clear in origpte
              * (the old PTE was invalid, or origpte was zeroed above), so the
              * first branch is in practice only entered through the "validate"
              * label, i.e. when the physical page is unchanged.
              */
5097         if ((origpte & PG_V) != 0) {
5098 validate:
5099                 origpte = pte_load_store(pte, newpte);
5100                 KASSERT((origpte & PG_FRAME) == pa,
5101                     ("pmap_enter: unexpected pa update for %#lx", va));
5102                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
5103                     (PG_M | PG_RW)) {
5104                         if ((origpte & PG_MANAGED) != 0)
5105                                 vm_page_dirty(m);
5106
5107                         /*
5108                          * Although the PTE may still have PG_RW set, TLB
5109                          * invalidation may nonetheless be required because
5110                          * the PTE no longer has PG_M set.
5111                          */
5112                 } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
5113                         /*
5114                          * This PTE change does not require TLB invalidation.
5115                          */
5116                         goto unchanged;
5117                 }
5118                 if ((origpte & PG_A) != 0)
5119                         pmap_invalidate_page(pmap, va);
5120         } else
5121                 pte_store(pte, newpte);
5122
5123 unchanged:
5124
5125 #if VM_NRESERVLEVEL > 0
5126         /*
5127          * If both the page table page and the reservation are fully
5128          * populated, then attempt promotion.
5129          */
5130         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
5131             pmap_ps_enabled(pmap) &&
5132             (m->flags & PG_FICTITIOUS) == 0 &&
5133             vm_reserv_level_iffullpop(m) == 0)
5134                 pmap_promote_pde(pmap, pde, va, &lock);
5135 #endif
5136
5137         rv = KERN_SUCCESS;
             /* Common exit: drop the PV list lock, if one is held, then the pmap lock. */
5138 out:
5139         if (lock != NULL)
5140                 rw_wunlock(lock);
5141         PMAP_UNLOCK(pmap);
5142         return (rv);
5143 }
5144
5145 /*
5146  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
5147  * if successful.  Returns false if (1) a page table page cannot be allocated
5148  * without sleeping, (2) a mapping already exists at the specified virtual
5149  * address, or (3) a PV entry cannot be allocated without reclaiming another
5150  * PV entry.
      *
      * The caller must hold the pmap lock.  Of "prot", only VM_PROT_EXECUTE is
      * examined; the PDE built here never includes the read/write bit.
      * "lockp" is passed through to pmap_enter_pde().
5151  */
5152 static bool
5153 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5154     struct rwlock **lockp)
5155 {
5156         pd_entry_t newpde;
5157         pt_entry_t PG_V;
5158
5159         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5160         PG_V = pmap_valid_bit(pmap);
             /* Physical address, cache attributes for a large page, PS and valid. */
5161         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) |
5162             PG_PS | PG_V;
5163         if ((m->oflags & VPO_UNMANAGED) == 0)
5164                 newpde |= PG_MANAGED;
5165         if ((prot & VM_PROT_EXECUTE) == 0)
5166                 newpde |= pg_nx;
5167         if (va < VM_MAXUSER_ADDRESS)
5168                 newpde |= PG_U;
             /*
              * The three flags correspond to the three failure cases listed in
              * the header comment.  NULL is passed for pmap_enter_pde()'s page
              * argument, which that function uses only for writeable managed
              * mappings.
              */
5169         return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
5170             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
5171             KERN_SUCCESS);
5172 }
5173
5174 /*
5175  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
5176  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
5177  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
5178  * a mapping already exists at the specified virtual address.  Returns
5179  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
5180  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
5181  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
5182  *
5183  * The parameter "m" is only used when creating a managed, writeable mapping.
      *
      * The caller must hold the pmap lock.  "newpde" is the complete PDE value
      * to install; if it is writeable it must also have the modified bit set
      * (asserted below).  "lockp" is passed to the helpers that may acquire or
      * switch a PV list lock.
5184  */
5185 static int
5186 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
5187     vm_page_t m, struct rwlock **lockp)
5188 {
5189         struct spglist free;
5190         pd_entry_t oldpde, *pde;
5191         pt_entry_t PG_G, PG_RW, PG_V;
5192         vm_page_t mt, pdpg;
5193
5194         PG_G = pmap_global_bit(pmap);
5195         PG_RW = pmap_rw_bit(pmap);
5196         KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
5197             ("pmap_enter_pde: newpde is missing PG_M"));
5198         PG_V = pmap_valid_bit(pmap);
5199         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5200
             /*
              * Obtain the page directory page for "va".  The PV list lock
              * pointer is withheld when sleeping is not allowed.
              */
5201         if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
5202             NULL : lockp)) == NULL) {
5203                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5204                     " in pmap %p", va, pmap);
5205                 return (KERN_RESOURCE_SHORTAGE);
5206         }
             /* Locate the PDE for "va" within that page via the direct map. */
5207         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5208         pde = &pde[pmap_pde_index(va)];
5209         oldpde = *pde;
5210         if ((oldpde & PG_V) != 0) {
5211                 KASSERT(pdpg->wire_count > 1,
5212                     ("pmap_enter_pde: pdpg's wire count is too low"));
5213                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
                             /* Give back the reference taken by pmap_allocpde(). */
5214                         pdpg->wire_count--;
5215                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5216                             " in pmap %p", va, pmap);
5217                         return (KERN_FAILURE);
5218                 }
5219                 /* Break the existing mapping(s). */
5220                 SLIST_INIT(&free);
5221                 if ((oldpde & PG_PS) != 0) {
5222                         /*
5223                          * The reference to the PD page that was acquired by
5224                          * pmap_allocpde() ensures that it won't be freed.
5225                          * However, if the PDE resulted from a promotion, then
5226                          * a reserved PT page could be freed.
5227                          */
5228                         (void)pmap_remove_pde(pmap, pde, va, &free, lockp);
5229                         if ((oldpde & PG_G) == 0)
5230                                 pmap_invalidate_pde_page(pmap, va, oldpde);
5231                 } else {
                             /*
                              * The old PDE referenced a page table page: remove
                              * the 4KB mappings in the 2MB range.  A true
                              * return from pmap_remove_ptes() triggers a full
                              * invalidation for this pmap.
                              */
5232                         pmap_delayed_invl_started();
5233                         if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
5234                             lockp))
5235                                pmap_invalidate_all(pmap);
5236                         pmap_delayed_invl_finished();
5237                 }
5238                 vm_page_free_pages_toq(&free, true);
5239                 if (va >= VM_MAXUSER_ADDRESS) {
                             /*
                              * For kernel addresses the PDE still references a
                              * page table page at this point; record that page
                              * with pmap_insert_pt_page().
                              */
5240                         mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
5241                         if (pmap_insert_pt_page(pmap, mt)) {
5242                                 /*
5243                                  * XXX Currently, this can't happen because
5244                                  * we do not perform pmap_enter(psind == 1)
5245                                  * on the kernel pmap.
5246                                  */
5247                                 panic("pmap_enter_pde: trie insert failed");
5248                         }
5249                 } else
5250                         KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
5251                             pde));
5252         }
5253         if ((newpde & PG_MANAGED) != 0) {
5254                 /*
5255                  * Abort this mapping if its PV entry could not be created.
5256                  */
5257                 if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
5258                         SLIST_INIT(&free);
5259                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
5260                                 /*
5261                                  * Although "va" is not mapped, paging-
5262                                  * structure caches could nonetheless have
5263                                  * entries that refer to the freed page table
5264                                  * pages.  Invalidate those entries.
5265                                  */
5266                                 pmap_invalidate_page(pmap, va);
5267                                 vm_page_free_pages_toq(&free, true);
5268                         }
5269                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
5270                             " in pmap %p", va, pmap);
5271                         return (KERN_RESOURCE_SHORTAGE);
5272                 }
                     /*
                      * A writeable managed mapping: flag each of the 4KB pages
                      * that make up the 2MB page, starting at "m".
                      */
5273                 if ((newpde & PG_RW) != 0) {
5274                         for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
5275                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
5276                 }
5277         }
5278
5279         /*
5280          * Increment counters.
5281          */
5282         if ((newpde & PG_W) != 0)
5283                 pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
5284         pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5285
5286         /*
5287          * Map the superpage.  (This is not a promoted mapping; there will not
5288          * be any lingering 4KB page mappings in the TLB.)
5289          */
5290         pde_store(pde, newpde);
5291
5292         atomic_add_long(&pmap_pde_mappings, 1);
5293         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
5294             " in pmap %p", va, pmap);
5295         return (KERN_SUCCESS);
5296 }
5297
5298 /*
5299  * Maps a sequence of resident pages belonging to the same object.
5300  * The sequence begins with the given page m_start.  This page is
5301  * mapped at the given virtual address start.  Each subsequent page is
5302  * mapped at a virtual address that is offset from start by the same
5303  * amount as the page is offset from m_start within the object.  The
5304  * last page in the sequence is the page with the largest offset from
5305  * m_start that can be mapped at a virtual address less than the given
5306  * virtual address end.  Not every virtual page between start and end
5307  * is mapped; only those for which a resident page exists with the
5308  * corresponding offset from m_start are mapped.
      *
      * The object's lock must be held by the caller (asserted below).  The
      * pmap lock is acquired and released here.  The results of the
      * individual mapping attempts are not reported to the caller.
5309  */
5310 void
5311 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5312     vm_page_t m_start, vm_prot_t prot)
5313 {
5314         struct rwlock *lock;
5315         vm_offset_t va;
5316         vm_page_t m, mpte;
5317         vm_pindex_t diff, psize;
5318
5319         VM_OBJECT_ASSERT_LOCKED(m_start->object);
5320
             /* Number of pages in [start, end). */
5321         psize = atop(end - start);
             /*
              * "mpte" carries the value returned by the previous
              * pmap_enter_quick_locked() call into the next one.
              */
5322         mpte = NULL;
5323         m = m_start;
5324         lock = NULL;
5325         PMAP_LOCK(pmap);
             /* Walk the pages via their listq linkage until one lies beyond "end". */
5326         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5327                 va = start + ptoa(diff);
                     /*
                      * Try a 2MB mapping when "va" is 2MB aligned, the whole
                      * 2MB range fits below "end", the page has psind == 1 and
                      * large pages are enabled for this pmap.  If that is not
                      * the case, or the attempt fails, map the single 4KB page.
                      */
5328                 if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
5329                     m->psind == 1 && pmap_ps_enabled(pmap) &&
5330                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
                             /*
                              * Advance to the last 4KB page of the 2MB page so
                              * that TAILQ_NEXT() below steps past the run.
                              */
5331                         m = &m[NBPDR / PAGE_SIZE - 1];
5332                 else
5333                         mpte = pmap_enter_quick_locked(pmap, va, m, prot,
5334                             mpte, &lock);
5335                 m = TAILQ_NEXT(m, listq);
5336         }
5337         if (lock != NULL)
5338                 rw_wunlock(lock);
5339         PMAP_UNLOCK(pmap);
5340 }
5341
5342 /*
5343  * this code makes some *MAJOR* assumptions:
5344  * 1. Current pmap & pmap exists.
5345  * 2. Not wired.
5346  * 3. Read access.
5347  * 4. No page table pages.
5348  * but is *MUCH* faster than pmap_enter...
      *
      * This is the locking wrapper around pmap_enter_quick_locked() for a
      * single page: it takes the pmap lock, makes one mapping attempt with no
      * page table page hint, and then releases any PV list lock that the
      * attempt left held, followed by the pmap lock.  The outcome of the
      * attempt is deliberately discarded.
5349  */
5350
5351 void
5352 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5353 {
5354         struct rwlock *lock;
5355
5356         lock = NULL;
5357         PMAP_LOCK(pmap);
5358         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5359         if (lock != NULL)
5360                 rw_wunlock(lock);
5361         PMAP_UNLOCK(pmap);
5362 }
5363
     /*
      * Tries to create a read-only 4KB mapping of page "m" at "va" without
      * sleeping and without replacing anything.  The caller must hold the pmap
      * lock.
      *
      * "mpte" is an optional hint: the page table page returned by a previous
      * call.  If its pindex matches the one needed for "va", the page
      * directory lookup is skipped.  "lockp" is passed to
      * pmap_try_insert_pv_entry() for managed pages.
      *
      * Returns the page table page that holds the new mapping so that it can
      * be passed back as the hint for the next call.  NULL is returned when no
      * mapping was created: "va" lies within an existing 2MB mapping, a page
      * table page could not be allocated, the PTE is already non-zero, or a PV
      * entry could not be obtained.  NULL is also returned for addresses at or
      * above VM_MAXUSER_ADDRESS, even when the mapping was created, so NULL by
      * itself does not indicate failure.
      */
5364 static vm_page_t
5365 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5366     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5367 {
5368         struct spglist free;
5369         pt_entry_t *pte, PG_V;
5370         vm_paddr_t pa;
5371
5372         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
5373             (m->oflags & VPO_UNMANAGED) != 0,
5374             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5375         PG_V = pmap_valid_bit(pmap);
5376         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5377
5378         /*
5379          * In the case that a page table page is not
5380          * resident, we are creating it here.
5381          */
5382         if (va < VM_MAXUSER_ADDRESS) {
5383                 vm_pindex_t ptepindex;
5384                 pd_entry_t *ptepa;
5385
5386                 /*
5387                  * Calculate pagetable page index
5388                  */
5389                 ptepindex = pmap_pde_pindex(va);
                     /*
                      * The hint matches: just take another reference for the
                      * mapping about to be created.
                      */
5390                 if (mpte && (mpte->pindex == ptepindex)) {
5391                         mpte->wire_count++;
5392                 } else {
5393                         /*
5394                          * Get the page directory entry
5395                          */
5396                         ptepa = pmap_pde(pmap, va);
5397
5398                         /*
5399                          * If the page table page is mapped, we just increment
5400                          * the hold count, and activate it.  Otherwise, we
5401                          * attempt to allocate a page table page.  If this
5402                          * attempt fails, we don't retry.  Instead, we give up.
5403                          */
5404                         if (ptepa && (*ptepa & PG_V) != 0) {
                                     /* An existing 2MB mapping is left untouched. */
5405                                 if (*ptepa & PG_PS)
5406                                         return (NULL);
5407                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
5408                                 mpte->wire_count++;
5409                         } else {
5410                                 /*
5411                                  * Pass NULL instead of the PV list lock
5412                                  * pointer, because we don't intend to sleep.
5413                                  */
5414                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
5415                                 if (mpte == NULL)
5416                                         return (mpte);
5417                         }
5418                 }
                     /* Reach the PTE through the direct map of the page table page. */
5419                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5420                 pte = &pte[pmap_pte_index(va)];
5421         } else {
                     /* Kernel address: no page table page is tracked. */
5422                 mpte = NULL;
5423                 pte = vtopte(va);
5424         }
             /*
              * Any non-zero PTE, not only a valid one, counts as an existing
              * mapping.  Undo the reference taken above and give up.
              */
5425         if (*pte) {
5426                 if (mpte != NULL) {
5427                         mpte->wire_count--;
5428                         mpte = NULL;
5429                 }
5430                 return (mpte);
5431         }
5432
5433         /*
5434          * Enter on the PV list if part of our managed memory.
5435          */
5436         if ((m->oflags & VPO_UNMANAGED) == 0 &&
5437             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5438                 if (mpte != NULL) {
5439                         SLIST_INIT(&free);
5440                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
5441                                 /*
5442                                  * Although "va" is not mapped, paging-
5443                                  * structure caches could nonetheless have
5444                                  * entries that refer to the freed page table
5445                                  * pages.  Invalidate those entries.
5446                                  */
5447                                 pmap_invalidate_page(pmap, va);
5448                                 vm_page_free_pages_toq(&free, true);
5449                         }
5450                         mpte = NULL;
5451                 }
5452                 return (mpte);
5453         }
5454
5455         /*
5456          * Increment counters
5457          */
5458         pmap_resident_count_inc(pmap, 1);
5459
             /*
              * Despite its name, "pa" accumulates PTE bits: the physical
              * address, the cache attribute bits and, for non-executable
              * mappings, pg_nx.
              */
5460         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 0);
5461         if ((prot & VM_PROT_EXECUTE) == 0)
5462                 pa |= pg_nx;
5463
5464         /*
5465          * Now validate mapping with RO protection
5466          */
             /*
              * NOTE(review): PG_U is set unconditionally here, including on the
              * path taken for addresses at or above VM_MAXUSER_ADDRESS.
              * Confirm that this is intended or that callers never pass such
              * addresses.
              */
5467         if ((m->oflags & VPO_UNMANAGED) != 0)
5468                 pte_store(pte, pa | PG_V | PG_U);
5469         else
5470                 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
5471         return (mpte);
5472 }
5473
5474 /*
5475  * Make a temporary mapping for a physical address.  This is only intended
5476  * to be used for panic dumps.
5477  */
5478 void *
5479 pmap_kenter_temporary(vm_paddr_t pa, int i)
5480 {
5481         vm_offset_t va;
5482
5483         va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
5484         pmap_kenter(va, pa);
5485         invlpg(va);
5486         return ((void *)crashdumpmap);
5487 }
5488
5489 /*
5490  * This code maps large physical mmap regions into the
5491  * processor address space.  Note that some shortcuts
5492  * are taken, but the code works.
5493  */
5494 void
5495 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5496     vm_pindex_t pindex, vm_size_t size)
5497 {
5498         pd_entry_t *pde;
5499         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
5500         vm_paddr_t pa, ptepa;
5501         vm_page_t p, pdpg;
5502         int pat_mode;
5503
5504         PG_A = pmap_accessed_bit(pmap);
5505         PG_M = pmap_modified_bit(pmap);
5506         PG_V = pmap_valid_bit(pmap);
5507         PG_RW = pmap_rw_bit(pmap);
5508
5509         VM_OBJECT_ASSERT_WLOCKED(object);
5510         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5511             ("pmap_object_init_pt: non-device object"));
5512         if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
5513                 if (!pmap_ps_enabled(pmap))
5514                         return;
5515                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
5516                         return;
5517                 p = vm_page_lookup(object, pindex);
5518                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
5519                     ("pmap_object_init_pt: invalid page %p", p));
5520                 pat_mode = p->md.pat_mode;
5521
5522                 /*
5523                  * Abort the mapping if the first page is not physically
5524                  * aligned to a 2MB page boundary.
5525                  */
5526                 ptepa = VM_PAGE_TO_PHYS(p);
5527                 if (ptepa & (NBPDR - 1))
5528                         return;
5529
5530                 /*
5531                  * Skip the first page.  Abort the mapping if the rest of
5532                  * the pages are not physically contiguous or have differing
5533                  * memory attributes.
5534                  */
5535                 p = TAILQ_NEXT(p, listq);
5536                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
5537                     pa += PAGE_SIZE) {
5538                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
5539                             ("pmap_object_init_pt: invalid page %p", p));
5540                         if (pa != VM_PAGE_TO_PHYS(p) ||
5541                             pat_mode != p->md.pat_mode)
5542                                 return;
5543                         p = TAILQ_NEXT(p, listq);
5544                 }
5545
5546                 /*
5547                  * Map using 2MB pages.  Since "ptepa" is 2M aligned and
5548                  * "size" is a multiple of 2M, adding the PAT setting to "pa"
5549                  * will not affect the termination of this loop.
5550                  */
5551                 PMAP_LOCK(pmap);
5552                 for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
5553                     pa < ptepa + size; pa += NBPDR) {
5554                         pdpg = pmap_allocpde(pmap, addr, NULL);
5555                         if (pdpg == NULL) {
5556                                 /*
5557                                  * The creation of mappings below is only an
5558                                  * optimization.  If a page directory page
5559                                  * cannot be allocated without blocking,
5560                                  * continue on to the next mapping rather than
5561                                  * blocking.
5562                                  */
5563                                 addr += NBPDR;
5564                                 continue;
5565                         }
5566                         pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
5567                         pde = &pde[pmap_pde_index(addr)];
5568                         if ((*pde & PG_V) == 0) {
5569                                 pde_store(pde, pa | PG_PS | PG_M | PG_A |
5570                                     PG_U | PG_RW | PG_V);
5571                                 pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
5572                                 atomic_add_long(&pmap_pde_mappings, 1);
5573                         } else {
5574                                 /* Continue on if the PDE is already valid. */
5575                                 pdpg->wire_count--;
5576                                 KASSERT(pdpg->wire_count > 0,
5577                                     ("pmap_object_init_pt: missing reference "
5578                                     "to page directory page, va: 0x%lx", addr));
5579                         }
5580                         addr += NBPDR;
5581                 }
5582                 PMAP_UNLOCK(pmap);
5583         }
5584 }
5585
5586 /*
5587  *      Clear the wired attribute from the mappings for the specified range of
5588  *      addresses in the given pmap.  Every valid mapping within that range
5589  *      must have the wired attribute set.  In contrast, invalid mappings
5590  *      cannot have the wired attribute set, so they are ignored.
5591  *
5592  *      The wired attribute of the page table entry is not a hardware
5593  *      feature, so there is no need to invalidate any TLB entries.
5594  *      Since pmap_demote_pde() for the wired entry must never fail,
5595  *      pmap_delayed_invl_started()/finished() calls around the
5596  *      function are not needed.
5597  */
5598 void
5599 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5600 {
5601         vm_offset_t va_next;
5602         pml4_entry_t *pml4e;
5603         pdp_entry_t *pdpe;
5604         pd_entry_t *pde;
5605         pt_entry_t *pte, PG_V;
5606
5607         PG_V = pmap_valid_bit(pmap);
5608         PMAP_LOCK(pmap);
5609         for (; sva < eva; sva = va_next) {
5610                 pml4e = pmap_pml4e(pmap, sva);
5611                 if ((*pml4e & PG_V) == 0) {
5612                         va_next = (sva + NBPML4) & ~PML4MASK;
5613                         if (va_next < sva)
5614                                 va_next = eva;
5615                         continue;
5616                 }
5617                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
5618                 if ((*pdpe & PG_V) == 0) {
5619                         va_next = (sva + NBPDP) & ~PDPMASK;
5620                         if (va_next < sva)
5621                                 va_next = eva;
5622                         continue;
5623                 }
5624                 va_next = (sva + NBPDR) & ~PDRMASK;
5625                 if (va_next < sva)
5626                         va_next = eva;
5627                 pde = pmap_pdpe_to_pde(pdpe, sva);
5628                 if ((*pde & PG_V) == 0)
5629                         continue;
5630                 if ((*pde & PG_PS) != 0) {
5631                         if ((*pde & PG_W) == 0)
5632                                 panic("pmap_unwire: pde %#jx is missing PG_W",
5633                                     (uintmax_t)*pde);
5634
5635                         /*
5636                          * Are we unwiring the entire large page?  If not,
5637                          * demote the mapping and fall through.
5638                          */
5639                         if (sva + NBPDR == va_next && eva >= va_next) {
5640                                 atomic_clear_long(pde, PG_W);
5641                                 pmap->pm_stats.wired_count -= NBPDR /
5642                                     PAGE_SIZE;
5643                                 continue;
5644                         } else if (!pmap_demote_pde(pmap, pde, sva))
5645                                 panic("pmap_unwire: demotion failed");
5646                 }
5647                 if (va_next > eva)
5648                         va_next = eva;
5649                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
5650                     sva += PAGE_SIZE) {
5651                         if ((*pte & PG_V) == 0)
5652                                 continue;
5653                         if ((*pte & PG_W) == 0)
5654                                 panic("pmap_unwire: pte %#jx is missing PG_W",
5655                                     (uintmax_t)*pte);
5656
5657                         /*
5658                          * PG_W must be cleared atomically.  Although the pmap
5659                          * lock synchronizes access to PG_W, another processor
5660                          * could be setting PG_M and/or PG_A concurrently.
5661                          */
5662                         atomic_clear_long(pte, PG_W);
5663                         pmap->pm_stats.wired_count--;
5664                 }
5665         }
5666         PMAP_UNLOCK(pmap);
5667 }
5668
/*
 *      Copy the range specified by src_addr/len
 *      from the source map to the range dst_addr/len
 *      in the destination map.
 *
 *      This routine is only advisory and need not do anything.
 *
 *      Nothing is copied unless dst_addr equals src_addr, both pmaps
 *      have the same pm_type, and the destination pmap does not emulate
 *      accessed/dirty bits.  Both pmap locks are held while copying.
 *      The copy stops early when a destination page directory or page
 *      table page cannot be allocated, or when a 4KB mapping cannot be
 *      entered because the destination slot is already in use or no PV
 *      entry could be inserted.
 */

void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
        struct rwlock *lock;
        struct spglist free;
        vm_offset_t addr;
        vm_offset_t end_addr = src_addr + len;
        vm_offset_t va_next;
        vm_page_t dst_pdpg, dstmpte, srcmpte;
        pt_entry_t PG_A, PG_M, PG_V;

        /* Only same-address copies are supported. */
        if (dst_addr != src_addr)
                return;

        if (dst_pmap->pm_type != src_pmap->pm_type)
                return;

        /*
         * EPT page table entries that require emulation of A/D bits are
         * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
         * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
         * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
         * implementations flag an EPT misconfiguration for exec-only
         * mappings we skip this function entirely for emulated pmaps.
         */
        if (pmap_emulate_ad_bits(dst_pmap))
                return;

        /*
         * Acquire both pmap locks, always taking the pmap at the lower
         * address first.
         */
        lock = NULL;
        if (dst_pmap < src_pmap) {
                PMAP_LOCK(dst_pmap);
                PMAP_LOCK(src_pmap);
        } else {
                PMAP_LOCK(src_pmap);
                PMAP_LOCK(dst_pmap);
        }

        PG_A = pmap_accessed_bit(dst_pmap);
        PG_M = pmap_modified_bit(dst_pmap);
        PG_V = pmap_valid_bit(dst_pmap);

        /* Walk the source range, at most one 2MB region per iteration. */
        for (addr = src_addr; addr < end_addr; addr = va_next) {
                pt_entry_t *src_pte, *dst_pte;
                pml4_entry_t *pml4e;
                pdp_entry_t *pdpe;
                pd_entry_t srcptepaddr, *pde;

                KASSERT(addr < UPT_MIN_ADDRESS,
                    ("pmap_copy: invalid to pmap_copy page tables"));

                /* Skip the whole PML4 span when the source has nothing there. */
                pml4e = pmap_pml4e(src_pmap, addr);
                if ((*pml4e & PG_V) == 0) {
                        va_next = (addr + NBPML4) & ~PML4MASK;
                        if (va_next < addr)
                                va_next = end_addr;
                        continue;
                }

                /* Likewise for an invalid page directory pointer entry. */
                pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
                if ((*pdpe & PG_V) == 0) {
                        va_next = (addr + NBPDP) & ~PDPMASK;
                        if (va_next < addr)
                                va_next = end_addr;
                        continue;
                }

                va_next = (addr + NBPDR) & ~PDRMASK;
                if (va_next < addr)
                        va_next = end_addr;

                pde = pmap_pdpe_to_pde(pdpe, addr);
                srcptepaddr = *pde;
                if (srcptepaddr == 0)
                        continue;

                if (srcptepaddr & PG_PS) {
                        /*
                         * 2MB source mapping: copy it only when it lies
                         * entirely within the requested range; otherwise
                         * skip it.
                         */
                        if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
                                continue;
                        dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
                        if (dst_pdpg == NULL)
                                break;
                        pde = (pd_entry_t *)
                            PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
                        pde = &pde[pmap_pde_index(addr)];
                        /*
                         * Enter the mapping only if the destination slot
                         * is empty and, for a managed mapping, a PV entry
                         * could be inserted.  The wired bit is not copied.
                         */
                        if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
                            pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
                            PMAP_ENTER_NORECLAIM, &lock))) {
                                *pde = srcptepaddr & ~PG_W;
                                pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE);
                                atomic_add_long(&pmap_pde_mappings, 1);
                        } else
                                /*
                                 * NOTE(review): presumably undoes the
                                 * reference taken by pmap_allocpde() above
                                 * -- confirm.
                                 */
                                dst_pdpg->wire_count--;
                        continue;
                }

                /*
                 * 4KB mappings: strip the flag bits to obtain the physical
                 * address of the source page table page.
                 */
                srcptepaddr &= PG_FRAME;
                srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
                KASSERT(srcmpte->wire_count > 0,
                    ("pmap_copy: source page table page is unused"));

                if (va_next > end_addr)
                        va_next = end_addr;

                src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
                src_pte = &src_pte[pmap_pte_index(addr)];
                dstmpte = NULL;
                while (addr < va_next) {
                        pt_entry_t ptetemp;
                        ptetemp = *src_pte;
                        /*
                         * we only virtual copy managed pages
                         */
                        if ((ptetemp & PG_MANAGED) != 0) {
                                /*
                                 * Reuse the destination page table page
                                 * from the previous iteration when it
                                 * covers "addr", adding a reference;
                                 * otherwise obtain one, giving up on the
                                 * whole copy if that fails.
                                 */
                                if (dstmpte != NULL &&
                                    dstmpte->pindex == pmap_pde_pindex(addr))
                                        dstmpte->wire_count++;
                                else if ((dstmpte = pmap_allocpte(dst_pmap,
                                    addr, NULL)) == NULL)
                                        goto out;
                                dst_pte = (pt_entry_t *)
                                    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
                                dst_pte = &dst_pte[pmap_pte_index(addr)];
                                if (*dst_pte == 0 &&
                                    pmap_try_insert_pv_entry(dst_pmap, addr,
                                    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
                                    &lock)) {
                                        /*
                                         * Clear the wired, modified, and
                                         * accessed (referenced) bits
                                         * during the copy.
                                         */
                                        *dst_pte = ptetemp & ~(PG_W | PG_M |
                                            PG_A);
                                        pmap_resident_count_inc(dst_pmap, 1);
                                } else {
                                        /*
                                         * The destination slot is in use or
                                         * no PV entry was inserted: release
                                         * the page table page reference and
                                         * abandon the copy.
                                         */
                                        SLIST_INIT(&free);
                                        if (pmap_unwire_ptp(dst_pmap, addr,
                                            dstmpte, &free)) {
                                                /*
                                                 * Although "addr" is not
                                                 * mapped, paging-structure
                                                 * caches could nonetheless
                                                 * have entries that refer to
                                                 * the freed page table pages.
                                                 * Invalidate those entries.
                                                 */
                                                pmap_invalidate_page(dst_pmap,
                                                    addr);
                                                vm_page_free_pages_toq(&free,
                                                    true);
                                        }
                                        goto out;
                                }
                                /*
                                 * NOTE(review): presumably once the
                                 * destination page table page has at least
                                 * as many references as the source one,
                                 * no source mappings remain to be copied
                                 * from this page -- confirm.
                                 */
                                if (dstmpte->wire_count >= srcmpte->wire_count)
                                        break;
                        }
                        addr += PAGE_SIZE;
                        src_pte++;
                }
        }
out:
        /* Release the PV list lock, if one is held, then both pmap locks. */
        if (lock != NULL)
                rw_wunlock(lock);
        PMAP_UNLOCK(src_pmap);
        PMAP_UNLOCK(dst_pmap);
}
5844
5845 /*
5846  * Zero the specified hardware page.
5847  */
5848 void
5849 pmap_zero_page(vm_page_t m)
5850 {
5851         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5852
5853         pagezero((void *)va);
5854 }
5855
5856 /*
5857  * Zero an an area within a single hardware page.  off and size must not
5858  * cover an area beyond a single hardware page.
5859  */
5860 void
5861 pmap_zero_page_area(vm_page_t m, int off, int size)
5862 {
5863         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5864
5865         if (off == 0 && size == PAGE_SIZE)
5866                 pagezero((void *)va);
5867         else
5868                 bzero((char *)va + off, size);
5869 }
5870
5871 /*
5872  * Copy 1 specified hardware page to another.
5873  */
5874 void
5875 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5876 {
5877         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5878         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5879
5880         pagecopy((void *)src, (void *)dst);
5881 }
5882
/*
 * NOTE(review): presumably a flag read by machine-independent code to
 * decide whether unmapped buffers may be used on this platform; it is
 * enabled (1) here.  Its consumers are not visible in this part of the
 * file -- confirm before relying on this description.
 */
int unmapped_buf_allowed = 1;
5884
5885 void
5886 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5887     vm_offset_t b_offset, int xfersize)
5888 {
5889         void *a_cp, *b_cp;
5890         vm_page_t pages[2];
5891         vm_offset_t vaddr[2], a_pg_offset, b_pg_offset;
5892         int cnt;
5893         boolean_t mapped;
5894
5895         while (xfersize > 0) {
5896                 a_pg_offset = a_offset & PAGE_MASK;
5897                 pages[0] = ma[a_offset >> PAGE_SHIFT];
5898                 b_pg_offset = b_offset & PAGE_MASK;
5899                 pages[1] = mb[b_offset >> PAGE_SHIFT];
5900                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5901                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5902                 mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE);
5903                 a_cp = (char *)vaddr[0] + a_pg_offset;
5904                 b_cp = (char *)vaddr[1] + b_pg_offset;
5905                 bcopy(a_cp, b_cp, cnt);
5906                 if (__predict_false(mapped))
5907                         pmap_unmap_io_transient(pages, vaddr, 2, FALSE);
5908                 a_offset += cnt;
5909                 b_offset += cnt;
5910                 xfersize -= cnt;
5911         }
5912 }
5913
5914 /*
5915  * Returns true if the pmap's pv is one of the first
5916  * 16 pvs linked to from this page.  This count may
5917  * be changed upwards or downwards in the future; it
5918  * is only necessary that true be returned for a small
5919  * subset of pmaps for proper page aging.
5920  */
5921 boolean_t
5922 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5923 {
5924         struct md_page *pvh;
5925         struct rwlock *lock;
5926         pv_entry_t pv;
5927         int loops = 0;
5928         boolean_t rv;
5929
5930         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5931             ("pmap_page_exists_quick: page %p is not managed", m));
5932         rv = FALSE;
5933         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5934         rw_rlock(lock);
5935         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5936                 if (PV_PMAP(pv) == pmap) {
5937                         rv = TRUE;
5938                         break;
5939                 }
5940                 loops++;
5941                 if (loops >= 16)
5942                         break;
5943         }
5944         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5945                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5946                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5947                         if (PV_PMAP(pv) == pmap) {
5948                                 rv = TRUE;
5949                                 break;
5950                         }
5951                         loops++;
5952                         if (loops >= 16)
5953                                 break;
5954                 }
5955         }
5956         rw_runlock(lock);
5957         return (rv);
5958 }
5959
/*
 *      pmap_page_wired_mappings:
 *
 *      Return the number of managed mappings to the given physical page
 *      that are wired.
 *
 *      Unmanaged pages always yield 0.  Both the page's own PV list and,
 *      for non-fictitious pages, the PV list returned by pa_to_pvh() are
 *      scanned; a mapping is counted when PG_W is set in its page table
 *      entry (first list) or page directory entry (second list).
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
        struct rwlock *lock;
        struct md_page *pvh;
        pmap_t pmap;
        pt_entry_t *pte;
        pv_entry_t pv;
        int count, md_gen, pvh_gen;

        if ((m->oflags & VPO_UNMANAGED) != 0)
                return (0);
        lock = VM_PAGE_TO_PV_LIST_LOCK(m);
        rw_rlock(lock);
restart:
        /* Any restart discards the partial count and rescans both lists. */
        count = 0;
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
                pmap = PV_PMAP(pv);
                if (!PMAP_TRYLOCK(pmap)) {
                        /*
                         * The pmap lock could not be taken without
                         * blocking.  Record the PV list generation, drop
                         * the PV list lock, block on the pmap lock, and
                         * retake the PV list lock.  If the generation
                         * changed in the meantime, start over.
                         */
                        md_gen = m->md.pv_gen;
                        rw_runlock(lock);
                        PMAP_LOCK(pmap);
                        rw_rlock(lock);
                        if (md_gen != m->md.pv_gen) {
                                PMAP_UNLOCK(pmap);
                                goto restart;
                        }
                }
                pte = pmap_pte(pmap, pv->pv_va);
                if ((*pte & PG_W) != 0)
                        count++;
                PMAP_UNLOCK(pmap);
        }
        if ((m->flags & PG_FICTITIOUS) == 0) {
                pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
                TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
                        pmap = PV_PMAP(pv);
                        if (!PMAP_TRYLOCK(pmap)) {
                                /*
                                 * Same procedure as above, except that
                                 * the generations of both PV lists are
                                 * checked after relocking.
                                 */
                                md_gen = m->md.pv_gen;
                                pvh_gen = pvh->pv_gen;
                                rw_runlock(lock);
                                PMAP_LOCK(pmap);
                                rw_rlock(lock);
                                if (md_gen != m->md.pv_gen ||
                                    pvh_gen != pvh->pv_gen) {
                                        PMAP_UNLOCK(pmap);
                                        goto restart;
                                }
                        }
                        /* Here "pte" holds a page directory entry pointer. */
                        pte = pmap_pde(pmap, pv->pv_va);
                        if ((*pte & PG_W) != 0)
                                count++;
                        PMAP_UNLOCK(pmap);
                }
        }
        rw_runlock(lock);
        return (count);
}
6024
6025 /*
6026  * Returns TRUE if the given page is mapped individually or as part of
6027  * a 2mpage.  Otherwise, returns FALSE.
6028  */
6029 boolean_t
6030 pmap_page_is_mapped(vm_page_t m)
6031 {
6032         struct rwlock *lock;
6033         boolean_t rv;
6034
6035         if ((m->oflags & VPO_UNMANAGED) != 0)
6036                 return (FALSE);
6037         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6038         rw_rlock(lock);
6039         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6040             ((m->flags & PG_FICTITIOUS) == 0 &&
6041             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
6042         rw_runlock(lock);
6043         return (rv);
6044 }
6045
6046 /*
6047  * Destroy all managed, non-wired mappings in the given user-space
6048  * pmap.  This pmap cannot be active on any processor besides the
6049  * caller.
6050  *
6051  * This function cannot be applied to the kernel pmap.  Moreover, it
6052  * is not intended for general use.  It is only to be used during
6053  * process termination.  Consequently, it can be implemented in ways
6054  * that make it faster than pmap_remove().  First, it can more quickly
6055  * destroy mappings by iterating over the pmap's collection of PV
6056  * entries, rather than searching the page table.  Second, it doesn't
6057  * have to test and clear the page table entries atomically, because
6058  * no processor is currently accessing the user address space.  In
6059  * particular, a page table entry's dirty bit won't change state once
6060  * this function starts.
6061  *
6062  * Although this function destroys all of the pmap's managed,
6063  * non-wired mappings, it can delay and batch the invalidation of TLB
6064  * entries without calling pmap_delayed_invl_started() and
6065  * pmap_delayed_invl_finished().  Because the pmap is not active on
6066  * any other processor, none of these TLB entries will ever be used
6067  * before their eventual invalidation.  Consequently, there is no need
6068  * for either pmap_remove_all() or pmap_remove_write() to wait for
6069  * that eventual TLB invalidation.
6070  */
6071 void
6072 pmap_remove_pages(pmap_t pmap)
6073 {
6074         pd_entry_t ptepde;
6075         pt_entry_t *pte, tpte;
6076         pt_entry_t PG_M, PG_RW, PG_V;
6077         struct spglist free;
6078         vm_page_t m, mpte, mt;
6079         pv_entry_t pv;
6080         struct md_page *pvh;
6081         struct pv_chunk *pc, *npc;
6082         struct rwlock *lock;
6083         int64_t bit;
6084         uint64_t inuse, bitmask;
6085         int allfree, field, freed, idx;
6086         boolean_t superpage;
6087         vm_paddr_t pa;
6088
6089         /*
6090          * Assert that the given pmap is only active on the current
6091          * CPU.  Unfortunately, we cannot block another CPU from
6092          * activating the pmap while this function is executing.
6093          */
6094         KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
6095 #ifdef INVARIANTS
6096         {
6097                 cpuset_t other_cpus;
6098
6099                 other_cpus = all_cpus;
6100                 critical_enter();
6101                 CPU_CLR(PCPU_GET(cpuid), &other_cpus);
6102                 CPU_AND(&other_cpus, &pmap->pm_active);
6103                 critical_exit();
6104                 KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
6105         }
6106 #endif
6107
6108         lock = NULL;
6109         PG_M = pmap_modified_bit(pmap);
6110         PG_V = pmap_valid_bit(pmap);
6111         PG_RW = pmap_rw_bit(pmap);
6112
6113         SLIST_INIT(&free);
6114         PMAP_LOCK(pmap);
6115         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6116                 allfree = 1;
6117                 freed = 0;
6118                 for (field = 0; field < _NPCM; field++) {
6119                         inuse = ~pc->pc_map[field] & pc_freemask[field];
6120                         while (inuse != 0) {
6121                                 bit = bsfq(inuse);
6122                                 bitmask = 1UL << bit;
6123                                 idx = field * 64 + bit;
6124                                 pv = &pc->pc_pventry[idx];
6125                                 inuse &= ~bitmask;
6126
6127                                 pte = pmap_pdpe(pmap, pv->pv_va);
6128                                 ptepde = *pte;
6129                                 pte = pmap_pdpe_to_pde(pte, pv->pv_va);
6130                                 tpte = *pte;
6131                                 if ((tpte & (PG_PS | PG_V)) == PG_V) {
6132                                         superpage = FALSE;
6133                                         ptepde = tpte;
6134                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
6135                                             PG_FRAME);
6136                                         pte = &pte[pmap_pte_index(pv->pv_va)];
6137                                         tpte = *pte;
6138                                 } else {
6139                                         /*
6140                                          * Keep track whether 'tpte' is a
6141                                          * superpage explicitly instead of
6142                                          * relying on PG_PS being set.
6143                                          *
6144                                          * This is because PG_PS is numerically
6145                                          * identical to PG_PTE_PAT and thus a
6146                                          * regular page could be mistaken for
6147                                          * a superpage.
6148                                          */
6149                                         superpage = TRUE;
6150                                 }
6151
6152                                 if ((tpte & PG_V) == 0) {
6153                                         panic("bad pte va %lx pte %lx",
6154                                             pv->pv_va, tpte);
6155                                 }
6156
6157 /*
6158  * We cannot remove wired pages from a process' mapping at this time
6159  */
6160                                 if (tpte & PG_W) {
6161                                         allfree = 0;
6162                                         continue;
6163                                 }
6164
6165                                 if (superpage)
6166                                         pa = tpte & PG_PS_FRAME;
6167                                 else
6168                                         pa = tpte & PG_FRAME;
6169
6170                                 m = PHYS_TO_VM_PAGE(pa);
6171                                 KASSERT(m->phys_addr == pa,
6172                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6173                                     m, (uintmax_t)m->phys_addr,
6174                                     (uintmax_t)tpte));
6175
6176                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6177                                     m < &vm_page_array[vm_page_array_size],
6178                                     ("pmap_remove_pages: bad tpte %#jx",
6179                                     (uintmax_t)tpte));
6180
6181                                 pte_clear(pte);
6182
6183                                 /*
6184                                  * Update the vm_page_t clean/reference bits.
6185                                  */
6186                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6187                                         if (superpage) {
6188                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6189                                                         vm_page_dirty(mt);
6190                                         } else
6191                                                 vm_page_dirty(m);
6192                                 }
6193
6194                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6195
6196                                 /* Mark free */
6197                                 pc->pc_map[field] |= bitmask;
6198                                 if (superpage) {
6199                                         pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
6200                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
6201                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6202                                         pvh->pv_gen++;
6203                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
6204                                                 for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
6205                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
6206                                                             TAILQ_EMPTY(&mt->md.pv_list))
6207                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
6208                                         }
6209                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
6210                                         if (mpte != NULL) {
6211                                                 pmap_resident_count_dec(pmap, 1);
6212                                                 KASSERT(mpte->wire_count == NPTEPG,
6213                                                     ("pmap_remove_pages: pte page wire count error"));
6214                                                 mpte->wire_count = 0;
6215                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
6216                                         }
6217                                 } else {
6218                                         pmap_resident_count_dec(pmap, 1);
6219                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6220                                         m->md.pv_gen++;
6221                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
6222                                             TAILQ_EMPTY(&m->md.pv_list) &&
6223                                             (m->flags & PG_FICTITIOUS) == 0) {
6224                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6225                                                 if (TAILQ_EMPTY(&pvh->pv_list))
6226                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
6227                                         }
6228                                 }
6229                                 pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
6230                                 freed++;
6231                         }
6232                 }
6233                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6234                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6235                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6236                 if (allfree) {
6237                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6238                         free_pv_chunk(pc);
6239                 }
6240         }
6241         if (lock != NULL)
6242                 rw_wunlock(lock);
6243         pmap_invalidate_all(pmap);
6244         PMAP_UNLOCK(pmap);
6245         vm_page_free_pages_toq(&free, true);
6246 }
6247
     /*
      *      pmap_page_test_mappings:
      *
      *      Return TRUE if at least one mapping of the page "m" has all of
      *      the requested bits set, and FALSE otherwise.  When "modified" is
      *      set, the owning pmap's read/write and modified bits are required;
      *      when "accessed" is set, its valid and accessed bits are required.
      *      The scan covers the page's own pv list and, unless the page is
      *      fictitious, the pv list returned by pa_to_pvh(), whose entries
      *      are tested through their page directory entry.
      *
      *      The page's pv list lock is read-held for the duration of the
      *      scan, apart from the windows in which it is dropped in order to
      *      block on a pmap lock.
      *
      *      NOTE(review): if both "accessed" and "modified" are FALSE the
      *      mask is zero and the first mapping visited yields TRUE;
      *      pmap_is_modified() and pmap_is_referenced() each pass exactly
      *      one of them as TRUE.
      */
6248 static boolean_t
6249 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
6250 {
6251         struct rwlock *lock;
6252         pv_entry_t pv;
6253         struct md_page *pvh;
6254         pt_entry_t *pte, mask;
6255         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
6256         pmap_t pmap;
6257         int md_gen, pvh_gen;
6258         boolean_t rv;
6259
6260         rv = FALSE;
6261         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6262         rw_rlock(lock);
6263 restart:
             /* First pass: the mappings recorded on the page's own pv list. */
6264         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6265                 pmap = PV_PMAP(pv);
6266                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * The pmap lock is contended.  Give up the pv
                              * list lock, block on the pmap lock, and then
                              * retake the pv list lock.  If the list's
                              * generation count moved while it was unlocked,
                              * the iteration state is stale: start over.
                              */
6267                         md_gen = m->md.pv_gen;
6268                         rw_runlock(lock);
6269                         PMAP_LOCK(pmap);
6270                         rw_rlock(lock);
6271                         if (md_gen != m->md.pv_gen) {
6272                                 PMAP_UNLOCK(pmap);
6273                                 goto restart;
6274                         }
6275                 }
6276                 pte = pmap_pte(pmap, pv->pv_va);
                     /*
                      * The bit values are looked up per pmap, so the mask is
                      * rebuilt for every mapping.
                      */
6277                 mask = 0;
6278                 if (modified) {
6279                         PG_M = pmap_modified_bit(pmap);
6280                         PG_RW = pmap_rw_bit(pmap);
6281                         mask |= PG_RW | PG_M;
6282                 }
6283                 if (accessed) {
6284                         PG_A = pmap_accessed_bit(pmap);
6285                         PG_V = pmap_valid_bit(pmap);
6286                         mask |= PG_V | PG_A;
6287                 }
                     /* Every bit in the mask must be present in the entry. */
6288                 rv = (*pte & mask) == mask;
6289                 PMAP_UNLOCK(pmap);
6290                 if (rv)
6291                         goto out;
6292         }
             /*
              * Second pass: the pv list returned by pa_to_pvh().  It is not
              * consulted for fictitious pages.
              */
6293         if ((m->flags & PG_FICTITIOUS) == 0) {
6294                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
6295                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6296                         pmap = PV_PMAP(pv);
6297                         if (!PMAP_TRYLOCK(pmap)) {
                                     /*
                                      * Same lock hand-off as above, except
                                      * that a change to either list's
                                      * generation count forces a restart
                                      * from the first pass.
                                      */
6298                                 md_gen = m->md.pv_gen;
6299                                 pvh_gen = pvh->pv_gen;
6300                                 rw_runlock(lock);
6301                                 PMAP_LOCK(pmap);
6302                                 rw_rlock(lock);
6303                                 if (md_gen != m->md.pv_gen ||
6304                                     pvh_gen != pvh->pv_gen) {
6305                                         PMAP_UNLOCK(pmap);
6306                                         goto restart;
6307                                 }
6308                         }
                             /* The bits are tested in the page directory entry. */
6309                         pte = pmap_pde(pmap, pv->pv_va);
6310                         mask = 0;
6311                         if (modified) {
6312                                 PG_M = pmap_modified_bit(pmap);
6313                                 PG_RW = pmap_rw_bit(pmap);
6314                                 mask |= PG_RW | PG_M;
6315                         }
6316                         if (accessed) {
6317                                 PG_A = pmap_accessed_bit(pmap);
6318                                 PG_V = pmap_valid_bit(pmap);
6319                                 mask |= PG_V | PG_A;
6320                         }
6321                         rv = (*pte & mask) == mask;
6322                         PMAP_UNLOCK(pmap);
6323                         if (rv)
6324                                 goto out;
6325                 }
6326         }
6327 out:
6328         rw_runlock(lock);
6329         return (rv);
6330 }
6331
6332 /*
6333  *      pmap_is_modified:
6334  *
6335  *      Return whether or not the specified physical page was modified
6336  *      in any physical maps.
      *
      *      The page must be managed and its object must be write-locked;
      *      both conditions are asserted.  FALSE is returned without
      *      examining any mapping when the page is not exclusive busied and
      *      PGA_WRITEABLE is clear.  Otherwise the answer comes from
      *      pmap_page_test_mappings() with only "modified" requested.
6337  */
6338 boolean_t
6339 pmap_is_modified(vm_page_t m)
6340 {
6341
6342         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6343             ("pmap_is_modified: page %p is not managed", m));
6344
6345         /*
6346          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6347          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
6348          * is clear, no PTEs can have PG_M set.
6349          */
6350         VM_OBJECT_ASSERT_WLOCKED(m->object);
6351         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6352                 return (FALSE);
             /* Slow path: inspect the mappings themselves. */
6353         return (pmap_page_test_mappings(m, FALSE, TRUE));
6354 }
6355
6356 /*
6357  *      pmap_is_prefaultable:
6358  *
6359  *      Return whether or not the specified virtual address is eligible
6360  *      for prefault.
      *
      *      TRUE is returned only when all of the following hold: a page
      *      directory entry can be found for "addr", that entry is valid and
      *      does not have PG_PS set, and the page table entry for "addr"
      *      beneath it is not valid.  In every other case the result is
      *      FALSE.  The pmap lock is taken and released internally.
6361  */
6362 boolean_t
6363 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6364 {
6365         pd_entry_t *pde;
6366         pt_entry_t *pte, PG_V;
6367         boolean_t rv;
6368
6369         PG_V = pmap_valid_bit(pmap);
6370         rv = FALSE;
6371         PMAP_LOCK(pmap);
6372         pde = pmap_pde(pmap, addr);
             /* Require PG_V set and PG_PS clear in the directory entry. */
6373         if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) {
6374                 pte = pmap_pde_to_pte(pde, addr);
                     /* Eligible only if nothing is mapped at addr yet. */
6375                 rv = (*pte & PG_V) == 0;
6376         }
6377         PMAP_UNLOCK(pmap);
6378         return (rv);
6379 }
6380
6381 /*
6382  *      pmap_is_referenced:
6383  *
6384  *      Return whether or not the specified physical page was referenced
6385  *      in any physical maps.
      *
      *      The page must be managed (asserted).  The answer comes from
      *      pmap_page_test_mappings() with only "accessed" requested, so a
      *      mapping counts when both its valid and accessed bits are set.
6386  */
6387 boolean_t
6388 pmap_is_referenced(vm_page_t m)
6389 {
6390
6391         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6392             ("pmap_is_referenced: page %p is not managed", m));
6393         return (pmap_page_test_mappings(m, TRUE, FALSE));
6394 }
6395
6396 /*
6397  * Clear the write and modified bits in each of the given page's mappings.
      *
      * The work is done in two passes under the page's pv list write lock.
      * First, every mapping on the pa_to_pvh() list (pv_dummy is substituted
      * for a fictitious page) whose page directory entry has the read/write
      * bit set is handed to pmap_demote_pde_locked().  Second, every mapping
      * on the page's own pv list has its read/write and modified bits cleared
      * atomically; if the modified bit had been set the page is dirtied, and
      * pmap_invalidate_page() is called for the mapping's address.  Finally,
      * PGA_WRITEABLE is cleared on the page.
      *
      * The page must be managed and its object write-locked (both asserted).
      * Nothing is done when the page is not exclusive busied and
      * PGA_WRITEABLE is already clear.
6398  */
6399 void
6400 pmap_remove_write(vm_page_t m)
6401 {
6402         struct md_page *pvh;
6403         pmap_t pmap;
6404         struct rwlock *lock;
6405         pv_entry_t next_pv, pv;
6406         pd_entry_t *pde;
6407         pt_entry_t oldpte, *pte, PG_M, PG_RW;
6408         vm_offset_t va;
6409         int pvh_gen, md_gen;
6410
6411         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6412             ("pmap_remove_write: page %p is not managed", m));
6413
6414         /*
6415          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
6416          * set by another thread while the object is locked.  Thus,
6417          * if PGA_WRITEABLE is clear, no page table entries need updating.
6418          */
6419         VM_OBJECT_ASSERT_WLOCKED(m->object);
6420         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
6421                 return;
6422         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6423         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6424             pa_to_pvh(VM_PAGE_TO_PHYS(m));
6425 retry_pv_loop:
6426         rw_wlock(lock);
             /*
              * Pass 1: mappings on the pvh list.  The _SAFE iterator is used
              * because the loop body may cause the current entry to leave
              * this list.
              */
6427         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6428                 pmap = PV_PMAP(pv);
6429                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * The pmap lock is contended.  Drop the pv list
                              * lock, block on the pmap lock, and retake the
                              * pv list lock.  If the list's generation count
                              * changed in between, release everything and
                              * start the whole function body over.
                              */
6430                         pvh_gen = pvh->pv_gen;
6431                         rw_wunlock(lock);
6432                         PMAP_LOCK(pmap);
6433                         rw_wlock(lock);
6434                         if (pvh_gen != pvh->pv_gen) {
6435                                 PMAP_UNLOCK(pmap);
6436                                 rw_wunlock(lock);
6437                                 goto retry_pv_loop;
6438                         }
6439                 }
6440                 PG_RW = pmap_rw_bit(pmap);
6441                 va = pv->pv_va;
6442                 pde = pmap_pde(pmap, va);
                     /*
                      * Only writable mappings are demoted; the result of the
                      * demotion is deliberately ignored.
                      */
6443                 if ((*pde & PG_RW) != 0)
6444                         (void)pmap_demote_pde_locked(pmap, pde, va, &lock);
                     /*
                      * "lock" is passed by reference above; verify that it
                      * still refers to this page's pv list lock.
                      */
6445                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6446                     ("inconsistent pv lock %p %p for page %p",
6447                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6448                 PMAP_UNLOCK(pmap);
6449         }
             /* Pass 2: mappings on the page's own pv list. */
6450         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6451                 pmap = PV_PMAP(pv);
6452                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * Same lock hand-off as in pass 1, but a change
                              * to either list's generation count triggers
                              * the restart.
                              */
6453                         pvh_gen = pvh->pv_gen;
6454                         md_gen = m->md.pv_gen;
6455                         rw_wunlock(lock);
6456                         PMAP_LOCK(pmap);
6457                         rw_wlock(lock);
6458                         if (pvh_gen != pvh->pv_gen ||
6459                             md_gen != m->md.pv_gen) {
6460                                 PMAP_UNLOCK(pmap);
6461                                 rw_wunlock(lock);
6462                                 goto retry_pv_loop;
6463                         }
6464                 }
6465                 PG_M = pmap_modified_bit(pmap);
6466                 PG_RW = pmap_rw_bit(pmap);
6467                 pde = pmap_pde(pmap, pv->pv_va);
6468                 KASSERT((*pde & PG_PS) == 0,
6469                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
6470                     m));
6471                 pte = pmap_pde_to_pte(pde, pv->pv_va);
6472 retry:
                     /*
                      * Clear PG_RW and PG_M with a compare-and-set so that a
                      * concurrent change to the entry is not lost; on
                      * failure, re-read the entry and try again.
                      */
6473                 oldpte = *pte;
6474                 if (oldpte & PG_RW) {
6475                         if (!atomic_cmpset_long(pte, oldpte, oldpte &
6476                             ~(PG_RW | PG_M)))
6477                                 goto retry;
                             /* Carry the modified state over to the page. */
6478                         if ((oldpte & PG_M) != 0)
6479                                 vm_page_dirty(m);
6480                         pmap_invalidate_page(pmap, pv->pv_va);
6481                 }
6482                 PMAP_UNLOCK(pmap);
6483         }
6484         rw_wunlock(lock);
6485         vm_page_aflag_clear(m, PGA_WRITEABLE);
             /*
              * NOTE(review): presumably waits for outstanding delayed
              * invalidations that involve this page -- confirm against
              * pmap_delayed_invl_wait().
              */
6486         pmap_delayed_invl_wait(m);
6487 }
6488
     /*
      *      safe_to_clear_referenced:
      *
      *      Return TRUE if the referenced bit may be cleared in the entry
      *      "pte" belonging to "pmap", and FALSE otherwise.
      *
      *      For a pmap that does not emulate accessed/dirty bits the answer
      *      is always TRUE.  Otherwise the pmap must be of type PT_EPT
      *      (asserted), and the answer is FALSE if the entry has
      *      EPT_PG_WRITE set, or if it has EPT_PG_EXECUTE set while the pmap
      *      lacks PMAP_SUPPORTS_EXEC_ONLY.
      */
6489 static __inline boolean_t
6490 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
6491 {
6492
6493         if (!pmap_emulate_ad_bits(pmap))
6494                 return (TRUE);
6495
6496         KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
6497
6498         /*
6499          * XWR = 010 or 110 will cause an unconditional EPT misconfiguration
6500          * so we don't allow the referenced (aka EPT_PG_READ) bit to be
6501          * cleared if the EPT_PG_WRITE bit is set.
6502          */
6503         if ((pte & EPT_PG_WRITE) != 0)
6504                 return (FALSE);
6505
6506         /*
6507          * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set.
6508          */
6509         if ((pte & EPT_PG_EXECUTE) == 0 ||
6510             ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0))
6511                 return (TRUE);
6512         else
6513                 return (FALSE);
6514 }
6515
6516 /*
6517  *      pmap_ts_referenced:
6518  *
6519  *      Return a count of reference bits for a page, clearing those bits.
6520  *      It is not necessary for every reference bit to be cleared, but it
6521  *      is necessary that 0 only be returned when there are truly no
6522  *      reference bits set.
6523  *
6524  *      As an optimization, update the page's dirty field if a modified bit is
6525  *      found while counting reference bits.  This opportunistic update can be
6526  *      performed at low cost and can eliminate the need for some future calls
6527  *      to pmap_is_modified().  However, since this function stops after
6528  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6529  *      dirty pages.  Those dirty pages will only be detected by a future call
6530  *      to pmap_is_modified().
6531  *
6532  *      A DI block is not needed within this function, because
6533  *      invalidations are performed before the PV list lock is
6534  *      released.
      *
      *      The returned value is the sum of the reference bits that were
      *      cleared (including mappings that were removed instead) and those
      *      that were found set but left alone.  The page must be managed
      *      (asserted).
6535  */
6536 int
6537 pmap_ts_referenced(vm_page_t m)
6538 {
6539         struct md_page *pvh;
6540         pv_entry_t pv, pvf;
6541         pmap_t pmap;
6542         struct rwlock *lock;
6543         pd_entry_t oldpde, *pde;
6544         pt_entry_t *pte, PG_A, PG_M, PG_RW;
6545         vm_offset_t va;
6546         vm_paddr_t pa;
6547         int cleared, md_gen, not_cleared, pvh_gen;
6548         struct spglist free;
6549         boolean_t demoted;
6550
6551         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6552             ("pmap_ts_referenced: page %p is not managed", m));
6553         SLIST_INIT(&free);
6554         cleared = 0;
6555         pa = VM_PAGE_TO_PHYS(m);
6556         lock = PHYS_TO_PV_LIST_LOCK(pa);
6557         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
6558         rw_wlock(lock);
6559 retry:
             /*
              * A retry recounts the bits that are left set, whereas
              * "cleared" keeps accumulating across retries.
              */
6560         not_cleared = 0;
6561         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6562                 goto small_mappings;
6563         pv = pvf;
             /*
              * Visit the 2MB mappings.  Each visited entry is rotated to the
              * tail of the list, so the loop always works on the list head
              * and stops once the head is again "pvf", the first entry that
              * was visited.
              */
6564         do {
                     /*
                      * "pvf" is reset to NULL when the entry it named has
                      * been destroyed; adopt the current entry in its place.
                      */
6565                 if (pvf == NULL)
6566                         pvf = pv;
6567                 pmap = PV_PMAP(pv);
6568                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * The pmap lock is contended.  Drop the pv list
                              * lock, block on the pmap lock, and retake the
                              * pv list lock.  Restart if the list's
                              * generation count changed in between.
                              */
6569                         pvh_gen = pvh->pv_gen;
6570                         rw_wunlock(lock);
6571                         PMAP_LOCK(pmap);
6572                         rw_wlock(lock);
6573                         if (pvh_gen != pvh->pv_gen) {
6574                                 PMAP_UNLOCK(pmap);
6575                                 goto retry;
6576                         }
6577                 }
6578                 PG_A = pmap_accessed_bit(pmap);
6579                 PG_M = pmap_modified_bit(pmap);
6580                 PG_RW = pmap_rw_bit(pmap);
6581                 va = pv->pv_va;
6582                 pde = pmap_pde(pmap, pv->pv_va);
6583                 oldpde = *pde;
6584                 if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6585                         /*
6586                          * Although "oldpde" is mapping a 2MB page, because
6587                          * this function is called at a 4KB page granularity,
6588                          * we only update the 4KB page under test.
6589                          */
6590                         vm_page_dirty(m);
6591                 }
6592                 if ((oldpde & PG_A) != 0) {
6593                         /*
6594                          * Since this reference bit is shared by 512 4KB
6595                          * pages, it should not be cleared every time it is
6596                          * tested.  Apply a simple "hash" function on the
6597                          * physical page number, the virtual superpage number,
6598                          * and the pmap address to select one 4KB page out of
6599                          * the 512 on which testing the reference bit will
6600                          * result in clearing that reference bit.  This
6601                          * function is designed to avoid the selection of the
6602                          * same 4KB page for every 2MB page mapping.
6603                          *
6604                          * On demotion, a mapping that hasn't been referenced
6605                          * is simply destroyed.  To avoid the possibility of a
6606                          * subsequent page fault on a demoted wired mapping,
6607                          * always leave its reference bit set.  Moreover,
6608                          * since the superpage is wired, the current state of
6609                          * its reference bit won't affect page replacement.
6610                          */
6611                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
6612                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
6613                             (oldpde & PG_W) == 0) {
6614                                 if (safe_to_clear_referenced(pmap, oldpde)) {
6615                                         atomic_clear_long(pde, PG_A);
6616                                         pmap_invalidate_page(pmap, pv->pv_va);
6617                                         demoted = FALSE;
6618                                 } else if (pmap_demote_pde_locked(pmap, pde,
6619                                     pv->pv_va, &lock)) {
6620                                         /*
6621                                          * Remove the mapping to a single page
6622                                          * so that a subsequent access may
6623                                          * repromote.  Since the underlying
6624                                          * page table page is fully populated,
6625                                          * this removal never frees a page
6626                                          * table page.
6627                                          */
6628                                         demoted = TRUE;
                                             /*
                                              * Advance "va" by the offset of
                                              * this page within the 2MB
                                              * frame so that it addresses
                                              * the 4KB mapping of "m".
                                              */
6629                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
6630                                             PG_PS_FRAME);
6631                                         pte = pmap_pde_to_pte(pde, va);
6632                                         pmap_remove_pte(pmap, pte, va, *pde,
6633                                             NULL, &lock);
6634                                         pmap_invalidate_page(pmap, va);
6635                                 } else
                                             /*
                                              * The demotion failed; "pv" is
                                              * treated as gone, exactly as
                                              * after a successful demotion.
                                              */
6636                                         demoted = TRUE;
6637
6638                                 if (demoted) {
6639                                         /*
6640                                          * The superpage mapping was removed
6641                                          * entirely and therefore 'pv' is no
6642                                          * longer valid.
6643                                          */
6644                                         if (pvf == pv)
6645                                                 pvf = NULL;
6646                                         pv = NULL;
6647                                 }
6648                                 cleared++;
6649                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6650                                     ("inconsistent pv lock %p %p for page %p",
6651                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6652                         } else
6653                                 not_cleared++;
6654                 }
6655                 PMAP_UNLOCK(pmap);
6656                 /* Rotate the PV list if it has more than one entry. */
6657                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6658                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6659                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6660                         pvh->pv_gen++;
6661                 }
                     /* Stop early once enough reference bits were counted. */
6662                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6663                         goto out;
6664         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6665 small_mappings:
6666         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6667                 goto out;
6668         pv = pvf;
             /*
              * Visit the 4KB mappings on the page's own pv list, using the
              * same rotate-and-stop-at-"pvf" scheme as above.
              */
6669         do {
6670                 if (pvf == NULL)
6671                         pvf = pv;
6672                 pmap = PV_PMAP(pv);
6673                 if (!PMAP_TRYLOCK(pmap)) {
                             /*
                              * Same lock hand-off as above; here a change to
                              * either list's generation count sends control
                              * back to "retry", i.e. to the 2MB pass.
                              */
6674                         pvh_gen = pvh->pv_gen;
6675                         md_gen = m->md.pv_gen;
6676                         rw_wunlock(lock);
6677                         PMAP_LOCK(pmap);
6678                         rw_wlock(lock);
6679                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6680                                 PMAP_UNLOCK(pmap);
6681                                 goto retry;
6682                         }
6683                 }
6684                 PG_A = pmap_accessed_bit(pmap);
6685                 PG_M = pmap_modified_bit(pmap);
6686                 PG_RW = pmap_rw_bit(pmap);
6687                 pde = pmap_pde(pmap, pv->pv_va);
6688                 KASSERT((*pde & PG_PS) == 0,
6689                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
6690                     m));
6691                 pte = pmap_pde_to_pte(pde, pv->pv_va);
                     /* Opportunistically record a writable, modified mapping. */
6692                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
6693                         vm_page_dirty(m);
6694                 if ((*pte & PG_A) != 0) {
6695                         if (safe_to_clear_referenced(pmap, *pte)) {
6696                                 atomic_clear_long(pte, PG_A);
6697                                 pmap_invalidate_page(pmap, pv->pv_va);
6698                                 cleared++;
6699                         } else if ((*pte & PG_W) == 0) {
6700                                 /*
6701                                  * Wired pages cannot be paged out so
6702                                  * doing accessed bit emulation for
6703                                  * them is wasted effort. We do the
6704                                  * hard work for unwired pages only.
6705                                  */
6706                                 pmap_remove_pte(pmap, pte, pv->pv_va,
6707                                     *pde, &free, &lock);
6708                                 pmap_invalidate_page(pmap, pv->pv_va);
6709                                 cleared++;
                                     /*
                                      * The mapping was removed, so "pv" must
                                      * not be used again.
                                      */
6710                                 if (pvf == pv)
6711                                         pvf = NULL;
6712                                 pv = NULL;
6713                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
6714                                     ("inconsistent pv lock %p %p for page %p",
6715                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
6716                         } else
6717                                 not_cleared++;
6718                 }
6719                 PMAP_UNLOCK(pmap);
6720                 /* Rotate the PV list if it has more than one entry. */
6721                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
6722                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6723                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6724                         m->md.pv_gen++;
6725                 }
6726         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6727             not_cleared < PMAP_TS_REFERENCED_MAX);
6728 out:
6729         rw_wunlock(lock);
             /*
              * Release whatever the pmap_remove_pte() calls above queued on
              * "free", now that the pv list lock has been dropped.
              */
6730         vm_page_free_pages_toq(&free, true);
6731         return (cleared + not_cleared);
6732 }
6733
6734 /*
6735  *      Apply the given advice to the specified range of addresses within the
6736  *      given pmap.  Depending on the advice, clear the referenced and/or
6737  *      modified flags in each mapping and set the mapped page's dirty field.
      *
      *      Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
      *      returns immediately, as does a pmap that emulates accessed/dirty
      *      bits.  For a writable, modified mapping both advices clear the
      *      modified and accessed bits, but only MADV_DONTNEED first marks
      *      the page dirty.  A managed 2MB mapping inside the range is
      *      demoted before its 4KB entries are processed.
6738  */
6739 void
6740 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6741 {
6742         struct rwlock *lock;
6743         pml4_entry_t *pml4e;
6744         pdp_entry_t *pdpe;
6745         pd_entry_t oldpde, *pde;
6746         pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
6747         vm_offset_t va, va_next;
6748         vm_page_t m;
6749         boolean_t anychanged;
6750
6751         if (advice != MADV_DONTNEED && advice != MADV_FREE)
6752                 return;
6753
6754         /*
6755          * A/D bit emulation requires an alternate code path when clearing
6756          * the modified and accessed bits below. Since this function is
6757          * advisory in nature we skip it entirely for pmaps that require
6758          * A/D bit emulation.
6759          */
6760         if (pmap_emulate_ad_bits(pmap))
6761                 return;
6762
6763         PG_A = pmap_accessed_bit(pmap);
6764         PG_G = pmap_global_bit(pmap);
6765         PG_M = pmap_modified_bit(pmap);
6766         PG_V = pmap_valid_bit(pmap);
6767         PG_RW = pmap_rw_bit(pmap);
6768         anychanged = FALSE;
6769         pmap_delayed_invl_started();
6770         PMAP_LOCK(pmap);
             /*
              * Walk the range one page directory entry at a time.  "va_next"
              * is the start of the next region to examine; whenever its
              * computation wraps around (va_next < sva) it is clamped to
              * "eva" so that the loop terminates.
              */
6771         for (; sva < eva; sva = va_next) {
6772                 pml4e = pmap_pml4e(pmap, sva);
6773                 if ((*pml4e & PG_V) == 0) {
6774                         va_next = (sva + NBPML4) & ~PML4MASK;
6775                         if (va_next < sva)
6776                                 va_next = eva;
6777                         continue;
6778                 }
6779                 pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
6780                 if ((*pdpe & PG_V) == 0) {
6781                         va_next = (sva + NBPDP) & ~PDPMASK;
6782                         if (va_next < sva)
6783                                 va_next = eva;
6784                         continue;
6785                 }
6786                 va_next = (sva + NBPDR) & ~PDRMASK;
6787                 if (va_next < sva)
6788                         va_next = eva;
6789                 pde = pmap_pdpe_to_pde(pdpe, sva);
6790                 oldpde = *pde;
6791                 if ((oldpde & PG_V) == 0)
6792                         continue;
6793                 else if ((oldpde & PG_PS) != 0) {
                             /* Unmanaged large mappings are left untouched. */
6794                         if ((oldpde & PG_MANAGED) == 0)
6795                                 continue;
6796                         lock = NULL;
6797                         if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
6798                                 if (lock != NULL)
6799                                         rw_wunlock(lock);
6800
6801                                 /*
6802                                  * The large page mapping was destroyed.
6803                                  */
6804                                 continue;
6805                         }
6806
6807                         /*
6808                          * Unless the page mappings are wired, remove the
6809                          * mapping to a single page so that a subsequent
6810                          * access may repromote.  Since the underlying page
6811                          * table page is fully populated, this removal never
6812                          * frees a page table page.
6813                          */
6814                         if ((oldpde & PG_W) == 0) {
6815                                 pte = pmap_pde_to_pte(pde, sva);
6816                                 KASSERT((*pte & PG_V) != 0,
6817                                     ("pmap_advise: invalid PTE"));
6818                                 pmap_remove_pte(pmap, pte, sva, *pde, NULL,
6819                                     &lock);
6820                                 anychanged = TRUE;
6821                         }
6822                         if (lock != NULL)
6823                                 rw_wunlock(lock);
                             /*
                              * Control falls through so that the 4KB entries
                              * produced by the demotion are processed below.
                              */
6824                 }
6825                 if (va_next > eva)
6826                         va_next = eva;
                     /*
                      * "va" is the start of a run of changed global (PG_G)
                      * mappings that still needs pmap_invalidate_range();
                      * va == va_next means that no such run is pending.
                      */
6827                 va = va_next;
6828                 for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
6829                     sva += PAGE_SIZE) {
                             /* Entries that are not both managed and valid are skipped. */
6830                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
6831                                 goto maybe_invlrng;
6832                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6833                                 if (advice == MADV_DONTNEED) {
6834                                         /*
6835                                          * Future calls to pmap_is_modified()
6836                                          * can be avoided by making the page
6837                                          * dirty now.
6838                                          */
6839                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
6840                                         vm_page_dirty(m);
6841                                 }
6842                                 atomic_clear_long(pte, PG_M | PG_A);
6843                         } else if ((*pte & PG_A) != 0)
6844                                 atomic_clear_long(pte, PG_A);
6845                         else
6846                                 goto maybe_invlrng;
6847
                             /*
                              * The entry was changed.  A global mapping
                              * opens (or extends) the pending range, while
                              * any other mapping is covered by the single
                              * pmap_invalidate_all() issued at the end.
                              */
6848                         if ((*pte & PG_G) != 0) {
6849                                 if (va == va_next)
6850                                         va = sva;
6851                         } else
6852                                 anychanged = TRUE;
6853                         continue;
6854 maybe_invlrng:
                             /*
                              * This entry was not changed, so a pending
                              * range, if any, ends here: flush [va, sva)
                              * and mark it as no longer pending.
                              */
6855                         if (va != va_next) {
6856                                 pmap_invalidate_range(pmap, va, sva);
6857                                 va = va_next;
6858                         }
6859                 }
                     /* Flush a range that extends to the end of this chunk. */
6860                 if (va != va_next)
6861                         pmap_invalidate_range(pmap, va, sva);
6862         }
6863         if (anychanged)
6864                 pmap_invalidate_all(pmap);
6865         PMAP_UNLOCK(pmap);
6866         pmap_delayed_invl_finished();
6867 }
6868
6869 /*
6870  *      Clear the modify bits on the specified physical page.
      *
      *      The page must be managed, its object must be write-locked, and
      *      the page must not be exclusive busied; all three are asserted
      *      below.  Nothing is done if the page is not PGA_WRITEABLE.
      *
      *      The first loop visits the pv list headed by "pvh" (pv_dummy for
      *      a fictitious page, otherwise the list returned by pa_to_pvh()).
      *      Each mapping whose PDE has PG_RW set is demoted with
      *      pmap_demote_pde_locked(); if that succeeds and PG_W is clear in
      *      the old PDE, the valid 4KB PTE that maps "m" loses PG_M and PG_RW,
      *      "m" is marked dirty, and the page's TLB entry is invalidated.
      *
      *      The second loop visits the page's own pv list and clears PG_M
      *      from every PTE that has both PG_M and PG_RW set.
6871  */
6872 void
6873 pmap_clear_modify(vm_page_t m)
6874 {
6875         struct md_page *pvh;
6876         pmap_t pmap;
6877         pv_entry_t next_pv, pv;
6878         pd_entry_t oldpde, *pde;
6879         pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
6880         struct rwlock *lock;
6881         vm_offset_t va;
6882         int md_gen, pvh_gen;
6883
6884         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6885             ("pmap_clear_modify: page %p is not managed", m));
6886         VM_OBJECT_ASSERT_WLOCKED(m->object);
6887         KASSERT(!vm_page_xbusied(m),
6888             ("pmap_clear_modify: page %p is exclusive busied", m));
6889
6890         /*
6891          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
6892          * If the object containing the page is locked and the page is not
6893          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
6894          */
6895         if ((m->aflags & PGA_WRITEABLE) == 0)
6896                 return;
6897         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
6898             pa_to_pvh(VM_PAGE_TO_PHYS(m));
6899         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6900         rw_wlock(lock);
6901 restart:
6902         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6903                 pmap = PV_PMAP(pv);
                     /*
                      * If the pmap lock cannot be taken without blocking, drop
                      * the pv list lock, block on the pmap lock, retake the pv
                      * list lock, and rescan from the start if the list's
                      * generation count changed while it was unlocked.
                      */
6904                 if (!PMAP_TRYLOCK(pmap)) {
6905                         pvh_gen = pvh->pv_gen;
6906                         rw_wunlock(lock);
6907                         PMAP_LOCK(pmap);
6908                         rw_wlock(lock);
6909                         if (pvh_gen != pvh->pv_gen) {
6910                                 PMAP_UNLOCK(pmap);
6911                                 goto restart;
6912                         }
6913                 }
                     /* The PTE bit encodings are looked up per pmap. */
6914                 PG_M = pmap_modified_bit(pmap);
6915                 PG_V = pmap_valid_bit(pmap);
6916                 PG_RW = pmap_rw_bit(pmap);
6917                 va = pv->pv_va;
6918                 pde = pmap_pde(pmap, va);
6919                 oldpde = *pde;
6920                 if ((oldpde & PG_RW) != 0) {
6921                         if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
6922                                 if ((oldpde & PG_W) == 0) {
6923                                         /*
6924                                          * Write protect the mapping to a
6925                                          * single page so that a subsequent
6926                                          * write access may repromote.
6927                                          */
                                             /*
                                              * Advance va by m's offset within
                                              * the old PDE's physical frame.
                                              */
6928                                         va += VM_PAGE_TO_PHYS(m) - (oldpde &
6929                                             PG_PS_FRAME);
6930                                         pte = pmap_pde_to_pte(pde, va);
6931                                         oldpte = *pte;
6932                                         if ((oldpte & PG_V) != 0) {
                                                     /*
                                                      * Retry with a fresh copy
                                                      * of the PTE until the
                                                      * compare-and-set succeeds.
                                                      */
6933                                                 while (!atomic_cmpset_long(pte,
6934                                                     oldpte,
6935                                                     oldpte & ~(PG_M | PG_RW)))
6936                                                         oldpte = *pte;
6937                                                 vm_page_dirty(m);
6938                                                 pmap_invalidate_page(pmap, va);
6939                                         }
6940                                 }
6941                         }
6942                 }
6943                 PMAP_UNLOCK(pmap);
6944         }
6945         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6946                 pmap = PV_PMAP(pv);
                     /*
                      * Same lock hand-off as above, but both generation counts
                      * are checked and a change in either restarts the whole
                      * function body from "restart".
                      */
6947                 if (!PMAP_TRYLOCK(pmap)) {
6948                         md_gen = m->md.pv_gen;
6949                         pvh_gen = pvh->pv_gen;
6950                         rw_wunlock(lock);
6951                         PMAP_LOCK(pmap);
6952                         rw_wlock(lock);
6953                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6954                                 PMAP_UNLOCK(pmap);
6955                                 goto restart;
6956                         }
6957                 }
6958                 PG_M = pmap_modified_bit(pmap);
6959                 PG_RW = pmap_rw_bit(pmap);
6960                 pde = pmap_pde(pmap, pv->pv_va);
6961                 KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
6962                     " a 2mpage in page %p's pv list", m));
6963                 pte = pmap_pde_to_pte(pde, pv->pv_va);
                     /* Only a modified, writeable PTE needs PG_M cleared. */
6964                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
6965                         atomic_clear_long(pte, PG_M);
6966                         pmap_invalidate_page(pmap, pv->pv_va);
6967                 }
6968                 PMAP_UNLOCK(pmap);
6969         }
6970         rw_wunlock(lock);
6971 }
6972
6973 /*
6974  * Miscellaneous support routines follow
6975  */
6976
6977 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
     /*
      * "mask" selects the bits to clear in the low 32 bits of *pte and
      * "cache_bits" supplies the bits to set in their place.  The update is
      * a compare-and-set that is retried until it succeeds; no store is
      * attempted when the new value equals the old one.
      */
6978 static __inline void
6979 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
6980 {
6981         u_int opte, npte;
6982
6983         /*
6984          * The cache mode bits are all in the low 32-bits of the
6985          * PTE, so we can just spin on updating the low 32-bits.
6986          */
6987         do {
6988                 opte = *(u_int *)pte;
6989                 npte = opte & ~mask;
6990                 npte |= cache_bits;
6991         } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte));
6992 }
6993
6994 /* Adjust the cache mode for a 2MB page mapped via a PDE. */
     /*
      * "mask" selects the bits to clear in the low 32 bits of *pde and
      * "cache_bits" supplies the bits to set in their place.  The update is
      * a compare-and-set that is retried until it succeeds; no store is
      * attempted when the new value equals the old one.
      * pmap_change_attr_locked() also passes a 1GB page's PDPE to this
      * function.
      */
6995 static __inline void
6996 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
6997 {
6998         u_int opde, npde;
6999
7000         /*
7001          * The cache mode bits are all in the low 32-bits of the
7002          * PDE, so we can just spin on updating the low 32-bits.
7003          */
7004         do {
7005                 opde = *(u_int *)pde;
7006                 npde = opde & ~mask;
7007                 npde |= cache_bits;
7008         } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde));
7009 }
7010
7011 /*
7012  * Map a set of physical memory pages into the kernel virtual
7013  * address space. Return a pointer to where it is mapped. This
7014  * routine is intended to be used for mapping device memory,
7015  * NOT real memory.
      *
      * "pa" and "size" need not be page aligned: the range is widened to
      * whole pages and the returned pointer keeps pa's offset within its
      * page.  "mode" is the memory type handed to pmap_kenter_attr() or
      * pmap_change_attr().
      *
      * Before the pmap is initialized, the mapping is recorded in the first
      * free slot of pmap_preinit_mapping[] and its KVA is carved from
      * virtual_avail; the function panics if no slot is free.
      *
      * After initialization, in order of preference:
      *  1. an identical preinit mapping (same pa, size and mode) is reused;
      *  2. if the range lies within the direct map and pmap_change_attr()
      *     succeeds on it, the direct map address is returned;
      *  3. otherwise fresh KVA is allocated (panic on failure) and each page
      *     is entered with pmap_kenter_attr(), followed by TLB and cache
      *     range invalidations.
7016  */
7017 void *
7018 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
7019 {
7020         struct pmap_preinit_mapping *ppim;
7021         vm_offset_t va, offset;
7022         vm_size_t tmpsize;
7023         int i;
7024
7025         offset = pa & PAGE_MASK;
7026         size = round_page(offset + size);
7027         pa = trunc_page(pa);
7028
7029         if (!pmap_initialized) {
7030                 va = 0;
                     /* A slot whose va is zero is unused. */
7031                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7032                         ppim = pmap_preinit_mapping + i;
7033                         if (ppim->va == 0) {
7034                                 ppim->pa = pa;
7035                                 ppim->sz = size;
7036                                 ppim->mode = mode;
7037                                 ppim->va = virtual_avail;
7038                                 virtual_avail += size;
7039                                 va = ppim->va;
7040                                 break;
7041                         }
7042                 }
7043                 if (va == 0)
7044                         panic("%s: too many preinit mappings", __func__);
7045         } else {
7046                 /*
7047                  * If we have a preinit mapping, re-use it.
7048                  */
7049                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7050                         ppim = pmap_preinit_mapping + i;
7051                         if (ppim->pa == pa && ppim->sz == size &&
7052                             ppim->mode == mode)
7053                                 return ((void *)(ppim->va + offset));
7054                 }
7055                 /*
7056                  * If the specified range of physical addresses fits within
7057                  * the direct map window, use the direct map.
                      *
                      * The range is [pa, pa + size), so it still fits when it
                      * ends exactly at dmaplimit; hence "<=" for the end.
7058                  */
7059                 if (pa < dmaplimit && pa + size <= dmaplimit) {
7060                         va = PHYS_TO_DMAP(pa);
                             /* pmap_change_attr() returns zero on success. */
7061                         if (!pmap_change_attr(va, size, mode))
7062                                 return ((void *)(va + offset));
7063                 }
7064                 va = kva_alloc(size);
7065                 if (va == 0)
7066                         panic("%s: Couldn't allocate KVA", __func__);
7067         }
             /* On loop exit tmpsize == size, so the ranges below cover it all. */
7068         for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
7069                 pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
7070         pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
7071         pmap_invalidate_cache_range(va, va + tmpsize);
7072         return ((void *)(va + offset));
7073 }
7074
     /*
      * Map the physical range starting at "pa" and spanning "size" bytes into
      * kernel virtual address space with the PAT_UNCACHEABLE memory type.
      * Returns whatever pmap_mapdev_attr() returns for that request.
      */
7075 void *
7076 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
7077 {
7078
7079         return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
7080 }
7081
     /*
      * Map the physical range starting at "pa" and spanning "size" bytes into
      * kernel virtual address space with the PAT_WRITE_BACK memory type.
      * Returns whatever pmap_mapdev_attr() returns for that request.
      */
7082 void *
7083 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7084 {
7085
7086         return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
7087 }
7088
     /*
      * Release a mapping previously returned by pmap_mapdev_attr().
      *
      * "va" and "size" are widened to whole pages before use.  Addresses in
      * the direct map are ignored.  A mapping recorded in
      * pmap_preinit_mapping[] is left in place once the pmap is initialized;
      * before that its slot is cleared and, if it was the most recent carve
      * from virtual_avail, virtual_avail is rewound.  Any other mapping has
      * its KVA freed, but only when the pmap is initialized.
      */
7089 void
7090 pmap_unmapdev(vm_offset_t va, vm_size_t size)
7091 {
7092         struct pmap_preinit_mapping *ppim;
7093         vm_offset_t offset;
7094         int i;
7095
7096         /* If we gave a direct map region in pmap_mapdev, do nothing */
7097         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
7098                 return;
7099         offset = va & PAGE_MASK;
7100         size = round_page(offset + size);
7101         va = trunc_page(va);
7102         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7103                 ppim = pmap_preinit_mapping + i;
7104                 if (ppim->va == va && ppim->sz == size) {
                             /* Keep the slot so pmap_mapdev_attr() can reuse it. */
7105                         if (pmap_initialized)
7106                                 return;
7107                         ppim->pa = 0;
7108                         ppim->va = 0;
7109                         ppim->sz = 0;
7110                         ppim->mode = 0;
                             /* Give the KVA back only if it was the last carve. */
7111                         if (va + size == virtual_avail)
7112                                 virtual_avail = va;
7113                         return;
7114                 }
7115         }
7116         if (pmap_initialized)
7117                 kva_free(va, size);
7118 }
7119
7120 /*
7121  * Tries to demote a 1GB page mapping.
      *
      * The pmap lock must be held (asserted) and *pdpe must have both PG_PS
      * and PG_V set (asserted).  A wired page is allocated to hold the new
      * page directory; each of its NPDEPG entries starts as a copy of the old
      * PDPE and advances by NBPDR from the previous one.  The PDPE is then
      * rewritten to point at that page.
      *
      * Returns FALSE if the page allocation fails, in which case *pdpe is
      * left untouched, and TRUE otherwise.
7122  */
7123 static boolean_t
7124 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
7125 {
7126         pdp_entry_t newpdpe, oldpdpe;
7127         pd_entry_t *firstpde, newpde, *pde;
7128         pt_entry_t PG_A, PG_M, PG_RW, PG_V;
7129         vm_paddr_t pdpgpa;
7130         vm_page_t pdpg;
7131
7132         PG_A = pmap_accessed_bit(pmap);
7133         PG_M = pmap_modified_bit(pmap);
7134         PG_V = pmap_valid_bit(pmap);
7135         PG_RW = pmap_rw_bit(pmap);
7136
7137         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7138         oldpdpe = *pdpe;
7139         KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
7140             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
7141         if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
7142             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
7143                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
7144                     " in pmap %p", va, pmap);
7145                 return (FALSE);
7146         }
7147         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
             /* The new page directory page is written through the direct map. */
7148         firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
             /* Only PG_U is carried over from the old PDPE into the new one. */
7149         newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
7150         KASSERT((oldpdpe & PG_A) != 0,
7151             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
7152         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
7153             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
             /* The first PDE is the old PDPE itself, PG_PS included. */
7154         newpde = oldpdpe;
7155
7156         /*
7157          * Initialize the page directory page.
7158          */
7159         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
7160                 *pde = newpde;
7161                 newpde += NBPDR;
7162         }
7163
7164         /*
7165          * Demote the mapping.
7166          */
7167         *pdpe = newpdpe;
7168
7169         /*
7170          * Invalidate a stale recursive mapping of the page directory page.
7171          */
7172         pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
7173
7174         pmap_pdpe_demotions++;
7175         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
7176             " in pmap %p", va, pmap);
7177         return (TRUE);
7178 }
7179
7180 /*
7181  * Sets the memory attribute for the specified page.
      *
      * The attribute is stored in m->md.pat_mode.  For a page that is not
      * fictitious, the page's direct map mapping is changed to match, and a
      * failure of that change is fatal (panic).
7182  */
7183 void
7184 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7185 {
7186
7187         m->md.pat_mode = ma;
7188
7189         /*
7190          * If "m" is a normal page, update its direct mapping.  This update
7191          * can be relied upon to perform any cache operations that are
7192          * required for data coherence.
7193          */
7194         if ((m->flags & PG_FICTITIOUS) == 0 &&
7195             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7196             m->md.pat_mode))
7197                 panic("memory attribute change on the direct map failed");
7198 }
7199
7200 /*
7201  * Changes the specified virtual address range's memory type to that given by
7202  * the parameter "mode".  The specified virtual address range must be
7203  * completely contained within either the direct map or the kernel map.  If
7204  * the virtual address range is contained within the kernel map, then the
7205  * memory type for each of the corresponding ranges of the direct map is also
7206  * changed.  (The corresponding ranges of the direct map are those ranges that
7207  * map the same physical pages as the specified virtual address range.)  These
7208  * changes to the direct map are necessary because Intel describes the
7209  * behavior of their processors as "undefined" if two or more mappings to the
7210  * same physical page have different memory types.
7211  *
7212  * Returns zero if the change completed successfully, and either EINVAL or
7213  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7214  * of the virtual address range was not mapped, and ENOMEM is returned if
7215  * there was insufficient memory available to complete the change.  In the
7216  * latter case, the memory type may have been changed on some part of the
7217  * virtual address range or the direct map.
      *
      * This function acquires the kernel pmap lock, delegates the work to
      * pmap_change_attr_locked(), and releases the lock before returning.
7218  */
7219 int
7220 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7221 {
7222         int error;
7223
7224         PMAP_LOCK(kernel_pmap);
7225         error = pmap_change_attr_locked(va, size, mode);
7226         PMAP_UNLOCK(kernel_pmap);
7227         return (error);
7228 }
7229
     /*
      * Does the work of pmap_change_attr() with the kernel pmap lock already
      * held (asserted).
      *
      * The range is widened to whole pages.  A first pass checks that every
      * page in it is mapped and demotes any 1GB or 2MB page that must change
      * memory type but is only partly covered by the range.  A second pass
      * rewrites the cache bits of each PDPE, PDE or PTE that differs from the
      * requested mode.  For addresses at or above VM_MIN_KERNEL_ADDRESS whose
      * physical frame is below dmaplimit, it also gathers contiguous physical
      * runs and recurses on the matching direct map addresses.
      *
      * Returns 0 on success, EINVAL if the range starts below
      * DMAP_MIN_ADDRESS or contains an unmapped page, ENOMEM if a demotion
      * fails, or the error from a recursive call.
      */
7230 static int
7231 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
7232 {
7233         vm_offset_t base, offset, tmpva;
7234         vm_paddr_t pa_start, pa_end, pa_end1;
7235         pdp_entry_t *pdpe;
7236         pd_entry_t *pde;
7237         pt_entry_t *pte;
7238         int cache_bits_pte, cache_bits_pde, error;
7239         boolean_t changed;
7240
7241         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7242         base = trunc_page(va);
7243         offset = va & PAGE_MASK;
7244         size = round_page(offset + size);
7245
7246         /*
7247          * Only supported on kernel virtual addresses, including the direct
7248          * map but excluding the recursive map.
7249          */
7250         if (base < DMAP_MIN_ADDRESS)
7251                 return (EINVAL);
7252
             /* The cache bits are computed separately for PDE and PTE use. */
7253         cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
7254         cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
7255         changed = FALSE;
7256
7257         /*
7258          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
7259          * into 4KB pages if required.
7260          */
7261         for (tmpva = base; tmpva < base + size; ) {
7262                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
7263                 if (pdpe == NULL || *pdpe == 0)
7264                         return (EINVAL);
7265                 if (*pdpe & PG_PS) {
7266                         /*
7267                          * If the current 1GB page already has the required
7268                          * memory type, then we need not demote this page. Just
7269                          * increment tmpva to the next 1GB page frame.
7270                          */
7271                         if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
7272                                 tmpva = trunc_1gpage(tmpva) + NBPDP;
7273                                 continue;
7274                         }
7275
7276                         /*
7277                          * If the current offset aligns with a 1GB page frame
7278                          * and there is at least 1GB left within the range, then
7279                          * we need not break down this page into 2MB pages.
7280                          */
7281                         if ((tmpva & PDPMASK) == 0 &&
7282                             tmpva + PDPMASK < base + size) {
7283                                 tmpva += NBPDP;
7284                                 continue;
7285                         }
7286                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
7287                                 return (ENOMEM);
7288                 }
7289                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
7290                 if (*pde == 0)
7291                         return (EINVAL);
7292                 if (*pde & PG_PS) {
7293                         /*
7294                          * If the current 2MB page already has the required
7295                          * memory type, then we need not demote this page. Just
7296                          * increment tmpva to the next 2MB page frame.
7297                          */
7298                         if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
7299                                 tmpva = trunc_2mpage(tmpva) + NBPDR;
7300                                 continue;
7301                         }
7302
7303                         /*
7304                          * If the current offset aligns with a 2MB page frame
7305                          * and there is at least 2MB left within the range, then
7306                          * we need not break down this page into 4KB pages.
7307                          */
7308                         if ((tmpva & PDRMASK) == 0 &&
7309                             tmpva + PDRMASK < base + size) {
7310                                 tmpva += NBPDR;
7311                                 continue;
7312                         }
7313                         if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
7314                                 return (ENOMEM);
7315                 }
7316                 pte = pmap_pde_to_pte(pde, tmpva);
7317                 if (*pte == 0)
7318                         return (EINVAL);
7319                 tmpva += PAGE_SIZE;
7320         }
7321         error = 0;
7322
7323         /*
7324          * Ok, all the pages exist, so run through them updating their
7325          * cache mode if required.
              *
              * [pa_start, pa_end) is the physical run collected so far for the
              * direct map update; pa_start == pa_end means no run is open.
7326          */
7327         pa_start = pa_end = 0;
7328         for (tmpva = base; tmpva < base + size; ) {
7329                 pdpe = pmap_pdpe(kernel_pmap, tmpva);
7330                 if (*pdpe & PG_PS) {
                             /* A 1GB page's PDPE is updated via pmap_pde_attr(). */
7331                         if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
7332                                 pmap_pde_attr(pdpe, cache_bits_pde,
7333                                     X86_PG_PDE_CACHE);
7334                                 changed = TRUE;
7335                         }
7336                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7337                             (*pdpe & PG_PS_FRAME) < dmaplimit) {
7338                                 if (pa_start == pa_end) {
7339                                         /* Start physical address run. */
7340                                         pa_start = *pdpe & PG_PS_FRAME;
7341                                         pa_end = pa_start + NBPDP;
7342                                 } else if (pa_end == (*pdpe & PG_PS_FRAME))
7343                                         pa_end += NBPDP;
7344                                 else {
7345                                         /* Run ended, update direct map. */
7346                                         error = pmap_change_attr_locked(
7347                                             PHYS_TO_DMAP(pa_start),
7348                                             pa_end - pa_start, mode);
7349                                         if (error != 0)
7350                                                 break;
7351                                         /* Start physical address run. */
7352                                         pa_start = *pdpe & PG_PS_FRAME;
7353                                         pa_end = pa_start + NBPDP;
7354                                 }
7355                         }
7356                         tmpva = trunc_1gpage(tmpva) + NBPDP;
7357                         continue;
7358                 }
7359                 pde = pmap_pdpe_to_pde(pdpe, tmpva);
7360                 if (*pde & PG_PS) {
7361                         if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
7362                                 pmap_pde_attr(pde, cache_bits_pde,
7363                                     X86_PG_PDE_CACHE);
7364                                 changed = TRUE;
7365                         }
7366                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7367                             (*pde & PG_PS_FRAME) < dmaplimit) {
7368                                 if (pa_start == pa_end) {
7369                                         /* Start physical address run. */
7370                                         pa_start = *pde & PG_PS_FRAME;
7371                                         pa_end = pa_start + NBPDR;
7372                                 } else if (pa_end == (*pde & PG_PS_FRAME))
7373                                         pa_end += NBPDR;
7374                                 else {
7375                                         /* Run ended, update direct map. */
7376                                         error = pmap_change_attr_locked(
7377                                             PHYS_TO_DMAP(pa_start),
7378                                             pa_end - pa_start, mode);
7379                                         if (error != 0)
7380                                                 break;
7381                                         /* Start physical address run. */
7382                                         pa_start = *pde & PG_PS_FRAME;
7383                                         pa_end = pa_start + NBPDR;
7384                                 }
7385                         }
7386                         tmpva = trunc_2mpage(tmpva) + NBPDR;
7387                 } else {
7388                         pte = pmap_pde_to_pte(pde, tmpva);
7389                         if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
7390                                 pmap_pte_attr(pte, cache_bits_pte,
7391                                     X86_PG_PTE_CACHE);
7392                                 changed = TRUE;
7393                         }
7394                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
7395                             (*pte & PG_FRAME) < dmaplimit) {
7396                                 if (pa_start == pa_end) {
7397                                         /* Start physical address run. */
7398                                         pa_start = *pte & PG_FRAME;
7399                                         pa_end = pa_start + PAGE_SIZE;
7400                                 } else if (pa_end == (*pte & PG_FRAME))
7401                                         pa_end += PAGE_SIZE;
7402                                 else {
7403                                         /* Run ended, update direct map. */
7404                                         error = pmap_change_attr_locked(
7405                                             PHYS_TO_DMAP(pa_start),
7406                                             pa_end - pa_start, mode);
7407                                         if (error != 0)
7408                                                 break;
7409                                         /* Start physical address run. */
7410                                         pa_start = *pte & PG_FRAME;
7411                                         pa_end = pa_start + PAGE_SIZE;
7412                                 }
7413                         }
7414                         tmpva += PAGE_SIZE;
7415                 }
7416         }
             /*
              * Flush the run that was still open when the loop ended, clipped
              * so that it does not extend past dmaplimit.
              */
7417         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
7418                 pa_end1 = MIN(pa_end, dmaplimit);
7419                 if (pa_start != pa_end1)
7420                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
7421                             pa_end1 - pa_start, mode);
7422         }
7423
7424         /*
7425          * Flush CPU caches if required to make sure any data isn't cached that
7426          * shouldn't be, etc.
              *
              * tmpva is where the second loop stopped, so after an early break
              * only the part of the range that was processed is flushed.
7427          */
7428         if (changed) {
7429                 pmap_invalidate_range(kernel_pmap, base, tmpva);
7430                 pmap_invalidate_cache_range(base, tmpva);
7431         }
7432         return (error);
7433 }
7434
7435 /*
7436  * Demotes any mapping within the direct map region that covers more than the
7437  * specified range of physical addresses.  This range's size must be a power
7438  * of two and its starting address must be a multiple of its size.  Since the
7439  * demotion does not change any attributes of the mapping, a TLB invalidation
7440  * is not mandatory.  The caller may, however, request a TLB invalidation.
      *
      * Nothing is done when "len" is zero, when "len" is at least NBPDP, or
      * when "base" is not below dmaplimit.  An invalid PDPE or PDE, or a
      * failed demotion, is fatal (panic).  When "invalidate" is set and a
      * demotion took place, the single page at the direct map address of
      * "base" is invalidated.
7441  */
7442 void
7443 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
7444 {
7445         pdp_entry_t *pdpe;
7446         pd_entry_t *pde;
7447         vm_offset_t va;
7448         boolean_t changed;
7449
7450         if (len == 0)
7451                 return;
7452         KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
7453         KASSERT((base & (len - 1)) == 0,
7454             ("pmap_demote_DMAP: base is not a multiple of len"));
7455         if (len < NBPDP && base < dmaplimit) {
7456                 va = PHYS_TO_DMAP(base);
7457                 changed = FALSE;
7458                 PMAP_LOCK(kernel_pmap);
7459                 pdpe = pmap_pdpe(kernel_pmap, va);
7460                 if ((*pdpe & X86_PG_V) == 0)
7461                         panic("pmap_demote_DMAP: invalid PDPE");
7462                 if ((*pdpe & PG_PS) != 0) {
7463                         if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
7464                                 panic("pmap_demote_DMAP: PDPE failed");
7465                         changed = TRUE;
7466                 }
                     /* The PDE level is examined only when len is below NBPDR. */
7467                 if (len < NBPDR) {
7468                         pde = pmap_pdpe_to_pde(pdpe, va);
7469                         if ((*pde & X86_PG_V) == 0)
7470                                 panic("pmap_demote_DMAP: invalid PDE");
7471                         if ((*pde & PG_PS) != 0) {
7472                                 if (!pmap_demote_pde(kernel_pmap, pde, va))
7473                                         panic("pmap_demote_DMAP: PDE failed");
7474                                 changed = TRUE;
7475                         }
7476                 }
7477                 if (changed && invalidate)
7478                         pmap_invalidate_page(kernel_pmap, va);
7479                 PMAP_UNLOCK(kernel_pmap);
7480         }
7481 }
7482
7483 /*
7484  * perform the pmap work for mincore
      *
      * Looks up the mapping of "addr" in "pmap" under the pmap lock and
      * returns a combination of MINCORE_* flags:
      *   MINCORE_SUPER        the address is mapped by a PDE with PG_PS set;
      *   MINCORE_INCORE       the mapping is valid;
      *   MINCORE_MODIFIED and MINCORE_MODIFIED_OTHER
      *                        the mapping has both PG_M and PG_RW set;
      *   MINCORE_REFERENCED and MINCORE_REFERENCED_OTHER
      *                        the mapping has PG_A set.
      * Zero is returned when there is no valid mapping.
      *
      * "locked_pa" is passed to vm_page_pa_tryrelock() for a valid managed
      * mapping that lacks at least one of the two *_OTHER flags; in every
      * other case PA_UNLOCK_COND(*locked_pa) is invoked instead.
7485  */
7486 int
7487 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
7488 {
7489         pd_entry_t *pdep;
7490         pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
7491         vm_paddr_t pa;
7492         int val;
7493
7494         PG_A = pmap_accessed_bit(pmap);
7495         PG_M = pmap_modified_bit(pmap);
7496         PG_V = pmap_valid_bit(pmap);
7497         PG_RW = pmap_rw_bit(pmap);
7498
7499         PMAP_LOCK(pmap);
7500 retry:
7501         pdep = pmap_pde(pmap, addr);
7502         if (pdep != NULL && (*pdep & PG_V)) {
7503                 if (*pdep & PG_PS) {
                             /* For a superpage the PDE itself serves as the "pte". */
7504                         pte = *pdep;
7505                         /* Compute the physical address of the 4KB page. */
7506                         pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
7507                             PG_FRAME;
7508                         val = MINCORE_SUPER;
7509                 } else {
7510                         pte = *pmap_pde_to_pte(pdep, addr);
7511                         pa = pte & PG_FRAME;
7512                         val = 0;
7513                 }
7514         } else {
7515                 pte = 0;
7516                 pa = 0;
7517                 val = 0;
7518         }
7519         if ((pte & PG_V) != 0) {
7520                 val |= MINCORE_INCORE;
7521                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
7522                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7523                 if ((pte & PG_A) != 0)
7524                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7525         }
7526         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7527             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
7528             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
7529                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
                     /* A nonzero return means the lookup must be redone. */
7530                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
7531                         goto retry;
7532         } else
7533                 PA_UNLOCK_COND(*locked_pa);
7534         PMAP_UNLOCK(pmap);
7535         return (val);
7536 }
7537
     /*
      * Ensure that "pmap" has a PCID for CPU "cpuid" that belongs to the
      * CPU's current PCID generation.  Must be called in a critical section
      * (asserted).
      *
      * Return value:
      *   - the pmap holds PMAP_PCID_KERN: 0 when pti is enabled, otherwise
      *     CR3_PCID_SAVE;
      *   - the pmap's recorded generation equals the CPU's current one:
      *     CR3_PCID_SAVE;
      *   - otherwise a new PCID is assigned from the per-CPU counter and 0 is
      *     returned.
      *
      * When the counter has reached its limit (PMAP_PCID_OVERMAX without pti,
      * PMAP_PCID_OVERMAX_KERN with pti), the per-CPU generation is advanced,
      * skipping the value 0, and numbering restarts at PMAP_PCID_KERN + 1.
      */
7538 static uint64_t
7539 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
7540 {
7541         uint32_t gen, new_gen, pcid_next;
7542
7543         CRITICAL_ASSERT(curthread);
7544         gen = PCPU_GET(pcid_gen);
7545         if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
7546                 return (pti ? 0 : CR3_PCID_SAVE);
7547         if (pmap->pm_pcids[cpuid].pm_gen == gen)
7548                 return (CR3_PCID_SAVE);
7549         pcid_next = PCPU_GET(pcid_next);
7550         KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) ||
7551             (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN),
7552             ("cpu %d pcid_next %#x", cpuid, pcid_next));
7553         if ((!pti && pcid_next == PMAP_PCID_OVERMAX) ||
7554             (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) {
7555                 new_gen = gen + 1;
                     /* On wraparound, skip generation 0. */
7556                 if (new_gen == 0)
7557                         new_gen = 1;
7558                 PCPU_SET(pcid_gen, new_gen);
7559                 pcid_next = PMAP_PCID_KERN + 1;
7560         } else {
7561                 new_gen = gen;
7562         }
7563         pmap->pm_pcids[cpuid].pm_pcid = pcid_next;
7564         pmap->pm_pcids[cpuid].pm_gen = new_gen;
7565         PCPU_SET(pcid_next, pcid_next + 1);
7566         return (0);
7567 }
7568
7569 static uint64_t
7570 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
7571 {
7572         uint64_t cached;
7573
7574         cached = pmap_pcid_alloc(pmap, cpuid);
7575         KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
7576             ("pmap %p cpu %d pcid %#x", pmap, cpuid,
7577             pmap->pm_pcids[cpuid].pm_pcid));
7578         KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
7579             pmap == kernel_pmap,
7580             ("non-kernel pmap pmap %p cpu %d pcid %#x",
7581             pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
7582         return (cached);
7583 }
7584
7585 static void
7586 pmap_activate_sw_pti_post(pmap_t pmap)
7587 {
7588
7589         if (pmap->pm_ucr3 != PMAP_NO_CR3)
7590                 PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
7591                     PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
7592 }
7593
7594 static void inline
7595 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
7596 {
7597         struct invpcid_descr d;
7598         uint64_t cached, cr3, kcr3, ucr3;
7599
7600         cached = pmap_pcid_alloc_checked(pmap, cpuid);
7601         cr3 = rcr3();
7602         if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7603                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
7604         PCPU_SET(curpmap, pmap);
7605         kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
7606         ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
7607             PMAP_PCID_USER_PT;
7608
7609         if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
7610                 /*
7611                  * Explicitly invalidate translations cached from the
7612                  * user page table.  They are not automatically
7613                  * flushed by reload of cr3 with the kernel page table
7614                  * pointer above.
7615                  *
7616                  * Note that the if() condition is resolved statically
7617                  * by using the function argument instead of
7618                  * runtime-evaluated invpcid_works value.
7619                  */
7620                 if (invpcid_works1) {
7621                         d.pcid = PMAP_PCID_USER_PT |
7622                             pmap->pm_pcids[cpuid].pm_pcid;
7623                         d.pad = 0;
7624                         d.addr = 0;
7625                         invpcid(&d, INVPCID_CTX);
7626                 } else {
7627                         pmap_pti_pcid_invalidate(ucr3, kcr3);
7628                 }
7629         }
7630
7631         PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
7632         PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
7633         if (cached)
7634                 PCPU_INC(pm_save_cnt);
7635 }
7636
7637 static void
7638 pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid)
7639 {
7640
7641         pmap_activate_sw_pcid_pti(pmap, cpuid, true);
7642         pmap_activate_sw_pti_post(pmap);
7643 }
7644
7645 static void
7646 pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid)
7647 {
7648         register_t rflags;
7649
7650         /*
7651          * If the INVPCID instruction is not available,
7652          * invltlb_pcid_handler() is used to handle an invalidate_all
7653          * IPI, which checks for curpmap == smp_tlb_pmap.  The below
7654          * sequence of operations has a window where %CR3 is loaded
7655          * with the new pmap's PML4 address, but the curpmap value has
7656          * not yet been updated.  This causes the invltlb IPI handler,
7657          * which is called between the updates, to execute as a NOP,
7658          * which leaves stale TLB entries.
7659          *
7660          * Note that the most typical use of pmap_activate_sw(), from
7661          * the context switch, is immune to this race, because
7662          * interrupts are disabled (while the thread lock is owned),
7663          * and the IPI happens after curpmap is updated.  Protect
7664          * other callers in a similar way, by disabling interrupts
7665          * around the %cr3 register reload and curpmap assignment.
7666          */
7667         rflags = intr_disable();
7668         pmap_activate_sw_pcid_pti(pmap, cpuid, false);
7669         intr_restore(rflags);
7670         pmap_activate_sw_pti_post(pmap);
7671 }
7672
7673 static void
7674 pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid)
7675 {
7676         uint64_t cached, cr3;
7677
7678         cached = pmap_pcid_alloc_checked(pmap, cpuid);
7679         cr3 = rcr3();
7680         if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
7681                 load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
7682                     cached);
7683         PCPU_SET(curpmap, pmap);
7684         if (cached)
7685                 PCPU_INC(pm_save_cnt);
7686 }
7687
7688 static void
7689 pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid)
7690 {
7691         register_t rflags;
7692
7693         rflags = intr_disable();
7694         pmap_activate_sw_pcid_nopti(pmap, cpuid);
7695         intr_restore(rflags);
7696 }
7697
7698 static void
7699 pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused)
7700 {
7701
7702         load_cr3(pmap->pm_cr3);
7703         PCPU_SET(curpmap, pmap);
7704 }
7705
7706 static void
7707 pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused)
7708 {
7709
7710         pmap_activate_sw_nopcid_nopti(pmap, cpuid);
7711         PCPU_SET(kcr3, pmap->pm_cr3);
7712         PCPU_SET(ucr3, pmap->pm_ucr3);
7713         pmap_activate_sw_pti_post(pmap);
7714 }
7715
7716 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static)
7717 {
7718
7719         if (pmap_pcid_enabled && pti && invpcid_works)
7720                 return (pmap_activate_sw_pcid_invpcid_pti);
7721         else if (pmap_pcid_enabled && pti && !invpcid_works)
7722                 return (pmap_activate_sw_pcid_noinvpcid_pti);
7723         else if (pmap_pcid_enabled && !pti && invpcid_works)
7724                 return (pmap_activate_sw_pcid_nopti);
7725         else if (pmap_pcid_enabled && !pti && !invpcid_works)
7726                 return (pmap_activate_sw_pcid_noinvpcid_nopti);
7727         else if (!pmap_pcid_enabled && pti)
7728                 return (pmap_activate_sw_nopcid_pti);
7729         else /* if (!pmap_pcid_enabled && !pti) */
7730                 return (pmap_activate_sw_nopcid_nopti);
7731 }
7732
7733 void
7734 pmap_activate_sw(struct thread *td)
7735 {
7736         pmap_t oldpmap, pmap;
7737         u_int cpuid;
7738
7739         oldpmap = PCPU_GET(curpmap);
7740         pmap = vmspace_pmap(td->td_proc->p_vmspace);
7741         if (oldpmap == pmap)
7742                 return;
7743         cpuid = PCPU_GET(cpuid);
7744 #ifdef SMP
7745         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7746 #else
7747         CPU_SET(cpuid, &pmap->pm_active);
7748 #endif
7749         pmap_activate_sw_mode(pmap, cpuid);
7750 #ifdef SMP
7751         CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
7752 #else
7753         CPU_CLR(cpuid, &oldpmap->pm_active);
7754 #endif
7755 }
7756
/*
 * Activate the pmap of the process owning "td" on the current CPU.
 * This is pmap_activate_sw() wrapped in a critical section.
 */
void
pmap_activate(struct thread *td)
{

	critical_enter();
	pmap_activate_sw(td);
	critical_exit();
}
7765
7766 void
7767 pmap_activate_boot(pmap_t pmap)
7768 {
7769         uint64_t kcr3;
7770         u_int cpuid;
7771
7772         /*
7773          * kernel_pmap must be never deactivated, and we ensure that
7774          * by never activating it at all.
7775          */
7776         MPASS(pmap != kernel_pmap);
7777
7778         cpuid = PCPU_GET(cpuid);
7779 #ifdef SMP
7780         CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
7781 #else
7782         CPU_SET(cpuid, &pmap->pm_active);
7783 #endif
7784         PCPU_SET(curpmap, pmap);
7785         if (pti) {
7786                 kcr3 = pmap->pm_cr3;
7787                 if (pmap_pcid_enabled)
7788                         kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
7789         } else {
7790                 kcr3 = PMAP_NO_CR3;
7791         }
7792         PCPU_SET(kcr3, kcr3);
7793         PCPU_SET(ucr3, PMAP_NO_CR3);
7794 }
7795
7796 void
7797 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
7798 {
7799 }
7800
7801 /*
7802  *      Increase the starting virtual address of the given mapping if a
7803  *      different alignment might result in more superpage mappings.
7804  */
7805 void
7806 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7807     vm_offset_t *addr, vm_size_t size)
7808 {
7809         vm_offset_t superpage_offset;
7810
7811         if (size < NBPDR)
7812                 return;
7813         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7814                 offset += ptoa(object->pg_color);
7815         superpage_offset = offset & PDRMASK;
7816         if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR ||
7817             (*addr & PDRMASK) == superpage_offset)
7818                 return;
7819         if ((*addr & PDRMASK) < superpage_offset)
7820                 *addr = (*addr & ~PDRMASK) + superpage_offset;
7821         else
7822                 *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
7823 }
7824
#ifdef INVARIANTS
/*
 * Counters maintained by pmap_emulate_accessed_dirty(), exported
 * read-write below the vm.pmap sysctl node.
 */

/* Write faults handled on 4KB mappings. */
static u_long num_dirty_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
    &num_dirty_emulations, 0, NULL);

/* Read faults handled on 4KB mappings. */
static u_long num_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
    &num_accessed_emulations, 0, NULL);

/* Read faults handled on superpage mappings. */
static u_long num_superpage_accessed_emulations;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations,
    CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL);

/* Superpage promotion attempts made from the emulation path. */
static u_long ad_emulation_superpage_promotions;
SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions,
    CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL);
#endif	/* INVARIANTS */
7842
7843 int
7844 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
7845 {
7846         int rv;
7847         struct rwlock *lock;
7848 #if VM_NRESERVLEVEL > 0
7849         vm_page_t m, mpte;
7850 #endif
7851         pd_entry_t *pde;
7852         pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
7853
7854         KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
7855             ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
7856
7857         if (!pmap_emulate_ad_bits(pmap))
7858                 return (-1);
7859
7860         PG_A = pmap_accessed_bit(pmap);
7861         PG_M = pmap_modified_bit(pmap);
7862         PG_V = pmap_valid_bit(pmap);
7863         PG_RW = pmap_rw_bit(pmap);
7864
7865         rv = -1;
7866         lock = NULL;
7867         PMAP_LOCK(pmap);
7868
7869         pde = pmap_pde(pmap, va);
7870         if (pde == NULL || (*pde & PG_V) == 0)
7871                 goto done;
7872
7873         if ((*pde & PG_PS) != 0) {
7874                 if (ftype == VM_PROT_READ) {
7875 #ifdef INVARIANTS
7876                         atomic_add_long(&num_superpage_accessed_emulations, 1);
7877 #endif
7878                         *pde |= PG_A;
7879                         rv = 0;
7880                 }
7881                 goto done;
7882         }
7883
7884         pte = pmap_pde_to_pte(pde, va);
7885         if ((*pte & PG_V) == 0)
7886                 goto done;
7887
7888         if (ftype == VM_PROT_WRITE) {
7889                 if ((*pte & PG_RW) == 0)
7890                         goto done;
7891                 /*
7892                  * Set the modified and accessed bits simultaneously.
7893                  *
7894                  * Intel EPT PTEs that do software emulation of A/D bits map
7895                  * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
7896                  * An EPT misconfiguration is triggered if the PTE is writable
7897                  * but not readable (WR=10). This is avoided by setting PG_A
7898                  * and PG_M simultaneously.
7899                  */
7900                 *pte |= PG_M | PG_A;
7901         } else {
7902                 *pte |= PG_A;
7903         }
7904
7905 #if VM_NRESERVLEVEL > 0
7906         /* try to promote the mapping */
7907         if (va < VM_MAXUSER_ADDRESS)
7908                 mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
7909         else
7910                 mpte = NULL;
7911
7912         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
7913
7914         if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
7915             pmap_ps_enabled(pmap) &&
7916             (m->flags & PG_FICTITIOUS) == 0 &&
7917             vm_reserv_level_iffullpop(m) == 0) {
7918                 pmap_promote_pde(pmap, pde, va, &lock);
7919 #ifdef INVARIANTS
7920                 atomic_add_long(&ad_emulation_superpage_promotions, 1);
7921 #endif
7922         }
7923 #endif
7924
7925 #ifdef INVARIANTS
7926         if (ftype == VM_PROT_WRITE)
7927                 atomic_add_long(&num_dirty_emulations, 1);
7928         else
7929                 atomic_add_long(&num_accessed_emulations, 1);
7930 #endif
7931         rv = 0;         /* success */
7932 done:
7933         if (lock != NULL)
7934                 rw_wunlock(lock);
7935         PMAP_UNLOCK(pmap);
7936         return (rv);
7937 }
7938
7939 void
7940 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
7941 {
7942         pml4_entry_t *pml4;
7943         pdp_entry_t *pdp;
7944         pd_entry_t *pde;
7945         pt_entry_t *pte, PG_V;
7946         int idx;
7947
7948         idx = 0;
7949         PG_V = pmap_valid_bit(pmap);
7950         PMAP_LOCK(pmap);
7951
7952         pml4 = pmap_pml4e(pmap, va);
7953         ptr[idx++] = *pml4;
7954         if ((*pml4 & PG_V) == 0)
7955                 goto done;
7956
7957         pdp = pmap_pml4e_to_pdpe(pml4, va);
7958         ptr[idx++] = *pdp;
7959         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0)
7960                 goto done;
7961
7962         pde = pmap_pdpe_to_pde(pdp, va);
7963         ptr[idx++] = *pde;
7964         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0)
7965                 goto done;
7966
7967         pte = pmap_pde_to_pte(pde, va);
7968         ptr[idx++] = *pte;
7969
7970 done:
7971         PMAP_UNLOCK(pmap);
7972         *num = idx;
7973 }
7974
7975 /**
7976  * Get the kernel virtual address of a set of physical pages. If there are
7977  * physical addresses not covered by the DMAP perform a transient mapping
7978  * that will be removed when calling pmap_unmap_io_transient.
7979  *
7980  * \param page        The pages the caller wishes to obtain the virtual
7981  *                    address on the kernel memory map.
7982  * \param vaddr       On return contains the kernel virtual memory address
7983  *                    of the pages passed in the page parameter.
7984  * \param count       Number of pages passed in.
7985  * \param can_fault   TRUE if the thread using the mapped pages can take
7986  *                    page faults, FALSE otherwise.
7987  *
7988  * \returns TRUE if the caller must call pmap_unmap_io_transient when
7989  *          finished or FALSE otherwise.
7990  *
7991  */
7992 boolean_t
7993 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7994     boolean_t can_fault)
7995 {
7996         vm_paddr_t paddr;
7997         boolean_t needs_mapping;
7998         pt_entry_t *pte;
7999         int cache_bits, error __unused, i;
8000
8001         /*
8002          * Allocate any KVA space that we need, this is done in a separate
8003          * loop to prevent calling vmem_alloc while pinned.
8004          */
8005         needs_mapping = FALSE;
8006         for (i = 0; i < count; i++) {
8007                 paddr = VM_PAGE_TO_PHYS(page[i]);
8008                 if (__predict_false(paddr >= dmaplimit)) {
8009                         error = vmem_alloc(kernel_arena, PAGE_SIZE,
8010                             M_BESTFIT | M_WAITOK, &vaddr[i]);
8011                         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
8012                         needs_mapping = TRUE;
8013                 } else {
8014                         vaddr[i] = PHYS_TO_DMAP(paddr);
8015                 }
8016         }
8017
8018         /* Exit early if everything is covered by the DMAP */
8019         if (!needs_mapping)
8020                 return (FALSE);
8021
8022         /*
8023          * NB:  The sequence of updating a page table followed by accesses
8024          * to the corresponding pages used in the !DMAP case is subject to
8025          * the situation described in the "AMD64 Architecture Programmer's
8026          * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
8027          * Coherency Considerations".  Therefore, issuing the INVLPG right
8028          * after modifying the PTE bits is crucial.
8029          */
8030         if (!can_fault)
8031                 sched_pin();
8032         for (i = 0; i < count; i++) {
8033                 paddr = VM_PAGE_TO_PHYS(page[i]);
8034                 if (paddr >= dmaplimit) {
8035                         if (can_fault) {
8036                                 /*
8037                                  * Slow path, since we can get page faults
8038                                  * while mappings are active don't pin the
8039                                  * thread to the CPU and instead add a global
8040                                  * mapping visible to all CPUs.
8041                                  */
8042                                 pmap_qenter(vaddr[i], &page[i], 1);
8043                         } else {
8044                                 pte = vtopte(vaddr[i]);
8045                                 cache_bits = pmap_cache_bits(kernel_pmap,
8046                                     page[i]->md.pat_mode, 0);
8047                                 pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
8048                                     cache_bits);
8049                                 invlpg(vaddr[i]);
8050                         }
8051                 }
8052         }
8053
8054         return (needs_mapping);
8055 }
8056
8057 void
8058 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8059     boolean_t can_fault)
8060 {
8061         vm_paddr_t paddr;
8062         int i;
8063
8064         if (!can_fault)
8065                 sched_unpin();
8066         for (i = 0; i < count; i++) {
8067                 paddr = VM_PAGE_TO_PHYS(page[i]);
8068                 if (paddr >= dmaplimit) {
8069                         if (can_fault)
8070                                 pmap_qremove(vaddr[i], 1);
8071                         vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
8072                 }
8073         }
8074 }
8075
8076 vm_offset_t
8077 pmap_quick_enter_page(vm_page_t m)
8078 {
8079         vm_paddr_t paddr;
8080
8081         paddr = VM_PAGE_TO_PHYS(m);
8082         if (paddr < dmaplimit)
8083                 return (PHYS_TO_DMAP(paddr));
8084         mtx_lock_spin(&qframe_mtx);
8085         KASSERT(*vtopte(qframe) == 0, ("qframe busy"));
8086         pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
8087             X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
8088         return (qframe);
8089 }
8090
8091 void
8092 pmap_quick_remove_page(vm_offset_t addr)
8093 {
8094
8095         if (addr != qframe)
8096                 return;
8097         pte_store(vtopte(qframe), 0);
8098         invlpg(qframe);
8099         mtx_unlock_spin(&qframe_mtx);
8100 }
8101
8102 static vm_page_t
8103 pmap_pti_alloc_page(void)
8104 {
8105         vm_page_t m;
8106
8107         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8108         m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY |
8109             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
8110         return (m);
8111 }
8112
8113 static bool
8114 pmap_pti_free_page(vm_page_t m)
8115 {
8116
8117         KASSERT(m->wire_count > 0, ("page %p not wired", m));
8118         if (!vm_page_unwire_noq(m))
8119                 return (false);
8120         vm_page_free_zero(m);
8121         return (true);
8122 }
8123
8124 static void
8125 pmap_pti_init(void)
8126 {
8127         vm_page_t pml4_pg;
8128         pdp_entry_t *pdpe;
8129         vm_offset_t va;
8130         int i;
8131
8132         if (!pti)
8133                 return;
8134         pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
8135         VM_OBJECT_WLOCK(pti_obj);
8136         pml4_pg = pmap_pti_alloc_page();
8137         pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
8138         for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
8139             va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
8140                 pdpe = pmap_pti_pdpe(va);
8141                 pmap_pti_wire_pte(pdpe);
8142         }
8143         pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
8144             (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
8145         pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
8146             sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
8147         pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
8148             sizeof(struct gate_descriptor) * NIDT, false);
8149         pmap_pti_add_kva_locked((vm_offset_t)common_tss,
8150             (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
8151         CPU_FOREACH(i) {
8152                 /* Doublefault stack IST 1 */
8153                 va = common_tss[i].tss_ist1;
8154                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8155                 /* NMI stack IST 2 */
8156                 va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
8157                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8158                 /* MC# stack IST 3 */
8159                 va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
8160                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8161                 /* DB# stack IST 4 */
8162                 va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
8163                 pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
8164         }
8165         pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
8166             (vm_offset_t)etext, true);
8167         pti_finalized = true;
8168         VM_OBJECT_WUNLOCK(pti_obj);
8169 }
8170 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
8171
8172 static pdp_entry_t *
8173 pmap_pti_pdpe(vm_offset_t va)
8174 {
8175         pml4_entry_t *pml4e;
8176         pdp_entry_t *pdpe;
8177         vm_page_t m;
8178         vm_pindex_t pml4_idx;
8179         vm_paddr_t mphys;
8180
8181         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8182
8183         pml4_idx = pmap_pml4e_index(va);
8184         pml4e = &pti_pml4[pml4_idx];
8185         m = NULL;
8186         if (*pml4e == 0) {
8187                 if (pti_finalized)
8188                         panic("pml4 alloc after finalization\n");
8189                 m = pmap_pti_alloc_page();
8190                 if (*pml4e != 0) {
8191                         pmap_pti_free_page(m);
8192                         mphys = *pml4e & ~PAGE_MASK;
8193                 } else {
8194                         mphys = VM_PAGE_TO_PHYS(m);
8195                         *pml4e = mphys | X86_PG_RW | X86_PG_V;
8196                 }
8197         } else {
8198                 mphys = *pml4e & ~PAGE_MASK;
8199         }
8200         pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
8201         return (pdpe);
8202 }
8203
8204 static void
8205 pmap_pti_wire_pte(void *pte)
8206 {
8207         vm_page_t m;
8208
8209         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8210         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8211         m->wire_count++;
8212 }
8213
8214 static void
8215 pmap_pti_unwire_pde(void *pde, bool only_ref)
8216 {
8217         vm_page_t m;
8218
8219         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8220         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
8221         MPASS(m->wire_count > 0);
8222         MPASS(only_ref || m->wire_count > 1);
8223         pmap_pti_free_page(m);
8224 }
8225
8226 static void
8227 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
8228 {
8229         vm_page_t m;
8230         pd_entry_t *pde;
8231
8232         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8233         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
8234         MPASS(m->wire_count > 0);
8235         if (pmap_pti_free_page(m)) {
8236                 pde = pmap_pti_pde(va);
8237                 MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
8238                 *pde = 0;
8239                 pmap_pti_unwire_pde(pde, false);
8240         }
8241 }
8242
8243 static pd_entry_t *
8244 pmap_pti_pde(vm_offset_t va)
8245 {
8246         pdp_entry_t *pdpe;
8247         pd_entry_t *pde;
8248         vm_page_t m;
8249         vm_pindex_t pd_idx;
8250         vm_paddr_t mphys;
8251
8252         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8253
8254         pdpe = pmap_pti_pdpe(va);
8255         if (*pdpe == 0) {
8256                 m = pmap_pti_alloc_page();
8257                 if (*pdpe != 0) {
8258                         pmap_pti_free_page(m);
8259                         MPASS((*pdpe & X86_PG_PS) == 0);
8260                         mphys = *pdpe & ~PAGE_MASK;
8261                 } else {
8262                         mphys =  VM_PAGE_TO_PHYS(m);
8263                         *pdpe = mphys | X86_PG_RW | X86_PG_V;
8264                 }
8265         } else {
8266                 MPASS((*pdpe & X86_PG_PS) == 0);
8267                 mphys = *pdpe & ~PAGE_MASK;
8268         }
8269
8270         pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
8271         pd_idx = pmap_pde_index(va);
8272         pde += pd_idx;
8273         return (pde);
8274 }
8275
8276 static pt_entry_t *
8277 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
8278 {
8279         pd_entry_t *pde;
8280         pt_entry_t *pte;
8281         vm_page_t m;
8282         vm_paddr_t mphys;
8283
8284         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8285
8286         pde = pmap_pti_pde(va);
8287         if (unwire_pde != NULL) {
8288                 *unwire_pde = true;
8289                 pmap_pti_wire_pte(pde);
8290         }
8291         if (*pde == 0) {
8292                 m = pmap_pti_alloc_page();
8293                 if (*pde != 0) {
8294                         pmap_pti_free_page(m);
8295                         MPASS((*pde & X86_PG_PS) == 0);
8296                         mphys = *pde & ~(PAGE_MASK | pg_nx);
8297                 } else {
8298                         mphys = VM_PAGE_TO_PHYS(m);
8299                         *pde = mphys | X86_PG_RW | X86_PG_V;
8300                         if (unwire_pde != NULL)
8301                                 *unwire_pde = false;
8302                 }
8303         } else {
8304                 MPASS((*pde & X86_PG_PS) == 0);
8305                 mphys = *pde & ~(PAGE_MASK | pg_nx);
8306         }
8307
8308         pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
8309         pte += pmap_pte_index(va);
8310
8311         return (pte);
8312 }
8313
8314 static void
8315 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
8316 {
8317         vm_paddr_t pa;
8318         pd_entry_t *pde;
8319         pt_entry_t *pte, ptev;
8320         bool unwire_pde;
8321
8322         VM_OBJECT_ASSERT_WLOCKED(pti_obj);
8323
8324         sva = trunc_page(sva);
8325         MPASS(sva > VM_MAXUSER_ADDRESS);
8326         eva = round_page(eva);
8327         MPASS(sva < eva);
8328         for (; sva < eva; sva += PAGE_SIZE) {
8329                 pte = pmap_pti_pte(sva, &unwire_pde);
8330                 pa = pmap_kextract(sva);
8331                 ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
8332                     (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
8333                     VM_MEMATTR_DEFAULT, FALSE);
8334                 if (*pte == 0) {
8335                         pte_store(pte, ptev);
8336                         pmap_pti_wire_pte(pte);
8337                 } else {
8338                         KASSERT(!pti_finalized,
8339                             ("pti overlap after fin %#lx %#lx %#lx",
8340                             sva, *pte, ptev));
8341                         KASSERT(*pte == ptev,
8342                             ("pti non-identical pte after fin %#lx %#lx %#lx",
8343                             sva, *pte, ptev));
8344                 }
8345                 if (unwire_pde) {
8346                         pde = pmap_pti_pde(sva);
8347                         pmap_pti_unwire_pde(pde, true);
8348                 }
8349         }
8350 }
8351
8352 void
8353 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
8354 {
8355
8356         if (!pti)
8357                 return;
8358         VM_OBJECT_WLOCK(pti_obj);
8359         pmap_pti_add_kva_locked(sva, eva, exec);
8360         VM_OBJECT_WUNLOCK(pti_obj);
8361 }
8362
8363 void
8364 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
8365 {
8366         pt_entry_t *pte;
8367         vm_offset_t va;
8368
8369         if (!pti)
8370                 return;
8371         sva = rounddown2(sva, PAGE_SIZE);
8372         MPASS(sva > VM_MAXUSER_ADDRESS);
8373         eva = roundup2(eva, PAGE_SIZE);
8374         MPASS(sva < eva);
8375         VM_OBJECT_WLOCK(pti_obj);
8376         for (va = sva; va < eva; va += PAGE_SIZE) {
8377                 pte = pmap_pti_pte(va, NULL);
8378                 KASSERT((*pte & X86_PG_V) != 0,
8379                     ("invalid pte va %#lx pte %#lx pt %#lx", va,
8380                     (u_long)pte, *pte));
8381                 pte_clear(pte);
8382                 pmap_pti_unwire_pte(pte, va);
8383         }
8384         pmap_invalidate_range(kernel_pmap, sva, eva);
8385         VM_OBJECT_WUNLOCK(pti_obj);
8386 }
8387
8388 #include "opt_ddb.h"
8389 #ifdef DDB
8390 #include <sys/kdb.h>
8391 #include <ddb/ddb.h>
8392
8393 DB_SHOW_COMMAND(pte, pmap_print_pte)
8394 {
8395         pmap_t pmap;
8396         pml4_entry_t *pml4;
8397         pdp_entry_t *pdp;
8398         pd_entry_t *pde;
8399         pt_entry_t *pte, PG_V;
8400         vm_offset_t va;
8401
8402         if (!have_addr) {
8403                 db_printf("show pte addr\n");
8404                 return;
8405         }
8406         va = (vm_offset_t)addr;
8407
8408         if (kdb_thread != NULL)
8409                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
8410         else
8411                 pmap = PCPU_GET(curpmap);
8412
8413         PG_V = pmap_valid_bit(pmap);
8414         pml4 = pmap_pml4e(pmap, va);
8415         db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
8416         if ((*pml4 & PG_V) == 0) {
8417                 db_printf("\n");
8418                 return;
8419         }
8420         pdp = pmap_pml4e_to_pdpe(pml4, va);
8421         db_printf(" pdpe %#016lx", *pdp);
8422         if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
8423                 db_printf("\n");
8424                 return;
8425         }
8426         pde = pmap_pdpe_to_pde(pdp, va);
8427         db_printf(" pde %#016lx", *pde);
8428         if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
8429                 db_printf("\n");
8430                 return;
8431         }
8432         pte = pmap_pde_to_pte(pde, va);
8433         db_printf(" pte %#016lx\n", *pte);
8434 }
8435
8436 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
8437 {
8438         vm_paddr_t a;
8439
8440         if (have_addr) {
8441                 a = (vm_paddr_t)addr;
8442                 db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
8443         } else {
8444                 db_printf("show phys2dmap addr\n");
8445         }
8446 }
8447 #endif