1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *      This product includes software developed by the University of
35  *      California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88
89 /*
90  *      Manages physical address maps.
91  *
92  *      Since the information managed by this module is
93  *      also stored by the logical address mapping module,
94  *      this module may throw away valid virtual-to-physical
95  *      mappings at almost any time.  However, invalidations
96  *      of virtual-to-physical mappings must be done as
97  *      requested.
98  *
99  *      In order to cope with hardware architectures which
100  *      make virtual-to-physical map invalidates expensive,
101  *      this module may delay invalidate or reduced protection
102  *      operations until such time as they are actually
103  *      necessary.  This module is given full information as
104  *      to which processors are currently using which maps,
105  *      and to when physical maps must be made correct.
106  */
107
108 #include "opt_vm.h"
109
110 #include <sys/param.h>
111 #include <sys/bitstring.h>
112 #include <sys/bus.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/lock.h>
117 #include <sys/malloc.h>
118 #include <sys/mman.h>
119 #include <sys/msgbuf.h>
120 #include <sys/mutex.h>
121 #include <sys/proc.h>
122 #include <sys/rwlock.h>
123 #include <sys/sx.h>
124 #include <sys/vmem.h>
125 #include <sys/vmmeter.h>
126 #include <sys/sched.h>
127 #include <sys/sysctl.h>
128 #include <sys/_unrhdr.h>
129 #include <sys/smp.h>
130
131 #include <vm/vm.h>
132 #include <vm/vm_param.h>
133 #include <vm/vm_kern.h>
134 #include <vm/vm_page.h>
135 #include <vm/vm_map.h>
136 #include <vm/vm_object.h>
137 #include <vm/vm_extern.h>
138 #include <vm/vm_pageout.h>
139 #include <vm/vm_pager.h>
140 #include <vm/vm_phys.h>
141 #include <vm/vm_radix.h>
142 #include <vm/vm_reserv.h>
143 #include <vm/uma.h>
144
145 #include <machine/machdep.h>
146 #include <machine/md_var.h>
147 #include <machine/pcb.h>
148
149 #include <arm/include/physmem.h>
150
151 #define NL0PG           (PAGE_SIZE/(sizeof (pd_entry_t)))
152 #define NL1PG           (PAGE_SIZE/(sizeof (pd_entry_t)))
153 #define NL2PG           (PAGE_SIZE/(sizeof (pd_entry_t)))
154 #define NL3PG           (PAGE_SIZE/(sizeof (pt_entry_t)))
155
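/*
 * Total number of page table entries reachable from a full L0 table at each
 * level.  These are used to partition the page table page index space in
 * _pmap_alloc_l3() and _pmap_unwire_l3().
 */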
156 #define NUL0E           L0_ENTRIES
157 #define NUL1E           (NUL0E * NL1PG)
158 #define NUL2E           (NUL1E * NL2PG)
159
160 #if !defined(DIAGNOSTIC)
161 #ifdef __GNUC_GNU_INLINE__
162 #define PMAP_INLINE     __attribute__((__gnu_inline__)) inline
163 #else
164 #define PMAP_INLINE     extern inline
165 #endif
166 #else
167 #define PMAP_INLINE
168 #endif
169
170 /*
171  * These are configured by the mair_el1 register. This is set up in locore.S
172  */
173 #define DEVICE_MEMORY   0
174 #define UNCACHED_MEMORY 1
175 #define CACHED_MEMORY   2
176
177
178 #ifdef PV_STATS
179 #define PV_STAT(x)      do { x ; } while (0)
180 #else
181 #define PV_STAT(x)      do { } while (0)
182 #endif
183
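/*
 * pa_to_pvh() returns the pv list head for the 2MB (L2) superpage that
 * contains the given physical address.
 */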
184 #define pmap_l2_pindex(v)       ((v) >> L2_SHIFT)
185 #define pa_to_pvh(pa)           (&pv_table[pmap_l2_pindex(pa)])
186
187 #define NPV_LIST_LOCKS  MAXCPU
188
189 #define PHYS_TO_PV_LIST_LOCK(pa)        \
190                         (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
191
192 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
193         struct rwlock **_lockp = (lockp);               \
194         struct rwlock *_new_lock;                       \
195                                                         \
196         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
197         if (_new_lock != *_lockp) {                     \
198                 if (*_lockp != NULL)                    \
199                         rw_wunlock(*_lockp);            \
200                 *_lockp = _new_lock;                    \
201                 rw_wlock(*_lockp);                      \
202         }                                               \
203 } while (0)
204
205 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
206                         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
207
208 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
209         struct rwlock **_lockp = (lockp);               \
210                                                         \
211         if (*_lockp != NULL) {                          \
212                 rw_wunlock(*_lockp);                    \
213                 *_lockp = NULL;                         \
214         }                                               \
215 } while (0)
216
217 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
218                         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
219
220 struct pmap kernel_pmap_store;
221
222 /* Used for mapping ACPI memory before VM is initialized */
223 #define PMAP_PREINIT_MAPPING_COUNT      32
224 #define PMAP_PREINIT_MAPPING_SIZE       (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
225 static vm_offset_t preinit_map_va;      /* Start VA of pre-init mapping space */
226 static int vm_initialized = 0;          /* No need to use pre-init maps when set */
227
228 /*
229  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
230  * Always map entire L2 block for simplicity.
231  * VA of L2 block = preinit_map_va + i * L2_SIZE
232  */
233 static struct pmap_preinit_mapping {
234         vm_paddr_t      pa;
235         vm_offset_t     va;
236         vm_size_t       size;
237 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
238
239 vm_offset_t virtual_avail;      /* VA of first avail page (after kernel bss) */
240 vm_offset_t virtual_end;        /* VA of last avail page (end of kernel AS) */
241 vm_offset_t kernel_vm_end = 0;
242
243 /*
244  * Data for the pv entry allocation mechanism.
245  * Updates to pv_invl_gen are protected by the pv_list_locks[]
246  * elements, but reads are not.
247  */
248 static struct md_page *pv_table;
249 static struct md_page pv_dummy;
250
251 vm_paddr_t dmap_phys_base;      /* The start of the dmap region */
252 vm_paddr_t dmap_phys_max;       /* The limit of the dmap region */
253 vm_offset_t dmap_max_addr;      /* The virtual address limit of the dmap */
254
255 /* This code assumes all L1 DMAP entries will be used */
256 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
257 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
258
259 #define DMAP_TABLES     ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
260 extern pt_entry_t pagetable_dmap[];
261
262 #define PHYSMAP_SIZE    (2 * (VM_PHYSSEG_MAX - 1))
263 static vm_paddr_t physmap[PHYSMAP_SIZE];
264 static u_int physmap_idx;
265
266 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
267
268 static int superpages_enabled = 1;
269 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
270     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
271     "Are large page mappings enabled?");
272
273 /*
274  * Data for the pv entry allocation mechanism
275  */
276 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
277 static struct mtx pv_chunks_mutex;
278 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
279
280 static void     free_pv_chunk(struct pv_chunk *pc);
281 static void     free_pv_entry(pmap_t pmap, pv_entry_t pv);
282 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
283 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
284 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
285 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
286                     vm_offset_t va);
287
288 static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
289 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
290 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
291 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
292     vm_offset_t va, struct rwlock **lockp);
293 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
294 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
295     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
296 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
297     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
298 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
299     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
300 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
301     vm_page_t m, struct rwlock **lockp);
302
303 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
304                 struct rwlock **lockp);
305
306 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
307     struct spglist *free);
308 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
309 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
310
311 /*
312  * These load the old table data and store the new value.
313  * They need to be atomic as the System MMU may write to the table at
314  * the same time as the CPU.
315  */
316 #define pmap_load_store(table, entry) atomic_swap_64(table, entry)
317 #define pmap_set(table, mask) atomic_set_64(table, mask)
318 #define pmap_load_clear(table) atomic_swap_64(table, 0)
319 #define pmap_load(table) (*table)
320
321 /********************/
322 /* Inline functions */
323 /********************/
324
325 static __inline void
326 pagecopy(void *s, void *d)
327 {
328
329         memcpy(d, s, PAGE_SIZE);
330 }
331
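/*
 * Page table walking helpers.  pmap_l0() returns the L0 entry for va, and
 * each pmap_lN_to_lM() steps from a valid level N table entry to the level M
 * entry for va via the direct map.  The pmap_l1()/pmap_l2() wrappers return
 * NULL if an intermediate entry is not a valid table descriptor.
 */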
332 static __inline pd_entry_t *
333 pmap_l0(pmap_t pmap, vm_offset_t va)
334 {
335
336         return (&pmap->pm_l0[pmap_l0_index(va)]);
337 }
338
339 static __inline pd_entry_t *
340 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
341 {
342         pd_entry_t *l1;
343
344         l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
345         return (&l1[pmap_l1_index(va)]);
346 }
347
348 static __inline pd_entry_t *
349 pmap_l1(pmap_t pmap, vm_offset_t va)
350 {
351         pd_entry_t *l0;
352
353         l0 = pmap_l0(pmap, va);
354         if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
355                 return (NULL);
356
357         return (pmap_l0_to_l1(l0, va));
358 }
359
360 static __inline pd_entry_t *
361 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
362 {
363         pd_entry_t *l2;
364
365         l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
366         return (&l2[pmap_l2_index(va)]);
367 }
368
369 static __inline pd_entry_t *
370 pmap_l2(pmap_t pmap, vm_offset_t va)
371 {
372         pd_entry_t *l1;
373
374         l1 = pmap_l1(pmap, va);
375         if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
376                 return (NULL);
377
378         return (pmap_l1_to_l2(l1, va));
379 }
380
381 static __inline pt_entry_t *
382 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
383 {
384         pt_entry_t *l3;
385
386         l3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
387         return (&l3[pmap_l3_index(va)]);
388 }
389
390 /*
391  * Returns the lowest valid pde for a given virtual address.
392  * The next level may or may not point to a valid page or block.
393  */
394 static __inline pd_entry_t *
395 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
396 {
397         pd_entry_t *l0, *l1, *l2, desc;
398
399         l0 = pmap_l0(pmap, va);
400         desc = pmap_load(l0) & ATTR_DESCR_MASK;
401         if (desc != L0_TABLE) {
402                 *level = -1;
403                 return (NULL);
404         }
405
406         l1 = pmap_l0_to_l1(l0, va);
407         desc = pmap_load(l1) & ATTR_DESCR_MASK;
408         if (desc != L1_TABLE) {
409                 *level = 0;
410                 return (l0);
411         }
412
413         l2 = pmap_l1_to_l2(l1, va);
414         desc = pmap_load(l2) & ATTR_DESCR_MASK;
415         if (desc != L2_TABLE) {
416                 *level = 1;
417                 return (l1);
418         }
419
420         *level = 2;
421         return (l2);
422 }
423
424 /*
425  * Returns the lowest valid pte block or table entry for a given virtual
426  * address. If there are no valid entries return NULL and set the level to
427  * the first invalid level.
428  */
429 static __inline pt_entry_t *
430 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
431 {
432         pd_entry_t *l1, *l2, desc;
433         pt_entry_t *l3;
434
435         l1 = pmap_l1(pmap, va);
436         if (l1 == NULL) {
437                 *level = 0;
438                 return (NULL);
439         }
440         desc = pmap_load(l1) & ATTR_DESCR_MASK;
441         if (desc == L1_BLOCK) {
442                 *level = 1;
443                 return (l1);
444         }
445
446         if (desc != L1_TABLE) {
447                 *level = 1;
448                 return (NULL);
449         }
450
451         l2 = pmap_l1_to_l2(l1, va);
452         desc = pmap_load(l2) & ATTR_DESCR_MASK;
453         if (desc == L2_BLOCK) {
454                 *level = 2;
455                 return (l2);
456         }
457
458         if (desc != L2_TABLE) {
459                 *level = 2;
460                 return (NULL);
461         }
462
463         *level = 3;
464         l3 = pmap_l2_to_l3(l2, va);
465         if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
466                 return (NULL);
467
468         return (l3);
469 }
470
471 static inline bool
472 pmap_superpages_enabled(void)
473 {
474
475         return (superpages_enabled != 0);
476 }
477
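/*
 * Fill in pointers to the page table entries that map va at each level.
 * Levels below a block mapping are set to NULL.  Returns false if the walk
 * reaches an invalid table entry, true otherwise.
 */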
478 bool
479 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
480     pd_entry_t **l2, pt_entry_t **l3)
481 {
482         pd_entry_t *l0p, *l1p, *l2p;
483
484         if (pmap->pm_l0 == NULL)
485                 return (false);
486
487         l0p = pmap_l0(pmap, va);
488         *l0 = l0p;
489
490         if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
491                 return (false);
492
493         l1p = pmap_l0_to_l1(l0p, va);
494         *l1 = l1p;
495
496         if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
497                 *l2 = NULL;
498                 *l3 = NULL;
499                 return (true);
500         }
501
502         if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
503                 return (false);
504
505         l2p = pmap_l1_to_l2(l1p, va);
506         *l2 = l2p;
507
508         if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
509                 *l3 = NULL;
510                 return (true);
511         }
512
513         if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
514                 return (false);
515
516         *l3 = pmap_l2_to_l3(l2p, va);
517
518         return (true);
519 }
520
521 static __inline int
522 pmap_l3_valid(pt_entry_t l3)
523 {
524
525         return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
526 }
527
528
529 CTASSERT(L1_BLOCK == L2_BLOCK);
530
531 /*
532  * Checks if the page is dirty.  We currently lack proper dirty-bit tracking
533  * on arm64, so for now assume that a page mapped read/write and accessed is dirty.
534  */
535 static inline int
536 pmap_page_dirty(pt_entry_t pte)
537 {
538
539         return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
540             (ATTR_AF | ATTR_AP(ATTR_AP_RW)));
541 }
542
543 static __inline void
544 pmap_resident_count_inc(pmap_t pmap, int count)
545 {
546
547         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
548         pmap->pm_stats.resident_count += count;
549 }
550
551 static __inline void
552 pmap_resident_count_dec(pmap_t pmap, int count)
553 {
554
555         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
556         KASSERT(pmap->pm_stats.resident_count >= count,
557             ("pmap %p resident count underflow %ld %d", pmap,
558             pmap->pm_stats.resident_count, count));
559         pmap->pm_stats.resident_count -= count;
560 }
561
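/*
 * Bootstrap helper: return the L2 table mapping va along with the L1 and L2
 * slot indices.  This relies on locore having mapped the early kernel with a
 * single L1 table entry whose L2 table is at init_pt_va.
 */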
562 static pt_entry_t *
563 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
564     u_int *l2_slot)
565 {
566         pt_entry_t *l2;
567         pd_entry_t *l1;
568
569         l1 = (pd_entry_t *)l1pt;
570         *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
571
572         /* Check that locore used an L1 table mapping */
573         KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE,
574            ("Invalid bootstrap L1 table"));
575         /* Find the address of the L2 table */
576         l2 = (pt_entry_t *)init_pt_va;
577         *l2_slot = pmap_l2_index(va);
578
579         return (l2);
580 }
581
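/* Translate a virtual address to a physical one using the bootstrap tables. */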
582 static vm_paddr_t
583 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
584 {
585         u_int l1_slot, l2_slot;
586         pt_entry_t *l2;
587
588         l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
589
590         return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET));
591 }
592
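/*
 * Construct the direct map (DMAP) over all physmap regions.  1GB L1 blocks
 * are used wherever a region spans a full L1 range; 2MB L2 blocks cover the
 * unaligned head and tail of each region, with the needed L2 tables carved
 * out of freemempos.  Returns the updated freemempos.
 */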
593 static vm_offset_t
594 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
595     vm_offset_t freemempos)
596 {
597         pt_entry_t *l2;
598         vm_offset_t va;
599         vm_paddr_t l2_pa, pa;
600         u_int l1_slot, l2_slot, prev_l1_slot;
601         int i;
602
603         dmap_phys_base = min_pa & ~L1_OFFSET;
604         dmap_phys_max = 0;
605         dmap_max_addr = 0;
606         l2 = NULL;
607         prev_l1_slot = -1;
608
610         memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
611
612         for (i = 0; i < (physmap_idx * 2); i += 2) {
613                 pa = physmap[i] & ~L2_OFFSET;
614                 va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
615
616                 /* Create L2 mappings at the start of the region */
617                 if ((pa & L1_OFFSET) != 0) {
618                         l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
619                         if (l1_slot != prev_l1_slot) {
620                                 prev_l1_slot = l1_slot;
621                                 l2 = (pt_entry_t *)freemempos;
622                                 l2_pa = pmap_early_vtophys(kern_l1,
623                                     (vm_offset_t)l2);
624                                 freemempos += PAGE_SIZE;
625
626                                 pmap_load_store(&pagetable_dmap[l1_slot],
627                                     (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
628
629                                 memset(l2, 0, PAGE_SIZE);
630                         }
631                         KASSERT(l2 != NULL,
632                             ("pmap_bootstrap_dmap: NULL l2 map"));
633                         for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
634                             pa += L2_SIZE, va += L2_SIZE) {
635                                 /*
636                                  * We are on a boundary, stop to
637                                  * create a level 1 block
638                                  */
639                                 if ((pa & L1_OFFSET) == 0)
640                                         break;
641
642                                 l2_slot = pmap_l2_index(va);
643                                 KASSERT(l2_slot != 0, ("..."));
644                                 pmap_load_store(&l2[l2_slot],
645                                     (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
646                                     ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
647                         }
648                         KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
649                             ("..."));
650                 }
651
652                 for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
653                     (physmap[i + 1] - pa) >= L1_SIZE;
654                     pa += L1_SIZE, va += L1_SIZE) {
655                         l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
656                         pmap_load_store(&pagetable_dmap[l1_slot],
657                             (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
658                             ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
659                 }
660
661                 /* Create L2 mappings at the end of the region */
662                 if (pa < physmap[i + 1]) {
663                         l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
664                         if (l1_slot != prev_l1_slot) {
665                                 prev_l1_slot = l1_slot;
666                                 l2 = (pt_entry_t *)freemempos;
667                                 l2_pa = pmap_early_vtophys(kern_l1,
668                                     (vm_offset_t)l2);
669                                 freemempos += PAGE_SIZE;
670
671                                 pmap_load_store(&pagetable_dmap[l1_slot],
672                                     (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
673
674                                 memset(l2, 0, PAGE_SIZE);
675                         }
676                         KASSERT(l2 != NULL,
677                             ("pmap_bootstrap_dmap: NULL l2 map"));
678                         for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
679                             pa += L2_SIZE, va += L2_SIZE) {
680                                 l2_slot = pmap_l2_index(va);
681                                 pmap_load_store(&l2[l2_slot],
682                                     (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
683                                     ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
684                         }
685                 }
686
687                 if (pa > dmap_phys_max) {
688                         dmap_phys_max = pa;
689                         dmap_max_addr = va;
690                 }
691         }
692
693         cpu_tlb_flushID();
694
695         return (freemempos);
696 }
697
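/*
 * Allocate empty L2 tables starting at l2_start and install them in the
 * kernel L1 entries covering va up to VM_MAX_KERNEL_ADDRESS.  Returns the
 * first free address after the new tables.
 */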
698 static vm_offset_t
699 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
700 {
701         vm_offset_t l2pt;
702         vm_paddr_t pa;
703         pd_entry_t *l1;
704         u_int l1_slot;
705
706         KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
707
708         l1 = (pd_entry_t *)l1pt;
709         l1_slot = pmap_l1_index(va);
710         l2pt = l2_start;
711
712         for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
713                 KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
714
715                 pa = pmap_early_vtophys(l1pt, l2pt);
716                 pmap_load_store(&l1[l1_slot],
717                     (pa & ~Ln_TABLE_MASK) | L1_TABLE);
718                 l2pt += PAGE_SIZE;
719         }
720
721         /* Clean the L2 page table */
722         memset((void *)l2_start, 0, l2pt - l2_start);
723
724         return (l2pt);
725 }
726
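/*
 * As above, but allocate empty L3 tables starting at l3_start and install
 * them in the kernel L2 entries covering va up to VM_MAX_KERNEL_ADDRESS.
 * Used to back the early devmap region.
 */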
727 static vm_offset_t
728 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
729 {
730         vm_offset_t l3pt;
731         vm_paddr_t pa;
732         pd_entry_t *l2;
733         u_int l2_slot;
734
735         KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
736
737         l2 = pmap_l2(kernel_pmap, va);
738         l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
739         l2_slot = pmap_l2_index(va);
740         l3pt = l3_start;
741
742         for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
743                 KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
744
745                 pa = pmap_early_vtophys(l1pt, l3pt);
746                 pmap_load_store(&l2[l2_slot],
747                     (pa & ~Ln_TABLE_MASK) | L2_TABLE);
748                 l3pt += PAGE_SIZE;
749         }
750
751         /* Clean the L3 page tables */
752         memset((void *)l3_start, 0, l3pt - l3_start);
753
754         return (l3pt);
755 }
756
757 /*
758  *      Bootstrap the system enough to run with virtual memory.
759  */
760 void
761 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
762     vm_size_t kernlen)
763 {
764         u_int l1_slot, l2_slot;
765         uint64_t kern_delta;
766         pt_entry_t *l2;
767         vm_offset_t va, freemempos;
768         vm_offset_t dpcpu, msgbufpv;
769         vm_paddr_t start_pa, pa, min_pa;
770         int i;
771
772         kern_delta = KERNBASE - kernstart;
773
774         printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
775         printf("%lx\n", l1pt);
776         printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
777
778         /* Set this early so we can use the pagetable walking functions */
779         kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
780         PMAP_LOCK_INIT(kernel_pmap);
781
782         /* Assume the address we were loaded to is a valid physical address */
783         min_pa = KERNBASE - kern_delta;
784
785         physmap_idx = arm_physmem_avail(physmap, nitems(physmap));
786         physmap_idx /= 2;
787
788         /*
789          * Find the minimum physical address. physmap is sorted,
790          * but may contain empty ranges.
791          */
792         for (i = 0; i < (physmap_idx * 2); i += 2) {
793                 if (physmap[i] == physmap[i + 1])
794                         continue;
795                 if (physmap[i] <= min_pa)
796                         min_pa = physmap[i];
797         }
798
799         freemempos = KERNBASE + kernlen;
800         freemempos = roundup2(freemempos, PAGE_SIZE);
801
802         /* Create a direct map region early so we can use it for pa -> va */
803         freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
804
805         va = KERNBASE;
806         start_pa = pa = KERNBASE - kern_delta;
807
808         /*
809          * Read the page table to find out what is already mapped.
810          * This assumes we have mapped a block of memory from KERNBASE
811          * using a single L1 entry.
812          */
813         l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
814
815         /* Sanity check the index, KERNBASE should be the first VA */
816         KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
817
818         /* Find how many pages we have mapped */
819         for (; l2_slot < Ln_ENTRIES; l2_slot++) {
820                 if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
821                         break;
822
823                 /* Check locore used L2 blocks */
824                 KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
825                     ("Invalid bootstrap L2 table"));
826                 KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
827                     ("Incorrect PA in L2 table"));
828
829                 va += L2_SIZE;
830                 pa += L2_SIZE;
831         }
832
833         va = roundup2(va, L1_SIZE);
834
835         /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
836         freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
837         /* And the l3 tables for the early devmap */
838         freemempos = pmap_bootstrap_l3(l1pt,
839             VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
840
841         cpu_tlb_flushID();
842
843 #define alloc_pages(var, np)                                            \
844         (var) = freemempos;                                             \
845         freemempos += (np * PAGE_SIZE);                                 \
846         memset((char *)(var), 0, ((np) * PAGE_SIZE));
847
848         /* Allocate dynamic per-cpu area. */
849         alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
850         dpcpu_init((void *)dpcpu, 0);
851
852         /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
853         alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
854         msgbufp = (void *)msgbufpv;
855
856         /* Reserve some VA space for early BIOS/ACPI mapping */
857         preinit_map_va = roundup2(freemempos, L2_SIZE);
858
859         virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
860         virtual_avail = roundup2(virtual_avail, L1_SIZE);
861         virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
862         kernel_vm_end = virtual_avail;
863
864         pa = pmap_early_vtophys(l1pt, freemempos);
865
866         arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
867
868         cpu_tlb_flushID();
869 }
870
871 /*
872  *      Initialize a vm_page's machine-dependent fields.
873  */
874 void
875 pmap_page_init(vm_page_t m)
876 {
877
878         TAILQ_INIT(&m->md.pv_list);
879         m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
880 }
881
882 /*
883  *      Initialize the pmap module.
884  *      Called by vm_init, to initialize any structures that the pmap
885  *      system needs to map virtual memory.
886  */
887 void
888 pmap_init(void)
889 {
890         vm_size_t s;
891         int i, pv_npg;
892
893         /*
894          * Are large page mappings enabled?
895          */
896         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
897
898         /*
899          * Initialize the pv chunk list mutex.
900          */
901         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
902
903         /*
904          * Initialize the pool of pv list locks.
905          */
906         for (i = 0; i < NPV_LIST_LOCKS; i++)
907                 rw_init(&pv_list_locks[i], "pmap pv list");
908
909         /*
910          * Calculate the size of the pv head table for superpages.
911          */
912         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
913
914         /*
915          * Allocate memory for the pv head table for superpages.
916          */
917         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
918         s = round_page(s);
919         pv_table = (struct md_page *)kmem_malloc(kernel_arena, s,
920             M_WAITOK | M_ZERO);
921         for (i = 0; i < pv_npg; i++)
922                 TAILQ_INIT(&pv_table[i].pv_list);
923         TAILQ_INIT(&pv_dummy.pv_list);
924
925         vm_initialized = 1;
926 }
927
928 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
929     "2MB page mapping counters");
930
931 static u_long pmap_l2_demotions;
932 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
933     &pmap_l2_demotions, 0, "2MB page demotions");
934
935 static u_long pmap_l2_p_failures;
936 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
937     &pmap_l2_p_failures, 0, "2MB page promotion failures");
938
939 static u_long pmap_l2_promotions;
940 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
941     &pmap_l2_promotions, 0, "2MB page promotions");
942
943 /*
944  * Invalidate a single TLB entry.
945  */
946 static __inline void
947 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
948 {
949
950         sched_pin();
951         __asm __volatile(
952             "dsb  ishst         \n"
953             "tlbi vaae1is, %0   \n"
954             "dsb  ish           \n"
955             "isb                \n"
956             : : "r"(va >> PAGE_SHIFT));
957         sched_unpin();
958 }
959
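/*
 * Invalidate the TLB entries covering [sva, eva) without pinning the thread;
 * pmap_invalidate_range() wraps this with sched_pin()/sched_unpin().
 */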
960 static __inline void
961 pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
962 {
963         vm_offset_t addr;
964
965         dsb(ishst);
966         for (addr = sva; addr < eva; addr += PAGE_SIZE) {
967                 __asm __volatile(
968                     "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
969         }
970         __asm __volatile(
971             "dsb  ish   \n"
972             "isb        \n");
973 }
974
975 static __inline void
976 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
977 {
978
979         sched_pin();
980         pmap_invalidate_range_nopin(pmap, sva, eva);
981         sched_unpin();
982 }
983
984 static __inline void
985 pmap_invalidate_all(pmap_t pmap)
986 {
987
988         sched_pin();
989         __asm __volatile(
990             "dsb  ishst         \n"
991             "tlbi vmalle1is     \n"
992             "dsb  ish           \n"
993             "isb                \n");
994         sched_unpin();
995 }
996
997 /*
998  *      Routine:        pmap_extract
999  *      Function:
1000  *              Extract the physical page address associated
1001  *              with the given map/virtual_address pair.
1002  */
1003 vm_paddr_t
1004 pmap_extract(pmap_t pmap, vm_offset_t va)
1005 {
1006         pt_entry_t *pte, tpte;
1007         vm_paddr_t pa;
1008         int lvl;
1009
1010         pa = 0;
1011         PMAP_LOCK(pmap);
1012         /*
1013          * Find the block or page map for this virtual address. pmap_pte
1014          * will return either a valid block/page entry, or NULL.
1015          */
1016         pte = pmap_pte(pmap, va, &lvl);
1017         if (pte != NULL) {
1018                 tpte = pmap_load(pte);
1019                 pa = tpte & ~ATTR_MASK;
1020                 switch(lvl) {
1021                 case 1:
1022                         KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1023                             ("pmap_extract: Invalid L1 pte found: %lx",
1024                             tpte & ATTR_DESCR_MASK));
1025                         pa |= (va & L1_OFFSET);
1026                         break;
1027                 case 2:
1028                         KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1029                             ("pmap_extract: Invalid L2 pte found: %lx",
1030                             tpte & ATTR_DESCR_MASK));
1031                         pa |= (va & L2_OFFSET);
1032                         break;
1033                 case 3:
1034                         KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1035                             ("pmap_extract: Invalid L3 pte found: %lx",
1036                             tpte & ATTR_DESCR_MASK));
1037                         pa |= (va & L3_OFFSET);
1038                         break;
1039                 }
1040         }
1041         PMAP_UNLOCK(pmap);
1042         return (pa);
1043 }
1044
1045 /*
1046  *      Routine:        pmap_extract_and_hold
1047  *      Function:
1048  *              Atomically extract and hold the physical page
1049  *              with the given pmap and virtual address pair
1050  *              if that mapping permits the given protection.
1051  */
1052 vm_page_t
1053 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1054 {
1055         pt_entry_t *pte, tpte;
1056         vm_offset_t off;
1057         vm_paddr_t pa;
1058         vm_page_t m;
1059         int lvl;
1060
1061         pa = 0;
1062         m = NULL;
1063         PMAP_LOCK(pmap);
1064 retry:
1065         pte = pmap_pte(pmap, va, &lvl);
1066         if (pte != NULL) {
1067                 tpte = pmap_load(pte);
1068
1069                 KASSERT(lvl > 0 && lvl <= 3,
1070                     ("pmap_extract_and_hold: Invalid level %d", lvl));
1071                 CTASSERT(L1_BLOCK == L2_BLOCK);
1072                 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1073                     (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1074                     ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1075                      tpte & ATTR_DESCR_MASK));
1076                 if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
1077                     ((prot & VM_PROT_WRITE) == 0)) {
1078                         switch(lvl) {
1079                         case 1:
1080                                 off = va & L1_OFFSET;
1081                                 break;
1082                         case 2:
1083                                 off = va & L2_OFFSET;
1084                                 break;
1085                         case 3:
1086                         default:
1087                                 off = 0;
1088                         }
1089                         if (vm_page_pa_tryrelock(pmap,
1090                             (tpte & ~ATTR_MASK) | off, &pa))
1091                                 goto retry;
1092                         m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1093                         vm_page_hold(m);
1094                 }
1095         }
1096         PA_UNLOCK_COND(pa);
1097         PMAP_UNLOCK(pmap);
1098         return (m);
1099 }
1100
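/*
 *      Routine:        pmap_kextract
 *      Function:
 *              Extract the physical address for a kernel virtual address,
 *              using the direct map when the address falls within it.
 */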
1101 vm_paddr_t
1102 pmap_kextract(vm_offset_t va)
1103 {
1104         pt_entry_t *pte, tpte;
1105         vm_paddr_t pa;
1106         int lvl;
1107
1108         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
1109                 pa = DMAP_TO_PHYS(va);
1110         } else {
1111                 pa = 0;
1112                 pte = pmap_pte(kernel_pmap, va, &lvl);
1113                 if (pte != NULL) {
1114                         tpte = pmap_load(pte);
1115                         pa = tpte & ~ATTR_MASK;
1116                         switch(lvl) {
1117                         case 1:
1118                                 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1119                                     ("pmap_kextract: Invalid L1 pte found: %lx",
1120                                     tpte & ATTR_DESCR_MASK));
1121                                 pa |= (va & L1_OFFSET);
1122                                 break;
1123                         case 2:
1124                                 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1125                                     ("pmap_kextract: Invalid L2 pte found: %lx",
1126                                     tpte & ATTR_DESCR_MASK));
1127                                 pa |= (va & L2_OFFSET);
1128                                 break;
1129                         case 3:
1130                                 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1131                                     ("pmap_kextract: Invalid L3 pte found: %lx",
1132                                     tpte & ATTR_DESCR_MASK));
1133                                 pa |= (va & L3_OFFSET);
1134                                 break;
1135                         }
1136                 }
1137         }
1138         return (pa);
1139 }
1140
1141 /***************************************************
1142  * Low level mapping routines.....
1143  ***************************************************/
1144
1145 void
1146 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1147 {
1148         pd_entry_t *pde;
1149         pt_entry_t *pte, attr;
1150         vm_offset_t va;
1151         int lvl;
1152
1153         KASSERT((pa & L3_OFFSET) == 0,
1154            ("pmap_kenter: Invalid physical address"));
1155         KASSERT((sva & L3_OFFSET) == 0,
1156            ("pmap_kenter: Invalid virtual address"));
1157         KASSERT((size & PAGE_MASK) == 0,
1158             ("pmap_kenter: Mapping is not page-sized"));
1159
1160         attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
1161         if (mode == DEVICE_MEMORY)
1162                 attr |= ATTR_XN;
1163
1164         va = sva;
1165         while (size != 0) {
1166                 pde = pmap_pde(kernel_pmap, va, &lvl);
1167                 KASSERT(pde != NULL,
1168                     ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1169                 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1170
1171                 pte = pmap_l2_to_l3(pde, va);
1172                 pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1173
1174                 va += PAGE_SIZE;
1175                 pa += PAGE_SIZE;
1176                 size -= PAGE_SIZE;
1177         }
1178         pmap_invalidate_range(kernel_pmap, sva, va);
1179 }
1180
1181 void
1182 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1183 {
1184
1185         pmap_kenter(sva, size, pa, DEVICE_MEMORY);
1186 }
1187
1188 /*
1189  * Remove a page from the kernel pagetables.
1190  */
1191 PMAP_INLINE void
1192 pmap_kremove(vm_offset_t va)
1193 {
1194         pt_entry_t *pte;
1195         int lvl;
1196
1197         pte = pmap_pte(kernel_pmap, va, &lvl);
1198         KASSERT(pte != NULL, ("pmap_kremove: Invalid address"));
1199         KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl));
1200
1201         pmap_load_clear(pte);
1202         pmap_invalidate_page(kernel_pmap, va);
1203 }
1204
1205 void
1206 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1207 {
1208         pt_entry_t *pte;
1209         vm_offset_t va;
1210         int lvl;
1211
1212         KASSERT((sva & L3_OFFSET) == 0,
1213            ("pmap_kremove_device: Invalid virtual address"));
1214         KASSERT((size & PAGE_MASK) == 0,
1215             ("pmap_kremove_device: Mapping is not page-sized"));
1216
1217         va = sva;
1218         while (size != 0) {
1219                 pte = pmap_pte(kernel_pmap, va, &lvl);
1220                 KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va));
1221                 KASSERT(lvl == 3,
1222                     ("Invalid device pagetable level: %d != 3", lvl));
1223                 pmap_load_clear(pte);
1224
1225                 va += PAGE_SIZE;
1226                 size -= PAGE_SIZE;
1227         }
1228         pmap_invalidate_range(kernel_pmap, sva, va);
1229 }
1230
1231 /*
1232  *      Used to map a range of physical addresses into kernel
1233  *      virtual address space.
1234  *
1235  *      The value passed in '*virt' is a suggested virtual address for
1236  *      the mapping. Architectures which can support a direct-mapped
1237  *      physical to virtual region can return the appropriate address
1238  *      within that region, leaving '*virt' unchanged. Other
1239  *      architectures should map the pages starting at '*virt' and
1240  *      update '*virt' with the first usable address after the mapped
1241  *      region.
1242  */
1243 vm_offset_t
1244 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1245 {
1246         return PHYS_TO_DMAP(start);
1247 }
1248
1249
1250 /*
1251  * Add a list of wired pages to the kva.
1252  * This routine is only used for temporary
1253  * kernel mappings that do not need to have
1254  * page modification or references recorded.
1255  * Note that old mappings are simply written
1256  * over.  The page *must* be wired.
1257  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1258  */
1259 void
1260 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1261 {
1262         pd_entry_t *pde;
1263         pt_entry_t *pte, pa;
1264         vm_offset_t va;
1265         vm_page_t m;
1266         int i, lvl;
1267
1268         va = sva;
1269         for (i = 0; i < count; i++) {
1270                 pde = pmap_pde(kernel_pmap, va, &lvl);
1271                 KASSERT(pde != NULL,
1272                     ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1273                 KASSERT(lvl == 2,
1274                     ("pmap_qenter: Invalid level %d", lvl));
1275
1276                 m = ma[i];
1277                 pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) |
1278                     ATTR_IDX(m->md.pv_memattr) | L3_PAGE;
1279                 if (m->md.pv_memattr == DEVICE_MEMORY)
1280                         pa |= ATTR_XN;
1281                 pte = pmap_l2_to_l3(pde, va);
1282                 pmap_load_store(pte, pa);
1283
1284                 va += L3_SIZE;
1285         }
1286         pmap_invalidate_range(kernel_pmap, sva, va);
1287 }
1288
1289 /*
1290  * This routine tears out page mappings from the
1291  * kernel -- it is meant only for temporary mappings.
1292  */
1293 void
1294 pmap_qremove(vm_offset_t sva, int count)
1295 {
1296         pt_entry_t *pte;
1297         vm_offset_t va;
1298         int lvl;
1299
1300         KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1301
1302         va = sva;
1303         while (count-- > 0) {
1304                 pte = pmap_pte(kernel_pmap, va, &lvl);
1305                 KASSERT(lvl == 3,
1306                     ("pmap_qremove: Invalid pte level: %d != 3", lvl));
1307                 if (pte != NULL) {
1308                         pmap_load_clear(pte);
1309                 }
1310
1311                 va += PAGE_SIZE;
1312         }
1313         pmap_invalidate_range(kernel_pmap, sva, va);
1314 }
1315
1316 /***************************************************
1317  * Page table page management routines.....
1318  ***************************************************/
1319 /*
1320  * Schedule the specified unused page table page to be freed.  Specifically,
1321  * add the page to the specified list of pages that will be released to the
1322  * physical memory manager after the TLB has been updated.
1323  */
1324 static __inline void
1325 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1326     boolean_t set_PG_ZERO)
1327 {
1328
1329         if (set_PG_ZERO)
1330                 m->flags |= PG_ZERO;
1331         else
1332                 m->flags &= ~PG_ZERO;
1333         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1334 }
1335
1336 /*
1337  * Decrements a page table page's wire count, which is used to record the
1338  * number of valid page table entries within the page.  If the wire count
1339  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1340  * page table page was unmapped and FALSE otherwise.
1341  */
1342 static inline boolean_t
1343 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1344 {
1345
1346         --m->wire_count;
1347         if (m->wire_count == 0) {
1348                 _pmap_unwire_l3(pmap, va, m, free);
1349                 return (TRUE);
1350         } else
1351                 return (FALSE);
1352 }
1353
1354 static void
1355 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1356 {
1357
1358         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1359         /*
1360          * unmap the page table page
1361          */
1362         if (m->pindex >= (NUL2E + NUL1E)) {
1363                 /* l1 page */
1364                 pd_entry_t *l0;
1365
1366                 l0 = pmap_l0(pmap, va);
1367                 pmap_load_clear(l0);
1368         } else if (m->pindex >= NUL2E) {
1369                 /* l2 page */
1370                 pd_entry_t *l1;
1371
1372                 l1 = pmap_l1(pmap, va);
1373                 pmap_load_clear(l1);
1374         } else {
1375                 /* l3 page */
1376                 pd_entry_t *l2;
1377
1378                 l2 = pmap_l2(pmap, va);
1379                 pmap_load_clear(l2);
1380         }
1381         pmap_resident_count_dec(pmap, 1);
1382         if (m->pindex < NUL2E) {
1383                 /* We just released an l3, unhold the matching l2 */
1384                 pd_entry_t *l1, tl1;
1385                 vm_page_t l2pg;
1386
1387                 l1 = pmap_l1(pmap, va);
1388                 tl1 = pmap_load(l1);
1389                 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1390                 pmap_unwire_l3(pmap, va, l2pg, free);
1391         } else if (m->pindex < (NUL2E + NUL1E)) {
1392                 /* We just released an l2, unhold the matching l1 */
1393                 pd_entry_t *l0, tl0;
1394                 vm_page_t l1pg;
1395
1396                 l0 = pmap_l0(pmap, va);
1397                 tl0 = pmap_load(l0);
1398                 l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1399                 pmap_unwire_l3(pmap, va, l1pg, free);
1400         }
1401         pmap_invalidate_page(pmap, va);
1402
1403         vm_wire_sub(1);
1404
1405         /*
1406          * Put page on a list so that it is released after
1407          * *ALL* TLB shootdown is done
1408          */
1409         pmap_add_delayed_free_list(m, free, TRUE);
1410 }
1411
1412 /*
1413  * After removing a page table entry, this routine is used to
1414  * conditionally free the page, and manage the hold/wire counts.
1415  */
1416 static int
1417 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1418     struct spglist *free)
1419 {
1420         vm_page_t mpte;
1421
1422         if (va >= VM_MAXUSER_ADDRESS)
1423                 return (0);
1424         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1425         mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1426         return (pmap_unwire_l3(pmap, va, mpte, free));
1427 }
1428
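/*
 * Initialize the pmap for process 0.  It shares the kernel pmap's L0 table,
 * so no page table pages are allocated here.
 */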
1429 void
1430 pmap_pinit0(pmap_t pmap)
1431 {
1432
1433         PMAP_LOCK_INIT(pmap);
1434         bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1435         pmap->pm_l0 = kernel_pmap->pm_l0;
1436         pmap->pm_root.rt_root = 0;
1437 }
1438
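/*
 * Initialize a preallocated and zeroed pmap structure, allocating a page to
 * hold its top-level (L0) table.
 */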
1439 int
1440 pmap_pinit(pmap_t pmap)
1441 {
1442         vm_paddr_t l0phys;
1443         vm_page_t l0pt;
1444
1445         /*
1446          * allocate the l0 page
1447          */
1448         while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
1449             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1450                 vm_wait(NULL);
1451
1452         l0phys = VM_PAGE_TO_PHYS(l0pt);
1453         pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys);
1454
1455         if ((l0pt->flags & PG_ZERO) == 0)
1456                 pagezero(pmap->pm_l0);
1457
1458         pmap->pm_root.rt_root = 0;
1459         bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1460
1461         return (1);
1462 }
1463
1464 /*
1465  * This routine is called if the desired page table page does not exist.
1466  *
1467  * If page table page allocation fails, this routine may sleep before
1468  * returning NULL.  It sleeps only if a lock pointer was given.
1469  *
1470  * Note: If a page allocation fails at page table level two or three,
1471  * one or two pages may be held during the wait, only to be released
1472  * afterwards.  This conservative approach is easily argued to avoid
1473  * race conditions.
1474  */
1475 static vm_page_t
1476 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1477 {
1478         vm_page_t m, l1pg, l2pg;
1479
1480         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1481
1482         /*
1483          * Allocate a page table page.
1484          */
1485         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1486             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1487                 if (lockp != NULL) {
1488                         RELEASE_PV_LIST_LOCK(lockp);
1489                         PMAP_UNLOCK(pmap);
1490                         vm_wait(NULL);
1491                         PMAP_LOCK(pmap);
1492                 }
1493
1494                 /*
1495                  * Indicate the need to retry.  While waiting, the page table
1496                  * page may have been allocated.
1497                  */
1498                 return (NULL);
1499         }
1500         if ((m->flags & PG_ZERO) == 0)
1501                 pmap_zero_page(m);
1502
1503         /*
1504          * Map the pagetable page into the process address space, if
1505          * it isn't already there.
1506          */
1507
1508         if (ptepindex >= (NUL2E + NUL1E)) {
1509                 pd_entry_t *l0;
1510                 vm_pindex_t l0index;
1511
1512                 l0index = ptepindex - (NUL2E + NUL1E);
1513                 l0 = &pmap->pm_l0[l0index];
1514                 pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
1515         } else if (ptepindex >= NUL2E) {
1516                 vm_pindex_t l0index, l1index;
1517                 pd_entry_t *l0, *l1;
1518                 pd_entry_t tl0;
1519
1520                 l1index = ptepindex - NUL2E;
1521                 l0index = l1index >> L0_ENTRIES_SHIFT;
1522
1523                 l0 = &pmap->pm_l0[l0index];
1524                 tl0 = pmap_load(l0);
1525                 if (tl0 == 0) {
1526                         /* recurse for allocating page dir */
1527                         if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
1528                             lockp) == NULL) {
1529                                 vm_page_unwire_noq(m);
1530                                 vm_page_free_zero(m);
1531                                 return (NULL);
1532                         }
1533                 } else {
1534                         l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1535                         l1pg->wire_count++;
1536                 }
1537
1538                 l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
1539                 l1 = &l1[ptepindex & Ln_ADDR_MASK];
1540                 pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1541         } else {
1542                 vm_pindex_t l0index, l1index;
1543                 pd_entry_t *l0, *l1, *l2;
1544                 pd_entry_t tl0, tl1;
1545
1546                 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
1547                 l0index = l1index >> L0_ENTRIES_SHIFT;
1548
1549                 l0 = &pmap->pm_l0[l0index];
1550                 tl0 = pmap_load(l0);
1551                 if (tl0 == 0) {
1552                         /* recurse for allocating page dir */
1553                         if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1554                             lockp) == NULL) {
1555                                 vm_page_unwire_noq(m);
1556                                 vm_page_free_zero(m);
1557                                 return (NULL);
1558                         }
1559                         tl0 = pmap_load(l0);
1560                         l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1561                         l1 = &l1[l1index & Ln_ADDR_MASK];
1562                 } else {
1563                         l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
1564                         l1 = &l1[l1index & Ln_ADDR_MASK];
1565                         tl1 = pmap_load(l1);
1566                         if (tl1 == 0) {
1567                                 /* recurse for allocating page dir */
1568                                 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1569                                     lockp) == NULL) {
1570                                         vm_page_unwire_noq(m);
1571                                         vm_page_free_zero(m);
1572                                         return (NULL);
1573                                 }
1574                         } else {
1575                                 l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1576                                 l2pg->wire_count++;
1577                         }
1578                 }
1579
1580                 l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
1581                 l2 = &l2[ptepindex & Ln_ADDR_MASK];
1582                 pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
1583         }
1584
1585         pmap_resident_count_inc(pmap, 1);
1586
1587         return (m);
1588 }
1589
1590 static vm_page_t
1591 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1592 {
1593         vm_pindex_t ptepindex;
1594         pd_entry_t *pde, tpde;
1595 #ifdef INVARIANTS
1596         pt_entry_t *pte;
1597 #endif
1598         vm_page_t m;
1599         int lvl;
1600
1601         /*
1602          * Calculate the index of the level 3 page table page that maps va
1603          */
1604         ptepindex = pmap_l2_pindex(va);
1605 retry:
1606         /*
1607          * Get the page directory entry
1608          */
1609         pde = pmap_pde(pmap, va, &lvl);
1610
1611         /*
1612          * If the page table page is mapped, we just increment its wire count
1613          * and return it. If we get a level 2 pde it will point to a level 3
1614          * table.
1615          */
1616         switch (lvl) {
1617         case -1:
1618                 break;
1619         case 0:
1620 #ifdef INVARIANTS
1621                 pte = pmap_l0_to_l1(pde, va);
1622                 KASSERT(pmap_load(pte) == 0,
1623                     ("pmap_alloc_l3: TODO: l0 superpages"));
1624 #endif
1625                 break;
1626         case 1:
1627 #ifdef INVARIANTS
1628                 pte = pmap_l1_to_l2(pde, va);
1629                 KASSERT(pmap_load(pte) == 0,
1630                     ("pmap_alloc_l3: TODO: l1 superpages"));
1631 #endif
1632                 break;
1633         case 2:
1634                 tpde = pmap_load(pde);
1635                 if (tpde != 0) {
1636                         m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
1637                         m->wire_count++;
1638                         return (m);
1639                 }
1640                 break;
1641         default:
1642                 panic("pmap_alloc_l3: Invalid level %d", lvl);
1643         }
1644
1645         /*
1646          * Here if the pte page isn't mapped, or if it has been deallocated.
1647          */
1648         m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1649         if (m == NULL && lockp != NULL)
1650                 goto retry;
1651
1652         return (m);
1653 }
1654
1655
1656 /***************************************************
1657  * Pmap allocation/deallocation routines.
1658  ***************************************************/
1659
1660 /*
1661  * Release any resources held by the given physical map.
1662  * Called when a pmap initialized by pmap_pinit is being released.
1663  * Should only be called if the map contains no valid mappings.
1664  */
1665 void
1666 pmap_release(pmap_t pmap)
1667 {
1668         vm_page_t m;
1669
1670         KASSERT(pmap->pm_stats.resident_count == 0,
1671             ("pmap_release: pmap resident count %ld != 0",
1672             pmap->pm_stats.resident_count));
1673         KASSERT(vm_radix_is_empty(&pmap->pm_root),
1674             ("pmap_release: pmap has reserved page table page(s)"));
1675
1676         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
1677
1678         vm_page_unwire_noq(m);
1679         vm_page_free_zero(m);
1680 }
1681
1682 static int
1683 kvm_size(SYSCTL_HANDLER_ARGS)
1684 {
1685         unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1686
1687         return sysctl_handle_long(oidp, &ksize, 0, req);
1688 }
1689 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1690     0, 0, kvm_size, "LU", "Size of KVM");
1691
1692 static int
1693 kvm_free(SYSCTL_HANDLER_ARGS)
1694 {
1695         unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1696
1697         return sysctl_handle_long(oidp, &kfree, 0, req);
1698 }
1699 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1700     0, 0, kvm_free, "LU", "Amount of KVM free");
1701
1702 /*
1703  * grow the number of kernel page table entries, if needed
1704  */
1705 void
1706 pmap_growkernel(vm_offset_t addr)
1707 {
1708         vm_paddr_t paddr;
1709         vm_page_t nkpg;
1710         pd_entry_t *l0, *l1, *l2;
1711
1712         mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1713
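             /*
              * The kernel map is grown in L2_SIZE (2MB) steps: each iteration
              * installs one new level 3 page table page, allocating an
              * intermediate level 2 table first when the L1 entry is empty.
              */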
1714         addr = roundup2(addr, L2_SIZE);
1715         if (addr - 1 >= kernel_map->max_offset)
1716                 addr = kernel_map->max_offset;
1717         while (kernel_vm_end < addr) {
1718                 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
1719                 KASSERT(pmap_load(l0) != 0,
1720                     ("pmap_growkernel: No level 0 kernel entry"));
1721
1722                 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
1723                 if (pmap_load(l1) == 0) {
1724                         /* The L1 entry is empty; allocate a new L2 table page */
1725                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1726                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1727                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1728                         if (nkpg == NULL)
1729                                 panic("pmap_growkernel: no memory to grow kernel");
1730                         if ((nkpg->flags & PG_ZERO) == 0)
1731                                 pmap_zero_page(nkpg);
1732                         paddr = VM_PAGE_TO_PHYS(nkpg);
1733                         pmap_load_store(l1, paddr | L1_TABLE);
1734                         continue; /* try again */
1735                 }
1736                 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1737                 if ((pmap_load(l2) & ATTR_AF) != 0) {
1738                         kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1739                         if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1740                                 kernel_vm_end = kernel_map->max_offset;
1741                                 break;
1742                         }
1743                         continue;
1744                 }
1745
1746                 nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1747                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1748                     VM_ALLOC_ZERO);
1749                 if (nkpg == NULL)
1750                         panic("pmap_growkernel: no memory to grow kernel");
1751                 if ((nkpg->flags & PG_ZERO) == 0)
1752                         pmap_zero_page(nkpg);
1753                 paddr = VM_PAGE_TO_PHYS(nkpg);
1754                 pmap_load_store(l2, paddr | L2_TABLE);
1755                 pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1756
1757                 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1758                 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1759                         kernel_vm_end = kernel_map->max_offset;
1760                         break;
1761                 }
1762         }
1763 }
1764
1765
1766 /***************************************************
1767  * page management routines.
1768  ***************************************************/
1769
1770 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1771 CTASSERT(_NPCM == 3);
1772 CTASSERT(_NPCPV == 168);
1773
1774 static __inline struct pv_chunk *
1775 pv_to_chunk(pv_entry_t pv)
1776 {
1777
1778         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1779 }
1780
1781 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1782
1783 #define PC_FREE0        0xfffffffffffffffful
1784 #define PC_FREE1        0xfffffffffffffffful
1785 #define PC_FREE2        0x000000fffffffffful
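     /*
      * Each pv chunk occupies exactly one page and holds _NPCPV (168) pv
      * entries.  The three 64-bit words of pc_map record which slots are
      * free: PC_FREE0 and PC_FREE1 cover 64 entries apiece, and PC_FREE2
      * covers the remaining 40.
      */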
1786
1787 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1788
1789 #if 0
1790 #ifdef PV_STATS
1791 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1792
1793 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1794         "Current number of pv entry chunks");
1795 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1796         "Current number of pv entry chunks allocated");
1797 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1798         "Current number of pv entry chunks frees");
1799 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1800         "Number of times tried to get a chunk page but failed.");
1801
1802 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1803 static int pv_entry_spare;
1804
1805 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1806         "Current number of pv entry frees");
1807 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1808         "Current number of pv entry allocs");
1809 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1810         "Current number of pv entries");
1811 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1812         "Current number of spare pv entries");
1813 #endif
1814 #endif /* 0 */
1815
1816 /*
1817  * We are in a serious low memory condition.  Resort to
1818  * drastic measures to free some pages so we can allocate
1819  * another pv entry chunk.
1820  *
1821  * Returns NULL if PV entries were reclaimed from the specified pmap.
1822  *
1823  * We do not, however, unmap 2mpages because subsequent accesses will
1824  * allocate per-page pv entries until repromotion occurs, thereby
1825  * exacerbating the shortage of free pv entries.
1826  */
1827 static vm_page_t
1828 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1829 {
1830         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1831         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1832         struct md_page *pvh;
1833         pd_entry_t *pde;
1834         pmap_t next_pmap, pmap;
1835         pt_entry_t *pte, tpte;
1836         pv_entry_t pv;
1837         vm_offset_t va;
1838         vm_page_t m, m_pc;
1839         struct spglist free;
1840         uint64_t inuse;
1841         int bit, field, freed, lvl;
1842         static int active_reclaims = 0;
1843
1844         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1845         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1846
1847         pmap = NULL;
1848         m_pc = NULL;
1849         SLIST_INIT(&free);
1850         bzero(&pc_marker_b, sizeof(pc_marker_b));
1851         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1852         pc_marker = (struct pv_chunk *)&pc_marker_b;
1853         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1854
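             /*
              * The two markers bracket this invocation's sweep of the global
              * pc_lru list: pc_marker tracks our current position, while
              * pc_marker_end, inserted at the tail, keeps us from visiting
              * chunks that are queued after the scan starts.
              */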
1855         mtx_lock(&pv_chunks_mutex);
1856         active_reclaims++;
1857         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1858         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1859         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1860             SLIST_EMPTY(&free)) {
1861                 next_pmap = pc->pc_pmap;
1862                 if (next_pmap == NULL) {
1863                         /*
1864                          * The next chunk is a marker.  However, it is
1865                          * not our marker, so active_reclaims must be
1866                          * > 1.  Consequently, the next_chunk code
1867                          * will not rotate the pv_chunks list.
1868                          */
1869                         goto next_chunk;
1870                 }
1871                 mtx_unlock(&pv_chunks_mutex);
1872
1873                 /*
1874                  * A pv_chunk can only be removed from the pc_lru list
1875                  * when both pv_chunks_mutex is owned and the
1876                  * corresponding pmap is locked.
1877                  */
1878                 if (pmap != next_pmap) {
1879                         if (pmap != NULL && pmap != locked_pmap)
1880                                 PMAP_UNLOCK(pmap);
1881                         pmap = next_pmap;
1882                         /* Avoid deadlock and lock recursion. */
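                             /*
                              * Blocking on PMAP_LOCK() is safe only when the
                              * new pmap sorts after locked_pmap, keeping lock
                              * acquisition in ascending address order;
                              * otherwise only a trylock may be attempted.
                              */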
1883                         if (pmap > locked_pmap) {
1884                                 RELEASE_PV_LIST_LOCK(lockp);
1885                                 PMAP_LOCK(pmap);
1886                                 mtx_lock(&pv_chunks_mutex);
1887                                 continue;
1888                         } else if (pmap != locked_pmap) {
1889                                 if (PMAP_TRYLOCK(pmap)) {
1890                                         mtx_lock(&pv_chunks_mutex);
1891                                         continue;
1892                                 } else {
1893                                         pmap = NULL; /* pmap is not locked */
1894                                         mtx_lock(&pv_chunks_mutex);
1895                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
1896                                         if (pc == NULL ||
1897                                             pc->pc_pmap != next_pmap)
1898                                                 continue;
1899                                         goto next_chunk;
1900                                 }
1901                         }
1902                 }
1903
1904                 /*
1905                  * Destroy every non-wired, 4 KB page mapping in the chunk.
1906                  */
1907                 freed = 0;
1908                 for (field = 0; field < _NPCM; field++) {
1909                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1910                             inuse != 0; inuse &= ~(1UL << bit)) {
1911                                 bit = ffsl(inuse) - 1;
1912                                 pv = &pc->pc_pventry[field * 64 + bit];
1913                                 va = pv->pv_va;
1914                                 pde = pmap_pde(pmap, va, &lvl);
1915                                 if (lvl != 2)
1916                                         continue;
1917                                 pte = pmap_l2_to_l3(pde, va);
1918                                 tpte = pmap_load(pte);
1919                                 if ((tpte & ATTR_SW_WIRED) != 0)
1920                                         continue;
1921                                 tpte = pmap_load_clear(pte);
1922                                 pmap_invalidate_page(pmap, va);
1923                                 m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
1924                                 if (pmap_page_dirty(tpte))
1925                                         vm_page_dirty(m);
1926                                 if ((tpte & ATTR_AF) != 0)
1927                                         vm_page_aflag_set(m, PGA_REFERENCED);
1928                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1929                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
1930                                 m->md.pv_gen++;
1931                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
1932                                     (m->flags & PG_FICTITIOUS) == 0) {
1933                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1934                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
1935                                                 vm_page_aflag_clear(m,
1936                                                     PGA_WRITEABLE);
1937                                         }
1938                                 }
1939                                 pc->pc_map[field] |= 1UL << bit;
1940                                 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
1941                                 freed++;
1942                         }
1943                 }
1944                 if (freed == 0) {
1945                         mtx_lock(&pv_chunks_mutex);
1946                         goto next_chunk;
1947                 }
1948                 /* Every freed mapping is for a 4 KB page. */
1949                 pmap_resident_count_dec(pmap, freed);
1950                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1951                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1952                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1953                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1954                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
1955                     pc->pc_map[2] == PC_FREE2) {
1956                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1957                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1958                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1959                         /* Entire chunk is free; return it. */
1960                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1961                         dump_drop_page(m_pc->phys_addr);
1962                         mtx_lock(&pv_chunks_mutex);
1963                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1964                         break;
1965                 }
1966                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1967                 mtx_lock(&pv_chunks_mutex);
1968                 /* One freed pv entry in locked_pmap is sufficient. */
1969                 if (pmap == locked_pmap)
1970                         break;
1971
1972 next_chunk:
1973                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1974                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1975                 if (active_reclaims == 1 && pmap != NULL) {
1976                         /*
1977                          * Rotate the pv chunks list so that we do not
1978                          * scan the same pv chunks that could not be
1979                          * freed (because they contained a wired
1980                          * and/or superpage mapping) on every
1981                          * invocation of reclaim_pv_chunk().
1982                          */
1983                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1984                                 MPASS(pc->pc_pmap != NULL);
1985                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1986                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1987                         }
1988                 }
1989         }
1990         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1991         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1992         active_reclaims--;
1993         mtx_unlock(&pv_chunks_mutex);
1994         if (pmap != NULL && pmap != locked_pmap)
1995                 PMAP_UNLOCK(pmap);
1996         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1997                 m_pc = SLIST_FIRST(&free);
1998                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1999                 /* Recycle a freed page table page. */
2000                 m_pc->wire_count = 1;
2001                 vm_wire_add(1);
2002         }
2003         vm_page_free_pages_toq(&free, false);
2004         return (m_pc);
2005 }
2006
2007 /*
2008  * free the pv_entry back to the free list
2009  */
2010 static void
2011 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2012 {
2013         struct pv_chunk *pc;
2014         int idx, field, bit;
2015
2016         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2017         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2018         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2019         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2020         pc = pv_to_chunk(pv);
2021         idx = pv - &pc->pc_pventry[0];
2022         field = idx / 64;
2023         bit = idx % 64;
2024         pc->pc_map[field] |= 1ul << bit;
2025         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2026             pc->pc_map[2] != PC_FREE2) {
2027                 /* 98% of the time, pc is already at the head of the list. */
2028                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2029                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2030                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2031                 }
2032                 return;
2033         }
2034         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2035         free_pv_chunk(pc);
2036 }
2037
2038 static void
2039 free_pv_chunk(struct pv_chunk *pc)
2040 {
2041         vm_page_t m;
2042
2043         mtx_lock(&pv_chunks_mutex);
2044         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2045         mtx_unlock(&pv_chunks_mutex);
2046         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2047         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2048         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2049         /* entire chunk is free, return it */
2050         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2051         dump_drop_page(m->phys_addr);
2052         vm_page_unwire_noq(m);
2053         vm_page_free(m);
2054 }
2055
2056 /*
2057  * Returns a new PV entry, allocating a new PV chunk from the system when
2058  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2059  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2060  * returned.
2061  *
2062  * The given PV list lock may be released.
2063  */
2064 static pv_entry_t
2065 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2066 {
2067         int bit, field;
2068         pv_entry_t pv;
2069         struct pv_chunk *pc;
2070         vm_page_t m;
2071
2072         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2073         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2074 retry:
2075         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2076         if (pc != NULL) {
2077                 for (field = 0; field < _NPCM; field++) {
2078                         if (pc->pc_map[field]) {
2079                                 bit = ffsl(pc->pc_map[field]) - 1;
2080                                 break;
2081                         }
2082                 }
2083                 if (field < _NPCM) {
2084                         pv = &pc->pc_pventry[field * 64 + bit];
2085                         pc->pc_map[field] &= ~(1ul << bit);
2086                         /* If this was the last item, move it to tail */
2087                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2088                             pc->pc_map[2] == 0) {
2089                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2090                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2091                                     pc_list);
2092                         }
2093                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
2094                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2095                         return (pv);
2096                 }
2097         }
2098         /* No free items, allocate another chunk */
2099         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2100             VM_ALLOC_WIRED);
2101         if (m == NULL) {
2102                 if (lockp == NULL) {
2103                         PV_STAT(pc_chunk_tryfail++);
2104                         return (NULL);
2105                 }
2106                 m = reclaim_pv_chunk(pmap, lockp);
2107                 if (m == NULL)
2108                         goto retry;
2109         }
2110         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2111         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2112         dump_add_page(m->phys_addr);
2113         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2114         pc->pc_pmap = pmap;
2115         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
2116         pc->pc_map[1] = PC_FREE1;
2117         pc->pc_map[2] = PC_FREE2;
2118         mtx_lock(&pv_chunks_mutex);
2119         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2120         mtx_unlock(&pv_chunks_mutex);
2121         pv = &pc->pc_pventry[0];
2122         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2123         PV_STAT(atomic_add_long(&pv_entry_count, 1));
2124         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2125         return (pv);
2126 }
2127
2128 /*
2129  * Ensure that the number of spare PV entries in the specified pmap meets or
2130  * exceeds the given count, "needed".
2131  *
2132  * The given PV list lock may be released.
2133  */
2134 static void
2135 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2136 {
2137         struct pch new_tail;
2138         struct pv_chunk *pc;
2139         vm_page_t m;
2140         int avail, free;
2141         bool reclaimed;
2142
2143         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2144         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2145
2146         /*
2147          * Newly allocated PV chunks must be stored in a private list until
2148          * the required number of PV chunks have been allocated.  Otherwise,
2149          * reclaim_pv_chunk() could recycle one of these chunks.  In
2150          * contrast, these chunks must be added to the pmap upon allocation.
2151          */
2152         TAILQ_INIT(&new_tail);
2153 retry:
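             /*
              * Count the free pv entries already available in this pmap's
              * chunks; additional chunks are allocated below only if the
              * total falls short of "needed".
              */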
2154         avail = 0;
2155         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2156                 bit_count((bitstr_t *)pc->pc_map, 0,
2157                     sizeof(pc->pc_map) * NBBY, &free);
2158                 if (free == 0)
2159                         break;
2160                 avail += free;
2161                 if (avail >= needed)
2162                         break;
2163         }
2164         for (reclaimed = false; avail < needed; avail += _NPCPV) {
2165                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
2166                     VM_ALLOC_WIRED);
2167                 if (m == NULL) {
2168                         m = reclaim_pv_chunk(pmap, lockp);
2169                         if (m == NULL)
2170                                 goto retry;
2171                         reclaimed = true;
2172                 }
2173                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2174                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2175                 dump_add_page(m->phys_addr);
2176                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2177                 pc->pc_pmap = pmap;
2178                 pc->pc_map[0] = PC_FREE0;
2179                 pc->pc_map[1] = PC_FREE1;
2180                 pc->pc_map[2] = PC_FREE2;
2181                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2182                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2183                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2184
2185                 /*
2186                  * The reclaim might have freed a chunk from the current pmap.
2187                  * If that chunk contained available entries, we need to
2188                  * re-count the number of available entries.
2189                  */
2190                 if (reclaimed)
2191                         goto retry;
2192         }
2193         if (!TAILQ_EMPTY(&new_tail)) {
2194                 mtx_lock(&pv_chunks_mutex);
2195                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2196                 mtx_unlock(&pv_chunks_mutex);
2197         }
2198 }
2199
2200 /*
2201  * First find and then remove the pv entry for the specified pmap and virtual
2202  * address from the specified pv list.  Returns the pv entry if found and NULL
2203  * otherwise.  This operation can be performed on pv lists for either 4KB or
2204  * 2MB page mappings.
2205  */
2206 static __inline pv_entry_t
2207 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2208 {
2209         pv_entry_t pv;
2210
2211         TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2212                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2213                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2214                         pvh->pv_gen++;
2215                         break;
2216                 }
2217         }
2218         return (pv);
2219 }
2220
2221 /*
2222  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2223  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2224  * entries for each of the 4KB page mappings.
2225  */
2226 static void
2227 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2228     struct rwlock **lockp)
2229 {
2230         struct md_page *pvh;
2231         struct pv_chunk *pc;
2232         pv_entry_t pv;
2233         vm_offset_t va_last;
2234         vm_page_t m;
2235         int bit, field;
2236
2237         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2238         KASSERT((pa & L2_OFFSET) == 0,
2239             ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
2240         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2241
2242         /*
2243          * Transfer the 2mpage's pv entry for this mapping to the first
2244          * page's pv list.  Once this transfer begins, the pv list lock
2245          * must not be released until the last pv entry is reinstantiated.
2246          */
2247         pvh = pa_to_pvh(pa);
2248         va = va & ~L2_OFFSET;
2249         pv = pmap_pvh_remove(pvh, pmap, va);
2250         KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2251         m = PHYS_TO_VM_PAGE(pa);
2252         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2253         m->md.pv_gen++;
2254         /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
2255         PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2256         va_last = va + L2_SIZE - PAGE_SIZE;
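             /*
              * The pv entries consumed below come from spares the caller has
              * already reserved; walk the pmap's chunk list claiming free
              * slots until all Ln_ENTRIES - 1 entries for the 4KB pages are
              * in place.
              */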
2257         for (;;) {
2258                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2259                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2260                     pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2261                 for (field = 0; field < _NPCM; field++) {
2262                         while (pc->pc_map[field]) {
2263                                 bit = ffsl(pc->pc_map[field]) - 1;
2264                                 pc->pc_map[field] &= ~(1ul << bit);
2265                                 pv = &pc->pc_pventry[field * 64 + bit];
2266                                 va += PAGE_SIZE;
2267                                 pv->pv_va = va;
2268                                 m++;
2269                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2270                             ("pmap_pv_demote_l2: page %p is not managed", m));
2271                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2272                                 m->md.pv_gen++;
2273                                 if (va == va_last)
2274                                         goto out;
2275                         }
2276                 }
2277                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2278                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2279         }
2280 out:
2281         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2282                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2283                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2284         }
2285         PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2286         PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
2287 }
2288
2289 /*
2290  * First find and then destroy the pv entry for the specified pmap and virtual
2291  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2292  * page mappings.
2293  */
2294 static void
2295 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2296 {
2297         pv_entry_t pv;
2298
2299         pv = pmap_pvh_remove(pvh, pmap, va);
2300         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2301         free_pv_entry(pmap, pv);
2302 }
2303
2304 /*
2305  * Conditionally create the PV entry for a 4KB page mapping if the required
2306  * memory can be allocated without resorting to reclamation.
2307  */
2308 static boolean_t
2309 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2310     struct rwlock **lockp)
2311 {
2312         pv_entry_t pv;
2313
2314         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2315         /* Pass NULL instead of the lock pointer to disable reclamation. */
2316         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2317                 pv->pv_va = va;
2318                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2319                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2320                 m->md.pv_gen++;
2321                 return (TRUE);
2322         } else
2323                 return (FALSE);
2324 }
2325
2326 /*
2327  * pmap_remove_l2: unmap a level 2 (2MB) superpage from a process' address space
2328  */
2329 static int
2330 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2331     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2332 {
2333         struct md_page *pvh;
2334         pt_entry_t old_l2;
2335         vm_offset_t eva, va;
2336         vm_page_t m, ml3;
2337
2338         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2339         KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2340         old_l2 = pmap_load_clear(l2);
2341         pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2342         if (old_l2 & ATTR_SW_WIRED)
2343                 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2344         pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2345         if (old_l2 & ATTR_SW_MANAGED) {
2346                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
2347                 pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
2348                 pmap_pvh_free(pvh, pmap, sva);
2349                 eva = sva + L2_SIZE;
2350                 for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
2351                     va < eva; va += PAGE_SIZE, m++) {
2352                         if (pmap_page_dirty(old_l2))
2353                                 vm_page_dirty(m);
2354                         if (old_l2 & ATTR_AF)
2355                                 vm_page_aflag_set(m, PGA_REFERENCED);
2356                         if (TAILQ_EMPTY(&m->md.pv_list) &&
2357                             TAILQ_EMPTY(&pvh->pv_list))
2358                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
2359                 }
2360         }
2361         KASSERT(pmap != kernel_pmap,
2362             ("Attempting to remove an l2 kernel page"));
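             /*
              * If a level 3 page table page is cached in pm_root for this
              * 2MB range (e.g., saved when the mapping was created by
              * promotion), remove and release it as well.
              */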
2363         ml3 = pmap_remove_pt_page(pmap, sva);
2364         if (ml3 != NULL) {
2365                 pmap_resident_count_dec(pmap, 1);
2366                 KASSERT(ml3->wire_count == NL3PG,
2367                     ("pmap_remove_pages: l3 page wire count error"));
2368                 ml3->wire_count = 1;
2369                 vm_page_unwire_noq(ml3);
2370                 pmap_add_delayed_free_list(ml3, free, FALSE);
2371         }
2372         return (pmap_unuse_pt(pmap, sva, l1e, free));
2373 }
2374
2375 /*
2376  * pmap_remove_l3: unmap a single 4KB page from a process' address space
2377  */
2378 static int
2379 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2380     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2381 {
2382         struct md_page *pvh;
2383         pt_entry_t old_l3;
2384         vm_page_t m;
2385
2386         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2387         old_l3 = pmap_load_clear(l3);
2388         pmap_invalidate_page(pmap, va);
2389         if (old_l3 & ATTR_SW_WIRED)
2390                 pmap->pm_stats.wired_count -= 1;
2391         pmap_resident_count_dec(pmap, 1);
2392         if (old_l3 & ATTR_SW_MANAGED) {
2393                 m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2394                 if (pmap_page_dirty(old_l3))
2395                         vm_page_dirty(m);
2396                 if (old_l3 & ATTR_AF)
2397                         vm_page_aflag_set(m, PGA_REFERENCED);
2398                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2399                 pmap_pvh_free(&m->md, pmap, va);
2400                 if (TAILQ_EMPTY(&m->md.pv_list) &&
2401                     (m->flags & PG_FICTITIOUS) == 0) {
2402                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2403                         if (TAILQ_EMPTY(&pvh->pv_list))
2404                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
2405                 }
2406         }
2407         return (pmap_unuse_pt(pmap, va, l2e, free));
2408 }
2409
2410 /*
2411  *      Remove the given range of addresses from the specified map.
2412  *
2413  *      It is assumed that the start and end are properly
2414  *      rounded to the page size.
2415  */
2416 void
2417 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2418 {
2419         struct rwlock *lock;
2420         vm_offset_t va, va_next;
2421         pd_entry_t *l0, *l1, *l2;
2422         pt_entry_t l3_paddr, *l3;
2423         struct spglist free;
2424
2425         /*
2426          * Perform an unsynchronized read.  This is, however, safe.
2427          */
2428         if (pmap->pm_stats.resident_count == 0)
2429                 return;
2430
2431         SLIST_INIT(&free);
2432
2433         PMAP_LOCK(pmap);
2434
2435         lock = NULL;
2436         for (; sva < eva; sva = va_next) {
2437
2438                 if (pmap->pm_stats.resident_count == 0)
2439                         break;
2440
2441                 l0 = pmap_l0(pmap, sva);
2442                 if (pmap_load(l0) == 0) {
2443                         va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2444                         if (va_next < sva)
2445                                 va_next = eva;
2446                         continue;
2447                 }
2448
2449                 l1 = pmap_l0_to_l1(l0, sva);
2450                 if (pmap_load(l1) == 0) {
2451                         va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2452                         if (va_next < sva)
2453                                 va_next = eva;
2454                         continue;
2455                 }
2456
2457                 /*
2458                  * Calculate index for next page table.
2459                  */
2460                 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2461                 if (va_next < sva)
2462                         va_next = eva;
2463
2464                 l2 = pmap_l1_to_l2(l1, sva);
2465                 if (l2 == NULL)
2466                         continue;
2467
2468                 l3_paddr = pmap_load(l2);
2469
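                     /*
                      * A 2MB block mapping is removed in a single operation
                      * when the range covers it entirely; otherwise it is
                      * demoted so that the individual 4KB mappings can be
                      * removed below.
                      */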
2470                 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
2471                         if (sva + L2_SIZE == va_next && eva >= va_next) {
2472                                 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
2473                                     &free, &lock);
2474                                 continue;
2475                         } else if (pmap_demote_l2_locked(pmap, l2,
2476                             sva &~L2_OFFSET, &lock) == NULL)
2477                                 continue;
2478                         l3_paddr = pmap_load(l2);
2479                 }
2480
2481                 /*
2482                  * Weed out invalid mappings.
2483                  */
2484                 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
2485                         continue;
2486
2487                 /*
2488                  * Limit our scan to either the end of the va represented
2489                  * by the current page table page, or to the end of the
2490                  * range being removed.
2491                  */
2492                 if (va_next > eva)
2493                         va_next = eva;
2494
2495                 va = va_next;
2496                 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2497                     sva += L3_SIZE) {
2498                         if (l3 == NULL)
2499                                 panic("l3 == NULL");
2500                         if (pmap_load(l3) == 0) {
2501                                 if (va != va_next) {
2502                                         pmap_invalidate_range(pmap, va, sva);
2503                                         va = va_next;
2504                                 }
2505                                 continue;
2506                         }
2507                         if (va == va_next)
2508                                 va = sva;
2509                         if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free,
2510                             &lock)) {
2511                                 sva += L3_SIZE;
2512                                 break;
2513                         }
2514                 }
2515                 if (va != va_next)
2516                         pmap_invalidate_range(pmap, va, sva);
2517         }
2518         if (lock != NULL)
2519                 rw_wunlock(lock);
2520         PMAP_UNLOCK(pmap);
2521         vm_page_free_pages_toq(&free, false);
2522 }
2523
2524 /*
2525  *      Routine:        pmap_remove_all
2526  *      Function:
2527  *              Removes this physical page from
2528  *              all physical maps in which it resides.
2529  *              Reflects back modify bits to the pager.
2530  *
2531  *      Notes:
2532  *              Original versions of this routine were very
2533  *              inefficient because they iteratively called
2534  *              pmap_remove (slow...)
2535  */
2536
2537 void
2538 pmap_remove_all(vm_page_t m)
2539 {
2540         struct md_page *pvh;
2541         pv_entry_t pv;
2542         pmap_t pmap;
2543         struct rwlock *lock;
2544         pd_entry_t *pde, tpde;
2545         pt_entry_t *pte, tpte;
2546         vm_offset_t va;
2547         struct spglist free;
2548         int lvl, pvh_gen, md_gen;
2549
2550         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2551             ("pmap_remove_all: page %p is not managed", m));
2552         SLIST_INIT(&free);
2553         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2554         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2555             pa_to_pvh(VM_PAGE_TO_PHYS(m));
2556 retry:
2557         rw_wlock(lock);
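             /*
              * First demote any 2MB mappings of the page so that only 4KB
              * mappings remain to be removed in the second loop below.
              */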
2558         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2559                 pmap = PV_PMAP(pv);
2560                 if (!PMAP_TRYLOCK(pmap)) {
2561                         pvh_gen = pvh->pv_gen;
2562                         rw_wunlock(lock);
2563                         PMAP_LOCK(pmap);
2564                         rw_wlock(lock);
2565                         if (pvh_gen != pvh->pv_gen) {
2566                                 rw_wunlock(lock);
2567                                 PMAP_UNLOCK(pmap);
2568                                 goto retry;
2569                         }
2570                 }
2571                 va = pv->pv_va;
2572                 pte = pmap_pte(pmap, va, &lvl);
2573                 KASSERT(pte != NULL,
2574                     ("pmap_remove_all: no page table entry found"));
2575                 KASSERT(lvl == 2,
2576                     ("pmap_remove_all: invalid pte level %d", lvl));
2577
2578                 pmap_demote_l2_locked(pmap, pte, va, &lock);
2579                 PMAP_UNLOCK(pmap);
2580         }
2581         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2582                 pmap = PV_PMAP(pv);
2583                 if (!PMAP_TRYLOCK(pmap)) {
2584                         pvh_gen = pvh->pv_gen;
2585                         md_gen = m->md.pv_gen;
2586                         rw_wunlock(lock);
2587                         PMAP_LOCK(pmap);
2588                         rw_wlock(lock);
2589                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2590                                 rw_wunlock(lock);
2591                                 PMAP_UNLOCK(pmap);
2592                                 goto retry;
2593                         }
2594                 }
2595                 pmap_resident_count_dec(pmap, 1);
2596
2597                 pde = pmap_pde(pmap, pv->pv_va, &lvl);
2598                 KASSERT(pde != NULL,
2599                     ("pmap_remove_all: no page directory entry found"));
2600                 KASSERT(lvl == 2,
2601                     ("pmap_remove_all: invalid pde level %d", lvl));
2602                 tpde = pmap_load(pde);
2603
2604                 pte = pmap_l2_to_l3(pde, pv->pv_va);
2605                 tpte = pmap_load(pte);
2606                 pmap_load_clear(pte);
2607                 pmap_invalidate_page(pmap, pv->pv_va);
2608                 if (tpte & ATTR_SW_WIRED)
2609                         pmap->pm_stats.wired_count--;
2610                 if ((tpte & ATTR_AF) != 0)
2611                         vm_page_aflag_set(m, PGA_REFERENCED);
2612
2613                 /*
2614                  * Update the vm_page_t clean and reference bits.
2615                  */
2616                 if (pmap_page_dirty(tpte))
2617                         vm_page_dirty(m);
2618                 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
2619                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2620                 m->md.pv_gen++;
2621                 free_pv_entry(pmap, pv);
2622                 PMAP_UNLOCK(pmap);
2623         }
2624         vm_page_aflag_clear(m, PGA_WRITEABLE);
2625         rw_wunlock(lock);
2626         vm_page_free_pages_toq(&free, false);
2627 }
2628
2629 /*
2630  *      Set the physical protection on the
2631  *      specified range of this map as requested.
2632  */
2633 void
2634 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2635 {
2636         vm_offset_t va, va_next;
2637         pd_entry_t *l0, *l1, *l2;
2638         pt_entry_t *l3p, l3, nbits;
2639
2640         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
2641         if (prot == VM_PROT_NONE) {
2642                 pmap_remove(pmap, sva, eva);
2643                 return;
2644         }
2645
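             /*
              * This routine only ever restricts permissions, so if neither
              * write nor execute access is being removed there is nothing
              * to do.
              */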
2646         if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2647             (VM_PROT_WRITE | VM_PROT_EXECUTE))
2648                 return;
2649
2650         PMAP_LOCK(pmap);
2651         for (; sva < eva; sva = va_next) {
2652
2653                 l0 = pmap_l0(pmap, sva);
2654                 if (pmap_load(l0) == 0) {
2655                         va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2656                         if (va_next < sva)
2657                                 va_next = eva;
2658                         continue;
2659                 }
2660
2661                 l1 = pmap_l0_to_l1(l0, sva);
2662                 if (pmap_load(l1) == 0) {
2663                         va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2664                         if (va_next < sva)
2665                                 va_next = eva;
2666                         continue;
2667                 }
2668
2669                 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2670                 if (va_next < sva)
2671                         va_next = eva;
2672
2673                 l2 = pmap_l1_to_l2(l1, sva);
2674                 if (pmap_load(l2) == 0)
2675                         continue;
2676
2677                 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
2678                         l3p = pmap_demote_l2(pmap, l2, sva);
2679                         if (l3p == NULL)
2680                                 continue;
2681                 }
2682                 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
2683                     ("pmap_protect: Invalid L2 entry after demotion"));
2684
2685                 if (va_next > eva)
2686                         va_next = eva;
2687
2688                 va = va_next;
2689                 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
2690                     sva += L3_SIZE) {
2691                         l3 = pmap_load(l3p);
2692                         if (!pmap_l3_valid(l3))
2693                                 continue;
2694
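                             /*
                              * Accumulate the attribute bits to set: removing
                              * write access adds the read-only AP attribute
                              * and removing execute access adds the XN bit.
                              */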
2695                         nbits = 0;
2696                         if ((prot & VM_PROT_WRITE) == 0) {
2697                                 if ((l3 & ATTR_SW_MANAGED) &&
2698                                     pmap_page_dirty(l3)) {
2699                                         vm_page_dirty(PHYS_TO_VM_PAGE(l3 &
2700                                             ~ATTR_MASK));
2701                                 }
2702                                 nbits |= ATTR_AP(ATTR_AP_RO);
2703                         }
2704                         if ((prot & VM_PROT_EXECUTE) == 0)
2705                                 nbits |= ATTR_XN;
2706
2707                         pmap_set(l3p, nbits);
2708                         /* XXX: Use pmap_invalidate_range */
2709                         pmap_invalidate_page(pmap, sva);
2710                 }
2711         }
2712         PMAP_UNLOCK(pmap);
2713 }
2714
2715 /*
2716  * Inserts the specified page table page into the specified pmap's collection
2717  * of idle page table pages.  Each of a pmap's page table pages is responsible
2718  * for mapping a distinct range of virtual addresses.  The pmap's collection is
2719  * ordered by this virtual address range.
2720  */
2721 static __inline int
2722 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
2723 {
2724
2725         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2726         return (vm_radix_insert(&pmap->pm_root, mpte));
2727 }
2728
2729 /*
2730  * Removes the page table page mapping the specified virtual address from the
2731  * specified pmap's collection of idle page table pages, and returns it.
2732  * Otherwise, returns NULL if there is no page table page corresponding to the
2733  * specified virtual address.
2734  */
2735 static __inline vm_page_t
2736 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
2737 {
2738
2739         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2740         return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
2741 }
2742
2743 /*
2744  * Performs a break-before-make update of a pmap entry. This is needed when
2745  * either promoting or demoting pages to ensure the TLB doesn't get into an
2746  * inconsistent state.
2747  */
2748 static void
2749 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
2750     vm_offset_t va, vm_size_t size)
2751 {
2752         register_t intr;
2753
2754         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2755
2756         /*
2757          * Ensure we don't get switched out with the page table in an
2758          * inconsistent state. We also need to ensure no interrupts fire
2759          * as they may make use of an address we are about to invalidate.
2760          */
2761         intr = intr_disable();
2762         critical_enter();
2763
2764         /* Clear the old mapping */
2765         pmap_load_clear(pte);
2766         pmap_invalidate_range_nopin(pmap, va, va + size);
2767
2768         /* Create the new mapping */
2769         pmap_load_store(pte, newpte);
2770
2771         critical_exit();
2772         intr_restore(intr);
2773 }
2774
2775 #if VM_NRESERVLEVEL > 0
2776 /*
2777  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
2778  * replace the many pv entries for the 4KB page mappings by a single pv entry
2779  * for the 2MB page mapping.
2780  */
2781 static void
2782 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2783     struct rwlock **lockp)
2784 {
2785         struct md_page *pvh;
2786         pv_entry_t pv;
2787         vm_offset_t va_last;
2788         vm_page_t m;
2789
2790         KASSERT((pa & L2_OFFSET) == 0,
2791             ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
2792         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2793
2794         /*
2795          * Transfer the first page's pv entry for this mapping to the 2mpage's
2796          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
2797          * a transfer avoids the possibility that get_pv_entry() calls
2798          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
2799          * mappings that is being promoted.
2800          */
2801         m = PHYS_TO_VM_PAGE(pa);
2802         va = va & ~L2_OFFSET;
2803         pv = pmap_pvh_remove(&m->md, pmap, va);
2804         KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
2805         pvh = pa_to_pvh(pa);
2806         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2807         pvh->pv_gen++;
2808         /* Free the remaining NPTEPG - 1 pv entries. */
2809         va_last = va + L2_SIZE - PAGE_SIZE;
2810         do {
2811                 m++;
2812                 va += PAGE_SIZE;
2813                 pmap_pvh_free(&m->md, pmap, va);
2814         } while (va < va_last);
2815 }
2816
2817 /*
2818  * Tries to promote the 512, contiguous 4KB page mappings that are within a
2819  * single level 2 table entry to a single 2MB page mapping.  For promotion
2820  * to occur, two conditions must be met: (1) the 4KB page mappings must map
2821  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2822  * identical characteristics.
2823  */
2824 static void
2825 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2826     struct rwlock **lockp)
2827 {
2828         pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
2829         vm_page_t mpte;
2830         vm_offset_t sva;
2831
2832         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2833
2834         sva = va & ~L2_OFFSET;
2835         firstl3 = pmap_l2_to_l3(l2, sva);
2836         newl2 = pmap_load(firstl3);
2837
2838         /* Check that the alignment is valid */
2839         if (((newl2 & ~ATTR_MASK) & L2_OFFSET) != 0) {
2840                 atomic_add_long(&pmap_l2_p_failures, 1);
2841                 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2842                     " in pmap %p", va, pmap);
2843                 return;
2844         }
2845
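             /*
              * Walk the remaining L3 entries backwards and verify that each
              * one maps the expected, physically contiguous address with the
              * same attributes as the first entry.
              */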
2846         pa = newl2 + L2_SIZE - PAGE_SIZE;
2847         for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
2848                 oldl3 = pmap_load(l3);
2849                 if (oldl3 != pa) {
2850                         atomic_add_long(&pmap_l2_p_failures, 1);
2851                         CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
2852                             " in pmap %p", va, pmap);
2853                         return;
2854                 }
2855                 pa -= PAGE_SIZE;
2856         }
2857
2858         /*
2859          * Save the page table page in its current state until the L2
2860          * mapping the superpage is demoted by pmap_demote_l2() or
2861          * destroyed by pmap_remove_l3().
2862          */
2863         mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
2864         KASSERT(mpte >= vm_page_array &&
2865             mpte < &vm_page_array[vm_page_array_size],
2866             ("pmap_promote_l2: page table page is out of range"));
2867         KASSERT(mpte->pindex == pmap_l2_pindex(va),
2868             ("pmap_promote_l2: page table page's pindex is wrong"));
2869         if (pmap_insert_pt_page(pmap, mpte)) {
2870                 atomic_add_long(&pmap_l2_p_failures, 1);
2871                 CTR2(KTR_PMAP,
2872                     "pmap_promote_l2: failure for va %#lx in pmap %p", va,
2873                     pmap);
2874                 return;
2875         }
2876
2877         if ((newl2 & ATTR_SW_MANAGED) != 0)
2878                 pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
2879
2880         newl2 &= ~ATTR_DESCR_MASK;
2881         newl2 |= L2_BLOCK;
2882
2883         pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
2884
2885         atomic_add_long(&pmap_l2_promotions, 1);
2886         CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
2887             pmap);
2888 }
2889 #endif /* VM_NRESERVLEVEL > 0 */
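/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the two promotion conditions described above, restated as a standalone
 * predicate.  The name pmap_l2_promotable_example() is hypothetical;
 * pmap_promote_l2() performs the equivalent walk inline before it rewrites
 * the L2 entry.
 */
static bool
pmap_l2_promotable_example(pt_entry_t *firstl3)
{
        pt_entry_t attrs, pa, *l3;

        /* The first 4KB mapping must be 2MB-aligned in physical memory. */
        attrs = pmap_load(firstl3) & ATTR_MASK;
        pa = pmap_load(firstl3) & ~ATTR_MASK;
        if ((pa & L2_OFFSET) != 0)
                return (false);

        /* The other 511 mappings must be contiguous with identical attributes. */
        for (l3 = firstl3 + 1; l3 < firstl3 + NL3PG; l3++) {
                pa += PAGE_SIZE;
                if ((pmap_load(l3) & ~ATTR_MASK) != pa ||
                    (pmap_load(l3) & ATTR_MASK) != attrs)
                        return (false);
        }
        return (true);
}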
2890
2891 /*
2892  *      Insert the given physical page (p) at
2893  *      the specified virtual address (v) in the
2894  *      target physical map with the protection requested.
2895  *
2896  *      If specified, the page will be wired down, meaning
2897  *      that the related pte can not be reclaimed.
2898  *
2899  *      NB:  This is the only routine which MAY NOT lazy-evaluate
2900  *      or lose information.  That is, this routine must actually
2901  *      insert this page into the given map NOW.
2902  */
2903 int
2904 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2905     u_int flags, int8_t psind __unused)
2906 {
2907         struct rwlock *lock;
2908         pd_entry_t *pde;
2909         pt_entry_t new_l3, orig_l3;
2910         pt_entry_t *l2, *l3;
2911         pv_entry_t pv;
2912         vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa;
2913         vm_page_t mpte, om, l1_m, l2_m, l3_m;
2914         boolean_t nosleep;
2915         int lvl;
2916
2917         va = trunc_page(va);
2918         if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
2919                 VM_OBJECT_ASSERT_LOCKED(m->object);
2920         pa = VM_PAGE_TO_PHYS(m);
2921         new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
2922             L3_PAGE);
2923         if ((prot & VM_PROT_WRITE) == 0)
2924                 new_l3 |= ATTR_AP(ATTR_AP_RO);
2925         if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
2926                 new_l3 |= ATTR_XN;
2927         if ((flags & PMAP_ENTER_WIRED) != 0)
2928                 new_l3 |= ATTR_SW_WIRED;
2929         if (va < VM_MAXUSER_ADDRESS)
2930                 new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
2931         if ((m->oflags & VPO_UNMANAGED) == 0)
2932                 new_l3 |= ATTR_SW_MANAGED;
2933
2934         CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2935
2936         mpte = NULL;
2937
2938         lock = NULL;
2939         PMAP_LOCK(pmap);
2940
2941         pde = pmap_pde(pmap, va, &lvl);
2942         if (pde != NULL && lvl == 1) {
2943                 l2 = pmap_l1_to_l2(pde, va);
2944                 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
2945                     (l3 = pmap_demote_l2_locked(pmap, l2, va & ~L2_OFFSET,
2946                     &lock)) != NULL) {
2947                         l3 = &l3[pmap_l3_index(va)];
2948                         if (va < VM_MAXUSER_ADDRESS) {
2949                                 mpte = PHYS_TO_VM_PAGE(
2950                                     pmap_load(l2) & ~ATTR_MASK);
2951                                 mpte->wire_count++;
2952                         }
2953                         goto havel3;
2954                 }
2955         }
2956
2957         if (va < VM_MAXUSER_ADDRESS) {
2958                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2959                 mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2960                 if (mpte == NULL && nosleep) {
2961                         CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2962                         if (lock != NULL)
2963                                 rw_wunlock(lock);
2964                         PMAP_UNLOCK(pmap);
2965                         return (KERN_RESOURCE_SHORTAGE);
2966                 }
2967                 pde = pmap_pde(pmap, va, &lvl);
2968                 KASSERT(pde != NULL,
2969                     ("pmap_enter: Invalid page entry, va: 0x%lx", va));
2970                 KASSERT(lvl == 2,
2971                     ("pmap_enter: Invalid level %d", lvl));
2972         } else {
2973                 /*
2974                  * If we get a level 2 pde it must point to a level 3 entry
2975                  * otherwise we will need to create the intermediate tables
2976                  */
2977                 if (lvl < 2) {
2978                         switch (lvl) {
2979                         default:
2980                         case -1:
2981                                 /* Get the l0 pde to update */
2982                                 pde = pmap_l0(pmap, va);
2983                                 KASSERT(pde != NULL, ("..."));
2984
2985                                 l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2986                                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2987                                     VM_ALLOC_ZERO);
2988                                 if (l1_m == NULL)
2989                                         panic("pmap_enter: l1 pte_m == NULL");
2990                                 if ((l1_m->flags & PG_ZERO) == 0)
2991                                         pmap_zero_page(l1_m);
2992
2993                                 l1_pa = VM_PAGE_TO_PHYS(l1_m);
2994                                 pmap_load_store(pde, l1_pa | L0_TABLE);
2995                                 /* FALLTHROUGH */
2996                         case 0:
2997                                 /* Get the l1 pde to update */
2998                                 pde = pmap_l1_to_l2(pde, va);
2999                                 KASSERT(pde != NULL, ("..."));
3000
3001                                 l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
3002                                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3003                                     VM_ALLOC_ZERO);
3004                                 if (l2_m == NULL)
3005                                         panic("pmap_enter: l2 pte_m == NULL");
3006                                 if ((l2_m->flags & PG_ZERO) == 0)
3007                                         pmap_zero_page(l2_m);
3008
3009                                 l2_pa = VM_PAGE_TO_PHYS(l2_m);
3010                                 pmap_load_store(pde, l2_pa | L1_TABLE);
3011                                 /* FALLTHROUGH */
3012                         case 1:
3013                                 /* Get the l2 pde to update */
3014                                 pde = pmap_l1_to_l2(pde, va);
3015                                 KASSERT(pde != NULL, ("..."));
3016
3017                                 l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
3018                                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3019                                     VM_ALLOC_ZERO);
3020                                 if (l3_m == NULL)
3021                                         panic("pmap_enter: l3 pte_m == NULL");
3022                                 if ((l3_m->flags & PG_ZERO) == 0)
3023                                         pmap_zero_page(l3_m);
3024
3025                                 l3_pa = VM_PAGE_TO_PHYS(l3_m);
3026                                 pmap_load_store(pde, l3_pa | L2_TABLE);
3027                                 break;
3028                         }
3029                 }
3030         }
3031         l3 = pmap_l2_to_l3(pde, va);
3032
3033 havel3:
3034         orig_l3 = pmap_load(l3);
3035         opa = orig_l3 & ~ATTR_MASK;
3036         pv = NULL;
3037
3038         /*
3039          * Is the specified virtual address already mapped?
3040          */
3041         if (pmap_l3_valid(orig_l3)) {
3042                 /*
3043                  * Wiring change, just update stats. We don't worry about
3044                  * wiring PT pages as they remain resident as long as there
3045                  * are valid mappings in them. Hence, if a user page is wired,
3046                  * the PT page will be also.
3047                  */
3048                 if ((flags & PMAP_ENTER_WIRED) != 0 &&
3049                     (orig_l3 & ATTR_SW_WIRED) == 0)
3050                         pmap->pm_stats.wired_count++;
3051                 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3052                     (orig_l3 & ATTR_SW_WIRED) != 0)
3053                         pmap->pm_stats.wired_count--;
3054
3055                 /*
3056                  * Remove the extra PT page reference.
3057                  */
3058                 if (mpte != NULL) {
3059                         mpte->wire_count--;
3060                         KASSERT(mpte->wire_count > 0,
3061                             ("pmap_enter: missing reference to page table page,"
3062                              " va: 0x%lx", va));
3063                 }
3064
3065                 /*
3066                  * Has the physical page changed?
3067                  */
3068                 if (opa == pa) {
3069                         /*
3070                          * No, might be a protection or wiring change.
3071                          */
3072                         if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3073                                 if ((new_l3 & ATTR_AP(ATTR_AP_RW)) ==
3074                                     ATTR_AP(ATTR_AP_RW)) {
3075                                         vm_page_aflag_set(m, PGA_WRITEABLE);
3076                                 }
3077                         }
3078                         goto validate;
3079                 }
3080
3081                 /*
3082                  * The physical page has changed.
3083                  */
3084                 (void)pmap_load_clear(l3);
3085                 KASSERT((orig_l3 & ~ATTR_MASK) == opa,
3086                     ("pmap_enter: unexpected pa update for %#lx", va));
3087                 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3088                         om = PHYS_TO_VM_PAGE(opa);
3089
3090                         /*
3091                          * The pmap lock is sufficient to synchronize with
3092                          * concurrent calls to pmap_page_test_mappings() and
3093                          * pmap_ts_referenced().
3094                          */
3095                         if (pmap_page_dirty(orig_l3))
3096                                 vm_page_dirty(om);
3097                         if ((orig_l3 & ATTR_AF) != 0)
3098                                 vm_page_aflag_set(om, PGA_REFERENCED);
3099                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3100                         pv = pmap_pvh_remove(&om->md, pmap, va);
3101                         if ((m->oflags & VPO_UNMANAGED) != 0)
3102                                 free_pv_entry(pmap, pv);
3103                         if ((om->aflags & PGA_WRITEABLE) != 0 &&
3104                             TAILQ_EMPTY(&om->md.pv_list) &&
3105                             ((om->flags & PG_FICTITIOUS) != 0 ||
3106                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3107                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
3108                 }
3109                 pmap_invalidate_page(pmap, va);
3110                 orig_l3 = 0;
3111         } else {
3112                 /*
3113                  * Increment the counters.
3114                  */
3115                 if ((new_l3 & ATTR_SW_WIRED) != 0)
3116                         pmap->pm_stats.wired_count++;
3117                 pmap_resident_count_inc(pmap, 1);
3118         }
3119         /*
3120          * Enter on the PV list if part of our managed memory.
3121          */
3122         if ((m->oflags & VPO_UNMANAGED) == 0) {
3123                 if (pv == NULL) {
3124                         pv = get_pv_entry(pmap, &lock);
3125                         pv->pv_va = va;
3126                 }
3127                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3128                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3129                 m->md.pv_gen++;
3130                 if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
3131                         vm_page_aflag_set(m, PGA_WRITEABLE);
3132         }
3133
3134 validate:
3135         /*
3136          * Sync the icache if the mapping is executable and its memory
3137          * attribute is VM_MEMATTR_WRITE_BACK.  Do it now, before the mapping
3138          * is stored and made valid for hardware table walks; if done later,
3139          * others could access this page before the caches are synced.
3140          * Don't do it for kernel memory, which is mapped executable even if
3141          * it isn't going to hold executable code.  The only time an icache
3142          * sync is needed is after a kernel module is loaded and its
3143          * relocation info is processed, and that is done in
3144          * elf_cpu_load_file().
3145          */
3146         if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
3147             m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
3148             (opa != pa || (orig_l3 & ATTR_XN)))
3149                 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3150
3151         /*
3152          * Update the L3 entry
3153          */
3154         if (pmap_l3_valid(orig_l3)) {
3155                 KASSERT(opa == pa, ("pmap_enter: invalid update"));
3156                 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
3157                         /* same PA, different attributes */
3158                         pmap_load_store(l3, new_l3);
3159                         pmap_invalidate_page(pmap, va);
3160                         if (pmap_page_dirty(orig_l3) &&
3161                             (orig_l3 & ATTR_SW_MANAGED) != 0)
3162                                 vm_page_dirty(m);
3163                 } else {
3164                         /*
3165                          * orig_l3 == new_l3
3166                          * This can happen if multiple threads simultaneously
3167                          * access a not yet mapped page.  This is bad for
3168                          * performance since it can cause a full
3169                          * demotion-NOP-promotion cycle.
3170                          * Other possible reasons are:
3171                          * - the VM and pmap memory layouts have diverged
3172                          * - a TLB flush is missing somewhere and the CPU
3173                          *   doesn't see the actual mapping.
3174                          */
3175                         CTR4(KTR_PMAP, "%s: already mapped page - "
3176                             "pmap %p va %#lx pte 0x%lx",
3177                             __func__, pmap, va, new_l3);
3178                 }
3179         } else {
3180                 /* New mapping */
3181                 pmap_load_store(l3, new_l3);
3182         }
3183
3184 #if VM_NRESERVLEVEL > 0
3185         if (pmap != pmap_kernel() &&
3186             (mpte == NULL || mpte->wire_count == NL3PG) &&
3187             pmap_superpages_enabled() &&
3188             (m->flags & PG_FICTITIOUS) == 0 &&
3189             vm_reserv_level_iffullpop(m) == 0) {
3190                 pmap_promote_l2(pmap, pde, va, &lock);
3191         }
3192 #endif
3193
3194         if (lock != NULL)
3195                 rw_wunlock(lock);
3196         PMAP_UNLOCK(pmap);
3197         return (KERN_SUCCESS);
3198 }
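/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical caller of pmap_enter() that first attempts a non-sleeping
 * wired insertion and retries without PMAP_ENTER_NOSLEEP when page table
 * pages could not be allocated.  The managed page "m" is assumed to be
 * exclusively busied, as the assertion at the top of pmap_enter() requires.
 */
static int
pmap_enter_wired_example(pmap_t pmap, vm_offset_t va, vm_page_t m)
{
        int rv;

        rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
            PMAP_ENTER_WIRED | PMAP_ENTER_NOSLEEP, 0);
        if (rv == KERN_RESOURCE_SHORTAGE)
                rv = pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
                    PMAP_ENTER_WIRED, 0);
        return (rv);
}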
3199
3200 /*
3201  * Maps a sequence of resident pages belonging to the same object.
3202  * The sequence begins with the given page m_start.  This page is
3203  * mapped at the given virtual address start.  Each subsequent page is
3204  * mapped at a virtual address that is offset from start by the same
3205  * amount as the page is offset from m_start within the object.  The
3206  * last page in the sequence is the page with the largest offset from
3207  * m_start that can be mapped at a virtual address less than the given
3208  * virtual address end.  Not every virtual page between start and end
3209  * is mapped; only those for which a resident page exists with the
3210  * corresponding offset from m_start are mapped.
3211  */
3212 void
3213 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3214     vm_page_t m_start, vm_prot_t prot)
3215 {
3216         struct rwlock *lock;
3217         vm_offset_t va;
3218         vm_page_t m, mpte;
3219         vm_pindex_t diff, psize;
3220
3221         VM_OBJECT_ASSERT_LOCKED(m_start->object);
3222
3223         psize = atop(end - start);
3224         mpte = NULL;
3225         m = m_start;
3226         lock = NULL;
3227         PMAP_LOCK(pmap);
3228         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3229                 va = start + ptoa(diff);
3230                 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock);
3231                 m = TAILQ_NEXT(m, listq);
3232         }
3233         if (lock != NULL)
3234                 rw_wunlock(lock);
3235         PMAP_UNLOCK(pmap);
3236 }
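/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the virtual address that pmap_enter_object() chooses for each page,
 * expressed as a hypothetical helper.  A page whose pindex lies "diff"
 * pages past m_start's pindex is mapped "diff" pages past "start".
 */
static vm_offset_t
pmap_enter_object_va_example(vm_offset_t start, vm_page_t m_start, vm_page_t m)
{

        return (start + ptoa(m->pindex - m_start->pindex));
}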
3237
3238 /*
3239  * this code makes some *MAJOR* assumptions:
3240  * 1. Current pmap & pmap exists.
3241  * 2. Not wired.
3242  * 3. Read access.
3243  * 4. No page table pages.
3244  * but is *MUCH* faster than pmap_enter...
3245  */
3246
3247 void
3248 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3249 {
3250         struct rwlock *lock;
3251
3252         lock = NULL;
3253         PMAP_LOCK(pmap);
3254         (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3255         if (lock != NULL)
3256                 rw_wunlock(lock);
3257         PMAP_UNLOCK(pmap);
3258 }
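/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical prefault helper built on pmap_enter_quick().  The entry
 * is read-only and unwired, and a failure to allocate a page table page
 * is silently tolerated, exactly as the assumptions above allow.
 */
static void
pmap_prefault_page_example(pmap_t pmap, vm_offset_t va, vm_page_t m)
{

        if (pmap_is_prefaultable(pmap, va))
                pmap_enter_quick(pmap, va, m, VM_PROT_READ);
}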
3259
3260 static vm_page_t
3261 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3262     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3263 {
3264         struct spglist free;
3265         pd_entry_t *pde;
3266         pt_entry_t *l2, *l3, l3_val;
3267         vm_paddr_t pa;
3268         int lvl;
3269
3270         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3271             (m->oflags & VPO_UNMANAGED) != 0,
3272             ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3273         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3274
3275         CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3276         /*
3277          * In the case that a page table page is not
3278          * resident, we are creating it here.
3279          */
3280         if (va < VM_MAXUSER_ADDRESS) {
3281                 vm_pindex_t l2pindex;
3282
3283                 /*
3284                  * Calculate pagetable page index
3285                  */
3286                 l2pindex = pmap_l2_pindex(va);
3287                 if (mpte && (mpte->pindex == l2pindex)) {
3288                         mpte->wire_count++;
3289                 } else {
3290                         /*
3291                          * Get the l2 entry
3292                          */
3293                         pde = pmap_pde(pmap, va, &lvl);
3294
3295                         /*
3296                          * If the page table page is mapped, we just increment
3297                          * the hold count, and activate it.  Otherwise, we
3298                          * attempt to allocate a page table page.  If this
3299                          * attempt fails, we don't retry.  Instead, we give up.
3300                          */
3301                         if (lvl == 1) {
3302                                 l2 = pmap_l1_to_l2(pde, va);
3303                                 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
3304                                     L2_BLOCK)
3305                                         return (NULL);
3306                         }
3307                         if (lvl == 2 && pmap_load(pde) != 0) {
3308                                 mpte =
3309                                     PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3310                                 mpte->wire_count++;
3311                         } else {
3312                                 /*
3313                                  * Pass NULL instead of the PV list lock
3314                                  * pointer, because we don't intend to sleep.
3315                                  */
3316                                 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3317                                 if (mpte == NULL)
3318                                         return (mpte);
3319                         }
3320                 }
3321                 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3322                 l3 = &l3[pmap_l3_index(va)];
3323         } else {
3324                 mpte = NULL;
3325                 pde = pmap_pde(kernel_pmap, va, &lvl);
3326                 KASSERT(pde != NULL,
3327                     ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
3328                      va));
3329                 KASSERT(lvl == 2,
3330                     ("pmap_enter_quick_locked: Invalid level %d", lvl));
3331                 l3 = pmap_l2_to_l3(pde, va);
3332         }
3333
3334         if (pmap_load(l3) != 0) {
3335                 if (mpte != NULL) {
3336                         mpte->wire_count--;
3337                         mpte = NULL;
3338                 }
3339                 return (mpte);
3340         }
3341
3342         /*
3343          * Enter on the PV list if part of our managed memory.
3344          */
3345         if ((m->oflags & VPO_UNMANAGED) == 0 &&
3346             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3347                 if (mpte != NULL) {
3348                         SLIST_INIT(&free);
3349                         if (pmap_unwire_l3(pmap, va, mpte, &free)) {
3350                                 pmap_invalidate_page(pmap, va);
3351                                 vm_page_free_pages_toq(&free, false);
3352                         }
3353                         mpte = NULL;
3354                 }
3355                 return (mpte);
3356         }
3357
3358         /*
3359          * Increment counters
3360          */
3361         pmap_resident_count_inc(pmap, 1);
3362
3363         pa = VM_PAGE_TO_PHYS(m);
3364         l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
3365             ATTR_AP(ATTR_AP_RO) | L3_PAGE;
3366         if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
3367                 l3_val |= ATTR_XN;
3368         else if (va < VM_MAXUSER_ADDRESS)
3369                 l3_val |= ATTR_PXN;
3370
3371         /*
3372          * Now validate mapping with RO protection
3373          */
3374         if ((m->oflags & VPO_UNMANAGED) == 0)
3375                 l3_val |= ATTR_SW_MANAGED;
3376
3377         /* Sync icache before the mapping is stored to PTE */
3378         if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
3379             m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
3380                 cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
3381
3382         pmap_load_store(l3, l3_val);
3383         pmap_invalidate_page(pmap, va);
3384         return (mpte);
3385 }
3386
3387 /*
3388  * This code maps large physical mmap regions into the
3389  * processor address space.  Note that some shortcuts
3390  * are taken, but the code works.
3391  */
3392 void
3393 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3394     vm_pindex_t pindex, vm_size_t size)
3395 {
3396
3397         VM_OBJECT_ASSERT_WLOCKED(object);
3398         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3399             ("pmap_object_init_pt: non-device object"));
3400 }
3401
3402 /*
3403  *      Clear the wired attribute from the mappings for the specified range of
3404  *      addresses in the given pmap.  Every valid mapping within that range
3405  *      must have the wired attribute set.  In contrast, invalid mappings
3406  *      cannot have the wired attribute set, so they are ignored.
3407  *
3408  *      The wired attribute of the page table entry is not a hardware feature,
3409  *      so there is no need to invalidate any TLB entries.
3410  */
3411 void
3412 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3413 {
3414         vm_offset_t va_next;
3415         pd_entry_t *l0, *l1, *l2;
3416         pt_entry_t *l3;
3417
3418         PMAP_LOCK(pmap);
3419         for (; sva < eva; sva = va_next) {
3420                 l0 = pmap_l0(pmap, sva);
3421                 if (pmap_load(l0) == 0) {
3422                         va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3423                         if (va_next < sva)
3424                                 va_next = eva;
3425                         continue;
3426                 }
3427
3428                 l1 = pmap_l0_to_l1(l0, sva);
3429                 if (pmap_load(l1) == 0) {
3430                         va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3431                         if (va_next < sva)
3432                                 va_next = eva;
3433                         continue;
3434                 }
3435
3436                 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3437                 if (va_next < sva)
3438                         va_next = eva;
3439
3440                 l2 = pmap_l1_to_l2(l1, sva);
3441                 if (pmap_load(l2) == 0)
3442                         continue;
3443
3444                 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3445                         l3 = pmap_demote_l2(pmap, l2, sva);
3446                         if (l3 == NULL)
3447                                 continue;
3448                 }
3449                 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3450                     ("pmap_unwire: Invalid l2 entry after demotion"));
3451
3452                 if (va_next > eva)
3453                         va_next = eva;
3454                 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3455                     sva += L3_SIZE) {
3456                         if (pmap_load(l3) == 0)
3457                                 continue;
3458                         if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
3459                                 panic("pmap_unwire: l3 %#jx is missing "
3460                                     "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
3461
3462                         /*
3463                          * PG_W must be cleared atomically.  Although the pmap
3464                          * lock synchronizes access to PG_W, another processor
3465                          * could be setting PG_M and/or PG_A concurrently.
3466                          */
3467                         atomic_clear_long(l3, ATTR_SW_WIRED);
3468                         pmap->pm_stats.wired_count--;
3469                 }
3470         }
3471         PMAP_UNLOCK(pmap);
3472 }
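/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical caller that wires a single managed page and later unwires
 * it.  pmap_unwire() expects every valid mapping in the range to carry
 * ATTR_SW_WIRED, which the PMAP_ENTER_WIRED insertion guarantees; the page
 * is assumed to be busied as pmap_enter() requires.
 */
static void
pmap_wire_then_unwire_example(pmap_t pmap, vm_offset_t va, vm_page_t m)
{

        (void)pmap_enter(pmap, va, m, VM_PROT_READ, PMAP_ENTER_WIRED, 0);
        /* ... use the wired mapping ... */
        pmap_unwire(pmap, va, va + PAGE_SIZE);
}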
3473
3474 /*
3475  *      Copy the range specified by src_addr/len
3476  *      from the source map to the range dst_addr/len
3477  *      in the destination map.
3478  *
3479  *      This routine is only advisory and need not do anything.
3480  */
3481
3482 void
3483 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3484     vm_offset_t src_addr)
3485 {
3486 }
3487
3488 /*
3489  *      pmap_zero_page zeros the specified hardware page by mapping
3490  *      the page into KVM and using bzero to clear its contents.
3491  */
3492 void
3493 pmap_zero_page(vm_page_t m)
3494 {
3495         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3496
3497         pagezero((void *)va);
3498 }
3499
3500 /*
3501  *      pmap_zero_page_area zeros the specified hardware page by mapping
3502  *      the page into KVM and using bzero to clear its contents.
3503  *
3504  *      off and size may not cover an area beyond a single hardware page.
3505  */
3506 void
3507 pmap_zero_page_area(vm_page_t m, int off, int size)
3508 {
3509         vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3510
3511         if (off == 0 && size == PAGE_SIZE)
3512                 pagezero((void *)va);
3513         else
3514                 bzero((char *)va + off, size);
3515 }
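/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical helper that clears only the tail of a page, e.g. to pad
 * a short buffer.  The offset and size stay within a single hardware
 * page, as pmap_zero_page_area() requires.
 */
static void
pmap_zero_page_tail_example(vm_page_t m, int off)
{

        KASSERT(off >= 0 && off <= PAGE_SIZE,
            ("pmap_zero_page_tail_example: bad offset %d", off));
        pmap_zero_page_area(m, off, PAGE_SIZE - off);
}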
3516
3517 /*
3518  *      pmap_copy_page copies the specified (machine independent)
3519  *      page by mapping the page into virtual memory and using
3520  *      bcopy to copy the page, one machine dependent page at a
3521  *      time.
3522  */
3523 void
3524 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3525 {
3526         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3527         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3528
3529         pagecopy((void *)src, (void *)dst);
3530 }
3531
3532 int unmapped_buf_allowed = 1;
3533
3534 void
3535 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3536     vm_offset_t b_offset, int xfersize)
3537 {
3538         void *a_cp, *b_cp;
3539         vm_page_t m_a, m_b;
3540         vm_paddr_t p_a, p_b;
3541         vm_offset_t a_pg_offset, b_pg_offset;
3542         int cnt;
3543
3544         while (xfersize > 0) {
3545                 a_pg_offset = a_offset & PAGE_MASK;
3546                 m_a = ma[a_offset >> PAGE_SHIFT];
3547                 p_a = m_a->phys_addr;
3548                 b_pg_offset = b_offset & PAGE_MASK;
3549                 m_b = mb[b_offset >> PAGE_SHIFT];
3550                 p_b = m_b->phys_addr;
3551                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3552                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3553                 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3554                         panic("!DMAP a %lx", p_a);
3555                 } else {
3556                         a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3557                 }
3558                 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3559                         panic("!DMAP b %lx", p_b);
3560                 } else {
3561                         b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
3562                 }
3563                 bcopy(a_cp, b_cp, cnt);
3564                 a_offset += cnt;
3565                 b_offset += cnt;
3566                 xfersize -= cnt;
3567         }
3568 }
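/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the per-iteration transfer size used by pmap_copy_pages() above, as a
 * hypothetical helper.  Each chunk is limited by the bytes remaining and
 * by the distance to the next page boundary on either side.
 */
static int
pmap_copy_pages_chunk_example(vm_offset_t a_offset, vm_offset_t b_offset,
    int xfersize)
{
        int cnt;

        cnt = min(xfersize, PAGE_SIZE - (a_offset & PAGE_MASK));
        cnt = min(cnt, PAGE_SIZE - (b_offset & PAGE_MASK));
        return (cnt);
}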
3569
3570 vm_offset_t
3571 pmap_quick_enter_page(vm_page_t m)
3572 {
3573
3574         return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
3575 }
3576
3577 void
3578 pmap_quick_remove_page(vm_offset_t addr)
3579 {
3580 }
3581
3582 /*
3583  * Returns true if the pmap's pv is one of the first
3584  * 16 pvs linked to from this page.  This count may
3585  * be changed upwards or downwards in the future; it
3586  * is only necessary that true be returned for a small
3587  * subset of pmaps for proper page aging.
3588  */
3589 boolean_t
3590 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3591 {
3592         struct md_page *pvh;
3593         struct rwlock *lock;
3594         pv_entry_t pv;
3595         int loops = 0;
3596         boolean_t rv;
3597
3598         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3599             ("pmap_page_exists_quick: page %p is not managed", m));
3600         rv = FALSE;
3601         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3602         rw_rlock(lock);
3603         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3604                 if (PV_PMAP(pv) == pmap) {
3605                         rv = TRUE;
3606                         break;
3607                 }
3608                 loops++;
3609                 if (loops >= 16)
3610                         break;
3611         }
3612         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
3613                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3614                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3615                         if (PV_PMAP(pv) == pmap) {
3616                                 rv = TRUE;
3617                                 break;
3618                         }
3619                         loops++;
3620                         if (loops >= 16)
3621                                 break;
3622                 }
3623         }
3624         rw_runlock(lock);
3625         return (rv);
3626 }
3627
3628 /*
3629  *      pmap_page_wired_mappings:
3630  *
3631  *      Return the number of managed mappings to the given physical page
3632  *      that are wired.
3633  */
3634 int
3635 pmap_page_wired_mappings(vm_page_t m)
3636 {
3637         struct rwlock *lock;
3638         struct md_page *pvh;
3639         pmap_t pmap;
3640         pt_entry_t *pte;
3641         pv_entry_t pv;
3642         int count, lvl, md_gen, pvh_gen;
3643
3644         if ((m->oflags & VPO_UNMANAGED) != 0)
3645                 return (0);
3646         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3647         rw_rlock(lock);
3648 restart:
3649         count = 0;
3650         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3651                 pmap = PV_PMAP(pv);
3652                 if (!PMAP_TRYLOCK(pmap)) {
3653                         md_gen = m->md.pv_gen;
3654                         rw_runlock(lock);
3655                         PMAP_LOCK(pmap);
3656                         rw_rlock(lock);
3657                         if (md_gen != m->md.pv_gen) {
3658                                 PMAP_UNLOCK(pmap);
3659                                 goto restart;
3660                         }
3661                 }
3662                 pte = pmap_pte(pmap, pv->pv_va, &lvl);
3663                 if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3664                         count++;
3665                 PMAP_UNLOCK(pmap);
3666         }
3667         if ((m->flags & PG_FICTITIOUS) == 0) {
3668                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3669                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3670                         pmap = PV_PMAP(pv);
3671                         if (!PMAP_TRYLOCK(pmap)) {
3672                                 md_gen = m->md.pv_gen;
3673                                 pvh_gen = pvh->pv_gen;
3674                                 rw_runlock(lock);
3675                                 PMAP_LOCK(pmap);
3676                                 rw_rlock(lock);
3677                                 if (md_gen != m->md.pv_gen ||
3678                                     pvh_gen != pvh->pv_gen) {
3679                                         PMAP_UNLOCK(pmap);
3680                                         goto restart;
3681                                 }
3682                         }
3683                         pte = pmap_pte(pmap, pv->pv_va, &lvl);
3684                         if (pte != NULL &&
3685                             (pmap_load(pte) & ATTR_SW_WIRED) != 0)
3686                                 count++;
3687                         PMAP_UNLOCK(pmap);
3688                 }
3689         }
3690         rw_runlock(lock);
3691         return (count);
3692 }
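/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical sanity check a caller might make before tearing down all
 * mappings of a managed page; wired mappings must be released first, since
 * pmap_page_wired_mappings() only counts them.
 */
static void
pmap_assert_unwired_example(vm_page_t m)
{

        KASSERT(pmap_page_wired_mappings(m) == 0,
            ("page %p still has wired mappings", m));
}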
3693
3694 /*
3695  * Destroy all managed, non-wired mappings in the given user-space
3696  * pmap.  This pmap cannot be active on any processor besides the
3697  * caller.
3698  *
3699  * This function cannot be applied to the kernel pmap.  Moreover, it
3700  * is not intended for general use.  It is only to be used during
3701  * process termination.  Consequently, it can be implemented in ways
3702  * that make it faster than pmap_remove().  First, it can more quickly
3703  * destroy mappings by iterating over the pmap's collection of PV
3704  * entries, rather than searching the page table.  Second, it doesn't
3705  * have to test and clear the page table entries atomically, because
3706  * no processor is currently accessing the user address space.  In
3707  * particular, a page table entry's dirty bit won't change state once
3708  * this function starts.
3709  */
3710 void
3711 pmap_remove_pages(pmap_t pmap)
3712 {
3713         pd_entry_t *pde;
3714         pt_entry_t *pte, tpte;
3715         struct spglist free;
3716         vm_page_t m, ml3, mt;
3717         pv_entry_t pv;
3718         struct md_page *pvh;
3719         struct pv_chunk *pc, *npc;
3720         struct rwlock *lock;
3721         int64_t bit;
3722         uint64_t inuse, bitmask;
3723         int allfree, field, freed, idx, lvl;
3724         vm_paddr_t pa;
3725
3726         lock = NULL;
3727
3728         SLIST_INIT(&free);
3729         PMAP_LOCK(pmap);
3730         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3731                 allfree = 1;
3732                 freed = 0;
3733                 for (field = 0; field < _NPCM; field++) {
3734                         inuse = ~pc->pc_map[field] & pc_freemask[field];
3735                         while (inuse != 0) {
3736                                 bit = ffsl(inuse) - 1;
3737                                 bitmask = 1UL << bit;
3738                                 idx = field * 64 + bit;
3739                                 pv = &pc->pc_pventry[idx];
3740                                 inuse &= ~bitmask;
3741
3742                                 pde = pmap_pde(pmap, pv->pv_va, &lvl);
3743                                 KASSERT(pde != NULL,
3744                                     ("Attempting to remove an unmapped page"));
3745
3746                                 switch(lvl) {
3747                                 case 1:
3748                                         pte = pmap_l1_to_l2(pde, pv->pv_va);
3749                                         tpte = pmap_load(pte);
3750                                         KASSERT((tpte & ATTR_DESCR_MASK) ==
3751                                             L2_BLOCK,
3752                                             ("Attempting to remove an invalid "
3753                                             "block: %lx", tpte));
3754                                         tpte = pmap_load(pte);
3755                                         break;
3756                                 case 2:
3757                                         pte = pmap_l2_to_l3(pde, pv->pv_va);
3758                                         tpte = pmap_load(pte);
3759                                         KASSERT((tpte & ATTR_DESCR_MASK) ==
3760                                             L3_PAGE,
3761                                             ("Attempting to remove an invalid "
3762                                              "page: %lx", tpte));
3763                                         break;
3764                                 default:
3765                                         panic(
3766                                             "Invalid page directory level: %d",
3767                                             lvl);
3768                                 }
3769
3770 /*
3771  * We cannot remove wired pages from a process' mapping at this time
3772  */
3773                                 if (tpte & ATTR_SW_WIRED) {
3774                                         allfree = 0;
3775                                         continue;
3776                                 }
3777
3778                                 pa = tpte & ~ATTR_MASK;
3779
3780                                 m = PHYS_TO_VM_PAGE(pa);
3781                                 KASSERT(m->phys_addr == pa,
3782                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3783                                     m, (uintmax_t)m->phys_addr,
3784                                     (uintmax_t)tpte));
3785
3786                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3787                                     m < &vm_page_array[vm_page_array_size],
3788                                     ("pmap_remove_pages: bad pte %#jx",
3789                                     (uintmax_t)tpte));
3790
3791                                 pmap_load_clear(pte);
3792
3793                                 /*
3794                                  * Update the vm_page_t clean/reference bits.
3795                                  */
3796                                 if ((tpte & ATTR_AP_RW_BIT) ==
3797                                     ATTR_AP(ATTR_AP_RW)) {
3798                                         switch (lvl) {
3799                                         case 1:
3800                                                 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3801                                                         vm_page_dirty(mt);
3802                                                 break;
3803                                         case 2:
3804                                                 vm_page_dirty(m);
3805                                                 break;
3806                                         }
3807                                 }
3808
3809                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
3810
3811                                 /* Mark free */
3812                                 pc->pc_map[field] |= bitmask;
3813                                 switch (lvl) {
3814                                 case 1:
3815                                         pmap_resident_count_dec(pmap,
3816                                             L2_SIZE / PAGE_SIZE);
3817                                         pvh = pa_to_pvh(tpte & ~ATTR_MASK);
3818                                         TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
3819                                         pvh->pv_gen++;
3820                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
3821                                                 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3822                                                         if ((mt->aflags & PGA_WRITEABLE) != 0 &&
3823                                                             TAILQ_EMPTY(&mt->md.pv_list))
3824                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3825                                         }
3826                                         ml3 = pmap_remove_pt_page(pmap,
3827                                             pv->pv_va);
3828                                         if (ml3 != NULL) {
3829                                                 pmap_resident_count_dec(pmap,1);
3830                                                 KASSERT(ml3->wire_count == NL3PG,
3831                                                     ("pmap_remove_pages: l3 page wire count error"));
3832                                                 ml3->wire_count = 1;
3833                                                 vm_page_unwire_noq(ml3);
3834                                                 pmap_add_delayed_free_list(ml3,
3835                                                     &free, FALSE);
3836                                         }
3837                                         break;
3838                                 case 2:
3839                                         pmap_resident_count_dec(pmap, 1);
3840                                         TAILQ_REMOVE(&m->md.pv_list, pv,
3841                                             pv_next);
3842                                         m->md.pv_gen++;
3843                                         if ((m->aflags & PGA_WRITEABLE) != 0 &&
3844                                             TAILQ_EMPTY(&m->md.pv_list) &&
3845                                             (m->flags & PG_FICTITIOUS) == 0) {
3846                                                 pvh = pa_to_pvh(
3847                                                     VM_PAGE_TO_PHYS(m));
3848                                                 if (TAILQ_EMPTY(&pvh->pv_list))
3849                                                         vm_page_aflag_clear(m,
3850                                                             PGA_WRITEABLE);
3851                                         }
3852                                         break;
3853                                 }
3854                                 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
3855                                     &free);
3856                                 freed++;
3857                         }
3858                 }
3859                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3860                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3861                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3862                 if (allfree) {
3863                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3864                         free_pv_chunk(pc);
3865                 }
3866         }
3867         pmap_invalidate_all(pmap);
3868         if (lock != NULL)
3869                 rw_wunlock(lock);
3870         PMAP_UNLOCK(pmap);
3871         vm_page_free_pages_toq(&free, false);
3872 }
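/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the bitmap walk used by pmap_remove_pages() above, restated as a
 * hypothetical helper that visits every allocated pv entry in one chunk.
 * pc_map has a bit set for each free slot, so its complement (masked by
 * pc_freemask) enumerates the slots that are in use.
 */
static void
pv_chunk_foreach_example(struct pv_chunk *pc, void (*visit)(pv_entry_t))
{
        uint64_t inuse;
        int bit, field;

        for (field = 0; field < _NPCM; field++) {
                inuse = ~pc->pc_map[field] & pc_freemask[field];
                while (inuse != 0) {
                        bit = ffsl(inuse) - 1;
                        inuse &= ~(1UL << bit);
                        visit(&pc->pc_pventry[field * 64 + bit]);
                }
        }
}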
3873
3874 /*
3875  * This is used to check if a page has been accessed or modified. As we
3876  * don't have a bit to see if it has been modified we have to assume it
3877  * has been if the page is read/write.
3878  */
3879 static boolean_t
3880 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3881 {
3882         struct rwlock *lock;
3883         pv_entry_t pv;
3884         struct md_page *pvh;
3885         pt_entry_t *pte, mask, value;
3886         pmap_t pmap;
3887         int lvl, md_gen, pvh_gen;
3888         boolean_t rv;
3889
3890         rv = FALSE;
3891         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3892         rw_rlock(lock);
3893 restart:
3894         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3895                 pmap = PV_PMAP(pv);
3896                 if (!PMAP_TRYLOCK(pmap)) {
3897                         md_gen = m->md.pv_gen;
3898                         rw_runlock(lock);
3899                         PMAP_LOCK(pmap);
3900                         rw_rlock(lock);
3901                         if (md_gen != m->md.pv_gen) {
3902                                 PMAP_UNLOCK(pmap);
3903                                 goto restart;
3904                         }
3905                 }
3906                 pte = pmap_pte(pmap, pv->pv_va, &lvl);
3907                 KASSERT(lvl == 3,
3908                     ("pmap_page_test_mappings: Invalid level %d", lvl));
3909                 mask = 0;
3910                 value = 0;
3911                 if (modified) {
3912                         mask |= ATTR_AP_RW_BIT;
3913                         value |= ATTR_AP(ATTR_AP_RW);
3914                 }
3915                 if (accessed) {
3916                         mask |= ATTR_AF | ATTR_DESCR_MASK;
3917                         value |= ATTR_AF | L3_PAGE;
3918                 }
3919                 rv = (pmap_load(pte) & mask) == value;
3920                 PMAP_UNLOCK(pmap);
3921                 if (rv)
3922                         goto out;
3923         }
3924         if ((m->flags & PG_FICTITIOUS) == 0) {
3925                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3926                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3927                         pmap = PV_PMAP(pv);
3928                         if (!PMAP_TRYLOCK(pmap)) {
3929                                 md_gen = m->md.pv_gen;
3930                                 pvh_gen = pvh->pv_gen;
3931                                 rw_runlock(lock);
3932                                 PMAP_LOCK(pmap);
3933                                 rw_rlock(lock);
3934                                 if (md_gen != m->md.pv_gen ||
3935                                     pvh_gen != pvh->pv_gen) {
3936                                         PMAP_UNLOCK(pmap);
3937                                         goto restart;
3938                                 }
3939                         }
3940                         pte = pmap_pte(pmap, pv->pv_va, &lvl);
3941                         KASSERT(lvl == 2,
3942                             ("pmap_page_test_mappings: Invalid level %d", lvl));
3943                         mask = 0;
3944                         value = 0;
3945                         if (modified) {
3946                                 mask |= ATTR_AP_RW_BIT;
3947                                 value |= ATTR_AP(ATTR_AP_RW);
3948                         }
3949                         if (accessed) {
3950                                 mask |= ATTR_AF | ATTR_DESCR_MASK;
3951                                 value |= ATTR_AF | L2_BLOCK;
3952                         }
3953                         rv = (pmap_load(pte) & mask) == value;
3954                         PMAP_UNLOCK(pmap);
3955                         if (rv)
3956                                 goto out;
3957                 }
3958         }
3959 out:
3960         rw_runlock(lock);
3961         return (rv);
3962 }
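/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the mask/value pairs pmap_page_test_mappings() compares L3 PTEs against,
 * extracted into a hypothetical helper.  With no hardware dirty bit,
 * "modified" is approximated by the mapping being writable (AP == RW);
 * "accessed" requires both the access flag and a valid page descriptor.
 */
static void
pmap_test_mask_example(boolean_t accessed, boolean_t modified,
    pt_entry_t *mask, pt_entry_t *value)
{

        *mask = *value = 0;
        if (modified) {
                *mask |= ATTR_AP_RW_BIT;
                *value |= ATTR_AP(ATTR_AP_RW);
        }
        if (accessed) {
                *mask |= ATTR_AF | ATTR_DESCR_MASK;
                *value |= ATTR_AF | L3_PAGE;
        }
}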
3963
3964 /*
3965  *      pmap_is_modified:
3966  *
3967  *      Return whether or not the specified physical page was modified
3968  *      in any physical maps.
3969  */
3970 boolean_t
3971 pmap_is_modified(vm_page_t m)
3972 {
3973
3974         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3975             ("pmap_is_modified: page %p is not managed", m));
3976
3977         /*
3978          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
3979          * concurrently set while the object is locked.  Thus, if PGA_WRITEABLE
3980          * is clear, no PTEs can have PG_M set.
3981          */
3982         VM_OBJECT_ASSERT_WLOCKED(m->object);
3983         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
3984                 return (FALSE);
3985         return (pmap_page_test_mappings(m, FALSE, TRUE));
3986 }
3987
3988 /*
3989  *      pmap_is_prefaultable:
3990  *
3991  *      Return whether or not the specified virtual address is eligible
3992  *      for prefault.
3993  */
3994 boolean_t
3995 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3996 {
3997         pt_entry_t *pte;
3998         boolean_t rv;
3999         int lvl;
4000
4001         rv = FALSE;
4002         PMAP_LOCK(pmap);
4003         pte = pmap_pte(pmap, addr, &lvl);
4004         if (pte != NULL && pmap_load(pte) != 0) {
4005                 rv = TRUE;
4006         }
4007         PMAP_UNLOCK(pmap);
4008         return (rv);
4009 }
4010
4011 /*
4012  *      pmap_is_referenced:
4013  *
4014  *      Return whether or not the specified physical page was referenced
4015  *      in any physical maps.
4016  */
4017 boolean_t
4018 pmap_is_referenced(vm_page_t m)
4019 {
4020
4021         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4022             ("pmap_is_referenced: page %p is not managed", m));
4023         return (pmap_page_test_mappings(m, TRUE, FALSE));
4024 }
4025
4026 /*
4027  * Clear the write and modified bits in each of the given page's mappings.
4028  */
4029 void
4030 pmap_remove_write(vm_page_t m)
4031 {
4032         struct md_page *pvh;
4033         pmap_t pmap;
4034         struct rwlock *lock;
4035         pv_entry_t next_pv, pv;
4036         pt_entry_t oldpte, *pte;
4037         vm_offset_t va;
4038         int lvl, md_gen, pvh_gen;
4039
4040         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4041             ("pmap_remove_write: page %p is not managed", m));
4042
4043         /*
4044          * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
4045          * set by another thread while the object is locked.  Thus,
4046          * if PGA_WRITEABLE is clear, no page table entries need updating.
4047          */
4048         VM_OBJECT_ASSERT_WLOCKED(m->object);
4049         if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
4050                 return;
4051         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4052         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4053             pa_to_pvh(VM_PAGE_TO_PHYS(m));
4054 retry_pv_loop:
4055         rw_wlock(lock);
4056         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4057                 pmap = PV_PMAP(pv);
4058                 if (!PMAP_TRYLOCK(pmap)) {
4059                         pvh_gen = pvh->pv_gen;
4060                         rw_wunlock(lock);
4061                         PMAP_LOCK(pmap);
4062                         rw_wlock(lock);
4063                         if (pvh_gen != pvh->pv_gen) {
4064                                 PMAP_UNLOCK(pmap);
4065                                 rw_wunlock(lock);
4066                                 goto retry_pv_loop;
4067                         }
4068                 }
4069                 va = pv->pv_va;
4070                 pte = pmap_pte(pmap, pv->pv_va, &lvl);
4071                 if ((pmap_load(pte) & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))
4072                         pmap_demote_l2_locked(pmap, pte, va & ~L2_OFFSET,
4073                             &lock);
4074                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4075                     ("inconsistent pv lock %p %p for page %p",
4076                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4077                 PMAP_UNLOCK(pmap);
4078         }
4079         TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4080                 pmap = PV_PMAP(pv);
4081                 if (!PMAP_TRYLOCK(pmap)) {
4082                         pvh_gen = pvh->pv_gen;
4083                         md_gen = m->md.pv_gen;
4084                         rw_wunlock(lock);
4085                         PMAP_LOCK(pmap);
4086                         rw_wlock(lock);
4087                         if (pvh_gen != pvh->pv_gen ||
4088                             md_gen != m->md.pv_gen) {
4089                                 PMAP_UNLOCK(pmap);
4090                                 rw_wunlock(lock);
4091                                 goto retry_pv_loop;
4092                         }
4093                 }
4094                 pte = pmap_pte(pmap, pv->pv_va, &lvl);
4095 retry:
4096                 oldpte = pmap_load(pte);
4097                 if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) {
4098                         if (!atomic_cmpset_long(pte, oldpte,
4099                             oldpte | ATTR_AP(ATTR_AP_RO)))
4100                                 goto retry;
4101                         if ((oldpte & ATTR_AF) != 0)
4102                                 vm_page_dirty(m);
4103                         pmap_invalidate_page(pmap, pv->pv_va);
4104                 }
4105                 PMAP_UNLOCK(pmap);
4106         }
4107         rw_wunlock(lock);
4108         vm_page_aflag_clear(m, PGA_WRITEABLE);
4109 }
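/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the lock-free write-permission downgrade used by pmap_remove_write()
 * above, as a hypothetical helper.  The compare-and-set loop retries if
 * another CPU updates the PTE (e.g. sets the access flag) between the
 * load and the store; the old value is returned so the caller can decide
 * whether to dirty the page.
 */
static pt_entry_t
pmap_clear_write_example(pt_entry_t *pte)
{
        pt_entry_t oldpte;

        do {
                oldpte = pmap_load(pte);
                if ((oldpte & ATTR_AP_RW_BIT) != ATTR_AP(ATTR_AP_RW))
                        break;
        } while (!atomic_cmpset_long(pte, oldpte,
            oldpte | ATTR_AP(ATTR_AP_RO)));
        return (oldpte);
}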
4110
4111 static __inline boolean_t
4112 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
4113 {
4114
4115         return (FALSE);
4116 }
4117
4118 /*
4119  *      pmap_ts_referenced:
4120  *
4121  *      Return a count of reference bits for a page, clearing those bits.
4122  *      It is not necessary for every reference bit to be cleared, but it
4123  *      is necessary that 0 only be returned when there are truly no
4124  *      reference bits set.
4125  *
4126  *      As an optimization, update the page's dirty field if a modified bit is
4127  *      found while counting reference bits.  This opportunistic update can be
4128  *      performed at low cost and can eliminate the need for some future calls
4129  *      to pmap_is_modified().  However, since this function stops after
4130  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4131  *      dirty pages.  Those dirty pages will only be detected by a future call
4132  *      to pmap_is_modified().
4133  */
4134 int
4135 pmap_ts_referenced(vm_page_t m)
4136 {
4137         struct md_page *pvh;
4138         pv_entry_t pv, pvf;
4139         pmap_t pmap;
4140         struct rwlock *lock;
4141         pd_entry_t *pde, tpde;
4142         pt_entry_t *pte, tpte;
4143         pt_entry_t *l3;
4144         vm_offset_t va;
4145         vm_paddr_t pa;
4146         int cleared, md_gen, not_cleared, lvl, pvh_gen;
4147         struct spglist free;
4148         bool demoted;
4149
4150         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4151             ("pmap_ts_referenced: page %p is not managed", m));
4152         SLIST_INIT(&free);
4153         cleared = 0;
4154         pa = VM_PAGE_TO_PHYS(m);
4155         lock = PHYS_TO_PV_LIST_LOCK(pa);
4156         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4157         rw_wlock(lock);
4158 retry:
4159         not_cleared = 0;
4160         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4161                 goto small_mappings;
4162         pv = pvf;
4163         do {
4164                 if (pvf == NULL)
4165                         pvf = pv;
4166                 pmap = PV_PMAP(pv);
4167                 if (!PMAP_TRYLOCK(pmap)) {
4168                         pvh_gen = pvh->pv_gen;
4169                         rw_wunlock(lock);
4170                         PMAP_LOCK(pmap);
4171                         rw_wlock(lock);
4172                         if (pvh_gen != pvh->pv_gen) {
4173                                 PMAP_UNLOCK(pmap);
4174                                 goto retry;
4175                         }
4176                 }
4177                 va = pv->pv_va;
4178                 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4179                 KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
4180                 KASSERT(lvl == 1,
4181                     ("pmap_ts_referenced: invalid pde level %d", lvl));
4182                 tpde = pmap_load(pde);
4183                 KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
4184                     ("pmap_ts_referenced: found an invalid l1 table"));
4185                 pte = pmap_l1_to_l2(pde, pv->pv_va);
4186                 tpte = pmap_load(pte);
4187                 if (pmap_page_dirty(tpte)) {
4188                         /*
4189                          * Although "tpte" is mapping a 2MB page, because
4190                          * this function is called at a 4KB page granularity,
4191                          * we only update the 4KB page under test.
4192                          */
4193                         vm_page_dirty(m);
4194                 }
4195                 if ((tpte & ATTR_AF) != 0) {
4196                         /*
4197                          * Since this reference bit is shared by 512 4KB
4198                          * pages, it should not be cleared every time it is
4199                          * tested.  Apply a simple "hash" function on the
4200                          * physical page number, the virtual superpage number,
4201                          * and the pmap address to select one 4KB page out of
4202                          * the 512 on which testing the reference bit will
4203                          * result in clearing that reference bit.  This
4204                          * function is designed to avoid the selection of the
4205                          * same 4KB page for every 2MB page mapping.
4206                          *
4207                          * On demotion, a mapping that hasn't been referenced
4208                          * is simply destroyed.  To avoid the possibility of a
4209                          * subsequent page fault on a demoted wired mapping,
4210                          * always leave its reference bit set.  Moreover,
4211                          * since the superpage is wired, the current state of
4212                          * its reference bit won't affect page replacement.
4213                          */
4214                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4215                             (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4216                             (tpte & ATTR_SW_WIRED) == 0) {
4217                                 if (safe_to_clear_referenced(pmap, tpte)) {
4218                                         /*
4219                                          * TODO: We don't handle the access
4220                                          * flag at all. We need to be able
4221                                          * to set it in the exception handler.
4222                                          */
4223                                         panic("ARM64TODO: "
4224                                             "safe_to_clear_referenced\n");
4225                                 } else if (pmap_demote_l2_locked(pmap, pte,
4226                                     pv->pv_va, &lock) != NULL) {
4227                                         demoted = true;
4228                                         va += VM_PAGE_TO_PHYS(m) -
4229                                             (tpte & ~ATTR_MASK);
4230                                         l3 = pmap_l2_to_l3(pte, va);
4231                                         pmap_remove_l3(pmap, l3, va,
4232                                             pmap_load(pte), NULL, &lock);
4233                                 } else
4234                                         demoted = true;
4235
4236                                 if (demoted) {
4237                                         /*
4238                                          * The superpage mapping was removed
4239                                          * entirely and therefore 'pv' is no
4240                                          * longer valid.
4241                                          */
4242                                         if (pvf == pv)
4243                                                 pvf = NULL;
4244                                         pv = NULL;
4245                                 }
4246                                 cleared++;
4247                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4248                                     ("inconsistent pv lock %p %p for page %p",
4249                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4250                         } else
4251                                 not_cleared++;
4252                 }
4253                 PMAP_UNLOCK(pmap);
4254                 /* Rotate the PV list if it has more than one entry. */
4255                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4256                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4257                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4258                         pvh->pv_gen++;
4259                 }
4260                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4261                         goto out;
4262         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4263 small_mappings:
4264         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4265                 goto out;
4266         pv = pvf;
4267         do {
4268                 if (pvf == NULL)
4269                         pvf = pv;
4270                 pmap = PV_PMAP(pv);
4271                 if (!PMAP_TRYLOCK(pmap)) {
4272                         pvh_gen = pvh->pv_gen;
4273                         md_gen = m->md.pv_gen;
4274                         rw_wunlock(lock);
4275                         PMAP_LOCK(pmap);
4276                         rw_wlock(lock);
4277                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4278                                 PMAP_UNLOCK(pmap);
4279                                 goto retry;
4280                         }
4281                 }
4282                 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4283                 KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
4284                 KASSERT(lvl == 2,
4285                     ("pmap_ts_referenced: invalid pde level %d", lvl));
4286                 tpde = pmap_load(pde);
4287                 KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
4288                     ("pmap_ts_referenced: found an invalid l2 table"));
4289                 pte = pmap_l2_to_l3(pde, pv->pv_va);
4290                 tpte = pmap_load(pte);
4291                 if (pmap_page_dirty(tpte))
4292                         vm_page_dirty(m);
4293                 if ((tpte & ATTR_AF) != 0) {
4294                         if (safe_to_clear_referenced(pmap, tpte)) {
4295                                 /*
4296                                  * TODO: We don't handle the access flag
4297                                  * at all. We need to be able to set it in
4298                                  * the exception handler.
4299                                  */
4300                                 panic("ARM64TODO: safe_to_clear_referenced\n");
4301                         } else if ((tpte & ATTR_SW_WIRED) == 0) {
4302                                 /*
4303                                  * Wired pages cannot be paged out so
4304                                  * doing accessed bit emulation for
4305                                  * them is wasted effort. We do the
4306                                  * hard work for unwired pages only.
4307                                  */
4308                                 pmap_remove_l3(pmap, pte, pv->pv_va, tpde,
4309                                     &free, &lock);
4310                                 pmap_invalidate_page(pmap, pv->pv_va);
4311                                 cleared++;
4312                                 if (pvf == pv)
4313                                         pvf = NULL;
4314                                 pv = NULL;
4315                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4316                                     ("inconsistent pv lock %p %p for page %p",
4317                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4318                         } else
4319                                 not_cleared++;
4320                 }
4321                 PMAP_UNLOCK(pmap);
4322                 /* Rotate the PV list if it has more than one entry. */
4323                 if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4324                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4325                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4326                         m->md.pv_gen++;
4327                 }
4328         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4329             not_cleared < PMAP_TS_REFERENCED_MAX);
4330 out:
4331         rw_wunlock(lock);
4332         vm_page_free_pages_toq(&free, false);
4333         return (cleared + not_cleared);
4334 }
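
/*
 * An illustrative note on the reference-bit selection in the 2MB loop above,
 * assuming the usual 4KB granule (PAGE_SHIFT == 12, L2_SHIFT == 21,
 * Ln_ENTRIES == 512): for a fixed 2MB mapping, "pv->pv_va >> L2_SHIFT" and
 * the pmap address are constant, while the low nine bits of
 * "pa >> PAGE_SHIFT" take every value from 0 to 511 across the 512
 * constituent 4KB pages.  The XOR-and-mask test is therefore true for
 * exactly one of those pages, so only one page out of each 512 causes the
 * shared ATTR_AF bit of its superpage mapping to be cleared.
 */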
4335
4336 /*
4337  *      Apply the given advice to the specified range of addresses within the
4338  *      given pmap.  Depending on the advice, clear the referenced and/or
4339  *      modified flags in each mapping and set the mapped page's dirty field.
4340  */
4341 void
4342 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4343 {
4344 }
4345
4346 /*
4347  *      Clear the modify bits on the specified physical page.
4348  */
4349 void
4350 pmap_clear_modify(vm_page_t m)
4351 {
4352
4353         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4354             ("pmap_clear_modify: page %p is not managed", m));
4355         VM_OBJECT_ASSERT_WLOCKED(m->object);
4356         KASSERT(!vm_page_xbusied(m),
4357             ("pmap_clear_modify: page %p is exclusive busied", m));
4358
4359         /*
4360          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4361          * If the object containing the page is locked and the page is not
4362          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4363          */
4364         if ((m->aflags & PGA_WRITEABLE) == 0)
4365                 return;
4366
4367         /* ARM64TODO: We lack support for tracking if a page is modified */
4368 }
4369
4370 void *
4371 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4372 {
4373         struct pmap_preinit_mapping *ppim;
4374         vm_offset_t va, offset;
4375         pd_entry_t *pde;
4376         pt_entry_t *l2;
4377         int i, lvl, l2_blocks, free_l2_count, start_idx;
4378
4379         if (!vm_initialized) {
4380                 /*
4381                  * No L3 ptables so map entire L2 blocks where start VA is:
4382                  *      preinit_map_va + start_idx * L2_SIZE
4383                  * There may be duplicate mappings (multiple VA -> same PA) but
4384                  * ARM64 dcache is always PIPT so that's acceptable.
4385                  */
4386                 if (size == 0)
4387                         return (NULL);
4388
4389                 /* Calculate how many full L2 blocks are needed for the mapping */
4390                 l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
4391
4392                 offset = pa & L2_OFFSET;
4393
4394                 if (preinit_map_va == 0)
4395                         return (NULL);
4396
4397                 /* Map 2MiB L2 blocks from reserved VA space */
4398
4399                 free_l2_count = 0;
4400                 start_idx = -1;
4401                 /* Find enough free contiguous VA space */
4402                 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
4403                         ppim = pmap_preinit_mapping + i;
4404                         if (free_l2_count > 0 && ppim->pa != 0) {
4405                                 /* Not enough space here */
4406                                 free_l2_count = 0;
4407                                 start_idx = -1;
4408                                 continue;
4409                         }
4410
4411                         if (ppim->pa == 0) {
4412                                 /* Free L2 block */
4413                                 if (start_idx == -1)
4414                                         start_idx = i;
4415                                 free_l2_count++;
4416                                 if (free_l2_count == l2_blocks)
4417                                         break;
4418                         }
4419                 }
4420                 if (free_l2_count != l2_blocks)
4421                         panic("%s: too many preinit mappings", __func__);
4422
4423                 va = preinit_map_va + (start_idx * L2_SIZE);
4424                 for (i = start_idx; i < start_idx + l2_blocks; i++) {
4425                         /* Mark entries as allocated */
4426                         ppim = pmap_preinit_mapping + i;
4427                         ppim->pa = pa;
4428                         ppim->va = va + offset;
4429                         ppim->size = size;
4430                 }
4431
4432                 /* Map L2 blocks */
4433                 pa = rounddown2(pa, L2_SIZE);
4434                 for (i = 0; i < l2_blocks; i++) {
4435                         pde = pmap_pde(kernel_pmap, va, &lvl);
4436                         KASSERT(pde != NULL,
4437                             ("pmap_mapbios: Invalid page entry, va: 0x%lx", va));
4438                         KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl));
4439
4440                         /* Insert L2_BLOCK */
4441                         l2 = pmap_l1_to_l2(pde, va);
4442                         pmap_load_store(l2,
4443                             pa | ATTR_DEFAULT | ATTR_XN |
4444                             ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
4445                         pmap_invalidate_range(kernel_pmap, va, va + L2_SIZE);
4446
4447                         va += L2_SIZE;
4448                         pa += L2_SIZE;
4449                 }
4450
4451                 va = preinit_map_va + (start_idx * L2_SIZE);
4452
4453         } else {
4454                 /* kva_alloc may be used to map the pages */
4455                 offset = pa & PAGE_MASK;
4456                 size = round_page(offset + size);
4457
4458                 va = kva_alloc(size);
4459                 if (va == 0)
4460                         panic("%s: Couldn't allocate KVA", __func__);
4461
4462                 pde = pmap_pde(kernel_pmap, va, &lvl);
4463                 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
4464
4465                 /* L3 table is linked */
4466                 va = trunc_page(va);
4467                 pa = trunc_page(pa);
4468                 pmap_kenter(va, size, pa, CACHED_MEMORY);
4469         }
4470
4471         return ((void *)(va + offset));
4472 }
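
/*
 * An illustrative use of pmap_mapbios()/pmap_unmapbios() (the "table_pa" and
 * "table_size" names are placeholders): a driver that needs CPU access to a
 * firmware table at a known physical address would typically do
 *
 *	void *p;
 *
 *	p = pmap_mapbios(table_pa, table_size);
 *	... parse the table through "p" ...
 *	pmap_unmapbios((vm_offset_t)p, table_size);
 *
 * Before vm_initialized is set the mapping is built from 2MiB L2 blocks in
 * the preinit region; afterwards it is backed by kva_alloc() and
 * pmap_kenter().
 */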
4473
4474 void
4475 pmap_unmapbios(vm_offset_t va, vm_size_t size)
4476 {
4477         struct pmap_preinit_mapping *ppim;
4478         vm_offset_t offset, tmpsize, va_trunc;
4479         pd_entry_t *pde;
4480         pt_entry_t *l2;
4481         int i, lvl, l2_blocks, block;
4482
4483         l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
4484         KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
4485
4486         /* Remove preinit mapping */
4487         block = 0;
4488         for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
4489                 ppim = pmap_preinit_mapping + i;
4490                 if (ppim->va == va) {
4491                         KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch"));
4492                         ppim->va = 0;
4493                         ppim->pa = 0;
4494                         ppim->size = 0;
4495                         offset = block * L2_SIZE;
4496                         va_trunc = rounddown2(va, L2_SIZE) + offset;
4497
4498                         /* Remove L2_BLOCK */
4499                         pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
4500                         KASSERT(pde != NULL,
4501                             ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc));
4502                         l2 = pmap_l1_to_l2(pde, va_trunc);
4503                         pmap_load_clear(l2);
4504                         pmap_invalidate_range(kernel_pmap, va_trunc, va_trunc + L2_SIZE);
4505
4506                         if (block == (l2_blocks - 1))
4507                                 return;
4508                         block++;
4509                 }
4510         }
4511
4512         /* Unmap the pages reserved with kva_alloc. */
4513         if (vm_initialized) {
4514                 offset = va & PAGE_MASK;
4515                 size = round_page(offset + size);
4516                 va = trunc_page(va);
4517
4518                 pde = pmap_pde(kernel_pmap, va, &lvl);
4519                 KASSERT(pde != NULL,
4520                     ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
4521                 KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
4522
4523                 /* Unmap and invalidate the pages */
4524                 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
4525                         pmap_kremove(va + tmpsize);
4526
4527                 kva_free(va, size);
4528         }
4529 }
4530
4531 /*
4532  * Sets the memory attribute for the specified page.
4533  */
4534 void
4535 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4536 {
4537
4538         m->md.pv_memattr = ma;
4539
4540         /*
4541          * If "m" is a normal page, update its direct mapping.  This update
4542          * can be relied upon to perform any cache operations that are
4543          * required for data coherence.
4544          */
4545         if ((m->flags & PG_FICTITIOUS) == 0 &&
4546             pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4547             m->md.pv_memattr) != 0)
4548                 panic("memory attribute change on the direct map failed");
4549 }
4550
4551 /*
4552  * Changes the specified virtual address range's memory type to that given by
4553  * the parameter "mode".  The specified virtual address range must be
4554  * completely contained within either the direct map or the kernel map.  If
4555  * the virtual address range is contained within the kernel map, then the
4556  * memory type for each of the corresponding ranges of the direct map is also
4557  * changed.  (The corresponding ranges of the direct map are those ranges that
4558  * map the same physical pages as the specified virtual address range.)  These
4559  * changes to the direct map are necessary because two or more mappings to
4560  * the same physical page with different memory attributes result in
4561  * unpredictable behavior on this architecture.
4562  *
4563  * Returns zero if the change completed successfully, and either EINVAL or
4564  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4565  * of the virtual address range was not mapped, and ENOMEM is returned if
4566  * there was insufficient memory available to complete the change.  In the
4567  * latter case, the memory type may have been changed on some part of the
4568  * virtual address range or the direct map.
4569  */
4570 static int
4571 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4572 {
4573         int error;
4574
4575         PMAP_LOCK(kernel_pmap);
4576         error = pmap_change_attr_locked(va, size, mode);
4577         PMAP_UNLOCK(kernel_pmap);
4578         return (error);
4579 }
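
/*
 * A minimal sketch of how this interface is reached (the page pointer "m" is
 * a placeholder): pmap_page_set_memattr() above rewrites the page's direct
 * map entry with essentially
 *
 *	(void)pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 *	    m->md.pv_memattr);
 *
 * except that it panics if the change fails.  The range passed in must lie
 * within the DMAP; pmap_change_attr_locked() below rejects anything else
 * with EINVAL.
 */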
4580
4581 static int
4582 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4583 {
4584         vm_offset_t base, offset, tmpva;
4585         pt_entry_t l3, *pte, *newpte;
4586         int lvl;
4587
4588         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4589         base = trunc_page(va);
4590         offset = va & PAGE_MASK;
4591         size = round_page(offset + size);
4592
4593         if (!VIRT_IN_DMAP(base))
4594                 return (EINVAL);
4595
4596         for (tmpva = base; tmpva < base + size; ) {
4597                 pte = pmap_pte(kernel_pmap, va, &lvl);
4598                 if (pte == NULL)
4599                         return (EINVAL);
4600
4601                 if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
4602                         /*
4603                          * We already have the correct attribute,
4604                          * ignore this entry.
4605                          */
4606                         switch (lvl) {
4607                         default:
4608                                 panic("Invalid DMAP table level: %d\n", lvl);
4609                         case 1:
4610                                 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4611                                 break;
4612                         case 2:
4613                                 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4614                                 break;
4615                         case 3:
4616                                 tmpva += PAGE_SIZE;
4617                                 break;
4618                         }
4619                 } else {
4620                         /*
4621                          * Split the entry down to a level 3 table,
4622                          * then set the new attribute.
4623                          */
4624                         switch (lvl) {
4625                         default:
4626                                 panic("Invalid DMAP table level: %d\n", lvl);
4627                         case 1:
4628                                 newpte = pmap_demote_l1(kernel_pmap, pte,
4629                                     tmpva & ~L1_OFFSET);
4630                                 if (newpte == NULL)
4631                                         return (EINVAL);
4632                                 pte = pmap_l1_to_l2(pte, tmpva);
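                                /* FALLTHROUGH: demote the new L2 entry. */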
4633                         case 2:
4634                                 newpte = pmap_demote_l2(kernel_pmap, pte,
4635                                     tmpva & ~L2_OFFSET);
4636                                 if (newpte == NULL)
4637                                         return (EINVAL);
4638                                 pte = pmap_l2_to_l3(pte, tmpva);
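                                /* FALLTHROUGH: update the new L3 entry. */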
4639                         case 3:
4640                                 /* Update the entry */
4641                                 l3 = pmap_load(pte);
4642                                 l3 &= ~ATTR_IDX_MASK;
4643                                 l3 |= ATTR_IDX(mode);
4644                                 if (mode == DEVICE_MEMORY)
4645                                         l3 |= ATTR_XN;
4646
4647                                 pmap_update_entry(kernel_pmap, pte, l3, tmpva,
4648                                     PAGE_SIZE);
4649
4650                                 /*
4651                                  * If moving to a non-cacheable entry,
4652                                  * flush the cache.
4653                                  */
4654                                 if (mode == VM_MEMATTR_UNCACHEABLE)
4655                                         cpu_dcache_wbinv_range(tmpva, L3_SIZE);
4656
4657                                 break;
4658                         }
4659                         tmpva += PAGE_SIZE;
4660                 }
4661         }
4662
4663         return (0);
4664 }
4665
4666 /*
4667  * Create an L2 table to map all addresses within an L1 mapping.
4668  */
4669 static pt_entry_t *
4670 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
4671 {
4672         pt_entry_t *l2, newl2, oldl1;
4673         vm_offset_t tmpl1;
4674         vm_paddr_t l2phys, phys;
4675         vm_page_t ml2;
4676         int i;
4677
4678         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4679         oldl1 = pmap_load(l1);
4680         KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
4681             ("pmap_demote_l1: Demoting a non-block entry"));
4682         KASSERT((va & L1_OFFSET) == 0,
4683             ("pmap_demote_l1: Invalid virtual address %#lx", va));
4684         KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
4685             ("pmap_demote_l1: Level 1 table shouldn't be managed"));
4686
4687         tmpl1 = 0;
4688         if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
4689                 tmpl1 = kva_alloc(PAGE_SIZE);
4690                 if (tmpl1 == 0)
4691                         return (NULL);
4692         }
4693
4694         if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
4695             VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4696                 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
4697                     " in pmap %p", va, pmap);
4698                 return (NULL);
4699         }
4700
4701         l2phys = VM_PAGE_TO_PHYS(ml2);
4702         l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
4703
4704         /* The physical address that the range points at */
4705         phys = oldl1 & ~ATTR_MASK;
4706         /* The attributes from the old l1 entry to be copied */
4707         newl2 = oldl1 & ATTR_MASK;
4708
4709         /* Create the new entries */
4710         for (i = 0; i < Ln_ENTRIES; i++) {
4711                 l2[i] = newl2 | phys;
4712                 phys += L2_SIZE;
4713         }
4714         KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
4715             ("Invalid l2 page (%lx != %lx)", l2[0],
4716             (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
4717
4718         if (tmpl1 != 0) {
4719                 pmap_kenter(tmpl1, PAGE_SIZE,
4720                     DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
4721                 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
4722         }
4723
4724         pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
4725
4726         if (tmpl1 != 0) {
4727                 pmap_kremove(tmpl1);
4728                 kva_free(tmpl1, PAGE_SIZE);
4729         }
4730
4731         return (l2);
4732 }
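
/*
 * For scale, assuming the 4KB translation granule used by this pmap:
 * L1_SIZE is 1GiB and L2_SIZE is 2MiB, so the loop above fills all
 * Ln_ENTRIES (512) slots of the new table, and 512 * 2MiB reproduces the
 * original 1GiB block exactly.
 */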
4733
4734 /*
4735  * Create an L3 table to map all addresses within an L2 mapping.
4736  */
4737 static pt_entry_t *
4738 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
4739     struct rwlock **lockp)
4740 {
4741         pt_entry_t *l3, newl3, oldl2;
4742         vm_offset_t tmpl2;
4743         vm_paddr_t l3phys, phys;
4744         vm_page_t ml3;
4745         int i;
4746
4747         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4748         l3 = NULL;
4749         oldl2 = pmap_load(l2);
4750         KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
4751             ("pmap_demote_l2: Demoting a non-block entry"));
4752         KASSERT((va & L2_OFFSET) == 0,
4753             ("pmap_demote_l2: Invalid virtual address %#lx", va));
4754
4755         tmpl2 = 0;
4756         if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
4757                 tmpl2 = kva_alloc(PAGE_SIZE);
4758                 if (tmpl2 == 0)
4759                         return (NULL);
4760         }
4761
4762         if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
4763                 ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
4764                     (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4765                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
4766                 if (ml3 == NULL) {
4767                         CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
4768                             " in pmap %p", va, pmap);
4769                         goto fail;
4770                 }
4771                 if (va < VM_MAXUSER_ADDRESS)
4772                         pmap_resident_count_inc(pmap, 1);
4773         }
4774
4775         l3phys = VM_PAGE_TO_PHYS(ml3);
4776         l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
4777
4778         /* The physical address that the range points at */
4779         phys = oldl2 & ~ATTR_MASK;
4780         /* The attributes from the old l2 entry to be copied */
4781         newl3 = (oldl2 & (ATTR_MASK & ~ATTR_DESCR_MASK)) | L3_PAGE;
4782
4783         /*
4784          * If the page table page is new, initialize it.
4785          */
4786         if (ml3->wire_count == 1) {
4787                 for (i = 0; i < Ln_ENTRIES; i++) {
4788                         l3[i] = newl3 | phys;
4789                         phys += L3_SIZE;
4790                 }
4791         }
4792         KASSERT(l3[0] == ((oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE),
4793             ("Invalid l3 page (%lx != %lx)", l3[0],
4794             (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE));
4795
4796         /*
4797          * Map the temporary page so we don't lose access to the l2 table.
4798          */
4799         if (tmpl2 != 0) {
4800                 pmap_kenter(tmpl2, PAGE_SIZE,
4801                     DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
4802                 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
4803         }
4804
4805         /*
4806          * The spare PV entries must be reserved prior to demoting the
4807          * mapping, that is, prior to changing the PDE.  Otherwise, the state
4808          * of the L2 and the PV lists will be inconsistent, which can result
4809          * in reclaim_pv_chunk() attempting to remove a PV entry from the
4810          * wrong PV list and pmap_pv_demote_l2() failing to find the expected
4811          * PV entry for the 2MB page mapping that is being demoted.
4812          */
4813         if ((oldl2 & ATTR_SW_MANAGED) != 0)
4814                 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
4815
4816         pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
4817
4818         /*
4819          * Demote the PV entry.
4820          */
4821         if ((oldl2 & ATTR_SW_MANAGED) != 0)
4822                 pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
4823
4824         atomic_add_long(&pmap_l2_demotions, 1);
4825         CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
4826             " in pmap %p %lx", va, pmap, l3[0]);
4827
4828 fail:
4829         if (tmpl2 != 0) {
4830                 pmap_kremove(tmpl2);
4831                 kva_free(tmpl2, PAGE_SIZE);
4832         }
4833
4834         return (l3);
4835
4836 }
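
/*
 * An illustrative note on the PV arithmetic above, assuming that
 * pmap_pv_demote_l2() reuses the original 2MB PV entry for one of the
 * resulting 4KB pages: a managed demotion turns one PV entry into
 * Ln_ENTRIES (512) entries, so reserve_pv_entries() is asked for
 * Ln_ENTRIES - 1 (511) new entries before the L2 entry is replaced.
 */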
4837
4838 static pt_entry_t *
4839 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
4840 {
4841         struct rwlock *lock;
4842         pt_entry_t *l3;
4843
4844         lock = NULL;
4845         l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
4846         if (lock != NULL)
4847                 rw_wunlock(lock);
4848         return (l3);
4849 }
4850
4851 /*
4852  * perform the pmap work for mincore
4853  */
4854 int
4855 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
4856 {
4857         pd_entry_t *l1p, l1;
4858         pd_entry_t *l2p, l2;
4859         pt_entry_t *l3p, l3;
4860         vm_paddr_t pa;
4861         bool managed;
4862         int val;
4863
4864         PMAP_LOCK(pmap);
4865 retry:
4866         pa = 0;
4867         val = 0;
4868         managed = false;
4869
4870         l1p = pmap_l1(pmap, addr);
4871         if (l1p == NULL) /* No l1 */
4872                 goto done;
4873
4874         l1 = pmap_load(l1p);
4875         if ((l1 & ATTR_DESCR_MASK) == L1_INVAL)
4876                 goto done;
4877
4878         if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
4879                 pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET);
4880                 managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4881                 val = MINCORE_SUPER | MINCORE_INCORE;
4882                 if (pmap_page_dirty(l1))
4883                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4884                 if ((l1 & ATTR_AF) == ATTR_AF)
4885                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4886                 goto done;
4887         }
4888
4889         l2p = pmap_l1_to_l2(l1p, addr);
4890         if (l2p == NULL) /* No l2 */
4891                 goto done;
4892
4893         l2 = pmap_load(l2p);
4894         if ((l2 & ATTR_DESCR_MASK) == L2_INVAL)
4895                 goto done;
4896
4897         if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4898                 pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET);
4899                 managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4900                 val = MINCORE_SUPER | MINCORE_INCORE;
4901                 if (pmap_page_dirty(l2))
4902                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4903                 if ((l2 & ATTR_AF) == ATTR_AF)
4904                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4905                 goto done;
4906         }
4907
4908         l3p = pmap_l2_to_l3(l2p, addr);
4909         if (l3p == NULL) /* No l3 */
4910                 goto done;
4911
4912         l3 = pmap_load(l3p);
4913         if ((l3 & ATTR_DESCR_MASK) == L3_INVAL)
4914                 goto done;
4915
4916         if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) {
4917                 pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET);
4918                 managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED;
4919                 val = MINCORE_INCORE;
4920                 if (pmap_page_dirty(l3))
4921                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4922                 if ((l3 & ATTR_AF) == ATTR_AF)
4923                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4924         }
4925
4926 done:
4927         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4928             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
4929                 /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
4930                 if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
4931                         goto retry;
4932         } else
4933                 PA_UNLOCK_COND(*locked_pa);
4934         PMAP_UNLOCK(pmap);
4935
4936         return (val);
4937 }
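
/*
 * In summary, the walk above reports MINCORE_SUPER | MINCORE_INCORE for L1
 * and L2 block mappings, MINCORE_INCORE for L3 pages, and additionally sets
 * MINCORE_MODIFIED* when the entry is dirty and MINCORE_REFERENCED* when
 * ATTR_AF is set.
 */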
4938
4939 void
4940 pmap_activate(struct thread *td)
4941 {
4942         pmap_t  pmap;
4943
4944         critical_enter();
4945         pmap = vmspace_pmap(td->td_proc->p_vmspace);
4946         td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
4947         __asm __volatile("msr ttbr0_el1, %0" : :
4948             "r"(td->td_proc->p_md.md_l0addr));
4949         pmap_invalidate_all(pmap);
4950         critical_exit();
4951 }
4952
4953 struct pcb *
4954 pmap_switch(struct thread *old, struct thread *new)
4955 {
4956         pcpu_bp_harden bp_harden;
4957         struct pcb *pcb;
4958
4959         /* Store the new curthread */
4960         PCPU_SET(curthread, new);
4961
4962         /* And the new pcb */
4963         pcb = new->td_pcb;
4964         PCPU_SET(curpcb, pcb);
4965
4966         /*
4967          * TODO: We may need to flush the cache here if switching
4968          * to a user process.
4969          */
4970
4971         if (old == NULL ||
4972             old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) {
4973                 __asm __volatile(
4974                     /* Switch to the new pmap */
4975                     "msr        ttbr0_el1, %0   \n"
4976                     "isb                        \n"
4977
4978                     /* Invalidate the TLB */
4979                     "dsb        ishst           \n"
4980                     "tlbi       vmalle1is       \n"
4981                     "dsb        ish             \n"
4982                     "isb                        \n"
4983                     : : "r"(new->td_proc->p_md.md_l0addr));
4984
4985                 /*
4986                  * Stop userspace from training the branch predictor against
4987                  * other processes. This will call into a CPU specific
4988                  * function that clears the branch predictor state.
4989                  */
4990                 bp_harden = PCPU_GET(bp_harden);
4991                 if (bp_harden != NULL)
4992                         bp_harden();
4993         }
4994
4995         return (pcb);
4996 }
4997
4998 void
4999 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
5000 {
5001
5002         if (va >= VM_MIN_KERNEL_ADDRESS) {
5003                 cpu_icache_sync_range(va, sz);
5004         } else {
5005                 u_int len, offset;
5006                 vm_paddr_t pa;
5007
5008                 /* Find the length of data in this page to flush */
5009                 offset = va & PAGE_MASK;
5010                 len = imin(PAGE_SIZE - offset, sz);
5011
5012                 while (sz != 0) {
5013                         /* Extract the physical address & find it in the DMAP */
5014                         pa = pmap_extract(pmap, va);
5015                         if (pa != 0)
5016                                 cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
5017
5018                         /* Move to the next page */
5019                         sz -= len;
5020                         va += len;
5021                         /* Set the length for the next iteration */
5022                         len = imin(PAGE_SIZE, sz);
5023                 }
5024         }
5025 }
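
/*
 * For illustration (the numbers are hypothetical): a sync of sz == 0x2000
 * bytes starting at a user va with page offset 0xf00 is split by the loop
 * above into flushes of 0x100, 0x1000 and 0xf00 bytes, each resolved
 * through pmap_extract() and performed on the DMAP alias of the page.
 */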
5026
5027 int
5028 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
5029 {
5030 #ifdef SMP
5031         register_t intr;
5032         uint64_t par;
5033
5034         switch (ESR_ELx_EXCEPTION(esr)) {
5035         case EXCP_INSN_ABORT_L:
5036         case EXCP_INSN_ABORT:
5037         case EXCP_DATA_ABORT_L:
5038         case EXCP_DATA_ABORT:
5039                 break;
5040         default:
5041                 return (KERN_FAILURE);
5042         }
5043
5044         /* Data and insn aborts use same encoding for FCS field. */
5045         /* Data and insn aborts use the same encoding for the FSC field. */
5046         case ISS_DATA_DFSC_TF_L0:
5047         case ISS_DATA_DFSC_TF_L1:
5048         case ISS_DATA_DFSC_TF_L2:
5049         case ISS_DATA_DFSC_TF_L3:
5050                 PMAP_LOCK(pmap);
5051                 /* Ask the MMU to check the address */
5052                 intr = intr_disable();
5053                 if (pmap == kernel_pmap)
5054                         par = arm64_address_translate_s1e1r(far);
5055                 else
5056                         par = arm64_address_translate_s1e0r(far);
5057                 intr_restore(intr);
5058                 PMAP_UNLOCK(pmap);
5059
5060                 /*
5061                  * If the translation was successful the address was invalid
5062                  * due to a break-before-make sequence. We can unlock and
5063                  * return success to the trap handler.
5064                  */
5065                 if (PAR_SUCCESS(par))
5066                         return (KERN_SUCCESS);
5067                 break;
5068         default:
5069                 break;
5070         }
5071 #endif
5072
5073         return (KERN_FAILURE);
5074 }
5075
5076 /*
5077  *      Increase the starting virtual address of the given mapping if a
5078  *      different alignment might result in more superpage mappings.
5079  */
5080 void
5081 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
5082     vm_offset_t *addr, vm_size_t size)
5083 {
5084         vm_offset_t superpage_offset;
5085
5086         if (size < L2_SIZE)
5087                 return;
5088         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5089                 offset += ptoa(object->pg_color);
5090         superpage_offset = offset & L2_OFFSET;
5091         if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
5092             (*addr & L2_OFFSET) == superpage_offset)
5093                 return;
5094         if ((*addr & L2_OFFSET) < superpage_offset)
5095                 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
5096         else
5097                 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
5098 }
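
/*
 * A worked example, assuming 2MB superpages: with offset % L2_SIZE ==
 * 0x1000, *addr == 0x40000000 and size == 4MiB, superpage_offset is 0x1000
 * and (*addr & L2_OFFSET) is 0, so *addr is adjusted to 0x40001000,
 * allowing the bulk of the mapping to be promoted to 2MB pages.
 */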
5099
5100 /**
5101  * Get the kernel virtual address of a set of physical pages. If there are
5102  * physical addresses not covered by the DMAP, perform a transient mapping
5103  * that will be removed when calling pmap_unmap_io_transient.
5104  *
5105  * \param page        The pages for which the caller wishes to obtain
5106  *                    kernel virtual addresses.
5107  * \param vaddr       On return contains the kernel virtual memory address
5108  *                    of the pages passed in the page parameter.
5109  * \param count       Number of pages passed in.
5110  * \param can_fault   TRUE if the thread using the mapped pages can take
5111  *                    page faults, FALSE otherwise.
5112  *
5113  * \returns TRUE if the caller must call pmap_unmap_io_transient when
5114  *          finished or FALSE otherwise.
5115  *
5116  */
5117 boolean_t
5118 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5119     boolean_t can_fault)
5120 {
5121         vm_paddr_t paddr;
5122         boolean_t needs_mapping;
5123         int error, i;
5124
5125         /*
5126          * Allocate any KVA space that we need, this is done in a separate
5127          * loop to prevent calling vmem_alloc while pinned.
5128          */
5129         needs_mapping = FALSE;
5130         for (i = 0; i < count; i++) {
5131                 paddr = VM_PAGE_TO_PHYS(page[i]);
5132                 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
5133                         error = vmem_alloc(kernel_arena, PAGE_SIZE,
5134                             M_BESTFIT | M_WAITOK, &vaddr[i]);
5135                         KASSERT(error == 0, ("vmem_alloc failed: %d", error));
5136                         needs_mapping = TRUE;
5137                 } else {
5138                         vaddr[i] = PHYS_TO_DMAP(paddr);
5139                 }
5140         }
5141
5142         /* Exit early if everything is covered by the DMAP */
5143         if (!needs_mapping)
5144                 return (FALSE);
5145
5146         if (!can_fault)
5147                 sched_pin();
5148         for (i = 0; i < count; i++) {
5149                 paddr = VM_PAGE_TO_PHYS(page[i]);
5150                 if (!PHYS_IN_DMAP(paddr)) {
5151                         panic(
5152                            "pmap_map_io_transient: TODO: Map out of DMAP data");
5153                 }
5154         }
5155
5156         return (needs_mapping);
5157 }
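
/*
 * An illustrative caller pattern (the "pages", "vaddrs" and "npages" names
 * are placeholders):
 *
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(pages, vaddrs, npages, FALSE);
 *	... access the pages through vaddrs[] ...
 *	if (mapped)
 *		pmap_unmap_io_transient(pages, vaddrs, npages, FALSE);
 *
 * When a transient mapping is needed and can_fault is FALSE, the thread
 * remains pinned until pmap_unmap_io_transient() unpins it.
 */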
5158
5159 void
5160 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
5161     boolean_t can_fault)
5162 {
5163         vm_paddr_t paddr;
5164         int i;
5165
5166         if (!can_fault)
5167                 sched_unpin();
5168         for (i = 0; i < count; i++) {
5169                 paddr = VM_PAGE_TO_PHYS(page[i]);
5170                 if (!PHYS_IN_DMAP(paddr)) {
5171                         panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
5172                 }
5173         }
5174 }