1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2018 Matthew Macy
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include <sys/param.h>
32 #include <sys/kernel.h>
33 #include <sys/systm.h>
34 #include <sys/conf.h>
35 #include <sys/bitstring.h>
36 #include <sys/queue.h>
37 #include <sys/cpuset.h>
38 #include <sys/endian.h>
39 #include <sys/kerneldump.h>
40 #include <sys/ktr.h>
41 #include <sys/lock.h>
42 #include <sys/syslog.h>
43 #include <sys/msgbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/mman.h>
46 #include <sys/mutex.h>
47 #include <sys/proc.h>
48 #include <sys/rwlock.h>
49 #include <sys/sched.h>
50 #include <sys/sysctl.h>
51 #include <sys/systm.h>
52 #include <sys/vmem.h>
53 #include <sys/vmmeter.h>
54 #include <sys/smp.h>
55
56 #include <sys/kdb.h>
57
58 #include <dev/ofw/openfirm.h>
59
60 #include <vm/vm.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_param.h>
63 #include <vm/vm_kern.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_pageout.h>
69 #include <vm/vm_phys.h>
70 #include <vm/vm_reserv.h>
71 #include <vm/uma.h>
72
73 #include <machine/_inttypes.h>
74 #include <machine/cpu.h>
75 #include <machine/platform.h>
76 #include <machine/frame.h>
77 #include <machine/md_var.h>
78 #include <machine/psl.h>
79 #include <machine/bat.h>
80 #include <machine/hid.h>
81 #include <machine/pte.h>
82 #include <machine/sr.h>
83 #include <machine/trap.h>
84 #include <machine/mmuvar.h>
85
86 #ifdef INVARIANTS
87 #include <vm/uma_dbg.h>
88 #endif
89
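/*
 * These helpers use the IBM (MSB-0) bit numbering convention for a
 * 64-bit long: PPC_BIT(0) is 1UL << 63 (the most significant bit) and
 * PPC_BIT(63) is 1.
 */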
90 #define PPC_BITLSHIFT(bit)      (sizeof(long)*NBBY - 1 - (bit))
91 #define PPC_BIT(bit)            (1UL << PPC_BITLSHIFT(bit))
92 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
93
94 #include "opt_ddb.h"
95 #ifdef DDB
96 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
97 #endif
98
99 #define PG_W    RPTE_WIRED
100 #define PG_V    RPTE_VALID
101 #define PG_MANAGED      RPTE_MANAGED
102 #define PG_PROMOTED     RPTE_PROMOTED
103 #define PG_M    RPTE_C
104 #define PG_A    RPTE_R
105 #define PG_X    RPTE_EAA_X
106 #define PG_RW   RPTE_EAA_W
107 #define PG_PTE_CACHE RPTE_ATTR_MASK
108
109 #define RPTE_SHIFT 9
110 #define NLS_MASK ((1UL<<5)-1)
111 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
112 #define RPTE_MASK (RPTE_ENTRIES-1)
113
114 #define NLB_SHIFT 0
115 #define NLB_MASK (((1UL<<52)-1) << 8)
116
117 extern int nkpt;
118 extern caddr_t crashdumpmap;
119
120 #define RIC_FLUSH_TLB 0
121 #define RIC_FLUSH_PWC 1
122 #define RIC_FLUSH_ALL 2
123
124 #define POWER9_TLB_SETS_RADIX   128     /* # sets in POWER9 TLB Radix mode */
125
126 #define PPC_INST_TLBIE                  0x7c000264
127 #define PPC_INST_TLBIEL                 0x7c000224
128 #define PPC_INST_SLBIA                  0x7c0003e4
129
130 #define ___PPC_RA(a)    (((a) & 0x1f) << 16)
131 #define ___PPC_RB(b)    (((b) & 0x1f) << 11)
132 #define ___PPC_RS(s)    (((s) & 0x1f) << 21)
133 #define ___PPC_RT(t)    ___PPC_RS(t)
134 #define ___PPC_R(r)     (((r) & 0x1) << 16)
135 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17)
136 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18)
137
138 #define PPC_SLBIA(IH)   __XSTRING(.long PPC_INST_SLBIA | \
139                                        ((IH & 0x7) << 21))
140 #define PPC_TLBIE_5(rb,rs,ric,prs,r)                            \
141         __XSTRING(.long PPC_INST_TLBIE |                        \
142                           ___PPC_RB(rb) | ___PPC_RS(rs) |       \
143                           ___PPC_RIC(ric) | ___PPC_PRS(prs) |   \
144                           ___PPC_R(r))
145
146 #define PPC_TLBIEL(rb,rs,ric,prs,r) \
147          __XSTRING(.long PPC_INST_TLBIEL | \
148                            ___PPC_RB(rb) | ___PPC_RS(rs) |      \
149                            ___PPC_RIC(ric) | ___PPC_PRS(prs) |  \
150                            ___PPC_R(r))
151
152 #define PPC_INVALIDATE_ERAT             PPC_SLBIA(7)
153
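/*
 * Wait for outstanding broadcast TLB invalidations to finish: eieio
 * orders the preceding tlbie(s) ahead of the tlbsync, tlbsync waits
 * for those invalidations to be performed by all processors, and the
 * final ptesync ensures the sequence has completed before any
 * subsequent storage access or translation.
 */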
154 static __inline void
155 ttusync(void)
156 {
157         __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
158 }
159
160 #define TLBIEL_INVAL_SEL_MASK   0xc00   /* invalidation selector */
161 #define  TLBIEL_INVAL_PAGE      0x000   /* invalidate a single page */
162 #define  TLBIEL_INVAL_SET_PID   0x400   /* invalidate a set for the current PID */
163 #define  TLBIEL_INVAL_SET_LPID  0x800   /* invalidate a set for current LPID */
164 #define  TLBIEL_INVAL_SET       0xc00   /* invalidate a set for all LPIDs */
165
166 #define TLBIE_ACTUAL_PAGE_MASK          0xe0
167 #define  TLBIE_ACTUAL_PAGE_4K           0x00
168 #define  TLBIE_ACTUAL_PAGE_64K          0xa0
169 #define  TLBIE_ACTUAL_PAGE_2M           0x20
170 #define  TLBIE_ACTUAL_PAGE_1G           0x40
171
172 #define TLBIE_PRS_PARTITION_SCOPE       0x0
173 #define TLBIE_PRS_PROCESS_SCOPE 0x1
174
175 #define TLBIE_RIC_INVALIDATE_TLB        0x0     /* Invalidate just TLB */
176 #define TLBIE_RIC_INVALIDATE_PWC        0x1     /* Invalidate just PWC */
177 #define TLBIE_RIC_INVALIDATE_ALL        0x2     /* Invalidate TLB, PWC,
178                                                  * cached {proc, part}tab entries
179                                                  */
180 #define TLBIE_RIC_INVALIDATE_SEQ        0x3     /* HPT - only:
181                                                  * Invalidate a range of translations
182                                                  */
183
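/*
 * Issue a single tlbie.  The operands are assembled here as the
 * instruction expects them: RS carries the PID in its upper 32 bits
 * and the LPID in its lower 32 bits, while RB combines the
 * page-aligned effective address with the invalidation selector (is)
 * and actual page size (ap) fields.  The RIC, PRS and R fields are
 * encoded directly into the instruction word by PPC_TLBIE_5().
 */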
184 static __always_inline void
185 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
186                         vm_offset_t va, uint16_t ap)
187 {
188         uint64_t rb, rs;
189
190         MPASS((va & PAGE_MASK) == 0);
191
192         rs = ((uint64_t)pid << 32) | lpid;
193         rb = va | is | ap;
194         __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
195                 "r" (rb), "r" (rs), "i" (ric), "i" (prs));
196 }
197
198 static __inline void
199 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
200 {
201
202         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
203                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
204 }
205
206 static __inline void
207 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
208 {
209
210         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
211                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
212 }
213
214 static __inline void
215 radix_tlbie_invlpwc_user(uint32_t pid)
216 {
217
218         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
219                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
220 }
221
222 static __inline void
223 radix_tlbie_flush_user(uint32_t pid)
224 {
225
226         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
227                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
228 }
229
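/*
 * Kernel addresses (effective-address quadrant 3) are translated via
 * the PID-0 process table entry, so the kernel variants below use
 * process scope with a PID of 0.
 */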
230 static __inline void
231 radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
232 {
233
234         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
235             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
236 }
237
238 static __inline void
239 radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
240 {
241
242         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
243             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
244 }
245
246 /* 1GB pages aren't currently supported. */
247 static __inline __unused void
248 radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
249 {
250
251         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
252             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
253 }
254
255 static __inline void
256 radix_tlbie_invlpwc_kernel(void)
257 {
258
259         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
260             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
261 }
262
263 static __inline void
264 radix_tlbie_flush_kernel(void)
265 {
266
267         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
268             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
269 }
270
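/*
 * The radix tree used here has four levels: a 13-bit L1 index (the
 * 8192-entry, 64KB page directory), 9-bit L2, L3 and PTE indexes, and
 * a 12-bit page offset, covering a 52-bit virtual address space.  The
 * helpers below extract the individual indexes from a VA.
 */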
271 static __inline vm_pindex_t
272 pmap_l3e_pindex(vm_offset_t va)
273 {
274         return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
275 }
276
277 static __inline vm_pindex_t
278 pmap_pml3e_index(vm_offset_t va)
279 {
280
281         return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
282 }
283
284 static __inline vm_pindex_t
285 pmap_pml2e_index(vm_offset_t va)
286 {
287         return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
288 }
289
290 static __inline vm_pindex_t
291 pmap_pml1e_index(vm_offset_t va)
292 {
293         return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
294 }
295
296 /* Return various clipped indexes for a given VA */
297 static __inline vm_pindex_t
298 pmap_pte_index(vm_offset_t va)
299 {
300
301         return ((va >> PAGE_SHIFT) & RPTE_MASK);
302 }
303
304 /* Return a pointer to the PT slot that corresponds to a VA */
305 static __inline pt_entry_t *
306 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
307 {
308         pt_entry_t *pte;
309         vm_paddr_t ptepa;
310
311         ptepa = (*l3e & NLB_MASK);
312         pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
313         return (&pte[pmap_pte_index(va)]);
314 }
315
316 /* Return a pointer to the PD slot that corresponds to a VA */
317 static __inline pt_entry_t *
318 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
319 {
320         pt_entry_t *l3e;
321         vm_paddr_t l3pa;
322
323         l3pa = (*l2e & NLB_MASK);
324         l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
325         return (&l3e[pmap_pml3e_index(va)]);
326 }
327
328 /* Return a pointer to the PDP (L2) slot that corresponds to a VA */
329 static __inline pt_entry_t *
330 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
331 {
332         pt_entry_t *l2e;
333         vm_paddr_t l2pa;
334
335         l2pa = (*l1e & NLB_MASK);
336
337         l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
338         return (&l2e[pmap_pml2e_index(va)]);
339 }
340
341 static __inline pml1_entry_t *
342 pmap_pml1e(pmap_t pmap, vm_offset_t va)
343 {
344
345         return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
346 }
347
348 static pt_entry_t *
349 pmap_pml2e(pmap_t pmap, vm_offset_t va)
350 {
351         pt_entry_t *l1e;
352
353         l1e = pmap_pml1e(pmap, va);
354         if (l1e == NULL || (*l1e & RPTE_VALID) == 0)
355                 return (NULL);
356         return (pmap_l1e_to_l2e(l1e, va));
357 }
358
359 static __inline pt_entry_t *
360 pmap_pml3e(pmap_t pmap, vm_offset_t va)
361 {
362         pt_entry_t *l2e;
363
364         l2e = pmap_pml2e(pmap, va);
365         if (l2e == NULL || (*l2e & RPTE_VALID) == 0)
366                 return (NULL);
367         return (pmap_l2e_to_l3e(l2e, va));
368 }
369
370 static __inline pt_entry_t *
371 pmap_pte(pmap_t pmap, vm_offset_t va)
372 {
373         pt_entry_t *l3e;
374
375         l3e = pmap_pml3e(pmap, va);
376         if (l3e == NULL || (*l3e & RPTE_VALID) == 0)
377                 return (NULL);
378         return (pmap_l3e_to_pte(l3e, va));
379 }
380
381 int nkpt = 64;
382 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
383     "Number of kernel page table pages allocated on bootup");
384
385 vm_paddr_t dmaplimit;
386
387 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
388
389 static int pg_ps_enabled = 1;
390 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
391     &pg_ps_enabled, 0, "Are large page mappings enabled?");
392 #ifdef INVARIANTS
393 #define VERBOSE_PMAP 0
394 #define VERBOSE_PROTECT 0
395 static int pmap_logging;
396 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
397     &pmap_logging, 0, "verbose debug logging");
398 #endif
399
400 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
401
402 //static vm_paddr_t     KERNend;        /* phys addr of end of bootstrap data */
403
404 static vm_offset_t qframe = 0;
405 static struct mtx qframe_mtx;
406
407 void mmu_radix_activate(struct thread *);
408 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
409 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
410     vm_size_t);
411 void mmu_radix_clear_modify(vm_page_t);
412 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
413 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
414 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
415 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
416         vm_prot_t);
417 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
418 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
419 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
420 void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
421 vm_paddr_t mmu_radix_kextract(vm_offset_t);
422 void mmu_radix_kremove(vm_offset_t);
423 boolean_t mmu_radix_is_modified(vm_page_t);
424 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
425 boolean_t mmu_radix_is_referenced(vm_page_t);
426 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
427         vm_pindex_t, vm_size_t);
428 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
429 void mmu_radix_page_init(vm_page_t);
430 boolean_t mmu_radix_page_is_mapped(vm_page_t m);
431 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
432 int mmu_radix_page_wired_mappings(vm_page_t);
433 int mmu_radix_pinit(pmap_t);
434 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
435 bool mmu_radix_ps_enabled(pmap_t);
436 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
437 void mmu_radix_qremove(vm_offset_t, int);
438 vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
439 void mmu_radix_quick_remove_page(vm_offset_t);
440 boolean_t mmu_radix_ts_referenced(vm_page_t);
441 void mmu_radix_release(pmap_t);
442 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
443 void mmu_radix_remove_all(vm_page_t);
444 void mmu_radix_remove_pages(pmap_t);
445 void mmu_radix_remove_write(vm_page_t);
446 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
447 void mmu_radix_zero_page(vm_page_t);
448 void mmu_radix_zero_page_area(vm_page_t, int, int);
449 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
450 void mmu_radix_page_array_startup(long pages);
451
452 #include "mmu_oea64.h"
453
454 /*
455  * Kernel MMU interface
456  */
457
458 static void     mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
459
460 static void mmu_radix_copy_page(vm_page_t, vm_page_t);
461 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
462     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
463 static void mmu_radix_growkernel(vm_offset_t);
464 static void mmu_radix_init(void);
465 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
466 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
467 static void mmu_radix_pinit0(pmap_t);
468
469 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
470 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
471 static void mmu_radix_unmapdev(vm_offset_t, vm_size_t);
472 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
473 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
474 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
475 static void mmu_radix_scan_init(void);
476 static void     mmu_radix_cpu_bootstrap(int ap);
477 static void     mmu_radix_tlbie_all(void);
478
479 static struct pmap_funcs mmu_radix_methods = {
480         .bootstrap = mmu_radix_bootstrap,
481         .copy_page = mmu_radix_copy_page,
482         .copy_pages = mmu_radix_copy_pages,
483         .cpu_bootstrap = mmu_radix_cpu_bootstrap,
484         .growkernel = mmu_radix_growkernel,
485         .init = mmu_radix_init,
486         .map =                  mmu_radix_map,
487         .mincore =              mmu_radix_mincore,
488         .pinit = mmu_radix_pinit,
489         .pinit0 = mmu_radix_pinit0,
490
491         .mapdev = mmu_radix_mapdev,
492         .mapdev_attr = mmu_radix_mapdev_attr,
493         .unmapdev = mmu_radix_unmapdev,
494         .kenter_attr = mmu_radix_kenter_attr,
495         .dev_direct_mapped = mmu_radix_dev_direct_mapped,
496         .dumpsys_pa_init = mmu_radix_scan_init,
497         .dumpsys_map_chunk = mmu_radix_dumpsys_map,
498         .page_is_mapped = mmu_radix_page_is_mapped,
499         .ps_enabled = mmu_radix_ps_enabled,
500         .object_init_pt = mmu_radix_object_init_pt,
501         .protect = mmu_radix_protect,
502         /* pmap dispatcher interface */
503         .clear_modify = mmu_radix_clear_modify,
504         .copy = mmu_radix_copy,
505         .enter = mmu_radix_enter,
506         .enter_object = mmu_radix_enter_object,
507         .enter_quick = mmu_radix_enter_quick,
508         .extract = mmu_radix_extract,
509         .extract_and_hold = mmu_radix_extract_and_hold,
510         .is_modified = mmu_radix_is_modified,
511         .is_prefaultable = mmu_radix_is_prefaultable,
512         .is_referenced = mmu_radix_is_referenced,
513         .ts_referenced = mmu_radix_ts_referenced,
514         .page_exists_quick = mmu_radix_page_exists_quick,
515         .page_init = mmu_radix_page_init,
516         .page_wired_mappings =  mmu_radix_page_wired_mappings,
517         .qenter = mmu_radix_qenter,
518         .qremove = mmu_radix_qremove,
519         .release = mmu_radix_release,
520         .remove = mmu_radix_remove,
521         .remove_all = mmu_radix_remove_all,
522         .remove_write = mmu_radix_remove_write,
523         .unwire = mmu_radix_unwire,
524         .zero_page = mmu_radix_zero_page,
525         .zero_page_area = mmu_radix_zero_page_area,
526         .activate = mmu_radix_activate,
527         .quick_enter_page =  mmu_radix_quick_enter_page,
528         .quick_remove_page =  mmu_radix_quick_remove_page,
529         .page_set_memattr = mmu_radix_page_set_memattr,
530         .page_array_startup =  mmu_radix_page_array_startup,
531
532         /* Internal interfaces */
533         .kenter = mmu_radix_kenter,
534         .kextract = mmu_radix_kextract,
535         .kremove = mmu_radix_kremove,
536         .change_attr = mmu_radix_change_attr,
537         .decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
538
539         .tlbie_all = mmu_radix_tlbie_all,
540 };
541
542 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
543
544 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
545         struct rwlock **lockp);
546 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
547 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
548 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
549     struct spglist *free, struct rwlock **lockp);
550 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
551     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
552 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
553 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
554     struct spglist *free);
555 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
556         pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
557
558 static bool     pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
559                     u_int flags, struct rwlock **lockp);
560 #if VM_NRESERVLEVEL > 0
561 static void     pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
562         struct rwlock **lockp);
563 #endif
564 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
565 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
566 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
567         vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
568
569 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
570         vm_prot_t prot, struct rwlock **lockp);
571 static int      pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
572         u_int flags, vm_page_t m, struct rwlock **lockp);
573
574 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
575 static void free_pv_chunk(struct pv_chunk *pc);
576 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
577 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
578         struct rwlock **lockp);
579 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
580         struct rwlock **lockp);
581 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
582     struct spglist *free);
583 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
584
585 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
586 static void pmap_invalidate_all(pmap_t pmap);
587 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
588
589 /*
590  * Internal flags for pmap_enter()'s helper functions.
591  */
592 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
593 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
594
595 #define UNIMPLEMENTED() panic("%s not implemented", __func__)
596 #define UNTESTED() panic("%s not yet tested", __func__)
597
598 /* Number of supported PID bits */
599 static unsigned int isa3_pid_bits;
600
601 /* PID to start allocating from */
602 static unsigned int isa3_base_pid;
603
604 #define PROCTAB_SIZE_SHIFT      (isa3_pid_bits + 4)
605 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits)
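/* Each process-table entry is 2 doublewords (16 bytes), hence the "+ 4". */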
606
607 /*
608  * Map of physical memory regions.
609  */
610 static struct   mem_region *regions, *pregions;
611 static struct   numa_mem_region *numa_pregions;
612 static u_int    phys_avail_count;
613 static int      regions_sz, pregions_sz, numa_pregions_sz;
614 static struct pate *isa3_parttab;
615 static struct prte *isa3_proctab;
616 static vmem_t *asid_arena;
617
618 extern void bs_remap_earlyboot(void);
619
620 #define RADIX_PGD_SIZE_SHIFT    16
621 #define RADIX_PGD_SIZE  (1UL << RADIX_PGD_SIZE_SHIFT)
622
623 #define RADIX_PGD_INDEX_SHIFT   (RADIX_PGD_SIZE_SHIFT-3)
624 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
625 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
626
627 #define NUPML1E         (RADIX_PGD_SIZE/sizeof(uint64_t))       /* number of userland PML1 pages */
628 #define NUPDPE          (NUPML1E * NL2EPG)/* number of userland PDP pages */
629 #define NUPDE           (NUPDPE * NL3EPG)       /* number of userland PD entries */
630
631 /* POWER9 only permits a 64k partition table size. */
632 #define PARTTAB_SIZE_SHIFT      16
633 #define PARTTAB_SIZE    (1UL << PARTTAB_SIZE_SHIFT)
634
635 #define PARTTAB_HR              (1UL << 63) /* host uses radix */
636 #define PARTTAB_GR              (1UL << 63) /* guest uses radix; must match host */
637
638 /* TLB flush actions. Used as argument to tlbiel_all() */
639 enum {
640         TLB_INVAL_SCOPE_LPID = 0,       /* invalidate TLBs for current LPID */
641         TLB_INVAL_SCOPE_GLOBAL = 1,     /* invalidate all TLBs */
642 };
643
644 #define NPV_LIST_LOCKS  MAXCPU
645 static int pmap_initialized;
646 static vm_paddr_t proctab0pa;
647 static vm_paddr_t parttab_phys;
648 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
649
650 /*
651  * Data for the pv entry allocation mechanism.
654  */
655 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
656 static struct mtx __exclusive_cache_line pv_chunks_mutex;
657 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
658 static struct md_page *pv_table;
659 static struct md_page pv_dummy;
660
661 #ifdef PV_STATS
662 #define PV_STAT(x)      do { x ; } while (0)
663 #else
664 #define PV_STAT(x)      do { } while (0)
665 #endif
666
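/*
 * The pv_table is indexed by 2MB (L3) superpage frame: one md_page
 * entry covers each L3_PAGE_SIZE chunk of physical memory.
 */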
667 #define pa_radix_index(pa)      ((pa) >> L3_PAGE_SIZE_SHIFT)
668 #define pa_to_pvh(pa)   (&pv_table[pa_radix_index(pa)])
669
670 #define PHYS_TO_PV_LIST_LOCK(pa)        \
671                         (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
672
673 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
674         struct rwlock **_lockp = (lockp);               \
675         struct rwlock *_new_lock;                       \
676                                                         \
677         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
678         if (_new_lock != *_lockp) {                     \
679                 if (*_lockp != NULL)                    \
680                         rw_wunlock(*_lockp);            \
681                 *_lockp = _new_lock;                    \
682                 rw_wlock(*_lockp);                      \
683         }                                               \
684 } while (0)
685
686 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
687         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
688
689 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
690         struct rwlock **_lockp = (lockp);               \
691                                                         \
692         if (*_lockp != NULL) {                          \
693                 rw_wunlock(*_lockp);                    \
694                 *_lockp = NULL;                         \
695         }                                               \
696 } while (0)
697
698 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
699         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
700
701 /*
702  * We support a 52-bit virtual address space, so the radix tree size
703  * (RTS) field is 52 - 31 = 21, i.e. 0b10101.
704  * RTS encoding details:
705  * low 3 bits of rts (0b101) -> bits 5 - 7 of the unsigned long
706  * high 2 bits of rts (0b10) -> bits 61 - 62 of the unsigned long
707  */
708 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
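/* i.e., RTS_SIZE evaluates to 0x40000000000000a0. */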
709
710 static int powernv_enabled = 1;
711
712 static __always_inline void
713 tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
714         uint32_t pid, uint32_t ric, uint32_t prs)
715 {
716         uint64_t rb;
717         uint64_t rs;
718
719         rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
720         rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
721
722         __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
723                      : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
724                      : "memory");
725 }
726
727 static void
728 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
729 {
730         uint32_t set;
731
732         __asm __volatile("ptesync": : :"memory");
733
734         /*
735          * Flush the first set of the TLB, and the entire Page Walk Cache
736          * and partition table entries. Then flush the remaining sets of the
737          * TLB.
738          */
739         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
740         for (set = 1; set < num_sets; set++)
741                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
742
743         /* Do the same for process scoped entries. */
744         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
745         for (set = 1; set < num_sets; set++)
746                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
747
748         __asm __volatile("ptesync": : :"memory");
749 }
750
751 static void
752 mmu_radix_tlbiel_flush(int scope)
753 {
754         int is;
755
756         MPASS(scope == TLB_INVAL_SCOPE_LPID ||
757                   scope == TLB_INVAL_SCOPE_GLOBAL);
758         is = scope + 2;
759
760         tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is);
761         __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
762 }
763
764 static void
765 mmu_radix_tlbie_all()
766 {
767         /* TODO: LPID invalidate */
768         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
769 }
770
771 static void
772 mmu_radix_init_amor(void)
773 {
774         /*
775          * In HV mode, initialize AMOR (Authority Mask Override Register) so
776          * that the hypervisor and guest can set up the IAMR (Instruction
777          * Authority Mask Register): enable key 0 and set it to 1.
778          *
779          * AMOR = 0b1100 .... 0000 (mask for key 0 is 0b11)
780          */
781         mtspr(SPR_AMOR, (3ul << 62));
782 }
783
784 static void
785 mmu_radix_init_iamr(void)
786 {
787         /*
788          * Radix always uses key0 of the IAMR to determine if an access is
789          * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
790          * fetch.
791          */
792         mtspr(SPR_IAMR, (1ul << 62));
793 }
794
795 static void
796 mmu_radix_pid_set(pmap_t pmap)
797 {
798
799         mtspr(SPR_PID, pmap->pm_pid);
800         isync();
801 }
802
803 /* Quick sort callout for comparing physical addresses. */
804 static int
805 pa_cmp(const void *a, const void *b)
806 {
807         const vm_paddr_t *pa = a, *pb = b;
808
809         if (*pa < *pb)
810                 return (-1);
811         else if (*pa > *pb)
812                 return (1);
813         else
814                 return (0);
815 }
816
817 #define pte_load_store(ptep, pte)       atomic_swap_long(ptep, pte)
818 #define pte_load_clear(ptep)            atomic_swap_long(ptep, 0)
819 #define pte_store(ptep, pte) do {          \
820         MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));  \
821         *(u_long *)(ptep) = (u_long)((pte) | PG_V | RPTE_LEAF); \
822 } while (0)
823 /*
824  * NB: should only be used for adding directories - not for direct mappings
825  */
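/*
 * The low bits (NLS_MASK) of a directory entry hold the next-level
 * size; ORing in RPTE_SHIFT (9) records that the next level is a
 * 512-entry (RPTE_ENTRIES) table.
 */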
826 #define pde_store(ptep, pa) do {                                \
827         *(u_long *)(ptep) = (u_long)(pa|RPTE_VALID|RPTE_SHIFT); \
828 } while (0)
829
830 #define pte_clear(ptep) do {                                    \
831                 *(u_long *)(ptep) = (u_long)(0);                \
832 } while (0)
833
834 #define PMAP_PDE_SUPERPAGE      (1 << 8)        /* supports 2MB superpages */
835
836 /*
837  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
838  * (PTE) page mappings have identical settings for the following fields:
839  */
840 #define PG_PTE_PROMOTE  (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
841             PG_M | PG_A | RPTE_EAA_MASK | PG_V)
842
843 static __inline void
844 pmap_resident_count_inc(pmap_t pmap, int count)
845 {
846
847         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
848         pmap->pm_stats.resident_count += count;
849 }
850
851 static __inline void
852 pmap_resident_count_dec(pmap_t pmap, int count)
853 {
854
855         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
856         KASSERT(pmap->pm_stats.resident_count >= count,
857             ("pmap %p resident count underflow %ld %d", pmap,
858             pmap->pm_stats.resident_count, count));
859         pmap->pm_stats.resident_count -= count;
860 }
861
862 static void
863 pagezero(vm_offset_t va)
864 {
865         va = trunc_page(va);
866
867         bzero((void *)va, PAGE_SIZE);
868 }
869
870 static uint64_t
871 allocpages(int n)
872 {
873         u_int64_t ret;
874
875         ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
876         for (int i = 0; i < n; i++)
877                 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
878         return (ret);
879 }
880
881 static pt_entry_t *
882 kvtopte(vm_offset_t va)
883 {
884         pt_entry_t *l3e;
885
886         l3e = pmap_pml3e(kernel_pmap, va);
887         if ((*l3e & RPTE_VALID) == 0)
888                 return (NULL);
889         return (pmap_l3e_to_pte(l3e, va));
890 }
891
892 void
893 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
894 {
895         pt_entry_t *pte;
896
897         pte = kvtopte(va);
898         MPASS(pte != NULL);
899         *pte = pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | RPTE_EAA_W | \
900             RPTE_EAA_P | PG_M | PG_A;
901 }
902
903 bool
904 mmu_radix_ps_enabled(pmap_t pmap)
905 {
906         return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
907 }
908
909 static pt_entry_t *
910 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
911 {
912         pml3_entry_t *l3e;
913         pt_entry_t *pte;
914
915         va &= PG_PS_FRAME;
916         l3e = pmap_pml3e(pmap, va);
917         if (l3e == NULL || (*l3e & PG_V) == 0)
918                 return (NULL);
919
920         if (*l3e & RPTE_LEAF) {
921                 *is_l3e = 1;
922                 return (l3e);
923         }
924         *is_l3e = 0;
925         va &= PG_FRAME;
926         pte = pmap_l3e_to_pte(l3e, va);
927         if (pte == NULL || (*pte & PG_V) == 0)
928                 return (NULL);
929         return (pte);
930 }
931
932 int
933 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
934 {
935         pt_entry_t *pte;
936         pt_entry_t startpte, origpte, newpte;
937         vm_page_t m;
938         int is_l3e;
939
940         startpte = 0;
941  retry:
942         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
943                 return (KERN_INVALID_ADDRESS);
944         origpte = newpte = *pte;
945         if (startpte == 0) {
946                 startpte = origpte;
947                 if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
948                     ((flags & VM_PROT_READ) && (startpte & PG_A))) {
949                         pmap_invalidate_all(pmap);
950 #ifdef INVARIANTS
951                         if (VERBOSE_PMAP || pmap_logging)
952                                 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
953                                     __func__, pmap, va, flags, origpte);
954 #endif
955                         return (KERN_FAILURE);
956                 }
957         }
958 #ifdef INVARIANTS
959         if (VERBOSE_PMAP || pmap_logging)
960                 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
961                     flags, origpte);
962 #endif
963         PMAP_LOCK(pmap);
964         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
965             *pte != origpte) {
966                 PMAP_UNLOCK(pmap);
967                 return (KERN_FAILURE);
968         }
969         m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
970         MPASS(m != NULL);
971         switch (flags) {
972         case VM_PROT_READ:
973                 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
974                         goto protfail;
975                 newpte |= PG_A;
976                 vm_page_aflag_set(m, PGA_REFERENCED);
977                 break;
978         case VM_PROT_WRITE:
979                 if ((newpte & RPTE_EAA_W) == 0)
980                         goto protfail;
981                 if (is_l3e)
982                         goto protfail;
983                 newpte |= PG_M;
984                 vm_page_dirty(m);
985                 break;
986         case VM_PROT_EXECUTE:
987                 if ((newpte & RPTE_EAA_X) == 0)
988                         goto protfail;
989                 newpte |= PG_A;
990                 vm_page_aflag_set(m, PGA_REFERENCED);
991                 break;
992         }
993
994         if (!atomic_cmpset_long(pte, origpte, newpte))
995                 goto retry;
996         ptesync();
997         PMAP_UNLOCK(pmap);
998         if (startpte == newpte)
999                 return (KERN_FAILURE);
1000         return (0);
1001  protfail:
1002         PMAP_UNLOCK(pmap);
1003         return (KERN_PROTECTION_FAILURE);
1004 }
1005
1006 /*
1007  * Returns TRUE if the given page is mapped individually or as part of
1008  * a 2mpage.  Otherwise, returns FALSE.
1009  */
1010 boolean_t
1011 mmu_radix_page_is_mapped(vm_page_t m)
1012 {
1013         struct rwlock *lock;
1014         boolean_t rv;
1015
1016         if ((m->oflags & VPO_UNMANAGED) != 0)
1017                 return (FALSE);
1018         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
1019         rw_rlock(lock);
1020         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
1021             ((m->flags & PG_FICTITIOUS) == 0 &&
1022             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
1023         rw_runlock(lock);
1024         return (rv);
1025 }
1026
1027 /*
1028  * Determine the appropriate bits to set in a PTE or PDE for a specified
1029  * caching mode.
1030  */
1031 static int
1032 pmap_cache_bits(vm_memattr_t ma)
1033 {
1034         if (ma != VM_MEMATTR_DEFAULT) {
1035                 switch (ma) {
1036                 case VM_MEMATTR_UNCACHEABLE:
1037                         return (RPTE_ATTR_GUARDEDIO);
1038                 case VM_MEMATTR_CACHEABLE:
1039                         return (RPTE_ATTR_MEM);
1040                 case VM_MEMATTR_WRITE_BACK:
1041                 case VM_MEMATTR_PREFETCHABLE:
1042                 case VM_MEMATTR_WRITE_COMBINING:
1043                         return (RPTE_ATTR_UNGUARDEDIO);
1044                 }
1045         }
1046         return (0);
1047 }
1048
1049 static void
1050 pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
1051 {
1052         ptesync();
1053         if (pmap == kernel_pmap)
1054                 radix_tlbie_invlpg_kernel_4k(start);
1055         else
1056                 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1057         ttusync();
1058 }
1059
1060 static void
1061 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
1062 {
1063         ptesync();
1064         if (pmap == kernel_pmap)
1065                 radix_tlbie_invlpg_kernel_2m(start);
1066         else
1067                 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
1068         ttusync();
1069 }
1070
1071 static void
1072 pmap_invalidate_pwc(pmap_t pmap)
1073 {
1074         ptesync();
1075         if (pmap == kernel_pmap)
1076                 radix_tlbie_invlpwc_kernel();
1077         else
1078                 radix_tlbie_invlpwc_user(pmap->pm_pid);
1079         ttusync();
1080 }
1081
1082 static void
1083 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1084 {
1085         if (((end - start) >> PAGE_SHIFT) > 8) {
1086                 pmap_invalidate_all(pmap);
1087                 return;
1088         }
1089         ptesync();
1090         if (pmap == kernel_pmap) {
1091                 while (start < end) {
1092                         radix_tlbie_invlpg_kernel_4k(start);
1093                         start += PAGE_SIZE;
1094                 }
1095         } else {
1096                 while (start < end) {
1097                         radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1098                         start += PAGE_SIZE;
1099                 }
1100         }
1101         ttusync();
1102 }
1103
1104 static void
1105 pmap_invalidate_all(pmap_t pmap)
1106 {
1107         ptesync();
1108         if (pmap == kernel_pmap)
1109                 radix_tlbie_flush_kernel();
1110         else
1111                 radix_tlbie_flush_user(pmap->pm_pid);
1112         ttusync();
1113 }
1114
1115 static void
1116 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
1117 {
1118
1119         /*
1120          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1121          * by a promotion that did not invalidate the 512 4KB page mappings
1122          * that might exist in the TLB.  Consequently, at this point, the TLB
1123          * may hold both 4KB and 2MB page mappings for the address range [va,
1124          * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated here.
1125          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1126          * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
1127          * single 2MB-page tlbie suffices to invalidate the 2MB page mapping from the
1128          * TLB.
1129          */
1130         ptesync();
1131         if ((l3e & PG_PROMOTED) != 0)
1132                 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
1133         else
1134                 pmap_invalidate_page_2m(pmap, va);
1135
1136         pmap_invalidate_pwc(pmap);
1137 }
1138
1139 static __inline struct pv_chunk *
1140 pv_to_chunk(pv_entry_t pv)
1141 {
1142
1143         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1144 }
1145
1146 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1147
1148 #define PC_FREE0        0xfffffffffffffffful
1149 #define PC_FREE1        0x3ffffffffffffffful
1150
1151 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
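/*
 * PC_FREE0 and PC_FREE1 have 64 and 62 bits set respectively, so each
 * page-sized pv_chunk tracks 126 pv entries.
 */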
1152
1153 /*
1154  * Ensure that the number of spare PV entries in the specified pmap meets or
1155  * exceeds the given count, "needed".
1156  *
1157  * The given PV list lock may be released.
1158  */
1159 static void
1160 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1161 {
1162         struct pch new_tail;
1163         struct pv_chunk *pc;
1164         vm_page_t m;
1165         int avail, free;
1166         bool reclaimed;
1167
1168         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1169         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1170
1171         /*
1172          * Newly allocated PV chunks must be stored in a private list until
1173          * the required number of PV chunks have been allocated.  Otherwise,
1174          * reclaim_pv_chunk() could recycle one of these chunks.  In
1175          * contrast, these chunks must be added to the pmap upon allocation.
1176          */
1177         TAILQ_INIT(&new_tail);
1178 retry:
1179         avail = 0;
1180         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1181                 //              if ((cpu_feature2 & CPUID2_POPCNT) == 0)
1182                 bit_count((bitstr_t *)pc->pc_map, 0,
1183                                   sizeof(pc->pc_map) * NBBY, &free);
1184 #if 0
1185                 free = popcnt_pc_map_pq(pc->pc_map);
1186 #endif
1187                 if (free == 0)
1188                         break;
1189                 avail += free;
1190                 if (avail >= needed)
1191                         break;
1192         }
1193         for (reclaimed = false; avail < needed; avail += _NPCPV) {
1194                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1195                     VM_ALLOC_WIRED);
1196                 if (m == NULL) {
1197                         m = reclaim_pv_chunk(pmap, lockp);
1198                         if (m == NULL)
1199                                 goto retry;
1200                         reclaimed = true;
1201                 }
1202                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1203                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1204                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1205                 pc->pc_pmap = pmap;
1206                 pc->pc_map[0] = PC_FREE0;
1207                 pc->pc_map[1] = PC_FREE1;
1208                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1209                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1210                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
1211
1212                 /*
1213                  * The reclaim might have freed a chunk from the current pmap.
1214                  * If that chunk contained available entries, we need to
1215                  * re-count the number of available entries.
1216                  */
1217                 if (reclaimed)
1218                         goto retry;
1219         }
1220         if (!TAILQ_EMPTY(&new_tail)) {
1221                 mtx_lock(&pv_chunks_mutex);
1222                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1223                 mtx_unlock(&pv_chunks_mutex);
1224         }
1225 }
1226
1227 /*
1228  * First find and then remove the pv entry for the specified pmap and virtual
1229  * address from the specified pv list.  Returns the pv entry if found and NULL
1230  * otherwise.  This operation can be performed on pv lists for either 4KB or
1231  * 2MB page mappings.
1232  */
1233 static __inline pv_entry_t
1234 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1235 {
1236         pv_entry_t pv;
1237
1238         TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
1239 #ifdef INVARIANTS
1240                 if (PV_PMAP(pv) == NULL) {
1241                         printf("corrupted pv_chunk/pv %p\n", pv);
1242                         printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
1243                 }
1244                 MPASS(PV_PMAP(pv) != NULL);
1245                 MPASS(pv->pv_va != 0);
1246 #endif
1247                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1248                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
1249                         pvh->pv_gen++;
1250                         break;
1251                 }
1252         }
1253         return (pv);
1254 }
1255
1256 /*
1257  * After demotion from a 2MB page mapping to 512 4KB page mappings,
1258  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1259  * entries for each of the 4KB page mappings.
1260  */
1261 static void
1262 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1263     struct rwlock **lockp)
1264 {
1265         struct md_page *pvh;
1266         struct pv_chunk *pc;
1267         pv_entry_t pv;
1268         vm_offset_t va_last;
1269         vm_page_t m;
1270         int bit, field;
1271
1272         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1273         KASSERT((pa & L3_PAGE_MASK) == 0,
1274             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
1275         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1276
1277         /*
1278          * Transfer the 2mpage's pv entry for this mapping to the first
1279          * page's pv list.  Once this transfer begins, the pv list lock
1280          * must not be released until the last pv entry is reinstantiated.
1281          */
1282         pvh = pa_to_pvh(pa);
1283         va = trunc_2mpage(va);
1284         pv = pmap_pvh_remove(pvh, pmap, va);
1285         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
1286         m = PHYS_TO_VM_PAGE(pa);
1287         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1288
1289         m->md.pv_gen++;
1290         /* Instantiate the remaining NPTEPG - 1 pv entries. */
1291         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
1292         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1293         for (;;) {
1294                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1295                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
1296                     , ("pmap_pv_demote_pde: missing spare"));
1297                 for (field = 0; field < _NPCM; field++) {
1298                         while (pc->pc_map[field]) {
1299                                 bit = cnttzd(pc->pc_map[field]);
1300                                 pc->pc_map[field] &= ~(1ul << bit);
1301                                 pv = &pc->pc_pventry[field * 64 + bit];
1302                                 va += PAGE_SIZE;
1303                                 pv->pv_va = va;
1304                                 m++;
1305                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1306                             ("pmap_pv_demote_pde: page %p is not managed", m));
1307                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1308
1309                                 m->md.pv_gen++;
1310                                 if (va == va_last)
1311                                         goto out;
1312                         }
1313                 }
1314                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1315                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1316         }
1317 out:
1318         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1319                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1320                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1321         }
1322         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
1323         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
1324 }
1325
1326 static void
1327 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
1328 {
1329
1330         if (pmap == NULL)
1331                 return;
1332         pmap_invalidate_all(pmap);
1333         if (pmap != locked_pmap)
1334                 PMAP_UNLOCK(pmap);
1335 }
1336
1337 /*
1338  * We are in a serious low memory condition.  Resort to
1339  * drastic measures to free some pages so we can allocate
1340  * another pv entry chunk.
1341  *
1342  * Returns NULL if PV entries were reclaimed from the specified pmap.
1343  *
1344  * We do not, however, unmap 2mpages because subsequent accesses will
1345  * allocate per-page pv entries until repromotion occurs, thereby
1346  * exacerbating the shortage of free pv entries.
1347  */
1348 static int active_reclaims = 0;
1349 static vm_page_t
1350 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1351 {
1352         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1353         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1354         struct md_page *pvh;
1355         pml3_entry_t *l3e;
1356         pmap_t next_pmap, pmap;
1357         pt_entry_t *pte, tpte;
1358         pv_entry_t pv;
1359         vm_offset_t va;
1360         vm_page_t m, m_pc;
1361         struct spglist free;
1362         uint64_t inuse;
1363         int bit, field, freed;
1364
1365         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1366         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1367         pmap = NULL;
1368         m_pc = NULL;
1369         SLIST_INIT(&free);
1370         bzero(&pc_marker_b, sizeof(pc_marker_b));
1371         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1372         pc_marker = (struct pv_chunk *)&pc_marker_b;
1373         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1374
1375         mtx_lock(&pv_chunks_mutex);
1376         active_reclaims++;
1377         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1378         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1379         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1380             SLIST_EMPTY(&free)) {
1381                 next_pmap = pc->pc_pmap;
1382                 if (next_pmap == NULL) {
1383                         /*
1384                          * The next chunk is a marker.  However, it is
1385                          * not our marker, so active_reclaims must be
1386                          * > 1.  Consequently, the next_chunk code
1387                          * will not rotate the pv_chunks list.
1388                          */
1389                         goto next_chunk;
1390                 }
1391                 mtx_unlock(&pv_chunks_mutex);
1392
1393                 /*
1394                  * A pv_chunk can only be removed from the pc_lru list
1395                  * when both pv_chunks_mutex is owned and the
1396                  * corresponding pmap is locked.
1397                  */
1398                 if (pmap != next_pmap) {
1399                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1400                         pmap = next_pmap;
1401                         /* Avoid deadlock and lock recursion. */
1402                         if (pmap > locked_pmap) {
1403                                 RELEASE_PV_LIST_LOCK(lockp);
1404                                 PMAP_LOCK(pmap);
1405                                 mtx_lock(&pv_chunks_mutex);
1406                                 continue;
1407                         } else if (pmap != locked_pmap) {
1408                                 if (PMAP_TRYLOCK(pmap)) {
1409                                         mtx_lock(&pv_chunks_mutex);
1410                                         continue;
1411                                 } else {
1412                                         pmap = NULL; /* pmap is not locked */
1413                                         mtx_lock(&pv_chunks_mutex);
1414                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
1415                                         if (pc == NULL ||
1416                                             pc->pc_pmap != next_pmap)
1417                                                 continue;
1418                                         goto next_chunk;
1419                                 }
1420                         }
1421                 }
1422
1423                 /*
1424                  * Destroy every non-wired, 4 KB page mapping in the chunk.
1425                  */
1426                 freed = 0;
1427                 for (field = 0; field < _NPCM; field++) {
1428                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1429                             inuse != 0; inuse &= ~(1UL << bit)) {
1430                                 bit = cnttzd(inuse);
1431                                 pv = &pc->pc_pventry[field * 64 + bit];
1432                                 va = pv->pv_va;
1433                                 l3e = pmap_pml3e(pmap, va);
1434                                 if ((*l3e & RPTE_LEAF) != 0)
1435                                         continue;
1436                                 pte = pmap_l3e_to_pte(l3e, va);
1437                                 if ((*pte & PG_W) != 0)
1438                                         continue;
1439                                 tpte = pte_load_clear(pte);
1440                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
1441                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1442                                         vm_page_dirty(m);
1443                                 if ((tpte & PG_A) != 0)
1444                                         vm_page_aflag_set(m, PGA_REFERENCED);
1445                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1446                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
1447
1448                                 m->md.pv_gen++;
1449                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
1450                                     (m->flags & PG_FICTITIOUS) == 0) {
1451                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1452                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
1453                                                 vm_page_aflag_clear(m,
1454                                                     PGA_WRITEABLE);
1455                                         }
1456                                 }
1457                                 pc->pc_map[field] |= 1UL << bit;
1458                                 pmap_unuse_pt(pmap, va, *l3e, &free);
1459                                 freed++;
1460                         }
1461                 }
1462                 if (freed == 0) {
1463                         mtx_lock(&pv_chunks_mutex);
1464                         goto next_chunk;
1465                 }
1466                 /* Every freed mapping is for a 4 KB page. */
1467                 pmap_resident_count_dec(pmap, freed);
1468                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1469                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1470                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1471                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1472                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
1473                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1474                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1475                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1476                         /* Entire chunk is free; return it. */
1477                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1478                         mtx_lock(&pv_chunks_mutex);
1479                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1480                         break;
1481                 }
1482                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1483                 mtx_lock(&pv_chunks_mutex);
1484                 /* One freed pv entry in locked_pmap is sufficient. */
1485                 if (pmap == locked_pmap)
1486                         break;
1487 next_chunk:
1488                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1489                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1490                 if (active_reclaims == 1 && pmap != NULL) {
1491                         /*
1492                          * Rotate the pv chunks list so that we do not
1493                          * scan the same pv chunks that could not be
1494                          * freed (because they contained a wired
1495                          * and/or superpage mapping) on every
1496                          * invocation of reclaim_pv_chunk().
1497                          */
1498                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1499                                 MPASS(pc->pc_pmap != NULL);
1500                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1501                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1502                         }
1503                 }
1504         }
1505         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1506         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1507         active_reclaims--;
1508         mtx_unlock(&pv_chunks_mutex);
1509         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1510         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1511                 m_pc = SLIST_FIRST(&free);
1512                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1513                 /* Recycle a freed page table page. */
1514                 m_pc->ref_count = 1;
1515         }
1516         vm_page_free_pages_toq(&free, true);
1517         return (m_pc);
1518 }
1519
1520 /*
1521  * Free the pv_entry back to its pv chunk's free list.
1522  */
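/*
 * The entry's index within its chunk selects the bit to set in the chunk's
 * free bitmap: for example, the pv entry at index 75 corresponds to
 * pc_map[1] bit 11 (75 / 64 = 1, 75 % 64 = 11).
 */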
1523 static void
1524 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1525 {
1526         struct pv_chunk *pc;
1527         int idx, field, bit;
1528
1529 #ifdef VERBOSE_PV
1530         if (pmap != kernel_pmap)
1531                 printf("%s(%p, %p)\n", __func__, pmap, pv);
1532 #endif
1533         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1534         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1535         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1536         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1537         pc = pv_to_chunk(pv);
1538         idx = pv - &pc->pc_pventry[0];
1539         field = idx / 64;
1540         bit = idx % 64;
1541         pc->pc_map[field] |= 1ul << bit;
1542         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
1543                 /* 98% of the time, pc is already at the head of the list. */
1544                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1545                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1546                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1547                 }
1548                 return;
1549         }
1550         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1551         free_pv_chunk(pc);
1552 }
1553
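/*
 * Release a completely free pv chunk: unlink it from the global LRU list,
 * update the PV statistics, and free the page backing the chunk.
 */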
1554 static void
1555 free_pv_chunk(struct pv_chunk *pc)
1556 {
1557         vm_page_t m;
1558
1559         mtx_lock(&pv_chunks_mutex);
1560         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1561         mtx_unlock(&pv_chunks_mutex);
1562         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1563         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1564         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1565         /* Entire chunk is free; return it. */
1566         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1567         vm_page_unwire_noq(m);
1568         vm_page_free(m);
1569 }
1570
1571 /*
1572  * Returns a new PV entry, allocating a new PV chunk from the system when
1573  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1574  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1575  * returned.
1576  *
1577  * The given PV list lock may be released.
1578  */
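/*
 * A typical caller (see pmap_try_insert_pv_entry() below for the variant
 * that forgoes reclamation) looks roughly like:
 *
 *      pv = get_pv_entry(pmap, &lock);
 *      pv->pv_va = va;
 *      CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *      TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
 *      m->md.pv_gen++;
 */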
1579 static pv_entry_t
1580 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1581 {
1582         int bit, field;
1583         pv_entry_t pv;
1584         struct pv_chunk *pc;
1585         vm_page_t m;
1586
1587         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1588         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1589 retry:
1590         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1591         if (pc != NULL) {
1592                 for (field = 0; field < _NPCM; field++) {
1593                         if (pc->pc_map[field]) {
1594                                 bit = cnttzd(pc->pc_map[field]);
1595                                 break;
1596                         }
1597                 }
1598                 if (field < _NPCM) {
1599                         pv = &pc->pc_pventry[field * 64 + bit];
1600                         pc->pc_map[field] &= ~(1ul << bit);
1601                         /* If this was the last item, move it to tail */
1602                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1603                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1604                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1605                                     pc_list);
1606                         }
1607                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1608                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1609                         MPASS(PV_PMAP(pv) != NULL);
1610                         return (pv);
1611                 }
1612         }
1613         /* No free items, allocate another chunk */
1614         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1615             VM_ALLOC_WIRED);
1616         if (m == NULL) {
1617                 if (lockp == NULL) {
1618                         PV_STAT(pc_chunk_tryfail++);
1619                         return (NULL);
1620                 }
1621                 m = reclaim_pv_chunk(pmap, lockp);
1622                 if (m == NULL)
1623                         goto retry;
1624         }
1625         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1626         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1627         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1628         pc->pc_pmap = pmap;
1629         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
1630         pc->pc_map[1] = PC_FREE1;
1631         mtx_lock(&pv_chunks_mutex);
1632         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1633         mtx_unlock(&pv_chunks_mutex);
1634         pv = &pc->pc_pventry[0];
1635         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1636         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1637         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1638         MPASS(PV_PMAP(pv) != NULL);
1639         return (pv);
1640 }
1641
1642 #if VM_NRESERVLEVEL > 0
1643 /*
1644  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
1645  * replace the many pv entries for the 4KB page mappings by a single pv entry
1646  * for the 2MB page mapping.
1647  */
1648 static void
1649 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1650     struct rwlock **lockp)
1651 {
1652         struct md_page *pvh;
1653         pv_entry_t pv;
1654         vm_offset_t va_last;
1655         vm_page_t m;
1656
1657         KASSERT((pa & L3_PAGE_MASK) == 0,
1658             ("pmap_pv_promote_l3e: pa is not 2mpage aligned"));
1659         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1660
1661         /*
1662          * Transfer the first page's pv entry for this mapping to the 2mpage's
1663          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
1664          * a transfer avoids the possibility that get_pv_entry() calls
1665          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
1666          * mappings that is being promoted.
1667          */
1668         m = PHYS_TO_VM_PAGE(pa);
1669         va = trunc_2mpage(va);
1670         pv = pmap_pvh_remove(&m->md, pmap, va);
1671         KASSERT(pv != NULL, ("pmap_pv_promote_l3e: pv not found"));
1672         pvh = pa_to_pvh(pa);
1673         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
1674         pvh->pv_gen++;
1675         /* Free the remaining NPTEPG - 1 pv entries. */
1676         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1677         do {
1678                 m++;
1679                 va += PAGE_SIZE;
1680                 pmap_pvh_free(&m->md, pmap, va);
1681         } while (va < va_last);
1682 }
1683 #endif /* VM_NRESERVLEVEL > 0 */
1684
1685 /*
1686  * First find and then destroy the pv entry for the specified pmap and virtual
1687  * address.  This operation can be performed on pv lists for either 4KB or 2MB
1688  * page mappings.
1689  */
1690 static void
1691 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1692 {
1693         pv_entry_t pv;
1694
1695         pv = pmap_pvh_remove(pvh, pmap, va);
1696         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1697         free_pv_entry(pmap, pv);
1698 }
1699
1700 /*
1701  * Conditionally create the PV entry for a 4KB page mapping if the required
1702  * memory can be allocated without resorting to reclamation.
1703  */
1704 static boolean_t
1705 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1706     struct rwlock **lockp)
1707 {
1708         pv_entry_t pv;
1709
1710         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1711         /* Pass NULL instead of the lock pointer to disable reclamation. */
1712         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1713                 pv->pv_va = va;
1714                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1715                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1716                 m->md.pv_gen++;
1717                 return (TRUE);
1718         } else
1719                 return (FALSE);
1720 }
1721
1722 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
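/*
 * Assert that the range [addr, addr + size) lies entirely within one of the
 * ranges recorded in phys_avail_debug during early bootstrap (INVARIANTS
 * only; the !INVARIANTS stub below is a no-op).
 */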
1723 #ifdef INVARIANTS
1724 static void
1725 validate_addr(vm_paddr_t addr, vm_size_t size)
1726 {
1727         vm_paddr_t end = addr + size;
1728         bool found = false;
1729
1730         for (int i = 0; i < 2 * phys_avail_count; i += 2) {
1731                 if (addr >= phys_avail_debug[i] &&
1732                         end <= phys_avail_debug[i + 1]) {
1733                         found = true;
1734                         break;
1735                 }
1736         }
1737         KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
1738                                         addr, end));
1739 }
1740 #else
1741 static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
1742 #endif
1743 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
1744
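/*
 * Allocate and zero a single page-table page from the early boot allocator.
 */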
1745 static vm_paddr_t
1746 alloc_pt_page(void)
1747 {
1748         vm_paddr_t page;
1749
1750         page = allocpages(1);
1751         pagezero(PHYS_TO_DMAP(page));
1752         return (page);
1753 }
1754
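/*
 * Map the physical range [start, end) into the direct map, allocating any
 * missing interior page-table pages and installing 1GB or 2MB leaf entries
 * whenever the remaining range is suitably aligned and sized, otherwise 4KB
 * entries.
 */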
1755 static void
1756 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
1757 {
1758         pt_entry_t *pte, pteval;
1759         vm_paddr_t page;
1760
1761         if (bootverbose)
1762                 printf("%s %lx -> %lx\n", __func__, start, end);
1763         while (start < end) {
1764                 pteval = start | DMAP_PAGE_BITS;
1765                 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
1766                 if ((*pte & RPTE_VALID) == 0) {
1767                         page = alloc_pt_page();
1768                         pde_store(pte, page);
1769                 }
1770                 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
1771                 if ((start & L2_PAGE_MASK) == 0 &&
1772                         end - start >= L2_PAGE_SIZE) {
1773                         start += L2_PAGE_SIZE;
1774                         goto done;
1775                 } else if ((*pte & RPTE_VALID) == 0) {
1776                         page = alloc_pt_page();
1777                         pde_store(pte, page);
1778                 }
1779
1780                 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
1781                 if ((start & L3_PAGE_MASK) == 0 &&
1782                         end - start >= L3_PAGE_SIZE) {
1783                         start += L3_PAGE_SIZE;
1784                         goto done;
1785                 } else if ((*pte & RPTE_VALID) == 0) {
1786                         page = alloc_pt_page();
1787                         pde_store(pte, page);
1788                 }
1789                 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
1790                 start += PAGE_SIZE;
1791         done:
1792                 pte_store(pte, pteval);
1793         }
1794 }
1795
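/*
 * Enter each firmware-provided physical memory region into the direct map,
 * clamping the total to the hw.physmem limit when one is set.
 */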
1796 static void
1797 mmu_radix_dmap_populate(vm_size_t hwphyssz)
1798 {
1799         vm_paddr_t start, end;
1800
1801         for (int i = 0; i < pregions_sz; i++) {
1802                 start = pregions[i].mr_start;
1803                 end = start + pregions[i].mr_size;
1804                 if (hwphyssz && start >= hwphyssz)
1805                         break;
1806                 if (hwphyssz && hwphyssz < end)
1807                         end = hwphyssz;
1808                 mmu_radix_dmap_range(start, end);
1809         }
1810 }
1811
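/*
 * Build the initial kernel page tables: allocate and zero the kernel radix
 * root (PGD), populate the direct map, and preallocate nkpt page-table pages
 * covering the start of KVA.
 */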
1812 static void
1813 mmu_radix_setup_pagetables(vm_size_t hwphyssz)
1814 {
1815         vm_paddr_t ptpages, pages;
1816         pt_entry_t *pte;
1817         vm_paddr_t l1phys;
1818
1819         bzero(kernel_pmap, sizeof(struct pmap));
1820         PMAP_LOCK_INIT(kernel_pmap);
1821
1822         ptpages = allocpages(2);
1823         l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
1824         validate_addr(l1phys, RADIX_PGD_SIZE);
1825         if (bootverbose)
1826                 printf("l1phys=%lx\n", l1phys);
1827         MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
1828         for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
1829                 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
1830         kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
1831
1832         mmu_radix_dmap_populate(hwphyssz);
1833
1834         /*
1835          * Create page tables for the first 128MB of KVA.
1836          */
1837         pages = ptpages;
1838         pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
1839         *pte = (pages | RPTE_VALID | RPTE_SHIFT);
1840         pages += PAGE_SIZE;
1841         pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
1842         *pte = (pages | RPTE_VALID | RPTE_SHIFT);
1843         pages += PAGE_SIZE;
1844         pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
1845         /*
1846          * The kernel page table pages need to be preserved in
1847          * phys_avail and must not overlap with previous allocations.
1848          */
1849         pages = allocpages(nkpt);
1850         if (bootverbose) {
1851                 printf("phys_avail after dmap populate and nkpt allocation\n");
1852                 for (int j = 0; j < 2 * phys_avail_count; j+=2)
1853                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1854                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
1855         }
1856         KPTphys = pages;
1857         for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
1858                 *pte = (pages | RPTE_VALID | RPTE_SHIFT);
1859         kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
1860         if (bootverbose)
1861                 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
1862         /*
1863          * Add a physical memory segment (vm_phys_seg) corresponding to the
1864          * preallocated kernel page table pages so that vm_page structures
1865          * representing these pages will be created.  The vm_page structures
1866          * are required for promotion of the corresponding kernel virtual
1867          * addresses to superpage mappings.
1868          */
1869         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1870 }
1871
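/*
 * First-stage bootstrap: collect the physical memory regions from firmware,
 * construct phys_avail/dump_avail while excluding the kernel image and the
 * exception vectors, allocate the partition and process tables, and set up
 * the initial page tables.
 */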
1872 static void
1873 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1874 {
1875         vm_paddr_t      kpstart, kpend;
1876         vm_size_t       physsz, hwphyssz;
1877         //uint64_t      l2virt;
1878         int             rm_pavail, proctab_size;
1879         int             i, j;
1880
1881         kpstart = start & ~DMAP_BASE_ADDRESS;
1882         kpend = end & ~DMAP_BASE_ADDRESS;
1883
1884         /* Get physical memory regions from firmware */
1885         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1886         CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1887
1888         if (2 * VM_PHYSSEG_MAX < regions_sz)
1889                 panic("mmu_radix_early_bootstrap: phys_avail too small");
1890
1891         if (bootverbose)
1892                 for (int i = 0; i < regions_sz; i++)
1893                         printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1894                             i, regions[i].mr_start, i, regions[i].mr_size);
1895         /*
1896          * XXX workaround a simulator bug
1897          */
1898         for (int i = 0; i < regions_sz; i++)
1899                 if (regions[i].mr_start & PAGE_MASK) {
1900                         regions[i].mr_start += PAGE_MASK;
1901                         regions[i].mr_start &= ~PAGE_MASK;
1902                         regions[i].mr_size &= ~PAGE_MASK;
1903                 }
1904         if (bootverbose)
1905                 for (int i = 0; i < pregions_sz; i++)
1906                         printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1907                             i, pregions[i].mr_start, i, pregions[i].mr_size);
1908
1909         phys_avail_count = 0;
1910         physsz = 0;
1911         hwphyssz = 0;
1912         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
1913         for (i = 0, j = 0; i < regions_sz; i++) {
1914                 if (bootverbose)
1915                         printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1916                             i, regions[i].mr_start, i, regions[i].mr_size);
1917
1918                 if (regions[i].mr_size < PAGE_SIZE)
1919                         continue;
1920
1921                 if (hwphyssz != 0 &&
1922                     (physsz + regions[i].mr_size) >= hwphyssz) {
1923                         if (physsz < hwphyssz) {
1924                                 phys_avail[j] = regions[i].mr_start;
1925                                 phys_avail[j + 1] = regions[i].mr_start +
1926                                     (hwphyssz - physsz);
1927                                 physsz = hwphyssz;
1928                                 phys_avail_count++;
1929                                 dump_avail[j] = phys_avail[j];
1930                                 dump_avail[j + 1] = phys_avail[j + 1];
1931                         }
1932                         break;
1933                 }
1934                 phys_avail[j] = regions[i].mr_start;
1935                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1936                 dump_avail[j] = phys_avail[j];
1937                 dump_avail[j + 1] = phys_avail[j + 1];
1938
1939                 phys_avail_count++;
1940                 physsz += regions[i].mr_size;
1941                 j += 2;
1942         }
1943
1944         /* Check for overlap with the kernel and exception vectors */
1945         rm_pavail = 0;
1946         for (j = 0; j < 2 * phys_avail_count; j+=2) {
1947                 if (phys_avail[j] < EXC_LAST)
1948                         phys_avail[j] += EXC_LAST;
1949
1950                 if (phys_avail[j] >= kpstart &&
1951                     phys_avail[j + 1] <= kpend) {
1952                         phys_avail[j] = phys_avail[j + 1] = ~0;
1953                         rm_pavail++;
1954                         continue;
1955                 }
1956
1957                 if (kpstart >= phys_avail[j] &&
1958                     kpstart < phys_avail[j + 1]) {
1959                         if (kpend < phys_avail[j + 1]) {
1960                                 phys_avail[2 * phys_avail_count] =
1961                                     (kpend & ~PAGE_MASK) + PAGE_SIZE;
1962                                 phys_avail[2 * phys_avail_count + 1] =
1963                                     phys_avail[j + 1];
1964                                 phys_avail_count++;
1965                         }
1966
1967                         phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1968                 }
1969
1970                 if (kpend >= phys_avail[j] &&
1971                     kpend < phys_avail[j + 1]) {
1972                         if (kpstart > phys_avail[j]) {
1973                                 phys_avail[2 * phys_avail_count] = phys_avail[j];
1974                                 phys_avail[2 * phys_avail_count + 1] =
1975                                     kpstart & ~PAGE_MASK;
1976                                 phys_avail_count++;
1977                         }
1978
1979                         phys_avail[j] = (kpend & ~PAGE_MASK) +
1980                             PAGE_SIZE;
1981                 }
1982         }
1983         qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
1984         for (i = 0; i < 2 * phys_avail_count; i++)
1985                 phys_avail_debug[i] = phys_avail[i];
1986
1987         /* Discard the phys_avail entries that were marked for removal (~0). */
1988         if (rm_pavail) {
1989                 phys_avail_count -= rm_pavail;
1990                 for (i = 2 * phys_avail_count;
1991                      i < 2*(phys_avail_count + rm_pavail); i+=2)
1992                         phys_avail[i] = phys_avail[i + 1] = 0;
1993         }
1994         if (bootverbose) {
1995                 printf("phys_avail ranges after filtering:\n");
1996                 for (j = 0; j < 2 * phys_avail_count; j+=2)
1997                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1998                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
1999         }
2000         physmem = btoc(physsz);
2001
2002         /*
2003          * XXX Assume we are running non-virtualized; bhyve is not supported.
2004          */
2005         if (isa3_pid_bits == 0)
2006                 isa3_pid_bits = 20;
2007         parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
2008         validate_addr(parttab_phys, PARTTAB_SIZE);
2009         for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
2010                 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
2011
2012         proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
2013         proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
2014         validate_addr(proctab0pa, proctab_size);
2015         for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
2016                 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
2017
2018         mmu_radix_setup_pagetables(hwphyssz);
2019 }
2020
2021 static void
2022 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
2023 {
2024         int             i;
2025         vm_paddr_t      pa;
2026         void            *dpcpu;
2027         vm_offset_t va;
2028
2029         /*
2030          * Finish bootstrapping: compute Maxmem, set the KVA bounds, remap early
2031          * I/O mappings, and allocate thread0's stack, msgbuf, and per-CPU data.
2032          */
2033         if (bootverbose)
2034                 printf("%s enter\n", __func__);
2035
2036         /*
2037          * Calculate the last available physical address, and reserve the
2038          * vm_page_array (upper bound).
2039          */
2040         Maxmem = 0;
2041         for (i = 0; phys_avail[i + 2] != 0; i += 2)
2042                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
2043
2044         /*
2045          * Set the start and end of kva.
2046          */
2047         virtual_avail = VM_MIN_KERNEL_ADDRESS;
2048         virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
2049
2050         /*
2051          * Remap any early IO mappings (console framebuffer, etc.)
2052          */
2053         bs_remap_earlyboot();
2054
2055         /*
2056          * Allocate a kernel stack with a guard page for thread0 and map it
2057          * into the kernel page map.
2058          */
2059         pa = allocpages(kstack_pages);
2060         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
2061         virtual_avail = va + kstack_pages * PAGE_SIZE;
2062         CTR2(KTR_PMAP, "mmu_radix_late_bootstrap: kstack0 at %#lx (%#lx)", pa, va);
2063         thread0.td_kstack = va;
2064         for (i = 0; i < kstack_pages; i++) {
2065                 mmu_radix_kenter(va, pa);
2066                 pa += PAGE_SIZE;
2067                 va += PAGE_SIZE;
2068         }
2069         thread0.td_kstack_pages = kstack_pages;
2070
2071         /*
2072          * Allocate virtual address space for the message buffer.
2073          */
2074         pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK)  >> PAGE_SHIFT);
2075         msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
2076
2077         /*
2078          * Allocate virtual address space for the dynamic percpu area.
2079          */
2080         pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
2081         dpcpu = (void *)PHYS_TO_DMAP(pa);
2082         dpcpu_init(dpcpu, curcpu);
2083         /*
2084          * Reserve some special page table entries/VA space for temporary
2085          * mapping of pages.
2086          */
2087 }
2088
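/*
 * Map the partition table allocated during early bootstrap and program its
 * base address and size into the partition table control register (PTCR).
 */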
2089 static void
2090 mmu_parttab_init(void)
2091 {
2092         uint64_t ptcr;
2093
2094         isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
2095
2096         if (bootverbose)
2097                 printf("%s parttab: %p\n", __func__, isa3_parttab);
2098         ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2099         if (bootverbose)
2100                 printf("setting ptcr %lx\n", ptcr);
2101         mtspr(SPR_PTCR, ptcr);
2102 }
2103
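/*
 * Install the page-table and process-table pointers for the given LPID and
 * invalidate any translations the hardware may have cached from the previous
 * entry.
 */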
2104 static void
2105 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
2106 {
2107         uint64_t prev;
2108
2109         if (bootverbose)
2110                 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
2111                            lpid, pagetab, proctab);
2112         prev = be64toh(isa3_parttab[lpid].pagetab);
2113         isa3_parttab[lpid].pagetab = htobe64(pagetab);
2114         isa3_parttab[lpid].proctab = htobe64(proctab);
2115
2116         if (prev & PARTTAB_HR) {
2117                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
2118                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2119                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2120                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2121         } else {
2122                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
2123                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2124         }
2125         ttusync();
2126 }
2127
2128 static void
2129 mmu_radix_parttab_init(void)
2130 {
2131         uint64_t pagetab;
2132
2133         mmu_parttab_init();
2134         pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \
2135                          RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
2136         mmu_parttab_update(0, pagetab, 0);
2137 }
2138
2139 static void
2140 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
2141 {
2142         uint64_t pagetab, proctab;
2143
2144         pagetab = be64toh(isa3_parttab[0].pagetab);
2145         proctab = proctabpa | table_size | PARTTAB_GR;
2146         mmu_parttab_update(0, pagetab, proctab);
2147 }
2148
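/*
 * Initialize process table entry 0 to point at the kernel's radix root,
 * register the process table with the partition table, and flush stale
 * translations before assigning the kernel pmap its PID.
 */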
2149 static void
2150 mmu_radix_proctab_init(void)
2151 {
2152
2153         isa3_base_pid = 1;
2154
2155         isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
2156         isa3_proctab->proctab0 =
2157             htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2158                 RADIX_PGD_INDEX_SHIFT);
2159
2160         mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
2161
2162         __asm __volatile("ptesync" : : : "memory");
2163         __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2164                      "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
2165         __asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
2166         if (bootverbose)
2167                 printf("process table %p and kernel radix PDE: %p\n",
2168                            isa3_proctab, kernel_pmap->pm_pml1);
2169         mtmsr(mfmsr() | PSL_DR );
2170         mtmsr(mfmsr() &  ~PSL_DR);
2171         kernel_pmap->pm_pid = isa3_base_pid;
2172         isa3_base_pid++;
2173 }
2174
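/*
 * Apply MADV_DONTNEED or MADV_FREE advice to the given range by clearing the
 * referenced and modified bits of the affected 4KB mappings; 2MB mappings
 * are demoted first so that the advice can be applied at page granularity.
 */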
2175 void
2176 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2177     int advice)
2178 {
2179         struct rwlock *lock;
2180         pml1_entry_t *l1e;
2181         pml2_entry_t *l2e;
2182         pml3_entry_t oldl3e, *l3e;
2183         pt_entry_t *pte;
2184         vm_offset_t va, va_next;
2185         vm_page_t m;
2186         boolean_t anychanged;
2187
2188         if (advice != MADV_DONTNEED && advice != MADV_FREE)
2189                 return;
2190         anychanged = FALSE;
2191         PMAP_LOCK(pmap);
2192         for (; sva < eva; sva = va_next) {
2193                 l1e = pmap_pml1e(pmap, sva);
2194                 if ((*l1e & PG_V) == 0) {
2195                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2196                         if (va_next < sva)
2197                                 va_next = eva;
2198                         continue;
2199                 }
2200                 l2e = pmap_l1e_to_l2e(l1e, sva);
2201                 if ((*l2e & PG_V) == 0) {
2202                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2203                         if (va_next < sva)
2204                                 va_next = eva;
2205                         continue;
2206                 }
2207                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2208                 if (va_next < sva)
2209                         va_next = eva;
2210                 l3e = pmap_l2e_to_l3e(l2e, sva);
2211                 oldl3e = *l3e;
2212                 if ((oldl3e & PG_V) == 0)
2213                         continue;
2214                 else if ((oldl3e & RPTE_LEAF) != 0) {
2215                         if ((oldl3e & PG_MANAGED) == 0)
2216                                 continue;
2217                         lock = NULL;
2218                         if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
2219                                 if (lock != NULL)
2220                                         rw_wunlock(lock);
2221
2222                                 /*
2223                                  * The large page mapping was destroyed.
2224                                  */
2225                                 continue;
2226                         }
2227
2228                         /*
2229                          * Unless the page mappings are wired, remove the
2230                          * mapping to a single page so that a subsequent
2231                          * access may repromote.  Since the underlying page
2232                          * table page is fully populated, this removal never
2233                          * frees a page table page.
2234                          */
2235                         if ((oldl3e & PG_W) == 0) {
2236                                 pte = pmap_l3e_to_pte(l3e, sva);
2237                                 KASSERT((*pte & PG_V) != 0,
2238                                     ("pmap_advise: invalid PTE"));
2239                                 pmap_remove_pte(pmap, pte, sva, *l3e, NULL,
2240                                     &lock);
2241                                 anychanged = TRUE;
2242                         }
2243                         if (lock != NULL)
2244                                 rw_wunlock(lock);
2245                 }
2246                 if (va_next > eva)
2247                         va_next = eva;
2248                 va = va_next;
2249                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
2250                          pte++, sva += PAGE_SIZE) {
2251                         MPASS(pte == pmap_pte(pmap, sva));
2252
2253                         if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
2254                                 goto maybe_invlrng;
2255                         else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2256                                 if (advice == MADV_DONTNEED) {
2257                                         /*
2258                                          * Future calls to pmap_is_modified()
2259                                          * can be avoided by making the page
2260                                          * dirty now.
2261                                          */
2262                                         m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
2263                                         vm_page_dirty(m);
2264                                 }
2265                                 atomic_clear_long(pte, PG_M | PG_A);
2266                         } else if ((*pte & PG_A) != 0)
2267                                 atomic_clear_long(pte, PG_A);
2268                         else
2269                                 goto maybe_invlrng;
2270                         anychanged = TRUE;
2271                         continue;
2272 maybe_invlrng:
2273                         if (va != va_next) {
2274                                 anychanged = true;
2275                                 va = va_next;
2276                         }
2277                 }
2278                 if (va != va_next)
2279                         anychanged = true;
2280         }
2281         if (anychanged)
2282                 pmap_invalidate_all(pmap);
2283         PMAP_UNLOCK(pmap);
2284 }
2285
2286 /*
2287  * Routines used in machine-dependent code
2288  */
2289 static void
2290 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
2291 {
2292         uint64_t lpcr;
2293
2294         if (bootverbose)
2295                 printf("%s\n", __func__);
2296         hw_direct_map = 1;
2297         mmu_radix_early_bootstrap(start, end);
2298         if (bootverbose)
2299                 printf("early bootstrap complete\n");
2300         if (powernv_enabled) {
2301                 lpcr = mfspr(SPR_LPCR);
2302                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2303                 mmu_radix_parttab_init();
2304                 mmu_radix_init_amor();
2305                 if (bootverbose)
2306                         printf("powernv init complete\n");
2307         }
2308         mmu_radix_init_iamr();
2309         mmu_radix_proctab_init();
2310         mmu_radix_pid_set(kernel_pmap);
2311         /* XXX assume CPU_FTR_HVMODE */
2312         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2313
2314         mmu_radix_late_bootstrap(start, end);
2315         numa_mem_regions(&numa_pregions, &numa_pregions_sz);
2316         if (bootverbose)
2317                 printf("%s done\n", __func__);
2318         pmap_bootstrapped = 1;
2319         dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
2320         PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
2321 }
2322
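/*
 * Per-CPU MMU setup: enable radix mode in the LPCR and point the PTCR at the
 * shared partition table (on PowerNV), then initialize the IAMR, PID, and
 * TLB state for this CPU.
 */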
2323 static void
2324 mmu_radix_cpu_bootstrap(int ap)
2325 {
2326         uint64_t lpcr;
2327         uint64_t ptcr;
2328
2329         if (powernv_enabled) {
2330                 lpcr = mfspr(SPR_LPCR);
2331                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2332
2333                 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2334                 mtspr(SPR_PTCR, ptcr);
2335                 mmu_radix_init_amor();
2336         }
2337         mmu_radix_init_iamr();
2338         mmu_radix_pid_set(kernel_pmap);
2339         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2340 }
2341
2342 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
2343     "2MB page mapping counters");
2344
2345 static u_long pmap_l3e_demotions;
2346 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
2347     &pmap_l3e_demotions, 0, "2MB page demotions");
2348
2349 static u_long pmap_l3e_mappings;
2350 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
2351     &pmap_l3e_mappings, 0, "2MB page mappings");
2352
2353 static u_long pmap_l3e_p_failures;
2354 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
2355     &pmap_l3e_p_failures, 0, "2MB page promotion failures");
2356
2357 static u_long pmap_l3e_promotions;
2358 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
2359     &pmap_l3e_promotions, 0, "2MB page promotions");
2360
2361 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
2362     "1GB page mapping counters");
2363
2364 static u_long pmap_l2e_demotions;
2365 SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
2366     &pmap_l2e_demotions, 0, "1GB page demotions");
2367
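/*
 * Clear the modified bit in every managed mapping of the given page.
 * Writable 2MB mappings are demoted and the corresponding 4KB mapping is
 * write protected so that a later write access may repromote.
 */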
2368 void
2369 mmu_radix_clear_modify(vm_page_t m)
2370 {
2371         struct md_page *pvh;
2372         pmap_t pmap;
2373         pv_entry_t next_pv, pv;
2374         pml3_entry_t oldl3e, *l3e;
2375         pt_entry_t oldpte, *pte;
2376         struct rwlock *lock;
2377         vm_offset_t va;
2378         int md_gen, pvh_gen;
2379
2380         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2381             ("pmap_clear_modify: page %p is not managed", m));
2382         vm_page_assert_busied(m);
2383         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
2384
2385         /*
2386          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
2387          * If the object containing the page is locked and the page is not
2388          * exclusively busied, then PGA_WRITEABLE cannot be concurrently set.
2389          */
2390         if ((m->a.flags & PGA_WRITEABLE) == 0)
2391                 return;
2392         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2393             pa_to_pvh(VM_PAGE_TO_PHYS(m));
2394         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2395         rw_wlock(lock);
2396 restart:
2397         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
2398                 pmap = PV_PMAP(pv);
2399                 if (!PMAP_TRYLOCK(pmap)) {
2400                         pvh_gen = pvh->pv_gen;
2401                         rw_wunlock(lock);
2402                         PMAP_LOCK(pmap);
2403                         rw_wlock(lock);
2404                         if (pvh_gen != pvh->pv_gen) {
2405                                 PMAP_UNLOCK(pmap);
2406                                 goto restart;
2407                         }
2408                 }
2409                 va = pv->pv_va;
2410                 l3e = pmap_pml3e(pmap, va);
2411                 oldl3e = *l3e;
2412                 if ((oldl3e & PG_RW) != 0) {
2413                         if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) {
2414                                 if ((oldl3e & PG_W) == 0) {
2415                                         /*
2416                                          * Write protect the mapping to a
2417                                          * single page so that a subsequent
2418                                          * write access may repromote.
2419                                          */
2420                                         va += VM_PAGE_TO_PHYS(m) - (oldl3e &
2421                                             PG_PS_FRAME);
2422                                         pte = pmap_l3e_to_pte(l3e, va);
2423                                         oldpte = *pte;
2424                                         if ((oldpte & PG_V) != 0) {
2425                                                 while (!atomic_cmpset_long(pte,
2426                                                     oldpte,
2427                                                         (oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW)))
2428                                                            oldpte = *pte;
2429                                                 vm_page_dirty(m);
2430                                                 pmap_invalidate_page(pmap, va);
2431                                         }
2432                                 }
2433                         }
2434                 }
2435                 PMAP_UNLOCK(pmap);
2436         }
2437         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2438                 pmap = PV_PMAP(pv);
2439                 if (!PMAP_TRYLOCK(pmap)) {
2440                         md_gen = m->md.pv_gen;
2441                         pvh_gen = pvh->pv_gen;
2442                         rw_wunlock(lock);
2443                         PMAP_LOCK(pmap);
2444                         rw_wlock(lock);
2445                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2446                                 PMAP_UNLOCK(pmap);
2447                                 goto restart;
2448                         }
2449                 }
2450                 l3e = pmap_pml3e(pmap, pv->pv_va);
2451                 KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
2452                     " a 2mpage in page %p's pv list", m));
2453                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
2454                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2455                         atomic_clear_long(pte, PG_M);
2456                         pmap_invalidate_page(pmap, pv->pv_va);
2457                 }
2458                 PMAP_UNLOCK(pmap);
2459         }
2460         rw_wunlock(lock);
2461 }
2462
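/*
 * Copy the valid mappings in the range [src_addr, src_addr + len) from
 * src_pmap into dst_pmap at dst_addr.  This routine is only advisory and may
 * stop early without copying every mapping.
 */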
2463 void
2464 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2465     vm_size_t len, vm_offset_t src_addr)
2466 {
2467         struct rwlock *lock;
2468         struct spglist free;
2469         vm_offset_t addr;
2470         vm_offset_t end_addr = src_addr + len;
2471         vm_offset_t va_next;
2472         vm_page_t dst_pdpg, dstmpte, srcmpte;
2473         bool invalidate_all;
2474
2475         CTR6(KTR_PMAP,
2476             "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
2477             __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
2478
2479         if (dst_addr != src_addr)
2480                 return;
2481         lock = NULL;
2482         invalidate_all = false;
2483         if (dst_pmap < src_pmap) {
2484                 PMAP_LOCK(dst_pmap);
2485                 PMAP_LOCK(src_pmap);
2486         } else {
2487                 PMAP_LOCK(src_pmap);
2488                 PMAP_LOCK(dst_pmap);
2489         }
2490
2491         for (addr = src_addr; addr < end_addr; addr = va_next) {
2492                 pml1_entry_t *l1e;
2493                 pml2_entry_t *l2e;
2494                 pml3_entry_t srcptepaddr, *l3e;
2495                 pt_entry_t *src_pte, *dst_pte;
2496
2497                 l1e = pmap_pml1e(src_pmap, addr);
2498                 if ((*l1e & PG_V) == 0) {
2499                         va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2500                         if (va_next < addr)
2501                                 va_next = end_addr;
2502                         continue;
2503                 }
2504
2505                 l2e = pmap_l1e_to_l2e(l1e, addr);
2506                 if ((*l2e & PG_V) == 0) {
2507                         va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2508                         if (va_next < addr)
2509                                 va_next = end_addr;
2510                         continue;
2511                 }
2512
2513                 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2514                 if (va_next < addr)
2515                         va_next = end_addr;
2516
2517                 l3e = pmap_l2e_to_l3e(l2e, addr);
2518                 srcptepaddr = *l3e;
2519                 if (srcptepaddr == 0)
2520                         continue;
2521
2522                 if (srcptepaddr & RPTE_LEAF) {
2523                         if ((addr & L3_PAGE_MASK) != 0 ||
2524                             addr + L3_PAGE_SIZE > end_addr)
2525                                 continue;
2526                         dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
2527                         if (dst_pdpg == NULL)
2528                                 break;
2529                         l3e = (pml3_entry_t *)
2530                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
2531                         l3e = &l3e[pmap_pml3e_index(addr)];
2532                         if (*l3e == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
2533                             pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
2534                             PMAP_ENTER_NORECLAIM, &lock))) {
2535                                 *l3e = srcptepaddr & ~PG_W;
2536                                 pmap_resident_count_inc(dst_pmap,
2537                                     L3_PAGE_SIZE / PAGE_SIZE);
2538                                 atomic_add_long(&pmap_l3e_mappings, 1);
2539                         } else
2540                                 dst_pdpg->ref_count--;
2541                         continue;
2542                 }
2543
2544                 srcptepaddr &= PG_FRAME;
2545                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2546                 KASSERT(srcmpte->ref_count > 0,
2547                     ("pmap_copy: source page table page is unused"));
2548
2549                 if (va_next > end_addr)
2550                         va_next = end_addr;
2551
2552                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
2553                 src_pte = &src_pte[pmap_pte_index(addr)];
2554                 dstmpte = NULL;
2555                 while (addr < va_next) {
2556                         pt_entry_t ptetemp;
2557                         ptetemp = *src_pte;
2558                         /*
2559                          * We only virtual-copy managed pages.
2560                          */
2561                         if ((ptetemp & PG_MANAGED) != 0) {
2562                                 if (dstmpte != NULL &&
2563                                     dstmpte->pindex == pmap_l3e_pindex(addr))
2564                                         dstmpte->ref_count++;
2565                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
2566                                     addr, NULL)) == NULL)
2567                                         goto out;
2568                                 dst_pte = (pt_entry_t *)
2569                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2570                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
2571                                 if (*dst_pte == 0 &&
2572                                     pmap_try_insert_pv_entry(dst_pmap, addr,
2573                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
2574                                     &lock)) {
2575                                         /*
2576                                          * Clear the wired, modified, and
2577                                          * accessed (referenced) bits
2578                                          * during the copy.
2579                                          */
2580                                         *dst_pte = ptetemp & ~(PG_W | PG_M |
2581                                             PG_A);
2582                                         pmap_resident_count_inc(dst_pmap, 1);
2583                                 } else {
2584                                         SLIST_INIT(&free);
2585                                         if (pmap_unwire_ptp(dst_pmap, addr,
2586                                             dstmpte, &free)) {
2587                                                 /*
2588                                                  * Although "addr" is not
2589                                                  * mapped, paging-structure
2590                                                  * caches could nonetheless
2591                                                  * have entries that refer to
2592                                                  * the freed page table pages.
2593                                                  * Invalidate those entries.
2594                                                  */
2595                                                 invalidate_all = true;
2596                                                 vm_page_free_pages_toq(&free,
2597                                                     true);
2598                                         }
2599                                         goto out;
2600                                 }
2601                                 if (dstmpte->ref_count >= srcmpte->ref_count)
2602                                         break;
2603                         }
2604                         addr += PAGE_SIZE;
2605                         if (__predict_false((addr & L3_PAGE_MASK) == 0))
2606                                 src_pte = pmap_pte(src_pmap, addr);
2607                         else
2608                                 src_pte++;
2609                 }
2610         }
2611 out:
2612         if (invalidate_all)
2613                 pmap_invalidate_all(dst_pmap);
2614         if (lock != NULL)
2615                 rw_wunlock(lock);
2616         PMAP_UNLOCK(src_pmap);
2617         PMAP_UNLOCK(dst_pmap);
2618 }
2619
2620 static void
2621 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
2622 {
2623         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2624         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2625
2626         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
2627         /*
2628          * XXX slow
2629          */
2630         bcopy((void *)src, (void *)dst, PAGE_SIZE);
2631 }
2632
2633 static void
2634 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2635     vm_offset_t b_offset, int xfersize)
2636 {
2637
2638         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
2639             a_offset, mb, b_offset, xfersize);
2640         UNIMPLEMENTED();
2641 }
2642
2643 #if VM_NRESERVLEVEL > 0
2644 /*
2645  * Tries to promote the 512, contiguous 4KB page mappings that are within a
2646  * single page table page (PTP) to a single 2MB page mapping.  For promotion
2647  * to occur, two conditions must be met: (1) the 4KB page mappings must map
2648  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2649  * identical characteristics.
2650  */
2651 static int
2652 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
2653     struct rwlock **lockp)
2654 {
2655         pml3_entry_t newpde;
2656         pt_entry_t *firstpte, oldpte, pa, *pte;
2657         vm_page_t mpte;
2658
2659         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2660
2661         /*
2662          * Examine the first PTE in the specified PTP.  Abort if this PTE is
2663          * either invalid, unused, or does not map the first 4KB physical page
2664          * within a 2MB page.
2665          */
2666         firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
2667 setpde:
2668         newpde = *firstpte;
2669         if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2670                 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2671                     " in pmap %p", va, pmap);
2672                 goto fail;
2673         }
2674         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2675                 /*
2676                  * When PG_M is already clear, PG_RW can be cleared without
2677                  * a TLB invalidation.
2678                  */
2679                 if (!atomic_cmpset_long(firstpte, newpde, (newpde | RPTE_EAA_R) & ~RPTE_EAA_W))
2680                         goto setpde;
2681                 newpde &= ~RPTE_EAA_W;
2682         }
2683
2684         /*
2685          * Examine each of the other PTEs in the specified PTP.  Abort if this
2686          * PTE maps an unexpected 4KB physical page or does not have identical
2687          * characteristics to the first PTE.
2688          */
2689         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
2690         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2691 setpte:
2692                 oldpte = *pte;
2693                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2694                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2695                             " in pmap %p", va, pmap);
2696                         goto fail;
2697                 }
2698                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2699                         /*
2700                          * When PG_M is already clear, PG_RW can be cleared
2701                          * without a TLB invalidation.
2702                          */
2703                         if (!atomic_cmpset_long(pte, oldpte, (oldpte | RPTE_EAA_R) & ~RPTE_EAA_W))
2704                                 goto setpte;
2705                         oldpte &= ~RPTE_EAA_W;
2706                         CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
2707                             " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
2708                             (va & ~L3_PAGE_MASK), pmap);
2709                 }
2710                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2711                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2712                             " in pmap %p", va, pmap);
2713                         goto fail;
2714                 }
2715                 pa -= PAGE_SIZE;
2716         }
2717
2718         /*
2719          * Save the page table page in its current state until the PDE
2720          * mapping the superpage is demoted by pmap_demote_l3e() or
2721          * destroyed by pmap_remove_l3e().
2722          */
2723         mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
2724         KASSERT(mpte >= vm_page_array &&
2725             mpte < &vm_page_array[vm_page_array_size],
2726             ("pmap_promote_l3e: page table page is out of range"));
2727         KASSERT(mpte->pindex == pmap_l3e_pindex(va),
2728             ("pmap_promote_l3e: page table page's pindex is wrong"));
2729         if (pmap_insert_pt_page(pmap, mpte)) {
2730                 CTR2(KTR_PMAP,
2731                     "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
2732                     pmap);
2733                 goto fail;
2734         }
2735
2736         /*
2737          * Promote the pv entries.
2738          */
2739         if ((newpde & PG_MANAGED) != 0)
2740                 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
2741
2742         pte_store(pde, PG_PROMOTED | newpde);
2743         atomic_add_long(&pmap_l3e_promotions, 1);
2744         CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
2745             " in pmap %p", va, pmap);
2746         return (0);
2747  fail:
2748         atomic_add_long(&pmap_l3e_p_failures, 1);
2749         return (KERN_FAILURE);
2750 }
2751 #endif /* VM_NRESERVLEVEL > 0 */
2752
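/*
 * Insert the given physical page at the specified virtual address in the
 * target pmap with the protection requested, wiring the mapping if
 * PMAP_ENTER_WIRED is set.  A psind of 1 requests a 2MB superpage mapping.
 */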
2753 int
2754 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
2755     vm_prot_t prot, u_int flags, int8_t psind)
2756 {
2757         struct rwlock *lock;
2758         pml3_entry_t *l3e;
2759         pt_entry_t *pte;
2760         pt_entry_t newpte, origpte;
2761         pv_entry_t pv;
2762         vm_paddr_t opa, pa;
2763         vm_page_t mpte, om;
2764         int rv, retrycount;
2765         boolean_t nosleep, invalidate_all, invalidate_page;
2766
2767         va = trunc_page(va);
2768         retrycount = 0;
2769         invalidate_page = invalidate_all = false;
2770         CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
2771             m, prot, flags, psind);
2772         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2773         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
2774             va >= kmi.clean_eva,
2775             ("pmap_enter: managed mapping within the clean submap"));
2776         if ((m->oflags & VPO_UNMANAGED) == 0)
2777                 VM_PAGE_OBJECT_BUSY_ASSERT(m);
2778
2779         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
2780             ("pmap_enter: flags %u has reserved bits set", flags));
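        /*
         * Assemble the new PTE: the page's physical address plus the
         * valid and leaf bits, EAA permission bits derived from "prot",
         * and referenced/modified bits implied by the access type given
         * in "flags".
         */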
2781         pa = VM_PAGE_TO_PHYS(m);
2782         newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
2783         if ((flags & VM_PROT_WRITE) != 0)
2784                 newpte |= PG_M;
2785         if ((flags & VM_PROT_READ) != 0)
2786                 newpte |= PG_A;
2787         if (prot & VM_PROT_READ)
2788                 newpte |= RPTE_EAA_R;
2789         if ((prot & VM_PROT_WRITE) != 0)
2790                 newpte |= RPTE_EAA_W;
2791         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
2792             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
2793
2794         if (prot & VM_PROT_EXECUTE)
2795                 newpte |= PG_X;
2796         if ((flags & PMAP_ENTER_WIRED) != 0)
2797                 newpte |= PG_W;
2798         if (va >= DMAP_MIN_ADDRESS)
2799                 newpte |= RPTE_EAA_P;
2800         newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
2801         /*
2802          * Set modified bit gratuitously for writeable mappings if
2803          * the page is unmanaged. We do not want to take a fault
2804          * to do the dirty bit accounting for these mappings.
2805          */
2806         if ((m->oflags & VPO_UNMANAGED) != 0) {
2807                 if ((newpte & PG_RW) != 0)
2808                         newpte |= PG_M;
2809         } else
2810                 newpte |= PG_MANAGED;
2811
2812         lock = NULL;
2813         PMAP_LOCK(pmap);
2814         if (psind == 1) {
2815                 /* Assert the required virtual and physical alignment. */
2816                 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
2817                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2818                 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
2819                 goto out;
2820         }
2821         mpte = NULL;
2822
2823         /*
2824          * In the case that a page table page is not
2825          * resident, we are creating it here.
2826          */
2827 retry:
2828         l3e = pmap_pml3e(pmap, va);
2829         if (l3e != NULL && (*l3e & PG_V) != 0 && ((*l3e & RPTE_LEAF) == 0 ||
2830             pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
2831                 pte = pmap_l3e_to_pte(l3e, va);
2832                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
2833                         mpte = PHYS_TO_VM_PAGE(*l3e & PG_FRAME);
2834                         mpte->ref_count++;
2835                 }
2836         } else if (va < VM_MAXUSER_ADDRESS) {
2837                 /*
2838                  * Here if the pte page isn't mapped, or if it has been
2839                  * deallocated.
2840                  */
2841                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2842                 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
2843                     nosleep ? NULL : &lock);
2844                 if (mpte == NULL && nosleep) {
2845                         rv = KERN_RESOURCE_SHORTAGE;
2846                         goto out;
2847                 }
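                /*
                 * A page table page has been allocated (or may have been
                 * installed while we slept), so retry the lookup from the
                 * top.  The retry count bounds a potential livelock, and
                 * invalidate_all is set here, presumably so that stale
                 * page-walk-cache entries are flushed before the new page
                 * table page is used.
                 */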
2848                 if (__predict_false(retrycount++ == 6))
2849                         panic("too many retries");
2850                 invalidate_all = true;
2851                 goto retry;
2852         } else
2853                 panic("pmap_enter: invalid page directory va=%#lx", va);
2854
2855         origpte = *pte;
2856         pv = NULL;
2857
2858         /*
2859          * Is the specified virtual address already mapped?
2860          */
2861         if ((origpte & PG_V) != 0) {
2862 #ifdef INVARIANTS
2863                 if (VERBOSE_PMAP || pmap_logging) {
2864                         printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
2865                             " asid=%lu curpid=%d name=%s origpte0x%lx\n",
2866                             pmap, va, m, prot, flags, psind, pmap->pm_pid,
2867                             curproc->p_pid, curproc->p_comm, origpte);
2868                         pmap_pte_walk(pmap->pm_pml1, va);
2869                 }
2870 #endif
2871                 /*
2872                  * Wiring change, just update stats. We don't worry about
2873                  * wiring PT pages as they remain resident as long as there
2874                  * are valid mappings in them. Hence, if a user page is wired,
2875                  * the PT page will be also.
2876                  */
2877                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
2878                         pmap->pm_stats.wired_count++;
2879                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
2880                         pmap->pm_stats.wired_count--;
2881
2882                 /*
2883                  * Remove the extra PT page reference.
2884                  */
2885                 if (mpte != NULL) {
2886                         mpte->ref_count--;
2887                         KASSERT(mpte->ref_count > 0,
2888                             ("pmap_enter: missing reference to page table page,"
2889                              " va: 0x%lx", va));
2890                 }
2891
2892                 /*
2893                  * Has the physical page changed?
2894                  */
2895                 opa = origpte & PG_FRAME;
2896                 if (opa == pa) {
2897                         /*
2898                          * No, might be a protection or wiring change.
2899                          */
2900                         if ((origpte & PG_MANAGED) != 0 &&
2901                             (newpte & PG_RW) != 0)
2902                                 vm_page_aflag_set(m, PGA_WRITEABLE);
2903                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
2904                                 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
2905                                         if (!atomic_cmpset_long(pte, origpte, newpte))
2906                                                 goto retry;
2907                                         if ((newpte & PG_M) != (origpte & PG_M))
2908                                                 vm_page_dirty(m);
2909                                         if ((newpte & PG_A) != (origpte & PG_A))
2910                                                 vm_page_aflag_set(m, PGA_REFERENCED);
2911                                         ptesync();
2912                                 } else
2913                                         invalidate_all = true;
2914                                 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
2915                                         goto unchanged;
2916                         }
2917                         goto validate;
2918                 }
2919
2920                 /*
2921                  * The physical page has changed.  Temporarily invalidate
2922                  * the mapping.  This ensures that all threads sharing the
2923                  * pmap keep a consistent view of the mapping, which is
2924                  * necessary for the correct handling of COW faults.  It
2925                  * also permits reuse of the old mapping's PV entry,
2926                  * avoiding an allocation.
2927                  *
2928                  * For consistency, handle unmanaged mappings the same way.
2929                  */
2930                 origpte = pte_load_clear(pte);
2931                 KASSERT((origpte & PG_FRAME) == opa,
2932                     ("pmap_enter: unexpected pa update for %#lx", va));
2933                 if ((origpte & PG_MANAGED) != 0) {
2934                         om = PHYS_TO_VM_PAGE(opa);
2935
2936                         /*
2937                          * The pmap lock is sufficient to synchronize with
2938                          * concurrent calls to pmap_page_test_mappings() and
2939                          * pmap_ts_referenced().
2940                          */
2941                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2942                                 vm_page_dirty(om);
2943                         if ((origpte & PG_A) != 0)
2944                                 vm_page_aflag_set(om, PGA_REFERENCED);
2945                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
2946                         pv = pmap_pvh_remove(&om->md, pmap, va);
2947                         if ((newpte & PG_MANAGED) == 0)
2948                                 free_pv_entry(pmap, pv);
2949 #ifdef INVARIANTS
2950                         else if (origpte & PG_MANAGED) {
2951                                 if (pv == NULL) {
2952                                         pmap_page_print_mappings(om);
2953                                         MPASS(pv != NULL);
2954                                 }
2955                         }
2956 #endif
2957                         if ((om->a.flags & PGA_WRITEABLE) != 0 &&
2958                             TAILQ_EMPTY(&om->md.pv_list) &&
2959                             ((om->flags & PG_FICTITIOUS) != 0 ||
2960                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
2961                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
2962                 }
2963                 if ((origpte & PG_A) != 0)
2964                         invalidate_page = true;
2965                 origpte = 0;
2966         } else {
2967                 if (pmap != kernel_pmap) {
2968 #ifdef INVARIANTS
2969                         if (VERBOSE_PMAP || pmap_logging)
2970                                 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
2971                                     pmap, va, m, prot, flags, psind,
2972                                     pmap->pm_pid, curproc->p_pid,
2973                                     curproc->p_comm);
2974 #endif
2975                 }
2976
2977                 /*
2978                  * Increment the counters.
2979                  */
2980                 if ((newpte & PG_W) != 0)
2981                         pmap->pm_stats.wired_count++;
2982                 pmap_resident_count_inc(pmap, 1);
2983         }
2984
2985         /*
2986          * Enter on the PV list if part of our managed memory.
2987          */
2988         if ((newpte & PG_MANAGED) != 0) {
2989                 if (pv == NULL) {
2990                         pv = get_pv_entry(pmap, &lock);
2991                         pv->pv_va = va;
2992                 }
2993 #ifdef VERBOSE_PV
2994                 else
2995                         printf("reassigning pv: %p to pmap: %p\n",
2996                                    pv, pmap);
2997 #endif
2998                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
2999                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3000                 m->md.pv_gen++;
3001                 if ((newpte & PG_RW) != 0)
3002                         vm_page_aflag_set(m, PGA_WRITEABLE);
3003         }
3004
3005         /*
3006          * Update the PTE.
3007          */
3008         if ((origpte & PG_V) != 0) {
3009 validate:
3010                 origpte = pte_load_store(pte, newpte);
3011                 KASSERT((origpte & PG_FRAME) == pa,
3012                     ("pmap_enter: unexpected pa update for %#lx", va));
3013                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
3014                     (PG_M | PG_RW)) {
3015                         if ((origpte & PG_MANAGED) != 0)
3016                                 vm_page_dirty(m);
3017                         invalidate_page = true;
3018
3019                         /*
3020                          * Although the PTE may still have PG_RW set, TLB
3021                          * invalidation may nonetheless be required because
3022                          * the PTE no longer has PG_M set.
3023                          */
3024                 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
3025                         /*
3026                          * Removing capabilities requires invalidation on POWER
3027                          */
3028                         invalidate_page = true;
3029                         goto unchanged;
3030                 }
3031                 if ((origpte & PG_A) != 0)
3032                         invalidate_page = true;
3033         } else {
3034                 pte_store(pte, newpte);
3035                 ptesync();
3036         }
3037 unchanged:
3038
3039 #if VM_NRESERVLEVEL > 0
3040         /*
3041          * If both the page table page and the reservation are fully
3042          * populated, then attempt promotion.
3043          */
3044         if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
3045             mmu_radix_ps_enabled(pmap) &&
3046             (m->flags & PG_FICTITIOUS) == 0 &&
3047             vm_reserv_level_iffullpop(m) == 0 &&
3048                 pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
3049                 invalidate_all = true;
3050 #endif
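        /*
         * Perform whatever TLB maintenance the updates above require:
         * either a full flush of this pmap's translations or an
         * invalidation of the single page at "va".
         */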
3051         if (invalidate_all)
3052                 pmap_invalidate_all(pmap);
3053         else if (invalidate_page)
3054                 pmap_invalidate_page(pmap, va);
3055
3056         rv = KERN_SUCCESS;
3057 out:
3058         if (lock != NULL)
3059                 rw_wunlock(lock);
3060         PMAP_UNLOCK(pmap);
3061
3062         return (rv);
3063 }
3064
3065 /*
3066  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
3067  * if successful.  Returns false if (1) a page table page cannot be allocated
3068  * without sleeping, (2) a mapping already exists at the specified virtual
3069  * address, or (3) a PV entry cannot be allocated without reclaiming another
3070  * PV entry.
3071  */
3072 static bool
3073 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3074     struct rwlock **lockp)
3075 {
3076         pml3_entry_t newpde;
3077
3078         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3079         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
3080             RPTE_LEAF | PG_V;
3081         if ((m->oflags & VPO_UNMANAGED) == 0)
3082                 newpde |= PG_MANAGED;
3083         if (prot & VM_PROT_EXECUTE)
3084                 newpde |= PG_X;
3085         if (prot & VM_PROT_READ)
3086                 newpde |= RPTE_EAA_R;
3087         if (va >= DMAP_MIN_ADDRESS)
3088                 newpde |= RPTE_EAA_P;
3089         return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
3090             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
3091             KERN_SUCCESS);
3092 }
3093
3094 /*
3095  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3096  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3097  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3098  * a mapping already exists at the specified virtual address.  Returns
3099  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3100  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
3101  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3102  *
3103  * The parameter "m" is only used when creating a managed, writeable mapping.
3104  */
3105 static int
3106 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
3107     vm_page_t m, struct rwlock **lockp)
3108 {
3109         struct spglist free;
3110         pml3_entry_t oldl3e, *l3e;
3111         vm_page_t mt, pdpg;
3112
3113         KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
3114             ("pmap_enter_pde: newpde is missing PG_M"));
3115         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3116
3117         if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3118             NULL : lockp)) == NULL) {
3119                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3120                     " in pmap %p", va, pmap);
3121                 return (KERN_RESOURCE_SHORTAGE);
3122         }
3123         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3124         l3e = &l3e[pmap_pml3e_index(va)];
3125         oldl3e = *l3e;
3126         if ((oldl3e & PG_V) != 0) {
3127                 KASSERT(pdpg->ref_count > 1,
3128                     ("pmap_enter_pde: pdpg's wire count is too low"));
3129                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3130                         pdpg->ref_count--;
3131                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3132                             " in pmap %p", va, pmap);
3133                         return (KERN_FAILURE);
3134                 }
3135                 /* Break the existing mapping(s). */
3136                 SLIST_INIT(&free);
3137                 if ((oldl3e & RPTE_LEAF) != 0) {
3138                         /*
3139                          * The reference to the PD page that was acquired by
3140                          * pmap_allocl3e() ensures that it won't be freed.
3141                          * However, if the PDE resulted from a promotion, then
3142                          * a reserved PT page could be freed.
3143                          */
3144                         (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
3145                 } else {
3146                         if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
3147                             &free, lockp))
3148                                pmap_invalidate_all(pmap);
3149                 }
3150                 vm_page_free_pages_toq(&free, true);
3151                 if (va >= VM_MAXUSER_ADDRESS) {
3152                         mt = PHYS_TO_VM_PAGE(*l3e & PG_FRAME);
3153                         if (pmap_insert_pt_page(pmap, mt)) {
3154                                 /*
3155                                  * XXX Currently, this can't happen because
3156                                  * we do not perform pmap_enter(psind == 1)
3157                                  * on the kernel pmap.
3158                                  */
3159                                 panic("pmap_enter_pde: trie insert failed");
3160                         }
3161                 } else
3162                         KASSERT(*l3e == 0, ("pmap_enter_pde: non-zero pde %p",
3163                             l3e));
3164         }
3165         if ((newpde & PG_MANAGED) != 0) {
3166                 /*
3167                  * Abort this mapping if its PV entry could not be created.
3168                  */
3169                 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
3170                         SLIST_INIT(&free);
3171                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
3172                                 /*
3173                                  * Although "va" is not mapped, paging-
3174                                  * structure caches could nonetheless have
3175                                  * entries that refer to the freed page table
3176                                  * pages.  Invalidate those entries.
3177                                  */
3178                                 pmap_invalidate_page(pmap, va);
3179                                 vm_page_free_pages_toq(&free, true);
3180                         }
3181                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3182                             " in pmap %p", va, pmap);
3183                         return (KERN_RESOURCE_SHORTAGE);
3184                 }
3185                 if ((newpde & PG_RW) != 0) {
3186                         for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
3187                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
3188                 }
3189         }
3190
3191         /*
3192          * Increment counters.
3193          */
3194         if ((newpde & PG_W) != 0)
3195                 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
3196         pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3197
3198         /*
3199          * Map the superpage.  (This is not a promoted mapping; there will not
3200          * be any lingering 4KB page mappings in the TLB.)
3201          */
3202         pte_store(l3e, newpde);
3203
3204         atomic_add_long(&pmap_l3e_mappings, 1);
3205         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3206             " in pmap %p", va, pmap);
3207         return (KERN_SUCCESS);
3208 }
3209
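/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with "m_start", which is mapped at virtual
 * address "start"; each subsequent page is mapped at a virtual
 * address offset from "start" by the same amount that the page is
 * offset from "m_start" within the object.  A 2MB page mapping is
 * used whenever the alignment, remaining length, and superpage
 * reservation permit.
 */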
3210 void
3211 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
3212     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
3213 {
3214
3215         struct rwlock *lock;
3216         vm_offset_t va;
3217         vm_page_t m, mpte;
3218         vm_pindex_t diff, psize;
3219         bool invalidate;
3220         VM_OBJECT_ASSERT_LOCKED(m_start->object);
3221
3222         CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
3223             end, m_start, prot);
3224
3225         invalidate = false;
3226         psize = atop(end - start);
3227         mpte = NULL;
3228         m = m_start;
3229         lock = NULL;
3230         PMAP_LOCK(pmap);
3231         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3232                 va = start + ptoa(diff);
3233                 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
3234                     m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
3235                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
3236                         m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1];
3237                 else
3238                         mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
3239                             mpte, &lock, &invalidate);
3240                 m = TAILQ_NEXT(m, listq);
3241         }
3242         ptesync();
3243         if (lock != NULL)
3244                 rw_wunlock(lock);
3245         if (invalidate)
3246                 pmap_invalidate_all(pmap);
3247         PMAP_UNLOCK(pmap);
3248 }
3249
3250 static vm_page_t
3251 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3252     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
3253 {
3254         struct spglist free;
3255         pt_entry_t *pte;
3256         vm_paddr_t pa;
3257
3258         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3259             (m->oflags & VPO_UNMANAGED) != 0,
3260             ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
3261         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3262
3263         /*
3264          * In the case that a page table page is not
3265          * resident, we are creating it here.
3266          */
3267         if (va < VM_MAXUSER_ADDRESS) {
3268                 vm_pindex_t ptepindex;
3269                 pml3_entry_t *ptepa;
3270
3271                 /*
3272                  * Calculate pagetable page index
3273                  */
3274                 ptepindex = pmap_l3e_pindex(va);
3275                 if (mpte && (mpte->pindex == ptepindex)) {
3276                         mpte->ref_count++;
3277                 } else {
3278                         /*
3279                          * Get the page directory entry
3280                          */
3281                         ptepa = pmap_pml3e(pmap, va);
3282
3283                         /*
3284                          * If the page table page is mapped, we just increment
3285                          * the hold count, and activate it.  Otherwise, we
3286                          * attempt to allocate a page table page.  If this
3287                          * attempt fails, we don't retry.  Instead, we give up.
3288                          */
3289                         if (ptepa && (*ptepa & PG_V) != 0) {
3290                                 if (*ptepa & RPTE_LEAF)
3291                                         return (NULL);
3292                                 mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
3293                                 mpte->ref_count++;
3294                         } else {
3295                                 /*
3296                                  * Pass NULL instead of the PV list lock
3297                                  * pointer, because we don't intend to sleep.
3298                                  */
3299                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3300                                 if (mpte == NULL)
3301                                         return (mpte);
3302                         }
3303                 }
3304                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3305                 pte = &pte[pmap_pte_index(va)];
3306         } else {
3307                 mpte = NULL;
3308                 pte = pmap_pte(pmap, va);
3309         }
3310         if (*pte) {
3311                 if (mpte != NULL) {
3312                         mpte->ref_count--;
3313                         mpte = NULL;
3314                 }
3315                 return (mpte);
3316         }
3317
3318         /*
3319          * Enter on the PV list if part of our managed memory.
3320          */
3321         if ((m->oflags & VPO_UNMANAGED) == 0 &&
3322             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3323                 if (mpte != NULL) {
3324                         SLIST_INIT(&free);
3325                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3326                                 /*
3327                                  * Although "va" is not mapped, paging-
3328                                  * structure caches could nonetheless have
3329                                  * entries that refer to the freed page table
3330                                  * pages.  Invalidate those entries.
3331                                  */
3332                                 *invalidate = true;
3333                                 vm_page_free_pages_toq(&free, true);
3334                         }
3335                         mpte = NULL;
3336                 }
3337                 return (mpte);
3338         }
3339
3340         /*
3341          * Increment counters
3342          */
3343         pmap_resident_count_inc(pmap, 1);
3344
3345         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
3346         if (prot & VM_PROT_EXECUTE)
3347                 pa |= PG_X;
3348         else
3349                 pa |= RPTE_EAA_R;
3350         if ((m->oflags & VPO_UNMANAGED) == 0)
3351                 pa |= PG_MANAGED;
3352
3353         pte_store(pte, pa);
3354         return (mpte);
3355 }
3356
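/*
 * A lighter-weight variant of pmap_enter(): the mapping created is
 * read-only (or execute-only when VM_PROT_EXECUTE is requested), is
 * never wired, and the routine never sleeps for a page table page or
 * a PV entry.
 */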
3357 void
3358 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
3359     vm_prot_t prot)
3360 {
3361         struct rwlock *lock;
3362         bool invalidate;
3363
3364         lock = NULL;
3365         invalidate = false;
3366         PMAP_LOCK(pmap);
3367         mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
3368             &invalidate);
3369         ptesync();
3370         if (lock != NULL)
3371                 rw_wunlock(lock);
3372         if (invalidate)
3373                 pmap_invalidate_all(pmap);
3374         PMAP_UNLOCK(pmap);
3375 }
3376
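/*
 *      pmap_extract:
 *
 *      Extract the physical page address associated with the given
 *      pmap/virtual address pair.
 */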
3377 vm_paddr_t
3378 mmu_radix_extract(pmap_t pmap, vm_offset_t va)
3379 {
3380         pml3_entry_t *l3e;
3381         pt_entry_t *pte;
3382         vm_paddr_t pa;
3383
3384         l3e = pmap_pml3e(pmap, va);
3385         if (__predict_false(l3e == NULL))
3386                 return (0);
3387         if (*l3e & RPTE_LEAF) {
3388                 pa = (*l3e & PG_PS_FRAME) | (va & L3_PAGE_MASK);
3390         } else {
3391                 /*
3392                  * Beware of a concurrent promotion that changes the
3393                  * PDE at this point!  For example, vtopte() must not
3394                  * be used to access the PTE because it would use the
3395                  * new PDE.  It is, however, safe to use the old PDE
3396                  * because the page table page is preserved by the
3397                  * promotion.
3398                  */
3399                 pte = pmap_l3e_to_pte(l3e, va);
3400                 if (__predict_false(pte == NULL))
3401                         return (0);
3402                 pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
3405         }
3406         return (pa);
3407 }
3408
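/*
 *      pmap_extract_and_hold:
 *
 *      Atomically extract and hold the physical page with the given
 *      pmap and virtual address pair if that mapping permits the given
 *      protection.
 */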
3409 vm_page_t
3410 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3411 {
3412         pml3_entry_t l3e, *l3ep;
3413         pt_entry_t pte;
3414         vm_paddr_t pa;
3415         vm_page_t m;
3416
3417         pa = 0;
3418         m = NULL;
3419         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
3420         PMAP_LOCK(pmap);
3421         l3ep = pmap_pml3e(pmap, va);
3422         if (l3ep != NULL && (l3e = *l3ep)) {
3423                 if (l3e & RPTE_LEAF) {
3424                         if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
3425                                 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
3426                                     (va & L3_PAGE_MASK));
3427                 } else {
3428                         pte = *pmap_l3e_to_pte(l3ep, va);
3429                         if ((pte & PG_V) &&
3430                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
3431                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3432                 }
3433                 if (m != NULL && !vm_page_wire_mapped(m))
3434                         m = NULL;
3435         }
3436         PMAP_UNLOCK(pmap);
3437         return (m);
3438 }
3439
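/*
 * Grow the kernel page table to cover "addr", allocating and zeroing
 * new page directory and page table pages as needed.
 */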
3440 static void
3441 mmu_radix_growkernel(vm_offset_t addr)
3442 {
3443         vm_paddr_t paddr;
3444         vm_page_t nkpg;
3445         pml3_entry_t *l3e;
3446         pml2_entry_t *l2e;
3447
3448         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
3449         if (VM_MIN_KERNEL_ADDRESS < addr &&
3450                 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
3451                 return;
3452
3453         addr = roundup2(addr, L3_PAGE_SIZE);
3454         if (addr - 1 >= vm_map_max(kernel_map))
3455                 addr = vm_map_max(kernel_map);
3456         while (kernel_vm_end < addr) {
3457                 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
3458                 if ((*l2e & PG_V) == 0) {
3459                         /* We need a new PDP entry */
3460                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT,
3461                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3462                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3463                         if (nkpg == NULL)
3464                                 panic("pmap_growkernel: no memory to grow kernel");
3465                         if ((nkpg->flags & PG_ZERO) == 0)
3466                                 mmu_radix_zero_page(nkpg);
3467                         paddr = VM_PAGE_TO_PHYS(nkpg);
3468                         pde_store(l2e, paddr);
3469                         continue; /* try again */
3470                 }
3471                 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
3472                 if ((*l3e & PG_V) != 0) {
3473                         kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3474                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3475                                 kernel_vm_end = vm_map_max(kernel_map);
3476                                 break;
3477                         }
3478                         continue;
3479                 }
3480
3481                 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end),
3482                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3483                     VM_ALLOC_ZERO);
3484                 if (nkpg == NULL)
3485                         panic("pmap_growkernel: no memory to grow kernel");
3486                 if ((nkpg->flags & PG_ZERO) == 0)
3487                         mmu_radix_zero_page(nkpg);
3488                 paddr = VM_PAGE_TO_PHYS(nkpg);
3489                 pde_store(l3e, paddr);
3490
3491                 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3492                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3493                         kernel_vm_end = vm_map_max(kernel_map);
3494                         break;
3495                 }
3496         }
3497         ptesync();
3498 }
3499
3500 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
3501 static uma_zone_t zone_radix_pgd;
3502
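/*
 * UMA import/release routines backing the radix page directory cache:
 * each PGD is a physically contiguous, RADIX_PGD_SIZE-aligned
 * allocation of RADIX_PGD_SIZE bytes, handed out as its DMAP address.
 */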
3503 static int
3504 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
3505     int flags)
3506 {
3507
3508         for (int i = 0; i < count; i++) {
3509                 vm_page_t m = vm_page_alloc_contig(NULL, 0,
3510                     VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3511                     VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE,
3512                     0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
3513                     VM_MEMATTR_DEFAULT);
3514                 /* XXX zero on alloc here so we don't have to later */
3515                 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3516         }
3517         return (count);
3518 }
3519
3520 static void
3521 radix_pgd_release(void *arg __unused, void **store, int count)
3522 {
3523         vm_page_t m;
3524         struct spglist free;
3525         int page_count;
3526
3527         SLIST_INIT(&free);
3528         page_count = RADIX_PGD_SIZE/PAGE_SIZE;
3529
3530         for (int i = 0; i < count; i++) {
3531                 /*
3532                  * XXX selectively remove dmap and KVA entries so we don't
3533                  * need to bzero
3534                  */
3535                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
3536                 for (int j = page_count-1; j >= 0; j--) {
3537                         vm_page_unwire_noq(&m[j]);
3538                         SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
3539                 }
3540                 vm_page_free_pages_toq(&free, false);
3541         }
3542 }
3543
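/*
 *      Initialize the pmap module.
 *      Called during VM startup to initialize any structures that the
 *      pmap system needs to map virtual memory.
 */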
3544 static void
3545 mmu_radix_init(void)
3546 {
3547         vm_page_t mpte;
3548         vm_size_t s;
3549         int error, i, pv_npg;
3550
3551         /* L1TF, reserve page @0 unconditionally */
3552         vm_page_blacklist_add(0, bootverbose);
3553
3554         zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
3555                 RADIX_PGD_SIZE, NULL, NULL,
3556 #ifdef INVARIANTS
3557             trash_init, trash_fini,
3558 #else
3559             NULL, NULL,
3560 #endif
3561                 radix_pgd_import, radix_pgd_release,
3562                 NULL, UMA_ZONE_NOBUCKET);
3563
3564         /*
3565          * Initialize the vm page array entries for the kernel pmap's
3566          * page table pages.
3567          */
3568         PMAP_LOCK(kernel_pmap);
3569         for (i = 0; i < nkpt; i++) {
3570                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
3571                 KASSERT(mpte >= vm_page_array &&
3572                     mpte < &vm_page_array[vm_page_array_size],
3573                     ("pmap_init: page table page is out of range, size: %lu",
3574                      vm_page_array_size));
3575                 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
3576                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
3577                 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
3578                 //pmap_insert_pt_page(kernel_pmap, mpte);
3579                 mpte->ref_count = 1;
3580         }
3581         PMAP_UNLOCK(kernel_pmap);
3582         vm_wire_add(nkpt);
3583
3584         CTR1(KTR_PMAP, "%s()", __func__);
3585         TAILQ_INIT(&pv_dummy.pv_list);
3586
3587         /*
3588          * Are large page mappings enabled?
3589          */
3590         TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
3591         if (pg_ps_enabled) {
3592                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
3593                     ("pmap_init: can't assign to pagesizes[1]"));
3594                 pagesizes[1] = L3_PAGE_SIZE;
3595         }
3596
3597         /*
3598          * Initialize the pv chunk list mutex.
3599          */
3600         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
3601
3602         /*
3603          * Initialize the pool of pv list locks.
3604          */
3605         for (i = 0; i < NPV_LIST_LOCKS; i++)
3606                 rw_init(&pv_list_locks[i], "pmap pv list");
3607
3608         /*
3609          * Calculate the size of the pv head table for superpages.
3610          */
3611         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
3612
3613         /*
3614          * Allocate memory for the pv head table for superpages.
3615          */
3616         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
3617         s = round_page(s);
3618         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
3619         for (i = 0; i < pv_npg; i++)
3620                 TAILQ_INIT(&pv_table[i].pv_list);
3621         TAILQ_INIT(&pv_dummy.pv_list);
3622
3623         pmap_initialized = 1;
3624         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
3625         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3626             (vmem_addr_t *)&qframe);
3627
3628         if (error != 0)
3629                 panic("qframe allocation failed");
3630         asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
3631             1, 1, M_WAITOK);
3632 }
3633
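/*
 * Returns TRUE if any mapping of page "m" has the accessed and/or
 * modified attributes selected by "accessed" and "modified"; both 4KB
 * and 2MB mappings are examined under the page's PV list lock.
 */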
3634 static boolean_t
3635 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3636 {
3637         struct rwlock *lock;
3638         pv_entry_t pv;
3639         struct md_page *pvh;
3640         pt_entry_t *pte, mask;
3641         pmap_t pmap;
3642         int md_gen, pvh_gen;
3643         boolean_t rv;
3644
3645         rv = FALSE;
3646         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3647         rw_rlock(lock);
3648 restart:
3649         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
3650                 pmap = PV_PMAP(pv);
3651                 if (!PMAP_TRYLOCK(pmap)) {
3652                         md_gen = m->md.pv_gen;
3653                         rw_runlock(lock);
3654                         PMAP_LOCK(pmap);
3655                         rw_rlock(lock);
3656                         if (md_gen != m->md.pv_gen) {
3657                                 PMAP_UNLOCK(pmap);
3658                                 goto restart;
3659                         }
3660                 }
3661                 pte = pmap_pte(pmap, pv->pv_va);
3662                 mask = 0;
3663                 if (modified)
3664                         mask |= PG_RW | PG_M;
3665                 if (accessed)
3666                         mask |= PG_V | PG_A;
3667                 rv = (*pte & mask) == mask;
3668                 PMAP_UNLOCK(pmap);
3669                 if (rv)
3670                         goto out;
3671         }
3672         if ((m->flags & PG_FICTITIOUS) == 0) {
3673                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3674                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
3675                         pmap = PV_PMAP(pv);
3676                         if (!PMAP_TRYLOCK(pmap)) {
3677                                 md_gen = m->md.pv_gen;
3678                                 pvh_gen = pvh->pv_gen;
3679                                 rw_runlock(lock);
3680                                 PMAP_LOCK(pmap);
3681                                 rw_rlock(lock);
3682                                 if (md_gen != m->md.pv_gen ||
3683                                     pvh_gen != pvh->pv_gen) {
3684                                         PMAP_UNLOCK(pmap);
3685                                         goto restart;
3686                                 }
3687                         }
3688                         pte = pmap_pml3e(pmap, pv->pv_va);
3689                         mask = 0;
3690                         if (modified)
3691                                 mask |= PG_RW | PG_M;
3692                         if (accessed)
3693                                 mask |= PG_V | PG_A;
3694                         rv = (*pte & mask) == mask;
3695                         PMAP_UNLOCK(pmap);
3696                         if (rv)
3697                                 goto out;
3698                 }
3699         }
3700 out:
3701         rw_runlock(lock);
3702         return (rv);
3703 }
3704
3705 /*
3706  *      pmap_is_modified:
3707  *
3708  *      Return whether or not the specified physical page was modified
3709  *      in any physical maps.
3710  */
3711 boolean_t
3712 mmu_radix_is_modified(vm_page_t m)
3713 {
3714
3715         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3716             ("pmap_is_modified: page %p is not managed", m));
3717
3718         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3719         /*
3720          * If the page is not busied then this check is racy.
3721          */
3722         if (!pmap_page_is_write_mapped(m))
3723                 return (FALSE);
3724         return (pmap_page_test_mappings(m, FALSE, TRUE));
3725 }
3726
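/*
 *      pmap_is_prefaultable:
 *
 *      Return whether or not the specified virtual address is eligible
 *      for prefault.
 */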
3727 boolean_t
3728 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3729 {
3730         pml3_entry_t *l3e;
3731         pt_entry_t *pte;
3732         boolean_t rv;
3733
3734         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
3735         rv = FALSE;
3736         PMAP_LOCK(pmap);
3737         l3e = pmap_pml3e(pmap, addr);
3738         if (l3e != NULL && (*l3e & (RPTE_LEAF | PG_V)) == PG_V) {
3739                 pte = pmap_l3e_to_pte(l3e, addr);
3740                 rv = (*pte & PG_V) == 0;
3741         }
3742         PMAP_UNLOCK(pmap);
3743         return (rv);
3744 }
3745
3746 boolean_t
3747 mmu_radix_is_referenced(vm_page_t m)
3748 {
3749         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3750             ("pmap_is_referenced: page %p is not managed", m));
3751         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3752         return (pmap_page_test_mappings(m, TRUE, FALSE));
3753 }
3754
3755 /*
3756  *      pmap_ts_referenced:
3757  *
3758  *      Return a count of reference bits for a page, clearing those bits.
3759  *      It is not necessary for every reference bit to be cleared, but it
3760  *      is necessary that 0 only be returned when there are truly no
3761  *      reference bits set.
3762  *
3763  *      As an optimization, update the page's dirty field if a modified bit is
3764  *      found while counting reference bits.  This opportunistic update can be
3765  *      performed at low cost and can eliminate the need for some future calls
3766  *      to pmap_is_modified().  However, since this function stops after
3767  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3768  *      dirty pages.  Those dirty pages will only be detected by a future call
3769  *      to pmap_is_modified().
3770  *
3771  *      A DI block is not needed within this function, because
3772  *      invalidations are performed before the PV list lock is
3773  *      released.
3774  */
3775 int
3776 mmu_radix_ts_referenced(vm_page_t m)
3777 {
3778         struct md_page *pvh;
3779         pv_entry_t pv, pvf;
3780         pmap_t pmap;
3781         struct rwlock *lock;
3782         pml3_entry_t oldl3e, *l3e;
3783         pt_entry_t *pte;
3784         vm_paddr_t pa;
3785         int cleared, md_gen, not_cleared, pvh_gen;
3786         struct spglist free;
3787
3788         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3789         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3790             ("pmap_ts_referenced: page %p is not managed", m));
3791         SLIST_INIT(&free);
3792         cleared = 0;
3793         pa = VM_PAGE_TO_PHYS(m);
3794         lock = PHYS_TO_PV_LIST_LOCK(pa);
3795         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
3796         rw_wlock(lock);
3797 retry:
3798         not_cleared = 0;
3799         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
3800                 goto small_mappings;
3801         pv = pvf;
3802         do {
3803                 if (pvf == NULL)
3804                         pvf = pv;
3805                 pmap = PV_PMAP(pv);
3806                 if (!PMAP_TRYLOCK(pmap)) {
3807                         pvh_gen = pvh->pv_gen;
3808                         rw_wunlock(lock);
3809                         PMAP_LOCK(pmap);
3810                         rw_wlock(lock);
3811                         if (pvh_gen != pvh->pv_gen) {
3812                                 PMAP_UNLOCK(pmap);
3813                                 goto retry;
3814                         }
3815                 }
3816                 l3e = pmap_pml3e(pmap, pv->pv_va);
3817                 oldl3e = *l3e;
3818                 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3819                         /*
3820                          * Although "oldl3e" is mapping a 2MB page, because
3821                          * this function is called at a 4KB page granularity,
3822                          * we only update the 4KB page under test.
3823                          */
3824                         vm_page_dirty(m);
3825                 }
3826                 if ((oldl3e & PG_A) != 0) {
3827                         /*
3828                          * Since this reference bit is shared by 512 4KB
3829                          * pages, it should not be cleared every time it is
3830                          * tested.  Apply a simple "hash" function on the
3831                          * physical page number, the virtual superpage number,
3832                          * and the pmap address to select one 4KB page out of
3833                          * the 512 on which testing the reference bit will
3834                          * result in clearing that reference bit.  This
3835                          * function is designed to avoid the selection of the
3836                          * same 4KB page for every 2MB page mapping.
3837                          *
3838                          * On demotion, a mapping that hasn't been referenced
3839                          * is simply destroyed.  To avoid the possibility of a
3840                          * subsequent page fault on a demoted wired mapping,
3841                          * always leave its reference bit set.  Moreover,
3842                          * since the superpage is wired, the current state of
3843                          * its reference bit won't affect page replacement.
3844                          */
3845                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
3846                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
3847                             (oldl3e & PG_W) == 0) {
3848                                 atomic_clear_long(l3e, PG_A);
3849                                 pmap_invalidate_page(pmap, pv->pv_va);
3850                                 cleared++;
3851                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3852                                     ("inconsistent pv lock %p %p for page %p",
3853                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3854                         } else
3855                                 not_cleared++;
3856                 }
3857                 PMAP_UNLOCK(pmap);
3858                 /* Rotate the PV list if it has more than one entry. */
3859                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3860                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
3861                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
3862                         pvh->pv_gen++;
3863                 }
3864                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
3865                         goto out;
3866         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
3867 small_mappings:
3868         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3869                 goto out;
3870         pv = pvf;
3871         do {
3872                 if (pvf == NULL)
3873                         pvf = pv;
3874                 pmap = PV_PMAP(pv);
3875                 if (!PMAP_TRYLOCK(pmap)) {
3876                         pvh_gen = pvh->pv_gen;
3877                         md_gen = m->md.pv_gen;
3878                         rw_wunlock(lock);
3879                         PMAP_LOCK(pmap);
3880                         rw_wlock(lock);
3881                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3882                                 PMAP_UNLOCK(pmap);
3883                                 goto retry;
3884                         }
3885                 }
3886                 l3e = pmap_pml3e(pmap, pv->pv_va);
3887                 KASSERT((*l3e & RPTE_LEAF) == 0,
3888                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
3889                     m));
3890                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
3891                 if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3892                         vm_page_dirty(m);
3893                 if ((*pte & PG_A) != 0) {
3894                         atomic_clear_long(pte, PG_A);
3895                         pmap_invalidate_page(pmap, pv->pv_va);
3896                         cleared++;
3897                 }
3898                 PMAP_UNLOCK(pmap);
3899                 /* Rotate the PV list if it has more than one entry. */
3900                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3901                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
3902                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3903                         m->md.pv_gen++;
3904                 }
3905         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
3906             not_cleared < PMAP_TS_REFERENCED_MAX);
3907 out:
3908         rw_wunlock(lock);
3909         vm_page_free_pages_toq(&free, true);
3910         return (cleared + not_cleared);
3911 }
3912
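/*
 * Physical memory is accessible through the direct map, so no KVA
 * needs to be allocated here; simply return the DMAP address of
 * "start".  Neither "*virt" nor "prot" is consulted.
 */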
3913 static vm_offset_t
3914 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
3915     vm_paddr_t end, int prot __unused)
3916 {
3917
3918         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
3919                  prot);
3920         return (PHYS_TO_DMAP(start));
3921 }
3922
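/*
 *      pmap_object_init_pt:
 *
 *      Preload the pmap with 2MB page mappings for the given device
 *      object, provided the address, size, and physical pages are
 *      suitably aligned and contiguous.
 */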
3923 void
3924 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
3925     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
3926 {
3927         pml3_entry_t *l3e;
3928         vm_paddr_t pa, ptepa;
3929         vm_page_t p, pdpg;
3930         vm_memattr_t ma;
3931
3932         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
3933             object, pindex, size);
3934         VM_OBJECT_ASSERT_WLOCKED(object);
3935         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3936                         ("pmap_object_init_pt: non-device object"));
3937         /* NB: size can be logically ored with addr here */
3938         if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
3939                 if (!mmu_radix_ps_enabled(pmap))
3940                         return;
3941                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
3942                         return;
3943                 p = vm_page_lookup(object, pindex);
3944                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3945                     ("pmap_object_init_pt: invalid page %p", p));
3946                 ma = p->md.mdpg_cache_attrs;
3947
3948                 /*
3949                  * Abort the mapping if the first page is not physically
3950                  * aligned to a 2MB page boundary.
3951                  */
3952                 ptepa = VM_PAGE_TO_PHYS(p);
3953                 if (ptepa & L3_PAGE_MASK)
3954                         return;
3955
3956                 /*
3957                  * Skip the first page.  Abort the mapping if the rest of
3958                  * the pages are not physically contiguous or have differing
3959                  * memory attributes.
3960                  */
3961                 p = TAILQ_NEXT(p, listq);
3962                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3963                     pa += PAGE_SIZE) {
3964                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
3965                             ("pmap_object_init_pt: invalid page %p", p));
3966                         if (pa != VM_PAGE_TO_PHYS(p) ||
3967                             ma != p->md.mdpg_cache_attrs)
3968                                 return;
3969                         p = TAILQ_NEXT(p, listq);
3970                 }
3971
3972                 PMAP_LOCK(pmap);
3973                 for (pa = ptepa | pmap_cache_bits(ma);
3974                     pa < ptepa + size; pa += L3_PAGE_SIZE) {
3975                         pdpg = pmap_allocl3e(pmap, addr, NULL);
3976                         if (pdpg == NULL) {
3977                                 /*
3978                                  * The creation of mappings below is only an
3979                                  * optimization.  If a page directory page
3980                                  * cannot be allocated without blocking,
3981                                  * continue on to the next mapping rather than
3982                                  * blocking.
3983                                  */
3984                                 addr += L3_PAGE_SIZE;
3985                                 continue;
3986                         }
3987                         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3988                         l3e = &l3e[pmap_pml3e_index(addr)];
3989                         if ((*l3e & PG_V) == 0) {
3990                                 pa |= PG_M | PG_A | PG_RW;
3991                                 pte_store(l3e, pa);
3992                                 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3993                                 atomic_add_long(&pmap_l3e_mappings, 1);
3994                         } else {
3995                                 /* Continue on if the PDE is already valid. */
3996                                 pdpg->ref_count--;
3997                                 KASSERT(pdpg->ref_count > 0,
3998                                     ("pmap_object_init_pt: missing reference "
3999                                     "to page directory page, va: 0x%lx", addr));
4000                         }
4001                         addr += L3_PAGE_SIZE;
4002                 }
4003                 ptesync();
4004                 PMAP_UNLOCK(pmap);
4005         }
4006 }
4007
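/*
 * Return TRUE if the pmap's pv is one of the first 16 pvs linked to
 * from this page.  This count may be changed upwards or downwards in
 * the future; it is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */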
4008 boolean_t
4009 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
4010 {
4011         struct md_page *pvh;
4012         struct rwlock *lock;
4013         pv_entry_t pv;
4014         int loops = 0;
4015         boolean_t rv;
4016
4017         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4018             ("pmap_page_exists_quick: page %p is not managed", m));
4019         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
4020         rv = FALSE;
4021         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4022         rw_rlock(lock);
4023         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4024                 if (PV_PMAP(pv) == pmap) {
4025                         rv = TRUE;
4026                         break;
4027                 }
4028                 loops++;
4029                 if (loops >= 16)
4030                         break;
4031         }
4032         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4033                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4034                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4035                         if (PV_PMAP(pv) == pmap) {
4036                                 rv = TRUE;
4037                                 break;
4038                         }
4039                         loops++;
4040                         if (loops >= 16)
4041                                 break;
4042                 }
4043         }
4044         rw_runlock(lock);
4045         return (rv);
4046 }
4047
4048 void
4049 mmu_radix_page_init(vm_page_t m)
4050 {
4051
4052         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4053         TAILQ_INIT(&m->md.pv_list);
4054         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
4055 }
4056
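/*
 *      pmap_page_wired_mappings:
 *
 *      Return the number of managed mappings to the given physical
 *      page that are wired.
 */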
4057 int
4058 mmu_radix_page_wired_mappings(vm_page_t m)
4059 {
4060         struct rwlock *lock;
4061         struct md_page *pvh;
4062         pmap_t pmap;
4063         pt_entry_t *pte;
4064         pv_entry_t pv;
4065         int count, md_gen, pvh_gen;
4066
4067         if ((m->oflags & VPO_UNMANAGED) != 0)
4068                 return (0);
4069         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4070         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4071         rw_rlock(lock);
4072 restart:
4073         count = 0;
4074         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4075                 pmap = PV_PMAP(pv);
4076                 if (!PMAP_TRYLOCK(pmap)) {
4077                         md_gen = m->md.pv_gen;
4078                         rw_runlock(lock);
4079                         PMAP_LOCK(pmap);
4080                         rw_rlock(lock);
4081                         if (md_gen != m->md.pv_gen) {
4082                                 PMAP_UNLOCK(pmap);
4083                                 goto restart;
4084                         }
4085                 }
4086                 pte = pmap_pte(pmap, pv->pv_va);
4087                 if ((*pte & PG_W) != 0)
4088                         count++;
4089                 PMAP_UNLOCK(pmap);
4090         }
4091         if ((m->flags & PG_FICTITIOUS) == 0) {
4092                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4093                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4094                         pmap = PV_PMAP(pv);
4095                         if (!PMAP_TRYLOCK(pmap)) {
4096                                 md_gen = m->md.pv_gen;
4097                                 pvh_gen = pvh->pv_gen;
4098                                 rw_runlock(lock);
4099                                 PMAP_LOCK(pmap);
4100                                 rw_rlock(lock);
4101                                 if (md_gen != m->md.pv_gen ||
4102                                     pvh_gen != pvh->pv_gen) {
4103                                         PMAP_UNLOCK(pmap);
4104                                         goto restart;
4105                                 }
4106                         }
4107                         pte = pmap_pml3e(pmap, pv->pv_va);
4108                         if ((*pte & PG_W) != 0)
4109                                 count++;
4110                         PMAP_UNLOCK(pmap);
4111                 }
4112         }
4113         rw_runlock(lock);
4114         return (count);
4115 }
4116
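/*
 * Point the process table entry for 'pid' at the level 1 page table at
 * physical address 'l1pa', encoding the RTS_SIZE and root page
 * directory size fields alongside it.
 */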
4117 static void
4118 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
4119 {
4120         isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE |  l1pa | RADIX_PGD_INDEX_SHIFT);
4121 }
4122
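/*
 * Initialize a user pmap: allocate and zero its level 1 page table,
 * allocate a PID from the ASID arena, and publish the new page table
 * in the process table.
 */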
4123 int
4124 mmu_radix_pinit(pmap_t pmap)
4125 {
4126         vmem_addr_t pid;
4127         vm_paddr_t l1pa;
4128
4129         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4130
4131         /*
4132          * allocate the page directory page
4133          */
4134         pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
4135
4136         for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++)
4137                 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
4138         pmap->pm_radix.rt_root = 0;
4139         TAILQ_INIT(&pmap->pm_pvchunk);
4140         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4141         pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4142         vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
4143
4144         pmap->pm_pid = pid;
4145         l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
4146         mmu_radix_update_proctab(pid, l1pa);
4147         __asm __volatile("ptesync;isync" : : : "memory");
4148
4149         return (1);
4150 }
4151
4152 /*
4153  * This routine is called if the desired page table page does not exist.
4154  *
4155  * If page table page allocation fails, this routine may sleep before
4156  * returning NULL.  It sleeps only if a lock pointer was given.
4157  *
4158  * Note: If a page allocation fails at page table level two or three,
4159  * one or two pages may be held during the wait, only to be released
4160  * afterwards.  This conservative approach is easily argued to avoid
4161  * race conditions.
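 *
 * The ptepindex encodes which level of page table page is wanted:
 * indices below NUPDE name PTE pages, indices in the range
 * [NUPDE, NUPDE + NUPDPE) name page directory pages (installed through
 * an l2e), and larger indices name page directory pointer pages
 * (installed through an l1e).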
4162  */
4163 static vm_page_t
4164 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
4165 {
4166         vm_page_t m, pdppg, pdpg;
4167
4168         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4169
4170         /*
4171          * Allocate a page table page.
4172          */
4173         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
4174             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
4175                 if (lockp != NULL) {
4176                         RELEASE_PV_LIST_LOCK(lockp);
4177                         PMAP_UNLOCK(pmap);
4178                         vm_wait(NULL);
4179                         PMAP_LOCK(pmap);
4180                 }
4181                 /*
4182                  * Indicate the need to retry.  While waiting, the page table
4183                  * page may have been allocated.
4184                  */
4185                 return (NULL);
4186         }
4187         if ((m->flags & PG_ZERO) == 0)
4188                 mmu_radix_zero_page(m);
4189
4190         /*
4191          * Map the pagetable page into the process address space, if
4192          * it isn't already there.
4193          */
4194
4195         if (ptepindex >= (NUPDE + NUPDPE)) {
4196                 pml1_entry_t *l1e;
4197                 vm_pindex_t pml1index;
4198
4199                 /* Wire up a new PDPE page */
4200                 pml1index = ptepindex - (NUPDE + NUPDPE);
4201                 l1e = &pmap->pm_pml1[pml1index];
4202                 pde_store(l1e, VM_PAGE_TO_PHYS(m));
4203
4204         } else if (ptepindex >= NUPDE) {
4205                 vm_pindex_t pml1index;
4206                 vm_pindex_t pdpindex;
4207                 pml1_entry_t *l1e;
4208                 pml2_entry_t *l2e;
4209
4210                 /* Wire up a new l2e page */
4211                 pdpindex = ptepindex - NUPDE;
4212                 pml1index = pdpindex >> RPTE_SHIFT;
4213
4214                 l1e = &pmap->pm_pml1[pml1index];
4215                 if ((*l1e & PG_V) == 0) {
4216                         /* Have to allocate a new pdp, recurse */
4217                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
4218                                 lockp) == NULL) {
4219                                 vm_page_unwire_noq(m);
4220                                 vm_page_free_zero(m);
4221                                 return (NULL);
4222                         }
4223                 } else {
4224                         /* Add reference to l2e page */
4225                         pdppg = PHYS_TO_VM_PAGE(*l1e & PG_FRAME);
4226                         pdppg->ref_count++;
4227                 }
4228                 l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME);
4229
4230                 /* Now find the pdp page */
4231                 l2e = &l2e[pdpindex & RPTE_MASK];
4232                 pde_store(l2e, VM_PAGE_TO_PHYS(m));
4233
4234         } else {
4235                 vm_pindex_t pml1index;
4236                 vm_pindex_t pdpindex;
4237                 pml1_entry_t *l1e;
4238                 pml2_entry_t *l2e;
4239                 pml3_entry_t *l3e;
4240
4241                 /* Wire up a new PTE page */
4242                 pdpindex = ptepindex >> RPTE_SHIFT;
4243                 pml1index = pdpindex >> RPTE_SHIFT;
4244
4245                 /* First, find the pdp and check that it's valid. */
4246                 l1e = &pmap->pm_pml1[pml1index];
4247                 if ((*l1e & PG_V) == 0) {
4248                         /* Have to allocate a new pd, recurse */
4249                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4250                             lockp) == NULL) {
4251                                 vm_page_unwire_noq(m);
4252                                 vm_page_free_zero(m);
4253                                 return (NULL);
4254                         }
4255                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME);
4256                         l2e = &l2e[pdpindex & RPTE_MASK];
4257                 } else {
4258                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(*l1e & PG_FRAME);
4259                         l2e = &l2e[pdpindex & RPTE_MASK];
4260                         if ((*l2e & PG_V) == 0) {
4261                                 /* Have to allocate a new pd, recurse */
4262                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4263                                     lockp) == NULL) {
4264                                         vm_page_unwire_noq(m);
4265                                         vm_page_free_zero(m);
4266                                         return (NULL);
4267                                 }
4268                         } else {
4269                                 /* Add reference to the pd page */
4270                                 pdpg = PHYS_TO_VM_PAGE(*l2e & PG_FRAME);
4271                                 pdpg->ref_count++;
4272                         }
4273                 }
4274                 l3e = (pml3_entry_t *)PHYS_TO_DMAP(*l2e & PG_FRAME);
4275
4276                 /* Now we know where the page directory page is */
4277                 l3e = &l3e[ptepindex & RPTE_MASK];
4278                 pde_store(l3e, VM_PAGE_TO_PHYS(m));
4279         }
4280
4281         pmap_resident_count_inc(pmap, 1);
4282         return (m);
4283 }
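
/*
 * Return the page directory page that will hold the l3e for 'va',
 * adding a reference to an existing page or calling _pmap_allocpte()
 * to create one.  When a lock pointer is supplied, a failed allocation
 * is retried after _pmap_allocpte() has slept.
 */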
4284 static vm_page_t
4285 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4286 {
4287         vm_pindex_t pdpindex, ptepindex;
4288         pml2_entry_t *pdpe;
4289         vm_page_t pdpg;
4290
4291 retry:
4292         pdpe = pmap_pml2e(pmap, va);
4293         if (pdpe != NULL && (*pdpe & PG_V) != 0) {
4294                 /* Add a reference to the pd page. */
4295                 pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
4296                 pdpg->ref_count++;
4297         } else {
4298                 /* Allocate a pd page. */
4299                 ptepindex = pmap_l3e_pindex(va);
4300                 pdpindex = ptepindex >> RPTE_SHIFT;
4301                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
4302                 if (pdpg == NULL && lockp != NULL)
4303                         goto retry;
4304         }
4305         return (pdpg);
4306 }
4307
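/*
 * Return the page table page that will hold the PTE for 'va', demoting
 * an existing 2MB mapping if one is found and allocating a new page
 * through _pmap_allocpte() when none is present.
 */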
4308 static vm_page_t
4309 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4310 {
4311         vm_pindex_t ptepindex;
4312         pml3_entry_t *pd;
4313         vm_page_t m;
4314
4315         /*
4316          * Calculate pagetable page index
4317          */
4318         ptepindex = pmap_l3e_pindex(va);
4319 retry:
4320         /*
4321          * Get the page directory entry
4322          */
4323         pd = pmap_pml3e(pmap, va);
4324
4325         /*
4326          * This supports switching from a 2MB page to a
4327          * normal 4K page.
4328          */
4329         if (pd != NULL && (*pd & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
4330                 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
4331                         /*
4332                          * Invalidation of the 2MB page mapping may have caused
4333                          * the deallocation of the underlying PD page.
4334                          */
4335                         pd = NULL;
4336                 }
4337         }
4338
4339         /*
4340          * If the page table page is mapped, we just increment the
4341          * hold count, and activate it.
4342          */
4343         if (pd != NULL && (*pd & PG_V) != 0) {
4344                 m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
4345                 m->ref_count++;
4346         } else {
4347                 /*
4348                  * Here if the pte page isn't mapped, or if it has been
4349                  * deallocated.
4350                  */
4351                 m = _pmap_allocpte(pmap, ptepindex, lockp);
4352                 if (m == NULL && lockp != NULL)
4353                         goto retry;
4354         }
4355         return (m);
4356 }
4357
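/*
 * Initialize pmap0, which shares its level 1 page table and PID with
 * the kernel pmap.
 */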
4358 static void
4359 mmu_radix_pinit0(pmap_t pmap)
4360 {
4361
4362         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4363         PMAP_LOCK_INIT(pmap);
4364         pmap->pm_pml1 = kernel_pmap->pm_pml1;
4365         pmap->pm_pid = kernel_pmap->pm_pid;
4366
4367         pmap->pm_radix.rt_root = 0;
4368         TAILQ_INIT(&pmap->pm_pvchunk);
4369         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4370         kernel_pmap->pm_flags =
4371                 pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4372 }
4373 /*
4374  * pmap_protect_l3e: do the things to protect a 2mpage in a process
4375  */
4376 static boolean_t
4377 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
4378 {
4379         pt_entry_t newpde, oldpde;
4380         vm_offset_t eva, va;
4381         vm_page_t m;
4382         boolean_t anychanged;
4383
4384         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4385         KASSERT((sva & L3_PAGE_MASK) == 0,
4386             ("pmap_protect_l3e: sva is not 2mpage aligned"));
4387         anychanged = FALSE;
4388 retry:
4389         oldpde = newpde = *l3e;
4390         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4391             (PG_MANAGED | PG_M | PG_RW)) {
4392                 eva = sva + L3_PAGE_SIZE;
4393                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4394                     va < eva; va += PAGE_SIZE, m++)
4395                         vm_page_dirty(m);
4396         }
4397         if ((prot & VM_PROT_WRITE) == 0) {
4398                 newpde &= ~(PG_RW | PG_M);
4399                 newpde |= RPTE_EAA_R;
4400         }
4401         if (prot & VM_PROT_EXECUTE)
4402                 newpde |= PG_X;
4403         if (newpde != oldpde) {
4404                 /*
4405                  * As an optimization to future operations on this PDE, clear
4406                  * PG_PROMOTED.  The impending invalidation will remove any
4407                  * lingering 4KB page mappings from the TLB.
4408                  */
4409                 if (!atomic_cmpset_long(l3e, oldpde, newpde & ~PG_PROMOTED))
4410                         goto retry;
4411                 anychanged = TRUE;
4412         }
4413         return (anychanged);
4414 }
4415
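/*
 * Set the access protections on the address range [sva, eva).  Write
 * access is removed by clearing PG_RW and PG_M (dirtying the backing
 * pages first when a mapping was modified), execute access may be
 * added, and 2MB mappings are either protected whole or demoted when
 * the range only partially covers them.
 */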
4416 void
4417 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4418     vm_prot_t prot)
4419 {
4420         vm_offset_t va_next;
4421         pml1_entry_t *l1e;
4422         pml2_entry_t *l2e;
4423         pml3_entry_t ptpaddr, *l3e;
4424         pt_entry_t *pte;
4425         boolean_t anychanged;
4426
4427         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
4428             prot);
4429
4430         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4431         if (prot == VM_PROT_NONE) {
4432                 mmu_radix_remove(pmap, sva, eva);
4433                 return;
4434         }
4435
4436         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4437             (VM_PROT_WRITE|VM_PROT_EXECUTE))
4438                 return;
4439
4440 #ifdef INVARIANTS
4441         if (VERBOSE_PROTECT || pmap_logging)
4442                 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
4443                            pmap, sva, eva, prot, pmap->pm_pid);
4444 #endif
4445         anychanged = FALSE;
4446
4447         PMAP_LOCK(pmap);
4448         for (; sva < eva; sva = va_next) {
4449                 l1e = pmap_pml1e(pmap, sva);
4450                 if ((*l1e & PG_V) == 0) {
4451                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
4452                         if (va_next < sva)
4453                                 va_next = eva;
4454                         continue;
4455                 }
4456
4457                 l2e = pmap_l1e_to_l2e(l1e, sva);
4458                 if ((*l2e & PG_V) == 0) {
4459                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
4460                         if (va_next < sva)
4461                                 va_next = eva;
4462                         continue;
4463                 }
4464
4465                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
4466                 if (va_next < sva)
4467                         va_next = eva;
4468
4469                 l3e = pmap_l2e_to_l3e(l2e, sva);
4470                 ptpaddr = *l3e;
4471
4472                 /*
4473                  * Weed out invalid mappings.
4474                  */
4475                 if (ptpaddr == 0)
4476                         continue;
4477
4478                 /*
4479                  * Check for large page.
4480                  */
4481                 if ((ptpaddr & RPTE_LEAF) != 0) {
4482                         /*
4483                          * Are we protecting the entire large page?  If not,
4484                          * demote the mapping and fall through.
4485                          */
4486                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
4487                                 if (pmap_protect_l3e(pmap, l3e, sva, prot))
4488                                         anychanged = TRUE;
4489                                 continue;
4490                         } else if (!pmap_demote_l3e(pmap, l3e, sva)) {
4491                                 /*
4492                                  * The large page mapping was destroyed.
4493                                  */
4494                                 continue;
4495                         }
4496                 }
4497
4498                 if (va_next > eva)
4499                         va_next = eva;
4500
4501                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
4502                     sva += PAGE_SIZE) {
4503                         pt_entry_t obits, pbits;
4504                         vm_page_t m;
4505
4506 retry:
4507                         MPASS(pte == pmap_pte(pmap, sva));
4508                         obits = pbits = *pte;
4509                         if ((pbits & PG_V) == 0)
4510                                 continue;
4511
4512                         if ((prot & VM_PROT_WRITE) == 0) {
4513                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4514                                     (PG_MANAGED | PG_M | PG_RW)) {
4515                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4516                                         vm_page_dirty(m);
4517                                 }
4518                                 pbits &= ~(PG_RW | PG_M);
4519                                 pbits |= RPTE_EAA_R;
4520                         }
4521                         if (prot & VM_PROT_EXECUTE)
4522                                 pbits |= PG_X;
4523
4524                         if (pbits != obits) {
4525                                 if (!atomic_cmpset_long(pte, obits, pbits))
4526                                         goto retry;
4527                                 if (obits & (PG_A|PG_M)) {
4528                                         anychanged = TRUE;
4529 #ifdef INVARIANTS
4530                                         if (VERBOSE_PROTECT || pmap_logging)
4531                                                 printf("%#lx %#lx -> %#lx\n",
4532                                                     sva, obits, pbits);
4533 #endif
4534                                 }
4535                         }
4536                 }
4537         }
4538         if (anychanged)
4539                 pmap_invalidate_all(pmap);
4540         PMAP_UNLOCK(pmap);
4541 }
4542
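/*
 * Enter 'count' consecutive kernel mappings for the pages in 'ma'
 * starting at 'sva'.  If any of the PTEs being replaced were valid,
 * the range is invalidated; otherwise a ptesync suffices.
 */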
4543 void
4544 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4545 {
4546         pt_entry_t oldpte, pa, *pte;
4547         vm_page_t m;
4548         uint64_t cache_bits, attr_bits;
4549         vm_offset_t va;
4550
4551         CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
4552
4553         oldpte = 0;
4554         attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
4555         va = sva;
4556         pte = kvtopte(va);
4557         while (va < sva + PAGE_SIZE * count) {
4558                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4559                         pte = kvtopte(va);
4560                 MPASS(pte == pmap_pte(kernel_pmap, va));
4561
4562                 /*
4563                  * XXX there has to be a more efficient way than traversing
4564                  * the page table every time - but go for correctness for
4565                  * today
4566                  */
4567
4568                 m = *ma++;
4569                 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
4570                 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
4571                 if (*pte != pa) {
4572                         oldpte |= *pte;
4573                         pte_store(pte, pa);
4574                 }
4575                 va += PAGE_SIZE;
4576                 pte++;
4577         }
4578         if (__predict_false((oldpte & RPTE_VALID) != 0))
4579                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
4580                     PAGE_SIZE);
4581         else
4582                 ptesync();
4583 }
4584
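/*
 * Remove the 'count' kernel mappings starting at 'sva' that were
 * created by mmu_radix_qenter() and invalidate the range.
 */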
4585 void
4586 mmu_radix_qremove(vm_offset_t sva, int count)
4587 {
4588         vm_offset_t va;
4589         pt_entry_t *pte;
4590
4591         CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
4592         KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
4593
4594         va = sva;
4595         pte = kvtopte(va);
4596         while (va < sva + PAGE_SIZE * count) {
4597                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4598                         pte = kvtopte(va);
4599                 pte_clear(pte);
4600                 pte++;
4601                 va += PAGE_SIZE;
4602         }
4603         pmap_invalidate_range(kernel_pmap, sva, va);
4604 }
4605
4606 /***************************************************
4607  * Page table page management routines.....
4608  ***************************************************/
4609 /*
4610  * Schedule the specified unused page table page to be freed.  Specifically,
4611  * add the page to the specified list of pages that will be released to the
4612  * physical memory manager after the TLB has been updated.
4613  */
4614 static __inline void
4615 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
4616     boolean_t set_PG_ZERO)
4617 {
4618
4619         if (set_PG_ZERO)
4620                 m->flags |= PG_ZERO;
4621         else
4622                 m->flags &= ~PG_ZERO;
4623         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4624 }
4625
4626 /*
4627  * Inserts the specified page table page into the specified pmap's collection
4628  * of idle page table pages.  Each of a pmap's page table pages is responsible
4629  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4630  * ordered by this virtual address range.
4631  */
4632 static __inline int
4633 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
4634 {
4635
4636         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4637         return (vm_radix_insert(&pmap->pm_radix, mpte));
4638 }
4639
4640 /*
4641  * Removes the page table page mapping the specified virtual address from the
4642  * specified pmap's collection of idle page table pages, and returns it.
4643  * Otherwise, returns NULL if there is no page table page corresponding to the
4644  * specified virtual address.
4645  */
4646 static __inline vm_page_t
4647 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4648 {
4649
4650         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4651         return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
4652 }
4653
4654 /*
4655  * Decrements a page table page's reference count, which is used to record
4656  * the number of valid page table entries within the page.  If the reference
4657  * count drops to zero, then the page table page is unmapped.  Returns TRUE if the
4658  * page table page was unmapped and FALSE otherwise.
4659  */
4660 static inline boolean_t
4661 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4662 {
4663
4664         --m->ref_count;
4665         if (m->ref_count == 0) {
4666                 _pmap_unwire_ptp(pmap, va, m, free);
4667                 return (TRUE);
4668         } else
4669                 return (FALSE);
4670 }
4671
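/*
 * Unmap the page table page 'm', drop the references it holds on its
 * parent page table pages, and queue it for deferred freeing.
 */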
4672 static void
4673 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4674 {
4675
4676         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4677         /*
4678          * unmap the page table page
4679          */
4680         if (m->pindex >= (NUPDE + NUPDPE)) {
4681                 /* PDP page */
4682                 pml1_entry_t *pml1;
4683                 pml1 = pmap_pml1e(pmap, va);
4684                 *pml1 = 0;
4685         } else if (m->pindex >= NUPDE) {
4686                 /* PD page */
4687                 pml2_entry_t *l2e;
4688                 l2e = pmap_pml2e(pmap, va);
4689                 *l2e = 0;
4690         } else {
4691                 /* PTE page */
4692                 pml3_entry_t *l3e;
4693                 l3e = pmap_pml3e(pmap, va);
4694                 *l3e = 0;
4695         }
4696         pmap_resident_count_dec(pmap, 1);
4697         if (m->pindex < NUPDE) {
4698                 /* We just released a PT, unhold the matching PD */
4699                 vm_page_t pdpg;
4700
4701                 pdpg = PHYS_TO_VM_PAGE(*pmap_pml2e(pmap, va) & PG_FRAME);
4702                 pmap_unwire_ptp(pmap, va, pdpg, free);
4703         }
4704         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
4705                 /* We just released a PD, unhold the matching PDP */
4706                 vm_page_t pdppg;
4707
4708                 pdppg = PHYS_TO_VM_PAGE(*pmap_pml1e(pmap, va) & PG_FRAME);
4709                 pmap_unwire_ptp(pmap, va, pdppg, free);
4710         }
4711
4712         /*
4713          * Put page on a list so that it is released after
4714          * *ALL* TLB shootdown is done
4715          */
4716         pmap_add_delayed_free_list(m, free, TRUE);
4717 }
4718
4719 /*
4720  * After removing a page table entry, this routine is used to
4721  * conditionally free the page, and manage the hold/wire counts.
4722  */
4723 static int
4724 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
4725     struct spglist *free)
4726 {
4727         vm_page_t mpte;
4728
4729         if (va >= VM_MAXUSER_ADDRESS)
4730                 return (0);
4731         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4732         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4733         return (pmap_unwire_ptp(pmap, va, mpte, free));
4734 }
4735
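/*
 * Release the resources held by a pmap that no longer contains any
 * mappings: clear its process table entry, free its level 1 page
 * table, and return its PID to the ASID arena.
 */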
4736 void
4737 mmu_radix_release(pmap_t pmap)
4738 {
4739
4740         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4741         KASSERT(pmap->pm_stats.resident_count == 0,
4742             ("pmap_release: pmap resident count %ld != 0",
4743             pmap->pm_stats.resident_count));
4744         KASSERT(vm_radix_is_empty(&pmap->pm_radix),
4745             ("pmap_release: pmap has reserved page table page(s)"));
4746
4747         pmap_invalidate_all(pmap);
4748         isa3_proctab[pmap->pm_pid].proctab0 = 0;
4749         uma_zfree(zone_radix_pgd, pmap->pm_pml1);
4750         vmem_free(asid_arena, pmap->pm_pid, 1);
4751 }
4752
4753 /*
4754  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
4755  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
4756  * false if the PV entry cannot be allocated without resorting to reclamation.
4757  */
4758 static bool
4759 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
4760     struct rwlock **lockp)
4761 {
4762         struct md_page *pvh;
4763         pv_entry_t pv;
4764         vm_paddr_t pa;
4765
4766         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4767         /* Pass NULL instead of the lock pointer to disable reclamation. */
4768         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4769             NULL : lockp)) == NULL)
4770                 return (false);
4771         pv->pv_va = va;
4772         pa = pde & PG_PS_FRAME;
4773         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4774         pvh = pa_to_pvh(pa);
4775         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
4776         pvh->pv_gen++;
4777         return (true);
4778 }
4779
4780 /*
4781  * Fills a page table page with mappings to consecutive physical pages.
4782  */
4783 static void
4784 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4785 {
4786         pt_entry_t *pte;
4787
4788         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4789                 *pte = newpte;
4790                 newpte += PAGE_SIZE;
4791         }
4792 }
4793
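/*
 * Demote the 2MB mapping described by 'pde' into 4KB page mappings,
 * acquiring and releasing the PV list lock as needed.
 */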
4794 static boolean_t
4795 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
4796 {
4797         struct rwlock *lock;
4798         boolean_t rv;
4799
4800         lock = NULL;
4801         rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
4802         if (lock != NULL)
4803                 rw_wunlock(lock);
4804         return (rv);
4805 }
4806
4807 static boolean_t
4808 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
4809     struct rwlock **lockp)
4810 {
4811         pml3_entry_t oldpde;
4812         pt_entry_t *firstpte;
4813         vm_paddr_t mptepa;
4814         vm_page_t mpte;
4815         struct spglist free;
4816         vm_offset_t sva;
4817
4818         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4819         oldpde = *l3e;
4820         KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
4821             ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
4822             oldpde));
4823         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4824             NULL) {
4825                 KASSERT((oldpde & PG_W) == 0,
4826                     ("pmap_demote_l3e: page table page for a wired mapping"
4827                     " is missing"));
4828
4829                 /*
4830                  * Invalidate the 2MB page mapping and return "failure" if the
4831                  * mapping was never accessed or the allocation of the new
4832                  * page table page fails.  If the 2MB page mapping belongs to
4833                  * the direct map region of the kernel's address space, then
4834                  * the page allocation request specifies the highest possible
4835                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4836                  * normal.  Page table pages are preallocated for every other
4837                  * part of the kernel address space, so the direct map region
4838                  * is the only part of the kernel address space that must be
4839                  * handled here.
4840                  */
4841                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
4842                     pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
4843                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4844                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4845                         SLIST_INIT(&free);
4846                         sva = trunc_2mpage(va);
4847                         pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
4848                         pmap_invalidate_l3e_page(pmap, sva, oldpde);
4849                         vm_page_free_pages_toq(&free, true);
4850                         CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
4851                             " in pmap %p", va, pmap);
4852                         return (FALSE);
4853                 }
4854                 if (va < VM_MAXUSER_ADDRESS)
4855                         pmap_resident_count_inc(pmap, 1);
4856         }
4857         mptepa = VM_PAGE_TO_PHYS(mpte);
4858         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4859         KASSERT((oldpde & PG_A) != 0,
4860             ("pmap_demote_l3e: oldpde is missing PG_A"));
4861         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4862             ("pmap_demote_l3e: oldpde is missing PG_M"));
4863
4864         /*
4865          * If the page table page is new, initialize it.
4866          */
4867         if (mpte->ref_count == 1) {
4868                 mpte->ref_count = NPTEPG;
4869                 pmap_fill_ptp(firstpte, oldpde);
4870         }
4871
4872         KASSERT((*firstpte & PG_FRAME) == (oldpde & PG_FRAME),
4873             ("pmap_demote_l3e: firstpte and newpte map different physical"
4874             " addresses"));
4875
4876         /*
4877          * If the mapping has changed attributes, update the page table
4878          * entries.
4879          */
4880         if ((*firstpte & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
4881                 pmap_fill_ptp(firstpte, oldpde);
4882
4883         /*
4884          * The spare PV entries must be reserved prior to demoting the
4885          * mapping, that is, prior to changing the PDE.  Otherwise, the state
4886          * of the PDE and the PV lists will be inconsistent, which can result
4887          * in reclaim_pv_chunk() attempting to remove a PV entry from the
4888          * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
4889          * PV entry for the 2MB page mapping that is being demoted.
4890          */
4891         if ((oldpde & PG_MANAGED) != 0)
4892                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4893
4894         /*
4895          * Demote the mapping.  This pmap is locked.  The old PDE has
4896          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
4897          * set.  Thus, there is no danger of a race with another
4898          * processor changing the setting of PG_A and/or PG_M between
4899          * the read above and the store below.
4900          */
4901         pde_store(l3e, mptepa);
4902         ptesync();
4903         /*
4904          * Demote the PV entry.
4905          */
4906         if ((oldpde & PG_MANAGED) != 0)
4907                 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
4908
4909         atomic_add_long(&pmap_l3e_demotions, 1);
4910         CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
4911             " in pmap %p", va, pmap);
4912         return (TRUE);
4913 }
4914
4915 /*
4916  * pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
4917  */
4918 static void
4919 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
4920 {
4921         vm_paddr_t mptepa;
4922         vm_page_t mpte;
4923
4924         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4925         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4926         mpte = pmap_remove_pt_page(pmap, va);
4927         if (mpte == NULL)
4928                 panic("pmap_remove_kernel_pde: Missing pt page.");
4929
4930         mptepa = VM_PAGE_TO_PHYS(mpte);
4931
4932         /*
4933          * Initialize the page table page.
4934          */
4935         pagezero(PHYS_TO_DMAP(mptepa));
4936
4937         /*
4938          * Demote the mapping.
4939          */
4940         pde_store(l3e, mptepa);
4941         ptesync();
4942 }
4943
4944 /*
4945  * pmap_remove_l3e: do the things to unmap a superpage in a process
4946  */
4947 static int
4948 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
4949     struct spglist *free, struct rwlock **lockp)
4950 {
4951         struct md_page *pvh;
4952         pml3_entry_t oldpde;
4953         vm_offset_t eva, va;
4954         vm_page_t m, mpte;
4955
4956         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4957         KASSERT((sva & L3_PAGE_MASK) == 0,
4958             ("pmap_remove_l3e: sva is not 2mpage aligned"));
4959         oldpde = pte_load_clear(pdq);
4960         if (oldpde & PG_W)
4961                 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
4962         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4963         if (oldpde & PG_MANAGED) {
4964                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
4965                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
4966                 pmap_pvh_free(pvh, pmap, sva);
4967                 eva = sva + L3_PAGE_SIZE;
4968                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4969                     va < eva; va += PAGE_SIZE, m++) {
4970                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4971                                 vm_page_dirty(m);
4972                         if (oldpde & PG_A)
4973                                 vm_page_aflag_set(m, PGA_REFERENCED);
4974                         if (TAILQ_EMPTY(&m->md.pv_list) &&
4975                             TAILQ_EMPTY(&pvh->pv_list))
4976                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4977                 }
4978         }
4979         if (pmap == kernel_pmap) {
4980                 pmap_remove_kernel_l3e(pmap, pdq, sva);
4981         } else {
4982                 mpte = pmap_remove_pt_page(pmap, sva);
4983                 if (mpte != NULL) {
4984                         pmap_resident_count_dec(pmap, 1);
4985                         KASSERT(mpte->ref_count == NPTEPG,
4986                             ("pmap_remove_l3e: pte page wire count error"));
4987                         mpte->ref_count = 0;
4988                         pmap_add_delayed_free_list(mpte, free, FALSE);
4989                 }
4990         }
4991         return (pmap_unuse_pt(pmap, sva, *pmap_pml2e(pmap, sva), free));
4992 }
4993
4994 /*
4995  * pmap_remove_pte: do the things to unmap a page in a process
4996  */
4997 static int
4998 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
4999     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5000 {
5001         struct md_page *pvh;
5002         pt_entry_t oldpte;
5003         vm_page_t m;
5004
5005         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5006         oldpte = pte_load_clear(ptq);
5007         if (oldpte & RPTE_WIRED)
5008                 pmap->pm_stats.wired_count -= 1;
5009         pmap_resident_count_dec(pmap, 1);
5010         if (oldpte & RPTE_MANAGED) {
5011                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5012                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5013                         vm_page_dirty(m);
5014                 if (oldpte & PG_A)
5015                         vm_page_aflag_set(m, PGA_REFERENCED);
5016                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5017                 pmap_pvh_free(&m->md, pmap, va);
5018                 if (TAILQ_EMPTY(&m->md.pv_list) &&
5019                     (m->flags & PG_FICTITIOUS) == 0) {
5020                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5021                         if (TAILQ_EMPTY(&pvh->pv_list))
5022                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
5023                 }
5024         }
5025         return (pmap_unuse_pt(pmap, va, ptepde, free));
5026 }
5027
5028 /*
5029  * Remove a single page from a process address space
5030  */
5031 static bool
5032 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
5033     struct spglist *free)
5034 {
5035         struct rwlock *lock;
5036         pt_entry_t *pte;
5037         bool invalidate_all;
5038
5039         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5040         if ((*l3e & RPTE_VALID) == 0) {
5041                 return (false);
5042         }
5043         pte = pmap_l3e_to_pte(l3e, va);
5044         if ((*pte & RPTE_VALID) == 0) {
5045                 return (false);
5046         }
5047         lock = NULL;
5048
5049         invalidate_all = pmap_remove_pte(pmap, pte, va, *l3e, free, &lock);
5050         if (lock != NULL)
5051                 rw_wunlock(lock);
5052         if (!invalidate_all)
5053                 pmap_invalidate_page(pmap, va);
5054         return (invalidate_all);
5055 }
5056
5057 /*
5058  * Removes the specified range of addresses from the page table page.
5059  */
5060 static bool
5061 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5062     pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
5063 {
5064         pt_entry_t *pte;
5065         vm_offset_t va;
5066         bool anyvalid;
5067
5068         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5069         anyvalid = false;
5070         va = eva;
5071         for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
5072             sva += PAGE_SIZE) {
5073                 MPASS(pte == pmap_pte(pmap, sva));
5074                 if (*pte == 0) {
5075                         if (va != eva) {
5076                                 anyvalid = true;
5077                                 va = eva;
5078                         }
5079                         continue;
5080                 }
5081                 if (va == eva)
5082                         va = sva;
5083                 if (pmap_remove_pte(pmap, pte, sva, *l3e, free, lockp)) {
5084                         anyvalid = true;
5085                         sva += PAGE_SIZE;
5086                         break;
5087                 }
5088         }
5089         if (anyvalid)
5090                 pmap_invalidate_all(pmap);
5091         else if (va != eva)
5092                 pmap_invalidate_range(pmap, va, sva);
5093         return (anyvalid);
5094 }
5095
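/*
 * Remove the given range of addresses from the specified pmap.  The
 * single-page case is short-circuited; otherwise the page tables are
 * walked, removing whole 2MB mappings where the range covers them and
 * demoting those that are only partially covered.
 */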
5096 void
5097 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5098 {
5099         struct rwlock *lock;
5100         vm_offset_t va_next;
5101         pml1_entry_t *l1e;
5102         pml2_entry_t *l2e;
5103         pml3_entry_t ptpaddr, *l3e;
5104         struct spglist free;
5105         bool anyvalid;
5106
5107         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5108
5109         /*
5110          * Perform an unsynchronized read.  This is, however, safe.
5111          */
5112         if (pmap->pm_stats.resident_count == 0)
5113                 return;
5114
5115         anyvalid = false;
5116         SLIST_INIT(&free);
5117
5118         /* XXX something fishy here */
5119         sva = (sva + PAGE_MASK) & ~PAGE_MASK;
5120         eva = (eva + PAGE_MASK) & ~PAGE_MASK;
5121
5122         PMAP_LOCK(pmap);
5123
5124         /*
5125          * Special handling for removing a single page: this is a
5126          * very common operation, and some code can easily be
5127          * short-circuited for it.
5128          */
5129         if (sva + PAGE_SIZE == eva) {
5130                 l3e = pmap_pml3e(pmap, sva);
5131                 if (l3e && (*l3e & RPTE_LEAF) == 0) {
5132                         anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
5133                         goto out;
5134                 }
5135         }
5136
5137         lock = NULL;
5138         for (; sva < eva; sva = va_next) {
5139                 if (pmap->pm_stats.resident_count == 0)
5140                         break;
5141                 l1e = pmap_pml1e(pmap, sva);
5142                 if (l1e == NULL || (*l1e & PG_V) == 0) {
5143                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5144                         if (va_next < sva)
5145                                 va_next = eva;
5146                         continue;
5147                 }
5148
5149                 l2e = pmap_l1e_to_l2e(l1e, sva);
5150                 if (l2e == NULL || (*l2e & PG_V) == 0) {
5151                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5152                         if (va_next < sva)
5153                                 va_next = eva;
5154                         continue;
5155                 }
5156
5157                 /*
5158                  * Calculate index for next page table.
5159                  */
5160                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5161                 if (va_next < sva)
5162                         va_next = eva;
5163
5164                 l3e = pmap_l2e_to_l3e(l2e, sva);
5165                 ptpaddr = *l3e;
5166
5167                 /*
5168                  * Weed out invalid mappings.
5169                  */
5170                 if (ptpaddr == 0)
5171                         continue;
5172
5173                 /*
5174                  * Check for large page.
5175                  */
5176                 if ((ptpaddr & RPTE_LEAF) != 0) {
5177                         /*
5178                          * Are we removing the entire large page?  If not,
5179                          * demote the mapping and fall through.
5180                          */
5181                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5182                                 pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
5183                                 continue;
5184                         } else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
5185                             &lock)) {
5186                                 /* The large page mapping was destroyed. */
5187                                 continue;
5188                         } else
5189                                 ptpaddr = *l3e;
5190                 }
5191
5192                 /*
5193                  * Limit our scan to either the end of the va represented
5194                  * by the current page table page, or to the end of the
5195                  * range being removed.
5196                  */
5197                 if (va_next > eva)
5198                         va_next = eva;
5199
5200                 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
5201                         anyvalid = true;
5202         }
5203         if (lock != NULL)
5204                 rw_wunlock(lock);
5205 out:
5206         if (anyvalid)
5207                 pmap_invalidate_all(pmap);
5208         PMAP_UNLOCK(pmap);
5209         vm_page_free_pages_toq(&free, true);
5210 }
5211
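/*
 * Remove every mapping of the page from every pmap: 2MB mappings are
 * demoted first, then each remaining 4KB mapping is torn down, and
 * finally PGA_WRITEABLE is cleared.
 */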
5212 void
5213 mmu_radix_remove_all(vm_page_t m)
5214 {
5215         struct md_page *pvh;
5216         pv_entry_t pv;
5217         pmap_t pmap;
5218         struct rwlock *lock;
5219         pt_entry_t *pte, tpte;
5220         pml3_entry_t *l3e;
5221         vm_offset_t va;
5222         struct spglist free;
5223         int pvh_gen, md_gen;
5224
5225         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5226         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5227             ("pmap_remove_all: page %p is not managed", m));
5228         SLIST_INIT(&free);
5229         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5230         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5231             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5232 retry:
5233         rw_wlock(lock);
5234         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5235                 pmap = PV_PMAP(pv);
5236                 if (!PMAP_TRYLOCK(pmap)) {
5237                         pvh_gen = pvh->pv_gen;
5238                         rw_wunlock(lock);
5239                         PMAP_LOCK(pmap);
5240                         rw_wlock(lock);
5241                         if (pvh_gen != pvh->pv_gen) {
5242                                 rw_wunlock(lock);
5243                                 PMAP_UNLOCK(pmap);
5244                                 goto retry;
5245                         }
5246                 }
5247                 va = pv->pv_va;
5248                 l3e = pmap_pml3e(pmap, va);
5249                 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
5250                 PMAP_UNLOCK(pmap);
5251         }
5252         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5253                 pmap = PV_PMAP(pv);
5254                 if (!PMAP_TRYLOCK(pmap)) {
5255                         pvh_gen = pvh->pv_gen;
5256                         md_gen = m->md.pv_gen;
5257                         rw_wunlock(lock);
5258                         PMAP_LOCK(pmap);
5259                         rw_wlock(lock);
5260                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5261                                 rw_wunlock(lock);
5262                                 PMAP_UNLOCK(pmap);
5263                                 goto retry;
5264                         }
5265                 }
5266                 pmap_resident_count_dec(pmap, 1);
5267                 l3e = pmap_pml3e(pmap, pv->pv_va);
5268                 KASSERT((*l3e & RPTE_LEAF) == 0, ("pmap_remove_all: found"
5269                     " a 2mpage in page %p's pv list", m));
5270                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5271                 tpte = pte_load_clear(pte);
5272                 if (tpte & PG_W)
5273                         pmap->pm_stats.wired_count--;
5274                 if (tpte & PG_A)
5275                         vm_page_aflag_set(m, PGA_REFERENCED);
5276
5277                 /*
5278                  * Update the vm_page_t clean and reference bits.
5279                  */
5280                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5281                         vm_page_dirty(m);
5282                 pmap_unuse_pt(pmap, pv->pv_va, *l3e, &free);
5283                 pmap_invalidate_page(pmap, pv->pv_va);
5284                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5285                 m->md.pv_gen++;
5286                 free_pv_entry(pmap, pv);
5287                 PMAP_UNLOCK(pmap);
5288         }
5289         vm_page_aflag_clear(m, PGA_WRITEABLE);
5290         rw_wunlock(lock);
5291         vm_page_free_pages_toq(&free, true);
5292 }
5293
5294 /*
5295  * Destroy all managed, non-wired mappings in the given user-space
5296  * pmap.  This pmap cannot be active on any processor besides the
5297  * caller.
5298  *
5299  * This function cannot be applied to the kernel pmap.  Moreover, it
5300  * is not intended for general use.  It is only to be used during
5301  * process termination.  Consequently, it can be implemented in ways
5302  * that make it faster than pmap_remove().  First, it can more quickly
5303  * destroy mappings by iterating over the pmap's collection of PV
5304  * entries, rather than searching the page table.  Second, it doesn't
5305  * have to test and clear the page table entries atomically, because
5306  * no processor is currently accessing the user address space.  In
5307  * particular, a page table entry's dirty bit won't change state once
5308  * this function starts.
5309  *
5310  * Although this function destroys all of the pmap's managed,
5311  * non-wired mappings, it can delay and batch the invalidation of TLB
5312  * entries without calling pmap_delayed_invl_started() and
5313  * pmap_delayed_invl_finished().  Because the pmap is not active on
5314  * any other processor, none of these TLB entries will ever be used
5315  * before their eventual invalidation.  Consequently, there is no need
5316  * for either pmap_remove_all() or pmap_remove_write() to wait for
5317  * that eventual TLB invalidation.
5318  */
5319
5320 void
5321 mmu_radix_remove_pages(pmap_t pmap)
5322 {
5323         pml3_entry_t ptel3e;
5324         pt_entry_t *pte, tpte;
5325         struct spglist free;
5326         vm_page_t m, mpte, mt;
5327         pv_entry_t pv;
5328         struct md_page *pvh;
5329         struct pv_chunk *pc, *npc;
5330         struct rwlock *lock;
5331         int64_t bit;
5332         uint64_t inuse, bitmask;
5333         int allfree, field, freed, idx;
5334         boolean_t superpage;
5335         vm_paddr_t pa;
5336
5337         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5338
5339         /*
5340          * Assert that the given pmap is only active on the current
5341          * CPU.  Unfortunately, we cannot block another CPU from
5342          * activating the pmap while this function is executing.
5343          */
5344         KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5345             ("non-current asid %lu - expected %lu", pmap->pm_pid,
5346             mfspr(SPR_PID)));
5347
5348         lock = NULL;
5349
5350         SLIST_INIT(&free);
5351         PMAP_LOCK(pmap);
5352         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5353                 allfree = 1;
5354                 freed = 0;
5355                 for (field = 0; field < _NPCM; field++) {
5356                         inuse = ~pc->pc_map[field] & pc_freemask[field];
5357                         while (inuse != 0) {
5358                                 bit = cnttzd(inuse);
5359                                 bitmask = 1UL << bit;
5360                                 idx = field * 64 + bit;
5361                                 pv = &pc->pc_pventry[idx];
5362                                 inuse &= ~bitmask;
5363
5364                                 pte = pmap_pml2e(pmap, pv->pv_va);
5365                                 ptel3e = *pte;
5366                                 pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5367                                 tpte = *pte;
5368                                 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5369                                         superpage = FALSE;
5370                                         ptel3e = tpte;
5371                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5372                                             PG_FRAME);
5373                                         pte = &pte[pmap_pte_index(pv->pv_va)];
5374                                         tpte = *pte;
5375                                 } else {
5376                                         /*
5377                                          * Keep track of whether 'tpte' is a
5378                                          * superpage explicitly instead of
5379                                          * relying on RPTE_LEAF being set.
5380                                          *
5381                                          * This is because RPTE_LEAF is numerically
5382                                          * identical to PG_PTE_PAT and thus a
5383                                          * regular page could be mistaken for
5384                                          * a superpage.
5385                                          */
5386                                         superpage = TRUE;
5387                                 }
5388
5389                                 if ((tpte & PG_V) == 0) {
5390                                         panic("bad pte va %lx pte %lx",
5391                                             pv->pv_va, tpte);
5392                                 }
5393
5394 /*
5395  * We cannot remove wired pages from a process' mapping at this time
5396  */
5397                                 if (tpte & PG_W) {
5398                                         allfree = 0;
5399                                         continue;
5400                                 }
5401
5402                                 if (superpage)
5403                                         pa = tpte & PG_PS_FRAME;
5404                                 else
5405                                         pa = tpte & PG_FRAME;
5406
5407                                 m = PHYS_TO_VM_PAGE(pa);
5408                                 KASSERT(m->phys_addr == pa,
5409                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5410                                     m, (uintmax_t)m->phys_addr,
5411                                     (uintmax_t)tpte));
5412
5413                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5414                                     m < &vm_page_array[vm_page_array_size],
5415                                     ("pmap_remove_pages: bad tpte %#jx",
5416                                     (uintmax_t)tpte));
5417
5418                                 pte_clear(pte);
5419
5420                                 /*
5421                                  * Update the vm_page_t clean/reference bits.
5422                                  */
5423                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5424                                         if (superpage) {
5425                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5426                                                         vm_page_dirty(mt);
5427                                         } else
5428                                                 vm_page_dirty(m);
5429                                 }
5430
5431                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5432
5433                                 /* Mark free */
5434                                 pc->pc_map[field] |= bitmask;
5435                                 if (superpage) {
5436                                         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5437                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5438                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
5439                                         pvh->pv_gen++;
5440                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
5441                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5442                                                         if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5443                                                             TAILQ_EMPTY(&mt->md.pv_list))
5444                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
5445                                         }
5446                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5447                                         if (mpte != NULL) {
5448                                                 pmap_resident_count_dec(pmap, 1);
5449                                                 KASSERT(mpte->ref_count == NPTEPG,
5450                                                     ("pmap_remove_pages: pte page wire count error"));
5451                                                 mpte->ref_count = 0;
5452                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
5453                                         }
5454                                 } else {
5455                                         pmap_resident_count_dec(pmap, 1);
5456 #ifdef VERBOSE_PV
5457                                         printf("freeing pv (%p, %p)\n",
5458                                                    pmap, pv);
5459 #endif
5460                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5461                                         m->md.pv_gen++;
5462                                         if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5463                                             TAILQ_EMPTY(&m->md.pv_list) &&
5464                                             (m->flags & PG_FICTITIOUS) == 0) {
5465                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5466                                                 if (TAILQ_EMPTY(&pvh->pv_list))
5467                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
5468                                         }
5469                                 }
5470                                 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
5471                                 freed++;
5472                         }
5473                 }
5474                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5475                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5476                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5477                 if (allfree) {
5478                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5479                         free_pv_chunk(pc);
5480                 }
5481         }
5482         if (lock != NULL)
5483                 rw_wunlock(lock);
5484         pmap_invalidate_all(pmap);
5485         PMAP_UNLOCK(pmap);
5486         vm_page_free_pages_toq(&free, true);
5487 }
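
/*
 * Editor's note (illustrative, not from the original source): the
 * "pc->pc_map[field] |= bitmask" step above returns a pv_entry slot to its
 * chunk's free bitmap.  Assuming the usual 64-bit map words, a slot index
 * decomposes roughly as
 *
 *	field   = idx / 64;
 *	bitmask = 1UL << (idx % 64);
 *
 * If nothing in the chunk had to be kept (allfree survives the scan), the
 * whole chunk page is handed back via free_pv_chunk().
 */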
5488
5489 void
5490 mmu_radix_remove_write(vm_page_t m)
5491 {
5492         struct md_page *pvh;
5493         pmap_t pmap;
5494         struct rwlock *lock;
5495         pv_entry_t next_pv, pv;
5496         pml3_entry_t *l3e;
5497         pt_entry_t oldpte, *pte;
5498         int pvh_gen, md_gen;
5499
5500         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5501         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5502             ("pmap_remove_write: page %p is not managed", m));
5503         vm_page_assert_busied(m);
5504
5505         if (!pmap_page_is_write_mapped(m))
5506                 return;
5507         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5508         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5509             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5510 retry_pv_loop:
5511         rw_wlock(lock);
5512         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
5513                 pmap = PV_PMAP(pv);
5514                 if (!PMAP_TRYLOCK(pmap)) {
5515                         pvh_gen = pvh->pv_gen;
5516                         rw_wunlock(lock);
5517                         PMAP_LOCK(pmap);
5518                         rw_wlock(lock);
5519                         if (pvh_gen != pvh->pv_gen) {
5520                                 PMAP_UNLOCK(pmap);
5521                                 rw_wunlock(lock);
5522                                 goto retry_pv_loop;
5523                         }
5524                 }
5525                 l3e = pmap_pml3e(pmap, pv->pv_va);
5526                 if ((*l3e & PG_RW) != 0)
5527                         (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
5528                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5529                     ("inconsistent pv lock %p %p for page %p",
5530                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5531                 PMAP_UNLOCK(pmap);
5532         }
5533         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
5534                 pmap = PV_PMAP(pv);
5535                 if (!PMAP_TRYLOCK(pmap)) {
5536                         pvh_gen = pvh->pv_gen;
5537                         md_gen = m->md.pv_gen;
5538                         rw_wunlock(lock);
5539                         PMAP_LOCK(pmap);
5540                         rw_wlock(lock);
5541                         if (pvh_gen != pvh->pv_gen ||
5542                             md_gen != m->md.pv_gen) {
5543                                 PMAP_UNLOCK(pmap);
5544                                 rw_wunlock(lock);
5545                                 goto retry_pv_loop;
5546                         }
5547                 }
5548                 l3e = pmap_pml3e(pmap, pv->pv_va);
5549                 KASSERT((*l3e & RPTE_LEAF) == 0,
5550                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
5551                     m));
5552                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5553 retry:
5554                 oldpte = *pte;
5555                 if (oldpte & PG_RW) {
5556                         if (!atomic_cmpset_long(pte, oldpte,
5557                             (oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M)))
5558                                 goto retry;
5559                         if ((oldpte & PG_M) != 0)
5560                                 vm_page_dirty(m);
5561                         pmap_invalidate_page(pmap, pv->pv_va);
5562                 }
5563                 PMAP_UNLOCK(pmap);
5564         }
5565         rw_wunlock(lock);
5566         vm_page_aflag_clear(m, PGA_WRITEABLE);
5567 }
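
/*
 * Editor's sketch (for illustration only; local names are assumptions): the
 * per-PTE write-protect step above boils down to a compare-and-set retry
 * loop.  Stripped of the PV bookkeeping and TLB invalidation it is roughly
 *
 *	do {
 *		old = *pte;
 *		if ((old & PG_RW) == 0)
 *			break;
 *	} while (!atomic_cmpset_long(pte, old,
 *	    (old | RPTE_EAA_R) & ~(PG_RW | PG_M)));
 *
 * The dirty state must be harvested from the value read before the update,
 * which is why vm_page_dirty() is driven by the old PTE's PG_M bit.
 */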
5568
5569 /*
5570  *      Clear the wired attribute from the mappings for the specified range of
5571  *      addresses in the given pmap.  Every valid mapping within that range
5572  *      must have the wired attribute set.  In contrast, invalid mappings
5573  *      cannot have the wired attribute set, so they are ignored.
5574  *
5575  *      The wired attribute of the page table entry is not a hardware
5576  *      feature, so there is no need to invalidate any TLB entries.
5577  *      Since pmap_demote_l3e() for the wired entry must never fail,
5578  *      pmap_delayed_invl_started()/finished() calls around the
5579  *      function are not needed.
5580  */
5581 void
5582 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5583 {
5584         vm_offset_t va_next;
5585         pml1_entry_t *l1e;
5586         pml2_entry_t *l2e;
5587         pml3_entry_t *l3e;
5588         pt_entry_t *pte;
5589
5590         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5591         PMAP_LOCK(pmap);
5592         for (; sva < eva; sva = va_next) {
5593                 l1e = pmap_pml1e(pmap, sva);
5594                 if ((*l1e & PG_V) == 0) {
5595                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5596                         if (va_next < sva)
5597                                 va_next = eva;
5598                         continue;
5599                 }
5600                 l2e = pmap_l1e_to_l2e(l1e, sva);
5601                 if ((*l2e & PG_V) == 0) {
5602                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5603                         if (va_next < sva)
5604                                 va_next = eva;
5605                         continue;
5606                 }
5607                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5608                 if (va_next < sva)
5609                         va_next = eva;
5610                 l3e = pmap_l2e_to_l3e(l2e, sva);
5611                 if ((*l3e & PG_V) == 0)
5612                         continue;
5613                 if ((*l3e & RPTE_LEAF) != 0) {
5614                         if ((*l3e & PG_W) == 0)
5615                                 panic("pmap_unwire: pde %#jx is missing PG_W",
5616                                     (uintmax_t)*l3e);
5617
5618                         /*
5619                          * Are we unwiring the entire large page?  If not,
5620                          * demote the mapping and fall through.
5621                          */
5622                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5623                                 atomic_clear_long(l3e, PG_W);
5624                                 pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
5625                                     PAGE_SIZE;
5626                                 continue;
5627                         } else if (!pmap_demote_l3e(pmap, l3e, sva))
5628                                 panic("pmap_unwire: demotion failed");
5629                 }
5630                 if (va_next > eva)
5631                         va_next = eva;
5632                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
5633                     sva += PAGE_SIZE) {
5634                         MPASS(pte == pmap_pte(pmap, sva));
5635                         if ((*pte & PG_V) == 0)
5636                                 continue;
5637                         if ((*pte & PG_W) == 0)
5638                                 panic("pmap_unwire: pte %#jx is missing PG_W",
5639                                     (uintmax_t)*pte);
5640
5641                         /*
5642                          * PG_W must be cleared atomically.  Although the pmap
5643                          * lock synchronizes access to PG_W, another processor
5644                          * could be setting PG_M and/or PG_A concurrently.
5645                          */
5646                         atomic_clear_long(pte, PG_W);
5647                         pmap->pm_stats.wired_count--;
5648                 }
5649         }
5650         PMAP_UNLOCK(pmap);
5651 }
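
/*
 * Editor's note (illustrative arithmetic, assuming 2MB L3 leaves and 4KB
 * base pages): unwiring a fully covered 2MB leaf above drops
 * pm_stats.wired_count by L3_PAGE_SIZE / PAGE_SIZE = 512 in one step,
 * whereas a partially covered leaf is first demoted and then unwired one
 * 4KB PTE (and one wired_count decrement) at a time.
 */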
5652
5653 void
5654 mmu_radix_zero_page(vm_page_t m)
5655 {
5656         vm_offset_t addr;
5657
5658         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5659         addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5660         pagezero(addr);
5661 }
5662
5663 void
5664 mmu_radix_zero_page_area(vm_page_t m, int off, int size)
5665 {
5666         caddr_t addr;
5667
5668         CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
5669         MPASS(off + size <= PAGE_SIZE);
5670         addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5671         memset(addr + off, 0, size);
5672 }
5673
5674 static int
5675 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5676 {
5677         pml3_entry_t *l3ep;
5678         pt_entry_t pte;
5679         vm_paddr_t pa;
5680         int val;
5681
5682         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
5683         PMAP_LOCK(pmap);
5684
5685         l3ep = pmap_pml3e(pmap, addr);
5686         if (l3ep != NULL && (*l3ep & PG_V)) {
5687                 if (*l3ep & RPTE_LEAF) {
5688                         pte = *l3ep;
5689                         /* Compute the physical address of the 4KB page. */
5690                         pa = ((*l3ep & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
5691                             PG_FRAME;
5692                         val = MINCORE_PSIND(1);
5693                 } else {
5694                         pte = *pmap_l3e_to_pte(l3ep, addr);
5695                         pa = pte & PG_FRAME;
5696                         val = 0;
5697                 }
5698         } else {
5699                 pte = 0;
5700                 pa = 0;
5701                 val = 0;
5702         }
5703         if ((pte & PG_V) != 0) {
5704                 val |= MINCORE_INCORE;
5705                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5706                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5707                 if ((pte & PG_A) != 0)
5708                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5709         }
5710         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5711             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5712             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5713                 *locked_pa = pa;
5714         }
5715         PMAP_UNLOCK(pmap);
5716         return (val);
5717 }
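
/*
 * Editor's sketch (userland view, not part of the pmap): the MINCORE_PSIND(1)
 * value returned above is how a mincore(2) caller learns that an address is
 * backed by a 2MB leaf.  A minimal check might look like the following;
 * MINCORE_SUPER is assumed to cover the MINCORE_PSIND() bits as defined in
 * <sys/mman.h>, and addr is assumed to be page-aligned:
 *
 *	#include <sys/mman.h>
 *	#include <stdbool.h>
 *
 *	static bool
 *	is_superpage_mapped(const void *addr)
 *	{
 *		char vec;
 *
 *		return (mincore(addr, 1, &vec) == 0 &&
 *		    (vec & MINCORE_SUPER) != 0);
 *	}
 */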
5718
5719 void
5720 mmu_radix_activate(struct thread *td)
5721 {
5722         pmap_t pmap;
5723         uint32_t curpid;
5724
5725         CTR2(KTR_PMAP, "%s(%p)", __func__, td);
5726         critical_enter();
5727         pmap = vmspace_pmap(td->td_proc->p_vmspace);
5728         curpid = mfspr(SPR_PID);
5729         if (pmap->pm_pid > isa3_base_pid &&
5730             curpid != pmap->pm_pid) {
5731                 mmu_radix_pid_set(pmap);
5732         }
5733         critical_exit();
5734 }
5735
5736 /*
5737  *      Increase the starting virtual address of the given mapping if a
5738  *      different alignment might result in more superpage mappings.
5739  */
5740 void
5741 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
5742     vm_offset_t *addr, vm_size_t size)
5743 {
5744         vm_offset_t superpage_offset;
5745
5746         CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset,
5747             addr, size);
5748
5749         if (size < L3_PAGE_SIZE)
5750                 return;
5751         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5752                 offset += ptoa(object->pg_color);
5753         superpage_offset = offset & L3_PAGE_MASK;
5754         if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
5755             (*addr & L3_PAGE_MASK) == superpage_offset)
5756                 return;
5757         if ((*addr & L3_PAGE_MASK) < superpage_offset)
5758                 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
5759         else
5760                 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
5761 }
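
/*
 * Editor's worked example (assuming 2MB superpages, i.e. L3_PAGE_SIZE == 2MB,
 * and a request large enough to still contain a full 2MB frame): if the
 * object offset is congruent to 0x30000 modulo 2MB and the caller proposed
 * *addr == 0x10020000, then (*addr & L3_PAGE_MASK) == 0x20000 < 0x30000, so
 * *addr is bumped to 0x10030000.  Virtual address and backing offset then
 * share the same alignment within a 2MB frame, which is what later allows
 * the mapping to be promoted to a superpage.
 */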
5762
5763 static void *
5764 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
5765 {
5766         vm_offset_t va, tmpva, ppa, offset;
5767
5768         ppa = trunc_page(pa);
5769         offset = pa & PAGE_MASK;
5770         size = roundup2(offset + size, PAGE_SIZE);
5771         if (pa < powerpc_ptob(Maxmem))
5772                 panic("bad pa: %#lx less than Maxmem %#lx\n",
5773                           pa, powerpc_ptob(Maxmem));
5774         va = kva_alloc(size);
5775         if (bootverbose)
5776                 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
5777         KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
5778
5779         if (!va)
5780                 panic("%s: Couldn't alloc kernel virtual memory", __func__);
5781
5782         for (tmpva = va; size > 0;) {
5783                 mmu_radix_kenter_attr(tmpva, ppa, attr);
5784                 size -= PAGE_SIZE;
5785                 tmpva += PAGE_SIZE;
5786                 ppa += PAGE_SIZE;
5787         }
5788         ptesync();
5789
5790         return ((void *)(va + offset));
5791 }
5792
5793 static void *
5794 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
5795 {
5796
5797         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
5798
5799         return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
5800 }
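
/*
 * Editor's sketch (hypothetical physical address, assumed to lie above the
 * top of RAM): a driver mapping a 4KB register block would go through the
 * attribute-less wrapper above, e.g.
 *
 *	regs = mmu_radix_mapdev(0x3fe0000000UL, 0x1000);
 *
 * and receive cache-inhibited, guarded PTEs from mmu_radix_kenter_attr().
 * The returned pointer preserves the sub-page offset of the original
 * physical address.
 */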
5801
5802 void
5803 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5804 {
5805
5806         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
5807         m->md.mdpg_cache_attrs = ma;
5808
5809         /*
5810          * If "m" is a normal page, update its direct mapping.  This update
5811          * can be relied upon to perform any cache operations that are
5812          * required for data coherence.
5813          */
5814         if ((m->flags & PG_FICTITIOUS) == 0 &&
5815             mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
5816             PAGE_SIZE, m->md.mdpg_cache_attrs))
5817                 panic("memory attribute change on the direct map failed");
5818 }
5819
5820 static void
5821 mmu_radix_unmapdev(vm_offset_t va, vm_size_t size)
5822 {
5823         vm_offset_t offset;
5824
5825         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size);
5826         /* If we gave a direct map region in pmap_mapdev, do nothing */
5827         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
5828                 return;
5829
5830         offset = va & PAGE_MASK;
5831         size = round_page(offset + size);
5832         va = trunc_page(va);
5833
5834         if (pmap_initialized) {
5835                 mmu_radix_qremove(va, atop(size));
5836                 kva_free(va, size);
5837         }
5838 }
5839
5840 static __inline void
5841 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
5842 {
5843         uint64_t opte, npte;
5844
5845         /*
5846          * Spin until the requested cache mode bits are installed in the
5847          * PTE; other bits may change concurrently, hence the CAS loop.
5848          */
5849         do {
5850                 opte = *pte;
5851                 npte = opte & ~mask;
5852                 npte |= cache_bits;
5853         } while (npte != opte && !atomic_cmpset_long(pte, opte, npte));
5854 }
5855
5856 /*
5857  * Tries to demote a 1GB page mapping.
5858  */
5859 static boolean_t
5860 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
5861 {
5862         pml2_entry_t oldpdpe;
5863         pml3_entry_t *firstpde, newpde, *pde;
5864         vm_paddr_t pdpgpa;
5865         vm_page_t pdpg;
5866
5867         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5868         oldpdpe = *l2e;
5869         KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
5870             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
5871         pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT,
5872             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
5873         if (pdpg == NULL) {
5874                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
5875                     " in pmap %p", va, pmap);
5876                 return (FALSE);
5877         }
5878         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
5879         firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
5880         KASSERT((oldpdpe & PG_A) != 0,
5881             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
5882         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
5883             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
5884         newpde = oldpdpe;
5885
5886         /*
5887          * Initialize the page directory page.
5888          */
5889         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
5890                 *pde = newpde;
5891                 newpde += L3_PAGE_SIZE;
5892         }
5893
5894         /*
5895          * Demote the mapping.
5896          */
5897         pde_store(l2e, pdpgpa);
5898
5899         /*
5900          * Flush PWC --- XXX revisit
5901          */
5902         pmap_invalidate_all(pmap);
5903
5904         pmap_l2e_demotions++;
5905         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
5906             " in pmap %p", va, pmap);
5907         return (TRUE);
5908 }
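
/*
 * Editor's note (illustrative arithmetic): the demotion above replaces one
 * 1GB leaf with a freshly allocated page directory page of NPDEPG entries.
 * With 2MB L3 leaves that is 512 entries, and stepping newpde by
 * L3_PAGE_SIZE per slot (512 * 2MB = 1GB) reproduces exactly the physical
 * range and attributes of the old mapping, so only the page size changes.
 */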
5909
5910 vm_paddr_t
5911 mmu_radix_kextract(vm_offset_t va)
5912 {
5913         pml3_entry_t l3e;
5914         vm_paddr_t pa;
5915
5916         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
5917         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
5918                 pa = DMAP_TO_PHYS(va);
5919         } else {
5920                 l3e = *pmap_pml3e(kernel_pmap, va);
5921                 if (l3e & RPTE_LEAF) {
5922                         pa = (l3e & PG_PS_FRAME) |
5923                             (va & L3_PAGE_MASK);
5924                 } else {
5925                         /*
5926                          * Beware of a concurrent promotion that changes the
5927                          * PDE at this point!  For example, vtopte() must not
5928                          * be used to access the PTE because it would use the
5929                          * new PDE.  It is, however, safe to use the old PDE
5930                          * because the page table page is preserved by the
5931                          * promotion.
5932                          */
5933                         pa = *pmap_l3e_to_pte(&l3e, va);
5934                         pa = (pa & PG_FRAME) |
5935                             (va & PAGE_MASK);
5936                 }
5937         }
5938         return (pa);
5939 }
5940
5941 static pt_entry_t
5942 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
5943 {
5944
5945         if (ma != VM_MEMATTR_DEFAULT) {
5946                 return (pmap_cache_bits(ma));
5947         }
5948
5949         /*
5950          * Assume the page is cache inhibited and access is guarded unless
5951          * it's in our available memory array.
5952          */
5953         for (int i = 0; i < pregions_sz; i++) {
5954                 if ((pa >= pregions[i].mr_start) &&
5955                     (pa < (pregions[i].mr_start + pregions[i].mr_size)))
5956                         return (RPTE_ATTR_MEM);
5957         }
5958         return (RPTE_ATTR_GUARDEDIO);
5959 }
5960
5961 static void
5962 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
5963 {
5964         pt_entry_t *pte, pteval;
5965         uint64_t cache_bits;
5966
5967         pte = kvtopte(va);
5968         MPASS(pte != NULL);
5969         pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
5970         cache_bits = mmu_radix_calc_wimg(pa, ma);
5971         pte_store(pte, pteval | cache_bits);
5972 }
5973
5974 void
5975 mmu_radix_kremove(vm_offset_t va)
5976 {
5977         pt_entry_t *pte;
5978
5979         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
5980
5981         pte = kvtopte(va);
5982         pte_clear(pte);
5983 }
5984
5985 int
5986 mmu_radix_decode_kernel_ptr(vm_offset_t addr,
5987     int *is_user, vm_offset_t *decoded)
5988 {
5989
5990         CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
5991         *decoded = addr;
5992         *is_user = (addr < VM_MAXUSER_ADDRESS);
5993         return (0);
5994 }
5995
5996 static boolean_t
5997 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
5998 {
5999
6000         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
6001         return (mem_valid(pa, size));
6002 }
6003
6004 static void
6005 mmu_radix_scan_init(void)
6006 {
6007
6008         CTR1(KTR_PMAP, "%s()", __func__);
6009         UNIMPLEMENTED();
6010 }
6011
6012 static void
6013 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
6014         void **va)
6015 {
6016         CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
6017         UNIMPLEMENTED();
6018 }
6019
6020 vm_offset_t
6021 mmu_radix_quick_enter_page(vm_page_t m)
6022 {
6023         vm_paddr_t paddr;
6024
6025         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
6026         paddr = VM_PAGE_TO_PHYS(m);
6027         return (PHYS_TO_DMAP(paddr));
6028 }
6029
6030 void
6031 mmu_radix_quick_remove_page(vm_offset_t addr __unused)
6032 {
6033         /* no work to do here */
6034         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
6035 }
6036
6037 static void
6038 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
6039 {
6040         cpu_flush_dcache((void *)sva, eva - sva);
6041 }
6042
6043 int
6044 mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
6045     vm_memattr_t mode)
6046 {
6047         int error;
6048
6049         CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
6050         PMAP_LOCK(kernel_pmap);
6051         error = pmap_change_attr_locked(va, size, mode, true);
6052         PMAP_UNLOCK(kernel_pmap);
6053         return (error);
6054 }
6055
6056 static int
6057 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
6058 {
6059         vm_offset_t base, offset, tmpva;
6060         vm_paddr_t pa_start, pa_end, pa_end1;
6061         pml2_entry_t *l2e;
6062         pml3_entry_t *l3e;
6063         pt_entry_t *pte;
6064         int cache_bits, error;
6065         boolean_t changed;
6066
6067         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6068         base = trunc_page(va);
6069         offset = va & PAGE_MASK;
6070         size = round_page(offset + size);
6071
6072         /*
6073          * Only supported on kernel virtual addresses, including the
6074          * direct map.
6075          */
6076         if (base < DMAP_MIN_ADDRESS)
6077                 return (EINVAL);
6078
6079         cache_bits = pmap_cache_bits(mode);
6080         changed = FALSE;
6081
6082         /*
6083          * Pages that aren't mapped aren't supported.  Also break down
6084          * 1GB and 2MB pages into smaller pages if required.
6085          */
6086         for (tmpva = base; tmpva < base + size; ) {
6087                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6088                 if (l2e == NULL || *l2e == 0)
6089                         return (EINVAL);
6090                 if (*l2e & RPTE_LEAF) {
6091                         /*
6092                          * If the current 1GB page already has the required
6093                          * memory type, then we need not demote this page. Just
6094                          * increment tmpva to the next 1GB page frame.
6095                          */
6096                         if ((*l2e & RPTE_ATTR_MASK) == cache_bits) {
6097                                 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6098                                 continue;
6099                         }
6100
6101                         /*
6102                          * If the current offset aligns with a 1GB page frame
6103                          * and there is at least 1GB left within the range, then
6104                          * we need not break down this page into 2MB pages.
6105                          */
6106                         if ((tmpva & L2_PAGE_MASK) == 0 &&
6107                             tmpva + L2_PAGE_MASK < base + size) {
6108                                 tmpva += L2_PAGE_SIZE;
6109                                 continue;
6110                         }
6111                         if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
6112                                 return (ENOMEM);
6113                 }
6114                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6115                 KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
6116                     tmpva, l2e));
6117                 if (*l3e == 0)
6118                         return (EINVAL);
6119                 if (*l3e & RPTE_LEAF) {
6120                         /*
6121                          * If the current 2MB page already has the required
6122                          * memory type, then we need not demote this page. Just
6123                          * increment tmpva to the next 2MB page frame.
6124                          */
6125                         if ((*l3e & RPTE_ATTR_MASK) == cache_bits) {
6126                                 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6127                                 continue;
6128                         }
6129
6130                         /*
6131                          * If the current offset aligns with a 2MB page frame
6132                          * and there is at least 2MB left within the range, then
6133                          * we need not break down this page into 4KB pages.
6134                          */
6135                         if ((tmpva & L3_PAGE_MASK) == 0 &&
6136                             tmpva + L3_PAGE_MASK < base + size) {
6137                                 tmpva += L3_PAGE_SIZE;
6138                                 continue;
6139                         }
6140                         if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
6141                                 return (ENOMEM);
6142                 }
6143                 pte = pmap_l3e_to_pte(l3e, tmpva);
6144                 if (*pte == 0)
6145                         return (EINVAL);
6146                 tmpva += PAGE_SIZE;
6147         }
6148         error = 0;
6149
6150         /*
6151          * Ok, all the pages exist, so run through them updating their
6152          * cache mode if required.
6153          */
6154         pa_start = pa_end = 0;
6155         for (tmpva = base; tmpva < base + size; ) {
6156                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6157                 if (*l2e & RPTE_LEAF) {
6158                         if ((*l2e & RPTE_ATTR_MASK) != cache_bits) {
6159                                 pmap_pte_attr(l2e, cache_bits,
6160                                     RPTE_ATTR_MASK);
6161                                 changed = TRUE;
6162                         }
6163                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6164                             (*l2e & PG_PS_FRAME) < dmaplimit) {
6165                                 if (pa_start == pa_end) {
6166                                         /* Start physical address run. */
6167                                         pa_start = *l2e & PG_PS_FRAME;
6168                                         pa_end = pa_start + L2_PAGE_SIZE;
6169                                 } else if (pa_end == (*l2e & PG_PS_FRAME))
6170                                         pa_end += L2_PAGE_SIZE;
6171                                 else {
6172                                         /* Run ended, update direct map. */
6173                                         error = pmap_change_attr_locked(
6174                                             PHYS_TO_DMAP(pa_start),
6175                                             pa_end - pa_start, mode, flush);
6176                                         if (error != 0)
6177                                                 break;
6178                                         /* Start physical address run. */
6179                                         pa_start = *l2e & PG_PS_FRAME;
6180                                         pa_end = pa_start + L2_PAGE_SIZE;
6181                                 }
6182                         }
6183                         tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6184                         continue;
6185                 }
6186                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6187                 if (*l3e & RPTE_LEAF) {
6188                         if ((*l3e & RPTE_ATTR_MASK) != cache_bits) {
6189                                 pmap_pte_attr(l3e, cache_bits,
6190                                     RPTE_ATTR_MASK);
6191                                 changed = TRUE;
6192                         }
6193                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6194                             (*l3e & PG_PS_FRAME) < dmaplimit) {
6195                                 if (pa_start == pa_end) {
6196                                         /* Start physical address run. */
6197                                         pa_start = *l3e & PG_PS_FRAME;
6198                                         pa_end = pa_start + L3_PAGE_SIZE;
6199                                 } else if (pa_end == (*l3e & PG_PS_FRAME))
6200                                         pa_end += L3_PAGE_SIZE;
6201                                 else {
6202                                         /* Run ended, update direct map. */
6203                                         error = pmap_change_attr_locked(
6204                                             PHYS_TO_DMAP(pa_start),
6205                                             pa_end - pa_start, mode, flush);
6206                                         if (error != 0)
6207                                                 break;
6208                                         /* Start physical address run. */
6209                                         pa_start = *l3e & PG_PS_FRAME;
6210                                         pa_end = pa_start + L3_PAGE_SIZE;
6211                                 }
6212                         }
6213                         tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6214                 } else {
6215                         pte = pmap_l3e_to_pte(l3e, tmpva);
6216                         if ((*pte & RPTE_ATTR_MASK) != cache_bits) {
6217                                 pmap_pte_attr(pte, cache_bits,
6218                                     RPTE_ATTR_MASK);
6219                                 changed = TRUE;
6220                         }
6221                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6222                             (*pte & PG_FRAME) < dmaplimit) {
6223                                 if (pa_start == pa_end) {
6224                                         /* Start physical address run. */
6225                                         pa_start = *pte & PG_FRAME;
6226                                         pa_end = pa_start + PAGE_SIZE;
6227                                 } else if (pa_end == (*pte & PG_FRAME))
6228                                         pa_end += PAGE_SIZE;
6229                                 else {
6230                                         /* Run ended, update direct map. */
6231                                         error = pmap_change_attr_locked(
6232                                             PHYS_TO_DMAP(pa_start),
6233                                             pa_end - pa_start, mode, flush);
6234                                         if (error != 0)
6235                                                 break;
6236                                         /* Start physical address run. */
6237                                         pa_start = *pte & PG_FRAME;
6238                                         pa_end = pa_start + PAGE_SIZE;
6239                                 }
6240                         }
6241                         tmpva += PAGE_SIZE;
6242                 }
6243         }
6244         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6245                 pa_end1 = MIN(pa_end, dmaplimit);
6246                 if (pa_start != pa_end1)
6247                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6248                             pa_end1 - pa_start, mode, flush);
6249         }
6250
6251         /*
6252          * Flush CPU caches if required to make sure any data isn't cached that
6253          * shouldn't be, etc.
6254          */
6255         if (changed) {
6256                 pmap_invalidate_all(kernel_pmap);
6257
6258                 if (flush)
6259                         pmap_invalidate_cache_range(base, tmpva);
6260         }
6261         return (error);
6262 }
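
/*
 * Editor's sketch (hypothetical usage; va, len and dev are assumptions, and
 * callers normally reach this through the MI pmap layer): a driver that
 * wants an already mapped kernel range treated as uncacheable could do
 * roughly
 *
 *	error = mmu_radix_change_attr(va, len, VM_MEMATTR_UNCACHEABLE);
 *	if (error != 0)
 *		device_printf(dev, "change_attr failed: %d\n", error);
 *
 * The routine demotes any 1GB/2MB leaves straddling the range and mirrors
 * the attribute change onto the direct-map alias of the same physical
 * pages, as the two passes above implement.
 */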
6263
6264 /*
6265  * Allocate physical memory for the vm_page array and map it into KVA,
6266  * attempting to back the vm_pages with domain-local memory.
6267  */
6268 void
6269 mmu_radix_page_array_startup(long pages)
6270 {
6271 #ifdef notyet
6272         pml2_entry_t *l2e;
6273         pml3_entry_t *pde;
6274         pml3_entry_t newl3;
6275         vm_offset_t va;
6276         long pfn;
6277         int domain, i;
6278 #endif
6279         vm_paddr_t pa;
6280         vm_offset_t start, end;
6281
6282         vm_page_array_size = pages;
6283
6284         start = VM_MIN_KERNEL_ADDRESS;
6285         end = start + pages * sizeof(struct vm_page);
6286
6287         pa = vm_phys_early_alloc(0, end - start);
6288
6289         start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
6290 #ifdef notyet
6291         /* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
6292         for (va = start; va < end; va += L3_PAGE_SIZE) {
6293                 pfn = first_page + (va - start) / sizeof(struct vm_page);
6294                 domain = _vm_phys_domain(ptoa(pfn));
6295                 l2e = pmap_pml2e(kernel_pmap, va);
6296                 if ((*l2e & PG_V) == 0) {
6297                         pa = vm_phys_early_alloc(domain, PAGE_SIZE);
6298                         dump_add_page(pa);
6299                         pagezero(PHYS_TO_DMAP(pa));
6300                         pde_store(l2e, (pml2_entry_t)pa);
6301                 }
6302                 pde = pmap_l2e_to_l3e(l2e, va);
6303                 if ((*pde & PG_V) != 0)
6304                         panic("Unexpected pde %p", pde);
6305                 pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
6306                 for (i = 0; i < NPDEPG; i++)
6307                         dump_add_page(pa + i * PAGE_SIZE);
6308                 newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
6309                 pte_store(pde, newl3);
6310         }
6311 #endif
6312         vm_page_array = (vm_page_t)start;
6313 }
6314
6315 #ifdef DDB
6316 #include <sys/kdb.h>
6317 #include <ddb/ddb.h>
6318
6319 static void
6320 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
6321 {
6322         pml1_entry_t *l1e;
6323         pml2_entry_t *l2e;
6324         pml3_entry_t *l3e;
6325         pt_entry_t *pte;
6326
6327         l1e = &l1[pmap_pml1e_index(va)];
6328         db_printf("VA %#016lx l1e %#016lx", va, *l1e);
6329         if ((*l1e & PG_V) == 0) {
6330                 db_printf("\n");
6331                 return;
6332         }
6333         l2e = pmap_l1e_to_l2e(l1e, va);
6334         db_printf(" l2e %#016lx", *l2e);
6335         if ((*l2e & PG_V) == 0 || (*l2e & RPTE_LEAF) != 0) {
6336                 db_printf("\n");
6337                 return;
6338         }
6339         l3e = pmap_l2e_to_l3e(l2e, va);
6340         db_printf(" l3e %#016lx", *l3e);
6341         if ((*l3e & PG_V) == 0 || (*l3e & RPTE_LEAF) != 0) {
6342                 db_printf("\n");
6343                 return;
6344         }
6345         pte = pmap_l3e_to_pte(l3e, va);
6346         db_printf(" pte %#016lx\n", *pte);
6347 }
6348
6349 void
6350 pmap_page_print_mappings(vm_page_t m)
6351 {
6352         pmap_t pmap;
6353         pv_entry_t pv;
6354
6355         db_printf("page %p(%lx)\n", m, m->phys_addr);
6356         /* need to elide locks if running in ddb */
6357         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
6358                 db_printf("pv: %p ", pv);
6359                 db_printf("va: %#016lx ", pv->pv_va);
6360                 pmap = PV_PMAP(pv);
6361                 db_printf("pmap %p  ", pmap);
6362                 if (pmap != NULL) {
6363                         db_printf("asid: %lu\n", pmap->pm_pid);
6364                         pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
6365                 }
6366         }
6367 }
6368
6369 DB_SHOW_COMMAND(pte, pmap_print_pte)
6370 {
6371         vm_offset_t va;
6372         pmap_t pmap;
6373
6374         if (!have_addr) {
6375                 db_printf("show pte addr\n");
6376                 return;
6377         }
6378         va = (vm_offset_t)addr;
6379
6380         if (va >= DMAP_MIN_ADDRESS)
6381                 pmap = kernel_pmap;
6382         else if (kdb_thread != NULL)
6383                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
6384         else
6385                 pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
6386
6387         pmap_pte_walk(pmap->pm_pml1, va);
6388 }
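
/*
 * Editor's usage note (illustrative session, values elided): the command
 * registered above can be used from the in-kernel debugger as
 *
 *	db> show pte 0xc000000001234000
 *	VA 0xc000000001234000 l1e 0x... l2e 0x... l3e 0x... pte 0x...
 *
 * walking one radix level per column of the kernel pmap (or the current
 * thread's pmap for user addresses).
 */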
6389
6390 #endif