1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2018 Matthew Macy
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include <sys/param.h>
32 #include <sys/kernel.h>
33 #include <sys/systm.h>
34 #include <sys/conf.h>
35 #include <sys/bitstring.h>
36 #include <sys/queue.h>
37 #include <sys/cpuset.h>
38 #include <sys/endian.h>
39 #include <sys/kerneldump.h>
40 #include <sys/ktr.h>
41 #include <sys/lock.h>
42 #include <sys/syslog.h>
43 #include <sys/msgbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/mman.h>
46 #include <sys/mutex.h>
47 #include <sys/proc.h>
48 #include <sys/rwlock.h>
49 #include <sys/sched.h>
50 #include <sys/sysctl.h>
52 #include <sys/vmem.h>
53 #include <sys/vmmeter.h>
54 #include <sys/smp.h>
55
56 #include <sys/kdb.h>
57
58 #include <dev/ofw/openfirm.h>
59
60 #include <vm/vm.h>
61 #include <vm/pmap.h>
62 #include <vm/vm_param.h>
63 #include <vm/vm_kern.h>
64 #include <vm/vm_page.h>
65 #include <vm/vm_map.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_extern.h>
68 #include <vm/vm_pageout.h>
69 #include <vm/vm_phys.h>
70 #include <vm/vm_reserv.h>
71 #include <vm/vm_dumpset.h>
72 #include <vm/uma.h>
73
74 #include <machine/_inttypes.h>
75 #include <machine/cpu.h>
76 #include <machine/platform.h>
77 #include <machine/frame.h>
78 #include <machine/md_var.h>
79 #include <machine/psl.h>
80 #include <machine/bat.h>
81 #include <machine/hid.h>
82 #include <machine/pte.h>
83 #include <machine/sr.h>
84 #include <machine/trap.h>
85 #include <machine/mmuvar.h>
86
87 #ifdef INVARIANTS
88 #include <vm/uma_dbg.h>
89 #endif
90
91 #define PPC_BITLSHIFT(bit)      (sizeof(long)*NBBY - 1 - (bit))
92 #define PPC_BIT(bit)            (1UL << PPC_BITLSHIFT(bit))
93 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
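
/*
 * These helpers translate IBM/Power bit numbering, where bit 0 is the
 * most significant bit of the (64-bit) word, into ordinary C shifts.
 * For example, assuming 64-bit longs:
 *
 *	PPC_BITLSHIFT(0)  == 63, so PPC_BIT(0)  == 1UL << 63	(MSB)
 *	PPC_BITLSHIFT(63) == 0,  so PPC_BIT(63) == 1UL << 0	(LSB)
 *	PPC_BITLSHIFT_VAL(0x3, 62) == 0x3UL << 1
 *
 * The tlbiel helpers below use PPC_BITLSHIFT_VAL() to build register
 * images whose fields the ISA specifies with IBM bit numbers.
 */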
94
95 #include "opt_ddb.h"
96 #ifdef DDB
97 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
98 #endif
99
100 #define PG_W    RPTE_WIRED
101 #define PG_V    RPTE_VALID
102 #define PG_MANAGED      RPTE_MANAGED
103 #define PG_PROMOTED     RPTE_PROMOTED
104 #define PG_M    RPTE_C
105 #define PG_A    RPTE_R
106 #define PG_X    RPTE_EAA_X
107 #define PG_RW   RPTE_EAA_W
108 #define PG_PTE_CACHE RPTE_ATTR_MASK
109
110 #define RPTE_SHIFT 9
111 #define NLS_MASK ((1UL<<5)-1)
112 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
113 #define RPTE_MASK (RPTE_ENTRIES-1)
114
115 #define NLB_SHIFT 0
116 #define NLB_MASK (((1UL<<52)-1) << 8)
117
118 extern int nkpt;
119 extern caddr_t crashdumpmap;
120
121 #define RIC_FLUSH_TLB 0
122 #define RIC_FLUSH_PWC 1
123 #define RIC_FLUSH_ALL 2
124
125 #define POWER9_TLB_SETS_RADIX   128     /* # sets in POWER9 TLB Radix mode */
126
127 #define PPC_INST_TLBIE                  0x7c000264
128 #define PPC_INST_TLBIEL                 0x7c000224
129 #define PPC_INST_SLBIA                  0x7c0003e4
130
131 #define ___PPC_RA(a)    (((a) & 0x1f) << 16)
132 #define ___PPC_RB(b)    (((b) & 0x1f) << 11)
133 #define ___PPC_RS(s)    (((s) & 0x1f) << 21)
134 #define ___PPC_RT(t)    ___PPC_RS(t)
135 #define ___PPC_R(r)     (((r) & 0x1) << 16)
136 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17)
137 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18)
138
139 #define PPC_SLBIA(IH)   __XSTRING(.long PPC_INST_SLBIA | \
140                                        ((IH & 0x7) << 21))
141 #define PPC_TLBIE_5(rb,rs,ric,prs,r)                            \
142         __XSTRING(.long PPC_INST_TLBIE |                        \
143                           ___PPC_RB(rb) | ___PPC_RS(rs) |       \
144                           ___PPC_RIC(ric) | ___PPC_PRS(prs) |   \
145                           ___PPC_R(r))
146
147 #define PPC_TLBIEL(rb,rs,ric,prs,r) \
148          __XSTRING(.long PPC_INST_TLBIEL | \
149                            ___PPC_RB(rb) | ___PPC_RS(rs) |      \
150                            ___PPC_RIC(ric) | ___PPC_PRS(prs) |  \
151                            ___PPC_R(r))
152
153 #define PPC_INVALIDATE_ERAT             PPC_SLBIA(7)
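
/*
 * These macros expand to raw ".long" encodings rather than mnemonics,
 * presumably so the inline assembly does not depend on assembler support
 * for the newer ISA 3.0 operand forms.  As a sketch, PPC_SLBIA(7)
 * stringifies to
 *
 *	.long 0x7c0003e4 | ((7 & 0x7) << 21)
 *
 * which is the slbia IH=7 form used above as PPC_INVALIDATE_ERAT.
 */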
154
155 static __inline void
156 ttusync(void)
157 {
158         __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
159 }
160
161 #define TLBIEL_INVAL_SEL_MASK   0xc00   /* invalidation selector */
162 #define  TLBIEL_INVAL_PAGE      0x000   /* invalidate a single page */
163 #define  TLBIEL_INVAL_SET_PID   0x400   /* invalidate a set for the current PID */
164 #define  TLBIEL_INVAL_SET_LPID  0x800   /* invalidate a set for current LPID */
165 #define  TLBIEL_INVAL_SET       0xc00   /* invalidate a set for all LPIDs */
166
167 #define TLBIE_ACTUAL_PAGE_MASK          0xe0
168 #define  TLBIE_ACTUAL_PAGE_4K           0x00
169 #define  TLBIE_ACTUAL_PAGE_64K          0xa0
170 #define  TLBIE_ACTUAL_PAGE_2M           0x20
171 #define  TLBIE_ACTUAL_PAGE_1G           0x40
172
173 #define TLBIE_PRS_PARTITION_SCOPE       0x0
174 #define TLBIE_PRS_PROCESS_SCOPE 0x1
175
176 #define TLBIE_RIC_INVALIDATE_TLB        0x0     /* Invalidate just TLB */
177 #define TLBIE_RIC_INVALIDATE_PWC        0x1     /* Invalidate just PWC */
178 #define TLBIE_RIC_INVALIDATE_ALL        0x2     /* Invalidate TLB, PWC,
179                                                  * cached {proc, part}tab entries
180                                                  */
181 #define TLBIE_RIC_INVALIDATE_SEQ        0x3     /* HPT - only:
182                                                  * Invalidate a range of translations
183                                                  */
184
185 static __always_inline void
186 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
187                         vm_offset_t va, uint16_t ap)
188 {
189         uint64_t rb, rs;
190
191         MPASS((va & PAGE_MASK) == 0);
192
193         rs = ((uint64_t)pid << 32) | lpid;
194         rb = va | is | ap;
195         __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
196                 "r" (rb), "r" (rs), "i" (ric), "i" (prs));
197 }
198
199 static __inline void
200 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
201 {
202
203         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
204                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
205 }
206
207 static __inline void
208 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
209 {
210
211         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
212                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
213 }
214
215 static __inline void
216 radix_tlbie_invlpwc_user(uint32_t pid)
217 {
218
219         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
220                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
221 }
222
223 static __inline void
224 radix_tlbie_flush_user(uint32_t pid)
225 {
226
227         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
228                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
229 }
230
231 static __inline void
232 radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
233 {
234
235         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
236             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
237 }
238
239 static __inline void
240 radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
241 {
242
243         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
244             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
245 }
246
247 /* 1GB pages aren't currently supported. */
248 static __inline __unused void
249 radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
250 {
251
252         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
253             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
254 }
255
256 static __inline void
257 radix_tlbie_invlpwc_kernel(void)
258 {
259
260         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
261             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
262 }
263
264 static __inline void
265 radix_tlbie_flush_kernel(void)
266 {
267
268         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
269             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
270 }
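
/*
 * The wrappers above issue the bare tlbie; ordering is supplied by the
 * callers.  The pmap_invalidate_*() helpers later in this file bracket
 * each invalidation with ptesync() beforehand (so prior PTE updates are
 * visible) and ttusync() (eieio; tlbsync; ptesync) afterwards to wait for
 * the broadcast invalidation to complete, roughly:
 *
 *	ptesync();
 *	radix_tlbie_invlpg_kernel_4k(va);
 *	ttusync();
 */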
271
272 static __inline vm_pindex_t
273 pmap_l3e_pindex(vm_offset_t va)
274 {
275         return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
276 }
277
278 static __inline vm_pindex_t
279 pmap_pml3e_index(vm_offset_t va)
280 {
281
282         return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
283 }
284
285 static __inline vm_pindex_t
286 pmap_pml2e_index(vm_offset_t va)
287 {
288         return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
289 }
290
291 static __inline vm_pindex_t
292 pmap_pml1e_index(vm_offset_t va)
293 {
294         return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
295 }
296
297 /* Return various clipped indexes for a given VA */
298 static __inline vm_pindex_t
299 pmap_pte_index(vm_offset_t va)
300 {
301
302         return ((va >> PAGE_SHIFT) & RPTE_MASK);
303 }
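
/*
 * Worked example for the index helpers above, assuming the geometry
 * implied by RPTE_SHIFT == 9 and 2MB L3 superpages (PAGE_SHIFT = 12,
 * L3_PAGE_SIZE_SHIFT = 21, L2_PAGE_SIZE_SHIFT = 30, L1_PAGE_SIZE_SHIFT =
 * 39), and assuming PG_FRAME strips only the page offset/attribute bits.
 * For va = 0x8080604005, i.e. (1 << 39) | (2 << 30) | (3 << 21) |
 * (4 << 12) | 5:
 *
 *	pmap_pml1e_index(va) == (va & PG_FRAME) >> 39      == 1
 *	pmap_pml2e_index(va) == (va >> 30) & RPTE_MASK     == 2
 *	pmap_pml3e_index(va) == (va >> 21) & RPTE_MASK     == 3
 *	pmap_pte_index(va)   == (va >> 12) & RPTE_MASK     == 4
 *
 * so a full walk selects pm_pml1[1] -> l2[2] -> l3[3] -> pt[4].
 */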
304
305 /* Return a pointer to the PT slot that corresponds to a VA */
306 static __inline pt_entry_t *
307 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
308 {
309         pt_entry_t *pte;
310         vm_paddr_t ptepa;
311
312         ptepa = (be64toh(*l3e) & NLB_MASK);
313         pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
314         return (&pte[pmap_pte_index(va)]);
315 }
316
317 /* Return a pointer to the PD slot that corresponds to a VA */
318 static __inline pt_entry_t *
319 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
320 {
321         pt_entry_t *l3e;
322         vm_paddr_t l3pa;
323
324         l3pa = (be64toh(*l2e) & NLB_MASK);
325         l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
326         return (&l3e[pmap_pml3e_index(va)]);
327 }
328
329 /* Return a pointer to the L2 page directory slot that corresponds to a VA */
330 static __inline pt_entry_t *
331 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
332 {
333         pt_entry_t *l2e;
334         vm_paddr_t l2pa;
335
336         l2pa = (be64toh(*l1e) & NLB_MASK);
337
338         l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
339         return (&l2e[pmap_pml2e_index(va)]);
340 }
341
342 static __inline pml1_entry_t *
343 pmap_pml1e(pmap_t pmap, vm_offset_t va)
344 {
345
346         return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
347 }
348
349 static pt_entry_t *
350 pmap_pml2e(pmap_t pmap, vm_offset_t va)
351 {
352         pt_entry_t *l1e;
353
354         l1e = pmap_pml1e(pmap, va);
355         if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
356                 return (NULL);
357         return (pmap_l1e_to_l2e(l1e, va));
358 }
359
360 static __inline pt_entry_t *
361 pmap_pml3e(pmap_t pmap, vm_offset_t va)
362 {
363         pt_entry_t *l2e;
364
365         l2e = pmap_pml2e(pmap, va);
366         if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
367                 return (NULL);
368         return (pmap_l2e_to_l3e(l2e, va));
369 }
370
371 static __inline pt_entry_t *
372 pmap_pte(pmap_t pmap, vm_offset_t va)
373 {
374         pt_entry_t *l3e;
375
376         l3e = pmap_pml3e(pmap, va);
377         if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
378                 return (NULL);
379         return (pmap_l3e_to_pte(l3e, va));
380 }
381
382 int nkpt = 64;
383 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
384     "Number of kernel page table pages allocated on bootup");
385
386 vm_paddr_t dmaplimit;
387
388 SYSCTL_DECL(_vm_pmap);
389
390 #ifdef INVARIANTS
391 #define VERBOSE_PMAP 0
392 #define VERBOSE_PROTECT 0
393 static int pmap_logging;
394 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
395     &pmap_logging, 0, "verbose debug logging");
396 #endif
397
398 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
399
400 //static vm_paddr_t     KERNend;        /* phys addr of end of bootstrap data */
401
402 static vm_offset_t qframe = 0;
403 static struct mtx qframe_mtx;
404
405 void mmu_radix_activate(struct thread *);
406 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
407 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
408     vm_size_t);
409 void mmu_radix_clear_modify(vm_page_t);
410 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
411 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
412 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
413 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
414         vm_prot_t);
415 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
416 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
417 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
418 void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
419 vm_paddr_t mmu_radix_kextract(vm_offset_t);
420 void mmu_radix_kremove(vm_offset_t);
421 boolean_t mmu_radix_is_modified(vm_page_t);
422 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
423 boolean_t mmu_radix_is_referenced(vm_page_t);
424 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
425         vm_pindex_t, vm_size_t);
426 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
427 void mmu_radix_page_init(vm_page_t);
428 boolean_t mmu_radix_page_is_mapped(vm_page_t m);
429 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
430 int mmu_radix_page_wired_mappings(vm_page_t);
431 int mmu_radix_pinit(pmap_t);
432 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
433 bool mmu_radix_ps_enabled(pmap_t);
434 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
435 void mmu_radix_qremove(vm_offset_t, int);
436 vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
437 void mmu_radix_quick_remove_page(vm_offset_t);
438 boolean_t mmu_radix_ts_referenced(vm_page_t);
439 void mmu_radix_release(pmap_t);
440 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
441 void mmu_radix_remove_all(vm_page_t);
442 void mmu_radix_remove_pages(pmap_t);
443 void mmu_radix_remove_write(vm_page_t);
444 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
445 void mmu_radix_zero_page(vm_page_t);
446 void mmu_radix_zero_page_area(vm_page_t, int, int);
447 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
448 void mmu_radix_page_array_startup(long pages);
449
450 #include "mmu_oea64.h"
451
452 /*
453  * Kernel MMU interface
454  */
455
456 static void     mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
457
458 static void mmu_radix_copy_page(vm_page_t, vm_page_t);
459 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
460     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
461 static void mmu_radix_growkernel(vm_offset_t);
462 static void mmu_radix_init(void);
463 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
464 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
465 static void mmu_radix_pinit0(pmap_t);
466
467 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
468 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
469 static void mmu_radix_unmapdev(vm_offset_t, vm_size_t);
470 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
471 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
472 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
473 static void mmu_radix_scan_init(void);
474 static void     mmu_radix_cpu_bootstrap(int ap);
475 static void     mmu_radix_tlbie_all(void);
476
477 static struct pmap_funcs mmu_radix_methods = {
478         .bootstrap = mmu_radix_bootstrap,
479         .copy_page = mmu_radix_copy_page,
480         .copy_pages = mmu_radix_copy_pages,
481         .cpu_bootstrap = mmu_radix_cpu_bootstrap,
482         .growkernel = mmu_radix_growkernel,
483         .init = mmu_radix_init,
484         .map =                  mmu_radix_map,
485         .mincore =              mmu_radix_mincore,
486         .pinit = mmu_radix_pinit,
487         .pinit0 = mmu_radix_pinit0,
488
489         .mapdev = mmu_radix_mapdev,
490         .mapdev_attr = mmu_radix_mapdev_attr,
491         .unmapdev = mmu_radix_unmapdev,
492         .kenter_attr = mmu_radix_kenter_attr,
493         .dev_direct_mapped = mmu_radix_dev_direct_mapped,
494         .dumpsys_pa_init = mmu_radix_scan_init,
495         .dumpsys_map_chunk = mmu_radix_dumpsys_map,
496         .page_is_mapped = mmu_radix_page_is_mapped,
497         .ps_enabled = mmu_radix_ps_enabled,
498         .object_init_pt = mmu_radix_object_init_pt,
499         .protect = mmu_radix_protect,
500         /* pmap dispatcher interface */
501         .clear_modify = mmu_radix_clear_modify,
502         .copy = mmu_radix_copy,
503         .enter = mmu_radix_enter,
504         .enter_object = mmu_radix_enter_object,
505         .enter_quick = mmu_radix_enter_quick,
506         .extract = mmu_radix_extract,
507         .extract_and_hold = mmu_radix_extract_and_hold,
508         .is_modified = mmu_radix_is_modified,
509         .is_prefaultable = mmu_radix_is_prefaultable,
510         .is_referenced = mmu_radix_is_referenced,
511         .ts_referenced = mmu_radix_ts_referenced,
512         .page_exists_quick = mmu_radix_page_exists_quick,
513         .page_init = mmu_radix_page_init,
514         .page_wired_mappings =  mmu_radix_page_wired_mappings,
515         .qenter = mmu_radix_qenter,
516         .qremove = mmu_radix_qremove,
517         .release = mmu_radix_release,
518         .remove = mmu_radix_remove,
519         .remove_all = mmu_radix_remove_all,
520         .remove_write = mmu_radix_remove_write,
521         .unwire = mmu_radix_unwire,
522         .zero_page = mmu_radix_zero_page,
523         .zero_page_area = mmu_radix_zero_page_area,
524         .activate = mmu_radix_activate,
525         .quick_enter_page =  mmu_radix_quick_enter_page,
526         .quick_remove_page =  mmu_radix_quick_remove_page,
527         .page_set_memattr = mmu_radix_page_set_memattr,
528         .page_array_startup =  mmu_radix_page_array_startup,
529
530         /* Internal interfaces */
531         .kenter = mmu_radix_kenter,
532         .kextract = mmu_radix_kextract,
533         .kremove = mmu_radix_kremove,
534         .change_attr = mmu_radix_change_attr,
535         .decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
536
537         .tlbie_all = mmu_radix_tlbie_all,
538 };
539
540 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
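
/*
 * MMU_DEF() registers mmu_radix_methods as this MMU's method table with
 * the powerpc pmap dispatch layer, so the generic pmap_*() entry points
 * are routed through the installed struct pmap_funcs.  Presumably the
 * platform/boot code chooses between the radix and hash (OEA64) tables
 * before installation, so only one implementation is ever active.
 */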
541
542 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
543         struct rwlock **lockp);
544 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
545 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
546 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
547     struct spglist *free, struct rwlock **lockp);
548 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
549     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
550 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
551 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
552     struct spglist *free);
553 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
554         pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
555
556 static bool     pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
557                     u_int flags, struct rwlock **lockp);
558 #if VM_NRESERVLEVEL > 0
559 static void     pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
560         struct rwlock **lockp);
561 #endif
562 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
563 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
564 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
565         vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
566
567 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
568         vm_prot_t prot, struct rwlock **lockp);
569 static int      pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
570         u_int flags, vm_page_t m, struct rwlock **lockp);
571
572 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
573 static void free_pv_chunk(struct pv_chunk *pc);
574 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
575 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
576         struct rwlock **lockp);
577 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
578         struct rwlock **lockp);
579 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
580     struct spglist *free);
581 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
582
583 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
584 static void pmap_invalidate_all(pmap_t pmap);
585 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
586
587 /*
588  * Internal flags for pmap_enter()'s helper functions.
589  */
590 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
591 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
592
593 #define UNIMPLEMENTED() panic("%s not implemented", __func__)
594 #define UNTESTED() panic("%s not yet tested", __func__)
595
596 /* Number of supported PID bits */
597 static unsigned int isa3_pid_bits;
598
599 /* PID to start allocating from */
600 static unsigned int isa3_base_pid;
601
602 #define PROCTAB_SIZE_SHIFT      (isa3_pid_bits + 4)
603 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits)
604
605 /*
606  * Map of physical memory regions.
607  */
608 static struct   mem_region *regions, *pregions;
609 static struct   numa_mem_region *numa_pregions;
610 static u_int    phys_avail_count;
611 static int      regions_sz, pregions_sz, numa_pregions_sz;
612 static struct pate *isa3_parttab;
613 static struct prte *isa3_proctab;
614 static vmem_t *asid_arena;
615
616 extern void bs_remap_earlyboot(void);
617
618 #define RADIX_PGD_SIZE_SHIFT    16
619 #define RADIX_PGD_SIZE  (1UL << RADIX_PGD_SIZE_SHIFT)
620
621 #define RADIX_PGD_INDEX_SHIFT   (RADIX_PGD_SIZE_SHIFT-3)
622 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
623 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
624
625 #define NUPML1E         (RADIX_PGD_SIZE/sizeof(uint64_t))       /* number of userland PML1 pages */
626 #define NUPDPE          (NUPML1E * NL2EPG)/* number of userland PDP pages */
627 #define NUPDE           (NUPDPE * NL3EPG)       /* number of userland PD entries */
628
629 /* POWER9 only permits a 64k partition table size. */
630 #define PARTTAB_SIZE_SHIFT      16
631 #define PARTTAB_SIZE    (1UL << PARTTAB_SIZE_SHIFT)
632
633 #define PARTTAB_HR              (1UL << 63) /* host uses radix */
634 #define PARTTAB_GR              (1UL << 63) /* guest uses radix; must match host */
635
636 /* TLB flush actions. Used as argument to tlbiel_all() */
637 enum {
638         TLB_INVAL_SCOPE_LPID = 0,       /* invalidate TLBs for current LPID */
639         TLB_INVAL_SCOPE_GLOBAL = 1,     /* invalidate all TLBs */
640 };
641
642 #define NPV_LIST_LOCKS  MAXCPU
643 static int pmap_initialized;
644 static vm_paddr_t proctab0pa;
645 static vm_paddr_t parttab_phys;
646 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
647
648 /*
649  * Data for the pv entry allocation mechanism.
650  * The pv_chunks list is protected by pv_chunks_mutex; the per-page
651  * pv lists are protected by the pv_list_locks[] elements.
652  */
653 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
654 static struct mtx __exclusive_cache_line pv_chunks_mutex;
655 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
656 static struct md_page *pv_table;
657 static struct md_page pv_dummy;
658
659 #ifdef PV_STATS
660 #define PV_STAT(x)      do { x ; } while (0)
661 #else
662 #define PV_STAT(x)      do { } while (0)
663 #endif
664
665 #define pa_radix_index(pa)      ((pa) >> L3_PAGE_SIZE_SHIFT)
666 #define pa_to_pvh(pa)   (&pv_table[pa_radix_index(pa)])
667
668 #define PHYS_TO_PV_LIST_LOCK(pa)        \
669                         (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
670
671 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
672         struct rwlock **_lockp = (lockp);               \
673         struct rwlock *_new_lock;                       \
674                                                         \
675         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
676         if (_new_lock != *_lockp) {                     \
677                 if (*_lockp != NULL)                    \
678                         rw_wunlock(*_lockp);            \
679                 *_lockp = _new_lock;                    \
680                 rw_wlock(*_lockp);                      \
681         }                                               \
682 } while (0)
683
684 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
685         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
686
687 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
688         struct rwlock **_lockp = (lockp);               \
689                                                         \
690         if (*_lockp != NULL) {                          \
691                 rw_wunlock(*_lockp);                    \
692                 *_lockp = NULL;                         \
693         }                                               \
694 } while (0)
695
696 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
697         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
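
/*
 * Sketch of how these lock macros are meant to be used (roughly the
 * pattern followed by the enter/remove paths later in the file): a caller
 * walks mappings with a single lock cursor and only switches locks when
 * it crosses into a different bucket of pv_list_locks[]:
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	... manipulate m's pv list ...
 *	if (lock != NULL)
 *		rw_wunlock(lock);
 *
 * RELEASE_PV_LIST_LOCK() drops the cursor's lock early, e.g. before
 * reacquiring locks in a different order.
 */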
698
699 /*
700  * We support 52 bits, hence:
701  * bits 52 - 31 = 21, 0b10101
702  * RTS encoding details
703  * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
704  * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
705  */
706 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
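
/*
 * Sketch of the encoding above: a 52-bit address space needs
 * RTS = 52 - 31 = 21 = 0b10101.  The 5-bit field is split in the register
 * image, and the constant places the high two bits (0b10 == 0x2) at bits
 * 62:61 and the low three bits (0b101 == 0x5) at bits 7:5 (counting from
 * bit 0 at the least significant end), i.e. (0x2UL << 61) | (0x5UL << 5).
 */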
707
708 static int powernv_enabled = 1;
709
710 static __always_inline void
711 tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
712         uint32_t pid, uint32_t ric, uint32_t prs)
713 {
714         uint64_t rb;
715         uint64_t rs;
716
717         rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
718         rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
719
720         __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
721                      : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
722                      : "memory");
723 }
724
725 static void
726 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
727 {
728         uint32_t set;
729
730         __asm __volatile("ptesync": : :"memory");
731
732         /*
733          * Flush the first set of the TLB, and the entire Page Walk Cache
734          * and partition table entries. Then flush the remaining sets of the
735          * TLB.
736          */
737         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
738         for (set = 1; set < num_sets; set++)
739                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
740
741         /* Do the same for process scoped entries. */
742         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
743         for (set = 1; set < num_sets; set++)
744                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
745
746         __asm __volatile("ptesync": : :"memory");
747 }
748
749 static void
750 mmu_radix_tlbiel_flush(int scope)
751 {
752         int is;
753
754         MPASS(scope == TLB_INVAL_SCOPE_LPID ||
755                   scope == TLB_INVAL_SCOPE_GLOBAL);
756         is = scope + 2;
757
758         tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, is);
759         __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
760 }
761
762 static void
763 mmu_radix_tlbie_all(void)
764 {
765         /* TODO: LPID invalidate */
766         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
767 }
768
769 static void
770 mmu_radix_init_amor(void)
771 {
772         /*
773          * In HV mode, we init AMOR (Authority Mask Override Register) so that
774          * the hypervisor and guest can set up IAMR (Instruction Authority Mask
775          * Register), enable key 0 and set it to 1.
776          *
777          * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
778          */
779         mtspr(SPR_AMOR, (3ul << 62));
780 }
781
782 static void
783 mmu_radix_init_iamr(void)
784 {
785         /*
786          * Radix always uses key0 of the IAMR to determine if an access is
787          * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
788          * fetch.
789          */
790         mtspr(SPR_IAMR, (1ul << 62));
791 }
792
793 static void
794 mmu_radix_pid_set(pmap_t pmap)
795 {
796
797         mtspr(SPR_PID, pmap->pm_pid);
798         isync();
799 }
800
801 /* Quick sort callout for comparing physical addresses. */
802 static int
803 pa_cmp(const void *a, const void *b)
804 {
805         const vm_paddr_t *pa = a, *pb = b;
806
807         if (*pa < *pb)
808                 return (-1);
809         else if (*pa > *pb)
810                 return (1);
811         else
812                 return (0);
813 }
814
815 #define pte_load_store(ptep, pte)       atomic_swap_long(ptep, pte)
816 #define pte_load_clear(ptep)            atomic_swap_long(ptep, 0)
817 #define pte_store(ptep, pte) do {          \
818         MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));  \
819         *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
820 } while (0)
821 /*
822  * NB: should only be used for adding directories - not for direct mappings
823  */
824 #define pde_store(ptep, pa) do {                                \
825         *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
826 } while (0)
827
828 #define pte_clear(ptep) do {                                    \
829                 *(u_long *)(ptep) = (u_long)(0);                \
830 } while (0)
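
/*
 * Note on the htobe64()/be64toh() wrapping used above and throughout this
 * file: radix page-table entries are big-endian in memory regardless of
 * the host byte order, so every PTE read goes through be64toh() and every
 * store through htobe64(), e.g.
 *
 *	pa = be64toh(*l3e) & NLB_MASK;
 *
 * pte_clear() needs no conversion because zero is endian-invariant.
 */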
831
832 #define PMAP_PDE_SUPERPAGE      (1 << 8)        /* supports 2MB superpages */
833
834 /*
835  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
836  * (PTE) page mappings have identical settings for the following fields:
837  */
838 #define PG_PTE_PROMOTE  (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
839             PG_M | PG_A | RPTE_EAA_MASK | PG_V)
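
/*
 * Sketch of the promotion test this mask enables (the actual promotion
 * code appears later in the file): a 2MB mapping may replace its 512
 * constituent 4KB PTEs only if they map consecutive physical pages and
 * agree in every PG_PTE_PROMOTE field, roughly:
 *
 *	uint64_t first = be64toh(*firstpte), cur;
 *	for (int i = 1; i < NPTEPG; i++) {
 *		cur = be64toh(firstpte[i]);
 *		if ((cur & PG_FRAME) != ((first & PG_FRAME) + i * PAGE_SIZE) ||
 *		    (cur & PG_PTE_PROMOTE) != (first & PG_PTE_PROMOTE))
 *			return;		/* cannot promote */
 *	}
 */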
840
841 static __inline void
842 pmap_resident_count_inc(pmap_t pmap, int count)
843 {
844
845         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
846         pmap->pm_stats.resident_count += count;
847 }
848
849 static __inline void
850 pmap_resident_count_dec(pmap_t pmap, int count)
851 {
852
853         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
854         KASSERT(pmap->pm_stats.resident_count >= count,
855             ("pmap %p resident count underflow %ld %d", pmap,
856             pmap->pm_stats.resident_count, count));
857         pmap->pm_stats.resident_count -= count;
858 }
859
860 static void
861 pagezero(vm_offset_t va)
862 {
863         va = trunc_page(va);
864
865         bzero((void *)va, PAGE_SIZE);
866 }
867
868 static uint64_t
869 allocpages(int n)
870 {
871         u_int64_t ret;
872
873         ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
874         for (int i = 0; i < n; i++)
875                 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
876         return (ret);
877 }
878
879 static pt_entry_t *
880 kvtopte(vm_offset_t va)
881 {
882         pt_entry_t *l3e;
883
884         l3e = pmap_pml3e(kernel_pmap, va);
885         if ((be64toh(*l3e) & RPTE_VALID) == 0)
886                 return (NULL);
887         return (pmap_l3e_to_pte(l3e, va));
888 }
889
890 void
891 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
892 {
893         pt_entry_t *pte;
894
895         pte = kvtopte(va);
896         MPASS(pte != NULL);
897         *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
898             RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
899 }
900
901 bool
902 mmu_radix_ps_enabled(pmap_t pmap)
903 {
904         return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
905 }
906
907 static pt_entry_t *
908 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
909 {
910         pml3_entry_t *l3e;
911         pt_entry_t *pte;
912
913         va &= PG_PS_FRAME;
914         l3e = pmap_pml3e(pmap, va);
915         if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
916                 return (NULL);
917
918         if (be64toh(*l3e) & RPTE_LEAF) {
919                 *is_l3e = 1;
920                 return (l3e);
921         }
922         *is_l3e = 0;
923         va &= PG_FRAME;
924         pte = pmap_l3e_to_pte(l3e, va);
925         if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
926                 return (NULL);
927         return (pte);
928 }
929
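/*
 * Fast path used on a fault against an already-valid mapping: if the PTE
 * (or leaf L3E) permits the requested access, set the referenced (PG_A)
 * and, for writes, the changed (PG_M) bit with a compare-and-swap and
 * return 0 so the access can simply be retried; otherwise return a
 * KERN_* error so the caller can fall back to the full fault path.
 */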
930 int
931 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
932 {
933         pt_entry_t *pte;
934         pt_entry_t startpte, origpte, newpte;
935         vm_page_t m;
936         int is_l3e;
937
938         startpte = 0;
939  retry:
940         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
941                 return (KERN_INVALID_ADDRESS);
942         origpte = newpte = be64toh(*pte);
943         if (startpte == 0) {
944                 startpte = origpte;
945                 if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
946                     ((flags & VM_PROT_READ) && (startpte & PG_A))) {
947                         pmap_invalidate_all(pmap);
948 #ifdef INVARIANTS
949                         if (VERBOSE_PMAP || pmap_logging)
950                                 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
951                                     __func__, pmap, va, flags, origpte);
952 #endif
953                         return (KERN_FAILURE);
954                 }
955         }
956 #ifdef INVARIANTS
957         if (VERBOSE_PMAP || pmap_logging)
958                 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
959                     flags, origpte);
960 #endif
961         PMAP_LOCK(pmap);
962         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
963             be64toh(*pte) != origpte) {
964                 PMAP_UNLOCK(pmap);
965                 return (KERN_FAILURE);
966         }
967         m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
968         MPASS(m != NULL);
969         switch (flags) {
970         case VM_PROT_READ:
971                 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
972                         goto protfail;
973                 newpte |= PG_A;
974                 vm_page_aflag_set(m, PGA_REFERENCED);
975                 break;
976         case VM_PROT_WRITE:
977                 if ((newpte & RPTE_EAA_W) == 0)
978                         goto protfail;
979                 if (is_l3e)
980                         goto protfail;
981                 newpte |= PG_M;
982                 vm_page_dirty(m);
983                 break;
984         case VM_PROT_EXECUTE:
985                 if ((newpte & RPTE_EAA_X) == 0)
986                         goto protfail;
987                 newpte |= PG_A;
988                 vm_page_aflag_set(m, PGA_REFERENCED);
989                 break;
990         }
991
992         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
993                 goto retry;
994         ptesync();
995         PMAP_UNLOCK(pmap);
996         if (startpte == newpte)
997                 return (KERN_FAILURE);
998         return (0);
999  protfail:
1000         PMAP_UNLOCK(pmap);
1001         return (KERN_PROTECTION_FAILURE);
1002 }
1003
1004 /*
1005  * Returns TRUE if the given page is mapped individually or as part of
1006  * a 2mpage.  Otherwise, returns FALSE.
1007  */
1008 boolean_t
1009 mmu_radix_page_is_mapped(vm_page_t m)
1010 {
1011         struct rwlock *lock;
1012         boolean_t rv;
1013
1014         if ((m->oflags & VPO_UNMANAGED) != 0)
1015                 return (FALSE);
1016         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
1017         rw_rlock(lock);
1018         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
1019             ((m->flags & PG_FICTITIOUS) == 0 &&
1020             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
1021         rw_runlock(lock);
1022         return (rv);
1023 }
1024
1025 /*
1026  * Determine the appropriate bits to set in a PTE or PDE for a specified
1027  * caching mode.
1028  */
1029 static int
1030 pmap_cache_bits(vm_memattr_t ma)
1031 {
1032         if (ma != VM_MEMATTR_DEFAULT) {
1033                 switch (ma) {
1034                 case VM_MEMATTR_UNCACHEABLE:
1035                         return (RPTE_ATTR_GUARDEDIO);
1036                 case VM_MEMATTR_CACHEABLE:
1037                         return (RPTE_ATTR_MEM);
1038                 case VM_MEMATTR_WRITE_BACK:
1039                 case VM_MEMATTR_PREFETCHABLE:
1040                 case VM_MEMATTR_WRITE_COMBINING:
1041                         return (RPTE_ATTR_UNGUARDEDIO);
1042                 }
1043         }
1044         return (0);
1045 }
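
/*
 * For example, a device BAR mapped with VM_MEMATTR_UNCACHEABLE is given
 * RPTE_ATTR_GUARDEDIO (presumably cache-inhibited, guarded I/O), while
 * VM_MEMATTR_DEFAULT returns 0 and leaves the PTE with its default,
 * cacheable memory attributes.
 */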
1046
1047 static void
1048 pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
1049 {
1050         ptesync();
1051         if (pmap == kernel_pmap)
1052                 radix_tlbie_invlpg_kernel_4k(start);
1053         else
1054                 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1055         ttusync();
1056 }
1057
1058 static void
1059 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
1060 {
1061         ptesync();
1062         if (pmap == kernel_pmap)
1063                 radix_tlbie_invlpg_kernel_2m(start);
1064         else
1065                 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
1066         ttusync();
1067 }
1068
1069 static void
1070 pmap_invalidate_pwc(pmap_t pmap)
1071 {
1072         ptesync();
1073         if (pmap == kernel_pmap)
1074                 radix_tlbie_invlpwc_kernel();
1075         else
1076                 radix_tlbie_invlpwc_user(pmap->pm_pid);
1077         ttusync();
1078 }
1079
1080 static void
1081 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1082 {
1083         if (((end - start) >> PAGE_SHIFT) > 8) {
1084                 pmap_invalidate_all(pmap);
1085                 return;
1086         }
1087         ptesync();
1088         if (pmap == kernel_pmap) {
1089                 while (start < end) {
1090                         radix_tlbie_invlpg_kernel_4k(start);
1091                         start += PAGE_SIZE;
1092                 }
1093         } else {
1094                 while (start < end) {
1095                         radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1096                         start += PAGE_SIZE;
1097                 }
1098         }
1099         ttusync();
1100 }
1101
1102 static void
1103 pmap_invalidate_all(pmap_t pmap)
1104 {
1105         ptesync();
1106         if (pmap == kernel_pmap)
1107                 radix_tlbie_flush_kernel();
1108         else
1109                 radix_tlbie_flush_user(pmap->pm_pid);
1110         ttusync();
1111 }
1112
1113 static void
1114 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
1115 {
1116
1117         /*
1118          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1119          * by a promotion that did not invalidate the 512 4KB page mappings
1120          * that might exist in the TLB.  Consequently, at this point, the TLB
1121          * may hold both 4KB and 2MB page mappings for the address range [va,
1122          * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated here.
1123          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1124          * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
1125          * single tlbie suffices to invalidate the 2MB page mapping from the
1126          * TLB.
1127          */
1128         ptesync();
1129         if ((l3e & PG_PROMOTED) != 0)
1130                 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
1131         else
1132                 pmap_invalidate_page_2m(pmap, va);
1133
1134         pmap_invalidate_pwc(pmap);
1135 }
1136
1137 static __inline struct pv_chunk *
1138 pv_to_chunk(pv_entry_t pv)
1139 {
1140
1141         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1142 }
1143
1144 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1145
1146 #define PC_FREE0        0xfffffffffffffffful
1147 #define PC_FREE1        0x3ffffffffffffffful
1148
1149 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
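
/*
 * PC_FREE0 and PC_FREE1 have 64 + 62 = 126 bits set between them, so each
 * page-sized pv_chunk holds 126 pv entries (presumably _NPCPV == 126).  A
 * completely free chunk is therefore recognized by
 *
 *	pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1
 *
 * exactly as free_pv_entry() and reclaim_pv_chunk() test below.
 */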
1150
1151 /*
1152  * Ensure that the number of spare PV entries in the specified pmap meets or
1153  * exceeds the given count, "needed".
1154  *
1155  * The given PV list lock may be released.
1156  */
1157 static void
1158 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1159 {
1160         struct pch new_tail;
1161         struct pv_chunk *pc;
1162         vm_page_t m;
1163         int avail, free;
1164         bool reclaimed;
1165
1166         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1167         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1168
1169         /*
1170          * Newly allocated PV chunks must be stored in a private list until
1171          * the required number of PV chunks have been allocated.  Otherwise,
1172          * reclaim_pv_chunk() could recycle one of these chunks.  In
1173          * contrast, these chunks must be added to the pmap upon allocation.
1174          */
1175         TAILQ_INIT(&new_tail);
1176 retry:
1177         avail = 0;
1178         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1179                 //              if ((cpu_feature2 & CPUID2_POPCNT) == 0)
1180                 bit_count((bitstr_t *)pc->pc_map, 0,
1181                                   sizeof(pc->pc_map) * NBBY, &free);
1182 #if 0
1183                 free = popcnt_pc_map_pq(pc->pc_map);
1184 #endif
1185                 if (free == 0)
1186                         break;
1187                 avail += free;
1188                 if (avail >= needed)
1189                         break;
1190         }
1191         for (reclaimed = false; avail < needed; avail += _NPCPV) {
1192                 m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1193                     VM_ALLOC_WIRED);
1194                 if (m == NULL) {
1195                         m = reclaim_pv_chunk(pmap, lockp);
1196                         if (m == NULL)
1197                                 goto retry;
1198                         reclaimed = true;
1199                 }
1200                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1201                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1202                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1203                 pc->pc_pmap = pmap;
1204                 pc->pc_map[0] = PC_FREE0;
1205                 pc->pc_map[1] = PC_FREE1;
1206                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1207                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1208                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
1209
1210                 /*
1211                  * The reclaim might have freed a chunk from the current pmap.
1212                  * If that chunk contained available entries, we need to
1213                  * re-count the number of available entries.
1214                  */
1215                 if (reclaimed)
1216                         goto retry;
1217         }
1218         if (!TAILQ_EMPTY(&new_tail)) {
1219                 mtx_lock(&pv_chunks_mutex);
1220                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1221                 mtx_unlock(&pv_chunks_mutex);
1222         }
1223 }
1224
1225 /*
1226  * First find and then remove the pv entry for the specified pmap and virtual
1227  * address from the specified pv list.  Returns the pv entry if found and NULL
1228  * otherwise.  This operation can be performed on pv lists for either 4KB or
1229  * 2MB page mappings.
1230  */
1231 static __inline pv_entry_t
1232 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1233 {
1234         pv_entry_t pv;
1235
1236         TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
1237 #ifdef INVARIANTS
1238                 if (PV_PMAP(pv) == NULL) {
1239                         printf("corrupted pv_chunk/pv %p\n", pv);
1240                         printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
1241                 }
1242                 MPASS(PV_PMAP(pv) != NULL);
1243                 MPASS(pv->pv_va != 0);
1244 #endif
1245                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1246                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
1247                         pvh->pv_gen++;
1248                         break;
1249                 }
1250         }
1251         return (pv);
1252 }
1253
1254 /*
1255  * After demotion from a 2MB page mapping to 512 4KB page mappings,
1256  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1257  * entries for each of the 4KB page mappings.
1258  */
1259 static void
1260 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1261     struct rwlock **lockp)
1262 {
1263         struct md_page *pvh;
1264         struct pv_chunk *pc;
1265         pv_entry_t pv;
1266         vm_offset_t va_last;
1267         vm_page_t m;
1268         int bit, field;
1269
1270         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1271         KASSERT((pa & L3_PAGE_MASK) == 0,
1272             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
1273         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1274
1275         /*
1276          * Transfer the 2mpage's pv entry for this mapping to the first
1277          * page's pv list.  Once this transfer begins, the pv list lock
1278          * must not be released until the last pv entry is reinstantiated.
1279          */
1280         pvh = pa_to_pvh(pa);
1281         va = trunc_2mpage(va);
1282         pv = pmap_pvh_remove(pvh, pmap, va);
1283         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
1284         m = PHYS_TO_VM_PAGE(pa);
1285         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1286
1287         m->md.pv_gen++;
1288         /* Instantiate the remaining NPTEPG - 1 pv entries. */
1289         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
1290         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1291         for (;;) {
1292                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1293                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
1294                     , ("pmap_pv_demote_pde: missing spare"));
1295                 for (field = 0; field < _NPCM; field++) {
1296                         while (pc->pc_map[field]) {
1297                                 bit = cnttzd(pc->pc_map[field]);
1298                                 pc->pc_map[field] &= ~(1ul << bit);
1299                                 pv = &pc->pc_pventry[field * 64 + bit];
1300                                 va += PAGE_SIZE;
1301                                 pv->pv_va = va;
1302                                 m++;
1303                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1304                             ("pmap_pv_demote_pde: page %p is not managed", m));
1305                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1306
1307                                 m->md.pv_gen++;
1308                                 if (va == va_last)
1309                                         goto out;
1310                         }
1311                 }
1312                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1313                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1314         }
1315 out:
1316         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1317                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1318                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1319         }
1320         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
1321         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
1322 }
1323
1324 static void
1325 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
1326 {
1327
1328         if (pmap == NULL)
1329                 return;
1330         pmap_invalidate_all(pmap);
1331         if (pmap != locked_pmap)
1332                 PMAP_UNLOCK(pmap);
1333 }
1334
1335 /*
1336  * We are in a serious low memory condition.  Resort to
1337  * drastic measures to free some pages so we can allocate
1338  * another pv entry chunk.
1339  *
1340  * Returns NULL if PV entries were reclaimed from the specified pmap.
1341  *
1342  * We do not, however, unmap 2mpages because subsequent accesses will
1343  * allocate per-page pv entries until repromotion occurs, thereby
1344  * exacerbating the shortage of free pv entries.
1345  */
1346 static int active_reclaims = 0;
1347 static vm_page_t
1348 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1349 {
1350         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1351         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1352         struct md_page *pvh;
1353         pml3_entry_t *l3e;
1354         pmap_t next_pmap, pmap;
1355         pt_entry_t *pte, tpte;
1356         pv_entry_t pv;
1357         vm_offset_t va;
1358         vm_page_t m, m_pc;
1359         struct spglist free;
1360         uint64_t inuse;
1361         int bit, field, freed;
1362
1363         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1364         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1365         pmap = NULL;
1366         m_pc = NULL;
1367         SLIST_INIT(&free);
1368         bzero(&pc_marker_b, sizeof(pc_marker_b));
1369         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1370         pc_marker = (struct pv_chunk *)&pc_marker_b;
1371         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1372
1373         mtx_lock(&pv_chunks_mutex);
1374         active_reclaims++;
1375         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1376         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1377         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1378             SLIST_EMPTY(&free)) {
1379                 next_pmap = pc->pc_pmap;
1380                 if (next_pmap == NULL) {
1381                         /*
1382                          * The next chunk is a marker.  However, it is
1383                          * not our marker, so active_reclaims must be
1384                          * > 1.  Consequently, the next_chunk code
1385                          * will not rotate the pv_chunks list.
1386                          */
1387                         goto next_chunk;
1388                 }
1389                 mtx_unlock(&pv_chunks_mutex);
1390
1391                 /*
1392                  * A pv_chunk can only be removed from the pc_lru list
1393          * when both pv_chunks_mutex is owned and the
1394                  * corresponding pmap is locked.
1395                  */
1396                 if (pmap != next_pmap) {
1397                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1398                         pmap = next_pmap;
1399                         /* Avoid deadlock and lock recursion. */
1400                         if (pmap > locked_pmap) {
1401                                 RELEASE_PV_LIST_LOCK(lockp);
1402                                 PMAP_LOCK(pmap);
1403                                 mtx_lock(&pv_chunks_mutex);
1404                                 continue;
1405                         } else if (pmap != locked_pmap) {
1406                                 if (PMAP_TRYLOCK(pmap)) {
1407                                         mtx_lock(&pv_chunks_mutex);
1408                                         continue;
1409                                 } else {
1410                                         pmap = NULL; /* pmap is not locked */
1411                                         mtx_lock(&pv_chunks_mutex);
1412                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
1413                                         if (pc == NULL ||
1414                                             pc->pc_pmap != next_pmap)
1415                                                 continue;
1416                                         goto next_chunk;
1417                                 }
1418                         }
1419                 }
1420
1421                 /*
1422                  * Destroy every non-wired, 4 KB page mapping in the chunk.
1423                  */
1424                 freed = 0;
1425                 for (field = 0; field < _NPCM; field++) {
1426                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1427                             inuse != 0; inuse &= ~(1UL << bit)) {
1428                                 bit = cnttzd(inuse);
1429                                 pv = &pc->pc_pventry[field * 64 + bit];
1430                                 va = pv->pv_va;
1431                                 l3e = pmap_pml3e(pmap, va);
1432                                 if ((be64toh(*l3e) & RPTE_LEAF) != 0)
1433                                         continue;
1434                                 pte = pmap_l3e_to_pte(l3e, va);
1435                                 if ((be64toh(*pte) & PG_W) != 0)
1436                                         continue;
1437                                 tpte = be64toh(pte_load_clear(pte));
1438                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
1439                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1440                                         vm_page_dirty(m);
1441                                 if ((tpte & PG_A) != 0)
1442                                         vm_page_aflag_set(m, PGA_REFERENCED);
1443                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1444                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
1445
1446                                 m->md.pv_gen++;
1447                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
1448                                     (m->flags & PG_FICTITIOUS) == 0) {
1449                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1450                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
1451                                                 vm_page_aflag_clear(m,
1452                                                     PGA_WRITEABLE);
1453                                         }
1454                                 }
1455                                 pc->pc_map[field] |= 1UL << bit;
1456                                 pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
1457                                 freed++;
1458                         }
1459                 }
1460                 if (freed == 0) {
1461                         mtx_lock(&pv_chunks_mutex);
1462                         goto next_chunk;
1463                 }
1464                 /* Every freed mapping is for a 4 KB page. */
1465                 pmap_resident_count_dec(pmap, freed);
1466                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1467                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1468                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1469                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1470                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
1471                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1472                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1473                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1474                         /* Entire chunk is free; return it. */
1475                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1476                         mtx_lock(&pv_chunks_mutex);
1477                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1478                         break;
1479                 }
1480                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1481                 mtx_lock(&pv_chunks_mutex);
1482                 /* One freed pv entry in locked_pmap is sufficient. */
1483                 if (pmap == locked_pmap)
1484                         break;
1485 next_chunk:
1486                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1487                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1488                 if (active_reclaims == 1 && pmap != NULL) {
1489                         /*
1490                          * Rotate the pv chunks list so that we do not
1491                          * scan the same pv chunks that could not be
1492                          * freed (because they contained a wired
1493                          * and/or superpage mapping) on every
1494                          * invocation of reclaim_pv_chunk().
1495                          */
1496                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1497                                 MPASS(pc->pc_pmap != NULL);
1498                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1499                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1500                         }
1501                 }
1502         }
1503         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1504         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1505         active_reclaims--;
1506         mtx_unlock(&pv_chunks_mutex);
1507         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1508         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1509                 m_pc = SLIST_FIRST(&free);
1510                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1511                 /* Recycle a freed page table page. */
1512                 m_pc->ref_count = 1;
1513         }
1514         vm_page_free_pages_toq(&free, true);
1515         return (m_pc);
1516 }
1517
1518 /*
1519  * Free the pv_entry back to its containing chunk's free list.
1520  */
1521 static void
1522 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1523 {
1524         struct pv_chunk *pc;
1525         int idx, field, bit;
1526
1527 #ifdef VERBOSE_PV
1528         if (pmap != kernel_pmap)
1529                 printf("%s(%p, %p)\n", __func__, pmap, pv);
1530 #endif
1531         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1532         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1533         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1534         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1535         pc = pv_to_chunk(pv);
1536         idx = pv - &pc->pc_pventry[0];
1537         field = idx / 64;
1538         bit = idx % 64;
1539         pc->pc_map[field] |= 1ul << bit;
1540         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
1541                 /* 98% of the time, pc is already at the head of the list. */
1542                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1543                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1544                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1545                 }
1546                 return;
1547         }
1548         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1549         free_pv_chunk(pc);
1550 }
1551
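/*
 * Release a pv chunk that no longer contains any allocated pv entries:
 * unlink it from the global pv chunk list, update the pv statistics, and
 * return its backing page to the VM system.
 */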
1552 static void
1553 free_pv_chunk(struct pv_chunk *pc)
1554 {
1555         vm_page_t m;
1556
1557         mtx_lock(&pv_chunks_mutex);
1558         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1559         mtx_unlock(&pv_chunks_mutex);
1560         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1561         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1562         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1563         /* Entire chunk is free; return it. */
1564         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1565         vm_page_unwire_noq(m);
1566         vm_page_free(m);
1567 }
1568
1569 /*
1570  * Returns a new PV entry, allocating a new PV chunk from the system when
1571  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1572  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1573  * returned.
1574  *
1575  * The given PV list lock may be released.
1576  */
1577 static pv_entry_t
1578 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1579 {
1580         int bit, field;
1581         pv_entry_t pv;
1582         struct pv_chunk *pc;
1583         vm_page_t m;
1584
1585         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1586         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1587 retry:
1588         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1589         if (pc != NULL) {
1590                 for (field = 0; field < _NPCM; field++) {
1591                         if (pc->pc_map[field]) {
1592                                 bit = cnttzd(pc->pc_map[field]);
1593                                 break;
1594                         }
1595                 }
1596                 if (field < _NPCM) {
1597                         pv = &pc->pc_pventry[field * 64 + bit];
1598                         pc->pc_map[field] &= ~(1ul << bit);
1599                         /* If this was the last item, move it to tail */
1600                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1601                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1602                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1603                                     pc_list);
1604                         }
1605                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1606                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1607                         MPASS(PV_PMAP(pv) != NULL);
1608                         return (pv);
1609                 }
1610         }
1611         /* No free items, allocate another chunk */
1612         m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1613             VM_ALLOC_WIRED);
1614         if (m == NULL) {
1615                 if (lockp == NULL) {
1616                         PV_STAT(pc_chunk_tryfail++);
1617                         return (NULL);
1618                 }
1619                 m = reclaim_pv_chunk(pmap, lockp);
1620                 if (m == NULL)
1621                         goto retry;
1622         }
1623         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1624         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1625         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1626         pc->pc_pmap = pmap;
1627         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
1628         pc->pc_map[1] = PC_FREE1;
1629         mtx_lock(&pv_chunks_mutex);
1630         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1631         mtx_unlock(&pv_chunks_mutex);
1632         pv = &pc->pc_pventry[0];
1633         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1634         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1635         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1636         MPASS(PV_PMAP(pv) != NULL);
1637         return (pv);
1638 }
1639
1640 #if VM_NRESERVLEVEL > 0
1641 /*
1642  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
1643  * replace the many pv entries for the 4KB page mappings by a single pv entry
1644  * for the 2MB page mapping.
1645  */
1646 static void
1647 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1648     struct rwlock **lockp)
1649 {
1650         struct md_page *pvh;
1651         pv_entry_t pv;
1652         vm_offset_t va_last;
1653         vm_page_t m;
1654
1655         KASSERT((pa & L3_PAGE_MASK) == 0,
1656             ("pmap_pv_promote_l3e: pa is not 2mpage aligned"));
1657         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1658
1659         /*
1660          * Transfer the first page's pv entry for this mapping to the 2mpage's
1661          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
1662          * a transfer avoids the possibility that get_pv_entry() calls
1663          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
1664          * mappings that is being promoted.
1665          */
1666         m = PHYS_TO_VM_PAGE(pa);
1667         va = trunc_2mpage(va);
1668         pv = pmap_pvh_remove(&m->md, pmap, va);
1669         KASSERT(pv != NULL, ("pmap_pv_promote_l3e: pv not found"));
1670         pvh = pa_to_pvh(pa);
1671         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
1672         pvh->pv_gen++;
1673         /* Free the remaining NPTEPG - 1 pv entries. */
1674         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1675         do {
1676                 m++;
1677                 va += PAGE_SIZE;
1678                 pmap_pvh_free(&m->md, pmap, va);
1679         } while (va < va_last);
1680 }
1681 #endif /* VM_NRESERVLEVEL > 0 */
1682
1683 /*
1684  * First find and then destroy the pv entry for the specified pmap and virtual
1685  * address.  This operation can be performed on pv lists for either 4KB or 2MB
1686  * page mappings.
1687  */
1688 static void
1689 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1690 {
1691         pv_entry_t pv;
1692
1693         pv = pmap_pvh_remove(pvh, pmap, va);
1694         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1695         free_pv_entry(pmap, pv);
1696 }
1697
1698 /*
1699  * Conditionally create the PV entry for a 4KB page mapping if the required
1700  * memory can be allocated without resorting to reclamation.
1701  */
1702 static boolean_t
1703 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1704     struct rwlock **lockp)
1705 {
1706         pv_entry_t pv;
1707
1708         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1709         /* Pass NULL instead of the lock pointer to disable reclamation. */
1710         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1711                 pv->pv_va = va;
1712                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1713                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1714                 m->md.pv_gen++;
1715                 return (TRUE);
1716         } else
1717                 return (FALSE);
1718 }
1719
1720 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
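/*
 * Assert that [addr, addr + size) lies entirely within one of the
 * phys_avail ranges recorded during early bootstrap; a no-op unless
 * INVARIANTS is enabled.
 */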
1721 #ifdef INVARIANTS
1722 static void
1723 validate_addr(vm_paddr_t addr, vm_size_t size)
1724 {
1725         vm_paddr_t end = addr + size;
1726         bool found = false;
1727
1728         for (int i = 0; i < 2 * phys_avail_count; i += 2) {
1729                 if (addr >= phys_avail_debug[i] &&
1730                         end <= phys_avail_debug[i + 1]) {
1731                         found = true;
1732                         break;
1733                 }
1734         }
1735         KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
1736                                         addr, end));
1737 }
1738 #else
1739 static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
1740 #endif
1741 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
1742
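/*
 * Allocate and zero a single page for use as a page table page while the
 * initial kernel mappings are being constructed.
 */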
1743 static vm_paddr_t
1744 alloc_pt_page(void)
1745 {
1746         vm_paddr_t page;
1747
1748         page = allocpages(1);
1749         pagezero(PHYS_TO_DMAP(page));
1750         return (page);
1751 }
1752
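/*
 * Map the physical range [start, end) into the direct map, using 1GB (L2)
 * and 2MB (L3) leaf entries whenever the current address is suitably
 * aligned and enough of the range remains, and 4KB PTEs otherwise.
 * Missing intermediate page table pages are allocated on demand.
 */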
1753 static void
1754 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
1755 {
1756         pt_entry_t *pte, pteval;
1757         vm_paddr_t page;
1758
1759         if (bootverbose)
1760                 printf("%s %lx -> %lx\n", __func__, start, end);
1761         while (start < end) {
1762                 pteval = start | DMAP_PAGE_BITS;
1763                 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
1764                 if ((be64toh(*pte) & RPTE_VALID) == 0) {
1765                         page = alloc_pt_page();
1766                         pde_store(pte, page);
1767                 }
1768                 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
1769                 if ((start & L2_PAGE_MASK) == 0 &&
1770                         end - start >= L2_PAGE_SIZE) {
1771                         start += L2_PAGE_SIZE;
1772                         goto done;
1773                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1774                         page = alloc_pt_page();
1775                         pde_store(pte, page);
1776                 }
1777
1778                 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
1779                 if ((start & L3_PAGE_MASK) == 0 &&
1780                         end - start >= L3_PAGE_SIZE) {
1781                         start += L3_PAGE_SIZE;
1782                         goto done;
1783                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1784                         page = alloc_pt_page();
1785                         pde_store(pte, page);
1786                 }
1787                 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
1788                 start += PAGE_SIZE;
1789         done:
1790                 pte_store(pte, pteval);
1791         }
1792 }
1793
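/*
 * Direct-map each physical memory region reported by the platform,
 * clamping the total to the hw.physmem limit (hwphyssz) when one is set.
 */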
1794 static void
1795 mmu_radix_dmap_populate(vm_size_t hwphyssz)
1796 {
1797         vm_paddr_t start, end;
1798
1799         for (int i = 0; i < pregions_sz; i++) {
1800                 start = pregions[i].mr_start;
1801                 end = start + pregions[i].mr_size;
1802                 if (hwphyssz && start >= hwphyssz)
1803                         break;
1804                 if (hwphyssz && hwphyssz < end)
1805                         end = hwphyssz;
1806                 mmu_radix_dmap_range(start, end);
1807         }
1808 }
1809
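/*
 * Build the kernel's radix page tables: allocate and zero the top-level
 * page directory, populate the direct map, and preallocate nkpt page
 * table pages covering the initial kernel virtual address range.  The
 * preallocated pages are registered as a vm_phys segment so that vm_page
 * structures exist for them.
 */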
1810 static void
1811 mmu_radix_setup_pagetables(vm_size_t hwphyssz)
1812 {
1813         vm_paddr_t ptpages, pages;
1814         pt_entry_t *pte;
1815         vm_paddr_t l1phys;
1816
1817         bzero(kernel_pmap, sizeof(struct pmap));
1818         PMAP_LOCK_INIT(kernel_pmap);
1819
1820         ptpages = allocpages(3);
1821         l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
1822         validate_addr(l1phys, RADIX_PGD_SIZE);
1823         if (bootverbose)
1824                 printf("l1phys=%lx\n", l1phys);
1825         MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
1826         for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
1827                 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
1828         kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
1829
1830         mmu_radix_dmap_populate(hwphyssz);
1831
1832         /*
1833          * Create page tables for first 128MB of KVA
1834          */
1835         pages = ptpages;
1836         pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
1837         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1838         pages += PAGE_SIZE;
1839         pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
1840         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1841         pages += PAGE_SIZE;
1842         pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
1843         /*
1844          * The kernel page table pages need to be preserved in
1845          * phys_avail and must not overlap with previous allocations.
1846          */
1847         pages = allocpages(nkpt);
1848         if (bootverbose) {
1849                 printf("phys_avail after dmap populate and nkpt allocation\n");
1850                 for (int j = 0; j < 2 * phys_avail_count; j+=2)
1851                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1852                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
1853         }
1854         KPTphys = pages;
1855         for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
1856                 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1857         kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
1858         if (bootverbose)
1859                 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
1860         /*
1861          * Add a physical memory segment (vm_phys_seg) corresponding to the
1862          * preallocated kernel page table pages so that vm_page structures
1863          * representing these pages will be created.  The vm_page structures
1864          * are required for promotion of the corresponding kernel virtual
1865          * addresses to superpage mappings.
1866          */
1867         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1868 }
1869
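/*
 * First-stage bootstrap: obtain the physical memory map from the
 * platform, build phys_avail/dump_avail while excluding the exception
 * vectors and the kernel image, allocate the partition and process
 * tables, and construct the kernel page tables.
 */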
1870 static void
1871 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1872 {
1873         vm_paddr_t      kpstart, kpend;
1874         vm_size_t       physsz, hwphyssz;
1875         //uint64_t      l2virt;
1876         int             rm_pavail, proctab_size;
1877         int             i, j;
1878
1879         kpstart = start & ~DMAP_BASE_ADDRESS;
1880         kpend = end & ~DMAP_BASE_ADDRESS;
1881
1882         /* Get physical memory regions from firmware */
1883         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1884         CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1885
1886         if (2 * VM_PHYSSEG_MAX < regions_sz)
1887                 panic("mmu_radix_early_bootstrap: phys_avail too small");
1888
1889         if (bootverbose)
1890                 for (int i = 0; i < regions_sz; i++)
1891                         printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1892                             i, regions[i].mr_start, i, regions[i].mr_size);
1893         /*
1894          * XXX workaround a simulator bug
1895          */
1896         for (int i = 0; i < regions_sz; i++)
1897                 if (regions[i].mr_start & PAGE_MASK) {
1898                         regions[i].mr_start += PAGE_MASK;
1899                         regions[i].mr_start &= ~PAGE_MASK;
1900                         regions[i].mr_size &= ~PAGE_MASK;
1901                 }
1902         if (bootverbose)
1903                 for (int i = 0; i < pregions_sz; i++)
1904                         printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1905                             i, pregions[i].mr_start, i, pregions[i].mr_size);
1906
1907         phys_avail_count = 0;
1908         physsz = 0;
1909         hwphyssz = 0;
1910         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
1911         for (i = 0, j = 0; i < regions_sz; i++) {
1912                 if (bootverbose)
1913                         printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1914                             i, regions[i].mr_start, i, regions[i].mr_size);
1915
1916                 if (regions[i].mr_size < PAGE_SIZE)
1917                         continue;
1918
1919                 if (hwphyssz != 0 &&
1920                     (physsz + regions[i].mr_size) >= hwphyssz) {
1921                         if (physsz < hwphyssz) {
1922                                 phys_avail[j] = regions[i].mr_start;
1923                                 phys_avail[j + 1] = regions[i].mr_start +
1924                                     (hwphyssz - physsz);
1925                                 physsz = hwphyssz;
1926                                 phys_avail_count++;
1927                                 dump_avail[j] = phys_avail[j];
1928                                 dump_avail[j + 1] = phys_avail[j + 1];
1929                         }
1930                         break;
1931                 }
1932                 phys_avail[j] = regions[i].mr_start;
1933                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1934                 dump_avail[j] = phys_avail[j];
1935                 dump_avail[j + 1] = phys_avail[j + 1];
1936
1937                 phys_avail_count++;
1938                 physsz += regions[i].mr_size;
1939                 j += 2;
1940         }
1941
1942         /* Check for overlap with the kernel and exception vectors */
1943         rm_pavail = 0;
1944         for (j = 0; j < 2 * phys_avail_count; j+=2) {
1945                 if (phys_avail[j] < EXC_LAST)
1946                         phys_avail[j] += EXC_LAST;
1947
1948                 if (phys_avail[j] >= kpstart &&
1949                     phys_avail[j + 1] <= kpend) {
1950                         phys_avail[j] = phys_avail[j + 1] = ~0;
1951                         rm_pavail++;
1952                         continue;
1953                 }
1954
1955                 if (kpstart >= phys_avail[j] &&
1956                     kpstart < phys_avail[j + 1]) {
1957                         if (kpend < phys_avail[j + 1]) {
1958                                 phys_avail[2 * phys_avail_count] =
1959                                     (kpend & ~PAGE_MASK) + PAGE_SIZE;
1960                                 phys_avail[2 * phys_avail_count + 1] =
1961                                     phys_avail[j + 1];
1962                                 phys_avail_count++;
1963                         }
1964
1965                         phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1966                 }
1967
1968                 if (kpend >= phys_avail[j] &&
1969                     kpend < phys_avail[j + 1]) {
1970                         if (kpstart > phys_avail[j]) {
1971                                 phys_avail[2 * phys_avail_count] = phys_avail[j];
1972                                 phys_avail[2 * phys_avail_count + 1] =
1973                                     kpstart & ~PAGE_MASK;
1974                                 phys_avail_count++;
1975                         }
1976
1977                         phys_avail[j] = (kpend & ~PAGE_MASK) +
1978                             PAGE_SIZE;
1979                 }
1980         }
1981         qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
1982         for (i = 0; i < 2 * phys_avail_count; i++)
1983                 phys_avail_debug[i] = phys_avail[i];
1984
1985         /* Remove physical available regions marked for removal (~0) */
1986         if (rm_pavail) {
1987                 phys_avail_count -= rm_pavail;
1988                 for (i = 2 * phys_avail_count;
1989                      i < 2*(phys_avail_count + rm_pavail); i+=2)
1990                         phys_avail[i] = phys_avail[i + 1] = 0;
1991         }
1992         if (bootverbose) {
1993                 printf("phys_avail ranges after filtering:\n");
1994                 for (j = 0; j < 2 * phys_avail_count; j+=2)
1995                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1996                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
1997         }
1998         physmem = btoc(physsz);
1999
2000         /*
2001          * XXX: assume we're running non-virtualized and don't support bhyve.
2002          */
2003         if (isa3_pid_bits == 0)
2004                 isa3_pid_bits = 20;
2005         parttab_phys = moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
2006         validate_addr(parttab_phys, PARTTAB_SIZE);
2007         for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
2008                 pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
2009
2010         proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
2011         proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
2012         validate_addr(proctab0pa, proctab_size);
2013         for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
2014                 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
2015
2016         mmu_radix_setup_pagetables(hwphyssz);
2017 }
2018
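/*
 * Second-stage bootstrap: compute Maxmem, establish the kernel virtual
 * address range, remap early I/O mappings, and allocate thread0's kernel
 * stack, the message buffer, and the dynamic per-CPU area.
 */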
2019 static void
2020 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
2021 {
2022         int             i;
2023         vm_paddr_t      pa;
2024         void            *dpcpu;
2025         vm_offset_t va;
2026
2027         /*
2028          * Set up the Open Firmware pmap and add its mappings if not in real
2029          * mode.
2030          */
2031         if (bootverbose)
2032                 printf("%s enter\n", __func__);
2033
2034         /*
2035          * Calculate the last available physical address, and reserve the
2036          * vm_page_array (upper bound).
2037          */
2038         Maxmem = 0;
2039         for (i = 0; phys_avail[i + 2] != 0; i += 2)
2040                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
2041
2042         /*
2043          * Set the start and end of kva.
2044          */
2045         virtual_avail = VM_MIN_KERNEL_ADDRESS;
2046         virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
2047
2048         /*
2049          * Remap any early IO mappings (console framebuffer, etc.)
2050          */
2051         bs_remap_earlyboot();
2052
2053         /*
2054          * Allocate a kernel stack with a guard page for thread0 and map it
2055          * into the kernel page map.
2056          */
2057         pa = allocpages(kstack_pages);
2058         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
2059         virtual_avail = va + kstack_pages * PAGE_SIZE;
2060         CTR2(KTR_PMAP, "mmu_radix_late_bootstrap: kstack0 at %#lx (%#lx)", pa, va);
2061         thread0.td_kstack = va;
2062         for (i = 0; i < kstack_pages; i++) {
2063                 mmu_radix_kenter(va, pa);
2064                 pa += PAGE_SIZE;
2065                 va += PAGE_SIZE;
2066         }
2067         thread0.td_kstack_pages = kstack_pages;
2068
2069         /*
2070          * Allocate virtual address space for the message buffer.
2071          */
2072         pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK) >> PAGE_SHIFT);
2073         msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
2074
2075         /*
2076          * Allocate virtual address space for the dynamic percpu area.
2077          */
2078         pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
2079         dpcpu = (void *)PHYS_TO_DMAP(pa);
2080         dpcpu_init(dpcpu, curcpu);
2081         /*
2082          * Reserve some special page table entries/VA space for temporary
2083          * mapping of pages.
2084          */
2085 }
2086
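/*
 * Locate the partition table in the direct map and point the PTCR at it;
 * the register value encodes the table's physical base together with its
 * size as a power-of-two shift minus 12.
 */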
2087 static void
2088 mmu_parttab_init(void)
2089 {
2090         uint64_t ptcr;
2091
2092         isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
2093
2094         if (bootverbose)
2095                 printf("%s parttab: %p\n", __func__, isa3_parttab);
2096         ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2097         if (bootverbose)
2098                 printf("setting ptcr %lx\n", ptcr);
2099         mtspr(SPR_PTCR, ptcr);
2100 }
2101
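/*
 * Install new page table and process table pointers in the partition
 * table entry for the given LPID, then invalidate any translations
 * cached from the previous entry (two invalidations when the old entry
 * was radix format, one otherwise) and synchronize the table update.
 */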
2102 static void
2103 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
2104 {
2105         uint64_t prev;
2106
2107         if (bootverbose)
2108                 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
2109                            lpid, pagetab, proctab);
2110         prev = be64toh(isa3_parttab[lpid].pagetab);
2111         isa3_parttab[lpid].pagetab = htobe64(pagetab);
2112         isa3_parttab[lpid].proctab = htobe64(proctab);
2113
2114         if (prev & PARTTAB_HR) {
2115                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
2116                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2117                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2118                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2119         } else {
2120                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
2121                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2122         }
2123         ttusync();
2124 }
2125
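/*
 * Point partition table entry 0 at the kernel's radix tree, marking the
 * entry as host-radix (PARTTAB_HR) format.
 */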
2126 static void
2127 mmu_radix_parttab_init(void)
2128 {
2129         uint64_t pagetab;
2130
2131         mmu_parttab_init();
2132         pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) | \
2133                          RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
2134         mmu_parttab_update(0, pagetab, 0);
2135 }
2136
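/*
 * Record the process table's physical base and size in partition table
 * entry 0, preserving the existing page table pointer.
 */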
2137 static void
2138 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
2139 {
2140         uint64_t pagetab, proctab;
2141
2142         pagetab = be64toh(isa3_parttab[0].pagetab);
2143         proctab = proctabpa | table_size | PARTTAB_GR;
2144         mmu_parttab_update(0, pagetab, proctab);
2145 }
2146
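/*
 * Set up the process table: point entry 0 at the kernel's radix tree,
 * register the table with the partition table, flush stale translations,
 * and assign the kernel pmap its PID.
 */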
2147 static void
2148 mmu_radix_proctab_init(void)
2149 {
2150
2151         isa3_base_pid = 1;
2152
2153         isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
2154         isa3_proctab->proctab0 =
2155             htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2156                 RADIX_PGD_INDEX_SHIFT);
2157
2158         mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
2159
2160         __asm __volatile("ptesync" : : : "memory");
2161         __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2162                      "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
2163         __asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
2164         if (bootverbose)
2165                 printf("process table %p and kernel radix PDE: %p\n",
2166                            isa3_proctab, kernel_pmap->pm_pml1);
2167         mtmsr(mfmsr() | PSL_DR);
2168         mtmsr(mfmsr() & ~PSL_DR);
2169         kernel_pmap->pm_pid = isa3_base_pid;
2170         isa3_base_pid++;
2171 }
2172
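/*
 * Implement madvise(MADV_DONTNEED/MADV_FREE) for the given range: demote
 * managed 2MB mappings so that 4KB mappings may be repromoted later, and
 * clear the accessed and modified bits on the 4KB mappings, transferring
 * the dirty state to the vm_page first when the advice is MADV_DONTNEED.
 */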
2173 void
2174 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2175     int advice)
2176 {
2177         struct rwlock *lock;
2178         pml1_entry_t *l1e;
2179         pml2_entry_t *l2e;
2180         pml3_entry_t oldl3e, *l3e;
2181         pt_entry_t *pte;
2182         vm_offset_t va, va_next;
2183         vm_page_t m;
2184         boolean_t anychanged;
2185
2186         if (advice != MADV_DONTNEED && advice != MADV_FREE)
2187                 return;
2188         anychanged = FALSE;
2189         PMAP_LOCK(pmap);
2190         for (; sva < eva; sva = va_next) {
2191                 l1e = pmap_pml1e(pmap, sva);
2192                 if ((be64toh(*l1e) & PG_V) == 0) {
2193                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2194                         if (va_next < sva)
2195                                 va_next = eva;
2196                         continue;
2197                 }
2198                 l2e = pmap_l1e_to_l2e(l1e, sva);
2199                 if ((be64toh(*l2e) & PG_V) == 0) {
2200                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2201                         if (va_next < sva)
2202                                 va_next = eva;
2203                         continue;
2204                 }
2205                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2206                 if (va_next < sva)
2207                         va_next = eva;
2208                 l3e = pmap_l2e_to_l3e(l2e, sva);
2209                 oldl3e = be64toh(*l3e);
2210                 if ((oldl3e & PG_V) == 0)
2211                         continue;
2212                 else if ((oldl3e & RPTE_LEAF) != 0) {
2213                         if ((oldl3e & PG_MANAGED) == 0)
2214                                 continue;
2215                         lock = NULL;
2216                         if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
2217                                 if (lock != NULL)
2218                                         rw_wunlock(lock);
2219
2220                                 /*
2221                                  * The large page mapping was destroyed.
2222                                  */
2223                                 continue;
2224                         }
2225
2226                         /*
2227                          * Unless the page mappings are wired, remove the
2228                          * mapping to a single page so that a subsequent
2229                          * access may repromote.  Since the underlying page
2230                          * table page is fully populated, this removal never
2231                          * frees a page table page.
2232                          */
2233                         if ((oldl3e & PG_W) == 0) {
2234                                 pte = pmap_l3e_to_pte(l3e, sva);
2235                                 KASSERT((be64toh(*pte) & PG_V) != 0,
2236                                     ("pmap_advise: invalid PTE"));
2237                                 pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), NULL,
2238                                     &lock);
2239                                 anychanged = TRUE;
2240                         }
2241                         if (lock != NULL)
2242                                 rw_wunlock(lock);
2243                 }
2244                 if (va_next > eva)
2245                         va_next = eva;
2246                 va = va_next;
2247                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
2248                          pte++, sva += PAGE_SIZE) {
2249                         MPASS(pte == pmap_pte(pmap, sva));
2250
2251                         if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
2252                                 goto maybe_invlrng;
2253                         else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2254                                 if (advice == MADV_DONTNEED) {
2255                                         /*
2256                                          * Future calls to pmap_is_modified()
2257                                          * can be avoided by making the page
2258                                          * dirty now.
2259                                          */
2260                                         m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
2261                                         vm_page_dirty(m);
2262                                 }
2263                                 atomic_clear_long(pte, htobe64(PG_M | PG_A));
2264                         } else if ((be64toh(*pte) & PG_A) != 0)
2265                                 atomic_clear_long(pte, htobe64(PG_A));
2266                         else
2267                                 goto maybe_invlrng;
2268                         anychanged = TRUE;
2269                         continue;
2270 maybe_invlrng:
2271                         if (va != va_next) {
2272                                 anychanged = true;
2273                                 va = va_next;
2274                         }
2275                 }
2276                 if (va != va_next)
2277                         anychanged = true;
2278         }
2279         if (anychanged)
2280                 pmap_invalidate_all(pmap);
2281         PMAP_UNLOCK(pmap);
2282 }
2283
2284 /*
2285  * Routines used in machine-dependent code
2286  */
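/*
 * Machine-dependent bootstrap entry point for the radix MMU: run the
 * early and late bootstrap stages and, on PowerNV, enable radix
 * translation and initialize the partition table before the process
 * table and kernel PID are set up.
 */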
2287 static void
2288 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
2289 {
2290         uint64_t lpcr;
2291
2292         if (bootverbose)
2293                 printf("%s\n", __func__);
2294         hw_direct_map = 1;
2295         mmu_radix_early_bootstrap(start, end);
2296         if (bootverbose)
2297                 printf("early bootstrap complete\n");
2298         if (powernv_enabled) {
2299                 lpcr = mfspr(SPR_LPCR);
2300                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2301                 mmu_radix_parttab_init();
2302                 mmu_radix_init_amor();
2303                 if (bootverbose)
2304                         printf("powernv init complete\n");
2305         }
2306         mmu_radix_init_iamr();
2307         mmu_radix_proctab_init();
2308         mmu_radix_pid_set(kernel_pmap);
2309         /* XXX assume CPU_FTR_HVMODE */
2310         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2311
2312         mmu_radix_late_bootstrap(start, end);
2313         numa_mem_regions(&numa_pregions, &numa_pregions_sz);
2314         if (bootverbose)
2315                 printf("%s done\n", __func__);
2316         pmap_bootstrapped = 1;
2317         dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
2318         PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
2319 }
2320
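/*
 * Per-CPU MMU initialization: on PowerNV, enable radix translation in
 * the LPCR and point the PTCR at the shared partition table, then
 * program the IAMR, install the kernel PID, and flush the TLB.
 */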
2321 static void
2322 mmu_radix_cpu_bootstrap(int ap)
2323 {
2324         uint64_t lpcr;
2325         uint64_t ptcr;
2326
2327         if (powernv_enabled) {
2328                 lpcr = mfspr(SPR_LPCR);
2329                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2330
2331                 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2332                 mtspr(SPR_PTCR, ptcr);
2333                 mmu_radix_init_amor();
2334         }
2335         mmu_radix_init_iamr();
2336         mmu_radix_pid_set(kernel_pmap);
2337         mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2338 }
2339
2340 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
2341     "2MB page mapping counters");
2342
2343 static u_long pmap_l3e_demotions;
2344 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
2345     &pmap_l3e_demotions, 0, "2MB page demotions");
2346
2347 static u_long pmap_l3e_mappings;
2348 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
2349     &pmap_l3e_mappings, 0, "2MB page mappings");
2350
2351 static u_long pmap_l3e_p_failures;
2352 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
2353     &pmap_l3e_p_failures, 0, "2MB page promotion failures");
2354
2355 static u_long pmap_l3e_promotions;
2356 SYSCTL_ULONG(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
2357     &pmap_l3e_promotions, 0, "2MB page promotions");
2358
2359 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
2360     "1GB page mapping counters");
2361
2362 static u_long pmap_l2e_demotions;
2363 SYSCTL_ULONG(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
2364     &pmap_l2e_demotions, 0, "1GB page demotions");
2365
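/*
 * Clear the modified bit on every mapping of the given page.  Writeable
 * 2MB mappings are demoted and the page's 4KB mapping write-protected so
 * that a later write may repromote; 4KB mappings simply have PG_M
 * cleared.
 */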
2366 void
2367 mmu_radix_clear_modify(vm_page_t m)
2368 {
2369         struct md_page *pvh;
2370         pmap_t pmap;
2371         pv_entry_t next_pv, pv;
2372         pml3_entry_t oldl3e, *l3e;
2373         pt_entry_t oldpte, *pte;
2374         struct rwlock *lock;
2375         vm_offset_t va;
2376         int md_gen, pvh_gen;
2377
2378         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2379             ("pmap_clear_modify: page %p is not managed", m));
2380         vm_page_assert_busied(m);
2381         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
2382
2383         /*
2384          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
2385          * If the object containing the page is locked and the page is not
2386          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
2387          */
2388         if ((m->a.flags & PGA_WRITEABLE) == 0)
2389                 return;
2390         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2391             pa_to_pvh(VM_PAGE_TO_PHYS(m));
2392         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2393         rw_wlock(lock);
2394 restart:
2395         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
2396                 pmap = PV_PMAP(pv);
2397                 if (!PMAP_TRYLOCK(pmap)) {
2398                         pvh_gen = pvh->pv_gen;
2399                         rw_wunlock(lock);
2400                         PMAP_LOCK(pmap);
2401                         rw_wlock(lock);
2402                         if (pvh_gen != pvh->pv_gen) {
2403                                 PMAP_UNLOCK(pmap);
2404                                 goto restart;
2405                         }
2406                 }
2407                 va = pv->pv_va;
2408                 l3e = pmap_pml3e(pmap, va);
2409                 oldl3e = be64toh(*l3e);
2410                 if ((oldl3e & PG_RW) != 0) {
2411                         if (pmap_demote_l3e_locked(pmap, l3e, va, &lock)) {
2412                                 if ((oldl3e & PG_W) == 0) {
2413                                         /*
2414                                          * Write protect the mapping to a
2415                                          * single page so that a subsequent
2416                                          * write access may repromote.
2417                                          */
2418                                         va += VM_PAGE_TO_PHYS(m) - (oldl3e &
2419                                             PG_PS_FRAME);
2420                                         pte = pmap_l3e_to_pte(l3e, va);
2421                                         oldpte = be64toh(*pte);
2422                                         if ((oldpte & PG_V) != 0) {
2423                                                 while (!atomic_cmpset_long(pte,
2424                                                     htobe64(oldpte),
2425                                                         htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
2426                                                            oldpte = be64toh(*pte);
2427                                                 vm_page_dirty(m);
2428                                                 pmap_invalidate_page(pmap, va);
2429                                         }
2430                                 }
2431                         }
2432                 }
2433                 PMAP_UNLOCK(pmap);
2434         }
2435         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2436                 pmap = PV_PMAP(pv);
2437                 if (!PMAP_TRYLOCK(pmap)) {
2438                         md_gen = m->md.pv_gen;
2439                         pvh_gen = pvh->pv_gen;
2440                         rw_wunlock(lock);
2441                         PMAP_LOCK(pmap);
2442                         rw_wlock(lock);
2443                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2444                                 PMAP_UNLOCK(pmap);
2445                                 goto restart;
2446                         }
2447                 }
2448                 l3e = pmap_pml3e(pmap, pv->pv_va);
2449                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
2450                     " a 2mpage in page %p's pv list", m));
2451                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
2452                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2453                         atomic_clear_long(pte, htobe64(PG_M));
2454                         pmap_invalidate_page(pmap, pv->pv_va);
2455                 }
2456                 PMAP_UNLOCK(pmap);
2457         }
2458         rw_wunlock(lock);
2459 }
2460
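/*
 * Copy valid mappings from src_pmap to dst_pmap for the given range;
 * nothing is done unless the range appears at the same address in both
 * pmaps.  Copied mappings are entered unwired, and the copy stops
 * quietly if a page table page cannot be allocated.
 */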
2461 void
2462 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2463     vm_size_t len, vm_offset_t src_addr)
2464 {
2465         struct rwlock *lock;
2466         struct spglist free;
2467         vm_offset_t addr;
2468         vm_offset_t end_addr = src_addr + len;
2469         vm_offset_t va_next;
2470         vm_page_t dst_pdpg, dstmpte, srcmpte;
2471         bool invalidate_all;
2472
2473         CTR6(KTR_PMAP,
2474             "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
2475             __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
2476
2477         if (dst_addr != src_addr)
2478                 return;
2479         lock = NULL;
2480         invalidate_all = false;
2481         if (dst_pmap < src_pmap) {
2482                 PMAP_LOCK(dst_pmap);
2483                 PMAP_LOCK(src_pmap);
2484         } else {
2485                 PMAP_LOCK(src_pmap);
2486                 PMAP_LOCK(dst_pmap);
2487         }
2488
2489         for (addr = src_addr; addr < end_addr; addr = va_next) {
2490                 pml1_entry_t *l1e;
2491                 pml2_entry_t *l2e;
2492                 pml3_entry_t srcptepaddr, *l3e;
2493                 pt_entry_t *src_pte, *dst_pte;
2494
2495                 l1e = pmap_pml1e(src_pmap, addr);
2496                 if ((be64toh(*l1e) & PG_V) == 0) {
2497                         va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2498                         if (va_next < addr)
2499                                 va_next = end_addr;
2500                         continue;
2501                 }
2502
2503                 l2e = pmap_l1e_to_l2e(l1e, addr);
2504                 if ((be64toh(*l2e) & PG_V) == 0) {
2505                         va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2506                         if (va_next < addr)
2507                                 va_next = end_addr;
2508                         continue;
2509                 }
2510
2511                 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2512                 if (va_next < addr)
2513                         va_next = end_addr;
2514
2515                 l3e = pmap_l2e_to_l3e(l2e, addr);
2516                 srcptepaddr = be64toh(*l3e);
2517                 if (srcptepaddr == 0)
2518                         continue;
2519
2520                 if (srcptepaddr & RPTE_LEAF) {
2521                         if ((addr & L3_PAGE_MASK) != 0 ||
2522                             addr + L3_PAGE_SIZE > end_addr)
2523                                 continue;
2524                         dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
2525                         if (dst_pdpg == NULL)
2526                                 break;
2527                         l3e = (pml3_entry_t *)
2528                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
2529                         l3e = &l3e[pmap_pml3e_index(addr)];
2530                         if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
2531                             pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
2532                             PMAP_ENTER_NORECLAIM, &lock))) {
2533                                 *l3e = htobe64(srcptepaddr & ~PG_W);
2534                                 pmap_resident_count_inc(dst_pmap,
2535                                     L3_PAGE_SIZE / PAGE_SIZE);
2536                                 atomic_add_long(&pmap_l3e_mappings, 1);
2537                         } else
2538                                 dst_pdpg->ref_count--;
2539                         continue;
2540                 }
2541
2542                 srcptepaddr &= PG_FRAME;
2543                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2544                 KASSERT(srcmpte->ref_count > 0,
2545                     ("pmap_copy: source page table page is unused"));
2546
2547                 if (va_next > end_addr)
2548                         va_next = end_addr;
2549
2550                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
2551                 src_pte = &src_pte[pmap_pte_index(addr)];
2552                 dstmpte = NULL;
2553                 while (addr < va_next) {
2554                         pt_entry_t ptetemp;
2555                         ptetemp = be64toh(*src_pte);
2556                         /*
2557                          * We only virtual-copy managed pages.
2558                          */
2559                         if ((ptetemp & PG_MANAGED) != 0) {
2560                                 if (dstmpte != NULL &&
2561                                     dstmpte->pindex == pmap_l3e_pindex(addr))
2562                                         dstmpte->ref_count++;
2563                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
2564                                     addr, NULL)) == NULL)
2565                                         goto out;
2566                                 dst_pte = (pt_entry_t *)
2567                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2568                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
2569                                 if (be64toh(*dst_pte) == 0 &&
2570                                     pmap_try_insert_pv_entry(dst_pmap, addr,
2571                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
2572                                     &lock)) {
2573                                         /*
2574                                          * Clear the wired, modified, and
2575                                          * accessed (referenced) bits
2576                                          * during the copy.
2577                                          */
2578                                         *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
2579                                             PG_A));
2580                                         pmap_resident_count_inc(dst_pmap, 1);
2581                                 } else {
2582                                         SLIST_INIT(&free);
2583                                         if (pmap_unwire_ptp(dst_pmap, addr,
2584                                             dstmpte, &free)) {
2585                                                 /*
2586                                                  * Although "addr" is not
2587                                                  * mapped, paging-structure
2588                                                  * caches could nonetheless
2589                                                  * have entries that refer to
2590                                                  * the freed page table pages.
2591                                                  * Invalidate those entries.
2592                                                  */
2593                                                 invalidate_all = true;
2594                                                 vm_page_free_pages_toq(&free,
2595                                                     true);
2596                                         }
2597                                         goto out;
2598                                 }
2599                                 if (dstmpte->ref_count >= srcmpte->ref_count)
2600                                         break;
2601                         }
2602                         addr += PAGE_SIZE;
2603                         if (__predict_false((addr & L3_PAGE_MASK) == 0))
2604                                 src_pte = pmap_pte(src_pmap, addr);
2605                         else
2606                                 src_pte++;
2607                 }
2608         }
2609 out:
2610         if (invalidate_all)
2611                 pmap_invalidate_all(dst_pmap);
2612         if (lock != NULL)
2613                 rw_wunlock(lock);
2614         PMAP_UNLOCK(src_pmap);
2615         PMAP_UNLOCK(dst_pmap);
2616 }
2617
2618 static void
2619 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
2620 {
2621         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2622         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2623
2624         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, src, dst);
2625         /*
2626          * XXX slow
2627          */
2628         bcopy((void *)src, (void *)dst, PAGE_SIZE);
2629 }
2630
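/*
 * Copy xfersize bytes from the pages in ma[], starting at a_offset, to
 * the pages in mb[], starting at b_offset, splitting the transfer at
 * page boundaries and performing the copies through the direct map.
 */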
2631 static void
2632 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2633     vm_offset_t b_offset, int xfersize)
2634 {
2635         void *a_cp, *b_cp;
2636         vm_offset_t a_pg_offset, b_pg_offset;
2637         int cnt;
2638
2639         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
2640             a_offset, mb, b_offset, xfersize);
2641         
2642         while (xfersize > 0) {
2643                 a_pg_offset = a_offset & PAGE_MASK;
2644                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2645                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2646                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
2647                     a_pg_offset;
2648                 b_pg_offset = b_offset & PAGE_MASK;
2649                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2650                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2651                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
2652                     b_pg_offset;
2653                 bcopy(a_cp, b_cp, cnt);
2654                 a_offset += cnt;
2655                 b_offset += cnt;
2656                 xfersize -= cnt;
2657         }
2658 }
2659
2660 #if VM_NRESERVLEVEL > 0
2661 /*
2662  * Tries to promote the 512, contiguous 4KB page mappings that are within a
2663  * single page table page (PTP) to a single 2MB page mapping.  For promotion
2664  * to occur, two conditions must be met: (1) the 4KB page mappings must map
2665  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2666  * identical characteristics.
2667  */
2668 static int
2669 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
2670     struct rwlock **lockp)
2671 {
2672         pml3_entry_t newpde;
2673         pt_entry_t *firstpte, oldpte, pa, *pte;
2674         vm_page_t mpte;
2675
2676         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2677
2678         /*
2679          * Examine the first PTE in the specified PTP.  Abort if this PTE is
2680          * either invalid, unused, or does not map the first 4KB physical page
2681          * within a 2MB page.
2682          */
2683         firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
2684 setpde:
2685         newpde = be64toh(*firstpte);
2686         if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2687                 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2688                     " in pmap %p", va, pmap);
2689                 goto fail;
2690         }
2691         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2692                 /*
2693                  * When PG_M is already clear, PG_RW can be cleared without
2694                  * a TLB invalidation.
2695                  */
2696                 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
2697                         goto setpde;
2698                 newpde &= ~RPTE_EAA_W;
2699         }
2700
2701         /*
2702          * Examine each of the other PTEs in the specified PTP.  Abort if this
2703          * PTE maps an unexpected 4KB physical page or does not have identical
2704          * characteristics to the first PTE.
2705          */
2706         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
2707         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2708 setpte:
2709                 oldpte = be64toh(*pte);
2710                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2711                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2712                             " in pmap %p", va, pmap);
2713                         goto fail;
2714                 }
2715                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2716                         /*
2717                          * When PG_M is already clear, PG_RW can be cleared
2718                          * without a TLB invalidation.
2719                          */
2720                         if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
2721                                 goto setpte;
2722                         oldpte &= ~RPTE_EAA_W;
2723                         CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
2724                             " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
2725                             (va & ~L3_PAGE_MASK), pmap);
2726                 }
2727                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2728                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2729                             " in pmap %p", va, pmap);
2730                         goto fail;
2731                 }
2732                 pa -= PAGE_SIZE;
2733         }
2734
2735         /*
2736          * Save the page table page in its current state until the PDE
2737          * mapping the superpage is demoted by pmap_demote_l3e() or
2738          * destroyed by pmap_remove_l3e().
2739          */
2740         mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
2741         KASSERT(mpte >= vm_page_array &&
2742             mpte < &vm_page_array[vm_page_array_size],
2743             ("pmap_promote_l3e: page table page is out of range"));
2744         KASSERT(mpte->pindex == pmap_l3e_pindex(va),
2745             ("pmap_promote_l3e: page table page's pindex is wrong"));
2746         if (pmap_insert_pt_page(pmap, mpte)) {
2747                 CTR2(KTR_PMAP,
2748                     "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
2749                     pmap);
2750                 goto fail;
2751         }
2752
2753         /*
2754          * Promote the pv entries.
2755          */
2756         if ((newpde & PG_MANAGED) != 0)
2757                 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
2758
2759         pte_store(pde, PG_PROMOTED | newpde);
2760         atomic_add_long(&pmap_l3e_promotions, 1);
2761         CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
2762             " in pmap %p", va, pmap);
2763         return (0);
2764  fail:
2765         atomic_add_long(&pmap_l3e_p_failures, 1);
2766         return (KERN_FAILURE);
2767 }
2768 #endif /* VM_NRESERVLEVEL > 0 */
2769
2770 int
2771 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
2772     vm_prot_t prot, u_int flags, int8_t psind)
2773 {
2774         struct rwlock *lock;
2775         pml3_entry_t *l3e;
2776         pt_entry_t *pte;
2777         pt_entry_t newpte, origpte;
2778         pv_entry_t pv;
2779         vm_paddr_t opa, pa;
2780         vm_page_t mpte, om;
2781         int rv, retrycount;
2782         boolean_t nosleep, invalidate_all, invalidate_page;
2783
2784         va = trunc_page(va);
2785         retrycount = 0;
2786         invalidate_page = invalidate_all = false;
2787         CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
2788             m, prot, flags, psind);
2789         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2790         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
2791             va >= kmi.clean_eva,
2792             ("pmap_enter: managed mapping within the clean submap"));
2793         if ((m->oflags & VPO_UNMANAGED) == 0)
2794                 VM_PAGE_OBJECT_BUSY_ASSERT(m);
2795
2796         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
2797             ("pmap_enter: flags %u has reserved bits set", flags));
2798         pa = VM_PAGE_TO_PHYS(m);
2799         newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
2800         if ((flags & VM_PROT_WRITE) != 0)
2801                 newpte |= PG_M;
2802         if ((flags & VM_PROT_READ) != 0)
2803                 newpte |= PG_A;
2804         if (prot & VM_PROT_READ)
2805                 newpte |= RPTE_EAA_R;
2806         if ((prot & VM_PROT_WRITE) != 0)
2807                 newpte |= RPTE_EAA_W;
2808         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
2809             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
2810
2811         if (prot & VM_PROT_EXECUTE)
2812                 newpte |= PG_X;
2813         if ((flags & PMAP_ENTER_WIRED) != 0)
2814                 newpte |= PG_W;
2815         if (va >= DMAP_MIN_ADDRESS)
2816                 newpte |= RPTE_EAA_P;
2817         newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
2818         /*
2819          * Set modified bit gratuitously for writeable mappings if
2820          * the page is unmanaged. We do not want to take a fault
2821          * to do the dirty bit accounting for these mappings.
2822          */
2823         if ((m->oflags & VPO_UNMANAGED) != 0) {
2824                 if ((newpte & PG_RW) != 0)
2825                         newpte |= PG_M;
2826         } else
2827                 newpte |= PG_MANAGED;
2828
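        /*
         * "newpte" now holds the fully formed candidate PTE: physical
         * address, EAA permission bits, cache attributes, and the wired,
         * managed, referenced, and modified flags requested by the caller.
         * Everything below runs with the pmap locked.
         */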
2829         lock = NULL;
2830         PMAP_LOCK(pmap);
2831         if (psind == 1) {
2832                 /* Assert the required virtual and physical alignment. */
2833                 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
2834                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2835                 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
2836                 goto out;
2837         }
2838         mpte = NULL;
2839
2840         /*
2841          * In the case that a page table page is not
2842          * resident, we are creating it here.
2843          */
2844 retry:
2845         l3e = pmap_pml3e(pmap, va);
2846         if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
2847             pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
2848                 pte = pmap_l3e_to_pte(l3e, va);
2849                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
2850                         mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
2851                         mpte->ref_count++;
2852                 }
2853         } else if (va < VM_MAXUSER_ADDRESS) {
2854                 /*
2855                  * Here if the pte page isn't mapped, or if it has been
2856                  * deallocated.
2857                  */
2858                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2859                 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
2860                     nosleep ? NULL : &lock);
2861                 if (mpte == NULL && nosleep) {
2862                         rv = KERN_RESOURCE_SHORTAGE;
2863                         goto out;
2864                 }
2865                 if (__predict_false(retrycount++ == 6))
2866                         panic("too many retries");
2867                 invalidate_all = true;
2868                 goto retry;
2869         } else
2870                 panic("pmap_enter: invalid page directory va=%#lx", va);
2871
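        /*
         * "pte" now points at the 4KB PTE slot for "va".  Examine the
         * existing entry to decide between a protection/wiring update, the
         * replacement of a mapping to a different physical page, and a
         * brand new mapping.
         */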
2872         origpte = be64toh(*pte);
2873         pv = NULL;
2874
2875         /*
2876          * Is the specified virtual address already mapped?
2877          */
2878         if ((origpte & PG_V) != 0) {
2879 #ifdef INVARIANTS
2880                 if (VERBOSE_PMAP || pmap_logging) {
2881                         printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
2882                             " asid=%lu curpid=%d name=%s origpte0x%lx\n",
2883                             pmap, va, m, prot, flags, psind, pmap->pm_pid,
2884                             curproc->p_pid, curproc->p_comm, origpte);
2885                         pmap_pte_walk(pmap->pm_pml1, va);
2886                 }
2887 #endif
2888                 /*
2889                  * Wiring change, just update stats. We don't worry about
2890                  * wiring PT pages as they remain resident as long as there
2891                  * are valid mappings in them. Hence, if a user page is wired,
2892                  * the PT page will be also.
2893                  */
2894                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
2895                         pmap->pm_stats.wired_count++;
2896                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
2897                         pmap->pm_stats.wired_count--;
2898
2899                 /*
2900                  * Remove the extra PT page reference.
2901                  */
2902                 if (mpte != NULL) {
2903                         mpte->ref_count--;
2904                         KASSERT(mpte->ref_count > 0,
2905                             ("pmap_enter: missing reference to page table page,"
2906                              " va: 0x%lx", va));
2907                 }
2908
2909                 /*
2910                  * Has the physical page changed?
2911                  */
2912                 opa = origpte & PG_FRAME;
2913                 if (opa == pa) {
2914                         /*
2915                          * No, might be a protection or wiring change.
2916                          */
2917                         if ((origpte & PG_MANAGED) != 0 &&
2918                             (newpte & PG_RW) != 0)
2919                                 vm_page_aflag_set(m, PGA_WRITEABLE);
2920                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
2921                                 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
2922                                         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
2923                                                 goto retry;
2924                                         if ((newpte & PG_M) != (origpte & PG_M))
2925                                                 vm_page_dirty(m);
2926                                         if ((newpte & PG_A) != (origpte & PG_A))
2927                                                 vm_page_aflag_set(m, PGA_REFERENCED);
2928                                         ptesync();
2929                                 } else
2930                                         invalidate_all = true;
2931                                 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
2932                                         goto unchanged;
2933                         }
2934                         goto validate;
2935                 }
2936
2937                 /*
2938                  * The physical page has changed.  Temporarily invalidate
2939                  * the mapping.  This ensures that all threads sharing the
2940                  * pmap keep a consistent view of the mapping, which is
2941                  * necessary for the correct handling of COW faults.  It
2942                  * also permits reuse of the old mapping's PV entry,
2943                  * avoiding an allocation.
2944                  *
2945                  * For consistency, handle unmanaged mappings the same way.
2946                  */
2947                 origpte = be64toh(pte_load_clear(pte));
2948                 KASSERT((origpte & PG_FRAME) == opa,
2949                     ("pmap_enter: unexpected pa update for %#lx", va));
2950                 if ((origpte & PG_MANAGED) != 0) {
2951                         om = PHYS_TO_VM_PAGE(opa);
2952
2953                         /*
2954                          * The pmap lock is sufficient to synchronize with
2955                          * concurrent calls to pmap_page_test_mappings() and
2956                          * pmap_ts_referenced().
2957                          */
2958                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
2959                                 vm_page_dirty(om);
2960                         if ((origpte & PG_A) != 0)
2961                                 vm_page_aflag_set(om, PGA_REFERENCED);
2962                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
2963                         pv = pmap_pvh_remove(&om->md, pmap, va);
2964                         if ((newpte & PG_MANAGED) == 0)
2965                                 free_pv_entry(pmap, pv);
2966 #ifdef INVARIANTS
2967                         else if (origpte & PG_MANAGED) {
2968                                 if (pv == NULL) {
2969                                         pmap_page_print_mappings(om);
2970                                         MPASS(pv != NULL);
2971                                 }
2972                         }
2973 #endif
2974                         if ((om->a.flags & PGA_WRITEABLE) != 0 &&
2975                             TAILQ_EMPTY(&om->md.pv_list) &&
2976                             ((om->flags & PG_FICTITIOUS) != 0 ||
2977                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
2978                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
2979                 }
2980                 if ((origpte & PG_A) != 0)
2981                         invalidate_page = true;
2982                 origpte = 0;
2983         } else {
2984                 if (pmap != kernel_pmap) {
2985 #ifdef INVARIANTS
2986                         if (VERBOSE_PMAP || pmap_logging)
2987                                 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
2988                                     pmap, va, m, prot, flags, psind,
2989                                     pmap->pm_pid, curproc->p_pid,
2990                                     curproc->p_comm);
2991 #endif
2992                 }
2993
2994                 /*
2995                  * Increment the counters.
2996                  */
2997                 if ((newpte & PG_W) != 0)
2998                         pmap->pm_stats.wired_count++;
2999                 pmap_resident_count_inc(pmap, 1);
3000         }
3001
3002         /*
3003          * Enter on the PV list if part of our managed memory.
3004          */
3005         if ((newpte & PG_MANAGED) != 0) {
3006                 if (pv == NULL) {
3007                         pv = get_pv_entry(pmap, &lock);
3008                         pv->pv_va = va;
3009                 }
3010 #ifdef VERBOSE_PV
3011                 else
3012                         printf("reassigning pv: %p to pmap: %p\n",
3013                                    pv, pmap);
3014 #endif
3015                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3016                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3017                 m->md.pv_gen++;
3018                 if ((newpte & PG_RW) != 0)
3019                         vm_page_aflag_set(m, PGA_WRITEABLE);
3020         }
3021
3022         /*
3023          * Update the PTE.
3024          */
3025         if ((origpte & PG_V) != 0) {
3026 validate:
3027                 origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
3028                 KASSERT((origpte & PG_FRAME) == pa,
3029                     ("pmap_enter: unexpected pa update for %#lx", va));
3030                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
3031                     (PG_M | PG_RW)) {
3032                         if ((origpte & PG_MANAGED) != 0)
3033                                 vm_page_dirty(m);
3034                         invalidate_page = true;
3035
3036                         /*
3037                          * Although the PTE may still have PG_RW set, TLB
3038                          * invalidation may nonetheless be required because
3039                          * the PTE no longer has PG_M set.
3040                          */
3041                 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
3042                         /*
3043                          * Removing capabilities requires a TLB invalidation on POWER.
3044                          */
3045                         invalidate_page = true;
3046                         goto unchanged;
3047                 }
3048                 if ((origpte & PG_A) != 0)
3049                         invalidate_page = true;
3050         } else {
3051                 pte_store(pte, newpte);
3052                 ptesync();
3053         }
3054 unchanged:
3055
3056 #if VM_NRESERVLEVEL > 0
3057         /*
3058          * If both the page table page and the reservation are fully
3059          * populated, then attempt promotion.
3060          */
3061         if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
3062             mmu_radix_ps_enabled(pmap) &&
3063             (m->flags & PG_FICTITIOUS) == 0 &&
3064             vm_reserv_level_iffullpop(m) == 0 &&
3065                 pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
3066                 invalidate_all = true;
3067 #endif
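        /*
         * Paths that may have changed more than one translation (a
         * promotion or a page table page reallocation) flush the whole
         * address space; otherwise invalidating the single page at "va"
         * is sufficient.
         */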
3068         if (invalidate_all)
3069                 pmap_invalidate_all(pmap);
3070         else if (invalidate_page)
3071                 pmap_invalidate_page(pmap, va);
3072
3073         rv = KERN_SUCCESS;
3074 out:
3075         if (lock != NULL)
3076                 rw_wunlock(lock);
3077         PMAP_UNLOCK(pmap);
3078
3079         return (rv);
3080 }
3081
3082 /*
3083  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
3084  * if successful.  Returns false if (1) a page table page cannot be allocated
3085  * without sleeping, (2) a mapping already exists at the specified virtual
3086  * address, or (3) a PV entry cannot be allocated without reclaiming another
3087  * PV entry.
3088  */
3089 static bool
3090 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3091     struct rwlock **lockp)
3092 {
3093         pml3_entry_t newpde;
3094
3095         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3096         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
3097             RPTE_LEAF | PG_V;
3098         if ((m->oflags & VPO_UNMANAGED) == 0)
3099                 newpde |= PG_MANAGED;
3100         if (prot & VM_PROT_EXECUTE)
3101                 newpde |= PG_X;
3102         if (prot & VM_PROT_READ)
3103                 newpde |= RPTE_EAA_R;
3104         if (va >= DMAP_MIN_ADDRESS)
3105                 newpde |= RPTE_EAA_P;
3106         return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
3107             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
3108             KERN_SUCCESS);
3109 }
3110
3111 /*
3112  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3113  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3114  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3115  * a mapping already exists at the specified virtual address.  Returns
3116  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3117  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
3118  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3119  *
3120  * The parameter "m" is only used when creating a managed, writeable mapping.
3121  */
3122 static int
3123 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
3124     vm_page_t m, struct rwlock **lockp)
3125 {
3126         struct spglist free;
3127         pml3_entry_t oldl3e, *l3e;
3128         vm_page_t mt, pdpg;
3129
3130         KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
3131             ("pmap_enter_pde: newpde is missing PG_M"));
3132         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3133
3134         if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3135             NULL : lockp)) == NULL) {
3136                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3137                     " in pmap %p", va, pmap);
3138                 return (KERN_RESOURCE_SHORTAGE);
3139         }
3140         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3141         l3e = &l3e[pmap_pml3e_index(va)];
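        /*
         * "pdpg" is the page holding the L3 (PDE) entries for this region;
         * "l3e" now points at the 2MB slot for "va".
         */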
3142         oldl3e = be64toh(*l3e);
3143         if ((oldl3e & PG_V) != 0) {
3144                 KASSERT(pdpg->ref_count > 1,
3145                     ("pmap_enter_pde: pdpg's wire count is too low"));
3146                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3147                         pdpg->ref_count--;
3148                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3149                             " in pmap %p", va, pmap);
3150                         return (KERN_FAILURE);
3151                 }
3152                 /* Break the existing mapping(s). */
3153                 SLIST_INIT(&free);
3154                 if ((oldl3e & RPTE_LEAF) != 0) {
3155                         /*
3156                          * The reference to the PD page that was acquired by
3157                          * pmap_allocl3e() ensures that it won't be freed.
3158                          * However, if the PDE resulted from a promotion, then
3159                          * a reserved PT page could be freed.
3160                          */
3161                         (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
3162                 } else {
3163                         if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
3164                             &free, lockp))
3165                                pmap_invalidate_all(pmap);
3166                 }
3167                 vm_page_free_pages_toq(&free, true);
3168                 if (va >= VM_MAXUSER_ADDRESS) {
3169                         mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
3170                         if (pmap_insert_pt_page(pmap, mt)) {
3171                                 /*
3172                                  * XXX Currently, this can't happen because
3173                                  * we do not perform pmap_enter(psind == 1)
3174                                  * on the kernel pmap.
3175                                  */
3176                                 panic("pmap_enter_pde: trie insert failed");
3177                         }
3178                 } else
3179                         KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
3180                             l3e));
3181         }
3182         if ((newpde & PG_MANAGED) != 0) {
3183                 /*
3184                  * Abort this mapping if its PV entry could not be created.
3185                  */
3186                 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
3187                         SLIST_INIT(&free);
3188                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
3189                                 /*
3190                                  * Although "va" is not mapped, paging-
3191                                  * structure caches could nonetheless have
3192                                  * entries that refer to the freed page table
3193                                  * pages.  Invalidate those entries.
3194                                  */
3195                                 pmap_invalidate_page(pmap, va);
3196                                 vm_page_free_pages_toq(&free, true);
3197                         }
3198                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3199                             " in pmap %p", va, pmap);
3200                         return (KERN_RESOURCE_SHORTAGE);
3201                 }
3202                 if ((newpde & PG_RW) != 0) {
3203                         for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
3204                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
3205                 }
3206         }
3207
3208         /*
3209          * Increment counters.
3210          */
3211         if ((newpde & PG_W) != 0)
3212                 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
3213         pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3214
3215         /*
3216          * Map the superpage.  (This is not a promoted mapping; there will not
3217          * be any lingering 4KB page mappings in the TLB.)
3218          */
3219         pte_store(l3e, newpde);
3220
3221         atomic_add_long(&pmap_l3e_mappings, 1);
3222         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3223             " in pmap %p", va, pmap);
3224         return (KERN_SUCCESS);
3225 }
3226
3227 void
3228 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
3229     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
3230 {
3231         struct rwlock *lock;
3232         vm_offset_t va;
3233         vm_page_t m, mpte;
3234         vm_pindex_t diff, psize;
3235         bool invalidate;
3236
3237         VM_OBJECT_ASSERT_LOCKED(m_start->object);
3238
3239         CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
3240             end, m_start, prot);
3241
3242         invalidate = false;
3243         psize = atop(end - start);
3244         mpte = NULL;
3245         m = m_start;
3246         lock = NULL;
3247         PMAP_LOCK(pmap);
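        /*
         * Walk the object's pages in order.  When "va" is 2MB aligned, at
         * least 2MB of the range remains, and the page is eligible for a
         * superpage mapping (psind == 1), try a single 2MB mapping first;
         * otherwise fall back to a 4KB mapping, carrying "mpte" between
         * iterations so the page table page lookup can be reused.
         */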
3248         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3249                 va = start + ptoa(diff);
3250                 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
3251                     m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
3252                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
3253                         m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1];
3254                 else
3255                         mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
3256                             mpte, &lock, &invalidate);
3257                 m = TAILQ_NEXT(m, listq);
3258         }
3259         ptesync();
3260         if (lock != NULL)
3261                 rw_wunlock(lock);
3262         if (invalidate)
3263                 pmap_invalidate_all(pmap);
3264         PMAP_UNLOCK(pmap);
3265 }
3266
3267 static vm_page_t
3268 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3269     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
3270 {
3271         struct spglist free;
3272         pt_entry_t *pte;
3273         vm_paddr_t pa;
3274
3275         KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
3276             (m->oflags & VPO_UNMANAGED) != 0,
3277             ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
3278         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3279
3280         /*
3281          * In the case that a page table page is not
3282          * resident, we are creating it here.
3283          */
3284         if (va < VM_MAXUSER_ADDRESS) {
3285                 vm_pindex_t ptepindex;
3286                 pml3_entry_t *ptepa;
3287
3288                 /*
3289                  * Calculate pagetable page index
3290                  */
3291                 ptepindex = pmap_l3e_pindex(va);
3292                 if (mpte && (mpte->pindex == ptepindex)) {
3293                         mpte->ref_count++;
3294                 } else {
3295                         /*
3296                          * Get the page directory entry
3297                          */
3298                         ptepa = pmap_pml3e(pmap, va);
3299
3300                         /*
3301                          * If the page table page is mapped, we just increment
3302                          * the hold count, and activate it.  Otherwise, we
3303                          * attempt to allocate a page table page.  If this
3304                          * attempt fails, we don't retry.  Instead, we give up.
3305                          */
3306                         if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
3307                                 if (be64toh(*ptepa) & RPTE_LEAF)
3308                                         return (NULL);
3309                                 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
3310                                 mpte->ref_count++;
3311                         } else {
3312                                 /*
3313                                  * Pass NULL instead of the PV list lock
3314                                  * pointer, because we don't intend to sleep.
3315                                  */
3316                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3317                                 if (mpte == NULL)
3318                                         return (mpte);
3319                         }
3320                 }
3321                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3322                 pte = &pte[pmap_pte_index(va)];
3323         } else {
3324                 mpte = NULL;
3325                 pte = pmap_pte(pmap, va);
3326         }
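        /*
         * This routine never replaces an existing mapping: if the PTE is
         * already valid, drop the extra page table page reference and
         * return without doing anything.
         */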
3327         if (be64toh(*pte)) {
3328                 if (mpte != NULL) {
3329                         mpte->ref_count--;
3330                         mpte = NULL;
3331                 }
3332                 return (mpte);
3333         }
3334
3335         /*
3336          * Enter on the PV list if part of our managed memory.
3337          */
3338         if ((m->oflags & VPO_UNMANAGED) == 0 &&
3339             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3340                 if (mpte != NULL) {
3341                         SLIST_INIT(&free);
3342                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3343                                 /*
3344                                  * Although "va" is not mapped, paging-
3345                                  * structure caches could nonetheless have
3346                                  * entries that refer to the freed page table
3347                                  * pages.  Invalidate those entries.
3348                                  */
3349                                 *invalidate = true;
3350                                 vm_page_free_pages_toq(&free, true);
3351                         }
3352                         mpte = NULL;
3353                 }
3354                 return (mpte);
3355         }
3356
3357         /*
3358          * Increment counters
3359          */
3360         pmap_resident_count_inc(pmap, 1);
3361
3362         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
3363         if (prot & VM_PROT_EXECUTE)
3364                 pa |= PG_X;
3365         else
3366                 pa |= RPTE_EAA_R;
3367         if ((m->oflags & VPO_UNMANAGED) == 0)
3368                 pa |= PG_MANAGED;
3369
3370         pte_store(pte, pa);
3371         return (mpte);
3372 }
3373
3374 void
3375 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
3376     vm_prot_t prot)
3377 {
3378         struct rwlock *lock;
3379         bool invalidate;
3380
3381         lock = NULL;
3382         invalidate = false;
3383         PMAP_LOCK(pmap);
3384         mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
3385             &invalidate);
3386         ptesync();
3387         if (lock != NULL)
3388                 rw_wunlock(lock);
3389         if (invalidate)
3390                 pmap_invalidate_all(pmap);
3391         PMAP_UNLOCK(pmap);
3392 }
3393
3394 vm_paddr_t
3395 mmu_radix_extract(pmap_t pmap, vm_offset_t va)
3396 {
3397         pml3_entry_t *l3e;
3398         pt_entry_t *pte;
3399         vm_paddr_t pa;
3400
3401         l3e = pmap_pml3e(pmap, va);
3402         if (__predict_false(l3e == NULL))
3403                 return (0);
3404         if (be64toh(*l3e) & RPTE_LEAF) {
3405                 pa = (be64toh(*l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
3407         } else {
3408                 /*
3409                  * Beware of a concurrent promotion that changes the
3410                  * PDE at this point!  For example, vtopte() must not
3411                  * be used to access the PTE because it would use the
3412                  * new PDE.  It is, however, safe to use the old PDE
3413                  * because the page table page is preserved by the
3414                  * promotion.
3415                  */
3416                 pte = pmap_l3e_to_pte(l3e, va);
3417                 if (__predict_false(pte == NULL))
3418                         return (0);
3419                 pa = be64toh(*pte);
3420                 pa = (pa & PG_FRAME) | (va & PAGE_MASK);
3422         }
3423         return (pa);
3424 }
3425
3426 vm_page_t
3427 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3428 {
3429         pml3_entry_t l3e, *l3ep;
3430         pt_entry_t pte;
3431         vm_paddr_t pa;
3432         vm_page_t m;
3433
3434         pa = 0;
3435         m = NULL;
3436         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
3437         PMAP_LOCK(pmap);
3438         l3ep = pmap_pml3e(pmap, va);
3439         if (l3ep != NULL && (l3e = be64toh(*l3ep))) {
3440                 if (l3e & RPTE_LEAF) {
3441                         if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
3442                                 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
3443                                     (va & L3_PAGE_MASK));
3444                 } else {
3445                         /* Native endian PTE, do not pass to pmap functions */
3446                         pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
3447                         if ((pte & PG_V) &&
3448                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
3449                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3450                 }
3451                 if (m != NULL && !vm_page_wire_mapped(m))
3452                         m = NULL;
3453         }
3454         PMAP_UNLOCK(pmap);
3455         return (m);
3456 }
3457
3458 static void
3459 mmu_radix_growkernel(vm_offset_t addr)
3460 {
3461         vm_paddr_t paddr;
3462         vm_page_t nkpg;
3463         pml3_entry_t *l3e;
3464         pml2_entry_t *l2e;
3465
3466         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
3467         if (VM_MIN_KERNEL_ADDRESS < addr &&
3468                 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
3469                 return;
3470
3471         addr = roundup2(addr, L3_PAGE_SIZE);
3472         if (addr - 1 >= vm_map_max(kernel_map))
3473                 addr = vm_map_max(kernel_map);
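        /*
         * Each iteration first ensures that an L2 entry (and thus a page
         * directory page) exists for kernel_vm_end, then installs a new
         * page table page in the corresponding L3 entry, extending the
         * mapped kernel virtual address space by L3_PAGE_SIZE.
         */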
3474         while (kernel_vm_end < addr) {
3475                 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
3476                 if ((be64toh(*l2e) & PG_V) == 0) {
3477                         /* We need a new PDP entry */
3478                         nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_PAGE_SIZE_SHIFT,
3479                             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
3480                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3481                         if (nkpg == NULL)
3482                                 panic("pmap_growkernel: no memory to grow kernel");
3483                         if ((nkpg->flags & PG_ZERO) == 0)
3484                                 mmu_radix_zero_page(nkpg);
3485                         paddr = VM_PAGE_TO_PHYS(nkpg);
3486                         pde_store(l2e, paddr);
3487                         continue; /* try again */
3488                 }
3489                 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
3490                 if ((be64toh(*l3e) & PG_V) != 0) {
3491                         kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3492                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3493                                 kernel_vm_end = vm_map_max(kernel_map);
3494                                 break;
3495                         }
3496                         continue;
3497                 }
3498
3499                 nkpg = vm_page_alloc(NULL, pmap_l3e_pindex(kernel_vm_end),
3500                     VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3501                     VM_ALLOC_ZERO);
3502                 if (nkpg == NULL)
3503                         panic("pmap_growkernel: no memory to grow kernel");
3504                 if ((nkpg->flags & PG_ZERO) == 0)
3505                         mmu_radix_zero_page(nkpg);
3506                 paddr = VM_PAGE_TO_PHYS(nkpg);
3507                 pde_store(l3e, paddr);
3508
3509                 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3510                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3511                         kernel_vm_end = vm_map_max(kernel_map);
3512                         break;
3513                 }
3514         }
3515         ptesync();
3516 }
3517
3518 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
3519 static uma_zone_t zone_radix_pgd;
3520
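/*
 * Radix page table root directories are RADIX_PGD_SIZE bytes of physically
 * contiguous memory, so completed roots are cached in a UMA zone (created
 * in mmu_radix_init()); the import and release routines below supply and
 * reclaim the backing pages.
 */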
3521 static int
3522 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
3523     int flags)
3524 {
3525
3526         for (int i = 0; i < count; i++) {
3527                 vm_page_t m = vm_page_alloc_contig(NULL, 0,
3528                     VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
3529                     VM_ALLOC_ZERO | VM_ALLOC_WAITOK, RADIX_PGD_SIZE/PAGE_SIZE,
3530                     0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
3531                     VM_MEMATTR_DEFAULT);
3532                 /* XXX zero on alloc here so we don't have to later */
3533                 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3534         }
3535         return (count);
3536 }
3537
3538 static void
3539 radix_pgd_release(void *arg __unused, void **store, int count)
3540 {
3541         vm_page_t m;
3542         struct spglist free;
3543         int page_count;
3544
3545         SLIST_INIT(&free);
3546         page_count = RADIX_PGD_SIZE/PAGE_SIZE;
3547
3548         for (int i = 0; i < count; i++) {
3549                 /*
3550                  * XXX selectively remove dmap and KVA entries so we don't
3551                  * need to bzero
3552                  */
3553                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
3554                 for (int j = page_count-1; j >= 0; j--) {
3555                         vm_page_unwire_noq(&m[j]);
3556                         SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
3557                 }
3558                 vm_page_free_pages_toq(&free, false);
3559         }
3560 }
3561
3562 static void
3563 mmu_radix_init(void)
3564 {
3565         vm_page_t mpte;
3566         vm_size_t s;
3567         int error, i, pv_npg;
3568
3569         /* XXX is this really needed for POWER? */
3570         /* L1TF, reserve page @0 unconditionally */
3571         vm_page_blacklist_add(0, bootverbose);
3572
3573         zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
3574                 RADIX_PGD_SIZE, NULL, NULL,
3575 #ifdef INVARIANTS
3576             trash_init, trash_fini,
3577 #else
3578             NULL, NULL,
3579 #endif
3580                 radix_pgd_import, radix_pgd_release,
3581                 NULL, UMA_ZONE_NOBUCKET);
3582
3583         /*
3584          * Initialize the vm page array entries for the kernel pmap's
3585          * page table pages.
3586          */
3587         PMAP_LOCK(kernel_pmap);
3588         for (i = 0; i < nkpt; i++) {
3589                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
3590                 KASSERT(mpte >= vm_page_array &&
3591                     mpte < &vm_page_array[vm_page_array_size],
3592                     ("pmap_init: page table page is out of range size: %lu",
3593                      vm_page_array_size));
3594                 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
3595                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
3596                 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
3597                 //pmap_insert_pt_page(kernel_pmap, mpte);
3598                 mpte->ref_count = 1;
3599         }
3600         PMAP_UNLOCK(kernel_pmap);
3601         vm_wire_add(nkpt);
3602
3603         CTR1(KTR_PMAP, "%s()", __func__);
3604         TAILQ_INIT(&pv_dummy.pv_list);
3605
3606         /*
3607          * Are large page mappings enabled?
3608          */
3609         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
3610         if (superpages_enabled) {
3611                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
3612                     ("pmap_init: can't assign to pagesizes[1]"));
3613                 pagesizes[1] = L3_PAGE_SIZE;
3614         }
3615
3616         /*
3617          * Initialize the pv chunk list mutex.
3618          */
3619         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
3620
3621         /*
3622          * Initialize the pool of pv list locks.
3623          */
3624         for (i = 0; i < NPV_LIST_LOCKS; i++)
3625                 rw_init(&pv_list_locks[i], "pmap pv list");
3626
3627         /*
3628          * Calculate the size of the pv head table for superpages.
3629          */
3630         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
3631
3632         /*
3633          * Allocate memory for the pv head table for superpages.
3634          */
3635         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
3636         s = round_page(s);
3637         pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
3638         for (i = 0; i < pv_npg; i++)
3639                 TAILQ_INIT(&pv_table[i].pv_list);
3640         TAILQ_INIT(&pv_dummy.pv_list);
3641
3642         pmap_initialized = 1;
3643         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
3644         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3645             (vmem_addr_t *)&qframe);
3646
3647         if (error != 0)
3648                 panic("qframe allocation failed");
3649         asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
3650             1, 1, M_WAITOK);
3651 }
3652
3653 static boolean_t
3654 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3655 {
3656         struct rwlock *lock;
3657         pv_entry_t pv;
3658         struct md_page *pvh;
3659         pt_entry_t *pte, mask;
3660         pmap_t pmap;
3661         int md_gen, pvh_gen;
3662         boolean_t rv;
3663
3664         rv = FALSE;
3665         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3666         rw_rlock(lock);
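        /*
         * The blocking lock order is pmap lock before pv list lock, so the
         * pmap lock is only tried opportunistically here.  If the trylock
         * fails, drop the list lock, block on the pmap lock, retake the
         * list lock, and restart whenever the generation count shows that
         * the pv list changed in the interim.
         */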
3667 restart:
3668         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
3669                 pmap = PV_PMAP(pv);
3670                 if (!PMAP_TRYLOCK(pmap)) {
3671                         md_gen = m->md.pv_gen;
3672                         rw_runlock(lock);
3673                         PMAP_LOCK(pmap);
3674                         rw_rlock(lock);
3675                         if (md_gen != m->md.pv_gen) {
3676                                 PMAP_UNLOCK(pmap);
3677                                 goto restart;
3678                         }
3679                 }
3680                 pte = pmap_pte(pmap, pv->pv_va);
3681                 mask = 0;
3682                 if (modified)
3683                         mask |= PG_RW | PG_M;
3684                 if (accessed)
3685                         mask |= PG_V | PG_A;
3686                 rv = (be64toh(*pte) & mask) == mask;
3687                 PMAP_UNLOCK(pmap);
3688                 if (rv)
3689                         goto out;
3690         }
3691         if ((m->flags & PG_FICTITIOUS) == 0) {
3692                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3693                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
3694                         pmap = PV_PMAP(pv);
3695                         if (!PMAP_TRYLOCK(pmap)) {
3696                                 md_gen = m->md.pv_gen;
3697                                 pvh_gen = pvh->pv_gen;
3698                                 rw_runlock(lock);
3699                                 PMAP_LOCK(pmap);
3700                                 rw_rlock(lock);
3701                                 if (md_gen != m->md.pv_gen ||
3702                                     pvh_gen != pvh->pv_gen) {
3703                                         PMAP_UNLOCK(pmap);
3704                                         goto restart;
3705                                 }
3706                         }
3707                         pte = pmap_pml3e(pmap, pv->pv_va);
3708                         mask = 0;
3709                         if (modified)
3710                                 mask |= PG_RW | PG_M;
3711                         if (accessed)
3712                                 mask |= PG_V | PG_A;
3713                         rv = (be64toh(*pte) & mask) == mask;
3714                         PMAP_UNLOCK(pmap);
3715                         if (rv)
3716                                 goto out;
3717                 }
3718         }
3719 out:
3720         rw_runlock(lock);
3721         return (rv);
3722 }
3723
3724 /*
3725  *      pmap_is_modified:
3726  *
3727  *      Return whether or not the specified physical page was modified
3728  *      in any physical maps.
3729  */
3730 boolean_t
3731 mmu_radix_is_modified(vm_page_t m)
3732 {
3733
3734         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3735             ("pmap_is_modified: page %p is not managed", m));
3736
3737         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3738         /*
3739          * If the page is not busied then this check is racy.
3740          */
3741         if (!pmap_page_is_write_mapped(m))
3742                 return (FALSE);
3743         return (pmap_page_test_mappings(m, FALSE, TRUE));
3744 }
3745
3746 boolean_t
3747 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3748 {
3749         pml3_entry_t *l3e;
3750         pt_entry_t *pte;
3751         boolean_t rv;
3752
3753         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
3754         rv = FALSE;
3755         PMAP_LOCK(pmap);
3756         l3e = pmap_pml3e(pmap, addr);
3757         if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) {
3758                 pte = pmap_l3e_to_pte(l3e, addr);
3759                 rv = (be64toh(*pte) & PG_V) == 0;
3760         }
3761         PMAP_UNLOCK(pmap);
3762         return (rv);
3763 }
3764
3765 boolean_t
3766 mmu_radix_is_referenced(vm_page_t m)
3767 {
3768         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3769             ("pmap_is_referenced: page %p is not managed", m));
3770         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3771         return (pmap_page_test_mappings(m, TRUE, FALSE));
3772 }
3773
3774 /*
3775  *      pmap_ts_referenced:
3776  *
3777  *      Return a count of reference bits for a page, clearing those bits.
3778  *      It is not necessary for every reference bit to be cleared, but it
3779  *      is necessary that 0 only be returned when there are truly no
3780  *      reference bits set.
3781  *
3782  *      As an optimization, update the page's dirty field if a modified bit is
3783  *      found while counting reference bits.  This opportunistic update can be
3784  *      performed at low cost and can eliminate the need for some future calls
3785  *      to pmap_is_modified().  However, since this function stops after
3786  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3787  *      dirty pages.  Those dirty pages will only be detected by a future call
3788  *      to pmap_is_modified().
3789  *
3790  *      A DI block is not needed within this function, because
3791  *      invalidations are performed before the PV list lock is
3792  *      released.
3793  */
3794 boolean_t
3795 mmu_radix_ts_referenced(vm_page_t m)
3796 {
3797         struct md_page *pvh;
3798         pv_entry_t pv, pvf;
3799         pmap_t pmap;
3800         struct rwlock *lock;
3801         pml3_entry_t oldl3e, *l3e;
3802         pt_entry_t *pte;
3803         vm_paddr_t pa;
3804         int cleared, md_gen, not_cleared, pvh_gen;
3805         struct spglist free;
3806
3807         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3808         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3809             ("pmap_ts_referenced: page %p is not managed", m));
3810         SLIST_INIT(&free);
3811         cleared = 0;
3812         pa = VM_PAGE_TO_PHYS(m);
3813         lock = PHYS_TO_PV_LIST_LOCK(pa);
3814         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
3815         rw_wlock(lock);
3816 retry:
3817         not_cleared = 0;
3818         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
3819                 goto small_mappings;
3820         pv = pvf;
3821         do {
3822                 if (pvf == NULL)
3823                         pvf = pv;
3824                 pmap = PV_PMAP(pv);
3825                 if (!PMAP_TRYLOCK(pmap)) {
3826                         pvh_gen = pvh->pv_gen;
3827                         rw_wunlock(lock);
3828                         PMAP_LOCK(pmap);
3829                         rw_wlock(lock);
3830                         if (pvh_gen != pvh->pv_gen) {
3831                                 PMAP_UNLOCK(pmap);
3832                                 goto retry;
3833                         }
3834                 }
3835                 l3e = pmap_pml3e(pmap, pv->pv_va);
3836                 oldl3e = be64toh(*l3e);
3837                 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3838                         /*
3839                          * Although "oldl3e" is mapping a 2MB page, because
3840                          * this function is called at a 4KB page granularity,
3841                          * we only update the 4KB page under test.
3842                          */
3843                         vm_page_dirty(m);
3844                 }
3845                 if ((oldl3e & PG_A) != 0) {
3846                         /*
3847                          * Since this reference bit is shared by 512 4KB
3848                          * pages, it should not be cleared every time it is
3849                          * tested.  Apply a simple "hash" function on the
3850                          * physical page number, the virtual superpage number,
3851                          * and the pmap address to select one 4KB page out of
3852                          * the 512 on which testing the reference bit will
3853                          * result in clearing that reference bit.  This
3854                          * function is designed to avoid the selection of the
3855                          * same 4KB page for every 2MB page mapping.
3856                          *
3857                          * On demotion, a mapping that hasn't been referenced
3858                          * is simply destroyed.  To avoid the possibility of a
3859                          * subsequent page fault on a demoted wired mapping,
3860                          * always leave its reference bit set.  Moreover,
3861                          * since the superpage is wired, the current state of
3862                          * its reference bit won't affect page replacement.
3863                          */
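                        /*
                         * Concretely, with NPTEPG == 512 the test below
                         * clears the superpage's reference bit only when the
                         * low nine bits of (pa >> PAGE_SHIFT) ^
                         * (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^ (uintptr_t)pmap
                         * are all zero, i.e. for exactly one of the 512 4KB
                         * pages backed by this 2MB mapping.
                         */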
3864                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
3865                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
3866                             (oldl3e & PG_W) == 0) {
3867                                 atomic_clear_long(l3e, htobe64(PG_A));
3868                                 pmap_invalidate_page(pmap, pv->pv_va);
3869                                 cleared++;
3870                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3871                                     ("inconsistent pv lock %p %p for page %p",
3872                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3873                         } else
3874                                 not_cleared++;
3875                 }
3876                 PMAP_UNLOCK(pmap);
3877                 /* Rotate the PV list if it has more than one entry. */
3878                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3879                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
3880                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
3881                         pvh->pv_gen++;
3882                 }
3883                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
3884                         goto out;
3885         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
3886 small_mappings:
3887         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3888                 goto out;
3889         pv = pvf;
3890         do {
3891                 if (pvf == NULL)
3892                         pvf = pv;
3893                 pmap = PV_PMAP(pv);
3894                 if (!PMAP_TRYLOCK(pmap)) {
3895                         pvh_gen = pvh->pv_gen;
3896                         md_gen = m->md.pv_gen;
3897                         rw_wunlock(lock);
3898                         PMAP_LOCK(pmap);
3899                         rw_wlock(lock);
3900                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3901                                 PMAP_UNLOCK(pmap);
3902                                 goto retry;
3903                         }
3904                 }
3905                 l3e = pmap_pml3e(pmap, pv->pv_va);
3906                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
3907                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
3908                     m));
3909                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
3910                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
3911                         vm_page_dirty(m);
3912                 if ((be64toh(*pte) & PG_A) != 0) {
3913                         atomic_clear_long(pte, htobe64(PG_A));
3914                         pmap_invalidate_page(pmap, pv->pv_va);
3915                         cleared++;
3916                 }
3917                 PMAP_UNLOCK(pmap);
3918                 /* Rotate the PV list if it has more than one entry. */
3919                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3920                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
3921                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3922                         m->md.pv_gen++;
3923                 }
3924         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
3925             not_cleared < PMAP_TS_REFERENCED_MAX);
3926 out:
3927         rw_wunlock(lock);
3928         vm_page_free_pages_toq(&free, true);
3929         return (cleared + not_cleared);
3930 }
3931
3932 static vm_offset_t
3933 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
3934     vm_paddr_t end, int prot __unused)
3935 {
3936
3937         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
3938                  prot);
3939         return (PHYS_TO_DMAP(start));
3940 }
3941
3942 void
3943 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
3944     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
3945 {
3946         pml3_entry_t *l3e;
3947         vm_paddr_t pa, ptepa;
3948         vm_page_t p, pdpg;
3949         vm_memattr_t ma;
3950
3951         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
3952             object, pindex, size);
3953         VM_OBJECT_ASSERT_WLOCKED(object);
3954         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3955                         ("pmap_object_init_pt: non-device object"));
3956         /* NB: size could be OR'ed with addr in the alignment check below */
3957         if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
3958                 if (!mmu_radix_ps_enabled(pmap))
3959                         return;
3960                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
3961                         return;
3962                 p = vm_page_lookup(object, pindex);
3963                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
3964                     ("pmap_object_init_pt: invalid page %p", p));
3965                 ma = p->md.mdpg_cache_attrs;
3966
3967                 /*
3968                  * Abort the mapping if the first page is not physically
3969                  * aligned to a 2MB page boundary.
3970                  */
3971                 ptepa = VM_PAGE_TO_PHYS(p);
3972                 if (ptepa & L3_PAGE_MASK)
3973                         return;
3974
3975                 /*
3976                  * Skip the first page.  Abort the mapping if the rest of
3977                  * the pages are not physically contiguous or have differing
3978                  * memory attributes.
3979                  */
3980                 p = TAILQ_NEXT(p, listq);
3981                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
3982                     pa += PAGE_SIZE) {
3983                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
3984                             ("pmap_object_init_pt: invalid page %p", p));
3985                         if (pa != VM_PAGE_TO_PHYS(p) ||
3986                             ma != p->md.mdpg_cache_attrs)
3987                                 return;
3988                         p = TAILQ_NEXT(p, listq);
3989                 }
3990
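                /*
                 * The range is physically contiguous, 2MB aligned, and has
                 * uniform memory attributes, so map it with 2MB leaf
                 * entries, skipping any slot whose L3 entry is already
                 * valid.
                 */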
3991                 PMAP_LOCK(pmap);
3992                 for (pa = ptepa | pmap_cache_bits(ma);
3993                     pa < ptepa + size; pa += L3_PAGE_SIZE) {
3994                         pdpg = pmap_allocl3e(pmap, addr, NULL);
3995                         if (pdpg == NULL) {
3996                                 /*
3997                                  * The creation of mappings below is only an
3998                                  * optimization.  If a page directory page
3999                                  * cannot be allocated without blocking,
4000                                  * continue on to the next mapping rather than
4001                                  * blocking.
4002                                  */
4003                                 addr += L3_PAGE_SIZE;
4004                                 continue;
4005                         }
4006                         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4007                         l3e = &l3e[pmap_pml3e_index(addr)];
4008                         if ((be64toh(*l3e) & PG_V) == 0) {
4009                                 pa |= PG_M | PG_A | PG_RW;
4010                                 pte_store(l3e, pa);
4011                                 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4012                                 atomic_add_long(&pmap_l3e_mappings, 1);
4013                         } else {
4014                                 /* Continue on if the PDE is already valid. */
4015                                 pdpg->ref_count--;
4016                                 KASSERT(pdpg->ref_count > 0,
4017                                     ("pmap_object_init_pt: missing reference "
4018                                     "to page directory page, va: 0x%lx", addr));
4019                         }
4020                         addr += L3_PAGE_SIZE;
4021                 }
4022                 ptesync();
4023                 PMAP_UNLOCK(pmap);
4024         }
4025 }
4026
4027 boolean_t
4028 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
4029 {
4030         struct md_page *pvh;
4031         struct rwlock *lock;
4032         pv_entry_t pv;
4033         int loops = 0;
4034         boolean_t rv;
4035
4036         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4037             ("pmap_page_exists_quick: page %p is not managed", m));
4038         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
4039         rv = FALSE;
4040         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4041         rw_rlock(lock);
4042         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4043                 if (PV_PMAP(pv) == pmap) {
4044                         rv = TRUE;
4045                         break;
4046                 }
4047                 loops++;
4048                 if (loops >= 16)
4049                         break;
4050         }
4051         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4052                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4053                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4054                         if (PV_PMAP(pv) == pmap) {
4055                                 rv = TRUE;
4056                                 break;
4057                         }
4058                         loops++;
4059                         if (loops >= 16)
4060                                 break;
4061                 }
4062         }
4063         rw_runlock(lock);
4064         return (rv);
4065 }
4066
4067 void
4068 mmu_radix_page_init(vm_page_t m)
4069 {
4070
4071         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4072         TAILQ_INIT(&m->md.pv_list);
4073         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
4074 }
4075
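/*
 * Count the wired mappings of page "m", including 2MB superpage
 * mappings.  Pmap locks are taken opportunistically; if the PV list
 * generation counts change while a lock is being acquired, the scan is
 * restarted.
 */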
4076 int
4077 mmu_radix_page_wired_mappings(vm_page_t m)
4078 {
4079         struct rwlock *lock;
4080         struct md_page *pvh;
4081         pmap_t pmap;
4082         pt_entry_t *pte;
4083         pv_entry_t pv;
4084         int count, md_gen, pvh_gen;
4085
4086         if ((m->oflags & VPO_UNMANAGED) != 0)
4087                 return (0);
4088         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4089         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4090         rw_rlock(lock);
4091 restart:
4092         count = 0;
4093         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4094                 pmap = PV_PMAP(pv);
4095                 if (!PMAP_TRYLOCK(pmap)) {
4096                         md_gen = m->md.pv_gen;
4097                         rw_runlock(lock);
4098                         PMAP_LOCK(pmap);
4099                         rw_rlock(lock);
4100                         if (md_gen != m->md.pv_gen) {
4101                                 PMAP_UNLOCK(pmap);
4102                                 goto restart;
4103                         }
4104                 }
4105                 pte = pmap_pte(pmap, pv->pv_va);
4106                 if ((be64toh(*pte) & PG_W) != 0)
4107                         count++;
4108                 PMAP_UNLOCK(pmap);
4109         }
4110         if ((m->flags & PG_FICTITIOUS) == 0) {
4111                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4112                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4113                         pmap = PV_PMAP(pv);
4114                         if (!PMAP_TRYLOCK(pmap)) {
4115                                 md_gen = m->md.pv_gen;
4116                                 pvh_gen = pvh->pv_gen;
4117                                 rw_runlock(lock);
4118                                 PMAP_LOCK(pmap);
4119                                 rw_rlock(lock);
4120                                 if (md_gen != m->md.pv_gen ||
4121                                     pvh_gen != pvh->pv_gen) {
4122                                         PMAP_UNLOCK(pmap);
4123                                         goto restart;
4124                                 }
4125                         }
4126                         pte = pmap_pml3e(pmap, pv->pv_va);
4127                         if ((be64toh(*pte) & PG_W) != 0)
4128                                 count++;
4129                         PMAP_UNLOCK(pmap);
4130                 }
4131         }
4132         rw_runlock(lock);
4133         return (count);
4134 }
4135
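/*
 * Publish the radix tree root for PID "pid" in the process table.  The
 * entry below packs RTS_SIZE, the physical address of the level-1 table,
 * and RADIX_PGD_INDEX_SHIFT into a single big-endian doubleword; see the
 * architecture's process-table entry format for the precise field layout.
 */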
4136 static void
4137 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
4138 {
4139         isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT);
4140 }
4141
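/*
 * Initialize a user pmap: allocate and zero its level-1 page directory,
 * reserve a PID from the ASID arena, and install the new tree root in
 * the process table.
 */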
4142 int
4143 mmu_radix_pinit(pmap_t pmap)
4144 {
4145         vmem_addr_t pid;
4146         vm_paddr_t l1pa;
4147
4148         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4149
4150         /*
4151          * allocate the page directory page
4152          */
4153         pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
4154
4155         for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++)
4156                 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
4157         pmap->pm_radix.rt_root = 0;
4158         TAILQ_INIT(&pmap->pm_pvchunk);
4159         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4160         pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4161         vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
4162
4163         pmap->pm_pid = pid;
4164         l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
4165         mmu_radix_update_proctab(pid, l1pa);
4166         __asm __volatile("ptesync;isync" : : : "memory");
4167
4168         return (1);
4169 }
4170
4171 /*
4172  * This routine is called if the desired page table page does not exist.
4173  *
4174  * If page table page allocation fails, this routine may sleep before
4175  * returning NULL.  It sleeps only if a lock pointer was given.
4176  *
4177  * Note: If a page allocation fails at page table level two or three,
4178  * one or two pages may be held during the wait, only to be released
4179  * afterwards.  This conservative approach makes it easy to argue
4180  * that no race conditions can occur.
4181  */
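/*
 * "ptepindex" encodes which kind of page table page is wanted: indices
 * below NUPDE name leaf PTE pages (wired into an l3e), indices in
 * [NUPDE, NUPDE + NUPDPE) name page directory pages (wired into an
 * l2e), and larger indices name page directory pointer pages (wired
 * into an l1e), matching the three cases handled below.
 */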
4182 static vm_page_t
4183 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
4184 {
4185         vm_page_t m, pdppg, pdpg;
4186
4187         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4188
4189         /*
4190          * Allocate a page table page.
4191          */
4192         if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
4193             VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
4194                 if (lockp != NULL) {
4195                         RELEASE_PV_LIST_LOCK(lockp);
4196                         PMAP_UNLOCK(pmap);
4197                         vm_wait(NULL);
4198                         PMAP_LOCK(pmap);
4199                 }
4200                 /*
4201                  * Indicate the need to retry.  While waiting, the page table
4202                  * page may have been allocated.
4203                  */
4204                 return (NULL);
4205         }
4206         if ((m->flags & PG_ZERO) == 0)
4207                 mmu_radix_zero_page(m);
4208
4209         /*
4210          * Map the pagetable page into the process address space, if
4211          * it isn't already there.
4212          */
4213
4214         if (ptepindex >= (NUPDE + NUPDPE)) {
4215                 pml1_entry_t *l1e;
4216                 vm_pindex_t pml1index;
4217
4218                 /* Wire up a new PDPE page */
4219                 pml1index = ptepindex - (NUPDE + NUPDPE);
4220                 l1e = &pmap->pm_pml1[pml1index];
4221                 pde_store(l1e, VM_PAGE_TO_PHYS(m));
4222
4223         } else if (ptepindex >= NUPDE) {
4224                 vm_pindex_t pml1index;
4225                 vm_pindex_t pdpindex;
4226                 pml1_entry_t *l1e;
4227                 pml2_entry_t *l2e;
4228
4229                 /* Wire up a new l2e page */
4230                 pdpindex = ptepindex - NUPDE;
4231                 pml1index = pdpindex >> RPTE_SHIFT;
4232
4233                 l1e = &pmap->pm_pml1[pml1index];
4234                 if ((be64toh(*l1e) & PG_V) == 0) {
4235                         /* Have to allocate a new pdp, recurse */
4236                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
4237                                 lockp) == NULL) {
4238                                 vm_page_unwire_noq(m);
4239                                 vm_page_free_zero(m);
4240                                 return (NULL);
4241                         }
4242                 } else {
4243                         /* Add reference to l2e page */
4244                         pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
4245                         pdppg->ref_count++;
4246                 }
4247                 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4248
4249                 /* Now find the pdp page */
4250                 l2e = &l2e[pdpindex & RPTE_MASK];
4251                 pde_store(l2e, VM_PAGE_TO_PHYS(m));
4252
4253         } else {
4254                 vm_pindex_t pml1index;
4255                 vm_pindex_t pdpindex;
4256                 pml1_entry_t *l1e;
4257                 pml2_entry_t *l2e;
4258                 pml3_entry_t *l3e;
4259
4260                 /* Wire up a new PTE page */
4261                 pdpindex = ptepindex >> RPTE_SHIFT;
4262                 pml1index = pdpindex >> RPTE_SHIFT;
4263
4264                 /* First, find the pdp and check that it's valid. */
4265                 l1e = &pmap->pm_pml1[pml1index];
4266                 if ((be64toh(*l1e) & PG_V) == 0) {
4267                         /* Have to allocate a new pd, recurse */
4268                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4269                             lockp) == NULL) {
4270                                 vm_page_unwire_noq(m);
4271                                 vm_page_free_zero(m);
4272                                 return (NULL);
4273                         }
4274                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4275                         l2e = &l2e[pdpindex & RPTE_MASK];
4276                 } else {
4277                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4278                         l2e = &l2e[pdpindex & RPTE_MASK];
4279                         if ((be64toh(*l2e) & PG_V) == 0) {
4280                                 /* Have to allocate a new pd, recurse */
4281                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4282                                     lockp) == NULL) {
4283                                         vm_page_unwire_noq(m);
4284                                         vm_page_free_zero(m);
4285                                         return (NULL);
4286                                 }
4287                         } else {
4288                                 /* Add reference to the pd page */
4289                                 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
4290                                 pdpg->ref_count++;
4291                         }
4292                 }
4293                 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
4294
4295                 /* Now we know where the page directory page is */
4296                 l3e = &l3e[ptepindex & RPTE_MASK];
4297                 pde_store(l3e, VM_PAGE_TO_PHYS(m));
4298         }
4299
4300         pmap_resident_count_inc(pmap, 1);
4301         return (m);
4302 }
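
/*
 * Return the page directory page containing the l3e for "va",
 * allocating it (possibly recursing through _pmap_allocpte()) if it is
 * not already present.  A NULL "lockp" disables sleeping and retrying.
 */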
4303 static vm_page_t
4304 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4305 {
4306         vm_pindex_t pdpindex, ptepindex;
4307         pml2_entry_t *pdpe;
4308         vm_page_t pdpg;
4309
4310 retry:
4311         pdpe = pmap_pml2e(pmap, va);
4312         if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) {
4313                 /* Add a reference to the pd page. */
4314                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME);
4315                 pdpg->ref_count++;
4316         } else {
4317                 /* Allocate a pd page. */
4318                 ptepindex = pmap_l3e_pindex(va);
4319                 pdpindex = ptepindex >> RPTE_SHIFT;
4320                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
4321                 if (pdpg == NULL && lockp != NULL)
4322                         goto retry;
4323         }
4324         return (pdpg);
4325 }
4326
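/*
 * Return the page table page that will map "va", demoting an existing
 * 2MB (RPTE_LEAF) mapping first if one is present and allocating a new
 * page table page if necessary.
 */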
4327 static vm_page_t
4328 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4329 {
4330         vm_pindex_t ptepindex;
4331         pml3_entry_t *pd;
4332         vm_page_t m;
4333
4334         /*
4335          * Calculate pagetable page index
4336          */
4337         ptepindex = pmap_l3e_pindex(va);
4338 retry:
4339         /*
4340          * Get the page directory entry
4341          */
4342         pd = pmap_pml3e(pmap, va);
4343
4344         /*
4345          * This supports switching from a 2MB page to a
4346          * normal 4K page.
4347          */
4348         if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
4349                 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
4350                         /*
4351                          * Invalidation of the 2MB page mapping may have caused
4352                          * the deallocation of the underlying PD page.
4353                          */
4354                         pd = NULL;
4355                 }
4356         }
4357
4358         /*
4359          * If the page table page is mapped, we just increment the
4360          * hold count, and activate it.
4361          */
4362         if (pd != NULL && (be64toh(*pd) & PG_V) != 0) {
4363                 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME);
4364                 m->ref_count++;
4365         } else {
4366                 /*
4367                  * Here if the pte page isn't mapped, or if it has been
4368                  * deallocated.
4369                  */
4370                 m = _pmap_allocpte(pmap, ptepindex, lockp);
4371                 if (m == NULL && lockp != NULL)
4372                         goto retry;
4373         }
4374         return (m);
4375 }
4376
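/*
 * Initialize the pmap for process 0, which shares the kernel pmap's
 * level-1 table and PID instead of allocating its own.
 */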
4377 static void
4378 mmu_radix_pinit0(pmap_t pmap)
4379 {
4380
4381         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4382         PMAP_LOCK_INIT(pmap);
4383         pmap->pm_pml1 = kernel_pmap->pm_pml1;
4384         pmap->pm_pid = kernel_pmap->pm_pid;
4385
4386         pmap->pm_radix.rt_root = 0;
4387         TAILQ_INIT(&pmap->pm_pvchunk);
4388         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4389         kernel_pmap->pm_flags =
4390                 pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4391 }
4392 /*
4393  * pmap_protect_l3e: apply the requested protection to a 2MB page mapping in a process
4394  */
4395 static boolean_t
4396 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
4397 {
4398         pt_entry_t newpde, oldpde;
4399         vm_offset_t eva, va;
4400         vm_page_t m;
4401         boolean_t anychanged;
4402
4403         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4404         KASSERT((sva & L3_PAGE_MASK) == 0,
4405             ("pmap_protect_l3e: sva is not 2mpage aligned"));
4406         anychanged = FALSE;
4407 retry:
4408         oldpde = newpde = be64toh(*l3e);
4409         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4410             (PG_MANAGED | PG_M | PG_RW)) {
4411                 eva = sva + L3_PAGE_SIZE;
4412                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4413                     va < eva; va += PAGE_SIZE, m++)
4414                         vm_page_dirty(m);
4415         }
4416         if ((prot & VM_PROT_WRITE) == 0) {
4417                 newpde &= ~(PG_RW | PG_M);
4418                 newpde |= RPTE_EAA_R;
4419         }
4420         if (prot & VM_PROT_EXECUTE)
4421                 newpde |= PG_X;
4422         if (newpde != oldpde) {
4423                 /*
4424                  * As an optimization to future operations on this PDE, clear
4425                  * PG_PROMOTED.  The impending invalidation will remove any
4426                  * lingering 4KB page mappings from the TLB.
4427                  */
4428                 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED)))
4429                         goto retry;
4430                 anychanged = TRUE;
4431         }
4432         return (anychanged);
4433 }
4434
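/*
 * Restrict the permissions on the range [sva, eva): write access is
 * removed when not requested (dirtying modified pages first) and execute
 * access may be granted; mappings are never made more writable here.
 * 2MB mappings are either adjusted whole or demoted and handled as 4KB
 * pages.
 */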
4435 void
4436 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4437     vm_prot_t prot)
4438 {
4439         vm_offset_t va_next;
4440         pml1_entry_t *l1e;
4441         pml2_entry_t *l2e;
4442         pml3_entry_t ptpaddr, *l3e;
4443         pt_entry_t *pte;
4444         boolean_t anychanged;
4445
4446         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
4447             prot);
4448
4449         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4450         if (prot == VM_PROT_NONE) {
4451                 mmu_radix_remove(pmap, sva, eva);
4452                 return;
4453         }
4454
4455         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4456             (VM_PROT_WRITE|VM_PROT_EXECUTE))
4457                 return;
4458
4459 #ifdef INVARIANTS
4460         if (VERBOSE_PROTECT || pmap_logging)
4461                 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
4462                            pmap, sva, eva, prot, pmap->pm_pid);
4463 #endif
4464         anychanged = FALSE;
4465
4466         PMAP_LOCK(pmap);
4467         for (; sva < eva; sva = va_next) {
4468                 l1e = pmap_pml1e(pmap, sva);
4469                 if ((be64toh(*l1e) & PG_V) == 0) {
4470                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
4471                         if (va_next < sva)
4472                                 va_next = eva;
4473                         continue;
4474                 }
4475
4476                 l2e = pmap_l1e_to_l2e(l1e, sva);
4477                 if ((be64toh(*l2e) & PG_V) == 0) {
4478                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
4479                         if (va_next < sva)
4480                                 va_next = eva;
4481                         continue;
4482                 }
4483
4484                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
4485                 if (va_next < sva)
4486                         va_next = eva;
4487
4488                 l3e = pmap_l2e_to_l3e(l2e, sva);
4489                 ptpaddr = be64toh(*l3e);
4490
4491                 /*
4492                  * Weed out invalid mappings.
4493                  */
4494                 if (ptpaddr == 0)
4495                         continue;
4496
4497                 /*
4498                  * Check for large page.
4499                  */
4500                 if ((ptpaddr & RPTE_LEAF) != 0) {
4501                         /*
4502                          * Are we protecting the entire large page?  If not,
4503                          * demote the mapping and fall through.
4504                          */
4505                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
4506                                 if (pmap_protect_l3e(pmap, l3e, sva, prot))
4507                                         anychanged = TRUE;
4508                                 continue;
4509                         } else if (!pmap_demote_l3e(pmap, l3e, sva)) {
4510                                 /*
4511                                  * The large page mapping was destroyed.
4512                                  */
4513                                 continue;
4514                         }
4515                 }
4516
4517                 if (va_next > eva)
4518                         va_next = eva;
4519
4520                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
4521                     sva += PAGE_SIZE) {
4522                         pt_entry_t obits, pbits;
4523                         vm_page_t m;
4524
4525 retry:
4526                         MPASS(pte == pmap_pte(pmap, sva));
4527                         obits = pbits = be64toh(*pte);
4528                         if ((pbits & PG_V) == 0)
4529                                 continue;
4530
4531                         if ((prot & VM_PROT_WRITE) == 0) {
4532                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4533                                     (PG_MANAGED | PG_M | PG_RW)) {
4534                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4535                                         vm_page_dirty(m);
4536                                 }
4537                                 pbits &= ~(PG_RW | PG_M);
4538                                 pbits |= RPTE_EAA_R;
4539                         }
4540                         if (prot & VM_PROT_EXECUTE)
4541                                 pbits |= PG_X;
4542
4543                         if (pbits != obits) {
4544                                 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
4545                                         goto retry;
4546                                 if (obits & (PG_A|PG_M)) {
4547                                         anychanged = TRUE;
4548 #ifdef INVARIANTS
4549                                         if (VERBOSE_PROTECT || pmap_logging)
4550                                                 printf("%#lx %#lx -> %#lx\n",
4551                                                     sva, obits, pbits);
4552 #endif
4553                                 }
4554                         }
4555                 }
4556         }
4557         if (anychanged)
4558                 pmap_invalidate_all(pmap);
4559         PMAP_UNLOCK(pmap);
4560 }
4561
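/*
 * Map "count" physical pages into contiguous kernel virtual address
 * space starting at "sva".  Any existing valid mappings in the range are
 * replaced and the corresponding TLB entries invalidated.
 */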
4562 void
4563 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4564 {
4565         pt_entry_t oldpte, pa, *pte;
4566         vm_page_t m;
4567         uint64_t cache_bits, attr_bits;
4568         vm_offset_t va;
4569
4570         CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
4571
4572         oldpte = 0;
4573         attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
4574         va = sva;
4575         pte = kvtopte(va);
4576         while (va < sva + PAGE_SIZE * count) {
4577                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4578                         pte = kvtopte(va);
4579                 MPASS(pte == pmap_pte(kernel_pmap, va));
4580
4581                 /*
4582                  * XXX there has to be a more efficient way than traversing
4583                  * the page table every time - but go for correctness for
4584                  * today
4585                  */
4586
4587                 m = *ma++;
4588                 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
4589                 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
4590                 if (be64toh(*pte) != pa) {
4591                         oldpte |= be64toh(*pte);
4592                         pte_store(pte, pa);
4593                 }
4594                 va += PAGE_SIZE;
4595                 pte++;
4596         }
4597         if (__predict_false((oldpte & RPTE_VALID) != 0))
4598                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
4599                     PAGE_SIZE);
4600         else
4601                 ptesync();
4602 }
4603
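/*
 * Tear down "count" kernel mappings starting at "sva", typically those
 * created by mmu_radix_qenter(), and invalidate the range.
 */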
4604 void
4605 mmu_radix_qremove(vm_offset_t sva, int count)
4606 {
4607         vm_offset_t va;
4608         pt_entry_t *pte;
4609
4610         CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
4611         KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
4612
4613         va = sva;
4614         pte = kvtopte(va);
4615         while (va < sva + PAGE_SIZE * count) {
4616                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4617                         pte = kvtopte(va);
4618                 pte_clear(pte);
4619                 pte++;
4620                 va += PAGE_SIZE;
4621         }
4622         pmap_invalidate_range(kernel_pmap, sva, va);
4623 }
4624
4625 /***************************************************
4626  * Page table page management routines.....
4627  ***************************************************/
4628 /*
4629  * Schedule the specified unused page table page to be freed.  Specifically,
4630  * add the page to the specified list of pages that will be released to the
4631  * physical memory manager after the TLB has been updated.
4632  */
4633 static __inline void
4634 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
4635     boolean_t set_PG_ZERO)
4636 {
4637
4638         if (set_PG_ZERO)
4639                 m->flags |= PG_ZERO;
4640         else
4641                 m->flags &= ~PG_ZERO;
4642         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4643 }
4644
4645 /*
4646  * Inserts the specified page table page into the specified pmap's collection
4647  * of idle page table pages.  Each of a pmap's page table pages is responsible
4648  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4649  * ordered by this virtual address range.
4650  */
4651 static __inline int
4652 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
4653 {
4654
4655         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4656         return (vm_radix_insert(&pmap->pm_radix, mpte));
4657 }
4658
4659 /*
4660  * Removes the page table page mapping the specified virtual address from the
4661  * specified pmap's collection of idle page table pages, and returns it.
4662  * Otherwise, returns NULL if there is no page table page corresponding to the
4663  * specified virtual address.
4664  */
4665 static __inline vm_page_t
4666 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4667 {
4668
4669         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4670         return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
4671 }
4672
4673 /*
4674  * Decrements a page table page's wire count, which is used to record the
4675  * number of valid page table entries within the page.  If the wire count
4676  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
4677  * page table page was unmapped and FALSE otherwise.
4678  */
4679 static inline boolean_t
4680 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4681 {
4682
4683         --m->ref_count;
4684         if (m->ref_count == 0) {
4685                 _pmap_unwire_ptp(pmap, va, m, free);
4686                 return (TRUE);
4687         } else
4688                 return (FALSE);
4689 }
4690
4691 static void
4692 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4693 {
4694
4695         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4696         /*
4697          * unmap the page table page
4698          */
4699         if (m->pindex >= (NUPDE + NUPDPE)) {
4700                 /* PDP page */
4701                 pml1_entry_t *pml1;
4702                 pml1 = pmap_pml1e(pmap, va);
4703                 *pml1 = 0;
4704         } else if (m->pindex >= NUPDE) {
4705                 /* PD page */
4706                 pml2_entry_t *l2e;
4707                 l2e = pmap_pml2e(pmap, va);
4708                 *l2e = 0;
4709         } else {
4710                 /* PTE page */
4711                 pml3_entry_t *l3e;
4712                 l3e = pmap_pml3e(pmap, va);
4713                 *l3e = 0;
4714         }
4715         pmap_resident_count_dec(pmap, 1);
4716         if (m->pindex < NUPDE) {
4717                 /* We just released a PT, unhold the matching PD */
4718                 vm_page_t pdpg;
4719
4720                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
4721                 pmap_unwire_ptp(pmap, va, pdpg, free);
4722         }
4723         if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
4724                 /* We just released a PD, unhold the matching PDP */
4725                 vm_page_t pdppg;
4726
4727                 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
4728                 pmap_unwire_ptp(pmap, va, pdppg, free);
4729         }
4730
4731         /*
4732          * Put page on a list so that it is released after
4733          * *ALL* TLB shootdown is done
4734          */
4735         pmap_add_delayed_free_list(m, free, TRUE);
4736 }
4737
4738 /*
4739  * After removing a page table entry, this routine is used to
4740  * conditionally free the page, and manage the hold/wire counts.
4741  */
4742 static int
4743 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
4744     struct spglist *free)
4745 {
4746         vm_page_t mpte;
4747
4748         if (va >= VM_MAXUSER_ADDRESS)
4749                 return (0);
4750         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4751         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4752         return (pmap_unwire_ptp(pmap, va, mpte, free));
4753 }
4754
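/*
 * Release the resources of a pmap that no longer contains any mappings:
 * clear its process table entry, return the level-1 table to its zone,
 * and free the PID back to the ASID arena.
 */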
4755 void
4756 mmu_radix_release(pmap_t pmap)
4757 {
4758
4759         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4760         KASSERT(pmap->pm_stats.resident_count == 0,
4761             ("pmap_release: pmap resident count %ld != 0",
4762             pmap->pm_stats.resident_count));
4763         KASSERT(vm_radix_is_empty(&pmap->pm_radix),
4764             ("pmap_release: pmap has reserved page table page(s)"));
4765
4766         pmap_invalidate_all(pmap);
4767         isa3_proctab[pmap->pm_pid].proctab0 = 0;
4768         uma_zfree(zone_radix_pgd, pmap->pm_pml1);
4769         vmem_free(asid_arena, pmap->pm_pid, 1);
4770 }
4771
4772 /*
4773  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
4774  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
4775  * false if the PV entry cannot be allocated without resorting to reclamation.
4776  */
4777 static bool
4778 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
4779     struct rwlock **lockp)
4780 {
4781         struct md_page *pvh;
4782         pv_entry_t pv;
4783         vm_paddr_t pa;
4784
4785         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4786         /* Pass NULL instead of the lock pointer to disable reclamation. */
4787         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4788             NULL : lockp)) == NULL)
4789                 return (false);
4790         pv->pv_va = va;
4791         pa = pde & PG_PS_FRAME;
4792         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4793         pvh = pa_to_pvh(pa);
4794         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
4795         pvh->pv_gen++;
4796         return (true);
4797 }
4798
4799 /*
4800  * Fills a page table page with mappings to consecutive physical pages.
4801  */
4802 static void
4803 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4804 {
4805         pt_entry_t *pte;
4806
4807         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4808                 *pte = htobe64(newpte);
4809                 newpte += PAGE_SIZE;
4810         }
4811 }
4812
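/*
 * Demote a 2MB (l3e) mapping into its constituent 4KB page mappings.
 * This wrapper only manages the PV list lock around
 * pmap_demote_l3e_locked().
 */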
4813 static boolean_t
4814 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
4815 {
4816         struct rwlock *lock;
4817         boolean_t rv;
4818
4819         lock = NULL;
4820         rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
4821         if (lock != NULL)
4822                 rw_wunlock(lock);
4823         return (rv);
4824 }
4825
4826 static boolean_t
4827 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
4828     struct rwlock **lockp)
4829 {
4830         pml3_entry_t oldpde;
4831         pt_entry_t *firstpte;
4832         vm_paddr_t mptepa;
4833         vm_page_t mpte;
4834         struct spglist free;
4835         vm_offset_t sva;
4836
4837         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4838         oldpde = be64toh(*l3e);
4839         KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
4840             ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
4841             oldpde));
4842         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4843             NULL) {
4844                 KASSERT((oldpde & PG_W) == 0,
4845                     ("pmap_demote_l3e: page table page for a wired mapping"
4846                     " is missing"));
4847
4848                 /*
4849                  * Invalidate the 2MB page mapping and return "failure" if the
4850                  * mapping was never accessed or the allocation of the new
4851                  * page table page fails.  If the 2MB page mapping belongs to
4852                  * the direct map region of the kernel's address space, then
4853                  * the page allocation request specifies the highest possible
4854                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4855                  * normal.  Page table pages are preallocated for every other
4856                  * part of the kernel address space, so the direct map region
4857                  * is the only part of the kernel address space that must be
4858                  * handled here.
4859                  */
4860                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
4861                     pmap_l3e_pindex(va), (va >= DMAP_MIN_ADDRESS && va <
4862                     DMAP_MAX_ADDRESS ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
4863                     VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
4864                         SLIST_INIT(&free);
4865                         sva = trunc_2mpage(va);
4866                         pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
4867                         pmap_invalidate_l3e_page(pmap, sva, oldpde);
4868                         vm_page_free_pages_toq(&free, true);
4869                         CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
4870                             " in pmap %p", va, pmap);
4871                         return (FALSE);
4872                 }
4873                 if (va < VM_MAXUSER_ADDRESS)
4874                         pmap_resident_count_inc(pmap, 1);
4875         }
4876         mptepa = VM_PAGE_TO_PHYS(mpte);
4877         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4878         KASSERT((oldpde & PG_A) != 0,
4879             ("pmap_demote_l3e: oldpde is missing PG_A"));
4880         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4881             ("pmap_demote_l3e: oldpde is missing PG_M"));
4882
4883         /*
4884          * If the page table page is new, initialize it.
4885          */
4886         if (mpte->ref_count == 1) {
4887                 mpte->ref_count = NPTEPG;
4888                 pmap_fill_ptp(firstpte, oldpde);
4889         }
4890
4891         KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
4892             ("pmap_demote_l3e: firstpte and newpte map different physical"
4893             " addresses"));
4894
4895         /*
4896          * If the mapping has changed attributes, update the page table
4897          * entries.
4898          */
4899         if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
4900                 pmap_fill_ptp(firstpte, oldpde);
4901
4902         /*
4903          * The spare PV entries must be reserved prior to demoting the
4904          * mapping, that is, prior to changing the PDE.  Otherwise, the state
4905          * of the PDE and the PV lists will be inconsistent, which can result
4906          * in reclaim_pv_chunk() attempting to remove a PV entry from the
4907          * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
4908          * PV entry for the 2MB page mapping that is being demoted.
4909          */
4910         if ((oldpde & PG_MANAGED) != 0)
4911                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4912
4913         /*
4914          * Demote the mapping.  This pmap is locked.  The old PDE has
4915          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
4916          * set.  Thus, there is no danger of a race with another
4917          * processor changing the setting of PG_A and/or PG_M between
4918          * the read above and the store below.
4919          */
4920         pde_store(l3e, mptepa);
4921         ptesync();
4922         /*
4923          * Demote the PV entry.
4924          */
4925         if ((oldpde & PG_MANAGED) != 0)
4926                 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
4927
4928         atomic_add_long(&pmap_l3e_demotions, 1);
4929         CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
4930             " in pmap %p", va, pmap);
4931         return (TRUE);
4932 }
4933
4934 /*
4935  * pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
4936  */
4937 static void
4938 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
4939 {
4940         vm_paddr_t mptepa;
4941         vm_page_t mpte;
4942
4943         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
4944         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4945         mpte = pmap_remove_pt_page(pmap, va);
4946         if (mpte == NULL)
4947                 panic("pmap_remove_kernel_pde: Missing pt page.");
4948
4949         mptepa = VM_PAGE_TO_PHYS(mpte);
4950
4951         /*
4952          * Initialize the page table page.
4953          */
4954         pagezero(PHYS_TO_DMAP(mptepa));
4955
4956         /*
4957          * Demote the mapping.
4958          */
4959         pde_store(l3e, mptepa);
4960         ptesync();
4961 }
4962
4963 /*
4964  * pmap_remove_l3e: unmap a 2MB superpage from a process address space
4965  */
4966 static int
4967 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
4968     struct spglist *free, struct rwlock **lockp)
4969 {
4970         struct md_page *pvh;
4971         pml3_entry_t oldpde;
4972         vm_offset_t eva, va;
4973         vm_page_t m, mpte;
4974
4975         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4976         KASSERT((sva & L3_PAGE_MASK) == 0,
4977             ("pmap_remove_l3e: sva is not 2mpage aligned"));
4978         oldpde = be64toh(pte_load_clear(pdq));
4979         if (oldpde & PG_W)
4980                 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
4981         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4982         if (oldpde & PG_MANAGED) {
4983                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
4984                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
4985                 pmap_pvh_free(pvh, pmap, sva);
4986                 eva = sva + L3_PAGE_SIZE;
4987                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4988                     va < eva; va += PAGE_SIZE, m++) {
4989                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
4990                                 vm_page_dirty(m);
4991                         if (oldpde & PG_A)
4992                                 vm_page_aflag_set(m, PGA_REFERENCED);
4993                         if (TAILQ_EMPTY(&m->md.pv_list) &&
4994                             TAILQ_EMPTY(&pvh->pv_list))
4995                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
4996                 }
4997         }
4998         if (pmap == kernel_pmap) {
4999                 pmap_remove_kernel_l3e(pmap, pdq, sva);
5000         } else {
5001                 mpte = pmap_remove_pt_page(pmap, sva);
5002                 if (mpte != NULL) {
5003                         pmap_resident_count_dec(pmap, 1);
5004                         KASSERT(mpte->ref_count == NPTEPG,
5005                             ("pmap_remove_l3e: pte page wire count error"));
5006                         mpte->ref_count = 0;
5007                         pmap_add_delayed_free_list(mpte, free, FALSE);
5008                 }
5009         }
5010         return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
5011 }
5012
5013 /*
5014  * pmap_remove_pte: unmap a single 4KB page from a process address space
5015  */
5016 static int
5017 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
5018     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5019 {
5020         struct md_page *pvh;
5021         pt_entry_t oldpte;
5022         vm_page_t m;
5023
5024         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5025         oldpte = be64toh(pte_load_clear(ptq));
5026         if (oldpte & RPTE_WIRED)
5027                 pmap->pm_stats.wired_count -= 1;
5028         pmap_resident_count_dec(pmap, 1);
5029         if (oldpte & RPTE_MANAGED) {
5030                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5031                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5032                         vm_page_dirty(m);
5033                 if (oldpte & PG_A)
5034                         vm_page_aflag_set(m, PGA_REFERENCED);
5035                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5036                 pmap_pvh_free(&m->md, pmap, va);
5037                 if (TAILQ_EMPTY(&m->md.pv_list) &&
5038                     (m->flags & PG_FICTITIOUS) == 0) {
5039                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5040                         if (TAILQ_EMPTY(&pvh->pv_list))
5041                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
5042                 }
5043         }
5044         return (pmap_unuse_pt(pmap, va, ptepde, free));
5045 }
5046
5047 /*
5048  * Remove a single page from a process address space
5049  */
5050 static bool
5051 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
5052     struct spglist *free)
5053 {
5054         struct rwlock *lock;
5055         pt_entry_t *pte;
5056         bool invalidate_all;
5057
5058         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5059         if ((be64toh(*l3e) & RPTE_VALID) == 0) {
5060                 return (false);
5061         }
5062         pte = pmap_l3e_to_pte(l3e, va);
5063         if ((be64toh(*pte) & RPTE_VALID) == 0) {
5064                 return (false);
5065         }
5066         lock = NULL;
5067
5068         invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock);
5069         if (lock != NULL)
5070                 rw_wunlock(lock);
5071         if (!invalidate_all)
5072                 pmap_invalidate_page(pmap, va);
5073         return (invalidate_all);
5074 }
5075
5076 /*
5077  * Removes the specified range of addresses from the page table page.
5078  */
5079 static bool
5080 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5081     pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
5082 {
5083         pt_entry_t *pte;
5084         vm_offset_t va;
5085         bool anyvalid;
5086
5087         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5088         anyvalid = false;
5089         va = eva;
5090         for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
5091             sva += PAGE_SIZE) {
5092                 MPASS(pte == pmap_pte(pmap, sva));
5093                 if (*pte == 0) {
5094                         if (va != eva) {
5095                                 anyvalid = true;
5096                                 va = eva;
5097                         }
5098                         continue;
5099                 }
5100                 if (va == eva)
5101                         va = sva;
5102                 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
5103                         anyvalid = true;
5104                         sva += PAGE_SIZE;
5105                         break;
5106                 }
5107         }
5108         if (anyvalid)
5109                 pmap_invalidate_all(pmap);
5110         else if (va != eva)
5111                 pmap_invalidate_range(pmap, va, sva);
5112         return (anyvalid);
5113 }
5114
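/*
 * Remove the given range of addresses from the specified pmap, freeing
 * page table pages that become empty and invalidating the affected TLB
 * entries, batched where possible.
 */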
5115 void
5116 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5117 {
5118         struct rwlock *lock;
5119         vm_offset_t va_next;
5120         pml1_entry_t *l1e;
5121         pml2_entry_t *l2e;
5122         pml3_entry_t ptpaddr, *l3e;
5123         struct spglist free;
5124         bool anyvalid;
5125
5126         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5127
5128         /*
5129          * Perform an unsynchronized read.  This is, however, safe.
5130          */
5131         if (pmap->pm_stats.resident_count == 0)
5132                 return;
5133
5134         anyvalid = false;
5135         SLIST_INIT(&free);
5136
5137         /* XXX something fishy here */
5138         sva = (sva + PAGE_MASK) & ~PAGE_MASK;
5139         eva = (eva + PAGE_MASK) & ~PAGE_MASK;
5140
5141         PMAP_LOCK(pmap);
5142
5143         /*
5144          * Special handling for removing a single page: this is a very
5145          * common operation, so it is worth short-circuiting the general
5146          * loop below.
5147          */
5148         if (sva + PAGE_SIZE == eva) {
5149                 l3e = pmap_pml3e(pmap, sva);
5150                 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
5151                         anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
5152                         goto out;
5153                 }
5154         }
5155
5156         lock = NULL;
5157         for (; sva < eva; sva = va_next) {
5158                 if (pmap->pm_stats.resident_count == 0)
5159                         break;
5160                 l1e = pmap_pml1e(pmap, sva);
5161                 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
5162                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5163                         if (va_next < sva)
5164                                 va_next = eva;
5165                         continue;
5166                 }
5167
5168                 l2e = pmap_l1e_to_l2e(l1e, sva);
5169                 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
5170                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5171                         if (va_next < sva)
5172                                 va_next = eva;
5173                         continue;
5174                 }
5175
5176                 /*
5177                  * Calculate index for next page table.
5178                  */
5179                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5180                 if (va_next < sva)
5181                         va_next = eva;
5182
5183                 l3e = pmap_l2e_to_l3e(l2e, sva);
5184                 ptpaddr = be64toh(*l3e);
5185
5186                 /*
5187                  * Weed out invalid mappings.
5188                  */
5189                 if (ptpaddr == 0)
5190                         continue;
5191
5192                 /*
5193                  * Check for large page.
5194                  */
5195                 if ((ptpaddr & RPTE_LEAF) != 0) {
5196                         /*
5197                          * Are we removing the entire large page?  If not,
5198                          * demote the mapping and fall through.
5199                          */
5200                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5201                                 pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
5202                                 continue;
5203                         } else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
5204                             &lock)) {
5205                                 /* The large page mapping was destroyed. */
5206                                 continue;
5207                         } else
5208                                 ptpaddr = be64toh(*l3e);
5209                 }
5210
5211                 /*
5212                  * Limit our scan to either the end of the va represented
5213                  * by the current page table page, or to the end of the
5214                  * range being removed.
5215                  */
5216                 if (va_next > eva)
5217                         va_next = eva;
5218
5219                 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
5220                         anyvalid = true;
5221         }
5222         if (lock != NULL)
5223                 rw_wunlock(lock);
5224 out:
5225         if (anyvalid)
5226                 pmap_invalidate_all(pmap);
5227         PMAP_UNLOCK(pmap);
5228         vm_page_free_pages_toq(&free, true);
5229 }
5230
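/*
 * Remove all managed mappings of page "m": 2MB mappings are demoted
 * first, every remaining 4KB mapping is destroyed, and PGA_WRITEABLE is
 * cleared on the page.
 */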
5231 void
5232 mmu_radix_remove_all(vm_page_t m)
5233 {
5234         struct md_page *pvh;
5235         pv_entry_t pv;
5236         pmap_t pmap;
5237         struct rwlock *lock;
5238         pt_entry_t *pte, tpte;
5239         pml3_entry_t *l3e;
5240         vm_offset_t va;
5241         struct spglist free;
5242         int pvh_gen, md_gen;
5243
5244         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5245         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5246             ("pmap_remove_all: page %p is not managed", m));
5247         SLIST_INIT(&free);
5248         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5249         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5250             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5251 retry:
5252         rw_wlock(lock);
5253         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5254                 pmap = PV_PMAP(pv);
5255                 if (!PMAP_TRYLOCK(pmap)) {
5256                         pvh_gen = pvh->pv_gen;
5257                         rw_wunlock(lock);
5258                         PMAP_LOCK(pmap);
5259                         rw_wlock(lock);
5260                         if (pvh_gen != pvh->pv_gen) {
5261                                 rw_wunlock(lock);
5262                                 PMAP_UNLOCK(pmap);
5263                                 goto retry;
5264                         }
5265                 }
5266                 va = pv->pv_va;
5267                 l3e = pmap_pml3e(pmap, va);
5268                 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
5269                 PMAP_UNLOCK(pmap);
5270         }
5271         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5272                 pmap = PV_PMAP(pv);
5273                 if (!PMAP_TRYLOCK(pmap)) {
5274                         pvh_gen = pvh->pv_gen;
5275                         md_gen = m->md.pv_gen;
5276                         rw_wunlock(lock);
5277                         PMAP_LOCK(pmap);
5278                         rw_wlock(lock);
5279                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5280                                 rw_wunlock(lock);
5281                                 PMAP_UNLOCK(pmap);
5282                                 goto retry;
5283                         }
5284                 }
5285                 pmap_resident_count_dec(pmap, 1);
5286                 l3e = pmap_pml3e(pmap, pv->pv_va);
5287                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
5288                     " a 2mpage in page %p's pv list", m));
5289                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5290                 tpte = be64toh(pte_load_clear(pte));
5291                 if (tpte & PG_W)
5292                         pmap->pm_stats.wired_count--;
5293                 if (tpte & PG_A)
5294                         vm_page_aflag_set(m, PGA_REFERENCED);
5295
5296                 /*
5297                  * Update the vm_page_t clean and reference bits.
5298                  */
5299                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5300                         vm_page_dirty(m);
5301                 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
5302                 pmap_invalidate_page(pmap, pv->pv_va);
5303                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5304                 m->md.pv_gen++;
5305                 free_pv_entry(pmap, pv);
5306                 PMAP_UNLOCK(pmap);
5307         }
5308         vm_page_aflag_clear(m, PGA_WRITEABLE);
5309         rw_wunlock(lock);
5310         vm_page_free_pages_toq(&free, true);
5311 }
5312
5313 /*
5314  * Destroy all managed, non-wired mappings in the given user-space
5315  * pmap.  This pmap cannot be active on any processor besides the
5316  * caller.
5317  *
5318  * This function cannot be applied to the kernel pmap.  Moreover, it
5319  * is not intended for general use.  It is only to be used during
5320  * process termination.  Consequently, it can be implemented in ways
5321  * that make it faster than pmap_remove().  First, it can more quickly
5322  * destroy mappings by iterating over the pmap's collection of PV
5323  * entries, rather than searching the page table.  Second, it doesn't
5324  * have to test and clear the page table entries atomically, because
5325  * no processor is currently accessing the user address space.  In
5326  * particular, a page table entry's dirty bit won't change state once
5327  * this function starts.
5328  *
5329  * Although this function destroys all of the pmap's managed,
5330  * non-wired mappings, it can delay and batch the invalidation of TLB
5331  * entries without calling pmap_delayed_invl_started() and
5332  * pmap_delayed_invl_finished().  Because the pmap is not active on
5333  * any other processor, none of these TLB entries will ever be used
5334  * before their eventual invalidation.  Consequently, there is no need
5335  * for either pmap_remove_all() or pmap_remove_write() to wait for
5336  * that eventual TLB invalidation.
5337  */
5338
5339 void
5340 mmu_radix_remove_pages(pmap_t pmap)
5341 {
5342         pml3_entry_t ptel3e;
5343         pt_entry_t *pte, tpte;
5344         struct spglist free;
5345         vm_page_t m, mpte, mt;
5346         pv_entry_t pv;
5347         struct md_page *pvh;
5348         struct pv_chunk *pc, *npc;
5349         struct rwlock *lock;
5350         int64_t bit;
5351         uint64_t inuse, bitmask;
5352         int allfree, field, freed, idx;
5353         boolean_t superpage;
5354         vm_paddr_t pa;
5355
5356         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5357
5358         /*
5359          * Assert that the given pmap is only active on the current
5360          * CPU.  Unfortunately, we cannot block another CPU from
5361          * activating the pmap while this function is executing.
5362          */
5363         KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5364             ("non-current asid %lu - expected %lu", pmap->pm_pid,
5365             mfspr(SPR_PID)));
5366
5367         lock = NULL;
5368
5369         SLIST_INIT(&free);
5370         PMAP_LOCK(pmap);
5371         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5372                 allfree = 1;
5373                 freed = 0;
5374                 for (field = 0; field < _NPCM; field++) {
5375                         inuse = ~pc->pc_map[field] & pc_freemask[field];
5376                         while (inuse != 0) {
5377                                 bit = cnttzd(inuse);
5378                                 bitmask = 1UL << bit;
5379                                 idx = field * 64 + bit;
5380                                 pv = &pc->pc_pventry[idx];
5381                                 inuse &= ~bitmask;
5382
5383                                 pte = pmap_pml2e(pmap, pv->pv_va);
5384                                 ptel3e = be64toh(*pte);
5385                                 pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5386                                 tpte = be64toh(*pte);
5387                                 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5388                                         superpage = FALSE;
5389                                         ptel3e = tpte;
5390                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5391                                             PG_FRAME);
5392                                         pte = &pte[pmap_pte_index(pv->pv_va)];
5393                                         tpte = be64toh(*pte);
5394                                 } else {
5395                                         /*
5396                                          * Keep track whether 'tpte' is a
5397                                          * superpage explicitly instead of
5398                                          * relying on RPTE_LEAF being set.
5399                                          *
5400                                          * This is because RPTE_LEAF is numerically
5401                                          * identical to PG_PTE_PAT and thus a
5402                                          * regular page could be mistaken for
5403                                          * a superpage.
5404                                          */
5405                                         superpage = TRUE;
5406                                 }
5407
5408                                 if ((tpte & PG_V) == 0) {
5409                                         panic("bad pte va %lx pte %lx",
5410                                             pv->pv_va, tpte);
5411                                 }
5412
5413 /*
5414  * We cannot remove wired pages from a process' mapping at this time
5415  */
5416                                 if (tpte & PG_W) {
5417                                         allfree = 0;
5418                                         continue;
5419                                 }
5420
5421                                 if (superpage)
5422                                         pa = tpte & PG_PS_FRAME;
5423                                 else
5424                                         pa = tpte & PG_FRAME;
5425
5426                                 m = PHYS_TO_VM_PAGE(pa);
5427                                 KASSERT(m->phys_addr == pa,
5428                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5429                                     m, (uintmax_t)m->phys_addr,
5430                                     (uintmax_t)tpte));
5431
5432                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5433                                     m < &vm_page_array[vm_page_array_size],
5434                                     ("pmap_remove_pages: bad tpte %#jx",
5435                                     (uintmax_t)tpte));
5436
5437                                 pte_clear(pte);
5438
5439                                 /*
5440                                  * Update the vm_page_t clean/reference bits.
5441                                  */
5442                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5443                                         if (superpage) {
5444                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5445                                                         vm_page_dirty(mt);
5446                                         } else
5447                                                 vm_page_dirty(m);
5448                                 }
5449
5450                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5451
5452                                 /* Mark free */
5453                                 pc->pc_map[field] |= bitmask;
5454                                 if (superpage) {
5455                                         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5456                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5457                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
5458                                         pvh->pv_gen++;
5459                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
5460                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5461                                                         if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5462                                                             TAILQ_EMPTY(&mt->md.pv_list))
5463                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
5464                                         }
5465                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5466                                         if (mpte != NULL) {
5467                                                 pmap_resident_count_dec(pmap, 1);
5468                                                 KASSERT(mpte->ref_count == NPTEPG,
5469                                                     ("pmap_remove_pages: pte page wire count error"));
5470                                                 mpte->ref_count = 0;
5471                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
5472                                         }
5473                                 } else {
5474                                         pmap_resident_count_dec(pmap, 1);
5475 #ifdef VERBOSE_PV
5476                                         printf("freeing pv (%p, %p)\n",
5477                                                    pmap, pv);
5478 #endif
5479                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5480                                         m->md.pv_gen++;
5481                                         if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5482                                             TAILQ_EMPTY(&m->md.pv_list) &&
5483                                             (m->flags & PG_FICTITIOUS) == 0) {
5484                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5485                                                 if (TAILQ_EMPTY(&pvh->pv_list))
5486                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
5487                                         }
5488                                 }
5489                                 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
5490                                 freed++;
5491                         }
5492                 }
5493                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5494                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5495                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5496                 if (allfree) {
5497                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5498                         free_pv_chunk(pc);
5499                 }
5500         }
5501         if (lock != NULL)
5502                 rw_wunlock(lock);
5503         pmap_invalidate_all(pmap);
5504         PMAP_UNLOCK(pmap);
5505         vm_page_free_pages_toq(&free, true);
5506 }
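
/*
 * Worked example for the pv-chunk scan above (hypothetical bitmap values,
 * assuming pc_freemask[0] covers all 64 slots of the first word): if
 * pc_map[0] == 0xfffffffffffffff0, then inuse == 0xf, i.e. only the first
 * four pv entries of the chunk are live.  cnttzd(0xf) == 0 selects index 0
 * first, the bit is stripped from inuse, and the loop repeats until inuse
 * reaches zero; each freed entry is handed back to the chunk by setting its
 * bit in pc_map[] again ("Mark free" above).
 */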
5507
5508 void
5509 mmu_radix_remove_write(vm_page_t m)
5510 {
5511         struct md_page *pvh;
5512         pmap_t pmap;
5513         struct rwlock *lock;
5514         pv_entry_t next_pv, pv;
5515         pml3_entry_t *l3e;
5516         pt_entry_t oldpte, *pte;
5517         int pvh_gen, md_gen;
5518
5519         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5520         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5521             ("pmap_remove_write: page %p is not managed", m));
5522         vm_page_assert_busied(m);
5523
5524         if (!pmap_page_is_write_mapped(m))
5525                 return;
5526         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5527         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5528             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5529 retry_pv_loop:
5530         rw_wlock(lock);
5531         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
5532                 pmap = PV_PMAP(pv);
5533                 if (!PMAP_TRYLOCK(pmap)) {
5534                         pvh_gen = pvh->pv_gen;
5535                         rw_wunlock(lock);
5536                         PMAP_LOCK(pmap);
5537                         rw_wlock(lock);
5538                         if (pvh_gen != pvh->pv_gen) {
5539                                 PMAP_UNLOCK(pmap);
5540                                 rw_wunlock(lock);
5541                                 goto retry_pv_loop;
5542                         }
5543                 }
5544                 l3e = pmap_pml3e(pmap, pv->pv_va);
5545                 if ((be64toh(*l3e) & PG_RW) != 0)
5546                         (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
5547                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5548                     ("inconsistent pv lock %p %p for page %p",
5549                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5550                 PMAP_UNLOCK(pmap);
5551         }
5552         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
5553                 pmap = PV_PMAP(pv);
5554                 if (!PMAP_TRYLOCK(pmap)) {
5555                         pvh_gen = pvh->pv_gen;
5556                         md_gen = m->md.pv_gen;
5557                         rw_wunlock(lock);
5558                         PMAP_LOCK(pmap);
5559                         rw_wlock(lock);
5560                         if (pvh_gen != pvh->pv_gen ||
5561                             md_gen != m->md.pv_gen) {
5562                                 PMAP_UNLOCK(pmap);
5563                                 rw_wunlock(lock);
5564                                 goto retry_pv_loop;
5565                         }
5566                 }
5567                 l3e = pmap_pml3e(pmap, pv->pv_va);
5568                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
5569                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
5570                     m));
5571                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5572 retry:
5573                 oldpte = be64toh(*pte);
5574                 if (oldpte & PG_RW) {
5575                         if (!atomic_cmpset_long(pte, htobe64(oldpte),
5576                             htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
5577                                 goto retry;
5578                         if ((oldpte & PG_M) != 0)
5579                                 vm_page_dirty(m);
5580                         pmap_invalidate_page(pmap, pv->pv_va);
5581                 }
5582                 PMAP_UNLOCK(pmap);
5583         }
5584         rw_wunlock(lock);
5585         vm_page_aflag_clear(m, PGA_WRITEABLE);
5586 }
5587
5588 /*
5589  *      Clear the wired attribute from the mappings for the specified range of
5590  *      addresses in the given pmap.  Every valid mapping within that range
5591  *      must have the wired attribute set.  In contrast, invalid mappings
5592  *      cannot have the wired attribute set, so they are ignored.
5593  *
5594  *      The wired attribute of the page table entry is not a hardware
5595  *      feature, so there is no need to invalidate any TLB entries.
5596  *      Since pmap_demote_l3e() for the wired entry must never fail,
5597  *      pmap_delayed_invl_started()/finished() calls around the
5598  *      function are not needed.
5599  */
5600 void
5601 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5602 {
5603         vm_offset_t va_next;
5604         pml1_entry_t *l1e;
5605         pml2_entry_t *l2e;
5606         pml3_entry_t *l3e;
5607         pt_entry_t *pte;
5608
5609         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5610         PMAP_LOCK(pmap);
5611         for (; sva < eva; sva = va_next) {
5612                 l1e = pmap_pml1e(pmap, sva);
5613                 if ((be64toh(*l1e) & PG_V) == 0) {
5614                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5615                         if (va_next < sva)
5616                                 va_next = eva;
5617                         continue;
5618                 }
5619                 l2e = pmap_l1e_to_l2e(l1e, sva);
5620                 if ((be64toh(*l2e) & PG_V) == 0) {
5621                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5622                         if (va_next < sva)
5623                                 va_next = eva;
5624                         continue;
5625                 }
5626                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5627                 if (va_next < sva)
5628                         va_next = eva;
5629                 l3e = pmap_l2e_to_l3e(l2e, sva);
5630                 if ((be64toh(*l3e) & PG_V) == 0)
5631                         continue;
5632                 if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
5633                         if ((be64toh(*l3e) & PG_W) == 0)
5634                                 panic("pmap_unwire: pde %#jx is missing PG_W",
5635                                     (uintmax_t)(be64toh(*l3e)));
5636
5637                         /*
5638                          * Are we unwiring the entire large page?  If not,
5639                          * demote the mapping and fall through.
5640                          */
5641                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5642                                 atomic_clear_long(l3e, htobe64(PG_W));
5643                                 pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
5644                                     PAGE_SIZE;
5645                                 continue;
5646                         } else if (!pmap_demote_l3e(pmap, l3e, sva))
5647                                 panic("pmap_unwire: demotion failed");
5648                 }
5649                 if (va_next > eva)
5650                         va_next = eva;
5651                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
5652                     sva += PAGE_SIZE) {
5653                         MPASS(pte == pmap_pte(pmap, sva));
5654                         if ((be64toh(*pte) & PG_V) == 0)
5655                                 continue;
5656                         if ((be64toh(*pte) & PG_W) == 0)
5657                                 panic("pmap_unwire: pte %#jx is missing PG_W",
5658                                     (uintmax_t)(be64toh(*pte)));
5659
5660                         /*
5661                          * PG_W must be cleared atomically.  Although the pmap
5662                          * lock synchronizes access to PG_W, another processor
5663                          * could be setting PG_M and/or PG_A concurrently.
5664                          */
5665                         atomic_clear_long(pte, htobe64(PG_W));
5666                         pmap->pm_stats.wired_count--;
5667                 }
5668         }
5669         PMAP_UNLOCK(pmap);
5670 }
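
/*
 * Worked example of the va_next stepping above (hypothetical address):
 * with 2MB L3 pages, L3_PAGE_SIZE == 0x200000 and L3_PAGE_MASK == 0x1fffff,
 * so for sva == 0x10203000
 *
 *	va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK = 0x10400000,
 *
 * i.e. the next 2MB boundary.  The "va_next < sva" checks only fire when
 * the addition wraps at the top of the address space, in which case the
 * walk is clamped to eva.
 */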
5671
5672 void
5673 mmu_radix_zero_page(vm_page_t m)
5674 {
5675         vm_offset_t addr;
5676
5677         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5678         addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5679         pagezero(addr);
5680 }
5681
5682 void
5683 mmu_radix_zero_page_area(vm_page_t m, int off, int size)
5684 {
5685         caddr_t addr;
5686
5687         CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
5688         MPASS(off + size <= PAGE_SIZE);
5689         addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5690         memset(addr + off, 0, size);
5691 }
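
/*
 * Usage sketch (hypothetical caller): zero the second 512-byte block of a
 * page while leaving the rest of it untouched:
 *
 *	pmap_zero_page_area(m, 512, 512);
 *
 * Both zeroing routines go through the direct map, so no temporary KVA
 * mapping has to be created or invalidated.
 */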
5692
5693 static int
5694 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5695 {
5696         pml3_entry_t *l3ep;
5697         pt_entry_t pte;
5698         vm_paddr_t pa;
5699         int val;
5700
5701         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
5702         PMAP_LOCK(pmap);
5703
5704         l3ep = pmap_pml3e(pmap, addr);
5705         if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) {
5706                 if (be64toh(*l3ep) & RPTE_LEAF) {
5707                         pte = be64toh(*l3ep);
5708                         /* Compute the physical address of the 4KB page. */
5709                         pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
5710                             PG_FRAME;
5711                         val = MINCORE_PSIND(1);
5712                 } else {
5713                         /* Native endian PTE, do not pass to functions */
5714                         pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
5715                         pa = pte & PG_FRAME;
5716                         val = 0;
5717                 }
5718         } else {
5719                 pte = 0;
5720                 pa = 0;
5721                 val = 0;
5722         }
5723         if ((pte & PG_V) != 0) {
5724                 val |= MINCORE_INCORE;
5725                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5726                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5727                 if ((pte & PG_A) != 0)
5728                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5729         }
5730         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5731             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5732             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5733                 *locked_pa = pa;
5734         }
5735         PMAP_UNLOCK(pmap);
5736         return (val);
5737 }
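
/*
 * Example of the returned value (hypothetical PTE state): a resident,
 * writable, dirty and referenced 4KB mapping yields
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	    MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 *
 * and, because both *_OTHER bits are already known, *locked_pa is left
 * untouched.  A clean managed mapping instead has its physical address
 * stored in *locked_pa so the caller can re-examine the page's other
 * mappings.  A 2MB leaf additionally reports MINCORE_PSIND(1).
 */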
5738
5739 void
5740 mmu_radix_activate(struct thread *td)
5741 {
5742         pmap_t pmap;
5743         uint32_t curpid;
5744
5745         CTR2(KTR_PMAP, "%s(%p)", __func__, td);
5746         critical_enter();
5747         pmap = vmspace_pmap(td->td_proc->p_vmspace);
5748         curpid = mfspr(SPR_PID);
5749         if (pmap->pm_pid > isa3_base_pid &&
5750             curpid != pmap->pm_pid) {
5751                 mmu_radix_pid_set(pmap);
5752         }
5753         critical_exit();
5754 }
5755
5756 /*
5757  *      Increase the starting virtual address of the given mapping if a
5758  *      different alignment might result in more superpage mappings.
5759  */
5760 void
5761 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
5762     vm_offset_t *addr, vm_size_t size)
5763 {
5764         vm_offset_t superpage_offset;
5765
5766         CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
5767             size);
5768
5769         if (size < L3_PAGE_SIZE)
5770                 return;
5771         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5772                 offset += ptoa(object->pg_color);
5773         superpage_offset = offset & L3_PAGE_MASK;
5774         if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
5775             (*addr & L3_PAGE_MASK) == superpage_offset)
5776                 return;
5777         if ((*addr & L3_PAGE_MASK) < superpage_offset)
5778                 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
5779         else
5780                 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
5781 }
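
/*
 * Worked example (hypothetical values, with size spanning several
 * superpages): with 2MB L3 superpages, an object offset of 0x123000 gives
 * superpage_offset == 0x123000.  If the caller proposed
 * *addr == 0x7fff00200000, then (*addr & L3_PAGE_MASK) == 0, which is below
 * superpage_offset, so the start of the mapping is bumped to 0x7fff00323000.
 * Virtual and physical 2MB boundaries then coincide, which is what later
 * makes promotion to superpages possible.
 */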
5782
5783 static void *
5784 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
5785 {
5786         vm_offset_t va, tmpva, ppa, offset;
5787
5788         ppa = trunc_page(pa);
5789         offset = pa & PAGE_MASK;
5790         size = roundup2(offset + size, PAGE_SIZE);
5791         if (pa < powerpc_ptob(Maxmem))
5792                 panic("bad pa: %#lx less than Maxmem %#lx\n",
5793                           pa, powerpc_ptob(Maxmem));
5794         va = kva_alloc(size);
5795         if (bootverbose)
5796                 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
5797         KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
5798
5799         if (!va)
5800                 panic("%s: Couldn't alloc kernel virtual memory", __func__);
5801
5802         for (tmpva = va; size > 0;) {
5803                 mmu_radix_kenter_attr(tmpva, ppa, attr);
5804                 size -= PAGE_SIZE;
5805                 tmpva += PAGE_SIZE;
5806                 ppa += PAGE_SIZE;
5807         }
5808         ptesync();
5809
5810         return ((void *)(va + offset));
5811 }
5812
5813 static void *
5814 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
5815 {
5816
5817         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
5818
5819         return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
5820 }
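
/*
 * Usage sketch (hypothetical device address): drivers normally reach this
 * code through the MI pmap_mapdev()/pmap_unmapdev() wrappers rather than
 * calling the mmu_radix_* entry points directly:
 *
 *	regs = pmap_mapdev(0x3fe000000000UL, PAGE_SIZE);
 *	...
 *	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 *
 * The attribute defaults to VM_MEMATTR_DEFAULT, which for an address outside
 * the physical memory regions resolves to guarded I/O space in
 * mmu_radix_calc_wimg() below.
 */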
5821
5822 void
5823 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5824 {
5825
5826         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
5827         m->md.mdpg_cache_attrs = ma;
5828
5829         /*
5830          * If "m" is a normal page, update its direct mapping.  This update
5831          * can be relied upon to perform any cache operations that are
5832          * required for data coherence.
5833          */
5834         if ((m->flags & PG_FICTITIOUS) == 0 &&
5835             mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
5836             PAGE_SIZE, m->md.mdpg_cache_attrs))
5837                 panic("memory attribute change on the direct map failed");
5838 }
5839
5840 static void
5841 mmu_radix_unmapdev(vm_offset_t va, vm_size_t size)
5842 {
5843         vm_offset_t offset;
5844
5845         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, va, size);
5846         /* If we gave a direct map region in pmap_mapdev, do nothing */
5847         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
5848                 return;
5849
5850         offset = va & PAGE_MASK;
5851         size = round_page(offset + size);
5852         va = trunc_page(va);
5853
5854         if (pmap_initialized) {
5855                 mmu_radix_qremove(va, atop(size));
5856                 kva_free(va, size);
5857         }
5858 }
5859
5860 static __inline void
5861 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
5862 {
5863         uint64_t opte, npte;
5864
5865         /*
5866          * Update the attribute bits with a compare-and-swap loop on the
5867          * full 64-bit PTE; retry until it succeeds or no change is needed.
5868          */
5869         do {
5870                 opte = be64toh(*pte);
5871                 npte = opte & ~mask;
5872                 npte |= cache_bits;
5873         } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
5874 }
5875
5876 /*
5877  * Tries to demote a 1GB page mapping.
5878  */
5879 static boolean_t
5880 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
5881 {
5882         pml2_entry_t oldpdpe;
5883         pml3_entry_t *firstpde, newpde, *pde;
5884         vm_paddr_t pdpgpa;
5885         vm_page_t pdpg;
5886
5887         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5888         oldpdpe = be64toh(*l2e);
5889         KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
5890             ("pmap_demote_pdpe: oldpdpe is missing RPTE_LEAF and/or PG_V"));
5891         pdpg = vm_page_alloc(NULL, va >> L2_PAGE_SIZE_SHIFT,
5892             VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
5893         if (pdpg == NULL) {
5894                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
5895                     " in pmap %p", va, pmap);
5896                 return (FALSE);
5897         }
5898         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
5899         firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
5900         KASSERT((oldpdpe & PG_A) != 0,
5901             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
5902         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
5903             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
5904         newpde = oldpdpe;
5905
5906         /*
5907          * Initialize the page directory page.
5908          */
5909         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
5910                 *pde = htobe64(newpde);
5911                 newpde += L3_PAGE_SIZE;
5912         }
5913
5914         /*
5915          * Demote the mapping.
5916          */
5917         pde_store(l2e, pdpgpa);
5918
5919         /*
5920          * Flush PWC --- XXX revisit
5921          */
5922         pmap_invalidate_all(pmap);
5923
5924         pmap_l2e_demotions++;
5925         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
5926             " in pmap %p", va, pmap);
5927         return (TRUE);
5928 }
5929
5930 vm_paddr_t
5931 mmu_radix_kextract(vm_offset_t va)
5932 {
5933         pml3_entry_t l3e;
5934         vm_paddr_t pa;
5935
5936         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
5937         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
5938                 pa = DMAP_TO_PHYS(va);
5939         } else {
5940                 /* Big-endian PTE on stack */
5941                 l3e = *pmap_pml3e(kernel_pmap, va);
5942                 if (be64toh(l3e) & RPTE_LEAF) {
5943                         /* 2MB leaf: physical frame plus in-page offset. */
5944                         pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
5945                 } else {
5946                         /*
5947                          * Beware of a concurrent promotion that changes the
5948                          * PDE at this point!  For example, vtopte() must not
5949                          * be used to access the PTE because it would use the
5950                          * new PDE.  It is, however, safe to use the old PDE
5951                          * because the page table page is preserved by the
5952                          * promotion.
5953                          */
5954                         pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
5955                         /* 4KB page: physical frame plus in-page offset. */
5956                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
5957                 }
5958         }
5959         return (pa);
5960 }
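
/*
 * Example (hypothetical physical address): for a direct-map KVA the
 * translation is a constant-offset computation,
 *
 *	mmu_radix_kextract(PHYS_TO_DMAP(0x2000)) == 0x2000,
 *
 * while any other kernel VA is resolved by walking the kernel page tables
 * as above, honoring 2MB leaves.
 */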
5961
5962 static pt_entry_t
5963 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
5964 {
5965
5966         if (ma != VM_MEMATTR_DEFAULT) {
5967                 return pmap_cache_bits(ma);
5968         }
5969
5970         /*
5971          * Assume the page is cache inhibited and access is guarded unless
5972          * it's in our available memory array.
5973          */
5974         for (int i = 0; i < pregions_sz; i++) {
5975                 if ((pa >= pregions[i].mr_start) &&
5976                     (pa < (pregions[i].mr_start + pregions[i].mr_size)))
5977                         return (RPTE_ATTR_MEM);
5978         }
5979         return (RPTE_ATTR_GUARDEDIO);
5980 }
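
/*
 * Example (placeholder addresses): with the default attribute,
 *
 *	mmu_radix_calc_wimg(pa_in_ram, VM_MEMATTR_DEFAULT)  == RPTE_ATTR_MEM
 *	mmu_radix_calc_wimg(pa_of_mmio, VM_MEMATTR_DEFAULT) == RPTE_ATTR_GUARDEDIO
 *
 * where pa_in_ram is covered by pregions[] and pa_of_mmio is not.
 */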
5981
5982 static void
5983 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
5984 {
5985         pt_entry_t *pte, pteval;
5986         uint64_t cache_bits;
5987
5988         pte = kvtopte(va);
5989         MPASS(pte != NULL);
5990         pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
5991         cache_bits = mmu_radix_calc_wimg(pa, ma);
5992         pte_store(pte, pteval | cache_bits);
5993 }
5994
5995 void
5996 mmu_radix_kremove(vm_offset_t va)
5997 {
5998         pt_entry_t *pte;
5999
6000         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
6001
6002         pte = kvtopte(va);
6003         pte_clear(pte);
6004 }
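
/*
 * Usage sketch (hypothetical KVA/PA pair): wire a single uncacheable page
 * into the kernel map and tear it down again:
 *
 *	mmu_radix_kenter_attr(va, pa, VM_MEMATTR_UNCACHEABLE);
 *	ptesync();
 *	...
 *	mmu_radix_kremove(va);
 *	pmap_invalidate_page(kernel_pmap, va);
 *
 * As in mmu_radix_mapdev_attr() above, ordering the ptesync and the TLB
 * invalidation is left to the caller.
 */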
6005
6006 int
6007 mmu_radix_decode_kernel_ptr(vm_offset_t addr,
6008     int *is_user, vm_offset_t *decoded)
6009 {
6010
6011         CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
6012         *decoded = addr;
6013         *is_user = (addr < VM_MAXUSER_ADDRESS);
6014         return (0);
6015 }
6016
6017 static boolean_t
6018 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
6019 {
6020
6021         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
6022         return (mem_valid(pa, size));
6023 }
6024
6025 static void
6026 mmu_radix_scan_init(void)
6027 {
6028
6029         CTR1(KTR_PMAP, "%s()", __func__);
6030         UNIMPLEMENTED();
6031 }
6032
6033 static void
6034 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
6035         void **va)
6036 {
6037         CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
6038         UNIMPLEMENTED();
6039 }
6040
6041 vm_offset_t
6042 mmu_radix_quick_enter_page(vm_page_t m)
6043 {
6044         vm_paddr_t paddr;
6045
6046         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
6047         paddr = VM_PAGE_TO_PHYS(m);
6048         return (PHYS_TO_DMAP(paddr));
6049 }
6050
6051 void
6052 mmu_radix_quick_remove_page(vm_offset_t addr __unused)
6053 {
6054         /* no work to do here */
6055         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
6056 }
6057
6058 static void
6059 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
6060 {
6061         cpu_flush_dcache((void *)sva, eva - sva);
6062 }
6063
6064 int
6065 mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
6066     vm_memattr_t mode)
6067 {
6068         int error;
6069
6070         CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
6071         PMAP_LOCK(kernel_pmap);
6072         error = pmap_change_attr_locked(va, size, mode, true);
6073         PMAP_UNLOCK(kernel_pmap);
6074         return (error);
6075 }
6076
6077 static int
6078 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
6079 {
6080         vm_offset_t base, offset, tmpva;
6081         vm_paddr_t pa_start, pa_end, pa_end1;
6082         pml2_entry_t *l2e;
6083         pml3_entry_t *l3e;
6084         pt_entry_t *pte;
6085         int cache_bits, error;
6086         boolean_t changed;
6087
6088         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6089         base = trunc_page(va);
6090         offset = va & PAGE_MASK;
6091         size = round_page(offset + size);
6092
6093         /*
6094          * Only supported on kernel virtual addresses, including the direct
6095          * map but excluding the recursive map.
6096          */
6097         if (base < DMAP_MIN_ADDRESS)
6098                 return (EINVAL);
6099
6100         cache_bits = pmap_cache_bits(mode);
6101         changed = FALSE;
6102
6103         /*
6104          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6105          * into 4KB pages if required.
6106          */
6107         for (tmpva = base; tmpva < base + size; ) {
6108                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6109                 if (l2e == NULL || *l2e == 0)
6110                         return (EINVAL);
6111                 if (be64toh(*l2e) & RPTE_LEAF) {
6112                         /*
6113                          * If the current 1GB page already has the required
6114                          * memory type, then we need not demote this page. Just
6115                          * increment tmpva to the next 1GB page frame.
6116                          */
6117                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
6118                                 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6119                                 continue;
6120                         }
6121
6122                         /*
6123                          * If the current offset aligns with a 1GB page frame
6124                          * and there is at least 1GB left within the range, then
6125                          * we need not break down this page into 2MB pages.
6126                          */
6127                         if ((tmpva & L2_PAGE_MASK) == 0 &&
6128                             tmpva + L2_PAGE_MASK < base + size) {
6129                         tmpva += L2_PAGE_SIZE;
6130                                 continue;
6131                         }
6132                         if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
6133                                 return (ENOMEM);
6134                 }
6135                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6136                 KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
6137                     tmpva, l2e));
6138                 if (*l3e == 0)
6139                         return (EINVAL);
6140                 if (be64toh(*l3e) & RPTE_LEAF) {
6141                         /*
6142                          * If the current 2MB page already has the required
6143                          * memory type, then we need not demote this page. Just
6144                          * increment tmpva to the next 2MB page frame.
6145                          */
6146                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
6147                                 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6148                                 continue;
6149                         }
6150
6151                         /*
6152                          * If the current offset aligns with a 2MB page frame
6153                          * and there is at least 2MB left within the range, then
6154                          * we need not break down this page into 4KB pages.
6155                          */
6156                         if ((tmpva & L3_PAGE_MASK) == 0 &&
6157                             tmpva + L3_PAGE_MASK < base + size) {
6158                                 tmpva += L3_PAGE_SIZE;
6159                                 continue;
6160                         }
6161                         if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
6162                                 return (ENOMEM);
6163                 }
6164                 pte = pmap_l3e_to_pte(l3e, tmpva);
6165                 if (*pte == 0)
6166                         return (EINVAL);
6167                 tmpva += PAGE_SIZE;
6168         }
6169         error = 0;
6170
6171         /*
6172          * Ok, all the pages exist, so run through them updating their
6173          * cache mode if required.
6174          */
6175         pa_start = pa_end = 0;
6176         for (tmpva = base; tmpva < base + size; ) {
6177                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6178                 if (be64toh(*l2e) & RPTE_LEAF) {
6179                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
6180                                 pmap_pte_attr(l2e, cache_bits,
6181                                     RPTE_ATTR_MASK);
6182                                 changed = TRUE;
6183                         }
6184                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6185                             (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
6186                                 if (pa_start == pa_end) {
6187                                         /* Start physical address run. */
6188                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
6189                                         pa_end = pa_start + L2_PAGE_SIZE;
6190                                 } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
6191                                         pa_end += L2_PAGE_SIZE;
6192                                 else {
6193                                         /* Run ended, update direct map. */
6194                                         error = pmap_change_attr_locked(
6195                                             PHYS_TO_DMAP(pa_start),
6196                                             pa_end - pa_start, mode, flush);
6197                                         if (error != 0)
6198                                                 break;
6199                                         /* Start physical address run. */
6200                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
6201                                         pa_end = pa_start + L2_PAGE_SIZE;
6202                                 }
6203                         }
6204                         tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6205                         continue;
6206                 }
6207                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6208                 if (be64toh(*l3e) & RPTE_LEAF) {
6209                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
6210                                 pmap_pte_attr(l3e, cache_bits,
6211                                     RPTE_ATTR_MASK);
6212                                 changed = TRUE;
6213                         }
6214                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6215                             (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
6216                                 if (pa_start == pa_end) {
6217                                         /* Start physical address run. */
6218                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
6219                                         pa_end = pa_start + L3_PAGE_SIZE;
6220                                 } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
6221                                         pa_end += L3_PAGE_SIZE;
6222                                 else {
6223                                         /* Run ended, update direct map. */
6224                                         error = pmap_change_attr_locked(
6225                                             PHYS_TO_DMAP(pa_start),
6226                                             pa_end - pa_start, mode, flush);
6227                                         if (error != 0)
6228                                                 break;
6229                                         /* Start physical address run. */
6230                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
6231                                         pa_end = pa_start + L3_PAGE_SIZE;
6232                                 }
6233                         }
6234                         tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6235                 } else {
6236                         pte = pmap_l3e_to_pte(l3e, tmpva);
6237                         if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
6238                                 pmap_pte_attr(pte, cache_bits,
6239                                     RPTE_ATTR_MASK);
6240                                 changed = TRUE;
6241                         }
6242                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6243                             (be64toh(*pte) & PG_FRAME) < dmaplimit) {
6244                                 if (pa_start == pa_end) {
6245                                         /* Start physical address run. */
6246                                         pa_start = be64toh(*pte) & PG_FRAME;
6247                                         pa_end = pa_start + PAGE_SIZE;
6248                                 } else if (pa_end == (be64toh(*pte) & PG_FRAME))
6249                                         pa_end += PAGE_SIZE;
6250                                 else {
6251                                         /* Run ended, update direct map. */
6252                                         error = pmap_change_attr_locked(
6253                                             PHYS_TO_DMAP(pa_start),
6254                                             pa_end - pa_start, mode, flush);
6255                                         if (error != 0)
6256                                                 break;
6257                                         /* Start physical address run. */
6258                                         pa_start = be64toh(*pte) & PG_FRAME;
6259                                         pa_end = pa_start + PAGE_SIZE;
6260                                 }
6261                         }
6262                         tmpva += PAGE_SIZE;
6263                 }
6264         }
6265         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6266                 pa_end1 = MIN(pa_end, dmaplimit);
6267                 if (pa_start != pa_end1)
6268                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6269                             pa_end1 - pa_start, mode, flush);
6270         }
6271
6272         /*
6273          * Flush CPU caches if required to make sure any data isn't cached that
6274          * shouldn't be, etc.
6275          */
6276         if (changed) {
6277                 pmap_invalidate_all(kernel_pmap);
6278
6279                 if (flush)
6280                         pmap_invalidate_cache_range(base, tmpva);
6281         }
6282         return (error);
6283 }
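
/*
 * Usage sketch (hypothetical range): changing a 2MB-aligned, 2MB-sized
 * chunk of the direct map to uncacheable rewrites the attribute bits of the
 * existing 2MB leaf without demoting it:
 *
 *	error = mmu_radix_change_attr(PHYS_TO_DMAP(pa), L3_PAGE_SIZE,
 *	    VM_MEMATTR_UNCACHEABLE);
 *
 * An unaligned or partial range is first demoted to 4KB PTEs; when the
 * range lies in the kernel map proper, the direct-map aliases of the
 * affected physical pages are updated as well.
 */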
6284
6285 /*
6286  * Allocate physical memory for the vm_page array and map it into KVA,
6287  * attempting to back the vm_pages with domain-local memory.
6288  */
6289 void
6290 mmu_radix_page_array_startup(long pages)
6291 {
6292 #ifdef notyet
6293         pml2_entry_t *l2e;
6294         pml3_entry_t *pde;
6295         pml3_entry_t newl3;
6296         vm_offset_t va;
6297         long pfn;
6298         int domain, i;
6299 #endif
6300         vm_paddr_t pa;
6301         vm_offset_t start, end;
6302
6303         vm_page_array_size = pages;
6304
6305         start = VM_MIN_KERNEL_ADDRESS;
6306         end = start + pages * sizeof(struct vm_page);
6307
6308         pa = vm_phys_early_alloc(0, end - start);
6309
6310         start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
6311 #ifdef notyet
6312         /* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
6313         for (va = start; va < end; va += L3_PAGE_SIZE) {
6314                 pfn = first_page + (va - start) / sizeof(struct vm_page);
6315                 domain = vm_phys_domain(ptoa(pfn));
6316                 l2e = pmap_pml2e(kernel_pmap, va);
6317                 if ((be64toh(*l2e) & PG_V) == 0) {
6318                         pa = vm_phys_early_alloc(domain, PAGE_SIZE);
6319                         dump_add_page(pa);
6320                         pagezero(PHYS_TO_DMAP(pa));
6321                         pde_store(l2e, (pml2_entry_t)pa);
6322                 }
6323                 pde = pmap_l2e_to_l3e(l2e, va);
6324                 if ((be64toh(*pde) & PG_V) != 0)
6325                         panic("Unexpected pde %p", pde);
6326                 pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
6327                 for (i = 0; i < NPDEPG; i++)
6328                         dump_add_page(pa + i * PAGE_SIZE);
6329                 newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
6330                 pte_store(pde, newl3);
6331         }
6332 #endif
6333         vm_page_array = (vm_page_t)start;
6334 }
6335
6336 #ifdef DDB
6337 #include <sys/kdb.h>
6338 #include <ddb/ddb.h>
6339
6340 static void
6341 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
6342 {
6343         pml1_entry_t *l1e;
6344         pml2_entry_t *l2e;
6345         pml3_entry_t *l3e;
6346         pt_entry_t *pte;
6347
6348         l1e = &l1[pmap_pml1e_index(va)];
6349         db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
6350         if ((be64toh(*l1e) & PG_V) == 0) {
6351                 db_printf("\n");
6352                 return;
6353         }
6354         l2e = pmap_l1e_to_l2e(l1e, va);
6355         db_printf(" l2e %#016lx", be64toh(*l2e));
6356         if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
6357                 db_printf("\n");
6358                 return;
6359         }
6360         l3e = pmap_l2e_to_l3e(l2e, va);
6361         db_printf(" l3e %#016lx", be64toh(*l3e));
6362         if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
6363                 db_printf("\n");
6364                 return;
6365         }
6366         pte = pmap_l3e_to_pte(l3e, va);
6367         db_printf(" pte %#016lx\n", be64toh(*pte));
6368 }
6369
6370 void
6371 pmap_page_print_mappings(vm_page_t m)
6372 {
6373         pmap_t pmap;
6374         pv_entry_t pv;
6375
6376         db_printf("page %p(%lx)\n", m, m->phys_addr);
6377         /* need to elide locks if running in ddb */
6378         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
6379                 db_printf("pv: %p ", pv);
6380                 db_printf("va: %#016lx ", pv->pv_va);
6381                 pmap = PV_PMAP(pv);
6382                 db_printf("pmap %p  ", pmap);
6383                 if (pmap != NULL) {
6384                         db_printf("asid: %lu\n", pmap->pm_pid);
6385                         pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
6386                 }
6387         }
6388 }
6389
6390 DB_SHOW_COMMAND(pte, pmap_print_pte)
6391 {
6392         vm_offset_t va;
6393         pmap_t pmap;
6394
6395         if (!have_addr) {
6396                 db_printf("show pte addr\n");
6397                 return;
6398         }
6399         va = (vm_offset_t)addr;
6400
6401         if (va >= DMAP_MIN_ADDRESS)
6402                 pmap = kernel_pmap;
6403         else if (kdb_thread != NULL)
6404                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
6405         else
6406                 pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
6407
6408         pmap_pte_walk(pmap->pm_pml1, va);
6409 }
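
/*
 * Example ddb(4) session (hypothetical address; PTE contents elided):
 *
 *	db> show pte 0xe000000001000000
 *	VA 0xe000000001000000 l1e ... l2e ... l3e ... pte ...
 *
 * Kernel and direct-map addresses are resolved against kernel_pmap; other
 * addresses use the pmap of the thread kdb stopped in, or curthread's.
 */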
6410
6411 #endif