FreeBSD: sys/powerpc/aim/mmu_radix.c
(blob at commit "powerpc/pmap: Add pmap_sync_icache() for radix pmap")
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2018 Matthew Macy
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 #include "opt_platform.h"
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32
33 #include <sys/param.h>
34 #include <sys/kernel.h>
35 #include <sys/systm.h>
36 #include <sys/conf.h>
37 #include <sys/bitstring.h>
38 #include <sys/queue.h>
39 #include <sys/cpuset.h>
40 #include <sys/endian.h>
41 #include <sys/kerneldump.h>
42 #include <sys/ktr.h>
43 #include <sys/lock.h>
44 #include <sys/syslog.h>
45 #include <sys/msgbuf.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/proc.h>
50 #include <sys/rwlock.h>
51 #include <sys/sched.h>
52 #include <sys/sysctl.h>
53 #include <sys/systm.h>
54 #include <sys/vmem.h>
55 #include <sys/vmmeter.h>
56 #include <sys/smp.h>
57
58 #include <sys/kdb.h>
59
60 #include <dev/ofw/openfirm.h>
61
62 #include <vm/vm.h>
63 #include <vm/pmap.h>
64 #include <vm/vm_param.h>
65 #include <vm/vm_kern.h>
66 #include <vm/vm_page.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_extern.h>
70 #include <vm/vm_pageout.h>
71 #include <vm/vm_phys.h>
72 #include <vm/vm_reserv.h>
73 #include <vm/vm_dumpset.h>
74 #include <vm/uma.h>
75
76 #include <machine/_inttypes.h>
77 #include <machine/cpu.h>
78 #include <machine/platform.h>
79 #include <machine/frame.h>
80 #include <machine/md_var.h>
81 #include <machine/psl.h>
82 #include <machine/bat.h>
83 #include <machine/hid.h>
84 #include <machine/pte.h>
85 #include <machine/sr.h>
86 #include <machine/trap.h>
87 #include <machine/mmuvar.h>
88
89 /* For pseries bit. */
90 #include <powerpc/pseries/phyp-hvcall.h>
91
92 #ifdef INVARIANTS
93 #include <vm/uma_dbg.h>
94 #endif
95
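/*
 * The PPC_BIT*() macros below follow the IBM (big-endian) bit numbering
 * used in the ISA documents, where bit 0 is the most significant bit of a
 * 64-bit long: PPC_BITLSHIFT(0) == 63, so PPC_BIT(0) == 1UL << 63 and
 * PPC_BIT(63) == 1.
 */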
96 #define PPC_BITLSHIFT(bit)      (sizeof(long)*NBBY - 1 - (bit))
97 #define PPC_BIT(bit)            (1UL << PPC_BITLSHIFT(bit))
98 #define PPC_BITLSHIFT_VAL(val, bit) ((val) << PPC_BITLSHIFT(bit))
99
100 #include "opt_ddb.h"
101
102 #ifdef DDB
103 static void pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va);
104 #endif
105
106 #define PG_W    RPTE_WIRED
107 #define PG_V    RPTE_VALID
108 #define PG_MANAGED      RPTE_MANAGED
109 #define PG_PROMOTED     RPTE_PROMOTED
110 #define PG_M    RPTE_C
111 #define PG_A    RPTE_R
112 #define PG_X    RPTE_EAA_X
113 #define PG_RW   RPTE_EAA_W
114 #define PG_PTE_CACHE RPTE_ATTR_MASK
115
116 #define RPTE_SHIFT 9
117 #define NLS_MASK ((1UL<<5)-1)
118 #define RPTE_ENTRIES (1UL<<RPTE_SHIFT)
119 #define RPTE_MASK (RPTE_ENTRIES-1)
120
121 #define NLB_SHIFT 0
122 #define NLB_MASK (((1UL<<52)-1) << 8)
123
124 extern int nkpt;
125 extern caddr_t crashdumpmap;
126
127 #define RIC_FLUSH_TLB 0
128 #define RIC_FLUSH_PWC 1
129 #define RIC_FLUSH_ALL 2
130
131 #define POWER9_TLB_SETS_RADIX   128     /* # sets in POWER9 TLB Radix mode */
132
133 #define PPC_INST_TLBIE                  0x7c000264
134 #define PPC_INST_TLBIEL                 0x7c000224
135 #define PPC_INST_SLBIA                  0x7c0003e4
136
137 #define ___PPC_RA(a)    (((a) & 0x1f) << 16)
138 #define ___PPC_RB(b)    (((b) & 0x1f) << 11)
139 #define ___PPC_RS(s)    (((s) & 0x1f) << 21)
140 #define ___PPC_RT(t)    ___PPC_RS(t)
141 #define ___PPC_R(r)     (((r) & 0x1) << 16)
142 #define ___PPC_PRS(prs) (((prs) & 0x1) << 17)
143 #define ___PPC_RIC(ric) (((ric) & 0x3) << 18)
144
145 #define PPC_SLBIA(IH)   __XSTRING(.long PPC_INST_SLBIA | \
146                                        ((IH & 0x7) << 21))
147 #define PPC_TLBIE_5(rb,rs,ric,prs,r)                            \
148         __XSTRING(.long PPC_INST_TLBIE |                        \
149                           ___PPC_RB(rb) | ___PPC_RS(rs) |       \
150                           ___PPC_RIC(ric) | ___PPC_PRS(prs) |   \
151                           ___PPC_R(r))
152
153 #define PPC_TLBIEL(rb,rs,ric,prs,r) \
154          __XSTRING(.long PPC_INST_TLBIEL | \
155                            ___PPC_RB(rb) | ___PPC_RS(rs) |      \
156                            ___PPC_RIC(ric) | ___PPC_PRS(prs) |  \
157                            ___PPC_R(r))
158
159 #define PPC_INVALIDATE_ERAT             PPC_SLBIA(7)
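/*
 * The tlbie/tlbiel/slbia forms above are hand-encoded as raw .long values,
 * presumably so the file assembles even with toolchains that do not know
 * the ISA 3.0 operand forms; the ___PPC_* macros place each operand into
 * the proper instruction field.  PPC_INVALIDATE_ERAT is slbia with IH=7,
 * the idiom used on ISA 3.0 CPUs to flush the ERAT.
 */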
160
161 static __inline void
162 ttusync(void)
163 {
164         __asm __volatile("eieio; tlbsync; ptesync" ::: "memory");
165 }
166
167 #define TLBIEL_INVAL_SEL_MASK   0xc00   /* invalidation selector */
168 #define  TLBIEL_INVAL_PAGE      0x000   /* invalidate a single page */
169 #define  TLBIEL_INVAL_SET_PID   0x400   /* invalidate a set for the current PID */
170 #define  TLBIEL_INVAL_SET_LPID  0x800   /* invalidate a set for current LPID */
171 #define  TLBIEL_INVAL_SET       0xc00   /* invalidate a set for all LPIDs */
172
173 #define TLBIE_ACTUAL_PAGE_MASK          0xe0
174 #define  TLBIE_ACTUAL_PAGE_4K           0x00
175 #define  TLBIE_ACTUAL_PAGE_64K          0xa0
176 #define  TLBIE_ACTUAL_PAGE_2M           0x20
177 #define  TLBIE_ACTUAL_PAGE_1G           0x40
178
179 #define TLBIE_PRS_PARTITION_SCOPE       0x0
180 #define TLBIE_PRS_PROCESS_SCOPE 0x1
181
182 #define TLBIE_RIC_INVALIDATE_TLB        0x0     /* Invalidate just TLB */
183 #define TLBIE_RIC_INVALIDATE_PWC        0x1     /* Invalidate just PWC */
184 #define TLBIE_RIC_INVALIDATE_ALL        0x2     /* Invalidate TLB, PWC,
185                                                  * cached {proc, part}tab entries
186                                                  */
187 #define TLBIE_RIC_INVALIDATE_SEQ        0x3     /* HPT - only:
188                                                  * Invalidate a range of translations
189                                                  */
190
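/*
 * radix_tlbie() below builds the tlbie operands directly: RS carries the
 * PID in its upper word and the LPID in its lower word, while RB combines
 * the page-aligned effective address with the invalidation selector (IS)
 * and actual page size (AP) fields defined above.
 */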
191 static __always_inline void
192 radix_tlbie(uint8_t ric, uint8_t prs, uint16_t is, uint32_t pid, uint32_t lpid,
193                         vm_offset_t va, uint16_t ap)
194 {
195         uint64_t rb, rs;
196
197         MPASS((va & PAGE_MASK) == 0);
198
199         rs = ((uint64_t)pid << 32) | lpid;
200         rb = va | is | ap;
201         __asm __volatile(PPC_TLBIE_5(%0, %1, %2, %3, 1) : :
202                 "r" (rb), "r" (rs), "i" (ric), "i" (prs) : "memory");
203 }
204
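/*
 * The extra ptesync/tlbie against PID 0 in radix_tlbie_fixup() appears to
 * mirror the "fixup" sequence Linux issues to work around a POWER9 tlbie
 * erratum: a dummy invalidation is performed before the real one.
 */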
205 static __inline void
206 radix_tlbie_fixup(uint32_t pid, vm_offset_t va, int ap)
207 {
208
209         __asm __volatile("ptesync" ::: "memory");
210         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
211             TLBIEL_INVAL_PAGE, 0, 0, va, ap);
212         __asm __volatile("ptesync" ::: "memory");
213         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
214             TLBIEL_INVAL_PAGE, pid, 0, va, ap);
215 }
216
217 static __inline void
218 radix_tlbie_invlpg_user_4k(uint32_t pid, vm_offset_t va)
219 {
220
221         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
222                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_4K);
223         radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_4K);
224 }
225
226 static __inline void
227 radix_tlbie_invlpg_user_2m(uint32_t pid, vm_offset_t va)
228 {
229
230         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
231                 TLBIEL_INVAL_PAGE, pid, 0, va, TLBIE_ACTUAL_PAGE_2M);
232         radix_tlbie_fixup(pid, va, TLBIE_ACTUAL_PAGE_2M);
233 }
234
235 static __inline void
236 radix_tlbie_invlpwc_user(uint32_t pid)
237 {
238
239         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
240                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
241 }
242
243 static __inline void
244 radix_tlbie_flush_user(uint32_t pid)
245 {
246
247         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
248                 TLBIEL_INVAL_SET_PID, pid, 0, 0, 0);
249 }
250
251 static __inline void
252 radix_tlbie_invlpg_kernel_4k(vm_offset_t va)
253 {
254
255         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
256             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_4K);
257         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_4K);
258 }
259
260 static __inline void
261 radix_tlbie_invlpg_kernel_2m(vm_offset_t va)
262 {
263
264         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
265             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_2M);
266         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_2M);
267 }
268
269 /* 1GB pages aren't currently supported. */
270 static __inline __unused void
271 radix_tlbie_invlpg_kernel_1g(vm_offset_t va)
272 {
273
274         radix_tlbie(TLBIE_RIC_INVALIDATE_TLB, TLBIE_PRS_PROCESS_SCOPE,
275             TLBIEL_INVAL_PAGE, 0, 0, va, TLBIE_ACTUAL_PAGE_1G);
276         radix_tlbie_fixup(0, va, TLBIE_ACTUAL_PAGE_1G);
277 }
278
279 static __inline void
280 radix_tlbie_invlpwc_kernel(void)
281 {
282
283         radix_tlbie(TLBIE_RIC_INVALIDATE_PWC, TLBIE_PRS_PROCESS_SCOPE,
284             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
285 }
286
287 static __inline void
288 radix_tlbie_flush_kernel(void)
289 {
290
291         radix_tlbie(TLBIE_RIC_INVALIDATE_ALL, TLBIE_PRS_PROCESS_SCOPE,
292             TLBIEL_INVAL_SET_LPID, 0, 0, 0, 0);
293 }
294
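/*
 * The helpers below extract the per-level table indexes from a VA.
 * Assuming the usual radix geometry (4KB base pages, 2MB L3 and 1GB L2
 * leaf sizes) and RPTE_SHIFT == 9, the lower three levels each index 512
 * entries: the PTE index comes from VA bits 12-20, the L3 index from bits
 * 21-29 and the L2 index from bits 30-38.  The L1 index is not masked
 * with RPTE_MASK because the 64KB radix PGD holds 8192 entries and takes
 * all remaining VA bits above bit 38.
 */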
295 static __inline vm_pindex_t
296 pmap_l3e_pindex(vm_offset_t va)
297 {
298         return ((va & PG_FRAME) >> L3_PAGE_SIZE_SHIFT);
299 }
300
301 static __inline vm_pindex_t
302 pmap_pml3e_index(vm_offset_t va)
303 {
304
305         return ((va >> L3_PAGE_SIZE_SHIFT) & RPTE_MASK);
306 }
307
308 static __inline vm_pindex_t
309 pmap_pml2e_index(vm_offset_t va)
310 {
311         return ((va >> L2_PAGE_SIZE_SHIFT) & RPTE_MASK);
312 }
313
314 static __inline vm_pindex_t
315 pmap_pml1e_index(vm_offset_t va)
316 {
317         return ((va & PG_FRAME) >> L1_PAGE_SIZE_SHIFT);
318 }
319
320 /* Return various clipped indexes for a given VA */
321 static __inline vm_pindex_t
322 pmap_pte_index(vm_offset_t va)
323 {
324
325         return ((va >> PAGE_SHIFT) & RPTE_MASK);
326 }
327
328 /* Return a pointer to the PT slot that corresponds to a VA */
329 static __inline pt_entry_t *
330 pmap_l3e_to_pte(pt_entry_t *l3e, vm_offset_t va)
331 {
332         pt_entry_t *pte;
333         vm_paddr_t ptepa;
334
335         ptepa = (be64toh(*l3e) & NLB_MASK);
336         pte = (pt_entry_t *)PHYS_TO_DMAP(ptepa);
337         return (&pte[pmap_pte_index(va)]);
338 }
339
340 /* Return a pointer to the PD slot that corresponds to a VA */
341 static __inline pt_entry_t *
342 pmap_l2e_to_l3e(pt_entry_t *l2e, vm_offset_t va)
343 {
344         pt_entry_t *l3e;
345         vm_paddr_t l3pa;
346
347         l3pa = (be64toh(*l2e) & NLB_MASK);
348         l3e = (pml3_entry_t *)PHYS_TO_DMAP(l3pa);
349         return (&l3e[pmap_pml3e_index(va)]);
350 }
351
352 /* Return a pointer to the PD slot that corresponds to a VA */
353 static __inline pt_entry_t *
354 pmap_l1e_to_l2e(pt_entry_t *l1e, vm_offset_t va)
355 {
356         pt_entry_t *l2e;
357         vm_paddr_t l2pa;
358
359         l2pa = (be64toh(*l1e) & NLB_MASK);
360
361         l2e = (pml2_entry_t *)PHYS_TO_DMAP(l2pa);
362         return (&l2e[pmap_pml2e_index(va)]);
363 }
364
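/*
 * pmap_pml1e() through pmap_pte() below implement the software walk of
 * the radix tree: each step pulls the next-level base (NLB) physical
 * address out of the big-endian entry, maps it through the direct map
 * (PHYS_TO_DMAP) and indexes it with that level's VA bits, returning
 * NULL as soon as an invalid entry is encountered.
 */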
365 static __inline pml1_entry_t *
366 pmap_pml1e(pmap_t pmap, vm_offset_t va)
367 {
368
369         return (&pmap->pm_pml1[pmap_pml1e_index(va)]);
370 }
371
372 static pt_entry_t *
373 pmap_pml2e(pmap_t pmap, vm_offset_t va)
374 {
375         pt_entry_t *l1e;
376
377         l1e = pmap_pml1e(pmap, va);
378         if (l1e == NULL || (be64toh(*l1e) & RPTE_VALID) == 0)
379                 return (NULL);
380         return (pmap_l1e_to_l2e(l1e, va));
381 }
382
383 static __inline pt_entry_t *
384 pmap_pml3e(pmap_t pmap, vm_offset_t va)
385 {
386         pt_entry_t *l2e;
387
388         l2e = pmap_pml2e(pmap, va);
389         if (l2e == NULL || (be64toh(*l2e) & RPTE_VALID) == 0)
390                 return (NULL);
391         return (pmap_l2e_to_l3e(l2e, va));
392 }
393
394 static __inline pt_entry_t *
395 pmap_pte(pmap_t pmap, vm_offset_t va)
396 {
397         pt_entry_t *l3e;
398
399         l3e = pmap_pml3e(pmap, va);
400         if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
401                 return (NULL);
402         return (pmap_l3e_to_pte(l3e, va));
403 }
404
405 int nkpt = 64;
406 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
407     "Number of kernel page table pages allocated on bootup");
408
409 vm_paddr_t dmaplimit;
410
411 SYSCTL_DECL(_vm_pmap);
412
413 #ifdef INVARIANTS
414 #define VERBOSE_PMAP 0
415 #define VERBOSE_PROTECT 0
416 static int pmap_logging;
417 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_logging, CTLFLAG_RWTUN,
418     &pmap_logging, 0, "verbose debug logging");
419 #endif
420
421 static u_int64_t        KPTphys;        /* phys addr of kernel level 1 */
422
423 //static vm_paddr_t     KERNend;        /* phys addr of end of bootstrap data */
424
425 static vm_offset_t qframe = 0;
426 static struct mtx qframe_mtx;
427
428 void mmu_radix_activate(struct thread *);
429 void mmu_radix_advise(pmap_t, vm_offset_t, vm_offset_t, int);
430 void mmu_radix_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *,
431     vm_size_t);
432 void mmu_radix_clear_modify(vm_page_t);
433 void mmu_radix_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
434 int mmu_radix_decode_kernel_ptr(vm_offset_t, int *, vm_offset_t *);
435 int mmu_radix_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, u_int, int8_t);
436 void mmu_radix_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
437         vm_prot_t);
438 void mmu_radix_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
439 vm_paddr_t mmu_radix_extract(pmap_t pmap, vm_offset_t va);
440 vm_page_t mmu_radix_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
441 void mmu_radix_kenter(vm_offset_t, vm_paddr_t);
442 vm_paddr_t mmu_radix_kextract(vm_offset_t);
443 void mmu_radix_kremove(vm_offset_t);
444 boolean_t mmu_radix_is_modified(vm_page_t);
445 boolean_t mmu_radix_is_prefaultable(pmap_t, vm_offset_t);
446 boolean_t mmu_radix_is_referenced(vm_page_t);
447 void mmu_radix_object_init_pt(pmap_t, vm_offset_t, vm_object_t,
448         vm_pindex_t, vm_size_t);
449 boolean_t mmu_radix_page_exists_quick(pmap_t, vm_page_t);
450 void mmu_radix_page_init(vm_page_t);
451 boolean_t mmu_radix_page_is_mapped(vm_page_t m);
452 void mmu_radix_page_set_memattr(vm_page_t, vm_memattr_t);
453 int mmu_radix_page_wired_mappings(vm_page_t);
454 int mmu_radix_pinit(pmap_t);
455 void mmu_radix_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
456 bool mmu_radix_ps_enabled(pmap_t);
457 void mmu_radix_qenter(vm_offset_t, vm_page_t *, int);
458 void mmu_radix_qremove(vm_offset_t, int);
459 vm_offset_t mmu_radix_quick_enter_page(vm_page_t);
460 void mmu_radix_quick_remove_page(vm_offset_t);
461 boolean_t mmu_radix_ts_referenced(vm_page_t);
462 void mmu_radix_release(pmap_t);
463 void mmu_radix_remove(pmap_t, vm_offset_t, vm_offset_t);
464 void mmu_radix_remove_all(vm_page_t);
465 void mmu_radix_remove_pages(pmap_t);
466 void mmu_radix_remove_write(vm_page_t);
467 void mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz);
468 void mmu_radix_unwire(pmap_t, vm_offset_t, vm_offset_t);
469 void mmu_radix_zero_page(vm_page_t);
470 void mmu_radix_zero_page_area(vm_page_t, int, int);
471 int mmu_radix_change_attr(vm_offset_t, vm_size_t, vm_memattr_t);
472 void mmu_radix_page_array_startup(long pages);
473
474 #include "mmu_oea64.h"
475
476 /*
477  * Kernel MMU interface
478  */
479
480 static void     mmu_radix_bootstrap(vm_offset_t, vm_offset_t);
481
482 static void mmu_radix_copy_page(vm_page_t, vm_page_t);
483 static void mmu_radix_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
484     vm_page_t *mb, vm_offset_t b_offset, int xfersize);
485 static void mmu_radix_growkernel(vm_offset_t);
486 static void mmu_radix_init(void);
487 static int mmu_radix_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
488 static vm_offset_t mmu_radix_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
489 static void mmu_radix_pinit0(pmap_t);
490
491 static void *mmu_radix_mapdev(vm_paddr_t, vm_size_t);
492 static void *mmu_radix_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
493 static void mmu_radix_unmapdev(void *, vm_size_t);
494 static void mmu_radix_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
495 static boolean_t mmu_radix_dev_direct_mapped(vm_paddr_t, vm_size_t);
496 static void mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz, void **va);
497 static void mmu_radix_scan_init(void);
498 static void     mmu_radix_cpu_bootstrap(int ap);
499 static void     mmu_radix_tlbie_all(void);
500
501 static struct pmap_funcs mmu_radix_methods = {
502         .bootstrap = mmu_radix_bootstrap,
503         .copy_page = mmu_radix_copy_page,
504         .copy_pages = mmu_radix_copy_pages,
505         .cpu_bootstrap = mmu_radix_cpu_bootstrap,
506         .growkernel = mmu_radix_growkernel,
507         .init = mmu_radix_init,
508         .map =                  mmu_radix_map,
509         .mincore =              mmu_radix_mincore,
510         .pinit = mmu_radix_pinit,
511         .pinit0 = mmu_radix_pinit0,
512
513         .mapdev = mmu_radix_mapdev,
514         .mapdev_attr = mmu_radix_mapdev_attr,
515         .unmapdev = mmu_radix_unmapdev,
516         .kenter_attr = mmu_radix_kenter_attr,
517         .dev_direct_mapped = mmu_radix_dev_direct_mapped,
518         .dumpsys_pa_init = mmu_radix_scan_init,
519         .dumpsys_map_chunk = mmu_radix_dumpsys_map,
520         .page_is_mapped = mmu_radix_page_is_mapped,
521         .ps_enabled = mmu_radix_ps_enabled,
522         .align_superpage = mmu_radix_align_superpage,
523         .object_init_pt = mmu_radix_object_init_pt,
524         .protect = mmu_radix_protect,
525         /* pmap dispatcher interface */
526         .clear_modify = mmu_radix_clear_modify,
527         .copy = mmu_radix_copy,
528         .enter = mmu_radix_enter,
529         .enter_object = mmu_radix_enter_object,
530         .enter_quick = mmu_radix_enter_quick,
531         .extract = mmu_radix_extract,
532         .extract_and_hold = mmu_radix_extract_and_hold,
533         .is_modified = mmu_radix_is_modified,
534         .is_prefaultable = mmu_radix_is_prefaultable,
535         .is_referenced = mmu_radix_is_referenced,
536         .ts_referenced = mmu_radix_ts_referenced,
537         .page_exists_quick = mmu_radix_page_exists_quick,
538         .page_init = mmu_radix_page_init,
539         .page_wired_mappings =  mmu_radix_page_wired_mappings,
540         .qenter = mmu_radix_qenter,
541         .qremove = mmu_radix_qremove,
542         .release = mmu_radix_release,
543         .remove = mmu_radix_remove,
544         .remove_all = mmu_radix_remove_all,
545         .remove_write = mmu_radix_remove_write,
546         .sync_icache = mmu_radix_sync_icache,
547         .unwire = mmu_radix_unwire,
548         .zero_page = mmu_radix_zero_page,
549         .zero_page_area = mmu_radix_zero_page_area,
550         .activate = mmu_radix_activate,
551         .quick_enter_page =  mmu_radix_quick_enter_page,
552         .quick_remove_page =  mmu_radix_quick_remove_page,
553         .page_set_memattr = mmu_radix_page_set_memattr,
554         .page_array_startup =  mmu_radix_page_array_startup,
555
556         /* Internal interfaces */
557         .kenter = mmu_radix_kenter,
558         .kextract = mmu_radix_kextract,
559         .kremove = mmu_radix_kremove,
560         .change_attr = mmu_radix_change_attr,
561         .decode_kernel_ptr =  mmu_radix_decode_kernel_ptr,
562
563         .tlbie_all = mmu_radix_tlbie_all,
564 };
565
566 MMU_DEF(mmu_radix, MMU_TYPE_RADIX, mmu_radix_methods);
567
568 static boolean_t pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
569         struct rwlock **lockp);
570 static boolean_t pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va);
571 static int pmap_unuse_pt(pmap_t, vm_offset_t, pml3_entry_t, struct spglist *);
572 static int pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
573     struct spglist *free, struct rwlock **lockp);
574 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
575     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
576 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
577 static bool pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *pde,
578     struct spglist *free);
579 static bool     pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
580         pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp);
581
582 static bool     pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e,
583                     u_int flags, struct rwlock **lockp);
584 #if VM_NRESERVLEVEL > 0
585 static void     pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
586         struct rwlock **lockp);
587 #endif
588 static void     pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
589 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
590 static vm_page_t mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
591         vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate);
592
593 static bool     pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
594         vm_prot_t prot, struct rwlock **lockp);
595 static int      pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde,
596         u_int flags, vm_page_t m, struct rwlock **lockp);
597
598 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
599 static void free_pv_chunk(struct pv_chunk *pc);
600 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp);
601 static vm_page_t pmap_allocl3e(pmap_t pmap, vm_offset_t va,
602         struct rwlock **lockp);
603 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
604         struct rwlock **lockp);
605 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
606     struct spglist *free);
607 static boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free);
608
609 static void pmap_invalidate_page(pmap_t pmap, vm_offset_t start);
610 static void pmap_invalidate_all(pmap_t pmap);
611 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush);
612
613 /*
614  * Internal flags for pmap_enter()'s helper functions.
615  */
616 #define PMAP_ENTER_NORECLAIM    0x1000000       /* Don't reclaim PV entries. */
617 #define PMAP_ENTER_NOREPLACE    0x2000000       /* Don't replace mappings. */
618
619 #define UNIMPLEMENTED() panic("%s not implemented", __func__)
620 #define UNTESTED() panic("%s not yet tested", __func__)
621
622 /* Number of supported PID bits */
623 static unsigned int isa3_pid_bits;
624
625 /* PID to start allocating from */
626 static unsigned int isa3_base_pid;
627
628 #define PROCTAB_SIZE_SHIFT      (isa3_pid_bits + 4)
629 #define PROCTAB_ENTRIES (1ul << isa3_pid_bits)
630
631 /*
632  * Map of physical memory regions.
633  */
634 static struct   mem_region *regions, *pregions;
635 static struct   numa_mem_region *numa_pregions;
636 static u_int    phys_avail_count;
637 static int      regions_sz, pregions_sz, numa_pregions_sz;
638 static struct pate *isa3_parttab;
639 static struct prte *isa3_proctab;
640 static vmem_t *asid_arena;
641
642 extern void bs_remap_earlyboot(void);
643
644 #define RADIX_PGD_SIZE_SHIFT    16
645 #define RADIX_PGD_SIZE  (1UL << RADIX_PGD_SIZE_SHIFT)
646
647 #define RADIX_PGD_INDEX_SHIFT   (RADIX_PGD_SIZE_SHIFT-3)
648 #define NL2EPG (PAGE_SIZE/sizeof(pml2_entry_t))
649 #define NL3EPG (PAGE_SIZE/sizeof(pml3_entry_t))
650
651 #define NUPML1E         (RADIX_PGD_SIZE/sizeof(uint64_t))       /* number of userland PML1 pages */
652 #define NUPDPE          (NUPML1E * NL2EPG) /* number of userland PDP pages */
653 #define NUPDE           (NUPDPE * NL3EPG)       /* number of userland PD entries */
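/*
 * With a 64KB top-level directory of 8-byte entries, NUPML1E is 8192;
 * with 4KB pages, NL2EPG == NL3EPG == 512, so each lower level multiplies
 * the number of possible user mappings by 512.
 */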
654
655 /* POWER9 only permits a 64k partition table size. */
656 #define PARTTAB_SIZE_SHIFT      16
657 #define PARTTAB_SIZE    (1UL << PARTTAB_SIZE_SHIFT)
658
659 #define PARTTAB_HR              (1UL << 63) /* host uses radix */
660 #define PARTTAB_GR              (1UL << 63) /* guest uses radix; must match host */
661
662 /* TLB flush actions. Used as argument to tlbiel_flush() */
663 enum {
664         TLB_INVAL_SCOPE_LPID = 2,       /* invalidate TLBs for current LPID */
665         TLB_INVAL_SCOPE_GLOBAL = 3,     /* invalidate all TLBs */
666 };
667
668 #define NPV_LIST_LOCKS  MAXCPU
669 static int pmap_initialized;
670 static vm_paddr_t proctab0pa;
671 static vm_paddr_t parttab_phys;
672 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
673
674 /*
675  * Data for the pv entry allocation mechanism.
676  * Updates to pv_invl_gen are protected by the pv_list_locks[]
677  * elements, but reads are not.
678  */
679 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
680 static struct mtx __exclusive_cache_line pv_chunks_mutex;
681 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
682 static struct md_page *pv_table;
683 static struct md_page pv_dummy;
684
685 #ifdef PV_STATS
686 #define PV_STAT(x)      do { x ; } while (0)
687 #else
688 #define PV_STAT(x)      do { } while (0)
689 #endif
690
691 #define pa_radix_index(pa)      ((pa) >> L3_PAGE_SIZE_SHIFT)
692 #define pa_to_pvh(pa)   (&pv_table[pa_radix_index(pa)])
693
694 #define PHYS_TO_PV_LIST_LOCK(pa)        \
695                         (&pv_list_locks[pa_radix_index(pa) % NPV_LIST_LOCKS])
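/*
 * PV list locks are selected by hashing the 2MB frame number
 * (pa >> L3_PAGE_SIZE_SHIFT) over NPV_LIST_LOCKS, so all 4KB pages that
 * belong to the same potential superpage share one lock along with the
 * superpage's pv_table entry.
 */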
696
697 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)  do {    \
698         struct rwlock **_lockp = (lockp);               \
699         struct rwlock *_new_lock;                       \
700                                                         \
701         _new_lock = PHYS_TO_PV_LIST_LOCK(pa);           \
702         if (_new_lock != *_lockp) {                     \
703                 if (*_lockp != NULL)                    \
704                         rw_wunlock(*_lockp);            \
705                 *_lockp = _new_lock;                    \
706                 rw_wlock(*_lockp);                      \
707         }                                               \
708 } while (0)
709
710 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)        \
711         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
712
713 #define RELEASE_PV_LIST_LOCK(lockp)             do {    \
714         struct rwlock **_lockp = (lockp);               \
715                                                         \
716         if (*_lockp != NULL) {                          \
717                 rw_wunlock(*_lockp);                    \
718                 *_lockp = NULL;                         \
719         }                                               \
720 } while (0)
721
722 #define VM_PAGE_TO_PV_LIST_LOCK(m)      \
723         PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
724
725 /*
726  * We support 52 bits, hence:
727  * bits 52 - 31 = 21, 0b10101
728  * RTS encoding details
729  * bits 0 - 2 of rts -> bits 5 - 7 of the entry
730  * bits 3 - 4 of rts -> bits 61 - 62 of the entry
731  */
732 #define RTS_SIZE ((0x2UL << 61) | (0x5UL << 5))
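/*
 * Worked out: RTS = 52 - 31 = 21 = 0b10101; the low three bits (0b101)
 * land in bits 5-7 and the high two bits (0b10) in bits 61-62, giving
 * exactly (0x2UL << 61) | (0x5UL << 5).
 */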
733
734 static int powernv_enabled = 1;
735
736 static __always_inline void
737 tlbiel_radix_set_isa300(uint32_t set, uint32_t is,
738         uint32_t pid, uint32_t ric, uint32_t prs)
739 {
740         uint64_t rb;
741         uint64_t rs;
742
743         rb = PPC_BITLSHIFT_VAL(set, 51) | PPC_BITLSHIFT_VAL(is, 53);
744         rs = PPC_BITLSHIFT_VAL((uint64_t)pid, 31);
745
746         __asm __volatile(PPC_TLBIEL(%0, %1, %2, %3, 1)
747                      : : "r"(rb), "r"(rs), "i"(ric), "i"(prs)
748                      : "memory");
749 }
750
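/*
 * tlbiel, unlike the broadcast tlbie, acts only on the local core and on
 * a single congruence class per invocation, so the full local flush below
 * iterates over all POWER9_TLB_SETS_RADIX sets, once for partition-scoped
 * and once for process-scoped entries.
 */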
751 static void
752 tlbiel_flush_isa3(uint32_t num_sets, uint32_t is)
753 {
754         uint32_t set;
755
756         __asm __volatile("ptesync": : :"memory");
757
758         /*
759          * Flush the first set of the TLB, and the entire Page Walk Cache
760          * and partition table entries. Then flush the remaining sets of the
761          * TLB.
762          */
763         if (is == TLB_INVAL_SCOPE_GLOBAL) {
764                 tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 0);
765                 for (set = 1; set < num_sets; set++)
766                         tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 0);
767         }
768
769         /* Do the same for process scoped entries. */
770         tlbiel_radix_set_isa300(0, is, 0, RIC_FLUSH_ALL, 1);
771         for (set = 1; set < num_sets; set++)
772                 tlbiel_radix_set_isa300(set, is, 0, RIC_FLUSH_TLB, 1);
773
774         __asm __volatile("ptesync": : :"memory");
775 }
776
777 static void
778 mmu_radix_tlbiel_flush(int scope)
779 {
780         MPASS(scope == TLB_INVAL_SCOPE_LPID ||
781                   scope == TLB_INVAL_SCOPE_GLOBAL);
782
783         tlbiel_flush_isa3(POWER9_TLB_SETS_RADIX, scope);
784         __asm __volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
785 }
786
787 static void
788 mmu_radix_tlbie_all(void)
789 {
790         if (powernv_enabled)
791                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
792         else
793                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
794 }
795
796 static void
797 mmu_radix_init_amor(void)
798 {
799         /*
800          * In HV mode, we init AMOR (Authority Mask Override Register) so
801          * that the hypervisor and guest can set up the IAMR (Instruction
802          * Authority Mask Register), enable key 0 and set it to 1.
803          *
804          * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
805          */
806         mtspr(SPR_AMOR, (3ul << 62));
807 }
808
809 static void
810 mmu_radix_init_iamr(void)
811 {
812         /*
813          * Radix always uses key0 of the IAMR to determine if an access is
814          * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
815          * fetch.
816          */
817         mtspr(SPR_IAMR, (1ul << 62));
818 }
819
820 static void
821 mmu_radix_pid_set(pmap_t pmap)
822 {
823
824         mtspr(SPR_PID, pmap->pm_pid);
825         isync();
826 }
827
828 /* Quick sort callout for comparing physical addresses. */
829 static int
830 pa_cmp(const void *a, const void *b)
831 {
832         const vm_paddr_t *pa = a, *pb = b;
833
834         if (*pa < *pb)
835                 return (-1);
836         else if (*pa > *pb)
837                 return (1);
838         else
839                 return (0);
840 }
841
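/*
 * The store helpers below always write entries big-endian (htobe64), the
 * layout the radix table walker expects regardless of the endianness the
 * kernel runs in.  pte_store() requires at least one EAA permission bit
 * and marks the entry as a valid leaf; pde_store() marks a directory
 * entry valid and evidently reuses RPTE_SHIFT (9, i.e. 512 entries) as
 * the next-level-size field of the new directory.
 */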
842 #define pte_load_store(ptep, pte)       atomic_swap_long(ptep, pte)
843 #define pte_load_clear(ptep)            atomic_swap_long(ptep, 0)
844 #define pte_store(ptep, pte) do {          \
845         MPASS((pte) & (RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_X));  \
846         *(u_long *)(ptep) = htobe64((u_long)((pte) | PG_V | RPTE_LEAF)); \
847 } while (0)
848 /*
849  * NB: should only be used for adding directories - not for direct mappings
850  */
851 #define pde_store(ptep, pa) do {                                \
852         *(u_long *)(ptep) = htobe64((u_long)(pa|RPTE_VALID|RPTE_SHIFT)); \
853 } while (0)
854
855 #define pte_clear(ptep) do {                                    \
856                 *(u_long *)(ptep) = (u_long)(0);                \
857 } while (0)
858
859 #define PMAP_PDE_SUPERPAGE      (1 << 8)        /* supports 2MB superpages */
860
861 /*
862  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
863  * (PTE) page mappings have identical settings for the following fields:
864  */
865 #define PG_PTE_PROMOTE  (PG_X | PG_MANAGED | PG_W | PG_PTE_CACHE | \
866             PG_M | PG_A | RPTE_EAA_MASK | PG_V)
867
868 static __inline void
869 pmap_resident_count_inc(pmap_t pmap, int count)
870 {
871
872         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
873         pmap->pm_stats.resident_count += count;
874 }
875
876 static __inline void
877 pmap_resident_count_dec(pmap_t pmap, int count)
878 {
879
880         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
881         KASSERT(pmap->pm_stats.resident_count >= count,
882             ("pmap %p resident count underflow %ld %d", pmap,
883             pmap->pm_stats.resident_count, count));
884         pmap->pm_stats.resident_count -= count;
885 }
886
887 static void
888 pagezero(vm_offset_t va)
889 {
890         va = trunc_page(va);
891
892         bzero((void *)va, PAGE_SIZE);
893 }
894
895 static uint64_t
896 allocpages(int n)
897 {
898         u_int64_t ret;
899
900         ret = moea64_bootstrap_alloc(n * PAGE_SIZE, PAGE_SIZE);
901         for (int i = 0; i < n; i++)
902                 pagezero(PHYS_TO_DMAP(ret + i * PAGE_SIZE));
903         return (ret);
904 }
905
906 static pt_entry_t *
907 kvtopte(vm_offset_t va)
908 {
909         pt_entry_t *l3e;
910
911         l3e = pmap_pml3e(kernel_pmap, va);
912         if (l3e == NULL || (be64toh(*l3e) & RPTE_VALID) == 0)
913                 return (NULL);
914         return (pmap_l3e_to_pte(l3e, va));
915 }
916
917 void
918 mmu_radix_kenter(vm_offset_t va, vm_paddr_t pa)
919 {
920         pt_entry_t *pte;
921
922         pte = kvtopte(va);
923         MPASS(pte != NULL);
924         *pte = htobe64(pa | RPTE_VALID | RPTE_LEAF | RPTE_EAA_R | \
925             RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A);
926 }
927
928 bool
929 mmu_radix_ps_enabled(pmap_t pmap)
930 {
931         return (superpages_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
932 }
933
934 static pt_entry_t *
935 pmap_nofault_pte(pmap_t pmap, vm_offset_t va, int *is_l3e)
936 {
937         pml3_entry_t *l3e;
938         pt_entry_t *pte;
939
940         va &= PG_PS_FRAME;
941         l3e = pmap_pml3e(pmap, va);
942         if (l3e == NULL || (be64toh(*l3e) & PG_V) == 0)
943                 return (NULL);
944
945         if (be64toh(*l3e) & RPTE_LEAF) {
946                 *is_l3e = 1;
947                 return (l3e);
948         }
949         *is_l3e = 0;
950         va &= PG_FRAME;
951         pte = pmap_l3e_to_pte(l3e, va);
952         if (pte == NULL || (be64toh(*pte) & PG_V) == 0)
953                 return (NULL);
954         return (pte);
955 }
956
957 int
958 pmap_nofault(pmap_t pmap, vm_offset_t va, vm_prot_t flags)
959 {
960         pt_entry_t *pte;
961         pt_entry_t startpte, origpte, newpte;
962         vm_page_t m;
963         int is_l3e;
964
965         startpte = 0;
966  retry:
967         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL)
968                 return (KERN_INVALID_ADDRESS);
969         origpte = newpte = be64toh(*pte);
970         if (startpte == 0) {
971                 startpte = origpte;
972                 if (((flags & VM_PROT_WRITE) && (startpte & PG_M)) ||
973                     ((flags & VM_PROT_READ) && (startpte & PG_A))) {
974                         pmap_invalidate_all(pmap);
975 #ifdef INVARIANTS
976                         if (VERBOSE_PMAP || pmap_logging)
977                                 printf("%s(%p, %#lx, %#x) (%#lx) -- invalidate all\n",
978                                     __func__, pmap, va, flags, origpte);
979 #endif
980                         return (KERN_FAILURE);
981                 }
982         }
983 #ifdef INVARIANTS
984         if (VERBOSE_PMAP || pmap_logging)
985                 printf("%s(%p, %#lx, %#x) (%#lx)\n", __func__, pmap, va,
986                     flags, origpte);
987 #endif
988         PMAP_LOCK(pmap);
989         if ((pte = pmap_nofault_pte(pmap, va, &is_l3e)) == NULL ||
990             be64toh(*pte) != origpte) {
991                 PMAP_UNLOCK(pmap);
992                 return (KERN_FAILURE);
993         }
994         m = PHYS_TO_VM_PAGE(newpte & PG_FRAME);
995         MPASS(m != NULL);
996         switch (flags) {
997         case VM_PROT_READ:
998                 if ((newpte & (RPTE_EAA_R|RPTE_EAA_X)) == 0)
999                         goto protfail;
1000                 newpte |= PG_A;
1001                 vm_page_aflag_set(m, PGA_REFERENCED);
1002                 break;
1003         case VM_PROT_WRITE:
1004                 if ((newpte & RPTE_EAA_W) == 0)
1005                         goto protfail;
1006                 if (is_l3e)
1007                         goto protfail;
1008                 newpte |= PG_M;
1009                 vm_page_dirty(m);
1010                 break;
1011         case VM_PROT_EXECUTE:
1012                 if ((newpte & RPTE_EAA_X) == 0)
1013                         goto protfail;
1014                 newpte |= PG_A;
1015                 vm_page_aflag_set(m, PGA_REFERENCED);
1016                 break;
1017         }
1018
1019         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
1020                 goto retry;
1021         ptesync();
1022         PMAP_UNLOCK(pmap);
1023         if (startpte == newpte)
1024                 return (KERN_FAILURE);
1025         return (0);
1026  protfail:
1027         PMAP_UNLOCK(pmap);
1028         return (KERN_PROTECTION_FAILURE);
1029 }
1030
1031 /*
1032  * Returns TRUE if the given page is mapped individually or as part of
1033  * a 2mpage.  Otherwise, returns FALSE.
1034  */
1035 boolean_t
1036 mmu_radix_page_is_mapped(vm_page_t m)
1037 {
1038         struct rwlock *lock;
1039         boolean_t rv;
1040
1041         if ((m->oflags & VPO_UNMANAGED) != 0)
1042                 return (FALSE);
1043         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
1044         rw_rlock(lock);
1045         rv = !TAILQ_EMPTY(&m->md.pv_list) ||
1046             ((m->flags & PG_FICTITIOUS) == 0 &&
1047             !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
1048         rw_runlock(lock);
1049         return (rv);
1050 }
1051
1052 /*
1053  * Determine the appropriate bits to set in a PTE or PDE for a specified
1054  * caching mode.
1055  */
1056 static int
1057 pmap_cache_bits(vm_memattr_t ma)
1058 {
1059         if (ma != VM_MEMATTR_DEFAULT) {
1060                 switch (ma) {
1061                 case VM_MEMATTR_UNCACHEABLE:
1062                         return (RPTE_ATTR_GUARDEDIO);
1063                 case VM_MEMATTR_CACHEABLE:
1064                         return (RPTE_ATTR_MEM);
1065                 case VM_MEMATTR_WRITE_BACK:
1066                 case VM_MEMATTR_PREFETCHABLE:
1067                 case VM_MEMATTR_WRITE_COMBINING:
1068                         return (RPTE_ATTR_UNGUARDEDIO);
1069                 }
1070         }
1071         return (0);
1072 }
1073
1074 static void
1075 pmap_invalidate_page(pmap_t pmap, vm_offset_t start)
1076 {
1077         ptesync();
1078         if (pmap == kernel_pmap)
1079                 radix_tlbie_invlpg_kernel_4k(start);
1080         else
1081                 radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1082         ttusync();
1083 }
1084
1085 static void
1086 pmap_invalidate_page_2m(pmap_t pmap, vm_offset_t start)
1087 {
1088         ptesync();
1089         if (pmap == kernel_pmap)
1090                 radix_tlbie_invlpg_kernel_2m(start);
1091         else
1092                 radix_tlbie_invlpg_user_2m(pmap->pm_pid, start);
1093         ttusync();
1094 }
1095
1096 static void
1097 pmap_invalidate_pwc(pmap_t pmap)
1098 {
1099         ptesync();
1100         if (pmap == kernel_pmap)
1101                 radix_tlbie_invlpwc_kernel();
1102         else
1103                 radix_tlbie_invlpwc_user(pmap->pm_pid);
1104         ttusync();
1105 }
1106
1107 static void
1108 pmap_invalidate_range(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1109 {
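        /*
         * Invalidating page by page only pays off for small ranges; past
         * roughly eight pages a full flush of the PID (or of the kernel
         * LPID) is assumed to be cheaper.
         */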
1110         if (((end - start) >> PAGE_SHIFT) > 8) {
1111                 pmap_invalidate_all(pmap);
1112                 return;
1113         }
1114         ptesync();
1115         if (pmap == kernel_pmap) {
1116                 while (start < end) {
1117                         radix_tlbie_invlpg_kernel_4k(start);
1118                         start += PAGE_SIZE;
1119                 }
1120         } else {
1121                 while (start < end) {
1122                         radix_tlbie_invlpg_user_4k(pmap->pm_pid, start);
1123                         start += PAGE_SIZE;
1124                 }
1125         }
1126         ttusync();
1127 }
1128
1129 static void
1130 pmap_invalidate_all(pmap_t pmap)
1131 {
1132         ptesync();
1133         if (pmap == kernel_pmap)
1134                 radix_tlbie_flush_kernel();
1135         else
1136                 radix_tlbie_flush_user(pmap->pm_pid);
1137         ttusync();
1138 }
1139
1140 static void
1141 pmap_invalidate_l3e_page(pmap_t pmap, vm_offset_t va, pml3_entry_t l3e)
1142 {
1143
1144         /*
1145          * When the PDE has PG_PROMOTED set, the 2MB page mapping was created
1146          * by a promotion that did not invalidate the 512 4KB page mappings
1147          * that might exist in the TLB.  Consequently, at this point, the TLB
1148          * may hold both 4KB and 2MB page mappings for the address range [va,
1149          * va + L3_PAGE_SIZE).  Therefore, the entire range must be invalidated here.
1150          * In contrast, when PG_PROMOTED is clear, the TLB will not hold any
1151          * 4KB page mappings for the address range [va, va + L3_PAGE_SIZE), and so a
1152          * single INVLPG suffices to invalidate the 2MB page mapping from the
1153          * TLB.
1154          */
1155         ptesync();
1156         if ((l3e & PG_PROMOTED) != 0)
1157                 pmap_invalidate_range(pmap, va, va + L3_PAGE_SIZE - 1);
1158         else
1159                 pmap_invalidate_page_2m(pmap, va);
1160
1161         pmap_invalidate_pwc(pmap);
1162 }
1163
1164 static __inline struct pv_chunk *
1165 pv_to_chunk(pv_entry_t pv)
1166 {
1167
1168         return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1169 }
1170
1171 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1172
1173 #define PC_FREE0        0xfffffffffffffffful
1174 #define PC_FREE1        ((1ul << (_NPCPV % 64)) - 1)
1175
1176 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1 };
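/*
 * Each pv_chunk tracks _NPCPV pv entries with a two-word (_NPCM == 2)
 * bitmap in which a set bit marks a free slot; PC_FREE0 covers the first
 * 64 slots and PC_FREE1 the remaining _NPCPV % 64.
 */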
1177
1178 /*
1179  * Ensure that the number of spare PV entries in the specified pmap meets or
1180  * exceeds the given count, "needed".
1181  *
1182  * The given PV list lock may be released.
1183  */
1184 static void
1185 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1186 {
1187         struct pch new_tail;
1188         struct pv_chunk *pc;
1189         vm_page_t m;
1190         int avail, free;
1191         bool reclaimed;
1192
1193         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1194         KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1195
1196         /*
1197          * Newly allocated PV chunks must be stored in a private list until
1198          * the required number of PV chunks have been allocated.  Otherwise,
1199          * reclaim_pv_chunk() could recycle one of these chunks.  In
1200          * contrast, these chunks must be added to the pmap upon allocation.
1201          */
1202         TAILQ_INIT(&new_tail);
1203 retry:
1204         avail = 0;
1205         TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1206                 //              if ((cpu_feature2 & CPUID2_POPCNT) == 0)
1207                 bit_count((bitstr_t *)pc->pc_map, 0,
1208                                   sizeof(pc->pc_map) * NBBY, &free);
1209 #if 0
1210                 free = popcnt_pc_map_pq(pc->pc_map);
1211 #endif
1212                 if (free == 0)
1213                         break;
1214                 avail += free;
1215                 if (avail >= needed)
1216                         break;
1217         }
1218         for (reclaimed = false; avail < needed; avail += _NPCPV) {
1219                 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1220                 if (m == NULL) {
1221                         m = reclaim_pv_chunk(pmap, lockp);
1222                         if (m == NULL)
1223                                 goto retry;
1224                         reclaimed = true;
1225                 }
1226                 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1227                 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1228                 dump_add_page(m->phys_addr);
1229                 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1230                 pc->pc_pmap = pmap;
1231                 pc->pc_map[0] = PC_FREE0;
1232                 pc->pc_map[1] = PC_FREE1;
1233                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1234                 TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1235                 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
1236
1237                 /*
1238                  * The reclaim might have freed a chunk from the current pmap.
1239                  * If that chunk contained available entries, we need to
1240                  * re-count the number of available entries.
1241                  */
1242                 if (reclaimed)
1243                         goto retry;
1244         }
1245         if (!TAILQ_EMPTY(&new_tail)) {
1246                 mtx_lock(&pv_chunks_mutex);
1247                 TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1248                 mtx_unlock(&pv_chunks_mutex);
1249         }
1250 }
1251
1252 /*
1253  * First find and then remove the pv entry for the specified pmap and virtual
1254  * address from the specified pv list.  Returns the pv entry if found and NULL
1255  * otherwise.  This operation can be performed on pv lists for either 4KB or
1256  * 2MB page mappings.
1257  */
1258 static __inline pv_entry_t
1259 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1260 {
1261         pv_entry_t pv;
1262
1263         TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
1264 #ifdef INVARIANTS
1265                 if (PV_PMAP(pv) == NULL) {
1266                         printf("corrupted pv_chunk/pv %p\n", pv);
1267                         printf("pv_chunk: %64D\n", pv_to_chunk(pv), ":");
1268                 }
1269                 MPASS(PV_PMAP(pv) != NULL);
1270                 MPASS(pv->pv_va != 0);
1271 #endif
1272                 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1273                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
1274                         pvh->pv_gen++;
1275                         break;
1276                 }
1277         }
1278         return (pv);
1279 }
1280
1281 /*
1282  * After demotion from a 2MB page mapping to 512 4KB page mappings,
1283  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1284  * entries for each of the 4KB page mappings.
1285  */
1286 static void
1287 pmap_pv_demote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1288     struct rwlock **lockp)
1289 {
1290         struct md_page *pvh;
1291         struct pv_chunk *pc;
1292         pv_entry_t pv;
1293         vm_offset_t va_last;
1294         vm_page_t m;
1295         int bit, field;
1296
1297         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1298         KASSERT((pa & L3_PAGE_MASK) == 0,
1299             ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
1300         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1301
1302         /*
1303          * Transfer the 2mpage's pv entry for this mapping to the first
1304          * page's pv list.  Once this transfer begins, the pv list lock
1305          * must not be released until the last pv entry is reinstantiated.
1306          */
1307         pvh = pa_to_pvh(pa);
1308         va = trunc_2mpage(va);
1309         pv = pmap_pvh_remove(pvh, pmap, va);
1310         KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
1311         m = PHYS_TO_VM_PAGE(pa);
1312         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1313
1314         m->md.pv_gen++;
1315         /* Instantiate the remaining NPTEPG - 1 pv entries. */
1316         PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
1317         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1318         for (;;) {
1319                 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1320                 KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0
1321                     , ("pmap_pv_demote_pde: missing spare"));
1322                 for (field = 0; field < _NPCM; field++) {
1323                         while (pc->pc_map[field]) {
1324                                 bit = cnttzd(pc->pc_map[field]);
1325                                 pc->pc_map[field] &= ~(1ul << bit);
1326                                 pv = &pc->pc_pventry[field * 64 + bit];
1327                                 va += PAGE_SIZE;
1328                                 pv->pv_va = va;
1329                                 m++;
1330                                 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1331                             ("pmap_pv_demote_pde: page %p is not managed", m));
1332                                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1333
1334                                 m->md.pv_gen++;
1335                                 if (va == va_last)
1336                                         goto out;
1337                         }
1338                 }
1339                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1340                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1341         }
1342 out:
1343         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1344                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1345                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1346         }
1347         PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
1348         PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
1349 }
1350
1351 static void
1352 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap)
1353 {
1354
1355         if (pmap == NULL)
1356                 return;
1357         pmap_invalidate_all(pmap);
1358         if (pmap != locked_pmap)
1359                 PMAP_UNLOCK(pmap);
1360 }
1361
1362 /*
1363  * We are in a serious low memory condition.  Resort to
1364  * drastic measures to free some pages so we can allocate
1365  * another pv entry chunk.
1366  *
1367  * Returns NULL if PV entries were reclaimed from the specified pmap.
1368  *
1369  * We do not, however, unmap 2mpages because subsequent accesses will
1370  * allocate per-page pv entries until repromotion occurs, thereby
1371  * exacerbating the shortage of free pv entries.
1372  */
1373 static int active_reclaims = 0;
1374 static vm_page_t
1375 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1376 {
1377         struct pv_chunk *pc, *pc_marker, *pc_marker_end;
1378         struct pv_chunk_header pc_marker_b, pc_marker_end_b;
1379         struct md_page *pvh;
1380         pml3_entry_t *l3e;
1381         pmap_t next_pmap, pmap;
1382         pt_entry_t *pte, tpte;
1383         pv_entry_t pv;
1384         vm_offset_t va;
1385         vm_page_t m, m_pc;
1386         struct spglist free;
1387         uint64_t inuse;
1388         int bit, field, freed;
1389
1390         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
1391         KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
1392         pmap = NULL;
1393         m_pc = NULL;
1394         SLIST_INIT(&free);
1395         bzero(&pc_marker_b, sizeof(pc_marker_b));
1396         bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
1397         pc_marker = (struct pv_chunk *)&pc_marker_b;
1398         pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
1399
1400         mtx_lock(&pv_chunks_mutex);
1401         active_reclaims++;
1402         TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
1403         TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
1404         while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
1405             SLIST_EMPTY(&free)) {
1406                 next_pmap = pc->pc_pmap;
1407                 if (next_pmap == NULL) {
1408                         /*
1409                          * The next chunk is a marker.  However, it is
1410                          * not our marker, so active_reclaims must be
1411                          * > 1.  Consequently, the next_chunk code
1412                          * will not rotate the pv_chunks list.
1413                          */
1414                         goto next_chunk;
1415                 }
1416                 mtx_unlock(&pv_chunks_mutex);
1417
1418                 /*
1419                  * A pv_chunk can only be removed from the pc_lru list
1420                  * when both pc_chunks_mutex is owned and the
1421                  * corresponding pmap is locked.
1422                  */
1423                 if (pmap != next_pmap) {
1424                         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1425                         pmap = next_pmap;
1426                         /* Avoid deadlock and lock recursion. */
1427                         if (pmap > locked_pmap) {
1428                                 RELEASE_PV_LIST_LOCK(lockp);
1429                                 PMAP_LOCK(pmap);
1430                                 mtx_lock(&pv_chunks_mutex);
1431                                 continue;
1432                         } else if (pmap != locked_pmap) {
1433                                 if (PMAP_TRYLOCK(pmap)) {
1434                                         mtx_lock(&pv_chunks_mutex);
1435                                         continue;
1436                                 } else {
1437                                         pmap = NULL; /* pmap is not locked */
1438                                         mtx_lock(&pv_chunks_mutex);
1439                                         pc = TAILQ_NEXT(pc_marker, pc_lru);
1440                                         if (pc == NULL ||
1441                                             pc->pc_pmap != next_pmap)
1442                                                 continue;
1443                                         goto next_chunk;
1444                                 }
1445                         }
1446                 }
1447
1448                 /*
1449                  * Destroy every non-wired, 4 KB page mapping in the chunk.
1450                  */
1451                 freed = 0;
1452                 for (field = 0; field < _NPCM; field++) {
1453                         for (inuse = ~pc->pc_map[field] & pc_freemask[field];
1454                             inuse != 0; inuse &= ~(1UL << bit)) {
1455                                 bit = cnttzd(inuse);
1456                                 pv = &pc->pc_pventry[field * 64 + bit];
1457                                 va = pv->pv_va;
1458                                 l3e = pmap_pml3e(pmap, va);
1459                                 if ((be64toh(*l3e) & RPTE_LEAF) != 0)
1460                                         continue;
1461                                 pte = pmap_l3e_to_pte(l3e, va);
1462                                 if ((be64toh(*pte) & PG_W) != 0)
1463                                         continue;
1464                                 tpte = be64toh(pte_load_clear(pte));
1465                                 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
1466                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
1467                                         vm_page_dirty(m);
1468                                 if ((tpte & PG_A) != 0)
1469                                         vm_page_aflag_set(m, PGA_REFERENCED);
1470                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1471                                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
1472
1473                                 m->md.pv_gen++;
1474                                 if (TAILQ_EMPTY(&m->md.pv_list) &&
1475                                     (m->flags & PG_FICTITIOUS) == 0) {
1476                                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
1477                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
1478                                                 vm_page_aflag_clear(m,
1479                                                     PGA_WRITEABLE);
1480                                         }
1481                                 }
1482                                 pc->pc_map[field] |= 1UL << bit;
1483                                 pmap_unuse_pt(pmap, va, be64toh(*l3e), &free);
1484                                 freed++;
1485                         }
1486                 }
1487                 if (freed == 0) {
1488                         mtx_lock(&pv_chunks_mutex);
1489                         goto next_chunk;
1490                 }
1491                 /* Every freed mapping is for a 4 KB page. */
1492                 pmap_resident_count_dec(pmap, freed);
1493                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
1494                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
1495                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
1496                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1497                 if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1) {
1498                         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1499                         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1500                         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1501                         /* Entire chunk is free; return it. */
1502                         m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1503                         dump_drop_page(m_pc->phys_addr);
1504                         mtx_lock(&pv_chunks_mutex);
1505                         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1506                         break;
1507                 }
1508                 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1509                 mtx_lock(&pv_chunks_mutex);
1510                 /* One freed pv entry in locked_pmap is sufficient. */
1511                 if (pmap == locked_pmap)
1512                         break;
1513 next_chunk:
1514                 TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1515                 TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
1516                 if (active_reclaims == 1 && pmap != NULL) {
1517                         /*
1518                          * Rotate the pv chunks list so that we do not
1519                          * scan the same pv chunks that could not be
1520                          * freed (because they contained a wired
1521                          * and/or superpage mapping) on every
1522                          * invocation of reclaim_pv_chunk().
1523                          */
1524                         while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
1525                                 MPASS(pc->pc_pmap != NULL);
1526                                 TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1527                                 TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1528                         }
1529                 }
1530         }
1531         TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
1532         TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
1533         active_reclaims--;
1534         mtx_unlock(&pv_chunks_mutex);
1535         reclaim_pv_chunk_leave_pmap(pmap, locked_pmap);
1536         if (m_pc == NULL && !SLIST_EMPTY(&free)) {
1537                 m_pc = SLIST_FIRST(&free);
1538                 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
1539                 /* Recycle a freed page table page. */
1540                 m_pc->ref_count = 1;
1541         }
1542         vm_page_free_pages_toq(&free, true);
1543         return (m_pc);
1544 }
1545
1546 /*
1547  * free the pv_entry back to the free list
1548  */
1549 static void
1550 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1551 {
1552         struct pv_chunk *pc;
1553         int idx, field, bit;
1554
1555 #ifdef VERBOSE_PV
1556         if (pmap != kernel_pmap)
1557                 printf("%s(%p, %p)\n", __func__, pmap, pv);
1558 #endif
1559         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1560         PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1561         PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1562         PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1563         pc = pv_to_chunk(pv);
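        /*
         * Recover the entry's position inside the chunk's 64-bit bitmap
         * words; e.g. idx == 70 decomposes into field 1, bit 6, and that
         * bit of pc_map[1] is set below to mark the entry free again.
         */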
1564         idx = pv - &pc->pc_pventry[0];
1565         field = idx / 64;
1566         bit = idx % 64;
1567         pc->pc_map[field] |= 1ul << bit;
1568         if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1) {
1569                 /* 98% of the time, pc is already at the head of the list. */
1570                 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1571                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1572                         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1573                 }
1574                 return;
1575         }
1576         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1577         free_pv_chunk(pc);
1578 }
1579
1580 static void
1581 free_pv_chunk(struct pv_chunk *pc)
1582 {
1583         vm_page_t m;
1584
1585         mtx_lock(&pv_chunks_mutex);
1586         TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1587         mtx_unlock(&pv_chunks_mutex);
1588         PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1589         PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1590         PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1591         /* entire chunk is free, return it */
1592         m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1593         dump_drop_page(m->phys_addr);
1594         vm_page_unwire_noq(m);
1595         vm_page_free(m);
1596 }
1597
1598 /*
1599  * Returns a new PV entry, allocating a new PV chunk from the system when
1600  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1601  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1602  * returned.
1603  *
1604  * The given PV list lock may be released.
1605  */
1606 static pv_entry_t
1607 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1608 {
1609         int bit, field;
1610         pv_entry_t pv;
1611         struct pv_chunk *pc;
1612         vm_page_t m;
1613
1614         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1615         PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1616 retry:
1617         pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1618         if (pc != NULL) {
1619                 for (field = 0; field < _NPCM; field++) {
1620                         if (pc->pc_map[field]) {
1621                                 bit = cnttzd(pc->pc_map[field]);
1622                                 break;
1623                         }
1624                 }
1625                 if (field < _NPCM) {
1626                         pv = &pc->pc_pventry[field * 64 + bit];
1627                         pc->pc_map[field] &= ~(1ul << bit);
1628                         /* If this was the last free entry, move the chunk to the tail. */
1629                         if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0) {
1630                                 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1631                                 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1632                                     pc_list);
1633                         }
1634                         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1635                         PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1636                         MPASS(PV_PMAP(pv) != NULL);
1637                         return (pv);
1638                 }
1639         }
1640         /* No free items, allocate another chunk */
1641         m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1642         if (m == NULL) {
1643                 if (lockp == NULL) {
1644                         PV_STAT(pc_chunk_tryfail++);
1645                         return (NULL);
1646                 }
1647                 m = reclaim_pv_chunk(pmap, lockp);
1648                 if (m == NULL)
1649                         goto retry;
1650         }
1651         PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1652         PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1653         dump_add_page(m->phys_addr);
1654         pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1655         pc->pc_pmap = pmap;
1656         pc->pc_map[0] = PC_FREE0 & ~1ul;        /* preallocated bit 0 */
1657         pc->pc_map[1] = PC_FREE1;
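        /*
         * Bit 0 of pc_map[0] is left clear because the chunk's first entry
         * (pc_pventry[0]) is handed straight back to the caller below; the
         * remaining _NPCPV - 1 entries are accounted as spare.
         */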
1658         mtx_lock(&pv_chunks_mutex);
1659         TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1660         mtx_unlock(&pv_chunks_mutex);
1661         pv = &pc->pc_pventry[0];
1662         TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1663         PV_STAT(atomic_add_long(&pv_entry_count, 1));
1664         PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1665         MPASS(PV_PMAP(pv) != NULL);
1666         return (pv);
1667 }
1668
1669 #if VM_NRESERVLEVEL > 0
1670 /*
1671  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
1672  * replace the many pv entries for the 4KB page mappings by a single pv entry
1673  * for the 2MB page mapping.
1674  */
1675 static void
1676 pmap_pv_promote_l3e(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1677     struct rwlock **lockp)
1678 {
1679         struct md_page *pvh;
1680         pv_entry_t pv;
1681         vm_offset_t va_last;
1682         vm_page_t m;
1683
1684         KASSERT((pa & L3_PAGE_MASK) == 0,
1685             ("pmap_pv_promote_l3e: pa is not 2mpage aligned"));
1686         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1687
1688         /*
1689          * Transfer the first page's pv entry for this mapping to the 2mpage's
1690          * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
1691          * a transfer avoids the possibility that get_pv_entry() calls
1692          * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
1693          * mappings that is being promoted.
1694          */
1695         m = PHYS_TO_VM_PAGE(pa);
1696         va = trunc_2mpage(va);
1697         pv = pmap_pvh_remove(&m->md, pmap, va);
1698         KASSERT(pv != NULL, ("pmap_pv_promote_l3e: pv not found"));
1699         pvh = pa_to_pvh(pa);
1700         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
1701         pvh->pv_gen++;
1702         /* Free the remaining NPTEPG - 1 pv entries. */
1703         va_last = va + L3_PAGE_SIZE - PAGE_SIZE;
1704         do {
1705                 m++;
1706                 va += PAGE_SIZE;
1707                 pmap_pvh_free(&m->md, pmap, va);
1708         } while (va < va_last);
1709 }
1710 #endif /* VM_NRESERVLEVEL > 0 */
1711
1712 /*
1713  * First find and then destroy the pv entry for the specified pmap and virtual
1714  * address.  This operation can be performed on pv lists for either 4KB or 2MB
1715  * page mappings.
1716  */
1717 static void
1718 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1719 {
1720         pv_entry_t pv;
1721
1722         pv = pmap_pvh_remove(pvh, pmap, va);
1723         KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
1724         free_pv_entry(pmap, pv);
1725 }
1726
1727 /*
1728  * Conditionally create the PV entry for a 4KB page mapping if the required
1729  * memory can be allocated without resorting to reclamation.
1730  */
1731 static boolean_t
1732 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1733     struct rwlock **lockp)
1734 {
1735         pv_entry_t pv;
1736
1737         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1738         /* Pass NULL instead of the lock pointer to disable reclamation. */
1739         if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1740                 pv->pv_va = va;
1741                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1742                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
1743                 m->md.pv_gen++;
1744                 return (TRUE);
1745         } else
1746                 return (FALSE);
1747 }
1748
1749 vm_paddr_t phys_avail_debug[2 * VM_PHYSSEG_MAX];
1750 #ifdef INVARIANTS
1751 static void
1752 validate_addr(vm_paddr_t addr, vm_size_t size)
1753 {
1754         vm_paddr_t end = addr + size;
1755         bool found = false;
1756
1757         for (int i = 0; i < 2 * phys_avail_count; i += 2) {
1758                 if (addr >= phys_avail_debug[i] &&
1759                         end <= phys_avail_debug[i + 1]) {
1760                         found = true;
1761                         break;
1762                 }
1763         }
1764         KASSERT(found, ("%#lx-%#lx outside of initial phys_avail array",
1765                                         addr, end));
1766 }
1767 #else
1768 static void validate_addr(vm_paddr_t addr, vm_size_t size) {}
1769 #endif
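/*
 * PTE bits used for every direct-map (DMAP) mapping: a valid leaf entry with
 * the full EAA permission mask and the referenced/changed bits (PG_A/PG_M)
 * preset, so kernel accesses through the DMAP do not fault to set them.
 */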
1770 #define DMAP_PAGE_BITS (RPTE_VALID | RPTE_LEAF | RPTE_EAA_MASK | PG_M | PG_A)
1771
1772 static vm_paddr_t
1773 alloc_pt_page(void)
1774 {
1775         vm_paddr_t page;
1776
1777         page = allocpages(1);
1778         pagezero(PHYS_TO_DMAP(page));
1779         return (page);
1780 }
1781
1782 static void
1783 mmu_radix_dmap_range(vm_paddr_t start, vm_paddr_t end)
1784 {
1785         pt_entry_t *pte, pteval;
1786         vm_paddr_t page;
1787
1788         if (bootverbose)
1789                 printf("%s %lx -> %lx\n", __func__, start, end);
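        /*
         * Map [start, end) with the largest page size that fits: a 1GB (L2)
         * leaf when start is 1GB aligned and at least 1GB remains, else a
         * 2MB (L3) leaf under the same test, else a 4KB PTE, allocating
         * intermediate page-table pages on demand.  For example, a
         * 1GB-aligned region of 2GB + 3MB would be covered by two 1GB
         * leaves, one 2MB leaf, and 256 4KB PTEs.
         */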
1790         while (start < end) {
1791                 pteval = start | DMAP_PAGE_BITS;
1792                 pte = pmap_pml1e(kernel_pmap, PHYS_TO_DMAP(start));
1793                 if ((be64toh(*pte) & RPTE_VALID) == 0) {
1794                         page = alloc_pt_page();
1795                         pde_store(pte, page);
1796                 }
1797                 pte = pmap_l1e_to_l2e(pte, PHYS_TO_DMAP(start));
1798                 if ((start & L2_PAGE_MASK) == 0 &&
1799                         end - start >= L2_PAGE_SIZE) {
1800                         start += L2_PAGE_SIZE;
1801                         goto done;
1802                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1803                         page = alloc_pt_page();
1804                         pde_store(pte, page);
1805                 }
1806
1807                 pte = pmap_l2e_to_l3e(pte, PHYS_TO_DMAP(start));
1808                 if ((start & L3_PAGE_MASK) == 0 &&
1809                         end - start >= L3_PAGE_SIZE) {
1810                         start += L3_PAGE_SIZE;
1811                         goto done;
1812                 } else if ((be64toh(*pte) & RPTE_VALID) == 0) {
1813                         page = alloc_pt_page();
1814                         pde_store(pte, page);
1815                 }
1816                 pte = pmap_l3e_to_pte(pte, PHYS_TO_DMAP(start));
1817                 start += PAGE_SIZE;
1818         done:
1819                 pte_store(pte, pteval);
1820         }
1821 }
1822
1823 static void
1824 mmu_radix_dmap_populate(vm_size_t hwphyssz)
1825 {
1826         vm_paddr_t start, end;
1827
1828         for (int i = 0; i < pregions_sz; i++) {
1829                 start = pregions[i].mr_start;
1830                 end = start + pregions[i].mr_size;
1831                 if (hwphyssz && start >= hwphyssz)
1832                         break;
1833                 if (hwphyssz && hwphyssz < end)
1834                         end = hwphyssz;
1835                 mmu_radix_dmap_range(start, end);
1836         }
1837 }
1838
1839 static void
1840 mmu_radix_setup_pagetables(vm_size_t hwphyssz)
1841 {
1842         vm_paddr_t ptpages, pages;
1843         pt_entry_t *pte;
1844         vm_paddr_t l1phys;
1845
1846         bzero(kernel_pmap, sizeof(struct pmap));
1847         PMAP_LOCK_INIT(kernel_pmap);
1848
1849         ptpages = allocpages(3);
1850         l1phys = moea64_bootstrap_alloc(RADIX_PGD_SIZE, RADIX_PGD_SIZE);
1851         validate_addr(l1phys, RADIX_PGD_SIZE);
1852         if (bootverbose)
1853                 printf("l1phys=%lx\n", l1phys);
1854         MPASS((l1phys & (RADIX_PGD_SIZE-1)) == 0);
1855         for (int i = 0; i < RADIX_PGD_SIZE/PAGE_SIZE; i++)
1856                 pagezero(PHYS_TO_DMAP(l1phys + i * PAGE_SIZE));
1857         kernel_pmap->pm_pml1 = (pml1_entry_t *)PHYS_TO_DMAP(l1phys);
1858
1859         mmu_radix_dmap_populate(hwphyssz);
1860
1861         /*
1862          * Create page tables for first 128MB of KVA
1863          */
1864         pages = ptpages;
1865         pte = pmap_pml1e(kernel_pmap, VM_MIN_KERNEL_ADDRESS);
1866         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1867         pages += PAGE_SIZE;
1868         pte = pmap_l1e_to_l2e(pte, VM_MIN_KERNEL_ADDRESS);
1869         *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
1870         pages += PAGE_SIZE;
1871         pte = pmap_l2e_to_l3e(pte, VM_MIN_KERNEL_ADDRESS);
1872         /*
1873          * The kernel page table pages need to be preserved in
1874          * phys_avail and must not overlap with previous allocations.
1875          */
1876         pages = allocpages(nkpt);
1877         if (bootverbose) {
1878                 printf("phys_avail after dmap populate and nkpt allocation\n");
1879                 for (int j = 0; j < 2 * phys_avail_count; j+=2)
1880                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
1881                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
1882         }
1883         KPTphys = pages;
1884         for (int i = 0; i < nkpt; i++, pte++, pages += PAGE_SIZE)
1885                 *pte = htobe64(pages | RPTE_VALID | RPTE_SHIFT);
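        /*
         * Each preallocated page-table page maps one L3_PAGE_SIZE (2MB)
         * slice of KVA, so the initial kernel_vm_end set below lies
         * nkpt * 2MB past VM_MIN_KERNEL_ADDRESS (the "128MB" noted above
         * corresponds to nkpt == 64).
         */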
1886         kernel_vm_end = VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE;
1887         if (bootverbose)
1888                 printf("kernel_pmap pml1 %p\n", kernel_pmap->pm_pml1);
1889         /*
1890          * Add a physical memory segment (vm_phys_seg) corresponding to the
1891          * preallocated kernel page table pages so that vm_page structures
1892          * representing these pages will be created.  The vm_page structures
1893          * are required for promotion of the corresponding kernel virtual
1894          * addresses to superpage mappings.
1895          */
1896         vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
1897 }
1898
1899 static void
1900 mmu_radix_early_bootstrap(vm_offset_t start, vm_offset_t end)
1901 {
1902         vm_paddr_t      kpstart, kpend;
1903         vm_size_t       physsz, hwphyssz;
1904         //uint64_t      l2virt;
1905         int             rm_pavail, proctab_size;
1906         int             i, j;
1907
1908         kpstart = start & ~DMAP_BASE_ADDRESS;
1909         kpend = end & ~DMAP_BASE_ADDRESS;
1910
1911         /* Get physical memory regions from firmware */
1912         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
1913         CTR0(KTR_PMAP, "mmu_radix_early_bootstrap: physical memory");
1914
1915         if (2 * VM_PHYSSEG_MAX < regions_sz)
1916                 panic("mmu_radix_early_bootstrap: phys_avail too small");
1917
1918         if (bootverbose)
1919                 for (int i = 0; i < regions_sz; i++)
1920                         printf("regions[%d].mr_start=%lx regions[%d].mr_size=%lx\n",
1921                             i, regions[i].mr_start, i, regions[i].mr_size);
1922         /*
1923          * XXX workaround a simulator bug
1924          */
1925         for (int i = 0; i < regions_sz; i++)
1926                 if (regions[i].mr_start & PAGE_MASK) {
1927                         regions[i].mr_start += PAGE_MASK;
1928                         regions[i].mr_start &= ~PAGE_MASK;
1929                         regions[i].mr_size &= ~PAGE_MASK;
1930                 }
1931         if (bootverbose)
1932                 for (int i = 0; i < pregions_sz; i++)
1933                         printf("pregions[%d].mr_start=%lx pregions[%d].mr_size=%lx\n",
1934                             i, pregions[i].mr_start, i, pregions[i].mr_size);
1935
1936         phys_avail_count = 0;
1937         physsz = 0;
1938         hwphyssz = 0;
1939         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
1940         for (i = 0, j = 0; i < regions_sz; i++) {
1941                 if (bootverbose)
1942                         printf("regions[%d].mr_start=%016lx regions[%d].mr_size=%016lx\n",
1943                             i, regions[i].mr_start, i, regions[i].mr_size);
1944
1945                 if (regions[i].mr_size < PAGE_SIZE)
1946                         continue;
1947
1948                 if (hwphyssz != 0 &&
1949                     (physsz + regions[i].mr_size) >= hwphyssz) {
1950                         if (physsz < hwphyssz) {
1951                                 phys_avail[j] = regions[i].mr_start;
1952                                 phys_avail[j + 1] = regions[i].mr_start +
1953                                     (hwphyssz - physsz);
1954                                 physsz = hwphyssz;
1955                                 phys_avail_count++;
1956                                 dump_avail[j] = phys_avail[j];
1957                                 dump_avail[j + 1] = phys_avail[j + 1];
1958                         }
1959                         break;
1960                 }
1961                 phys_avail[j] = regions[i].mr_start;
1962                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
1963                 dump_avail[j] = phys_avail[j];
1964                 dump_avail[j + 1] = phys_avail[j + 1];
1965
1966                 phys_avail_count++;
1967                 physsz += regions[i].mr_size;
1968                 j += 2;
1969         }
1970
1971         /* Check for overlap with the kernel and exception vectors */
1972         rm_pavail = 0;
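        /*
         * Each phys_avail range starting below the exception vectors is
         * first pushed up past EXC_LAST.  Then three kernel-overlap cases
         * are handled: a range entirely inside [kpstart, kpend) is marked
         * with ~0 so it can be dropped after sorting; a range containing
         * kpstart is truncated at the kernel, with any tail beyond kpend
         * preserved as a new range; and a range containing kpend has its
         * start bumped past the kernel, with any head before kpstart
         * preserved as a new range.
         */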
1973         for (j = 0; j < 2 * phys_avail_count; j+=2) {
1974                 if (phys_avail[j] < EXC_LAST)
1975                         phys_avail[j] += EXC_LAST;
1976
1977                 if (phys_avail[j] >= kpstart &&
1978                     phys_avail[j + 1] <= kpend) {
1979                         phys_avail[j] = phys_avail[j + 1] = ~0;
1980                         rm_pavail++;
1981                         continue;
1982                 }
1983
1984                 if (kpstart >= phys_avail[j] &&
1985                     kpstart < phys_avail[j + 1]) {
1986                         if (kpend < phys_avail[j + 1]) {
1987                                 phys_avail[2 * phys_avail_count] =
1988                                     (kpend & ~PAGE_MASK) + PAGE_SIZE;
1989                                 phys_avail[2 * phys_avail_count + 1] =
1990                                     phys_avail[j + 1];
1991                                 phys_avail_count++;
1992                         }
1993
1994                         phys_avail[j + 1] = kpstart & ~PAGE_MASK;
1995                 }
1996
1997                 if (kpend >= phys_avail[j] &&
1998                     kpend < phys_avail[j + 1]) {
1999                         if (kpstart > phys_avail[j]) {
2000                                 phys_avail[2 * phys_avail_count] = phys_avail[j];
2001                                 phys_avail[2 * phys_avail_count + 1] =
2002                                     kpstart & ~PAGE_MASK;
2003                                 phys_avail_count++;
2004                         }
2005
2006                         phys_avail[j] = (kpend & ~PAGE_MASK) +
2007                             PAGE_SIZE;
2008                 }
2009         }
2010         qsort(phys_avail, 2 * phys_avail_count, sizeof(phys_avail[0]), pa_cmp);
2011         for (i = 0; i < 2 * phys_avail_count; i++)
2012                 phys_avail_debug[i] = phys_avail[i];
2013
2014         /* Remove phys_avail ranges marked for removal (~0). */
2015         if (rm_pavail) {
2016                 phys_avail_count -= rm_pavail;
2017                 for (i = 2 * phys_avail_count;
2018                      i < 2*(phys_avail_count + rm_pavail); i+=2)
2019                         phys_avail[i] = phys_avail[i + 1] = 0;
2020         }
2021         if (bootverbose) {
2022                 printf("phys_avail ranges after filtering:\n");
2023                 for (j = 0; j < 2 * phys_avail_count; j+=2)
2024                         printf("phys_avail[%d]=%08lx - phys_avail[%d]=%08lx\n",
2025                                    j, phys_avail[j], j + 1, phys_avail[j + 1]);
2026         }
2027         physmem = btoc(physsz);
2028
2029         /*
2030          * XXX: assume we are running non-virtualized; BHYVE is not supported.
2031          */
2032         if (isa3_pid_bits == 0)
2033                 isa3_pid_bits = 20;
2034         if (powernv_enabled) {
2035                 parttab_phys =
2036                     moea64_bootstrap_alloc(PARTTAB_SIZE, PARTTAB_SIZE);
2037                 validate_addr(parttab_phys, PARTTAB_SIZE);
2038                 for (int i = 0; i < PARTTAB_SIZE/PAGE_SIZE; i++)
2039                         pagezero(PHYS_TO_DMAP(parttab_phys + i * PAGE_SIZE));
2040
2041         }
2042         proctab_size = 1UL << PROCTAB_SIZE_SHIFT;
2043         proctab0pa = moea64_bootstrap_alloc(proctab_size, proctab_size);
2044         validate_addr(proctab0pa, proctab_size);
2045         for (int i = 0; i < proctab_size/PAGE_SIZE; i++)
2046                 pagezero(PHYS_TO_DMAP(proctab0pa + i * PAGE_SIZE));
2047
2048         mmu_radix_setup_pagetables(hwphyssz);
2049 }
2050
2051 static void
2052 mmu_radix_late_bootstrap(vm_offset_t start, vm_offset_t end)
2053 {
2054         int             i;
2055         vm_paddr_t      pa;
2056         void            *dpcpu;
2057         vm_offset_t va;
2058
2059         /*
2060          * Set up the Open Firmware pmap and add its mappings if not in real
2061          * mode.
2062          */
2063         if (bootverbose)
2064                 printf("%s enter\n", __func__);
2065
2066         /*
2067          * Calculate the last available physical address, and reserve the
2068          * vm_page_array (upper bound).
2069          */
2070         Maxmem = 0;
2071         for (i = 0; phys_avail[i + 1] != 0; i += 2)
2072                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
2073
2074         /*
2075          * Remap any early IO mappings (console framebuffer, etc.)
2076          */
2077         bs_remap_earlyboot();
2078
2079         /*
2080          * Allocate a kernel stack with a guard page for thread0 and map it
2081          * into the kernel page map.
2082          */
2083         pa = allocpages(kstack_pages);
2084         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
2085         virtual_avail = va + kstack_pages * PAGE_SIZE;
2086         CTR2(KTR_PMAP, "mmu_radix_late_bootstrap: kstack0 at %#lx (%#lx)", pa, va);
2087         thread0.td_kstack = va;
2088         for (i = 0; i < kstack_pages; i++) {
2089                 mmu_radix_kenter(va, pa);
2090                 pa += PAGE_SIZE;
2091                 va += PAGE_SIZE;
2092         }
2093         thread0.td_kstack_pages = kstack_pages;
2094
2095         /*
2096          * Allocate virtual address space for the message buffer.
2097          */
2098         pa = msgbuf_phys = allocpages((msgbufsize + PAGE_MASK)  >> PAGE_SHIFT);
2099         msgbufp = (struct msgbuf *)PHYS_TO_DMAP(pa);
2100
2101         /*
2102          * Allocate virtual address space for the dynamic percpu area.
2103          */
2104         pa = allocpages(DPCPU_SIZE >> PAGE_SHIFT);
2105         dpcpu = (void *)PHYS_TO_DMAP(pa);
2106         dpcpu_init(dpcpu, curcpu);
2107
2108         crashdumpmap = (caddr_t)virtual_avail;
2109         virtual_avail += MAXDUMPPGS * PAGE_SIZE;
2110
2111         /*
2112          * Reserve some special page table entries/VA space for temporary
2113          * mapping of pages.
2114          */
2115 }
2116
2117 static void
2118 mmu_parttab_init(void)
2119 {
2120         uint64_t ptcr;
2121
2122         isa3_parttab = (struct pate *)PHYS_TO_DMAP(parttab_phys);
2123
2124         if (bootverbose)
2125                 printf("%s parttab: %p\n", __func__, isa3_parttab);
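        /*
         * The PTCR pairs the real address of the partition table with its
         * size, which the low-order bits encode as log2(size in bytes) - 12;
         * hence the PARTTAB_SIZE_SHIFT - 12 term below.
         */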
2126         ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2127         if (bootverbose)
2128                 printf("setting ptcr %lx\n", ptcr);
2129         mtspr(SPR_PTCR, ptcr);
2130 }
2131
2132 static void
2133 mmu_parttab_update(uint64_t lpid, uint64_t pagetab, uint64_t proctab)
2134 {
2135         uint64_t prev;
2136
2137         if (bootverbose)
2138                 printf("%s isa3_parttab %p lpid %lx pagetab %lx proctab %lx\n", __func__, isa3_parttab,
2139                            lpid, pagetab, proctab);
2140         prev = be64toh(isa3_parttab[lpid].pagetab);
2141         isa3_parttab[lpid].pagetab = htobe64(pagetab);
2142         isa3_parttab[lpid].proctab = htobe64(proctab);
2143
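        /*
         * Flush any cached copies of the old entry.  The tlbie forms below
         * use RIC = 2 (invalidate all): a previous radix (PARTTAB_HR) entry
         * is flushed with both the partition- and process-scoped (PRS)
         * variants, while a previous HPT entry only needs the
         * partition-scoped form.
         */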
2144         if (prev & PARTTAB_HR) {
2145                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
2146                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2147                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2148                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2149         } else {
2150                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
2151                              "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
2152         }
2153         ttusync();
2154 }
2155
2156 static void
2157 mmu_radix_parttab_init(void)
2158 {
2159         uint64_t pagetab;
2160
2161         mmu_parttab_init();
2162         pagetab = RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2163             RADIX_PGD_INDEX_SHIFT | PARTTAB_HR;
2164         mmu_parttab_update(0, pagetab, 0);
2165 }
2166
2167 static void
2168 mmu_radix_proctab_register(vm_paddr_t proctabpa, uint64_t table_size)
2169 {
2170         uint64_t pagetab, proctab;
2171
2172         pagetab = be64toh(isa3_parttab[0].pagetab);
2173         proctab = proctabpa | table_size | PARTTAB_GR;
2174         mmu_parttab_update(0, pagetab, proctab);
2175 }
2176
2177 static void
2178 mmu_radix_proctab_init(void)
2179 {
2180
2181         isa3_base_pid = 1;
2182
2183         isa3_proctab = (void*)PHYS_TO_DMAP(proctab0pa);
2184         isa3_proctab->proctab0 =
2185             htobe64(RTS_SIZE | DMAP_TO_PHYS((vm_offset_t)kernel_pmap->pm_pml1) |
2186                 RADIX_PGD_INDEX_SHIFT);
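        /*
         * Entry 0 describes the kernel's radix tree: the RTS field encodes
         * the virtual address range, the physical address selects the root
         * (PML1) page directory, and RADIX_PGD_INDEX_SHIFT gives the size
         * of that root level.
         */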
2187
2188         if (powernv_enabled) {
2189                 mmu_radix_proctab_register(proctab0pa, PROCTAB_SIZE_SHIFT - 12);
2190                 __asm __volatile("ptesync" : : : "memory");
2191                 __asm __volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
2192                              "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
2193                 __asm __volatile("eieio; tlbsync; ptesync" : : : "memory");
2194 #ifdef PSERIES
2195         } else {
2196                 int64_t rc;
2197
2198                 rc = phyp_hcall(H_REGISTER_PROC_TBL,
2199                     PROC_TABLE_NEW | PROC_TABLE_RADIX | PROC_TABLE_GTSE,
2200                     proctab0pa, 0, PROCTAB_SIZE_SHIFT - 12);
2201                 if (rc != H_SUCCESS)
2202                         panic("mmu_radix_proctab_init: "
2203                                 "failed to register process table: rc=%jd",
2204                                 (intmax_t)rc);
2205 #endif
2206         }
2207
2208         if (bootverbose)
2209                 printf("process table %p and kernel radix PDE: %p\n",
2210                            isa3_proctab, kernel_pmap->pm_pml1);
2211         mtmsr(mfmsr() | PSL_DR);
2212         mtmsr(mfmsr() & ~PSL_DR);
2213         kernel_pmap->pm_pid = isa3_base_pid;
2214         isa3_base_pid++;
2215 }
2216
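/*
 * Apply MADV_DONTNEED or MADV_FREE advice to the given range: clear the
 * referenced and modified bits on resident, managed mappings (demoting 2MB
 * mappings as needed) so the underlying pages become better candidates for
 * reclamation.  For MADV_DONTNEED, a writable, modified page is dirtied
 * first so that later pmap_is_modified() calls can be avoided.
 */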
2217 void
2218 mmu_radix_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
2219     int advice)
2220 {
2221         struct rwlock *lock;
2222         pml1_entry_t *l1e;
2223         pml2_entry_t *l2e;
2224         pml3_entry_t oldl3e, *l3e;
2225         pt_entry_t *pte;
2226         vm_offset_t va, va_next;
2227         vm_page_t m;
2228         bool anychanged;
2229
2230         if (advice != MADV_DONTNEED && advice != MADV_FREE)
2231                 return;
2232         anychanged = false;
2233         PMAP_LOCK(pmap);
2234         for (; sva < eva; sva = va_next) {
2235                 l1e = pmap_pml1e(pmap, sva);
2236                 if ((be64toh(*l1e) & PG_V) == 0) {
2237                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2238                         if (va_next < sva)
2239                                 va_next = eva;
2240                         continue;
2241                 }
2242                 l2e = pmap_l1e_to_l2e(l1e, sva);
2243                 if ((be64toh(*l2e) & PG_V) == 0) {
2244                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2245                         if (va_next < sva)
2246                                 va_next = eva;
2247                         continue;
2248                 }
2249                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2250                 if (va_next < sva)
2251                         va_next = eva;
2252                 l3e = pmap_l2e_to_l3e(l2e, sva);
2253                 oldl3e = be64toh(*l3e);
2254                 if ((oldl3e & PG_V) == 0)
2255                         continue;
2256                 else if ((oldl3e & RPTE_LEAF) != 0) {
2257                         if ((oldl3e & PG_MANAGED) == 0)
2258                                 continue;
2259                         lock = NULL;
2260                         if (!pmap_demote_l3e_locked(pmap, l3e, sva, &lock)) {
2261                                 if (lock != NULL)
2262                                         rw_wunlock(lock);
2263
2264                                 /*
2265                                  * The large page mapping was destroyed.
2266                                  */
2267                                 continue;
2268                         }
2269
2270                         /*
2271                          * Unless the page mappings are wired, remove the
2272                          * mapping to a single page so that a subsequent
2273                          * access may repromote.  Choosing the last page
2274                          * within the address range [sva, min(va_next, eva))
2275                          * generally results in more repromotions.  Since the
2276                          * underlying page table page is fully populated, this
2277                          * removal never frees a page table page.
2278                          */
2279                         if ((oldl3e & PG_W) == 0) {
2280                                 va = eva;
2281                                 if (va > va_next)
2282                                         va = va_next;
2283                                 va -= PAGE_SIZE;
2284                                 KASSERT(va >= sva,
2285                                     ("mmu_radix_advise: no address gap"));
2286                                 pte = pmap_l3e_to_pte(l3e, va);
2287                                 KASSERT((be64toh(*pte) & PG_V) != 0,
2288                                     ("mmu_radix_advise: invalid PTE"));
2289                                 pmap_remove_pte(pmap, pte, va, be64toh(*l3e), NULL,
2290                                     &lock);
2291                                 anychanged = true;
2292                         }
2293                         if (lock != NULL)
2294                                 rw_wunlock(lock);
2295                 }
2296                 if (va_next > eva)
2297                         va_next = eva;
2298                 va = va_next;
2299                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next;
2300                          pte++, sva += PAGE_SIZE) {
2301                         MPASS(pte == pmap_pte(pmap, sva));
2302
2303                         if ((be64toh(*pte) & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
2304                                 goto maybe_invlrng;
2305                         else if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2306                                 if (advice == MADV_DONTNEED) {
2307                                         /*
2308                                          * Future calls to pmap_is_modified()
2309                                          * can be avoided by making the page
2310                                          * dirty now.
2311                                          */
2312                                         m = PHYS_TO_VM_PAGE(be64toh(*pte) & PG_FRAME);
2313                                         vm_page_dirty(m);
2314                                 }
2315                                 atomic_clear_long(pte, htobe64(PG_M | PG_A));
2316                         } else if ((be64toh(*pte) & PG_A) != 0)
2317                                 atomic_clear_long(pte, htobe64(PG_A));
2318                         else
2319                                 goto maybe_invlrng;
2320                         anychanged = true;
2321                         continue;
2322 maybe_invlrng:
2323                         if (va != va_next) {
2324                                 anychanged = true;
2325                                 va = va_next;
2326                         }
2327                 }
2328                 if (va != va_next)
2329                         anychanged = true;
2330         }
2331         if (anychanged)
2332                 pmap_invalidate_all(pmap);
2333         PMAP_UNLOCK(pmap);
2334 }
2335
2336 /*
2337  * Routines used in machine-dependent code
2338  */
2339 static void
2340 mmu_radix_bootstrap(vm_offset_t start, vm_offset_t end)
2341 {
2342         uint64_t lpcr;
2343
2344         if (bootverbose)
2345                 printf("%s\n", __func__);
2346         hw_direct_map = 1;
2347         powernv_enabled = (mfmsr() & PSL_HV) ? 1 : 0;
2348         mmu_radix_early_bootstrap(start, end);
2349         if (bootverbose)
2350                 printf("early bootstrap complete\n");
2351         if (powernv_enabled) {
2352                 lpcr = mfspr(SPR_LPCR);
2353                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2354                 mmu_radix_parttab_init();
2355                 mmu_radix_init_amor();
2356                 if (bootverbose)
2357                         printf("powernv init complete\n");
2358         }
2359         mmu_radix_init_iamr();
2360         mmu_radix_proctab_init();
2361         mmu_radix_pid_set(kernel_pmap);
2362         if (powernv_enabled)
2363                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2364         else
2365                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2366
2367         mmu_radix_late_bootstrap(start, end);
2368         numa_mem_regions(&numa_pregions, &numa_pregions_sz);
2369         if (bootverbose)
2370                 printf("%s done\n", __func__);
2371         pmap_bootstrapped = 1;
2372         dmaplimit = roundup2(powerpc_ptob(Maxmem), L2_PAGE_SIZE);
2373         PCPU_SET(flags, PCPU_GET(flags) | PC_FLAG_NOSRS);
2374 }
2375
2376 static void
2377 mmu_radix_cpu_bootstrap(int ap)
2378 {
2379         uint64_t lpcr;
2380         uint64_t ptcr;
2381
2382         if (powernv_enabled) {
2383                 lpcr = mfspr(SPR_LPCR);
2384                 mtspr(SPR_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
2385
2386                 ptcr = parttab_phys | (PARTTAB_SIZE_SHIFT-12);
2387                 mtspr(SPR_PTCR, ptcr);
2388                 mmu_radix_init_amor();
2389         }
2390         mmu_radix_init_iamr();
2391         mmu_radix_pid_set(kernel_pmap);
2392         if (powernv_enabled)
2393                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_GLOBAL);
2394         else
2395                 mmu_radix_tlbiel_flush(TLB_INVAL_SCOPE_LPID);
2396 }
2397
2398 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3e, CTLFLAG_RD, 0,
2399     "2MB page mapping counters");
2400
2401 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_demotions);
2402 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, demotions, CTLFLAG_RD,
2403     &pmap_l3e_demotions, "2MB page demotions");
2404
2405 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_mappings);
2406 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, mappings, CTLFLAG_RD,
2407     &pmap_l3e_mappings, "2MB page mappings");
2408
2409 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_p_failures);
2410 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, p_failures, CTLFLAG_RD,
2411     &pmap_l3e_p_failures, "2MB page promotion failures");
2412
2413 static COUNTER_U64_DEFINE_EARLY(pmap_l3e_promotions);
2414 SYSCTL_COUNTER_U64(_vm_pmap_l3e, OID_AUTO, promotions, CTLFLAG_RD,
2415     &pmap_l3e_promotions, "2MB page promotions");
2416
2417 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2e, CTLFLAG_RD, 0,
2418     "1GB page mapping counters");
2419
2420 static COUNTER_U64_DEFINE_EARLY(pmap_l2e_demotions);
2421 SYSCTL_COUNTER_U64(_vm_pmap_l2e, OID_AUTO, demotions, CTLFLAG_RD,
2422     &pmap_l2e_demotions, "1GB page demotions");
2423
2424 void
2425 mmu_radix_clear_modify(vm_page_t m)
2426 {
2427         struct md_page *pvh;
2428         pmap_t pmap;
2429         pv_entry_t next_pv, pv;
2430         pml3_entry_t oldl3e, *l3e;
2431         pt_entry_t oldpte, *pte;
2432         struct rwlock *lock;
2433         vm_offset_t va;
2434         int md_gen, pvh_gen;
2435
2436         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2437             ("pmap_clear_modify: page %p is not managed", m));
2438         vm_page_assert_busied(m);
2439         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
2440
2441         /*
2442          * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
2443          * If the object containing the page is locked and the page is not
2444          * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
2445          */
2446         if ((m->a.flags & PGA_WRITEABLE) == 0)
2447                 return;
2448         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2449             pa_to_pvh(VM_PAGE_TO_PHYS(m));
2450         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
2451         rw_wlock(lock);
2452 restart:
2453         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
2454                 pmap = PV_PMAP(pv);
2455                 if (!PMAP_TRYLOCK(pmap)) {
2456                         pvh_gen = pvh->pv_gen;
2457                         rw_wunlock(lock);
2458                         PMAP_LOCK(pmap);
2459                         rw_wlock(lock);
2460                         if (pvh_gen != pvh->pv_gen) {
2461                                 PMAP_UNLOCK(pmap);
2462                                 goto restart;
2463                         }
2464                 }
2465                 va = pv->pv_va;
2466                 l3e = pmap_pml3e(pmap, va);
2467                 oldl3e = be64toh(*l3e);
2468                 if ((oldl3e & PG_RW) != 0 &&
2469                     pmap_demote_l3e_locked(pmap, l3e, va, &lock) &&
2470                     (oldl3e & PG_W) == 0) {
2471                         /*
2472                          * Write protect the mapping to a
2473                          * single page so that a subsequent
2474                          * write access may repromote.
2475                          */
2476                         va += VM_PAGE_TO_PHYS(m) - (oldl3e &
2477                             PG_PS_FRAME);
2478                         pte = pmap_l3e_to_pte(l3e, va);
2479                         oldpte = be64toh(*pte);
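                        /*
                         * Atomically strip write permission and the modified
                         * bit, reloading the PTE and retrying if a concurrent
                         * hardware R/C update races with the compare-and-set.
                         */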
2480                         while (!atomic_cmpset_long(pte,
2481                             htobe64(oldpte),
2482                                 htobe64((oldpte | RPTE_EAA_R) & ~(PG_M | PG_RW))))
2483                                    oldpte = be64toh(*pte);
2484                         vm_page_dirty(m);
2485                         pmap_invalidate_page(pmap, va);
2486                 }
2487                 PMAP_UNLOCK(pmap);
2488         }
2489         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
2490                 pmap = PV_PMAP(pv);
2491                 if (!PMAP_TRYLOCK(pmap)) {
2492                         md_gen = m->md.pv_gen;
2493                         pvh_gen = pvh->pv_gen;
2494                         rw_wunlock(lock);
2495                         PMAP_LOCK(pmap);
2496                         rw_wlock(lock);
2497                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
2498                                 PMAP_UNLOCK(pmap);
2499                                 goto restart;
2500                         }
2501                 }
2502                 l3e = pmap_pml3e(pmap, pv->pv_va);
2503                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_clear_modify: found"
2504                     " a 2mpage in page %p's pv list", m));
2505                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
2506                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
2507                         atomic_clear_long(pte, htobe64(PG_M));
2508                         pmap_invalidate_page(pmap, pv->pv_va);
2509                 }
2510                 PMAP_UNLOCK(pmap);
2511         }
2512         rw_wunlock(lock);
2513 }
2514
2515 void
2516 mmu_radix_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
2517     vm_size_t len, vm_offset_t src_addr)
2518 {
2519         struct rwlock *lock;
2520         struct spglist free;
2521         vm_offset_t addr;
2522         vm_offset_t end_addr = src_addr + len;
2523         vm_offset_t va_next;
2524         vm_page_t dst_pdpg, dstmpte, srcmpte;
2525         bool invalidate_all;
2526
2527         CTR6(KTR_PMAP,
2528             "%s(dst_pmap=%p, src_pmap=%p, dst_addr=%lx, len=%lu, src_addr=%lx)\n",
2529             __func__, dst_pmap, src_pmap, dst_addr, len, src_addr);
2530
2531         if (dst_addr != src_addr)
2532                 return;
2533         lock = NULL;
2534         invalidate_all = false;
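        /*
         * Lock the two pmaps in a fixed order (lower address first) so that
         * concurrent copies between the same pair of pmaps cannot deadlock.
         */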
2535         if (dst_pmap < src_pmap) {
2536                 PMAP_LOCK(dst_pmap);
2537                 PMAP_LOCK(src_pmap);
2538         } else {
2539                 PMAP_LOCK(src_pmap);
2540                 PMAP_LOCK(dst_pmap);
2541         }
2542
2543         for (addr = src_addr; addr < end_addr; addr = va_next) {
2544                 pml1_entry_t *l1e;
2545                 pml2_entry_t *l2e;
2546                 pml3_entry_t srcptepaddr, *l3e;
2547                 pt_entry_t *src_pte, *dst_pte;
2548
2549                 l1e = pmap_pml1e(src_pmap, addr);
2550                 if ((be64toh(*l1e) & PG_V) == 0) {
2551                         va_next = (addr + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
2552                         if (va_next < addr)
2553                                 va_next = end_addr;
2554                         continue;
2555                 }
2556
2557                 l2e = pmap_l1e_to_l2e(l1e, addr);
2558                 if ((be64toh(*l2e) & PG_V) == 0) {
2559                         va_next = (addr + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
2560                         if (va_next < addr)
2561                                 va_next = end_addr;
2562                         continue;
2563                 }
2564
2565                 va_next = (addr + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
2566                 if (va_next < addr)
2567                         va_next = end_addr;
2568
2569                 l3e = pmap_l2e_to_l3e(l2e, addr);
2570                 srcptepaddr = be64toh(*l3e);
2571                 if (srcptepaddr == 0)
2572                         continue;
2573
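                /*
                 * A 2MB (leaf) source mapping is copied wholesale, but only
                 * when the destination address is 2MB aligned, the whole 2MB
                 * lies within the copy range, and (for managed mappings) a
                 * pv entry can be installed without reclaiming; the wired
                 * bit is dropped and the destination is charged for 512
                 * resident pages.
                 */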
2574                 if (srcptepaddr & RPTE_LEAF) {
2575                         if ((addr & L3_PAGE_MASK) != 0 ||
2576                             addr + L3_PAGE_SIZE > end_addr)
2577                                 continue;
2578                         dst_pdpg = pmap_allocl3e(dst_pmap, addr, NULL);
2579                         if (dst_pdpg == NULL)
2580                                 break;
2581                         l3e = (pml3_entry_t *)
2582                             PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
2583                         l3e = &l3e[pmap_pml3e_index(addr)];
2584                         if (be64toh(*l3e) == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
2585                             pmap_pv_insert_l3e(dst_pmap, addr, srcptepaddr,
2586                             PMAP_ENTER_NORECLAIM, &lock))) {
2587                                 *l3e = htobe64(srcptepaddr & ~PG_W);
2588                                 pmap_resident_count_inc(dst_pmap,
2589                                     L3_PAGE_SIZE / PAGE_SIZE);
2590                                 counter_u64_add(pmap_l3e_mappings, 1);
2591                         } else
2592                                 dst_pdpg->ref_count--;
2593                         continue;
2594                 }
2595
2596                 srcptepaddr &= PG_FRAME;
2597                 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
2598                 KASSERT(srcmpte->ref_count > 0,
2599                     ("pmap_copy: source page table page is unused"));
2600
2601                 if (va_next > end_addr)
2602                         va_next = end_addr;
2603
2604                 src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
2605                 src_pte = &src_pte[pmap_pte_index(addr)];
2606                 dstmpte = NULL;
2607                 while (addr < va_next) {
2608                         pt_entry_t ptetemp;
2609                         ptetemp = be64toh(*src_pte);
2610                         /*
2611                          * We only virtual-copy managed pages.
2612                          */
2613                         if ((ptetemp & PG_MANAGED) != 0) {
2614                                 if (dstmpte != NULL &&
2615                                     dstmpte->pindex == pmap_l3e_pindex(addr))
2616                                         dstmpte->ref_count++;
2617                                 else if ((dstmpte = pmap_allocpte(dst_pmap,
2618                                     addr, NULL)) == NULL)
2619                                         goto out;
2620                                 dst_pte = (pt_entry_t *)
2621                                     PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
2622                                 dst_pte = &dst_pte[pmap_pte_index(addr)];
2623                                 if (be64toh(*dst_pte) == 0 &&
2624                                     pmap_try_insert_pv_entry(dst_pmap, addr,
2625                                     PHYS_TO_VM_PAGE(ptetemp & PG_FRAME),
2626                                     &lock)) {
2627                                         /*
2628                                          * Clear the wired, modified, and
2629                                          * accessed (referenced) bits
2630                                          * during the copy.
2631                                          */
2632                                         *dst_pte = htobe64(ptetemp & ~(PG_W | PG_M |
2633                                             PG_A));
2634                                         pmap_resident_count_inc(dst_pmap, 1);
2635                                 } else {
2636                                         SLIST_INIT(&free);
2637                                         if (pmap_unwire_ptp(dst_pmap, addr,
2638                                             dstmpte, &free)) {
2639                                                 /*
2640                                                  * Although "addr" is not
2641                                                  * mapped, paging-structure
2642                                                  * caches could nonetheless
2643                                                  * have entries that refer to
2644                                                  * the freed page table pages.
2645                                                  * Invalidate those entries.
2646                                                  */
2647                                                 invalidate_all = true;
2648                                                 vm_page_free_pages_toq(&free,
2649                                                     true);
2650                                         }
2651                                         goto out;
2652                                 }
2653                                 if (dstmpte->ref_count >= srcmpte->ref_count)
2654                                         break;
2655                         }
2656                         addr += PAGE_SIZE;
2657                         if (__predict_false((addr & L3_PAGE_MASK) == 0))
2658                                 src_pte = pmap_pte(src_pmap, addr);
2659                         else
2660                                 src_pte++;
2661                 }
2662         }
2663 out:
2664         if (invalidate_all)
2665                 pmap_invalidate_all(dst_pmap);
2666         if (lock != NULL)
2667                 rw_wunlock(lock);
2668         PMAP_UNLOCK(src_pmap);
2669         PMAP_UNLOCK(dst_pmap);
2670 }
2671
2672 static void
2673 mmu_radix_copy_page(vm_page_t msrc, vm_page_t mdst)
2674 {
2675         vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
2676         vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
2677
2678         CTR3(KTR_PMAP, "%s(%#lx, %#lx)", __func__, src, dst);
2679         /*
2680          * XXX slow
2681          */
2682         bcopy((void *)src, (void *)dst, PAGE_SIZE);
2683 }
2684
2685 static void
2686 mmu_radix_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
2687     vm_offset_t b_offset, int xfersize)
2688 {
2689         void *a_cp, *b_cp;
2690         vm_offset_t a_pg_offset, b_pg_offset;
2691         int cnt;
2692
2693         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %#x, %#x)", __func__, ma,
2694             a_offset, mb, b_offset, xfersize);
2695
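        /*
         * Copy through the DMAP one fragment at a time: each pass clamps
         * "cnt" to whatever remains of both the current source page and the
         * current destination page, so arbitrarily aligned offsets and
         * transfer sizes are handled without any temporary mappings.
         */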
2696         while (xfersize > 0) {
2697                 a_pg_offset = a_offset & PAGE_MASK;
2698                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
2699                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2700                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
2701                     a_pg_offset;
2702                 b_pg_offset = b_offset & PAGE_MASK;
2703                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
2704                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
2705                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
2706                     b_pg_offset;
2707                 bcopy(a_cp, b_cp, cnt);
2708                 a_offset += cnt;
2709                 b_offset += cnt;
2710                 xfersize -= cnt;
2711         }
2712 }
2713
2714 #if VM_NRESERVLEVEL > 0
2715 /*
2716  * Tries to promote the 512 contiguous 4KB page mappings that are within a
2717  * single page table page (PTP) to a single 2MB page mapping.  For promotion
2718  * to occur, two conditions must be met: (1) the 4KB page mappings must map
2719  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
2720  * identical characteristics.
2721  */
2722 static int
2723 pmap_promote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va,
2724     struct rwlock **lockp)
2725 {
2726         pml3_entry_t newpde;
2727         pt_entry_t *firstpte, oldpte, pa, *pte;
2728         vm_page_t mpte;
2729
2730         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2731
2732         /*
2733          * Examine the first PTE in the specified PTP.  Abort if this PTE is
2734          * either invalid, unused, or does not map the first 4KB physical page
2735          * within a 2MB page.
2736          */
2737         firstpte = (pt_entry_t *)PHYS_TO_DMAP(be64toh(*pde) & PG_FRAME);
2738 setpde:
2739         newpde = be64toh(*firstpte);
2740         if ((newpde & ((PG_FRAME & L3_PAGE_MASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
2741                 CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2742                     " in pmap %p", va, pmap);
2743                 goto fail;
2744         }
2745         if ((newpde & (PG_M | PG_RW)) == PG_RW) {
2746                 /*
2747                  * When PG_M is already clear, PG_RW can be cleared without
2748                  * a TLB invalidation.
2749                  */
2750                 if (!atomic_cmpset_long(firstpte, htobe64(newpde), htobe64((newpde | RPTE_EAA_R) & ~RPTE_EAA_W)))
2751                         goto setpde;
2752                 newpde &= ~RPTE_EAA_W;
2753         }
2754
2755         /*
2756          * Examine each of the other PTEs in the specified PTP.  Abort if this
2757          * PTE maps an unexpected 4KB physical page or does not have identical
2758          * characteristics to the first PTE.
2759          */
2760         pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + L3_PAGE_SIZE - PAGE_SIZE;
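        /*
         * "pa" starts as the expected contents of the last PTE in the PTP
         * (the first PTE's frame plus L3_PAGE_SIZE - PAGE_SIZE, with PG_A
         * and PG_V folded in) and is decremented by PAGE_SIZE as the loop
         * walks the PTEs from last to first, so any hole or misordering in
         * the backing frames fails the comparison.
         */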
2761         for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
2762 setpte:
2763                 oldpte = be64toh(*pte);
2764                 if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
2765                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2766                             " in pmap %p", va, pmap);
2767                         goto fail;
2768                 }
2769                 if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
2770                         /*
2771                          * When PG_M is already clear, PG_RW can be cleared
2772                          * without a TLB invalidation.
2773                          */
2774                         if (!atomic_cmpset_long(pte, htobe64(oldpte), htobe64((oldpte | RPTE_EAA_R) & ~RPTE_EAA_W)))
2775                                 goto setpte;
2776                         oldpte &= ~RPTE_EAA_W;
2777                         CTR2(KTR_PMAP, "pmap_promote_l3e: protect for va %#lx"
2778                             " in pmap %p", (oldpte & PG_FRAME & L3_PAGE_MASK) |
2779                             (va & ~L3_PAGE_MASK), pmap);
2780                 }
2781                 if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
2782                         CTR2(KTR_PMAP, "pmap_promote_l3e: failure for va %#lx"
2783                             " in pmap %p", va, pmap);
2784                         goto fail;
2785                 }
2786                 pa -= PAGE_SIZE;
2787         }
2788
2789         /*
2790          * Save the page table page in its current state until the PDE
2791          * mapping the superpage is demoted by pmap_demote_l3e() or
2792          * destroyed by pmap_remove_l3e().
2793          */
2794         mpte = PHYS_TO_VM_PAGE(be64toh(*pde) & PG_FRAME);
2795         KASSERT(mpte >= vm_page_array &&
2796             mpte < &vm_page_array[vm_page_array_size],
2797             ("pmap_promote_l3e: page table page is out of range"));
2798         KASSERT(mpte->pindex == pmap_l3e_pindex(va),
2799             ("pmap_promote_l3e: page table page's pindex is wrong"));
2800         if (pmap_insert_pt_page(pmap, mpte)) {
2801                 CTR2(KTR_PMAP,
2802                     "pmap_promote_l3e: failure for va %#lx in pmap %p", va,
2803                     pmap);
2804                 goto fail;
2805         }
2806
2807         /*
2808          * Promote the pv entries.
2809          */
2810         if ((newpde & PG_MANAGED) != 0)
2811                 pmap_pv_promote_l3e(pmap, va, newpde & PG_PS_FRAME, lockp);
2812
2813         pte_store(pde, PG_PROMOTED | newpde);
2814         ptesync();
2815         counter_u64_add(pmap_l3e_promotions, 1);
2816         CTR2(KTR_PMAP, "pmap_promote_l3e: success for va %#lx"
2817             " in pmap %p", va, pmap);
2818         return (0);
2819  fail:
2820         counter_u64_add(pmap_l3e_p_failures, 1);
2821         return (KERN_FAILURE);
2822 }
2823 #endif /* VM_NRESERVLEVEL > 0 */
2824
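/*
 * Insert the given physical page (m) at the specified virtual address (va)
 * in the target pmap with the protection requested.  If PMAP_ENTER_WIRED is
 * set in "flags", the mapping is counted as wired; if "psind" is 1, a 2MB
 * (L3E) mapping is created instead of a 4KB one.  Any existing mapping at
 * "va" is replaced.  Returns KERN_SUCCESS on success and
 * KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP is set and a page table
 * page cannot be allocated; superpage requests may return other KERN_*
 * values from pmap_enter_l3e().
 *
 * Callers normally reach this through the MI pmap_enter() interface, e.g.
 * (illustrative only):
 *
 *	rv = pmap_enter(vmspace_pmap(curproc->p_vmspace), va, m,
 *	    VM_PROT_READ | VM_PROT_WRITE,
 *	    VM_PROT_READ | VM_PROT_WRITE | PMAP_ENTER_WIRED, 0);
 */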
2825 int
2826 mmu_radix_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
2827     vm_prot_t prot, u_int flags, int8_t psind)
2828 {
2829         struct rwlock *lock;
2830         pml3_entry_t *l3e;
2831         pt_entry_t *pte;
2832         pt_entry_t newpte, origpte;
2833         pv_entry_t pv;
2834         vm_paddr_t opa, pa;
2835         vm_page_t mpte, om;
2836         int rv, retrycount;
2837         boolean_t nosleep, invalidate_all, invalidate_page;
2838
2839         va = trunc_page(va);
2840         retrycount = 0;
2841         invalidate_page = invalidate_all = false;
2842         CTR6(KTR_PMAP, "pmap_enter(%p, %#lx, %p, %#x, %#x, %d)", pmap, va,
2843             m, prot, flags, psind);
2844         KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
2845         KASSERT((m->oflags & VPO_UNMANAGED) != 0 || !VA_IS_CLEANMAP(va),
2846             ("pmap_enter: managed mapping within the clean submap"));
2847         if ((m->oflags & VPO_UNMANAGED) == 0)
2848                 VM_PAGE_OBJECT_BUSY_ASSERT(m);
2849
2850         KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
2851             ("pmap_enter: flags %u has reserved bits set", flags));
2852         pa = VM_PAGE_TO_PHYS(m);
2853         newpte = (pt_entry_t)(pa | PG_A | PG_V | RPTE_LEAF);
2854         if ((flags & VM_PROT_WRITE) != 0)
2855                 newpte |= PG_M;
2856         if ((flags & VM_PROT_READ) != 0)
2857                 newpte |= PG_A;
2858         if (prot & VM_PROT_READ)
2859                 newpte |= RPTE_EAA_R;
2860         if ((prot & VM_PROT_WRITE) != 0)
2861                 newpte |= RPTE_EAA_W;
2862         KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
2863             ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
2864
2865         if (prot & VM_PROT_EXECUTE)
2866                 newpte |= PG_X;
2867         if ((flags & PMAP_ENTER_WIRED) != 0)
2868                 newpte |= PG_W;
2869         if (va >= DMAP_MIN_ADDRESS)
2870                 newpte |= RPTE_EAA_P;
2871         newpte |= pmap_cache_bits(m->md.mdpg_cache_attrs);
2872         /*
2873          * Set modified bit gratuitously for writeable mappings if
2874          * the page is unmanaged. We do not want to take a fault
2875          * to do the dirty bit accounting for these mappings.
2876          */
2877         if ((m->oflags & VPO_UNMANAGED) != 0) {
2878                 if ((newpte & PG_RW) != 0)
2879                         newpte |= PG_M;
2880         } else
2881                 newpte |= PG_MANAGED;
2882
2883         lock = NULL;
2884         PMAP_LOCK(pmap);
2885         if (psind == 1) {
2886                 /* Assert the required virtual and physical alignment. */
2887                 KASSERT((va & L3_PAGE_MASK) == 0, ("pmap_enter: va unaligned"));
2888                 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2889                 rv = pmap_enter_l3e(pmap, va, newpte | RPTE_LEAF, flags, m, &lock);
2890                 goto out;
2891         }
2892         mpte = NULL;
2893
2894         /*
2895          * In the case that a page table page is not
2896          * resident, we are creating it here.
2897          */
2898 retry:
2899         l3e = pmap_pml3e(pmap, va);
2900         if (l3e != NULL && (be64toh(*l3e) & PG_V) != 0 && ((be64toh(*l3e) & RPTE_LEAF) == 0 ||
2901             pmap_demote_l3e_locked(pmap, l3e, va, &lock))) {
2902                 pte = pmap_l3e_to_pte(l3e, va);
2903                 if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
2904                         mpte = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
2905                         mpte->ref_count++;
2906                 }
2907         } else if (va < VM_MAXUSER_ADDRESS) {
2908                 /*
2909                  * Here if the pte page isn't mapped, or if it has been
2910                  * deallocated.
2911                  */
2912                 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2913                 mpte = _pmap_allocpte(pmap, pmap_l3e_pindex(va),
2914                     nosleep ? NULL : &lock);
2915                 if (mpte == NULL && nosleep) {
2916                         rv = KERN_RESOURCE_SHORTAGE;
2917                         goto out;
2918                 }
2919                 if (__predict_false(retrycount++ == 6))
2920                         panic("too many retries");
2921                 invalidate_all = true;
2922                 goto retry;
2923         } else
2924                 panic("pmap_enter: invalid page directory va=%#lx", va);
2925
2926         origpte = be64toh(*pte);
2927         pv = NULL;
2928
2929         /*
2930          * Is the specified virtual address already mapped?
2931          */
2932         if ((origpte & PG_V) != 0) {
2933 #ifdef INVARIANTS
2934                 if (VERBOSE_PMAP || pmap_logging) {
2935                         printf("cow fault pmap_enter(%p, %#lx, %p, %#x, %x, %d) --"
2936                             " asid=%lu curpid=%d name=%s origpte0x%lx\n",
2937                             pmap, va, m, prot, flags, psind, pmap->pm_pid,
2938                             curproc->p_pid, curproc->p_comm, origpte);
2939 #ifdef DDB
2940                         pmap_pte_walk(pmap->pm_pml1, va);
2941 #endif
2942                 }
2943 #endif
2944                 /*
2945                  * Wiring change, just update stats. We don't worry about
2946                  * wiring PT pages as they remain resident as long as there
2947                  * are valid mappings in them. Hence, if a user page is wired,
2948                  * the PT page will be also.
2949                  */
2950                 if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
2951                         pmap->pm_stats.wired_count++;
2952                 else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
2953                         pmap->pm_stats.wired_count--;
2954
2955                 /*
2956                  * Remove the extra PT page reference.
2957                  */
2958                 if (mpte != NULL) {
2959                         mpte->ref_count--;
2960                         KASSERT(mpte->ref_count > 0,
2961                             ("pmap_enter: missing reference to page table page,"
2962                              " va: 0x%lx", va));
2963                 }
2964
2965                 /*
2966                  * Has the physical page changed?
2967                  */
2968                 opa = origpte & PG_FRAME;
2969                 if (opa == pa) {
2970                         /*
2971                          * No, might be a protection or wiring change.
2972                          */
2973                         if ((origpte & PG_MANAGED) != 0 &&
2974                             (newpte & PG_RW) != 0)
2975                                 vm_page_aflag_set(m, PGA_WRITEABLE);
2976                         if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) {
2977                                 if ((newpte & (PG_A|PG_M)) != (origpte & (PG_A|PG_M))) {
2978                                         if (!atomic_cmpset_long(pte, htobe64(origpte), htobe64(newpte)))
2979                                                 goto retry;
2980                                         if ((newpte & PG_M) != (origpte & PG_M))
2981                                                 vm_page_dirty(m);
2982                                         if ((newpte & PG_A) != (origpte & PG_A))
2983                                                 vm_page_aflag_set(m, PGA_REFERENCED);
2984                                         ptesync();
2985                                 } else
2986                                         invalidate_all = true;
2987                                 if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
2988                                         goto unchanged;
2989                         }
2990                         goto validate;
2991                 }
2992
2993                 /*
2994                  * The physical page has changed.  Temporarily invalidate
2995                  * the mapping.  This ensures that all threads sharing the
2996                  * pmap keep a consistent view of the mapping, which is
2997                  * necessary for the correct handling of COW faults.  It
2998                  * also permits reuse of the old mapping's PV entry,
2999                  * avoiding an allocation.
3000                  *
3001                  * For consistency, handle unmanaged mappings the same way.
3002                  */
3003                 origpte = be64toh(pte_load_clear(pte));
3004                 KASSERT((origpte & PG_FRAME) == opa,
3005                     ("pmap_enter: unexpected pa update for %#lx", va));
3006                 if ((origpte & PG_MANAGED) != 0) {
3007                         om = PHYS_TO_VM_PAGE(opa);
3008
3009                         /*
3010                          * The pmap lock is sufficient to synchronize with
3011                          * concurrent calls to pmap_page_test_mappings() and
3012                          * pmap_ts_referenced().
3013                          */
3014                         if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
3015                                 vm_page_dirty(om);
3016                         if ((origpte & PG_A) != 0)
3017                                 vm_page_aflag_set(om, PGA_REFERENCED);
3018                         CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3019                         pv = pmap_pvh_remove(&om->md, pmap, va);
3020                         if ((newpte & PG_MANAGED) == 0)
3021                                 free_pv_entry(pmap, pv);
3022 #ifdef INVARIANTS
3023                         else if (origpte & PG_MANAGED) {
3024                                 if (pv == NULL) {
3025 #ifdef DDB
3026                                         pmap_page_print_mappings(om);
3027 #endif
3028                                         MPASS(pv != NULL);
3029                                 }
3030                         }
3031 #endif
3032                         if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3033                             TAILQ_EMPTY(&om->md.pv_list) &&
3034                             ((om->flags & PG_FICTITIOUS) != 0 ||
3035                             TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3036                                 vm_page_aflag_clear(om, PGA_WRITEABLE);
3037                 }
3038                 if ((origpte & PG_A) != 0)
3039                         invalidate_page = true;
3040                 origpte = 0;
3041         } else {
3042                 if (pmap != kernel_pmap) {
3043 #ifdef INVARIANTS
3044                         if (VERBOSE_PMAP || pmap_logging)
3045                                 printf("pmap_enter(%p, %#lx, %p, %#x, %x, %d) -- asid=%lu curpid=%d name=%s\n",
3046                                     pmap, va, m, prot, flags, psind,
3047                                     pmap->pm_pid, curproc->p_pid,
3048                                     curproc->p_comm);
3049 #endif
3050                 }
3051
3052                 /*
3053                  * Increment the counters.
3054                  */
3055                 if ((newpte & PG_W) != 0)
3056                         pmap->pm_stats.wired_count++;
3057                 pmap_resident_count_inc(pmap, 1);
3058         }
3059
3060         /*
3061          * Enter on the PV list if part of our managed memory.
3062          */
3063         if ((newpte & PG_MANAGED) != 0) {
3064                 if (pv == NULL) {
3065                         pv = get_pv_entry(pmap, &lock);
3066                         pv->pv_va = va;
3067                 }
3068 #ifdef VERBOSE_PV
3069                 else
3070                         printf("reassigning pv: %p to pmap: %p\n",
3071                                    pv, pmap);
3072 #endif
3073                 CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3074                 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3075                 m->md.pv_gen++;
3076                 if ((newpte & PG_RW) != 0)
3077                         vm_page_aflag_set(m, PGA_WRITEABLE);
3078         }
3079
3080         /*
3081          * Update the PTE.
3082          */
3083         if ((origpte & PG_V) != 0) {
3084 validate:
3085                 origpte = be64toh(pte_load_store(pte, htobe64(newpte)));
3086                 KASSERT((origpte & PG_FRAME) == pa,
3087                     ("pmap_enter: unexpected pa update for %#lx", va));
3088                 if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
3089                     (PG_M | PG_RW)) {
3090                         if ((origpte & PG_MANAGED) != 0)
3091                                 vm_page_dirty(m);
3092                         invalidate_page = true;
3093
3094                         /*
3095                          * Although the PTE may still have PG_RW set, TLB
3096                          * invalidation may nonetheless be required because
3097                          * the PTE no longer has PG_M set.
3098                          */
3099                 } else if ((origpte & PG_X) != 0 || (newpte & PG_X) == 0) {
3100                         /*
3101                          * Removing capabilities requires invalidation on POWER.
3102                          */
3103                         invalidate_page = true;
3104                         goto unchanged;
3105                 }
3106                 if ((origpte & PG_A) != 0)
3107                         invalidate_page = true;
3108         } else {
3109                 pte_store(pte, newpte);
3110                 ptesync();
3111         }
3112 unchanged:
3113
3114 #if VM_NRESERVLEVEL > 0
3115         /*
3116          * If both the page table page and the reservation are fully
3117          * populated, then attempt promotion.
3118          */
3119         if ((mpte == NULL || mpte->ref_count == NPTEPG) &&
3120             mmu_radix_ps_enabled(pmap) &&
3121             (m->flags & PG_FICTITIOUS) == 0 &&
3122             vm_reserv_level_iffullpop(m) == 0 &&
3123                 pmap_promote_l3e(pmap, l3e, va, &lock) == 0)
3124                 invalidate_all = true;
3125 #endif
3126         if (invalidate_all)
3127                 pmap_invalidate_all(pmap);
3128         else if (invalidate_page)
3129                 pmap_invalidate_page(pmap, va);
3130
3131         rv = KERN_SUCCESS;
3132 out:
3133         if (lock != NULL)
3134                 rw_wunlock(lock);
3135         PMAP_UNLOCK(pmap);
3136
3137         return (rv);
3138 }
3139
3140 /*
3141  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
3142  * if successful.  Returns false if (1) a page table page cannot be allocated
3143  * without sleeping, (2) a mapping already exists at the specified virtual
3144  * address, or (3) a PV entry cannot be allocated without reclaiming another
3145  * PV entry.
3146  */
3147 static bool
3148 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3149     struct rwlock **lockp)
3150 {
3151         pml3_entry_t newpde;
3152
3153         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3154         newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs) |
3155             RPTE_LEAF | PG_V;
3156         if ((m->oflags & VPO_UNMANAGED) == 0)
3157                 newpde |= PG_MANAGED;
3158         if (prot & VM_PROT_EXECUTE)
3159                 newpde |= PG_X;
3160         if (prot & VM_PROT_READ)
3161                 newpde |= RPTE_EAA_R;
3162         if (va >= DMAP_MIN_ADDRESS)
3163                 newpde |= RPTE_EAA_P;
3164         return (pmap_enter_l3e(pmap, va, newpde, PMAP_ENTER_NOSLEEP |
3165             PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
3166             KERN_SUCCESS);
3167 }
3168
3169 /*
3170  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3171  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3172  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3173  * a mapping already exists at the specified virtual address.  Returns
3174  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3175  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
3176  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3177  *
3178  * The parameter "m" is only used when creating a managed, writeable mapping.
3179  */
3180 static int
3181 pmap_enter_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t newpde, u_int flags,
3182     vm_page_t m, struct rwlock **lockp)
3183 {
3184         struct spglist free;
3185         pml3_entry_t oldl3e, *l3e;
3186         vm_page_t mt, pdpg;
3187
3188         KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
3189             ("pmap_enter_pde: newpde is missing PG_M"));
3190         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3191
3192         if ((pdpg = pmap_allocl3e(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3193             NULL : lockp)) == NULL) {
3194                 CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3195                     " in pmap %p", va, pmap);
3196                 return (KERN_RESOURCE_SHORTAGE);
3197         }
3198         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
3199         l3e = &l3e[pmap_pml3e_index(va)];
3200         oldl3e = be64toh(*l3e);
3201         if ((oldl3e & PG_V) != 0) {
3202                 KASSERT(pdpg->ref_count > 1,
3203                     ("pmap_enter_pde: pdpg's wire count is too low"));
3204                 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3205                         pdpg->ref_count--;
3206                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3207                             " in pmap %p", va, pmap);
3208                         return (KERN_FAILURE);
3209                 }
3210                 /* Break the existing mapping(s). */
3211                 SLIST_INIT(&free);
3212                 if ((oldl3e & RPTE_LEAF) != 0) {
3213                         /*
3214                          * The reference to the PD page that was acquired by
3215                          * pmap_allocl3e() ensures that it won't be freed.
3216                          * However, if the PDE resulted from a promotion, then
3217                          * a reserved PT page could be freed.
3218                          */
3219                         (void)pmap_remove_l3e(pmap, l3e, va, &free, lockp);
3220                         pmap_invalidate_l3e_page(pmap, va, oldl3e);
3221                 } else {
3222                         if (pmap_remove_ptes(pmap, va, va + L3_PAGE_SIZE, l3e,
3223                             &free, lockp))
3224                                pmap_invalidate_all(pmap);
3225                 }
3226                 vm_page_free_pages_toq(&free, true);
3227                 if (va >= VM_MAXUSER_ADDRESS) {
3228                         mt = PHYS_TO_VM_PAGE(be64toh(*l3e) & PG_FRAME);
3229                         if (pmap_insert_pt_page(pmap, mt)) {
3230                                 /*
3231                                  * XXX Currently, this can't happen because
3232                                  * we do not perform pmap_enter(psind == 1)
3233                                  * on the kernel pmap.
3234                                  */
3235                                 panic("pmap_enter_pde: trie insert failed");
3236                         }
3237                 } else
3238                         KASSERT(be64toh(*l3e) == 0, ("pmap_enter_pde: non-zero pde %p",
3239                             l3e));
3240         }
3241         if ((newpde & PG_MANAGED) != 0) {
3242                 /*
3243                  * Abort this mapping if its PV entry could not be created.
3244                  */
3245                 if (!pmap_pv_insert_l3e(pmap, va, newpde, flags, lockp)) {
3246                         SLIST_INIT(&free);
3247                         if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
3248                                 /*
3249                                  * Although "va" is not mapped, paging-
3250                                  * structure caches could nonetheless have
3251                                  * entries that refer to the freed page table
3252                                  * pages.  Invalidate those entries.
3253                                  */
3254                                 pmap_invalidate_page(pmap, va);
3255                                 vm_page_free_pages_toq(&free, true);
3256                         }
3257                         CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
3258                             " in pmap %p", va, pmap);
3259                         return (KERN_RESOURCE_SHORTAGE);
3260                 }
3261                 if ((newpde & PG_RW) != 0) {
3262                         for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
3263                                 vm_page_aflag_set(mt, PGA_WRITEABLE);
3264                 }
3265         }
3266
3267         /*
3268          * Increment counters.
3269          */
3270         if ((newpde & PG_W) != 0)
3271                 pmap->pm_stats.wired_count += L3_PAGE_SIZE / PAGE_SIZE;
3272         pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
3273
3274         /*
3275          * Map the superpage.  (This is not a promoted mapping; there will not
3276          * be any lingering 4KB page mappings in the TLB.)
3277          */
3278         pte_store(l3e, newpde);
3279         ptesync();
3280
3281         counter_u64_add(pmap_l3e_mappings, 1);
3282         CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
3283             " in pmap %p", va, pmap);
3284         return (KERN_SUCCESS);
3285 }
3286
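/*
 * Map a sequence of resident pages belonging to the same object into a
 * contiguous range of virtual addresses starting at "start".  2MB page
 * mappings are used whenever the page and the virtual address are suitably
 * aligned and superpages are enabled; otherwise each page is entered with
 * mmu_radix_enter_quick_locked().  The object containing m_start must be
 * locked.
 */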
3287 void
3288 mmu_radix_enter_object(pmap_t pmap, vm_offset_t start,
3289     vm_offset_t end, vm_page_t m_start, vm_prot_t prot)
3290 {
3291         struct rwlock *lock;
3292         vm_offset_t va;
3293         vm_page_t m, mpte;
3294         vm_pindex_t diff, psize;
3295         bool invalidate;
3296
3297         VM_OBJECT_ASSERT_LOCKED(m_start->object);
3298
3299         CTR6(KTR_PMAP, "%s(%p, %#x, %#x, %p, %#x)", __func__, pmap, start,
3300             end, m_start, prot);
3301
3302         invalidate = false;
3303         psize = atop(end - start);
3304         mpte = NULL;
3305         m = m_start;
3306         lock = NULL;
3307         PMAP_LOCK(pmap);
3308         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3309                 va = start + ptoa(diff);
3310                 if ((va & L3_PAGE_MASK) == 0 && va + L3_PAGE_SIZE <= end &&
3311                     m->psind == 1 && mmu_radix_ps_enabled(pmap) &&
3312                     pmap_enter_2mpage(pmap, va, m, prot, &lock))
3313                         m = &m[L3_PAGE_SIZE / PAGE_SIZE - 1];
3314                 else
3315                         mpte = mmu_radix_enter_quick_locked(pmap, va, m, prot,
3316                             mpte, &lock, &invalidate);
3317                 m = TAILQ_NEXT(m, listq);
3318         }
3319         ptesync();
3320         if (lock != NULL)
3321                 rw_wunlock(lock);
3322         if (invalidate)
3323                 pmap_invalidate_all(pmap);
3324         PMAP_UNLOCK(pmap);
3325 }
3326
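/*
 * Guts of mmu_radix_enter_quick(): create a single 4KB, read- and/or
 * execute-only mapping without sleeping.  The mapping is silently skipped
 * if a PTE already exists at "va" or if a PV entry cannot be allocated
 * without reclaiming another.  "mpte" caches the page table page from the
 * previous call so that consecutive calls within one 2MB region avoid
 * repeated lookups.
 */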
3327 static vm_page_t
3328 mmu_radix_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3329     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp, bool *invalidate)
3330 {
3331         struct spglist free;
3332         pt_entry_t *pte;
3333         vm_paddr_t pa;
3334
3335         KASSERT(!VA_IS_CLEANMAP(va) ||
3336             (m->oflags & VPO_UNMANAGED) != 0,
3337             ("mmu_radix_enter_quick_locked: managed mapping within the clean submap"));
3338         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3339
3340         /*
3341          * In the case that a page table page is not
3342          * resident, we are creating it here.
3343          */
3344         if (va < VM_MAXUSER_ADDRESS) {
3345                 vm_pindex_t ptepindex;
3346                 pml3_entry_t *ptepa;
3347
3348                 /*
3349                  * Calculate pagetable page index
3350                  */
3351                 ptepindex = pmap_l3e_pindex(va);
3352                 if (mpte && (mpte->pindex == ptepindex)) {
3353                         mpte->ref_count++;
3354                 } else {
3355                         /*
3356                          * Get the page directory entry
3357                          */
3358                         ptepa = pmap_pml3e(pmap, va);
3359
3360                         /*
3361                          * If the page table page is mapped, we just increment
3362                          * the hold count, and activate it.  Otherwise, we
3363                          * attempt to allocate a page table page.  If this
3364                          * attempt fails, we don't retry.  Instead, we give up.
3365                          */
3366                         if (ptepa && (be64toh(*ptepa) & PG_V) != 0) {
3367                                 if (be64toh(*ptepa) & RPTE_LEAF)
3368                                         return (NULL);
3369                                 mpte = PHYS_TO_VM_PAGE(be64toh(*ptepa) & PG_FRAME);
3370                                 mpte->ref_count++;
3371                         } else {
3372                                 /*
3373                                  * Pass NULL instead of the PV list lock
3374                                  * pointer, because we don't intend to sleep.
3375                                  */
3376                                 mpte = _pmap_allocpte(pmap, ptepindex, NULL);
3377                                 if (mpte == NULL)
3378                                         return (mpte);
3379                         }
3380                 }
3381                 pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3382                 pte = &pte[pmap_pte_index(va)];
3383         } else {
3384                 mpte = NULL;
3385                 pte = pmap_pte(pmap, va);
3386         }
3387         if (be64toh(*pte)) {
3388                 if (mpte != NULL) {
3389                         mpte->ref_count--;
3390                         mpte = NULL;
3391                 }
3392                 return (mpte);
3393         }
3394
3395         /*
3396          * Enter on the PV list if part of our managed memory.
3397          */
3398         if ((m->oflags & VPO_UNMANAGED) == 0 &&
3399             !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3400                 if (mpte != NULL) {
3401                         SLIST_INIT(&free);
3402                         if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3403                                 /*
3404                                  * Although "va" is not mapped, paging-
3405                                  * structure caches could nonetheless have
3406                                  * entries that refer to the freed page table
3407                                  * pages.  Invalidate those entries.
3408                                  */
3409                                 *invalidate = true;
3410                                 vm_page_free_pages_toq(&free, true);
3411                         }
3412                         mpte = NULL;
3413                 }
3414                 return (mpte);
3415         }
3416
3417         /*
3418          * Increment counters
3419          */
3420         pmap_resident_count_inc(pmap, 1);
3421
3422         pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.mdpg_cache_attrs);
3423         if (prot & VM_PROT_EXECUTE)
3424                 pa |= PG_X;
3425         else
3426                 pa |= RPTE_EAA_R;
3427         if ((m->oflags & VPO_UNMANAGED) == 0)
3428                 pa |= PG_MANAGED;
3429
3430         pte_store(pte, pa);
3431         return (mpte);
3432 }
3433
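/*
 * Create a non-wired, read- and/or execute-only mapping for the given page
 * without any attempt to reclaim resources: the operation simply gives up
 * if a page table page or PV entry cannot be obtained without sleeping.
 * This is intended for prefaulting paths such as vm_fault_prefault().
 */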
3434 void
3435 mmu_radix_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
3436     vm_prot_t prot)
3437 {
3438         struct rwlock *lock;
3439         bool invalidate;
3440
3441         lock = NULL;
3442         invalidate = false;
3443         PMAP_LOCK(pmap);
3444         mmu_radix_enter_quick_locked(pmap, va, m, prot, NULL, &lock,
3445             &invalidate);
3446         ptesync();
3447         if (lock != NULL)
3448                 rw_wunlock(lock);
3449         if (invalidate)
3450                 pmap_invalidate_all(pmap);
3451         PMAP_UNLOCK(pmap);
3452 }
3453
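/*
 * Extract the physical page address associated with the given pmap/virtual
 * address pair.  Both 2MB (leaf L3E) and 4KB mappings are handled; 0 is
 * returned when no page table exists for "va".  Reached through the MI
 * interface, e.g. (illustrative only): pa = pmap_extract(kernel_pmap, va);
 */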
3454 vm_paddr_t
3455 mmu_radix_extract(pmap_t pmap, vm_offset_t va)
3456 {
3457         pml3_entry_t *l3e;
3458         pt_entry_t *pte;
3459         vm_paddr_t pa;
3460
3461         l3e = pmap_pml3e(pmap, va);
3462         if (__predict_false(l3e == NULL))
3463                 return (0);
3464         if (be64toh(*l3e) & RPTE_LEAF) {
3465                 pa = (be64toh(*l3e) & PG_PS_FRAME) |
3466                     (va & L3_PAGE_MASK);
3467         } else {
3468                 /*
3469                  * Beware of a concurrent promotion that changes the
3470                  * PDE at this point!  For example, vtopte() must not
3471                  * be used to access the PTE because it would use the
3472                  * new PDE.  It is, however, safe to use the old PDE
3473                  * because the page table page is preserved by the
3474                  * promotion.
3475                  */
3476                 pte = pmap_l3e_to_pte(l3e, va);
3477                 if (__predict_false(pte == NULL))
3478                         return (0);
3479                 pa = be64toh(*pte);
3480                 pa = (pa & PG_FRAME) |
3481                     (va & PAGE_MASK);
3482         }
3483         return (pa);
3484 }
3485
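/*
 * Atomically extract and wire the page mapped at the given virtual address,
 * provided that the mapping permits the requested protection.  Returns NULL
 * if there is no such mapping or if the page cannot be wired.
 */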
3486 vm_page_t
3487 mmu_radix_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
3488 {
3489         pml3_entry_t l3e, *l3ep;
3490         pt_entry_t pte;
3491         vm_page_t m;
3492
3493         m = NULL;
3494         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, va, prot);
3495         PMAP_LOCK(pmap);
3496         l3ep = pmap_pml3e(pmap, va);
3497         if (l3ep != NULL && (l3e = be64toh(*l3ep))) {
3498                 if (l3e & RPTE_LEAF) {
3499                         if ((l3e & PG_RW) || (prot & VM_PROT_WRITE) == 0)
3500                                 m = PHYS_TO_VM_PAGE((l3e & PG_PS_FRAME) |
3501                                     (va & L3_PAGE_MASK));
3502                 } else {
3503                         /* Native endian PTE, do not pass to pmap functions */
3504                         pte = be64toh(*pmap_l3e_to_pte(l3ep, va));
3505                         if ((pte & PG_V) &&
3506                             ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0))
3507                                 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
3508                 }
3509                 if (m != NULL && !vm_page_wire_mapped(m))
3510                         m = NULL;
3511         }
3512         PMAP_UNLOCK(pmap);
3513         return (m);
3514 }
3515
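/*
 * Grow the number of kernel page table pages so that the kernel map covers
 * "addr", allocating page directory and page table pages as needed and
 * advancing kernel_vm_end in L3_PAGE_SIZE (2MB) steps.
 */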
3516 static void
3517 mmu_radix_growkernel(vm_offset_t addr)
3518 {
3519         vm_paddr_t paddr;
3520         vm_page_t nkpg;
3521         pml3_entry_t *l3e;
3522         pml2_entry_t *l2e;
3523
3524         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
3525         if (VM_MIN_KERNEL_ADDRESS < addr &&
3526                 addr < (VM_MIN_KERNEL_ADDRESS + nkpt * L3_PAGE_SIZE))
3527                 return;
3528
3529         addr = roundup2(addr, L3_PAGE_SIZE);
3530         if (addr - 1 >= vm_map_max(kernel_map))
3531                 addr = vm_map_max(kernel_map);
3532         while (kernel_vm_end < addr) {
3533                 l2e = pmap_pml2e(kernel_pmap, kernel_vm_end);
3534                 if ((be64toh(*l2e) & PG_V) == 0) {
3535                         /* We need a new PDP entry */
3536                         nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
3537                             VM_ALLOC_WIRED | VM_ALLOC_ZERO);
3538                         if (nkpg == NULL)
3539                                 panic("pmap_growkernel: no memory to grow kernel");
3540                         nkpg->pindex = kernel_vm_end >> L2_PAGE_SIZE_SHIFT;
3541                         paddr = VM_PAGE_TO_PHYS(nkpg);
3542                         pde_store(l2e, paddr);
3543                         continue; /* try again */
3544                 }
3545                 l3e = pmap_l2e_to_l3e(l2e, kernel_vm_end);
3546                 if ((be64toh(*l3e) & PG_V) != 0) {
3547                         kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3548                         if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3549                                 kernel_vm_end = vm_map_max(kernel_map);
3550                                 break;
3551                         }
3552                         continue;
3553                 }
3554
3555                 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
3556                     VM_ALLOC_ZERO);
3557                 if (nkpg == NULL)
3558                         panic("pmap_growkernel: no memory to grow kernel");
3559                 nkpg->pindex = pmap_l3e_pindex(kernel_vm_end);
3560                 paddr = VM_PAGE_TO_PHYS(nkpg);
3561                 pde_store(l3e, paddr);
3562
3563                 kernel_vm_end = (kernel_vm_end + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
3564                 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3565                         kernel_vm_end = vm_map_max(kernel_map);
3566                         break;
3567                 }
3568         }
3569         ptesync();
3570 }
3571
3572 static MALLOC_DEFINE(M_RADIX_PGD, "radix_pgd", "radix page table root directory");
3573 static uma_zone_t zone_radix_pgd;
3574
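/*
 * UMA cache-zone import routine for radix page table roots.  Each item is a
 * wired, physically contiguous RADIX_PGD_SIZE allocation, aligned to
 * RADIX_PGD_SIZE and bounded by L1_PAGE_SIZE, returned via its direct-map
 * address.  A pmap's root table is expected to come from this cache, e.g.
 * (illustrative only): pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
 */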
3575 static int
3576 radix_pgd_import(void *arg __unused, void **store, int count, int domain __unused,
3577     int flags)
3578 {
3579         int req;
3580
3581         req = VM_ALLOC_WIRED | malloc2vm_flags(flags);
3582         for (int i = 0; i < count; i++) {
3583                 vm_page_t m = vm_page_alloc_noobj_contig(req,
3584                     RADIX_PGD_SIZE / PAGE_SIZE,
3585                     0, (vm_paddr_t)-1, RADIX_PGD_SIZE, L1_PAGE_SIZE,
3586                     VM_MEMATTR_DEFAULT);
3587                 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3588         }
3589         return (count);
3590 }
3591
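/*
 * UMA cache-zone release routine: unwire the pages backing each cached page
 * table root and queue them for freeing.
 */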
3592 static void
3593 radix_pgd_release(void *arg __unused, void **store, int count)
3594 {
3595         vm_page_t m;
3596         struct spglist free;
3597         int page_count;
3598
3599         SLIST_INIT(&free);
3600         page_count = RADIX_PGD_SIZE/PAGE_SIZE;
3601
3602         for (int i = 0; i < count; i++) {
3603                 /*
3604                  * XXX selectively remove dmap and KVA entries so we don't
3605                  * need to bzero
3606                  */
3607                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
3608                 for (int j = page_count-1; j >= 0; j--) {
3609                         vm_page_unwire_noq(&m[j]);
3610                         SLIST_INSERT_HEAD(&free, &m[j], plinks.s.ss);
3611                 }
3612                 vm_page_free_pages_toq(&free, false);
3613         }
3614 }
3615
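/*
 * Initialize the pmap module.  Called via pmap_init() once the page
 * allocator is running: set up vm_page structures for the bootstrap kernel
 * page table pages, create the UMA cache for page table roots, and
 * initialize the PV list locks, the superpage PV head table, and the ASID
 * arena.
 */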
3616 static void
3617 mmu_radix_init(void)
3618 {
3619         vm_page_t mpte;
3620         vm_size_t s;
3621         int error, i, pv_npg;
3622
3623         /* XXX is this really needed for POWER? */
3624         /* L1TF, reserve page @0 unconditionally */
3625         vm_page_blacklist_add(0, bootverbose);
3626
3627         zone_radix_pgd = uma_zcache_create("radix_pgd_cache",
3628                 RADIX_PGD_SIZE, NULL, NULL,
3629 #ifdef INVARIANTS
3630             trash_init, trash_fini,
3631 #else
3632             NULL, NULL,
3633 #endif
3634                 radix_pgd_import, radix_pgd_release,
3635                 NULL, UMA_ZONE_NOBUCKET);
3636
3637         /*
3638          * Initialize the vm page array entries for the kernel pmap's
3639          * page table pages.
3640          */
3641         PMAP_LOCK(kernel_pmap);
3642         for (i = 0; i < nkpt; i++) {
3643                 mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
3644                 KASSERT(mpte >= vm_page_array &&
3645                     mpte < &vm_page_array[vm_page_array_size],
3646                     ("pmap_init: page table page is out of range size: %lu",
3647                      vm_page_array_size));
3648                 mpte->pindex = pmap_l3e_pindex(VM_MIN_KERNEL_ADDRESS) + i;
3649                 mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
3650                 MPASS(PHYS_TO_VM_PAGE(mpte->phys_addr) == mpte);
3651                 //pmap_insert_pt_page(kernel_pmap, mpte);
3652                 mpte->ref_count = 1;
3653         }
3654         PMAP_UNLOCK(kernel_pmap);
3655         vm_wire_add(nkpt);
3656
3657         CTR1(KTR_PMAP, "%s()", __func__);
3658         TAILQ_INIT(&pv_dummy.pv_list);
3659
3660         /*
3661          * Are large page mappings enabled?
3662          */
3663         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
3664         if (superpages_enabled) {
3665                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
3666                     ("pmap_init: can't assign to pagesizes[1]"));
3667                 pagesizes[1] = L3_PAGE_SIZE;
3668         }
3669
3670         /*
3671          * Initialize the pv chunk list mutex.
3672          */
3673         mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
3674
3675         /*
3676          * Initialize the pool of pv list locks.
3677          */
3678         for (i = 0; i < NPV_LIST_LOCKS; i++)
3679                 rw_init(&pv_list_locks[i], "pmap pv list");
3680
3681         /*
3682          * Calculate the size of the pv head table for superpages.
3683          */
3684         pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L3_PAGE_SIZE);
3685
3686         /*
3687          * Allocate memory for the pv head table for superpages.
3688          */
3689         s = (vm_size_t)(pv_npg * sizeof(struct md_page));
3690         s = round_page(s);
3691         pv_table = kmem_malloc(s, M_WAITOK | M_ZERO);
3692         for (i = 0; i < pv_npg; i++)
3693                 TAILQ_INIT(&pv_table[i].pv_list);
3694         TAILQ_INIT(&pv_dummy.pv_list);
3695
3696         pmap_initialized = 1;
3697         mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
3698         error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
3699             (vmem_addr_t *)&qframe);
3700
3701         if (error != 0)
3702                 panic("qframe allocation failed");
3703         asid_arena = vmem_create("ASID", isa3_base_pid + 1, (1<<isa3_pid_bits),
3704             1, 1, M_WAITOK);
3705 }
3706
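/*
 * Return TRUE if any mapping of "m" has the requested accessed and/or
 * modified attributes set.  Both 4KB mappings (the page's own PV list) and
 * 2MB mappings (the superpage's PV head table entry) are examined, retrying
 * whenever a pmap lock cannot be taken without dropping the PV list lock.
 */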
3707 static boolean_t
3708 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3709 {
3710         struct rwlock *lock;
3711         pv_entry_t pv;
3712         struct md_page *pvh;
3713         pt_entry_t *pte, mask;
3714         pmap_t pmap;
3715         int md_gen, pvh_gen;
3716         boolean_t rv;
3717
3718         rv = FALSE;
3719         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3720         rw_rlock(lock);
3721 restart:
3722         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
3723                 pmap = PV_PMAP(pv);
3724                 if (!PMAP_TRYLOCK(pmap)) {
3725                         md_gen = m->md.pv_gen;
3726                         rw_runlock(lock);
3727                         PMAP_LOCK(pmap);
3728                         rw_rlock(lock);
3729                         if (md_gen != m->md.pv_gen) {
3730                                 PMAP_UNLOCK(pmap);
3731                                 goto restart;
3732                         }
3733                 }
3734                 pte = pmap_pte(pmap, pv->pv_va);
3735                 mask = 0;
3736                 if (modified)
3737                         mask |= PG_RW | PG_M;
3738                 if (accessed)
3739                         mask |= PG_V | PG_A;
3740                 rv = (be64toh(*pte) & mask) == mask;
3741                 PMAP_UNLOCK(pmap);
3742                 if (rv)
3743                         goto out;
3744         }
3745         if ((m->flags & PG_FICTITIOUS) == 0) {
3746                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3747                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
3748                         pmap = PV_PMAP(pv);
3749                         if (!PMAP_TRYLOCK(pmap)) {
3750                                 md_gen = m->md.pv_gen;
3751                                 pvh_gen = pvh->pv_gen;
3752                                 rw_runlock(lock);
3753                                 PMAP_LOCK(pmap);
3754                                 rw_rlock(lock);
3755                                 if (md_gen != m->md.pv_gen ||
3756                                     pvh_gen != pvh->pv_gen) {
3757                                         PMAP_UNLOCK(pmap);
3758                                         goto restart;
3759                                 }
3760                         }
3761                         pte = pmap_pml3e(pmap, pv->pv_va);
3762                         mask = 0;
3763                         if (modified)
3764                                 mask |= PG_RW | PG_M;
3765                         if (accessed)
3766                                 mask |= PG_V | PG_A;
3767                         rv = (be64toh(*pte) & mask) == mask;
3768                         PMAP_UNLOCK(pmap);
3769                         if (rv)
3770                                 goto out;
3771                 }
3772         }
3773 out:
3774         rw_runlock(lock);
3775         return (rv);
3776 }
3777
3778 /*
3779  *      pmap_is_modified:
3780  *
3781  *      Return whether or not the specified physical page was modified
3782  *      in any physical maps.
3783  */
3784 boolean_t
3785 mmu_radix_is_modified(vm_page_t m)
3786 {
3787
3788         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3789             ("pmap_is_modified: page %p is not managed", m));
3790
3791         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3792         /*
3793          * If the page is not busied then this check is racy.
3794          */
3795         if (!pmap_page_is_write_mapped(m))
3796                 return (FALSE);
3797         return (pmap_page_test_mappings(m, FALSE, TRUE));
3798 }
3799
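/*
 * Return TRUE if "addr" is eligible for prefaulting in "pmap", that is, a
 * page table page exists for the address, it is not a 2MB leaf, and no
 * valid PTE is currently present.
 */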
3800 boolean_t
3801 mmu_radix_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3802 {
3803         pml3_entry_t *l3e;
3804         pt_entry_t *pte;
3805         boolean_t rv;
3806
3807         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
3808         rv = FALSE;
3809         PMAP_LOCK(pmap);
3810         l3e = pmap_pml3e(pmap, addr);
3811         if (l3e != NULL && (be64toh(*l3e) & (RPTE_LEAF | PG_V)) == PG_V) {
3812                 pte = pmap_l3e_to_pte(l3e, addr);
3813                 rv = (be64toh(*pte) & PG_V) == 0;
3814         }
3815         PMAP_UNLOCK(pmap);
3816         return (rv);
3817 }
3818
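/*
 *	pmap_is_referenced:
 *
 *	Return whether or not the specified physical page was referenced
 *	in any physical maps.
 */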
3819 boolean_t
3820 mmu_radix_is_referenced(vm_page_t m)
3821 {
3822         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3823             ("pmap_is_referenced: page %p is not managed", m));
3824         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3825         return (pmap_page_test_mappings(m, TRUE, FALSE));
3826 }
3827
3828 /*
3829  *      pmap_ts_referenced:
3830  *
3831  *      Return a count of reference bits for a page, clearing those bits.
3832  *      It is not necessary for every reference bit to be cleared, but it
3833  *      is necessary that 0 only be returned when there are truly no
3834  *      reference bits set.
3835  *
3836  *      As an optimization, update the page's dirty field if a modified bit is
3837  *      found while counting reference bits.  This opportunistic update can be
3838  *      performed at low cost and can eliminate the need for some future calls
3839  *      to pmap_is_modified().  However, since this function stops after
3840  *      finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3841  *      dirty pages.  Those dirty pages will only be detected by a future call
3842  *      to pmap_is_modified().
3843  *
3844  *      A DI block is not needed within this function, because
3845  *      invalidations are performed before the PV list lock is
3846  *      released.
3847  */
3848 int
3849 mmu_radix_ts_referenced(vm_page_t m)
3850 {
3851         struct md_page *pvh;
3852         pv_entry_t pv, pvf;
3853         pmap_t pmap;
3854         struct rwlock *lock;
3855         pml3_entry_t oldl3e, *l3e;
3856         pt_entry_t *pte;
3857         vm_paddr_t pa;
3858         int cleared, md_gen, not_cleared, pvh_gen;
3859         struct spglist free;
3860
3861         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
3862         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3863             ("pmap_ts_referenced: page %p is not managed", m));
3864         SLIST_INIT(&free);
3865         cleared = 0;
3866         pa = VM_PAGE_TO_PHYS(m);
3867         lock = PHYS_TO_PV_LIST_LOCK(pa);
3868         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
3869         rw_wlock(lock);
3870 retry:
3871         not_cleared = 0;
3872         if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
3873                 goto small_mappings;
3874         pv = pvf;
3875         do {
3876                 if (pvf == NULL)
3877                         pvf = pv;
3878                 pmap = PV_PMAP(pv);
3879                 if (!PMAP_TRYLOCK(pmap)) {
3880                         pvh_gen = pvh->pv_gen;
3881                         rw_wunlock(lock);
3882                         PMAP_LOCK(pmap);
3883                         rw_wlock(lock);
3884                         if (pvh_gen != pvh->pv_gen) {
3885                                 PMAP_UNLOCK(pmap);
3886                                 goto retry;
3887                         }
3888                 }
3889                 l3e = pmap_pml3e(pmap, pv->pv_va);
3890                 oldl3e = be64toh(*l3e);
3891                 if ((oldl3e & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
3892                         /*
3893                          * Although "oldl3e" is mapping a 2MB page, because
3894                          * this function is called at a 4KB page granularity,
3895                          * we only update the 4KB page under test.
3896                          */
3897                         vm_page_dirty(m);
3898                 }
3899                 if ((oldl3e & PG_A) != 0) {
3900                         /*
3901                          * Since this reference bit is shared by 512 4KB
3902                          * pages, it should not be cleared every time it is
3903                          * tested.  Apply a simple "hash" function on the
3904                          * physical page number, the virtual superpage number,
3905                          * and the pmap address to select one 4KB page out of
3906                          * the 512 on which testing the reference bit will
3907                          * result in clearing that reference bit.  This
3908                          * function is designed to avoid the selection of the
3909                          * same 4KB page for every 2MB page mapping.
3910                          *
3911                          * On demotion, a mapping that hasn't been referenced
3912                          * is simply destroyed.  To avoid the possibility of a
3913                          * subsequent page fault on a demoted wired mapping,
3914                          * always leave its reference bit set.  Moreover,
3915                          * since the superpage is wired, the current state of
3916                          * its reference bit won't affect page replacement.
3917                          */
3918                         if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L3_PAGE_SIZE_SHIFT) ^
3919                             (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
3920                             (oldl3e & PG_W) == 0) {
3921                                 atomic_clear_long(l3e, htobe64(PG_A));
3922                                 pmap_invalidate_page(pmap, pv->pv_va);
3923                                 cleared++;
3924                                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3925                                     ("inconsistent pv lock %p %p for page %p",
3926                                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3927                         } else
3928                                 not_cleared++;
3929                 }
3930                 PMAP_UNLOCK(pmap);
3931                 /* Rotate the PV list if it has more than one entry. */
3932                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3933                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
3934                         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
3935                         pvh->pv_gen++;
3936                 }
3937                 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
3938                         goto out;
3939         } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
3940 small_mappings:
3941         if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
3942                 goto out;
3943         pv = pvf;
3944         do {
3945                 if (pvf == NULL)
3946                         pvf = pv;
3947                 pmap = PV_PMAP(pv);
3948                 if (!PMAP_TRYLOCK(pmap)) {
3949                         pvh_gen = pvh->pv_gen;
3950                         md_gen = m->md.pv_gen;
3951                         rw_wunlock(lock);
3952                         PMAP_LOCK(pmap);
3953                         rw_wlock(lock);
3954                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3955                                 PMAP_UNLOCK(pmap);
3956                                 goto retry;
3957                         }
3958                 }
3959                 l3e = pmap_pml3e(pmap, pv->pv_va);
3960                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
3961                     ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
3962                     m));
3963                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
3964                 if ((be64toh(*pte) & (PG_M | PG_RW)) == (PG_M | PG_RW))
3965                         vm_page_dirty(m);
3966                 if ((be64toh(*pte) & PG_A) != 0) {
3967                         atomic_clear_long(pte, htobe64(PG_A));
3968                         pmap_invalidate_page(pmap, pv->pv_va);
3969                         cleared++;
3970                 }
3971                 PMAP_UNLOCK(pmap);
3972                 /* Rotate the PV list if it has more than one entry. */
3973                 if (pv != NULL && TAILQ_NEXT(pv, pv_link) != NULL) {
3974                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
3975                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_link);
3976                         m->md.pv_gen++;
3977                 }
3978         } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
3979             not_cleared < PMAP_TS_REFERENCED_MAX);
3980 out:
3981         rw_wunlock(lock);
3982         vm_page_free_pages_toq(&free, true);
3983         return (cleared + not_cleared);
3984 }
3985
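/*
 * Map a range of physical addresses into kernel virtual address space.
 * Because physical memory is always accessible through the direct map on
 * radix, simply return the DMAP address of "start"; "*virt" is not advanced.
 */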
3986 static vm_offset_t
3987 mmu_radix_map(vm_offset_t *virt __unused, vm_paddr_t start,
3988     vm_paddr_t end, int prot __unused)
3989 {
3990
3991         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, virt, start, end,
3992                  prot);
3993         return (PHYS_TO_DMAP(start));
3994 }
3995
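/*
 * Pre-populate the pmap with 2MB mappings for the given device or SG object
 * when the virtual address, size, and backing physical pages are all 2MB
 * aligned, physically contiguous, and share the same memory attributes.
 * This is only an optimization; a page directory page that cannot be
 * allocated without blocking is simply skipped.
 */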
3996 void
3997 mmu_radix_object_init_pt(pmap_t pmap, vm_offset_t addr,
3998     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
3999 {
4000         pml3_entry_t *l3e;
4001         vm_paddr_t pa, ptepa;
4002         vm_page_t p, pdpg;
4003         vm_memattr_t ma;
4004
4005         CTR6(KTR_PMAP, "%s(%p, %#x, %p, %u, %#x)", __func__, pmap, addr,
4006             object, pindex, size);
4007         VM_OBJECT_ASSERT_WLOCKED(object);
4008         KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4009                         ("pmap_object_init_pt: non-device object"));
4010         /* NB: size can be logically ored with addr here */
4011         if ((addr & L3_PAGE_MASK) == 0 && (size & L3_PAGE_MASK) == 0) {
4012                 if (!mmu_radix_ps_enabled(pmap))
4013                         return;
4014                 if (!vm_object_populate(object, pindex, pindex + atop(size)))
4015                         return;
4016                 p = vm_page_lookup(object, pindex);
4017                 KASSERT(p->valid == VM_PAGE_BITS_ALL,
4018                     ("pmap_object_init_pt: invalid page %p", p));
4019                 ma = p->md.mdpg_cache_attrs;
4020
4021                 /*
4022                  * Abort the mapping if the first page is not physically
4023                  * aligned to a 2MB page boundary.
4024                  */
4025                 ptepa = VM_PAGE_TO_PHYS(p);
4026                 if (ptepa & L3_PAGE_MASK)
4027                         return;
4028
4029                 /*
4030                  * Skip the first page.  Abort the mapping if the rest of
4031                  * the pages are not physically contiguous or have differing
4032                  * memory attributes.
4033                  */
4034                 p = TAILQ_NEXT(p, listq);
4035                 for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
4036                     pa += PAGE_SIZE) {
4037                         KASSERT(p->valid == VM_PAGE_BITS_ALL,
4038                             ("pmap_object_init_pt: invalid page %p", p));
4039                         if (pa != VM_PAGE_TO_PHYS(p) ||
4040                             ma != p->md.mdpg_cache_attrs)
4041                                 return;
4042                         p = TAILQ_NEXT(p, listq);
4043                 }
4044
4045                 PMAP_LOCK(pmap);
4046                 for (pa = ptepa | pmap_cache_bits(ma);
4047                     pa < ptepa + size; pa += L3_PAGE_SIZE) {
4048                         pdpg = pmap_allocl3e(pmap, addr, NULL);
4049                         if (pdpg == NULL) {
4050                                 /*
4051                                  * The creation of mappings below is only an
4052                                  * optimization.  If a page directory page
4053                                  * cannot be allocated without blocking,
4054                                  * continue on to the next mapping rather than
4055                                  * blocking.
4056                                  */
4057                                 addr += L3_PAGE_SIZE;
4058                                 continue;
4059                         }
4060                         l3e = (pml3_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
4061                         l3e = &l3e[pmap_pml3e_index(addr)];
4062                         if ((be64toh(*l3e) & PG_V) == 0) {
4063                                 pa |= PG_M | PG_A | PG_RW;
4064                                 pte_store(l3e, pa);
4065                                 pmap_resident_count_inc(pmap, L3_PAGE_SIZE / PAGE_SIZE);
4066                                 counter_u64_add(pmap_l3e_mappings, 1);
4067                         } else {
4068                                 /* Continue on if the PDE is already valid. */
4069                                 pdpg->ref_count--;
4070                                 KASSERT(pdpg->ref_count > 0,
4071                                     ("pmap_object_init_pt: missing reference "
4072                                     "to page directory page, va: 0x%lx", addr));
4073                         }
4074                         addr += L3_PAGE_SIZE;
4075                 }
4076                 ptesync();
4077                 PMAP_UNLOCK(pmap);
4078         }
4079 }
4080
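/*
 * Editor's note (summary of the routine below): return TRUE if 'pmap' maps
 * the page 'm'.  Only the first 16 PV entries are examined (the page's 4KB
 * list and, for non-fictitious pages, the containing superpage's list), so
 * a FALSE result is only a strong hint; hence "quick".
 */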
4081 boolean_t
4082 mmu_radix_page_exists_quick(pmap_t pmap, vm_page_t m)
4083 {
4084         struct md_page *pvh;
4085         struct rwlock *lock;
4086         pv_entry_t pv;
4087         int loops = 0;
4088         boolean_t rv;
4089
4090         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4091             ("pmap_page_exists_quick: page %p is not managed", m));
4092         CTR3(KTR_PMAP, "%s(%p, %p)", __func__, pmap, m);
4093         rv = FALSE;
4094         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4095         rw_rlock(lock);
4096         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4097                 if (PV_PMAP(pv) == pmap) {
4098                         rv = TRUE;
4099                         break;
4100                 }
4101                 loops++;
4102                 if (loops >= 16)
4103                         break;
4104         }
4105         if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4106                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4107                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4108                         if (PV_PMAP(pv) == pmap) {
4109                                 rv = TRUE;
4110                                 break;
4111                         }
4112                         loops++;
4113                         if (loops >= 16)
4114                                 break;
4115                 }
4116         }
4117         rw_runlock(lock);
4118         return (rv);
4119 }
4120
4121 void
4122 mmu_radix_page_init(vm_page_t m)
4123 {
4124
4125         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4126         TAILQ_INIT(&m->md.pv_list);
4127         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
4128 }
4129
4130 int
4131 mmu_radix_page_wired_mappings(vm_page_t m)
4132 {
4133         struct rwlock *lock;
4134         struct md_page *pvh;
4135         pmap_t pmap;
4136         pt_entry_t *pte;
4137         pv_entry_t pv;
4138         int count, md_gen, pvh_gen;
4139
4140         if ((m->oflags & VPO_UNMANAGED) != 0)
4141                 return (0);
4142         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
4143         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4144         rw_rlock(lock);
4145 restart:
4146         count = 0;
4147         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
4148                 pmap = PV_PMAP(pv);
4149                 if (!PMAP_TRYLOCK(pmap)) {
4150                         md_gen = m->md.pv_gen;
4151                         rw_runlock(lock);
4152                         PMAP_LOCK(pmap);
4153                         rw_rlock(lock);
4154                         if (md_gen != m->md.pv_gen) {
4155                                 PMAP_UNLOCK(pmap);
4156                                 goto restart;
4157                         }
4158                 }
4159                 pte = pmap_pte(pmap, pv->pv_va);
4160                 if ((be64toh(*pte) & PG_W) != 0)
4161                         count++;
4162                 PMAP_UNLOCK(pmap);
4163         }
4164         if ((m->flags & PG_FICTITIOUS) == 0) {
4165                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4166                 TAILQ_FOREACH(pv, &pvh->pv_list, pv_link) {
4167                         pmap = PV_PMAP(pv);
4168                         if (!PMAP_TRYLOCK(pmap)) {
4169                                 md_gen = m->md.pv_gen;
4170                                 pvh_gen = pvh->pv_gen;
4171                                 rw_runlock(lock);
4172                                 PMAP_LOCK(pmap);
4173                                 rw_rlock(lock);
4174                                 if (md_gen != m->md.pv_gen ||
4175                                     pvh_gen != pvh->pv_gen) {
4176                                         PMAP_UNLOCK(pmap);
4177                                         goto restart;
4178                                 }
4179                         }
4180                         pte = pmap_pml3e(pmap, pv->pv_va);
4181                         if ((be64toh(*pte) & PG_W) != 0)
4182                                 count++;
4183                         PMAP_UNLOCK(pmap);
4184                 }
4185         }
4186         rw_runlock(lock);
4187         return (count);
4188 }
4189
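/*
 * Point the process table entry for 'pid' at the pmap's root (L1) page
 * directory.  The entry packs RTS_SIZE, the physical address of the root,
 * and RADIX_PGD_INDEX_SHIFT, and is stored big-endian (htobe64).
 */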
4190 static void
4191 mmu_radix_update_proctab(int pid, pml1_entry_t l1pa)
4192 {
4193         isa3_proctab[pid].proctab0 = htobe64(RTS_SIZE | l1pa | RADIX_PGD_INDEX_SHIFT);
4194 }
4195
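/*
 * Initialize a user pmap: allocate and zero the root page directory,
 * allocate a PID from the ASID arena, and publish the root through the
 * process table.  The trailing ptesync/isync pair makes the process-table
 * update visible before the new PID is put to use.
 */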
4196 int
4197 mmu_radix_pinit(pmap_t pmap)
4198 {
4199         vmem_addr_t pid;
4200         vm_paddr_t l1pa;
4201
4202         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4203
4204         /*
4205          * allocate the page directory page
4206          */
4207         pmap->pm_pml1 = uma_zalloc(zone_radix_pgd, M_WAITOK);
4208
4209         for (int j = 0; j < RADIX_PGD_SIZE_SHIFT; j++)
4210                 pagezero((vm_offset_t)pmap->pm_pml1 + j * PAGE_SIZE);
4211         vm_radix_init(&pmap->pm_radix);
4212         TAILQ_INIT(&pmap->pm_pvchunk);
4213         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4214         pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4215         vmem_alloc(asid_arena, 1, M_FIRSTFIT|M_WAITOK, &pid);
4216
4217         pmap->pm_pid = pid;
4218         l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml1);
4219         mmu_radix_update_proctab(pid, l1pa);
4220         __asm __volatile("ptesync;isync" : : : "memory");
4221
4222         return (1);
4223 }
4224
4225 /*
4226  * This routine is called if the desired page table page does not exist.
4227  *
4228  * If page table page allocation fails, this routine may sleep before
4229  * returning NULL.  It sleeps only if a lock pointer was given.
4230  *
4231  * Note: If a page allocation fails at page table level two or three,
4232  * one or two pages may be held during the wait, only to be released
4233  * afterwards.  This conservative approach is easily argued to avoid
4234  * race conditions.
4235  */
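/*
 * The page table page's pindex encodes both its level and its position:
 * pindexes below NUPDE name leaf PTE pages (installed in an L3 entry),
 * pindexes in [NUPDE, NUPDE + NUPDPE) name page directory pages (installed
 * in an L2 entry), and pindexes at or above NUPDE + NUPDPE name page
 * directory pointer pages (installed in an L1 entry).
 */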
4236 static vm_page_t
4237 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
4238 {
4239         vm_page_t m, pdppg, pdpg;
4240
4241         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4242
4243         /*
4244          * Allocate a page table page.
4245          */
4246         if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
4247                 if (lockp != NULL) {
4248                         RELEASE_PV_LIST_LOCK(lockp);
4249                         PMAP_UNLOCK(pmap);
4250                         vm_wait(NULL);
4251                         PMAP_LOCK(pmap);
4252                 }
4253                 /*
4254                  * Indicate the need to retry.  While waiting, the page table
4255                  * page may have been allocated.
4256                  */
4257                 return (NULL);
4258         }
4259         m->pindex = ptepindex;
4260
4261         /*
4262          * Map the pagetable page into the process address space, if
4263          * it isn't already there.
4264          */
4265
4266         if (ptepindex >= (NUPDE + NUPDPE)) {
4267                 pml1_entry_t *l1e;
4268                 vm_pindex_t pml1index;
4269
4270                 /* Wire up a new PDPE page */
4271                 pml1index = ptepindex - (NUPDE + NUPDPE);
4272                 l1e = &pmap->pm_pml1[pml1index];
4273                 KASSERT((be64toh(*l1e) & PG_V) == 0,
4274                     ("%s: L1 entry %#lx is valid", __func__, *l1e));
4275                 pde_store(l1e, VM_PAGE_TO_PHYS(m));
4276         } else if (ptepindex >= NUPDE) {
4277                 vm_pindex_t pml1index;
4278                 vm_pindex_t pdpindex;
4279                 pml1_entry_t *l1e;
4280                 pml2_entry_t *l2e;
4281
4282                 /* Wire up a new l2e page */
4283                 pdpindex = ptepindex - NUPDE;
4284                 pml1index = pdpindex >> RPTE_SHIFT;
4285
4286                 l1e = &pmap->pm_pml1[pml1index];
4287                 if ((be64toh(*l1e) & PG_V) == 0) {
4288                         /* Have to allocate a new pdp, recurse */
4289                         if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml1index,
4290                                 lockp) == NULL) {
4291                                 vm_page_unwire_noq(m);
4292                                 vm_page_free_zero(m);
4293                                 return (NULL);
4294                         }
4295                 } else {
4296                         /* Add reference to l2e page */
4297                         pdppg = PHYS_TO_VM_PAGE(be64toh(*l1e) & PG_FRAME);
4298                         pdppg->ref_count++;
4299                 }
4300                 l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4301
4302                 /* Now find the pdp page */
4303                 l2e = &l2e[pdpindex & RPTE_MASK];
4304                 KASSERT((be64toh(*l2e) & PG_V) == 0,
4305                     ("%s: L2 entry %#lx is valid", __func__, *l2e));
4306                 pde_store(l2e, VM_PAGE_TO_PHYS(m));
4307         } else {
4308                 vm_pindex_t pml1index;
4309                 vm_pindex_t pdpindex;
4310                 pml1_entry_t *l1e;
4311                 pml2_entry_t *l2e;
4312                 pml3_entry_t *l3e;
4313
4314                 /* Wire up a new PTE page */
4315                 pdpindex = ptepindex >> RPTE_SHIFT;
4316                 pml1index = pdpindex >> RPTE_SHIFT;
4317
4318                 /* First, find the pdp and check that it's valid. */
4319                 l1e = &pmap->pm_pml1[pml1index];
4320                 if ((be64toh(*l1e) & PG_V) == 0) {
4321                         /* Have to allocate a new pd, recurse */
4322                         if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4323                             lockp) == NULL) {
4324                                 vm_page_unwire_noq(m);
4325                                 vm_page_free_zero(m);
4326                                 return (NULL);
4327                         }
4328                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4329                         l2e = &l2e[pdpindex & RPTE_MASK];
4330                 } else {
4331                         l2e = (pml2_entry_t *)PHYS_TO_DMAP(be64toh(*l1e) & PG_FRAME);
4332                         l2e = &l2e[pdpindex & RPTE_MASK];
4333                         if ((be64toh(*l2e) & PG_V) == 0) {
4334                                 /* Have to allocate a new pd, recurse */
4335                                 if (_pmap_allocpte(pmap, NUPDE + pdpindex,
4336                                     lockp) == NULL) {
4337                                         vm_page_unwire_noq(m);
4338                                         vm_page_free_zero(m);
4339                                         return (NULL);
4340                                 }
4341                         } else {
4342                                 /* Add reference to the pd page */
4343                                 pdpg = PHYS_TO_VM_PAGE(be64toh(*l2e) & PG_FRAME);
4344                                 pdpg->ref_count++;
4345                         }
4346                 }
4347                 l3e = (pml3_entry_t *)PHYS_TO_DMAP(be64toh(*l2e) & PG_FRAME);
4348
4349                 /* Now we know where the page directory page is */
4350                 l3e = &l3e[ptepindex & RPTE_MASK];
4351                 KASSERT((be64toh(*l3e) & PG_V) == 0,
4352                     ("%s: L3 entry %#lx is valid", __func__, *l3e));
4353                 pde_store(l3e, VM_PAGE_TO_PHYS(m));
4354         }
4355
4356         pmap_resident_count_inc(pmap, 1);
4357         return (m);
4358 }
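
/*
 * Return the page table page holding the L3 (2MB) entries that map 'va',
 * allocating it and any missing upper-level pages via _pmap_allocpte()
 * when necessary.  If the allocation failed and was allowed to sleep
 * (lockp != NULL), the lookup is retried from the top.
 */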
4359 static vm_page_t
4360 pmap_allocl3e(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4361 {
4362         vm_pindex_t pdpindex, ptepindex;
4363         pml2_entry_t *pdpe;
4364         vm_page_t pdpg;
4365
4366 retry:
4367         pdpe = pmap_pml2e(pmap, va);
4368         if (pdpe != NULL && (be64toh(*pdpe) & PG_V) != 0) {
4369                 /* Add a reference to the pd page. */
4370                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pdpe) & PG_FRAME);
4371                 pdpg->ref_count++;
4372         } else {
4373                 /* Allocate a pd page. */
4374                 ptepindex = pmap_l3e_pindex(va);
4375                 pdpindex = ptepindex >> RPTE_SHIFT;
4376                 pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
4377                 if (pdpg == NULL && lockp != NULL)
4378                         goto retry;
4379         }
4380         return (pdpg);
4381 }
4382
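/*
 * Return the page table page backing the 4KB PTEs for 'va', demoting an
 * existing 2MB (leaf) mapping first if one is present.  If the page is
 * already mapped its reference count is bumped; otherwise it is allocated
 * with _pmap_allocpte(), retrying if the allocation slept.
 */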
4383 static vm_page_t
4384 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
4385 {
4386         vm_pindex_t ptepindex;
4387         pml3_entry_t *pd;
4388         vm_page_t m;
4389
4390         /*
4391          * Calculate pagetable page index
4392          */
4393         ptepindex = pmap_l3e_pindex(va);
4394 retry:
4395         /*
4396          * Get the page directory entry
4397          */
4398         pd = pmap_pml3e(pmap, va);
4399
4400         /*
4401          * This supports switching from a 2MB page to a
4402          * normal 4K page.
4403          */
4404         if (pd != NULL && (be64toh(*pd) & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V)) {
4405                 if (!pmap_demote_l3e_locked(pmap, pd, va, lockp)) {
4406                         /*
4407                          * Invalidation of the 2MB page mapping may have caused
4408                          * the deallocation of the underlying PD page.
4409                          */
4410                         pd = NULL;
4411                 }
4412         }
4413
4414         /*
4415          * If the page table page is mapped, we just increment the
4416          * hold count, and activate it.
4417          */
4418         if (pd != NULL && (be64toh(*pd) & PG_V) != 0) {
4419                 m = PHYS_TO_VM_PAGE(be64toh(*pd) & PG_FRAME);
4420                 m->ref_count++;
4421         } else {
4422                 /*
4423                  * Here if the pte page isn't mapped, or if it has been
4424                  * deallocated.
4425                  */
4426                 m = _pmap_allocpte(pmap, ptepindex, lockp);
4427                 if (m == NULL && lockp != NULL)
4428                         goto retry;
4429         }
4430         return (m);
4431 }
4432
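/*
 * Initialize the pmap for process 0.  Rather than allocating fresh
 * resources, it shares the kernel pmap's root page directory and PID.
 */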
4433 static void
4434 mmu_radix_pinit0(pmap_t pmap)
4435 {
4436
4437         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4438         PMAP_LOCK_INIT(pmap);
4439         pmap->pm_pml1 = kernel_pmap->pm_pml1;
4440         pmap->pm_pid = kernel_pmap->pm_pid;
4441
4442         vm_radix_init(&pmap->pm_radix);
4443         TAILQ_INIT(&pmap->pm_pvchunk);
4444         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
4445         kernel_pmap->pm_flags =
4446                 pmap->pm_flags = PMAP_PDE_SUPERPAGE;
4447 }

4448 /*
4449  * pmap_protect_l3e: do the things to protect a 2mpage in a process
4450  */
4451 static boolean_t
4452 pmap_protect_l3e(pmap_t pmap, pt_entry_t *l3e, vm_offset_t sva, vm_prot_t prot)
4453 {
4454         pt_entry_t newpde, oldpde;
4455         vm_offset_t eva, va;
4456         vm_page_t m;
4457         boolean_t anychanged;
4458
4459         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4460         KASSERT((sva & L3_PAGE_MASK) == 0,
4461             ("pmap_protect_l3e: sva is not 2mpage aligned"));
4462         anychanged = FALSE;
4463 retry:
4464         oldpde = newpde = be64toh(*l3e);
4465         if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
4466             (PG_MANAGED | PG_M | PG_RW)) {
4467                 eva = sva + L3_PAGE_SIZE;
4468                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
4469                     va < eva; va += PAGE_SIZE, m++)
4470                         vm_page_dirty(m);
4471         }
4472         if ((prot & VM_PROT_WRITE) == 0) {
4473                 newpde &= ~(PG_RW | PG_M);
4474                 newpde |= RPTE_EAA_R;
4475         }
4476         if (prot & VM_PROT_EXECUTE)
4477                 newpde |= PG_X;
4478         if (newpde != oldpde) {
4479                 /*
4480                  * As an optimization to future operations on this PDE, clear
4481                  * PG_PROMOTED.  The impending invalidation will remove any
4482                  * lingering 4KB page mappings from the TLB.
4483                  */
4484                 if (!atomic_cmpset_long(l3e, htobe64(oldpde), htobe64(newpde & ~PG_PROMOTED)))
4485                         goto retry;
4486                 anychanged = TRUE;
4487         }
4488         return (anychanged);
4489 }
4490
4491 void
4492 mmu_radix_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
4493     vm_prot_t prot)
4494 {
4495         vm_offset_t va_next;
4496         pml1_entry_t *l1e;
4497         pml2_entry_t *l2e;
4498         pml3_entry_t ptpaddr, *l3e;
4499         pt_entry_t *pte;
4500         boolean_t anychanged;
4501
4502         CTR5(KTR_PMAP, "%s(%p, %#x, %#x, %#x)", __func__, pmap, sva, eva,
4503             prot);
4504
4505         KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4506         if (prot == VM_PROT_NONE) {
4507                 mmu_radix_remove(pmap, sva, eva);
4508                 return;
4509         }
4510
4511         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
4512             (VM_PROT_WRITE|VM_PROT_EXECUTE))
4513                 return;
4514
4515 #ifdef INVARIANTS
4516         if (VERBOSE_PROTECT || pmap_logging)
4517                 printf("pmap_protect(%p, %#lx, %#lx, %x) - asid: %lu\n",
4518                            pmap, sva, eva, prot, pmap->pm_pid);
4519 #endif
4520         anychanged = FALSE;
4521
4522         PMAP_LOCK(pmap);
4523         for (; sva < eva; sva = va_next) {
4524                 l1e = pmap_pml1e(pmap, sva);
4525                 if ((be64toh(*l1e) & PG_V) == 0) {
4526                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
4527                         if (va_next < sva)
4528                                 va_next = eva;
4529                         continue;
4530                 }
4531
4532                 l2e = pmap_l1e_to_l2e(l1e, sva);
4533                 if ((be64toh(*l2e) & PG_V) == 0) {
4534                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
4535                         if (va_next < sva)
4536                                 va_next = eva;
4537                         continue;
4538                 }
4539
4540                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
4541                 if (va_next < sva)
4542                         va_next = eva;
4543
4544                 l3e = pmap_l2e_to_l3e(l2e, sva);
4545                 ptpaddr = be64toh(*l3e);
4546
4547                 /*
4548                  * Weed out invalid mappings.
4549                  */
4550                 if (ptpaddr == 0)
4551                         continue;
4552
4553                 /*
4554                  * Check for large page.
4555                  */
4556                 if ((ptpaddr & RPTE_LEAF) != 0) {
4557                         /*
4558                          * Are we protecting the entire large page?  If not,
4559                          * demote the mapping and fall through.
4560                          */
4561                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
4562                                 if (pmap_protect_l3e(pmap, l3e, sva, prot))
4563                                         anychanged = TRUE;
4564                                 continue;
4565                         } else if (!pmap_demote_l3e(pmap, l3e, sva)) {
4566                                 /*
4567                                  * The large page mapping was destroyed.
4568                                  */
4569                                 continue;
4570                         }
4571                 }
4572
4573                 if (va_next > eva)
4574                         va_next = eva;
4575
4576                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
4577                     sva += PAGE_SIZE) {
4578                         pt_entry_t obits, pbits;
4579                         vm_page_t m;
4580
4581 retry:
4582                         MPASS(pte == pmap_pte(pmap, sva));
4583                         obits = pbits = be64toh(*pte);
4584                         if ((pbits & PG_V) == 0)
4585                                 continue;
4586
4587                         if ((prot & VM_PROT_WRITE) == 0) {
4588                                 if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
4589                                     (PG_MANAGED | PG_M | PG_RW)) {
4590                                         m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
4591                                         vm_page_dirty(m);
4592                                 }
4593                                 pbits &= ~(PG_RW | PG_M);
4594                                 pbits |= RPTE_EAA_R;
4595                         }
4596                         if (prot & VM_PROT_EXECUTE)
4597                                 pbits |= PG_X;
4598
4599                         if (pbits != obits) {
4600                                 if (!atomic_cmpset_long(pte, htobe64(obits), htobe64(pbits)))
4601                                         goto retry;
4602                                 if (obits & (PG_A|PG_M)) {
4603                                         anychanged = TRUE;
4604 #ifdef INVARIANTS
4605                                         if (VERBOSE_PROTECT || pmap_logging)
4606                                                 printf("%#lx %#lx -> %#lx\n",
4607                                                     sva, obits, pbits);
4608 #endif
4609                                 }
4610                         }
4611                 }
4612         }
4613         if (anychanged)
4614                 pmap_invalidate_all(pmap);
4615         PMAP_UNLOCK(pmap);
4616 }
4617
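/*
 * Editor's note (summary of the routine below): map a run of 'count' pages,
 * given as a vm_page_t array, at kernel virtual address 'sva'.  New
 * translations only require a ptesync to become visible; if a previously
 * valid PTE was overwritten, the whole range is invalidated instead.
 */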
4618 void
4619 mmu_radix_qenter(vm_offset_t sva, vm_page_t *ma, int count)
4620 {
4621
4622         CTR4(KTR_PMAP, "%s(%#x, %p, %d)", __func__, sva, ma, count);
4623         pt_entry_t oldpte, pa, *pte;
4624         vm_page_t m;
4625         uint64_t cache_bits, attr_bits;
4626         vm_offset_t va;
4627
4628         oldpte = 0;
4629         attr_bits = RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
4630         va = sva;
4631         pte = kvtopte(va);
4632         while (va < sva + PAGE_SIZE * count) {
4633                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4634                         pte = kvtopte(va);
4635                 MPASS(pte == pmap_pte(kernel_pmap, va));
4636
4637                 /*
4638                  * XXX there has to be a more efficient way than traversing
4639                  * the page table every time - but go for correctness for
4640                  * today
4641                  */
4642
4643                 m = *ma++;
4644                 cache_bits = pmap_cache_bits(m->md.mdpg_cache_attrs);
4645                 pa = VM_PAGE_TO_PHYS(m) | cache_bits | attr_bits;
4646                 if (be64toh(*pte) != pa) {
4647                         oldpte |= be64toh(*pte);
4648                         pte_store(pte, pa);
4649                 }
4650                 va += PAGE_SIZE;
4651                 pte++;
4652         }
4653         if (__predict_false((oldpte & RPTE_VALID) != 0))
4654                 pmap_invalidate_range(kernel_pmap, sva, sva + count *
4655                     PAGE_SIZE);
4656         else
4657                 ptesync();
4658 }
4659
4660 void
4661 mmu_radix_qremove(vm_offset_t sva, int count)
4662 {
4663         vm_offset_t va;
4664         pt_entry_t *pte;
4665
4666         CTR3(KTR_PMAP, "%s(%#x, %d)", __func__, sva, count);
4667         KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode or dmap va %lx", sva));
4668
4669         va = sva;
4670         pte = kvtopte(va);
4671         while (va < sva + PAGE_SIZE * count) {
4672                 if (__predict_false((va & L3_PAGE_MASK) == 0))
4673                         pte = kvtopte(va);
4674                 pte_clear(pte);
4675                 pte++;
4676                 va += PAGE_SIZE;
4677         }
4678         pmap_invalidate_range(kernel_pmap, sva, va);
4679 }
4680
4681 /***************************************************
4682  * Page table page management routines.....
4683  ***************************************************/
4684 /*
4685  * Schedule the specified unused page table page to be freed.  Specifically,
4686  * add the page to the specified list of pages that will be released to the
4687  * physical memory manager after the TLB has been updated.
4688  */
4689 static __inline void
4690 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
4691     boolean_t set_PG_ZERO)
4692 {
4693
4694         if (set_PG_ZERO)
4695                 m->flags |= PG_ZERO;
4696         else
4697                 m->flags &= ~PG_ZERO;
4698         SLIST_INSERT_HEAD(free, m, plinks.s.ss);
4699 }
4700
4701 /*
4702  * Inserts the specified page table page into the specified pmap's collection
4703  * of idle page table pages.  Each of a pmap's page table pages is responsible
4704  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4705  * ordered by this virtual address range.
4706  */
4707 static __inline int
4708 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
4709 {
4710
4711         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4712         return (vm_radix_insert(&pmap->pm_radix, mpte));
4713 }
4714
4715 /*
4716  * Removes the page table page mapping the specified virtual address from the
4717  * specified pmap's collection of idle page table pages, and returns it.
4718  * Otherwise, returns NULL if there is no page table page corresponding to the
4719  * specified virtual address.
4720  */
4721 static __inline vm_page_t
4722 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4723 {
4724
4725         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4726         return (vm_radix_remove(&pmap->pm_radix, pmap_l3e_pindex(va)));
4727 }
4728
4729 /*
4730  * Decrements a page table page's reference count, which is used to record
4731  * the number of valid page table entries within the page.  If the reference
4732  * count drops to zero, then the page table page is unmapped.  Returns TRUE
4733  * if the page table page was unmapped and FALSE otherwise.
4734  */
4735 static inline boolean_t
4736 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4737 {
4738
4739         --m->ref_count;
4740         if (m->ref_count == 0) {
4741                 _pmap_unwire_ptp(pmap, va, m, free);
4742                 return (TRUE);
4743         } else
4744                 return (FALSE);
4745 }
4746
4747 static void
4748 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
4749 {
4750
4751         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4752         /*
4753          * unmap the page table page
4754          */
4755         if (m->pindex >= NUPDE + NUPDPE) {
4756                 /* PDP page */
4757                 pml1_entry_t *pml1;
4758                 pml1 = pmap_pml1e(pmap, va);
4759                 *pml1 = 0;
4760         } else if (m->pindex >= NUPDE) {
4761                 /* PD page */
4762                 pml2_entry_t *l2e;
4763                 l2e = pmap_pml2e(pmap, va);
4764                 *l2e = 0;
4765         } else {
4766                 /* PTE page */
4767                 pml3_entry_t *l3e;
4768                 l3e = pmap_pml3e(pmap, va);
4769                 *l3e = 0;
4770         }
4771         pmap_resident_count_dec(pmap, 1);
4772         if (m->pindex < NUPDE) {
4773                 /* We just released a PT, unhold the matching PD */
4774                 vm_page_t pdpg;
4775
4776                 pdpg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml2e(pmap, va)) & PG_FRAME);
4777                 pmap_unwire_ptp(pmap, va, pdpg, free);
4778         }
4779         else if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
4780                 /* We just released a PD, unhold the matching PDP */
4781                 vm_page_t pdppg;
4782
4783                 pdppg = PHYS_TO_VM_PAGE(be64toh(*pmap_pml1e(pmap, va)) & PG_FRAME);
4784                 pmap_unwire_ptp(pmap, va, pdppg, free);
4785         }
4786
4787         /*
4788          * Put page on a list so that it is released after
4789          * *ALL* TLB shootdown is done
4790          */
4791         pmap_add_delayed_free_list(m, free, TRUE);
4792 }
4793
4794 /*
4795  * After removing a page table entry, this routine is used to
4796  * conditionally free the page, and manage the reference counts.
4797  */
4798 static int
4799 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pml3_entry_t ptepde,
4800     struct spglist *free)
4801 {
4802         vm_page_t mpte;
4803
4804         if (va >= VM_MAXUSER_ADDRESS)
4805                 return (0);
4806         KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
4807         mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
4808         return (pmap_unwire_ptp(pmap, va, mpte, free));
4809 }
4810
4811 void
4812 mmu_radix_release(pmap_t pmap)
4813 {
4814
4815         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
4816         KASSERT(pmap->pm_stats.resident_count == 0,
4817             ("pmap_release: pmap resident count %ld != 0",
4818             pmap->pm_stats.resident_count));
4819         KASSERT(vm_radix_is_empty(&pmap->pm_radix),
4820             ("pmap_release: pmap has reserved page table page(s)"));
4821
4822         pmap_invalidate_all(pmap);
4823         isa3_proctab[pmap->pm_pid].proctab0 = 0;
4824         uma_zfree(zone_radix_pgd, pmap->pm_pml1);
4825         vmem_free(asid_arena, pmap->pm_pid, 1);
4826 }
4827
4828 /*
4829  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
4830  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
4831  * false if the PV entry cannot be allocated without resorting to reclamation.
4832  */
4833 static bool
4834 pmap_pv_insert_l3e(pmap_t pmap, vm_offset_t va, pml3_entry_t pde, u_int flags,
4835     struct rwlock **lockp)
4836 {
4837         struct md_page *pvh;
4838         pv_entry_t pv;
4839         vm_paddr_t pa;
4840
4841         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4842         /* Pass NULL instead of the lock pointer to disable reclamation. */
4843         if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
4844             NULL : lockp)) == NULL)
4845                 return (false);
4846         pv->pv_va = va;
4847         pa = pde & PG_PS_FRAME;
4848         CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4849         pvh = pa_to_pvh(pa);
4850         TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_link);
4851         pvh->pv_gen++;
4852         return (true);
4853 }
4854
4855 /*
4856  * Fills a page table page with mappings to consecutive physical pages.
4857  */
4858 static void
4859 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
4860 {
4861         pt_entry_t *pte;
4862
4863         for (pte = firstpte; pte < firstpte + NPTEPG; pte++) {
4864                 *pte = htobe64(newpte);
4865                 newpte += PAGE_SIZE;
4866         }
4867 }
4868
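/*
 * Demote the 2MB (L3 leaf) mapping covering 'va' into NPTEPG 4KB mappings
 * that preserve the original attributes: a page table page is recovered or
 * allocated, filled with PTEs derived from the old leaf entry, and then
 * installed in place of that leaf entry.
 */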
4869 static boolean_t
4870 pmap_demote_l3e(pmap_t pmap, pml3_entry_t *pde, vm_offset_t va)
4871 {
4872         struct rwlock *lock;
4873         boolean_t rv;
4874
4875         lock = NULL;
4876         rv = pmap_demote_l3e_locked(pmap, pde, va, &lock);
4877         if (lock != NULL)
4878                 rw_wunlock(lock);
4879         return (rv);
4880 }
4881
4882 static boolean_t
4883 pmap_demote_l3e_locked(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va,
4884     struct rwlock **lockp)
4885 {
4886         pml3_entry_t oldpde;
4887         pt_entry_t *firstpte;
4888         vm_paddr_t mptepa;
4889         vm_page_t mpte;
4890         struct spglist free;
4891         vm_offset_t sva;
4892
4893         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4894         oldpde = be64toh(*l3e);
4895         KASSERT((oldpde & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
4896             ("pmap_demote_l3e: oldpde is missing RPTE_LEAF and/or PG_V %lx",
4897             oldpde));
4898         if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
4899             NULL) {
4900                 KASSERT((oldpde & PG_W) == 0,
4901                     ("pmap_demote_l3e: page table page for a wired mapping"
4902                     " is missing"));
4903
4904                 /*
4905                  * Invalidate the 2MB page mapping and return "failure" if the
4906                  * mapping was never accessed or the allocation of the new
4907                  * page table page fails.  If the 2MB page mapping belongs to
4908                  * the direct map region of the kernel's address space, then
4909                  * the page allocation request specifies the highest possible
4910                  * priority (VM_ALLOC_INTERRUPT).  Otherwise, the priority is
4911                  * normal.  Page table pages are preallocated for every other
4912                  * part of the kernel address space, so the direct map region
4913                  * is the only part of the kernel address space that must be
4914                  * handled here.
4915                  */
4916                 if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc_noobj(
4917                     (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS ?
4918                     VM_ALLOC_INTERRUPT : 0) | VM_ALLOC_WIRED)) == NULL) {
4919                         SLIST_INIT(&free);
4920                         sva = trunc_2mpage(va);
4921                         pmap_remove_l3e(pmap, l3e, sva, &free, lockp);
4922                         pmap_invalidate_l3e_page(pmap, sva, oldpde);
4923                         vm_page_free_pages_toq(&free, true);
4924                         CTR2(KTR_PMAP, "pmap_demote_l3e: failure for va %#lx"
4925                             " in pmap %p", va, pmap);
4926                         return (FALSE);
4927                 }
4928                 mpte->pindex = pmap_l3e_pindex(va);
4929                 if (va < VM_MAXUSER_ADDRESS)
4930                         pmap_resident_count_inc(pmap, 1);
4931         }
4932         mptepa = VM_PAGE_TO_PHYS(mpte);
4933         firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
4934         KASSERT((oldpde & PG_A) != 0,
4935             ("pmap_demote_l3e: oldpde is missing PG_A"));
4936         KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
4937             ("pmap_demote_l3e: oldpde is missing PG_M"));
4938
4939         /*
4940          * If the page table page is new, initialize it.
4941          */
4942         if (mpte->ref_count == 1) {
4943                 mpte->ref_count = NPTEPG;
4944                 pmap_fill_ptp(firstpte, oldpde);
4945         }
4946
4947         KASSERT((be64toh(*firstpte) & PG_FRAME) == (oldpde & PG_FRAME),
4948             ("pmap_demote_l3e: firstpte and newpte map different physical"
4949             " addresses"));
4950
4951         /*
4952          * If the mapping has changed attributes, update the page table
4953          * entries.
4954          */
4955         if ((be64toh(*firstpte) & PG_PTE_PROMOTE) != (oldpde & PG_PTE_PROMOTE))
4956                 pmap_fill_ptp(firstpte, oldpde);
4957
4958         /*
4959          * The spare PV entries must be reserved prior to demoting the
4960          * mapping, that is, prior to changing the PDE.  Otherwise, the state
4961          * of the PDE and the PV lists will be inconsistent, which can result
4962          * in reclaim_pv_chunk() attempting to remove a PV entry from the
4963          * wrong PV list and pmap_pv_demote_l3e() failing to find the expected
4964          * PV entry for the 2MB page mapping that is being demoted.
4965          */
4966         if ((oldpde & PG_MANAGED) != 0)
4967                 reserve_pv_entries(pmap, NPTEPG - 1, lockp);
4968
4969         /*
4970          * Demote the mapping.  This pmap is locked.  The old PDE has
4971          * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
4972          * set.  Thus, there is no danger of a race with another
4973          * processor changing the setting of PG_A and/or PG_M between
4974          * the read above and the store below.
4975          */
4976         pde_store(l3e, mptepa);
4977         pmap_invalidate_l3e_page(pmap, trunc_2mpage(va), oldpde);
4978         /*
4979          * Demote the PV entry.
4980          */
4981         if ((oldpde & PG_MANAGED) != 0)
4982                 pmap_pv_demote_l3e(pmap, va, oldpde & PG_PS_FRAME, lockp);
4983
4984         counter_u64_add(pmap_l3e_demotions, 1);
4985         CTR2(KTR_PMAP, "pmap_demote_l3e: success for va %#lx"
4986             " in pmap %p", va, pmap);
4987         return (TRUE);
4988 }
4989
4990 /*
4991  * pmap_remove_kernel_l3e: Remove a kernel superpage mapping.
4992  */
4993 static void
4994 pmap_remove_kernel_l3e(pmap_t pmap, pml3_entry_t *l3e, vm_offset_t va)
4995 {
4996         vm_paddr_t mptepa;
4997         vm_page_t mpte;
4998
4999         KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
5000         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5001         mpte = pmap_remove_pt_page(pmap, va);
5002         if (mpte == NULL)
5003                 panic("pmap_remove_kernel_pde: Missing pt page.");
5004
5005         mptepa = VM_PAGE_TO_PHYS(mpte);
5006
5007         /*
5008          * Initialize the page table page.
5009          */
5010         pagezero(PHYS_TO_DMAP(mptepa));
5011
5012         /*
5013          * Demote the mapping.
5014          */
5015         pde_store(l3e, mptepa);
5016         ptesync();
5017 }
5018
5019 /*
5020  * pmap_remove_l3e: do the things to unmap a superpage in a process
5021  */
5022 static int
5023 pmap_remove_l3e(pmap_t pmap, pml3_entry_t *pdq, vm_offset_t sva,
5024     struct spglist *free, struct rwlock **lockp)
5025 {
5026         struct md_page *pvh;
5027         pml3_entry_t oldpde;
5028         vm_offset_t eva, va;
5029         vm_page_t m, mpte;
5030
5031         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5032         KASSERT((sva & L3_PAGE_MASK) == 0,
5033             ("pmap_remove_l3e: sva is not 2mpage aligned"));
5034         oldpde = be64toh(pte_load_clear(pdq));
5035         if (oldpde & PG_W)
5036                 pmap->pm_stats.wired_count -= (L3_PAGE_SIZE / PAGE_SIZE);
5037         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5038         if (oldpde & PG_MANAGED) {
5039                 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
5040                 pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
5041                 pmap_pvh_free(pvh, pmap, sva);
5042                 eva = sva + L3_PAGE_SIZE;
5043                 for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
5044                     va < eva; va += PAGE_SIZE, m++) {
5045                         if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
5046                                 vm_page_dirty(m);
5047                         if (oldpde & PG_A)
5048                                 vm_page_aflag_set(m, PGA_REFERENCED);
5049                         if (TAILQ_EMPTY(&m->md.pv_list) &&
5050                             TAILQ_EMPTY(&pvh->pv_list))
5051                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
5052                 }
5053         }
5054         if (pmap == kernel_pmap) {
5055                 pmap_remove_kernel_l3e(pmap, pdq, sva);
5056         } else {
5057                 mpte = pmap_remove_pt_page(pmap, sva);
5058                 if (mpte != NULL) {
5059                         pmap_resident_count_dec(pmap, 1);
5060                         KASSERT(mpte->ref_count == NPTEPG,
5061                             ("pmap_remove_l3e: pte page wire count error"));
5062                         mpte->ref_count = 0;
5063                         pmap_add_delayed_free_list(mpte, free, FALSE);
5064                 }
5065         }
5066         return (pmap_unuse_pt(pmap, sva, be64toh(*pmap_pml2e(pmap, sva)), free));
5067 }
5068
5069 /*
5070  * pmap_remove_pte: do the things to unmap a page in a process
5071  */
5072 static int
5073 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
5074     pml3_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
5075 {
5076         struct md_page *pvh;
5077         pt_entry_t oldpte;
5078         vm_page_t m;
5079
5080         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5081         oldpte = be64toh(pte_load_clear(ptq));
5082         if (oldpte & RPTE_WIRED)
5083                 pmap->pm_stats.wired_count -= 1;
5084         pmap_resident_count_dec(pmap, 1);
5085         if (oldpte & RPTE_MANAGED) {
5086                 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
5087                 if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5088                         vm_page_dirty(m);
5089                 if (oldpte & PG_A)
5090                         vm_page_aflag_set(m, PGA_REFERENCED);
5091                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
5092                 pmap_pvh_free(&m->md, pmap, va);
5093                 if (TAILQ_EMPTY(&m->md.pv_list) &&
5094                     (m->flags & PG_FICTITIOUS) == 0) {
5095                         pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5096                         if (TAILQ_EMPTY(&pvh->pv_list))
5097                                 vm_page_aflag_clear(m, PGA_WRITEABLE);
5098                 }
5099         }
5100         return (pmap_unuse_pt(pmap, va, ptepde, free));
5101 }
5102
5103 /*
5104  * Remove a single page from a process address space
5105  */
5106 static bool
5107 pmap_remove_page(pmap_t pmap, vm_offset_t va, pml3_entry_t *l3e,
5108     struct spglist *free)
5109 {
5110         struct rwlock *lock;
5111         pt_entry_t *pte;
5112         bool invalidate_all;
5113
5114         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5115         if ((be64toh(*l3e) & RPTE_VALID) == 0) {
5116                 return (false);
5117         }
5118         pte = pmap_l3e_to_pte(l3e, va);
5119         if ((be64toh(*pte) & RPTE_VALID) == 0) {
5120                 return (false);
5121         }
5122         lock = NULL;
5123
5124         invalidate_all = pmap_remove_pte(pmap, pte, va, be64toh(*l3e), free, &lock);
5125         if (lock != NULL)
5126                 rw_wunlock(lock);
5127         if (!invalidate_all)
5128                 pmap_invalidate_page(pmap, va);
5129         return (invalidate_all);
5130 }
5131
5132 /*
5133  * Removes the specified range of addresses from the page table page.
5134  */
5135 static bool
5136 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
5137     pml3_entry_t *l3e, struct spglist *free, struct rwlock **lockp)
5138 {
5139         pt_entry_t *pte;
5140         vm_offset_t va;
5141         bool anyvalid;
5142
5143         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5144         anyvalid = false;
5145         va = eva;
5146         for (pte = pmap_l3e_to_pte(l3e, sva); sva != eva; pte++,
5147             sva += PAGE_SIZE) {
5148                 MPASS(pte == pmap_pte(pmap, sva));
5149                 if (*pte == 0) {
5150                         if (va != eva) {
5151                                 anyvalid = true;
5152                                 va = eva;
5153                         }
5154                         continue;
5155                 }
5156                 if (va == eva)
5157                         va = sva;
5158                 if (pmap_remove_pte(pmap, pte, sva, be64toh(*l3e), free, lockp)) {
5159                         anyvalid = true;
5160                         sva += PAGE_SIZE;
5161                         break;
5162                 }
5163         }
5164         if (anyvalid)
5165                 pmap_invalidate_all(pmap);
5166         else if (va != eva)
5167                 pmap_invalidate_range(pmap, va, sva);
5168         return (anyvalid);
5169 }
5170
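/*
 * Editor's note (summary of the routine below): remove the given range of
 * addresses from the pmap.  The single-page case is short-circuited;
 * otherwise the walk proceeds one L3 (2MB) region at a time, removing whole
 * leaf mappings when the range covers them entirely and demoting them when
 * it does not.
 */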
5171 void
5172 mmu_radix_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5173 {
5174         struct rwlock *lock;
5175         vm_offset_t va_next;
5176         pml1_entry_t *l1e;
5177         pml2_entry_t *l2e;
5178         pml3_entry_t ptpaddr, *l3e;
5179         struct spglist free;
5180         bool anyvalid;
5181
5182         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5183
5184         /*
5185          * Perform an unsynchronized read.  This is, however, safe.
5186          */
5187         if (pmap->pm_stats.resident_count == 0)
5188                 return;
5189
5190         anyvalid = false;
5191         SLIST_INIT(&free);
5192
5193         /* XXX something fishy here */
5194         sva = (sva + PAGE_MASK) & ~PAGE_MASK;
5195         eva = (eva + PAGE_MASK) & ~PAGE_MASK;
5196
5197         PMAP_LOCK(pmap);
5198
5199         /*
5200          * Special handling of removing one page.  This is a very
5201          * common operation and easy to short-circuit some
5202          * code.
5203          */
5204         if (sva + PAGE_SIZE == eva) {
5205                 l3e = pmap_pml3e(pmap, sva);
5206                 if (l3e && (be64toh(*l3e) & RPTE_LEAF) == 0) {
5207                         anyvalid = pmap_remove_page(pmap, sva, l3e, &free);
5208                         goto out;
5209                 }
5210         }
5211
5212         lock = NULL;
5213         for (; sva < eva; sva = va_next) {
5214                 if (pmap->pm_stats.resident_count == 0)
5215                         break;
5216                 l1e = pmap_pml1e(pmap, sva);
5217                 if (l1e == NULL || (be64toh(*l1e) & PG_V) == 0) {
5218                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5219                         if (va_next < sva)
5220                                 va_next = eva;
5221                         continue;
5222                 }
5223
5224                 l2e = pmap_l1e_to_l2e(l1e, sva);
5225                 if (l2e == NULL || (be64toh(*l2e) & PG_V) == 0) {
5226                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5227                         if (va_next < sva)
5228                                 va_next = eva;
5229                         continue;
5230                 }
5231
5232                 /*
5233                  * Calculate index for next page table.
5234                  */
5235                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5236                 if (va_next < sva)
5237                         va_next = eva;
5238
5239                 l3e = pmap_l2e_to_l3e(l2e, sva);
5240                 ptpaddr = be64toh(*l3e);
5241
5242                 /*
5243                  * Weed out invalid mappings.
5244                  */
5245                 if (ptpaddr == 0)
5246                         continue;
5247
5248                 /*
5249                  * Check for large page.
5250                  */
5251                 if ((ptpaddr & RPTE_LEAF) != 0) {
5252                         /*
5253                          * Are we removing the entire large page?  If not,
5254                          * demote the mapping and fall through.
5255                          */
5256                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5257                                 pmap_remove_l3e(pmap, l3e, sva, &free, &lock);
5258                                 anyvalid = true;
5259                                 continue;
5260                         } else if (!pmap_demote_l3e_locked(pmap, l3e, sva,
5261                             &lock)) {
5262                                 /* The large page mapping was destroyed. */
5263                                 continue;
5264                         } else
5265                                 ptpaddr = be64toh(*l3e);
5266                 }
5267
5268                 /*
5269                  * Limit our scan to either the end of the va represented
5270                  * by the current page table page, or to the end of the
5271                  * range being removed.
5272                  */
5273                 if (va_next > eva)
5274                         va_next = eva;
5275
5276                 if (pmap_remove_ptes(pmap, sva, va_next, l3e, &free, &lock))
5277                         anyvalid = true;
5278         }
5279         if (lock != NULL)
5280                 rw_wunlock(lock);
5281 out:
5282         if (anyvalid)
5283                 pmap_invalidate_all(pmap);
5284         PMAP_UNLOCK(pmap);
5285         vm_page_free_pages_toq(&free, true);
5286 }
5287
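/*
 * Editor's note (summary of the routine below): remove all mappings of the
 * managed page 'm'.  Any 2MB mappings containing the page are first
 * demoted; the remaining 4KB mappings are then torn down one PV entry at a
 * time, transferring the referenced and modified bits to the vm_page
 * before each PV entry is freed.
 */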
5288 void
5289 mmu_radix_remove_all(vm_page_t m)
5290 {
5291         struct md_page *pvh;
5292         pv_entry_t pv;
5293         pmap_t pmap;
5294         struct rwlock *lock;
5295         pt_entry_t *pte, tpte;
5296         pml3_entry_t *l3e;
5297         vm_offset_t va;
5298         struct spglist free;
5299         int pvh_gen, md_gen;
5300
5301         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5302         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5303             ("pmap_remove_all: page %p is not managed", m));
5304         SLIST_INIT(&free);
5305         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5306         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5307             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5308 retry:
5309         rw_wlock(lock);
5310         while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
5311                 pmap = PV_PMAP(pv);
5312                 if (!PMAP_TRYLOCK(pmap)) {
5313                         pvh_gen = pvh->pv_gen;
5314                         rw_wunlock(lock);
5315                         PMAP_LOCK(pmap);
5316                         rw_wlock(lock);
5317                         if (pvh_gen != pvh->pv_gen) {
5318                                 rw_wunlock(lock);
5319                                 PMAP_UNLOCK(pmap);
5320                                 goto retry;
5321                         }
5322                 }
5323                 va = pv->pv_va;
5324                 l3e = pmap_pml3e(pmap, va);
5325                 (void)pmap_demote_l3e_locked(pmap, l3e, va, &lock);
5326                 PMAP_UNLOCK(pmap);
5327         }
5328         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
5329                 pmap = PV_PMAP(pv);
5330                 if (!PMAP_TRYLOCK(pmap)) {
5331                         pvh_gen = pvh->pv_gen;
5332                         md_gen = m->md.pv_gen;
5333                         rw_wunlock(lock);
5334                         PMAP_LOCK(pmap);
5335                         rw_wlock(lock);
5336                         if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5337                                 rw_wunlock(lock);
5338                                 PMAP_UNLOCK(pmap);
5339                                 goto retry;
5340                         }
5341                 }
5342                 pmap_resident_count_dec(pmap, 1);
5343                 l3e = pmap_pml3e(pmap, pv->pv_va);
5344                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0, ("pmap_remove_all: found"
5345                     " a 2mpage in page %p's pv list", m));
5346                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5347                 tpte = be64toh(pte_load_clear(pte));
5348                 if (tpte & PG_W)
5349                         pmap->pm_stats.wired_count--;
5350                 if (tpte & PG_A)
5351                         vm_page_aflag_set(m, PGA_REFERENCED);
5352
5353                 /*
5354                  * Update the vm_page_t clean and reference bits.
5355                  */
5356                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5357                         vm_page_dirty(m);
5358                 pmap_unuse_pt(pmap, pv->pv_va, be64toh(*l3e), &free);
5359                 pmap_invalidate_page(pmap, pv->pv_va);
5360                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5361                 m->md.pv_gen++;
5362                 free_pv_entry(pmap, pv);
5363                 PMAP_UNLOCK(pmap);
5364         }
5365         vm_page_aflag_clear(m, PGA_WRITEABLE);
5366         rw_wunlock(lock);
5367         vm_page_free_pages_toq(&free, true);
5368 }
5369
5370 /*
5371  * Destroy all managed, non-wired mappings in the given user-space
5372  * pmap.  This pmap cannot be active on any processor besides the
5373  * caller.
5374  *
5375  * This function cannot be applied to the kernel pmap.  Moreover, it
5376  * is not intended for general use.  It is only to be used during
5377  * process termination.  Consequently, it can be implemented in ways
5378  * that make it faster than pmap_remove().  First, it can more quickly
5379  * destroy mappings by iterating over the pmap's collection of PV
5380  * entries, rather than searching the page table.  Second, it doesn't
5381  * have to test and clear the page table entries atomically, because
5382  * no processor is currently accessing the user address space.  In
5383  * particular, a page table entry's dirty bit won't change state once
5384  * this function starts.
5385  *
5386  * Although this function destroys all of the pmap's managed,
5387  * non-wired mappings, it can delay and batch the invalidation of TLB
5388  * entries without calling pmap_delayed_invl_started() and
5389  * pmap_delayed_invl_finished().  Because the pmap is not active on
5390  * any other processor, none of these TLB entries will ever be used
5391  * before their eventual invalidation.  Consequently, there is no need
5392  * for either pmap_remove_all() or pmap_remove_write() to wait for
5393  * that eventual TLB invalidation.
5394  */
5395
5396 void
5397 mmu_radix_remove_pages(pmap_t pmap)
5398 {
5399
5400         CTR2(KTR_PMAP, "%s(%p)", __func__, pmap);
5401         pml3_entry_t ptel3e;
5402         pt_entry_t *pte, tpte;
5403         struct spglist free;
5404         vm_page_t m, mpte, mt;
5405         pv_entry_t pv;
5406         struct md_page *pvh;
5407         struct pv_chunk *pc, *npc;
5408         struct rwlock *lock;
5409         int64_t bit;
5410         uint64_t inuse, bitmask;
5411         int allfree, field, idx;
5412 #ifdef PV_STATS
5413         int freed;
5414 #endif
5415         boolean_t superpage;
5416         vm_paddr_t pa;
5417
5418         /*
5419          * Assert that the given pmap is only active on the current
5420          * CPU.  Unfortunately, we cannot block another CPU from
5421          * activating the pmap while this function is executing.
5422          */
5423         KASSERT(pmap->pm_pid == mfspr(SPR_PID),
5424             ("non-current asid %lu - expected %lu", pmap->pm_pid,
5425             mfspr(SPR_PID)));
5426
5427         lock = NULL;
5428
5429         SLIST_INIT(&free);
5430         PMAP_LOCK(pmap);
5431         TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5432                 allfree = 1;
5433 #ifdef PV_STATS
5434                 freed = 0;
5435 #endif
5436                 for (field = 0; field < _NPCM; field++) {
5437                         inuse = ~pc->pc_map[field] & pc_freemask[field];
5438                         while (inuse != 0) {
5439                                 bit = cnttzd(inuse);
5440                                 bitmask = 1UL << bit;
5441                                 idx = field * 64 + bit;
5442                                 pv = &pc->pc_pventry[idx];
5443                                 inuse &= ~bitmask;
5444
5445                                 pte = pmap_pml2e(pmap, pv->pv_va);
5446                                 ptel3e = be64toh(*pte);
5447                                 pte = pmap_l2e_to_l3e(pte, pv->pv_va);
5448                                 tpte = be64toh(*pte);
5449                                 if ((tpte & (RPTE_LEAF | PG_V)) == PG_V) {
5450                                         superpage = FALSE;
5451                                         ptel3e = tpte;
5452                                         pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
5453                                             PG_FRAME);
5454                                         pte = &pte[pmap_pte_index(pv->pv_va)];
5455                                         tpte = be64toh(*pte);
5456                                 } else {
5457                                         /*
5458                                          * Keep track whether 'tpte' is a
5459                                          * superpage explicitly instead of
5460                                          * relying on RPTE_LEAF being set.
5461                                          *
5462                                          * This is because RPTE_LEAF is numerically
5463                                          * identical to PG_PTE_PAT and thus a
5464                                          * regular page could be mistaken for
5465                                          * a superpage.
5466                                          */
5467                                         superpage = TRUE;
5468                                 }
5469
5470                                 if ((tpte & PG_V) == 0) {
5471                                         panic("bad pte va %lx pte %lx",
5472                                             pv->pv_va, tpte);
5473                                 }
5474
5475 /*
5476  * We cannot remove wired pages from a process' mapping at this time
5477  */
5478                                 if (tpte & PG_W) {
5479                                         allfree = 0;
5480                                         continue;
5481                                 }
5482
5483                                 if (superpage)
5484                                         pa = tpte & PG_PS_FRAME;
5485                                 else
5486                                         pa = tpte & PG_FRAME;
5487
5488                                 m = PHYS_TO_VM_PAGE(pa);
5489                                 KASSERT(m->phys_addr == pa,
5490                                     ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5491                                     m, (uintmax_t)m->phys_addr,
5492                                     (uintmax_t)tpte));
5493
5494                                 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5495                                     m < &vm_page_array[vm_page_array_size],
5496                                     ("pmap_remove_pages: bad tpte %#jx",
5497                                     (uintmax_t)tpte));
5498
5499                                 pte_clear(pte);
5500
5501                                 /*
5502                                  * Update the vm_page_t clean/reference bits.
5503                                  */
5504                                 if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
5505                                         if (superpage) {
5506                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5507                                                         vm_page_dirty(mt);
5508                                         } else
5509                                                 vm_page_dirty(m);
5510                                 }
5511
5512                                 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5513
5514                                 /* Mark free */
5515                                 pc->pc_map[field] |= bitmask;
5516                                 if (superpage) {
5517                                         pmap_resident_count_dec(pmap, L3_PAGE_SIZE / PAGE_SIZE);
5518                                         pvh = pa_to_pvh(tpte & PG_PS_FRAME);
5519                                         TAILQ_REMOVE(&pvh->pv_list, pv, pv_link);
5520                                         pvh->pv_gen++;
5521                                         if (TAILQ_EMPTY(&pvh->pv_list)) {
5522                                                 for (mt = m; mt < &m[L3_PAGE_SIZE / PAGE_SIZE]; mt++)
5523                                                         if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5524                                                             TAILQ_EMPTY(&mt->md.pv_list))
5525                                                                 vm_page_aflag_clear(mt, PGA_WRITEABLE);
5526                                         }
5527                                         mpte = pmap_remove_pt_page(pmap, pv->pv_va);
5528                                         if (mpte != NULL) {
5529                                                 pmap_resident_count_dec(pmap, 1);
5530                                                 KASSERT(mpte->ref_count == NPTEPG,
5531                                                     ("pmap_remove_pages: pte page wire count error"));
5532                                                 mpte->ref_count = 0;
5533                                                 pmap_add_delayed_free_list(mpte, &free, FALSE);
5534                                         }
5535                                 } else {
5536                                         pmap_resident_count_dec(pmap, 1);
5537 #ifdef VERBOSE_PV
5538                                         printf("freeing pv (%p, %p)\n",
5539                                                    pmap, pv);
5540 #endif
5541                                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_link);
5542                                         m->md.pv_gen++;
5543                                         if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5544                                             TAILQ_EMPTY(&m->md.pv_list) &&
5545                                             (m->flags & PG_FICTITIOUS) == 0) {
5546                                                 pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
5547                                                 if (TAILQ_EMPTY(&pvh->pv_list))
5548                                                         vm_page_aflag_clear(m, PGA_WRITEABLE);
5549                                         }
5550                                 }
5551                                 pmap_unuse_pt(pmap, pv->pv_va, ptel3e, &free);
5552 #ifdef PV_STATS
5553                                 freed++;
5554 #endif
5555                         }
5556                 }
5557                 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5558                 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5559                 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5560                 if (allfree) {
5561                         TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5562                         free_pv_chunk(pc);
5563                 }
5564         }
5565         if (lock != NULL)
5566                 rw_wunlock(lock);
5567         pmap_invalidate_all(pmap);
5568         PMAP_UNLOCK(pmap);
5569         vm_page_free_pages_toq(&free, true);
5570 }
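/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * mmu_radix_remove_pages() above finds live PV entries by inverting the
 * chunk's pc_map bitmap against pc_freemask and extracting set bits with
 * cnttzd().  The helper below, with a hypothetical "example_" name, shows
 * that bit-scan idiom in isolation.
 */
#if 0
static void
example_scan_pv_chunk_field(struct pv_chunk *pc, int field)
{
	uint64_t inuse, bitmask;
	int64_t bit;
	int idx;

	/* A clear bit in pc_map means the corresponding pv_entry is in use. */
	inuse = ~pc->pc_map[field] & pc_freemask[field];
	while (inuse != 0) {
		bit = cnttzd(inuse);		/* lowest set bit */
		bitmask = 1UL << bit;
		idx = field * 64 + bit;		/* index into pc_pventry[] */
		printf("pv entry %d (%p) is live\n", idx, &pc->pc_pventry[idx]);
		inuse &= ~bitmask;		/* consume and continue */
	}
}
#endif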
5571
5572 void
5573 mmu_radix_remove_write(vm_page_t m)
5574 {
5575         struct md_page *pvh;
5576         pmap_t pmap;
5577         struct rwlock *lock;
5578         pv_entry_t next_pv, pv;
5579         pml3_entry_t *l3e;
5580         pt_entry_t oldpte, *pte;
5581         int pvh_gen, md_gen;
5582
5583         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5584         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5585             ("pmap_remove_write: page %p is not managed", m));
5586         vm_page_assert_busied(m);
5587
5588         if (!pmap_page_is_write_mapped(m))
5589                 return;
5590         lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5591         pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
5592             pa_to_pvh(VM_PAGE_TO_PHYS(m));
5593 retry_pv_loop:
5594         rw_wlock(lock);
5595         TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_link, next_pv) {
5596                 pmap = PV_PMAP(pv);
5597                 if (!PMAP_TRYLOCK(pmap)) {
5598                         pvh_gen = pvh->pv_gen;
5599                         rw_wunlock(lock);
5600                         PMAP_LOCK(pmap);
5601                         rw_wlock(lock);
5602                         if (pvh_gen != pvh->pv_gen) {
5603                                 PMAP_UNLOCK(pmap);
5604                                 rw_wunlock(lock);
5605                                 goto retry_pv_loop;
5606                         }
5607                 }
5608                 l3e = pmap_pml3e(pmap, pv->pv_va);
5609                 if ((be64toh(*l3e) & PG_RW) != 0)
5610                         (void)pmap_demote_l3e_locked(pmap, l3e, pv->pv_va, &lock);
5611                 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5612                     ("inconsistent pv lock %p %p for page %p",
5613                     lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5614                 PMAP_UNLOCK(pmap);
5615         }
5616         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
5617                 pmap = PV_PMAP(pv);
5618                 if (!PMAP_TRYLOCK(pmap)) {
5619                         pvh_gen = pvh->pv_gen;
5620                         md_gen = m->md.pv_gen;
5621                         rw_wunlock(lock);
5622                         PMAP_LOCK(pmap);
5623                         rw_wlock(lock);
5624                         if (pvh_gen != pvh->pv_gen ||
5625                             md_gen != m->md.pv_gen) {
5626                                 PMAP_UNLOCK(pmap);
5627                                 rw_wunlock(lock);
5628                                 goto retry_pv_loop;
5629                         }
5630                 }
5631                 l3e = pmap_pml3e(pmap, pv->pv_va);
5632                 KASSERT((be64toh(*l3e) & RPTE_LEAF) == 0,
5633                     ("pmap_remove_write: found a 2mpage in page %p's pv list",
5634                     m));
5635                 pte = pmap_l3e_to_pte(l3e, pv->pv_va);
5636 retry:
5637                 oldpte = be64toh(*pte);
5638                 if (oldpte & PG_RW) {
5639                         if (!atomic_cmpset_long(pte, htobe64(oldpte),
5640                             htobe64((oldpte | RPTE_EAA_R) & ~(PG_RW | PG_M))))
5641                                 goto retry;
5642                         if ((oldpte & PG_M) != 0)
5643                                 vm_page_dirty(m);
5644                         pmap_invalidate_page(pmap, pv->pv_va);
5645                 }
5646                 PMAP_UNLOCK(pmap);
5647         }
5648         rw_wunlock(lock);
5649         vm_page_aflag_clear(m, PGA_WRITEABLE);
5650 }
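/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * mmu_radix_remove_write() above must hold both the page's PV-list lock and
 * each mapping's pmap lock, but may only try-lock the pmap while the list
 * lock is held.  When the try-lock fails it drops the list lock, acquires
 * both locks in the safe order, and uses the pv_gen generation counter to
 * detect that the PV list changed in the unlocked window.  The hypothetical
 * helper below shows that retry idiom by itself.
 */
#if 0
static void
example_pvlist_trylock_retry(struct rwlock *list_lock, struct md_page *pvh,
    pmap_t pmap)
{
	int pvh_gen;

restart:
	rw_wlock(list_lock);
	if (!PMAP_TRYLOCK(pmap)) {
		pvh_gen = pvh->pv_gen;		/* snapshot before unlocking */
		rw_wunlock(list_lock);
		PMAP_LOCK(pmap);		/* blocking acquire */
		rw_wlock(list_lock);
		if (pvh_gen != pvh->pv_gen) {	/* list changed underneath us */
			PMAP_UNLOCK(pmap);
			rw_wunlock(list_lock);
			goto restart;
		}
	}
	/* ... both locks held and the PV list is unchanged ... */
	PMAP_UNLOCK(pmap);
	rw_wunlock(list_lock);
}
#endif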
5651
5652 /*
5653  *      Clear the wired attribute from the mappings for the specified range of
5654  *      addresses in the given pmap.  Every valid mapping within that range
5655  *      must have the wired attribute set.  In contrast, invalid mappings
5656  *      cannot have the wired attribute set, so they are ignored.
5657  *
5658  *      The wired attribute of the page table entry is not a hardware
5659  *      feature, so there is no need to invalidate any TLB entries.
5660  *      Since pmap_demote_l3e() for the wired entry must never fail,
5661  *      pmap_delayed_invl_started()/finished() calls around the
5662  *      function are not needed.
5663  */
5664 void
5665 mmu_radix_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5666 {
5667         vm_offset_t va_next;
5668         pml1_entry_t *l1e;
5669         pml2_entry_t *l2e;
5670         pml3_entry_t *l3e;
5671         pt_entry_t *pte;
5672
5673         CTR4(KTR_PMAP, "%s(%p, %#x, %#x)", __func__, pmap, sva, eva);
5674         PMAP_LOCK(pmap);
5675         for (; sva < eva; sva = va_next) {
5676                 l1e = pmap_pml1e(pmap, sva);
5677                 if ((be64toh(*l1e) & PG_V) == 0) {
5678                         va_next = (sva + L1_PAGE_SIZE) & ~L1_PAGE_MASK;
5679                         if (va_next < sva)
5680                                 va_next = eva;
5681                         continue;
5682                 }
5683                 l2e = pmap_l1e_to_l2e(l1e, sva);
5684                 if ((be64toh(*l2e) & PG_V) == 0) {
5685                         va_next = (sva + L2_PAGE_SIZE) & ~L2_PAGE_MASK;
5686                         if (va_next < sva)
5687                                 va_next = eva;
5688                         continue;
5689                 }
5690                 va_next = (sva + L3_PAGE_SIZE) & ~L3_PAGE_MASK;
5691                 if (va_next < sva)
5692                         va_next = eva;
5693                 l3e = pmap_l2e_to_l3e(l2e, sva);
5694                 if ((be64toh(*l3e) & PG_V) == 0)
5695                         continue;
5696                 if ((be64toh(*l3e) & RPTE_LEAF) != 0) {
5697                         if ((be64toh(*l3e) & PG_W) == 0)
5698                                 panic("pmap_unwire: pde %#jx is missing PG_W",
5699                                     (uintmax_t)(be64toh(*l3e)));
5700
5701                         /*
5702                          * Are we unwiring the entire large page?  If not,
5703                          * demote the mapping and fall through.
5704                          */
5705                         if (sva + L3_PAGE_SIZE == va_next && eva >= va_next) {
5706                                 atomic_clear_long(l3e, htobe64(PG_W));
5707                                 pmap->pm_stats.wired_count -= L3_PAGE_SIZE /
5708                                     PAGE_SIZE;
5709                                 continue;
5710                         } else if (!pmap_demote_l3e(pmap, l3e, sva))
5711                                 panic("pmap_unwire: demotion failed");
5712                 }
5713                 if (va_next > eva)
5714                         va_next = eva;
5715                 for (pte = pmap_l3e_to_pte(l3e, sva); sva != va_next; pte++,
5716                     sva += PAGE_SIZE) {
5717                         MPASS(pte == pmap_pte(pmap, sva));
5718                         if ((be64toh(*pte) & PG_V) == 0)
5719                                 continue;
5720                         if ((be64toh(*pte) & PG_W) == 0)
5721                                 panic("pmap_unwire: pte %#jx is missing PG_W",
5722                                     (uintmax_t)(be64toh(*pte)));
5723
5724                         /*
5725                          * PG_W must be cleared atomically.  Although the pmap
5726                          * lock synchronizes access to PG_W, another processor
5727                          * could be setting PG_M and/or PG_A concurrently.
5728                          */
5729                         atomic_clear_long(pte, htobe64(PG_W));
5730                         pmap->pm_stats.wired_count--;
5731                 }
5732         }
5733         PMAP_UNLOCK(pmap);
5734 }
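/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * the range walk in mmu_radix_unwire() advances level by level, rounding
 * "sva" up to the next boundary of whichever level was found invalid and
 * clamping against both address-space wraparound and the end of the range.
 * The hypothetical helper below isolates that boundary arithmetic.
 */
#if 0
static vm_offset_t
example_next_boundary(vm_offset_t sva, vm_offset_t eva, vm_offset_t level_size)
{
	vm_offset_t va_next;

	va_next = (sva + level_size) & ~(level_size - 1);
	if (va_next < sva)	/* wrapped past the top of the address space */
		va_next = eva;
	if (va_next > eva)	/* never step past the end of the range */
		va_next = eva;
	return (va_next);
}
#endif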
5735
5736 void
5737 mmu_radix_zero_page(vm_page_t m)
5738 {
5739         vm_offset_t addr;
5740
5741         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
5742         addr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5743         pagezero(addr);
5744 }
5745
5746 void
5747 mmu_radix_zero_page_area(vm_page_t m, int off, int size)
5748 {
5749         caddr_t addr;
5750
5751         CTR4(KTR_PMAP, "%s(%p, %d, %d)", __func__, m, off, size);
5752         MPASS(off + size <= PAGE_SIZE);
5753         addr = (caddr_t)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5754         memset(addr + off, 0, size);
5755 }
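/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * both zero-page routines above rely on the radix pmap's direct map, in
 * which every physical page of RAM has a permanent kernel alias at
 * PHYS_TO_DMAP(pa), so no transient mapping or pmap lock is needed.  The
 * hypothetical helper below uses the same access pattern to copy a page.
 */
#if 0
static void
example_copy_page_via_dmap(vm_page_t src, vm_page_t dst)
{
	vm_offset_t s, d;

	s = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(src));
	d = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst));
	memcpy((void *)d, (void *)s, PAGE_SIZE);
}
#endif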
5756
5757 static int
5758 mmu_radix_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
5759 {
5760         pml3_entry_t *l3ep;
5761         pt_entry_t pte;
5762         vm_paddr_t pa;
5763         int val;
5764
5765         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, pmap, addr);
5766         PMAP_LOCK(pmap);
5767
5768         l3ep = pmap_pml3e(pmap, addr);
5769         if (l3ep != NULL && (be64toh(*l3ep) & PG_V)) {
5770                 if (be64toh(*l3ep) & RPTE_LEAF) {
5771                         pte = be64toh(*l3ep);
5772                         /* Compute the physical address of the 4KB page. */
5773                         pa = ((be64toh(*l3ep) & PG_PS_FRAME) | (addr & L3_PAGE_MASK)) &
5774                             PG_FRAME;
5775                         val = MINCORE_PSIND(1);
5776                 } else {
5777                         /* Native endian PTE, do not pass to functions */
5778                         pte = be64toh(*pmap_l3e_to_pte(l3ep, addr));
5779                         pa = pte & PG_FRAME;
5780                         val = 0;
5781                 }
5782         } else {
5783                 pte = 0;
5784                 pa = 0;
5785                 val = 0;
5786         }
5787         if ((pte & PG_V) != 0) {
5788                 val |= MINCORE_INCORE;
5789                 if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
5790                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
5791                 if ((pte & PG_A) != 0)
5792                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
5793         }
5794         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
5795             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
5796             (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
5797                 *locked_pa = pa;
5798         }
5799         PMAP_UNLOCK(pmap);
5800         return (val);
5801 }
5802
5803 void
5804 mmu_radix_activate(struct thread *td)
5805 {
5806         pmap_t pmap;
5807         uint32_t curpid;
5808
5809         CTR2(KTR_PMAP, "%s(%p)", __func__, td);
5810         critical_enter();
5811         pmap = vmspace_pmap(td->td_proc->p_vmspace);
5812         curpid = mfspr(SPR_PID);
5813         if (pmap->pm_pid > isa3_base_pid &&
5814             curpid != pmap->pm_pid) {
5815                 mmu_radix_pid_set(pmap);
5816         }
5817         critical_exit();
5818 }
5819
5820 /*
5821  *      Increase the starting virtual address of the given mapping if a
5822  *      different alignment might result in more superpage mappings.
5823  */
5824 void
5825 mmu_radix_align_superpage(vm_object_t object, vm_ooffset_t offset,
5826     vm_offset_t *addr, vm_size_t size)
5827 {
5828
5829         CTR5(KTR_PMAP, "%s(%p, %#x, %p, %#x)", __func__, object, offset, addr,
5830             size);
5831         vm_offset_t superpage_offset;
5832
5833         if (size < L3_PAGE_SIZE)
5834                 return;
5835         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
5836                 offset += ptoa(object->pg_color);
5837         superpage_offset = offset & L3_PAGE_MASK;
5838         if (size - ((L3_PAGE_SIZE - superpage_offset) & L3_PAGE_MASK) < L3_PAGE_SIZE ||
5839             (*addr & L3_PAGE_MASK) == superpage_offset)
5840                 return;
5841         if ((*addr & L3_PAGE_MASK) < superpage_offset)
5842                 *addr = (*addr & ~L3_PAGE_MASK) + superpage_offset;
5843         else
5844                 *addr = ((*addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset;
5845 }
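/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * mmu_radix_align_superpage() above keeps the virtual address at the same
 * offset within a 2MB frame as the backing object offset, so that fully
 * populated, aligned runs can later be promoted.  For example, if the
 * object offset is 0x3000 bytes into a 2MB frame, the chosen address is
 * bumped to the next location that is also 0x3000 bytes into a 2MB frame.
 * The hypothetical helper below restates that arithmetic.
 */
#if 0
static vm_offset_t
example_superpage_align(vm_offset_t addr, vm_ooffset_t offset)
{
	vm_offset_t superpage_offset;

	superpage_offset = offset & L3_PAGE_MASK;	/* e.g. 0x3000 */
	if ((addr & L3_PAGE_MASK) == superpage_offset)
		return (addr);				/* already aligned */
	if ((addr & L3_PAGE_MASK) < superpage_offset)
		return ((addr & ~L3_PAGE_MASK) + superpage_offset);
	return (((addr + L3_PAGE_MASK) & ~L3_PAGE_MASK) + superpage_offset);
}
#endif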
5846
5847 static void *
5848 mmu_radix_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t attr)
5849 {
5850         vm_offset_t va, tmpva, ppa, offset;
5851
5852         ppa = trunc_page(pa);
5853         offset = pa & PAGE_MASK;
5854         size = roundup2(offset + size, PAGE_SIZE);
5855         if (pa < powerpc_ptob(Maxmem))
5856                 panic("bad pa: %#lx less than Maxmem %#lx\n",
5857                           pa, powerpc_ptob(Maxmem));
5858         va = kva_alloc(size);
5859         if (bootverbose)
5860                 printf("%s(%#lx, %lu, %d)\n", __func__, pa, size, attr);
5861         KASSERT(size > 0, ("%s(%#lx, %lu, %d)", __func__, pa, size, attr));
5862
5863         if (!va)
5864                 panic("%s: Couldn't alloc kernel virtual memory", __func__);
5865
5866         for (tmpva = va; size > 0;) {
5867                 mmu_radix_kenter_attr(tmpva, ppa, attr);
5868                 size -= PAGE_SIZE;
5869                 tmpva += PAGE_SIZE;
5870                 ppa += PAGE_SIZE;
5871         }
5872         ptesync();
5873
5874         return ((void *)(va + offset));
5875 }
5876
5877 static void *
5878 mmu_radix_mapdev(vm_paddr_t pa, vm_size_t size)
5879 {
5880
5881         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
5882
5883         return (mmu_radix_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT));
5884 }
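/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * drivers normally reach the mapdev path above through the MI
 * pmap_mapdev()/pmap_unmapdev() wrappers (or via bus_space), whose exact
 * prototypes vary between branches.  The register base address below is
 * made up and the helper name is hypothetical.
 */
#if 0
static void
example_map_device_registers(void)
{
	vm_paddr_t regs_pa = 0x3fe000000000UL;	/* hypothetical MMIO base */
	vm_size_t regs_sz = PAGE_SIZE;
	volatile uint32_t *regs;

	regs = pmap_mapdev(regs_pa, regs_sz);	/* cache-inhibited, guarded */
	(void)regs[0];				/* read a device register */
	pmap_unmapdev(__DEVOLATILE(void *, regs), regs_sz);
}
#endif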
5885
5886 void
5887 mmu_radix_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5888 {
5889
5890         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, m, ma);
5891         m->md.mdpg_cache_attrs = ma;
5892
5893         /*
5894          * If "m" is a normal page, update its direct mapping.  This update
5895          * can be relied upon to perform any cache operations that are
5896          * required for data coherence.
5897          */
5898         if ((m->flags & PG_FICTITIOUS) == 0 &&
5899             mmu_radix_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)),
5900             PAGE_SIZE, m->md.mdpg_cache_attrs))
5901                 panic("memory attribute change on the direct map failed");
5902 }
5903
5904 static void
5905 mmu_radix_unmapdev(void *p, vm_size_t size)
5906 {
5907         vm_offset_t offset, va;
5908
5909         CTR3(KTR_PMAP, "%s(%p, %#x)", __func__, p, size);
5910
5911         /* If we gave a direct map region in pmap_mapdev, do nothing */
5912         va = (vm_offset_t)p;
5913         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
5914                 return;
5915
5916         offset = va & PAGE_MASK;
5917         size = round_page(offset + size);
5918         va = trunc_page(va);
5919
5920         if (pmap_initialized) {
5921                 mmu_radix_qremove(va, atop(size));
5922                 kva_free(va, size);
5923         }
5924 }
5925
5926 void
5927 mmu_radix_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
5928 {
5929         vm_paddr_t pa = 0;
5930         int sync_sz;
5931
5932         while (sz > 0) {
5933                 pa = pmap_extract(pm, va);
5934                 sync_sz = PAGE_SIZE - (va & PAGE_MASK);
5935                 sync_sz = min(sync_sz, sz);
5936                 if (pa != 0) {
5937                         pa += (va & PAGE_MASK);
5938                         __syncicache((void *)PHYS_TO_DMAP(pa), sync_sz);
5939                 }
5940                 va += sync_sz;
5941                 sz -= sync_sz;
5942         }
5943 }
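/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * PowerPC has split instruction and data caches, so after the kernel
 * stores instructions into another address space (for example, a debugger
 * planting a breakpoint), the range must be synchronized before it is
 * fetched.  mmu_radix_sync_icache() above does this page by page through
 * the direct map; callers use the MI pmap_sync_icache() wrapper.  The
 * scenario and helper name below are hypothetical.
 */
#if 0
static void
example_after_breakpoint_write(struct thread *td, vm_offset_t va,
    uint32_t insn)
{
	struct proc *p = td->td_proc;

	/* The breakpoint instruction would be copied in via proc_rwmem(). */
	(void)insn;
	pmap_sync_icache(vmspace_pmap(p->p_vmspace), va, sizeof(insn));
}
#endif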
5944
5945 static __inline void
5946 pmap_pte_attr(pt_entry_t *pte, uint64_t cache_bits, uint64_t mask)
5947 {
5948         uint64_t opte, npte;
5949
5950         /*
5951          * Spin until the new attribute bits are swapped into the PTE;
5952          * the cmpset can fail if another CPU sets PG_A or PG_M concurrently.
5953          */
5954         do {
5955                 opte = be64toh(*pte);
5956                 npte = opte & ~mask;
5957                 npte |= cache_bits;
5958         } while (npte != opte && !atomic_cmpset_long(pte, htobe64(opte), htobe64(npte)));
5959 }
5960
5961 /*
5962  * Tries to demote a 1GB page mapping.
5963  */
5964 static boolean_t
5965 pmap_demote_l2e(pmap_t pmap, pml2_entry_t *l2e, vm_offset_t va)
5966 {
5967         pml2_entry_t oldpdpe;
5968         pml3_entry_t *firstpde, newpde, *pde;
5969         vm_paddr_t pdpgpa;
5970         vm_page_t pdpg;
5971
5972         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5973         oldpdpe = be64toh(*l2e);
5974         KASSERT((oldpdpe & (RPTE_LEAF | PG_V)) == (RPTE_LEAF | PG_V),
5975             ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
5976         pdpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
5977         if (pdpg == NULL) {
5978                 CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
5979                     " in pmap %p", va, pmap);
5980                 return (FALSE);
5981         }
5982         pdpg->pindex = va >> L2_PAGE_SIZE_SHIFT;
5983         pdpgpa = VM_PAGE_TO_PHYS(pdpg);
5984         firstpde = (pml3_entry_t *)PHYS_TO_DMAP(pdpgpa);
5985         KASSERT((oldpdpe & PG_A) != 0,
5986             ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
5987         KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
5988             ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
5989         newpde = oldpdpe;
5990
5991         /*
5992          * Initialize the page directory page.
5993          */
5994         for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
5995                 *pde = htobe64(newpde);
5996                 newpde += L3_PAGE_SIZE;
5997         }
5998
5999         /*
6000          * Demote the mapping.
6001          */
6002         pde_store(l2e, pdpgpa);
6003
6004         /*
6005          * Flush PWC --- XXX revisit
6006          */
6007         pmap_invalidate_all(pmap);
6008
6009         counter_u64_add(pmap_l2e_demotions, 1);
6010         CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
6011             " in pmap %p", va, pmap);
6012         return (TRUE);
6013 }
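/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * pmap_demote_l2e() above re-describes one 1GB leaf as NPDEPG 2MB entries
 * that keep the original flags while the physical address advances by
 * L3_PAGE_SIZE per slot.  The hypothetical helper below shows that
 * stride-fill on a caller-supplied page-table page.
 */
#if 0
static void
example_fill_demoted_entries(pml3_entry_t *table, pml2_entry_t old_1g_entry)
{
	pml3_entry_t entry;
	int i;

	entry = old_1g_entry;			/* same flags, same base PA */
	for (i = 0; i < NPDEPG; i++) {
		table[i] = htobe64(entry);	/* PTEs are stored big-endian */
		entry += L3_PAGE_SIZE;		/* advance 2MB per slot */
	}
}
#endif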
6014
6015 vm_paddr_t
6016 mmu_radix_kextract(vm_offset_t va)
6017 {
6018         pml3_entry_t l3e;
6019         vm_paddr_t pa;
6020
6021         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
6022         if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
6023                 pa = DMAP_TO_PHYS(va);
6024         } else {
6025                 /* Big-endian PTE on stack */
6026                 l3e = *pmap_pml3e(kernel_pmap, va);
6027                 if (be64toh(l3e) & RPTE_LEAF) {
6028                         pa = (be64toh(l3e) & PG_PS_FRAME) | (va & L3_PAGE_MASK);
6030                 } else {
6031                         /*
6032                          * Beware of a concurrent promotion that changes the
6033                          * PDE at this point!  For example, vtopte() must not
6034                          * be used to access the PTE because it would use the
6035                          * new PDE.  It is, however, safe to use the old PDE
6036                          * because the page table page is preserved by the
6037                          * promotion.
6038                          */
6039                         pa = be64toh(*pmap_l3e_to_pte(&l3e, va));
6040                         pa = (pa & PG_FRAME) | (va & PAGE_MASK);
6042                 }
6043         }
6044         return (pa);
6045 }
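/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * kernel virtual addresses come in two flavors here: direct-map addresses
 * translate by simple arithmetic, while page-table-mapped KVA needs the
 * walk above.  pmap_kextract() is the MI entry point that lands in
 * mmu_radix_kextract(); the round trip below is hypothetical and assumes
 * "kernel_buf" is ordinary RAM-backed kernel memory.
 */
#if 0
static void
example_kva_roundtrip(void *kernel_buf)
{
	vm_paddr_t pa;
	vm_offset_t dmap_alias;

	pa = pmap_kextract((vm_offset_t)kernel_buf);
	dmap_alias = PHYS_TO_DMAP(pa);
	/* Both names reference the same memory; page offsets must agree. */
	KASSERT((dmap_alias & PAGE_MASK) ==
	    ((vm_offset_t)kernel_buf & PAGE_MASK),
	    ("example_kva_roundtrip: offset mismatch"));
}
#endif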
6046
6047 static pt_entry_t
6048 mmu_radix_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
6049 {
6050
6051         if (ma != VM_MEMATTR_DEFAULT)
6052                 return (pmap_cache_bits(ma));
6054
6055         /*
6056          * Assume the page is cache inhibited and access is guarded unless
6057          * it's in our available memory array.
6058          */
6059         for (int i = 0; i < pregions_sz; i++) {
6060                 if ((pa >= pregions[i].mr_start) &&
6061                     (pa < (pregions[i].mr_start + pregions[i].mr_size)))
6062                         return (RPTE_ATTR_MEM);
6063         }
6064         return (RPTE_ATTR_GUARDEDIO);
6065 }
6066
6067 static void
6068 mmu_radix_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
6069 {
6070         pt_entry_t *pte, pteval;
6071         uint64_t cache_bits;
6072
6073         pte = kvtopte(va);
6074         MPASS(pte != NULL);
6075         pteval = pa | RPTE_EAA_R | RPTE_EAA_W | RPTE_EAA_P | PG_M | PG_A;
6076         cache_bits = mmu_radix_calc_wimg(pa, ma);
6077         pte_store(pte, pteval | cache_bits);
6078 }
6079
6080 void
6081 mmu_radix_kremove(vm_offset_t va)
6082 {
6083         pt_entry_t *pte;
6084
6085         CTR2(KTR_PMAP, "%s(%#x)", __func__, va);
6086
6087         pte = kvtopte(va);
6088         pte_clear(pte);
6089 }
6090
6091 int
6092 mmu_radix_decode_kernel_ptr(vm_offset_t addr,
6093     int *is_user, vm_offset_t *decoded)
6094 {
6095
6096         CTR2(KTR_PMAP, "%s(%#jx)", __func__, (uintmax_t)addr);
6097         *decoded = addr;
6098         *is_user = (addr < VM_MAXUSER_ADDRESS);
6099         return (0);
6100 }
6101
6102 static boolean_t
6103 mmu_radix_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
6104 {
6105
6106         CTR3(KTR_PMAP, "%s(%#x, %#x)", __func__, pa, size);
6107         return (mem_valid(pa, size));
6108 }
6109
6110 static void
6111 mmu_radix_scan_init(void)
6112 {
6113
6114         CTR1(KTR_PMAP, "%s()", __func__);
6115         UNIMPLEMENTED();
6116 }
6117
6118 static void
6119 mmu_radix_dumpsys_map(vm_paddr_t pa, size_t sz,
6120         void **va)
6121 {
6122         CTR4(KTR_PMAP, "%s(%#jx, %#zx, %p)", __func__, (uintmax_t)pa, sz, va);
6123         UNIMPLEMENTED();
6124 }
6125
6126 vm_offset_t
6127 mmu_radix_quick_enter_page(vm_page_t m)
6128 {
6129         vm_paddr_t paddr;
6130
6131         CTR2(KTR_PMAP, "%s(%p)", __func__, m);
6132         paddr = VM_PAGE_TO_PHYS(m);
6133         return (PHYS_TO_DMAP(paddr));
6134 }
6135
6136 void
6137 mmu_radix_quick_remove_page(vm_offset_t addr __unused)
6138 {
6139         /* no work to do here */
6140         CTR2(KTR_PMAP, "%s(%#x)", __func__, addr);
6141 }
6142
6143 static void
6144 pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
6145 {
6146         cpu_flush_dcache((void *)sva, eva - sva);
6147 }
6148
6149 int
6150 mmu_radix_change_attr(vm_offset_t va, vm_size_t size,
6151     vm_memattr_t mode)
6152 {
6153         int error;
6154
6155         CTR4(KTR_PMAP, "%s(%#x, %#zx, %d)", __func__, va, size, mode);
6156         PMAP_LOCK(kernel_pmap);
6157         error = pmap_change_attr_locked(va, size, mode, true);
6158         PMAP_UNLOCK(kernel_pmap);
6159         return (error);
6160 }
6161
6162 static int
6163 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool flush)
6164 {
6165         vm_offset_t base, offset, tmpva;
6166         vm_paddr_t pa_start, pa_end, pa_end1;
6167         pml2_entry_t *l2e;
6168         pml3_entry_t *l3e;
6169         pt_entry_t *pte;
6170         int cache_bits, error;
6171         boolean_t changed;
6172
6173         PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6174         base = trunc_page(va);
6175         offset = va & PAGE_MASK;
6176         size = round_page(offset + size);
6177
6178         /*
6179          * Only supported on kernel virtual addresses, including the direct
6180          * map but excluding the recursive map.
6181          */
6182         if (base < DMAP_MIN_ADDRESS)
6183                 return (EINVAL);
6184
6185         cache_bits = pmap_cache_bits(mode);
6186         changed = FALSE;
6187
6188         /*
6189          * Pages that aren't mapped aren't supported.  Also break down 2MB pages
6190          * into 4KB pages if required.
6191          */
6192         for (tmpva = base; tmpva < base + size; ) {
6193                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6194                 if (l2e == NULL || *l2e == 0)
6195                         return (EINVAL);
6196                 if (be64toh(*l2e) & RPTE_LEAF) {
6197                         /*
6198                          * If the current 1GB page already has the required
6199                          * memory type, then we need not demote this page. Just
6200                          * increment tmpva to the next 1GB page frame.
6201                          */
6202                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) == cache_bits) {
6203                                 tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6204                                 continue;
6205                         }
6206
6207                         /*
6208                          * If the current offset aligns with a 1GB page frame
6209                          * and there is at least 1GB left within the range, then
6210                          * we need not break down this page into 2MB pages.
6211                          */
6212                         if ((tmpva & L2_PAGE_MASK) == 0 &&
6213                             tmpva + L2_PAGE_MASK < base + size) {
6214                         tmpva += L2_PAGE_SIZE;
6215                                 continue;
6216                         }
6217                         if (!pmap_demote_l2e(kernel_pmap, l2e, tmpva))
6218                                 return (ENOMEM);
6219                 }
6220                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6221                 KASSERT(l3e != NULL, ("no l3e entry for %#lx in %p\n",
6222                     tmpva, l2e));
6223                 if (*l3e == 0)
6224                         return (EINVAL);
6225                 if (be64toh(*l3e) & RPTE_LEAF) {
6226                         /*
6227                          * If the current 2MB page already has the required
6228                          * memory type, then we need not demote this page. Just
6229                          * increment tmpva to the next 2MB page frame.
6230                          */
6231                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) == cache_bits) {
6232                                 tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6233                                 continue;
6234                         }
6235
6236                         /*
6237                          * If the current offset aligns with a 2MB page frame
6238                          * and there is at least 2MB left within the range, then
6239                          * we need not break down this page into 4KB pages.
6240                          */
6241                         if ((tmpva & L3_PAGE_MASK) == 0 &&
6242                             tmpva + L3_PAGE_MASK < base + size) {
6243                                 tmpva += L3_PAGE_SIZE;
6244                                 continue;
6245                         }
6246                         if (!pmap_demote_l3e(kernel_pmap, l3e, tmpva))
6247                                 return (ENOMEM);
6248                 }
6249                 pte = pmap_l3e_to_pte(l3e, tmpva);
6250                 if (*pte == 0)
6251                         return (EINVAL);
6252                 tmpva += PAGE_SIZE;
6253         }
6254         error = 0;
6255
6256         /*
6257          * Ok, all the pages exist, so run through them updating their
6258          * cache mode if required.
6259          */
6260         pa_start = pa_end = 0;
6261         for (tmpva = base; tmpva < base + size; ) {
6262                 l2e = pmap_pml2e(kernel_pmap, tmpva);
6263                 if (be64toh(*l2e) & RPTE_LEAF) {
6264                         if ((be64toh(*l2e) & RPTE_ATTR_MASK) != cache_bits) {
6265                                 pmap_pte_attr(l2e, cache_bits,
6266                                     RPTE_ATTR_MASK);
6267                                 changed = TRUE;
6268                         }
6269                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6270                             (be64toh(*l2e) & PG_PS_FRAME) < dmaplimit) {
6271                                 if (pa_start == pa_end) {
6272                                         /* Start physical address run. */
6273                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
6274                                         pa_end = pa_start + L2_PAGE_SIZE;
6275                                 } else if (pa_end == (be64toh(*l2e) & PG_PS_FRAME))
6276                                         pa_end += L2_PAGE_SIZE;
6277                                 else {
6278                                         /* Run ended, update direct map. */
6279                                         error = pmap_change_attr_locked(
6280                                             PHYS_TO_DMAP(pa_start),
6281                                             pa_end - pa_start, mode, flush);
6282                                         if (error != 0)
6283                                                 break;
6284                                         /* Start physical address run. */
6285                                         pa_start = be64toh(*l2e) & PG_PS_FRAME;
6286                                         pa_end = pa_start + L2_PAGE_SIZE;
6287                                 }
6288                         }
6289                         tmpva = trunc_1gpage(tmpva) + L2_PAGE_SIZE;
6290                         continue;
6291                 }
6292                 l3e = pmap_l2e_to_l3e(l2e, tmpva);
6293                 if (be64toh(*l3e) & RPTE_LEAF) {
6294                         if ((be64toh(*l3e) & RPTE_ATTR_MASK) != cache_bits) {
6295                                 pmap_pte_attr(l3e, cache_bits,
6296                                     RPTE_ATTR_MASK);
6297                                 changed = TRUE;
6298                         }
6299                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6300                             (be64toh(*l3e) & PG_PS_FRAME) < dmaplimit) {
6301                                 if (pa_start == pa_end) {
6302                                         /* Start physical address run. */
6303                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
6304                                         pa_end = pa_start + L3_PAGE_SIZE;
6305                                 } else if (pa_end == (be64toh(*l3e) & PG_PS_FRAME))
6306                                         pa_end += L3_PAGE_SIZE;
6307                                 else {
6308                                         /* Run ended, update direct map. */
6309                                         error = pmap_change_attr_locked(
6310                                             PHYS_TO_DMAP(pa_start),
6311                                             pa_end - pa_start, mode, flush);
6312                                         if (error != 0)
6313                                                 break;
6314                                         /* Start physical address run. */
6315                                         pa_start = be64toh(*l3e) & PG_PS_FRAME;
6316                                         pa_end = pa_start + L3_PAGE_SIZE;
6317                                 }
6318                         }
6319                         tmpva = trunc_2mpage(tmpva) + L3_PAGE_SIZE;
6320                 } else {
6321                         pte = pmap_l3e_to_pte(l3e, tmpva);
6322                         if ((be64toh(*pte) & RPTE_ATTR_MASK) != cache_bits) {
6323                                 pmap_pte_attr(pte, cache_bits,
6324                                     RPTE_ATTR_MASK);
6325                                 changed = TRUE;
6326                         }
6327                         if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
6328                             (be64toh(*pte) & PG_FRAME) < dmaplimit) {
6329                                 if (pa_start == pa_end) {
6330                                         /* Start physical address run. */
6331                                         pa_start = be64toh(*pte) & PG_FRAME;
6332                                         pa_end = pa_start + PAGE_SIZE;
6333                                 } else if (pa_end == (be64toh(*pte) & PG_FRAME))
6334                                         pa_end += PAGE_SIZE;
6335                                 else {
6336                                         /* Run ended, update direct map. */
6337                                         error = pmap_change_attr_locked(
6338                                             PHYS_TO_DMAP(pa_start),
6339                                             pa_end - pa_start, mode, flush);
6340                                         if (error != 0)
6341                                                 break;
6342                                         /* Start physical address run. */
6343                                         pa_start = be64toh(*pte) & PG_FRAME;
6344                                         pa_end = pa_start + PAGE_SIZE;
6345                                 }
6346                         }
6347                         tmpva += PAGE_SIZE;
6348                 }
6349         }
6350         if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
6351                 pa_end1 = MIN(pa_end, dmaplimit);
6352                 if (pa_start != pa_end1)
6353                         error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
6354                             pa_end1 - pa_start, mode, flush);
6355         }
6356
6357         /*
6358          * Flush CPU caches if required to make sure any data isn't cached that
6359          * shouldn't be, etc.
6360          */
6361         if (changed) {
6362                 pmap_invalidate_all(kernel_pmap);
6363
6364                 if (flush)
6365                         pmap_invalidate_cache_range(base, tmpva);
6366         }
6367         return (error);
6368 }
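/*
 * Illustrative sketch (compiled out, not part of the pmap implementation):
 * pmap_change_attr_locked() above touches the direct map lazily: while
 * walking the KVA range it accumulates maximal runs of physically
 * contiguous pages in [pa_start, pa_end) and recurses once per run rather
 * than once per page.  The hypothetical helper below shows that coalescing
 * pattern over an arbitrary stream of page addresses.
 */
#if 0
static void
example_coalesce_phys_runs(vm_paddr_t *pages, int npages,
    void (*update_run)(vm_paddr_t start, vm_paddr_t end))
{
	vm_paddr_t pa_start, pa_end;
	int i;

	pa_start = pa_end = 0;
	for (i = 0; i < npages; i++) {
		if (pa_start == pa_end) {
			pa_start = pages[i];		/* start a new run */
			pa_end = pa_start + PAGE_SIZE;
		} else if (pages[i] == pa_end) {
			pa_end += PAGE_SIZE;		/* extend the run */
		} else {
			update_run(pa_start, pa_end);	/* run ended */
			pa_start = pages[i];
			pa_end = pa_start + PAGE_SIZE;
		}
	}
	if (pa_start != pa_end)
		update_run(pa_start, pa_end);		/* flush the final run */
}
#endif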
6369
6370 /*
6371  * Allocate physical memory for the vm_page array and map it into KVA,
6372  * attempting to back the vm_pages with domain-local memory.
6373  */
6374 void
6375 mmu_radix_page_array_startup(long pages)
6376 {
6377 #ifdef notyet
6378         pml2_entry_t *l2e;
6379         pml3_entry_t *pde;
6380         pml3_entry_t newl3;
6381         vm_offset_t va;
6382         long pfn;
6383         int domain, i;
6384 #endif
6385         vm_paddr_t pa;
6386         vm_offset_t start, end;
6387
6388         vm_page_array_size = pages;
6389
6390         start = VM_MIN_KERNEL_ADDRESS;
6391         end = start + pages * sizeof(struct vm_page);
6392
6393         pa = vm_phys_early_alloc(0, end - start);
6394
6395         start = mmu_radix_map(&start, pa, end - start, VM_MEMATTR_DEFAULT);
6396 #ifdef notyet
6397         /* TODO: NUMA vm_page_array.  Blocked out until then (copied from amd64). */
6398         for (va = start; va < end; va += L3_PAGE_SIZE) {
6399                 pfn = first_page + (va - start) / sizeof(struct vm_page);
6400                 domain = vm_phys_domain(ptoa(pfn));
6401                 l2e = pmap_pml2e(kernel_pmap, va);
6402                 if ((be64toh(*l2e) & PG_V) == 0) {
6403                         pa = vm_phys_early_alloc(domain, PAGE_SIZE);
6404                         dump_add_page(pa);
6405                         pagezero(PHYS_TO_DMAP(pa));
6406                         pde_store(l2e, (pml2_entry_t)pa);
6407                 }
6408                 pde = pmap_l2e_to_l3e(l2e, va);
6409                 if ((be64toh(*pde) & PG_V) != 0)
6410                         panic("Unexpected pde %p", pde);
6411                 pa = vm_phys_early_alloc(domain, L3_PAGE_SIZE);
6412                 for (i = 0; i < NPDEPG; i++)
6413                         dump_add_page(pa + i * PAGE_SIZE);
6414                 newl3 = (pml3_entry_t)(pa | RPTE_EAA_P | RPTE_EAA_R | RPTE_EAA_W);
6415                 pte_store(pde, newl3);
6416         }
6417 #endif
6418         vm_page_array = (vm_page_t)start;
6419 }
6420
6421 #ifdef DDB
6422 #include <sys/kdb.h>
6423 #include <ddb/ddb.h>
6424
6425 static void
6426 pmap_pte_walk(pml1_entry_t *l1, vm_offset_t va)
6427 {
6428         pml1_entry_t *l1e;
6429         pml2_entry_t *l2e;
6430         pml3_entry_t *l3e;
6431         pt_entry_t *pte;
6432
6433         l1e = &l1[pmap_pml1e_index(va)];
6434         db_printf("VA %#016lx l1e %#016lx", va, be64toh(*l1e));
6435         if ((be64toh(*l1e) & PG_V) == 0) {
6436                 db_printf("\n");
6437                 return;
6438         }
6439         l2e = pmap_l1e_to_l2e(l1e, va);
6440         db_printf(" l2e %#016lx", be64toh(*l2e));
6441         if ((be64toh(*l2e) & PG_V) == 0 || (be64toh(*l2e) & RPTE_LEAF) != 0) {
6442                 db_printf("\n");
6443                 return;
6444         }
6445         l3e = pmap_l2e_to_l3e(l2e, va);
6446         db_printf(" l3e %#016lx", be64toh(*l3e));
6447         if ((be64toh(*l3e) & PG_V) == 0 || (be64toh(*l3e) & RPTE_LEAF) != 0) {
6448                 db_printf("\n");
6449                 return;
6450         }
6451         pte = pmap_l3e_to_pte(l3e, va);
6452         db_printf(" pte %#016lx\n", be64toh(*pte));
6453 }
6454
6455 void
6456 pmap_page_print_mappings(vm_page_t m)
6457 {
6458         pmap_t pmap;
6459         pv_entry_t pv;
6460
6461         db_printf("page %p(%lx)\n", m, m->phys_addr);
6462         /* need to elide locks if running in ddb */
6463         TAILQ_FOREACH(pv, &m->md.pv_list, pv_link) {
6464                 db_printf("pv: %p ", pv);
6465                 db_printf("va: %#016lx ", pv->pv_va);
6466                 pmap = PV_PMAP(pv);
6467                 db_printf("pmap %p  ", pmap);
6468                 if (pmap != NULL) {
6469                         db_printf("asid: %lu\n", pmap->pm_pid);
6470                         pmap_pte_walk(pmap->pm_pml1, pv->pv_va);
6471                 }
6472         }
6473 }
6474
6475 DB_SHOW_COMMAND(pte, pmap_print_pte)
6476 {
6477         vm_offset_t va;
6478         pmap_t pmap;
6479
6480         if (!have_addr) {
6481                 db_printf("show pte addr\n");
6482                 return;
6483         }
6484         va = (vm_offset_t)addr;
6485
6486         if (va >= DMAP_MIN_ADDRESS)
6487                 pmap = kernel_pmap;
6488         else if (kdb_thread != NULL)
6489                 pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
6490         else
6491                 pmap = vmspace_pmap(curthread->td_proc->p_vmspace);
6492
6493         pmap_pte_walk(pmap->pm_pml1, va);
6494 }
6495
6496 #endif