sys/powerpc/aim/mmu_oea64.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2008-2015 Nathan Whitehorn
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 /*
  33  * Manages physical address maps.
  34  *
  35  * Since the information managed by this module is also stored by the
  36  * logical address mapping module, this module may throw away valid virtual
  37  * to physical mappings at almost any time.  However, invalidations of
  38  * mappings must be done as requested.
  39  *
 * In order to cope with hardware architectures which make virtual to
 * physical map invalidates expensive, this module may delay invalidate
 * or reduced protection operations until such time as they are actually
 * necessary.  This module is given full information as to which processors
 * are currently using which maps, and as to when physical maps must be made
 * correct.
  46  */
  47
  48 #include "opt_kstack_pages.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/kernel.h>
  52 #include <sys/conf.h>
  53 #include <sys/queue.h>
  54 #include <sys/cpuset.h>
  55 #include <sys/kerneldump.h>
  56 #include <sys/ktr.h>
  57 #include <sys/lock.h>
  58 #include <sys/msgbuf.h>
  59 #include <sys/malloc.h>
  60 #include <sys/mman.h>
  61 #include <sys/mutex.h>
  62 #include <sys/proc.h>
  63 #include <sys/rwlock.h>
  64 #include <sys/sched.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/systm.h>
  67 #include <sys/vmmeter.h>
  68 #include <sys/smp.h>
  69 #include <sys/reboot.h>
  70
  71 #include <sys/kdb.h>
  72
  73 #include <dev/ofw/openfirm.h>
  74
  75 #include <vm/vm.h>
  76 #include <vm/pmap.h>
  77 #include <vm/vm_param.h>
  78 #include <vm/vm_kern.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/vm_phys.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/vm_extern.h>
  84 #include <vm/vm_pageout.h>
  85 #include <vm/vm_dumpset.h>
  86 #include <vm/vm_reserv.h>
  87 #include <vm/uma.h>
  88
  89 #include <machine/_inttypes.h>
  90 #include <machine/cpu.h>
  91 #include <machine/ifunc.h>
  92 #include <machine/platform.h>
  93 #include <machine/frame.h>
  94 #include <machine/md_var.h>
  95 #include <machine/psl.h>
  96 #include <machine/bat.h>
  97 #include <machine/hid.h>
  98 #include <machine/pte.h>
  99 #include <machine/sr.h>
 100 #include <machine/trap.h>
 101 #include <machine/mmuvar.h>
 102
 103 #include "mmu_oea64.h"
 104
 105 void moea64_release_vsid(uint64_t vsid);
 106 uintptr_t moea64_get_unique_vsid(void);
 107
 108 #define DISABLE_TRANS(msr)      msr = mfmsr(); mtmsr(msr & ~PSL_DR)
 109 #define ENABLE_TRANS(msr)       mtmsr(msr)
 110
 111 #define VSID_MAKE(sr, hash)     ((sr) | (((hash) & 0xfffff) << 4))
 112 #define VSID_TO_HASH(vsid)      (((vsid) >> 4) & 0xfffff)
 113 #define VSID_HASH_MASK          0x0000007fffffffffULL
 114
/*
 * Locking semantics:
 *
 * There are two locks of interest: the page locks and the pmap locks, which
 * protect their individual PVO lists and are locked in that order. The contents
 * of all PVO entries are protected by the locks of their respective pmaps.
 * The pmap of any PVO is guaranteed not to change so long as the PVO is linked
 * into any list.
 *
 */

/* The page (PV) locks: a fixed array of mutexes selected by physical address. */
#define PV_LOCK_COUNT	PA_LOCK_COUNT
static struct mtx_padalign pv_lock[PV_LOCK_COUNT];

/*
 * Cheap NUMA-izing of the pv locks, to reduce contention across domains.
 * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the
 * index at (N << 45).
 */
#ifdef __powerpc64__
#define PV_LOCK_IDX(pa)	((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT)
#else
#define PV_LOCK_IDX(pa)	(pa_index(pa) % PV_LOCK_COUNT)
#endif
/*
 * Lock, unlock and ownership-assert helpers.  The (pa) forms take a physical
 * address; the PV_PAGE_* forms take a vm_page_t and convert it with
 * VM_PAGE_TO_PHYS().  Note that 'pa' is evaluated more than once by
 * PV_LOCK_IDX() on powerpc64, so arguments must be free of side effects.
 */
#define PV_LOCKPTR(pa)	((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)]))
#define PV_LOCK(pa)		mtx_lock(PV_LOCKPTR(pa))
#define PV_UNLOCK(pa)		mtx_unlock(PV_LOCKPTR(pa))
#define PV_LOCKASSERT(pa) 	mtx_assert(PV_LOCKPTR(pa), MA_OWNED)
#define PV_PAGE_LOCK(m)		PV_LOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_UNLOCK(m)	PV_UNLOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_LOCKASSERT(m)	PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
 146
 147 /* Superpage PV lock */
 148
 149 #define PV_LOCK_SIZE            (1<<PDRSHIFT)
 150
 151 static __always_inline void
 152 moea64_sp_pv_lock(vm_paddr_t pa)
 153 {
 154         vm_paddr_t pa_end;
 155
 156         /* Note: breaking when pa_end is reached to avoid overflows */
 157         pa_end = pa + (HPT_SP_SIZE - PV_LOCK_SIZE);
 158         for (;;) {
 159                 mtx_lock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
 160                 if (pa == pa_end)
 161                         break;
 162                 pa += PV_LOCK_SIZE;
 163         }
 164 }
 165
 166 static __always_inline void
 167 moea64_sp_pv_unlock(vm_paddr_t pa)
 168 {
 169         vm_paddr_t pa_end;
 170
 171         /* Note: breaking when pa_end is reached to avoid overflows */
 172         pa_end = pa;
 173         pa += HPT_SP_SIZE - PV_LOCK_SIZE;
 174         for (;;) {
 175                 mtx_unlock_flags(PV_LOCKPTR(pa), MTX_DUPOK);
 176                 if (pa == pa_end)
 177                         break;
 178                 pa -= PV_LOCK_SIZE;
 179         }
 180 }
 181
/*
 * Superpage PV lock wrappers.
 *
 * The *_ALIGNED forms pass 'pa' through unchanged and therefore expect a
 * superpage-aligned address; the plain forms first round 'pa' down by
 * clearing HPT_SP_MASK.  The *_PAGE_* forms take a vm_page_t.
 */
#define	SP_PV_LOCK_ALIGNED(pa)		moea64_sp_pv_lock(pa)
#define	SP_PV_UNLOCK_ALIGNED(pa)	moea64_sp_pv_unlock(pa)
#define	SP_PV_LOCK(pa)			moea64_sp_pv_lock((pa) & ~HPT_SP_MASK)
#define	SP_PV_UNLOCK(pa)		moea64_sp_pv_unlock((pa) & ~HPT_SP_MASK)
#define	SP_PV_PAGE_LOCK(m)		SP_PV_LOCK(VM_PAGE_TO_PHYS(m))
#define	SP_PV_PAGE_UNLOCK(m)		SP_PV_UNLOCK(VM_PAGE_TO_PHYS(m))
 188
/*
 * One decoded entry of the Open Firmware "translations" property, as filled
 * in by moea64_add_ofw_mappings().
 */
struct ofw_map {
	cell_t	om_va;		/* virtual address of the mapping */
	cell_t	om_len;		/* length of the mapping, in bytes */
	uint64_t om_pa;		/* physical address (one or two cells wide) */
	cell_t	om_mode;	/* mode cell, stored as read from the property */
};
 195
/* Symbols defined outside this file. */
extern unsigned char _etext[];
extern unsigned char _end[];

extern void *slbtrap, *slbtrapend;

/*
 * Map of physical memory regions.
 */
static struct	mem_region *regions;
static struct	mem_region *pregions;	/* scanned by moea64_calc_wimg() */
static struct	numa_mem_region *numa_pregions;
static u_int	phys_avail_count;
static int	regions_sz, pregions_sz, numapregions_sz;

extern void bs_remap_earlyboot(void);

/*
 * Lock for the SLB tables.
 */
struct mtx	moea64_slb_mutex;

/*
 * PTEG data.
 */
u_long		moea64_pteg_count;
u_long		moea64_pteg_mask;	/* masks the hash in init_pvo_entry() */

/*
 * PVO data.
 */

uma_zone_t	moea64_pvo_zone; /* zone for pvo entries */

/*
 * Bootstrap PVO pool: alloc_pvo_entry() hands out entries from this array
 * while the module is not yet initialized or when a bootstrap allocation is
 * requested.  The index of the next free entry is exported read-only as
 * machdep.moea64_allocated_bpvo_entries.
 */
static struct	pvo_entry *moea64_bpvo_pool;
static int	moea64_bpvo_pool_index = 0;
static int	moea64_bpvo_pool_size = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
    &moea64_bpvo_pool_index, 0, "");

#define	BPVO_POOL_SIZE	327680 /* Sensible historical default value */
#define	BPVO_POOL_EXPANSION_FACTOR	3
/* Number of bits in one word of the VSID bitmap below. */
#define	VSID_NBPW	(sizeof(u_int32_t) * 8)
#ifdef __powerpc64__
#define	NVSIDS		(NPMAPS * 16)
#define	VSID_HASHMASK	0xffffffffUL
#else
#define NVSIDS		NPMAPS
#define VSID_HASHMASK	0xfffffUL
#endif
static u_int	moea64_vsid_bitmap[NVSIDS / VSID_NBPW];

/* FALSE until initialization completes; consulted by alloc_pvo_entry(). */
static boolean_t moea64_initialized = FALSE;
 248
#ifdef MOEA64_STATS
/*
 * Statistics.
 *
 * Only compiled in when MOEA64_STATS is defined; each counter is exported
 * read-only under the machdep sysctl tree.
 */
u_int	moea64_pte_valid = 0;
u_int	moea64_pte_overflow = 0;
u_int	moea64_pvo_entries = 0;
u_int	moea64_pvo_enter_calls = 0;
u_int	moea64_pvo_remove_calls = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
    &moea64_pte_valid, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
    &moea64_pte_overflow, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
    &moea64_pvo_entries, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
    &moea64_pvo_enter_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
    &moea64_pvo_remove_calls, 0, "");
#endif

/* Two scratch-page slots (VA and backing PVO each) and the mutex guarding them. */
vm_offset_t	moea64_scratchpage_va[2];
struct pvo_entry *moea64_scratchpage_pvo[2];
struct	mtx	moea64_scratchpage_mtx;

/*
 * Large-page parameters.  A size of 0 means "not configured";
 * moea64_probe_large_page() then falls back to 16 MB pages and derives the
 * mask as size - 1.
 */
uint64_t	moea64_large_page_mask = 0;
uint64_t	moea64_large_page_size = 0;
int		moea64_large_page_shift = 0;
bool		moea64_has_lp_4k_16m = false;
 278
/*
 * PVO calls.
 *
 * File-local helpers that insert, remove and look up PVO entries.
 */
static int	moea64_pvo_enter(struct pvo_entry *pvo,
		    struct pvo_head *pvo_head, struct pvo_entry **oldpvo);
static void	moea64_pvo_remove_from_pmap(struct pvo_entry *pvo);
static void	moea64_pvo_remove_from_page(struct pvo_entry *pvo);
static void	moea64_pvo_remove_from_page_locked(
		    struct pvo_entry *pvo, vm_page_t m);
static struct	pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);

/*
 * Utility routines.
 */
static boolean_t	moea64_query_bit(vm_page_t, uint64_t);
static u_int		moea64_clear_bit(vm_page_t, uint64_t);
static void		moea64_kremove(vm_offset_t);
static void		moea64_syncicache(pmap_t pmap, vm_offset_t va,
			    vm_paddr_t pa, vm_size_t sz);
static void		moea64_pmap_init_qpages(void);
static void		moea64_remove_locked(pmap_t, vm_offset_t,
			    vm_offset_t, struct pvo_dlist *);
 301
/*
 * Superpages data and routines.
 */

/*
 * PVO flags (in vaddr) that must match for promotion to succeed.
 * Note that protection bits are checked separately, as they reside in
 * another field.
 */
#define	PVO_FLAGS_PROMOTE	(PVO_WIRED | PVO_MANAGED | PVO_PTEGIDX_VALID)

/*
 * True when the PVO has PVO_LARGE set and does not belong to kernel_pmap.
 * Evaluates 'pvo' twice, so the argument must have no side effects.
 */
#define	PVO_IS_SP(pvo)		(((pvo)->pvo_vaddr & PVO_LARGE) && \
				 (pvo)->pvo_pmap != kernel_pmap)

/* Get physical address from PVO. */
#define	PVO_PADDR(pvo)		moea64_pvo_paddr(pvo)

/* MD page flag indicating that the page is a superpage. */
#define	MDPG_ATTR_SP		0x40000000

SYSCTL_DECL(_vm_pmap);

/* Read-only counters exported under vm.pmap.sp. */
static SYSCTL_NODE(_vm_pmap, OID_AUTO, sp, CTLFLAG_RD, 0,
    "SP page mapping counters");

static u_long sp_demotions;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, demotions, CTLFLAG_RD,
    &sp_demotions, 0, "SP page demotions");

static u_long sp_mappings;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, mappings, CTLFLAG_RD,
    &sp_mappings, 0, "SP page mappings");

static u_long sp_p_failures;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_failures, CTLFLAG_RD,
    &sp_p_failures, 0, "SP page promotion failures");

static u_long sp_p_fail_pa;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_pa, CTLFLAG_RD,
    &sp_p_fail_pa, 0, "SP page promotion failure: PAs don't match");

static u_long sp_p_fail_flags;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_flags, CTLFLAG_RD,
    &sp_p_fail_flags, 0, "SP page promotion failure: page flags don't match");

static u_long sp_p_fail_prot;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_prot, CTLFLAG_RD,
    &sp_p_fail_prot, 0,
    "SP page promotion failure: page protections don't match");

static u_long sp_p_fail_wimg;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, p_fail_wimg, CTLFLAG_RD,
    &sp_p_fail_wimg, 0, "SP page promotion failure: WIMG bits don't match");

static u_long sp_promotions;
SYSCTL_ULONG(_vm_pmap_sp, OID_AUTO, promotions, CTLFLAG_RD,
    &sp_promotions, 0, "SP page promotions");

/* Superpage routines; all file-local. */
static bool moea64_ps_enabled(pmap_t);
static void moea64_align_superpage(vm_object_t, vm_ooffset_t,
    vm_offset_t *, vm_size_t);

static int moea64_sp_enter(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind);
static struct pvo_entry *moea64_sp_remove(struct pvo_entry *sp,
    struct pvo_dlist *tofree);

static void moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m);
static void moea64_sp_demote_aligned(struct pvo_entry *sp);
static void moea64_sp_demote(struct pvo_entry *pvo);

static struct pvo_entry *moea64_sp_unwire(struct pvo_entry *sp);
static struct pvo_entry *moea64_sp_protect(struct pvo_entry *sp,
    vm_prot_t prot);

static int64_t moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit);
static int64_t moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m,
    uint64_t ptebit);

static __inline bool moea64_sp_pvo_in_range(struct pvo_entry *pvo,
    vm_offset_t sva, vm_offset_t eva);
 383
/*
 * Kernel MMU interface
 *
 * Prototypes for the routines installed in the moea64_methods table below.
 * Most have external linkage; the ones marked static are private to this
 * file.
 */
void moea64_clear_modify(vm_page_t);
void moea64_copy_page(vm_page_t, vm_page_t);
void moea64_copy_page_dmap(vm_page_t, vm_page_t);
void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
void moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
    u_int flags, int8_t psind);
void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t moea64_extract(pmap_t, vm_offset_t);
vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
void moea64_init(void);
boolean_t moea64_is_modified(vm_page_t);
boolean_t moea64_is_prefaultable(pmap_t, vm_offset_t);
boolean_t moea64_is_referenced(vm_page_t);
int moea64_ts_referenced(vm_page_t);
vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
boolean_t moea64_page_exists_quick(pmap_t, vm_page_t);
void moea64_page_init(vm_page_t);
int moea64_page_wired_mappings(vm_page_t);
int moea64_pinit(pmap_t);
void moea64_pinit0(pmap_t);
void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void moea64_qenter(vm_offset_t, vm_page_t *, int);
void moea64_qremove(vm_offset_t, int);
void moea64_release(pmap_t);
void moea64_remove(pmap_t, vm_offset_t, vm_offset_t);
void moea64_remove_pages(pmap_t);
void moea64_remove_all(vm_page_t);
void moea64_remove_write(vm_page_t);
void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t);
void moea64_zero_page(vm_page_t);
void moea64_zero_page_dmap(vm_page_t);
void moea64_zero_page_area(vm_page_t, int, int);
void moea64_activate(struct thread *);
void moea64_deactivate(struct thread *);
void *moea64_mapdev(vm_paddr_t, vm_size_t);
void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
void moea64_unmapdev(vm_offset_t, vm_size_t);
vm_paddr_t moea64_kextract(vm_offset_t);
void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
void moea64_kenter(vm_offset_t, vm_paddr_t);
boolean_t moea64_dev_direct_mapped(vm_paddr_t, vm_size_t);
static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t);
void moea64_dumpsys_map(vm_paddr_t pa, size_t sz,
    void **va);
void moea64_scan_init(void);
vm_offset_t moea64_quick_enter_page(vm_page_t m);
vm_offset_t moea64_quick_enter_page_dmap(vm_page_t m);
void moea64_quick_remove_page(vm_offset_t addr);
boolean_t moea64_page_is_mapped(vm_page_t m);
static int moea64_map_user_ptr(pmap_t pm,
    volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
static int moea64_decode_kernel_ptr(vm_offset_t addr,
    int *is_user, vm_offset_t *decoded_addr);
static size_t moea64_scan_pmap(struct bitset *dump_bitset);
static void *moea64_dump_pmap_init(unsigned blkpgs);
#ifdef __powerpc64__
/* Only built, and only installed in the method table, on powerpc64. */
static void moea64_page_array_startup(long);
#endif
static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
 452
/*
 * Method table binding the pmap interface to this file's implementation.
 * It is registered under the name "mmu_oea64_base" by MMU_DEF() below.
 *
 * Note that the table installs the non-dmap variants of copy_page,
 * copy_pages, zero_page and quick_enter_page; the *_dmap variants declared
 * above are not referenced here.
 */
static struct pmap_funcs moea64_methods = {
	.clear_modify = moea64_clear_modify,
	.copy_page = moea64_copy_page,
	.copy_pages = moea64_copy_pages,
	.enter = moea64_enter,
	.enter_object = moea64_enter_object,
	.enter_quick = moea64_enter_quick,
	.extract = moea64_extract,
	.extract_and_hold = moea64_extract_and_hold,
	.init = moea64_init,
	.is_modified = moea64_is_modified,
	.is_prefaultable = moea64_is_prefaultable,
	.is_referenced = moea64_is_referenced,
	.ts_referenced = moea64_ts_referenced,
	.map =      		moea64_map,
	.mincore = moea64_mincore,
	.page_exists_quick = moea64_page_exists_quick,
	.page_init = moea64_page_init,
	.page_wired_mappings = moea64_page_wired_mappings,
	.pinit = moea64_pinit,
	.pinit0 = moea64_pinit0,
	.protect = moea64_protect,
	.qenter = moea64_qenter,
	.qremove = moea64_qremove,
	.release = moea64_release,
	.remove = moea64_remove,
	.remove_pages = moea64_remove_pages,
	.remove_all =       	moea64_remove_all,
	.remove_write = moea64_remove_write,
	.sync_icache = moea64_sync_icache,
	.unwire = moea64_unwire,
	.zero_page =        	moea64_zero_page,
	.zero_page_area = moea64_zero_page_area,
	.activate = moea64_activate,
	.deactivate =       	moea64_deactivate,
	.page_set_memattr = moea64_page_set_memattr,
	.quick_enter_page =  moea64_quick_enter_page,
	.quick_remove_page =  moea64_quick_remove_page,
	.page_is_mapped = moea64_page_is_mapped,
#ifdef __powerpc64__
	.page_array_startup = moea64_page_array_startup,
#endif
	.ps_enabled = moea64_ps_enabled,
	.align_superpage = moea64_align_superpage,

	/* Internal interfaces */
	.mapdev = moea64_mapdev,
	.mapdev_attr = moea64_mapdev_attr,
	.unmapdev = moea64_unmapdev,
	.kextract = moea64_kextract,
	.kenter = moea64_kenter,
	.kenter_attr = moea64_kenter_attr,
	.dev_direct_mapped = moea64_dev_direct_mapped,
	.dumpsys_pa_init = moea64_scan_init,
	.dumpsys_scan_pmap = moea64_scan_pmap,
	.dumpsys_dump_pmap_init =    moea64_dump_pmap_init,
	.dumpsys_map_chunk = moea64_dumpsys_map,
	.map_user_ptr = moea64_map_user_ptr,
	.decode_kernel_ptr =  moea64_decode_kernel_ptr,
};

MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
 515
 516 /*
 517  * Get physical address from PVO.
 518  *
 519  * For superpages, the lower bits are not stored on pvo_pte.pa and must be
 520  * obtained from VA.
 521  */
 522 static __always_inline vm_paddr_t
 523 moea64_pvo_paddr(struct pvo_entry *pvo)
 524 {
 525         vm_paddr_t pa;
 526
 527         pa = (pvo)->pvo_pte.pa & LPTE_RPGN;
 528
 529         if (PVO_IS_SP(pvo)) {
 530                 pa &= ~HPT_SP_MASK; /* This is needed to clear LPTE_LP bits. */
 531                 pa |= PVO_VADDR(pvo) & HPT_SP_MASK;
 532         }
 533         return (pa);
 534 }
 535
 536 static struct pvo_head *
 537 vm_page_to_pvoh(vm_page_t m)
 538 {
 539
 540         mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED);
 541         return (&m->md.mdpg_pvoh);
 542 }
 543
 544 static struct pvo_entry *
 545 alloc_pvo_entry(int bootstrap)
 546 {
 547         struct pvo_entry *pvo;
 548
 549         if (!moea64_initialized || bootstrap) {
 550                 if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) {
 551                         panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd."
 552                             "Try setting machdep.moea64_bpvo_pool_size tunable",
 553                             __func__, moea64_bpvo_pool_index,
 554                             moea64_bpvo_pool_size,
 555                             moea64_bpvo_pool_size * sizeof(struct pvo_entry));
 556                 }
 557                 pvo = &moea64_bpvo_pool[
 558                     atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)];
 559                 bzero(pvo, sizeof(*pvo));
 560                 pvo->pvo_vaddr = PVO_BOOTSTRAP;
 561         } else
 562                 pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO);
 563
 564         return (pvo);
 565 }
 566
 567 static void
 568 init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
 569 {
 570         uint64_t vsid;
 571         uint64_t hash;
 572         int shift;
 573
 574         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 575
 576         pvo->pvo_pmap = pmap;
 577         va &= ~ADDR_POFF;
 578         pvo->pvo_vaddr |= va;
 579         vsid = va_to_vsid(pmap, va);
 580         pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
 581             | (vsid << 16);
 582
 583         if (pmap == kernel_pmap && (pvo->pvo_vaddr & PVO_LARGE) != 0)
 584                 shift = moea64_large_page_shift;
 585         else
 586                 shift = ADDR_PIDX_SHFT;
 587         hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
 588         pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
 589 }
 590
 591 static void
 592 free_pvo_entry(struct pvo_entry *pvo)
 593 {
 594
 595         if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
 596                 uma_zfree(moea64_pvo_zone, pvo);
 597 }
 598
 599 void
 600 moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte)
 601 {
 602
 603         lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo);
 604         lpte->pte_hi |= LPTE_VALID;
 605
 606         if (pvo->pvo_vaddr & PVO_LARGE)
 607                 lpte->pte_hi |= LPTE_BIG;
 608         if (pvo->pvo_vaddr & PVO_WIRED)
 609                 lpte->pte_hi |= LPTE_WIRED;
 610         if (pvo->pvo_vaddr & PVO_HID)
 611                 lpte->pte_hi |= LPTE_HID;
 612
 613         lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */
 614         if (pvo->pvo_pte.prot & VM_PROT_WRITE)
 615                 lpte->pte_lo |= LPTE_BW;
 616         else
 617                 lpte->pte_lo |= LPTE_BR;
 618
 619         if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE))
 620                 lpte->pte_lo |= LPTE_NOEXEC;
 621 }
 622
 623 static __inline uint64_t
 624 moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
 625 {
 626         uint64_t pte_lo;
 627         int i;
 628
 629         if (ma != VM_MEMATTR_DEFAULT) {
 630                 switch (ma) {
 631                 case VM_MEMATTR_UNCACHEABLE:
 632                         return (LPTE_I | LPTE_G);
 633                 case VM_MEMATTR_CACHEABLE:
 634                         return (LPTE_M);
 635                 case VM_MEMATTR_WRITE_COMBINING:
 636                 case VM_MEMATTR_WRITE_BACK:
 637                 case VM_MEMATTR_PREFETCHABLE:
 638                         return (LPTE_I);
 639                 case VM_MEMATTR_WRITE_THROUGH:
 640                         return (LPTE_W | LPTE_M);
 641                 }
 642         }
 643
 644         /*
 645          * Assume the page is cache inhibited and access is guarded unless
 646          * it's in our available memory array.
 647          */
 648         pte_lo = LPTE_I | LPTE_G;
 649         for (i = 0; i < pregions_sz; i++) {
 650                 if ((pa >= pregions[i].mr_start) &&
 651                     (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
 652                         pte_lo &= ~(LPTE_I | LPTE_G);
 653                         pte_lo |= LPTE_M;
 654                         break;
 655                 }
 656         }
 657
 658         return pte_lo;
 659 }
 660
 661 /*
 662  * Quick sort callout for comparing memory regions.
 663  */
 664 static int      om_cmp(const void *a, const void *b);
 665
 666 static int
 667 om_cmp(const void *a, const void *b)
 668 {
 669         const struct    ofw_map *mapa;
 670         const struct    ofw_map *mapb;
 671
 672         mapa = a;
 673         mapb = b;
 674         if (mapa->om_pa < mapb->om_pa)
 675                 return (-1);
 676         else if (mapa->om_pa > mapb->om_pa)
 677                 return (1);
 678         else
 679                 return (0);
 680 }
 681
 682 static void
 683 moea64_add_ofw_mappings(phandle_t mmu, size_t sz)
 684 {
 685         struct ofw_map  translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */
 686         pcell_t         acells, trans_cells[sz/sizeof(cell_t)];
 687         struct pvo_entry *pvo;
 688         register_t      msr;
 689         vm_offset_t     off;
 690         vm_paddr_t      pa_base;
 691         int             i, j;
 692
 693         bzero(translations, sz);
 694         OF_getencprop(OF_finddevice("/"), "#address-cells", &acells,
 695             sizeof(acells));
 696         if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1)
 697                 panic("moea64_bootstrap: can't get ofw translations");
 698
 699         CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
 700         sz /= sizeof(cell_t);
 701         for (i = 0, j = 0; i < sz; j++) {
 702                 translations[j].om_va = trans_cells[i++];
 703                 translations[j].om_len = trans_cells[i++];
 704                 translations[j].om_pa = trans_cells[i++];
 705                 if (acells == 2) {
 706                         translations[j].om_pa <<= 32;
 707                         translations[j].om_pa |= trans_cells[i++];
 708                 }
 709                 translations[j].om_mode = trans_cells[i++];
 710         }
 711         KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)",
 712             i, sz));
 713
 714         sz = j;
 715         qsort(translations, sz, sizeof (*translations), om_cmp);
 716
 717         for (i = 0; i < sz; i++) {
 718                 pa_base = translations[i].om_pa;
 719               #ifndef __powerpc64__
 720                 if ((translations[i].om_pa >> 32) != 0)
 721                         panic("OFW translations above 32-bit boundary!");
 722               #endif
 723
 724                 if (pa_base % PAGE_SIZE)
 725                         panic("OFW translation not page-aligned (phys)!");
 726                 if (translations[i].om_va % PAGE_SIZE)
 727                         panic("OFW translation not page-aligned (virt)!");
 728
 729                 CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x",
 730                     pa_base, translations[i].om_va, translations[i].om_len);
 731
 732                 /* Now enter the pages for this mapping */
 733
 734                 DISABLE_TRANS(msr);
 735                 for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
 736                         /* If this address is direct-mapped, skip remapping */
 737                         if (hw_direct_map &&
 738                             translations[i].om_va == PHYS_TO_DMAP(pa_base) &&
 739                             moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT)
 740                             == LPTE_M)
 741                                 continue;
 742
 743                         PMAP_LOCK(kernel_pmap);
 744                         pvo = moea64_pvo_find_va(kernel_pmap,
 745                             translations[i].om_va + off);
 746                         PMAP_UNLOCK(kernel_pmap);
 747                         if (pvo != NULL)
 748                                 continue;
 749
 750                         moea64_kenter(translations[i].om_va + off,
 751                             pa_base + off);
 752                 }
 753                 ENABLE_TRANS(msr);
 754         }
 755 }
 756
 757 #ifdef __powerpc64__
 758 static void
 759 moea64_probe_large_page(void)
 760 {
 761         uint16_t pvr = mfpvr() >> 16;
 762
 763         switch (pvr) {
 764         case IBM970:
 765         case IBM970FX:
 766         case IBM970MP:
 767                 powerpc_sync(); isync();
 768                 mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
 769                 powerpc_sync(); isync();
 770
 771                 /* FALLTHROUGH */
 772         default:
 773                 if (moea64_large_page_size == 0) {
 774                         moea64_large_page_size = 0x1000000; /* 16 MB */
 775                         moea64_large_page_shift = 24;
 776                 }
 777         }
 778
 779         moea64_large_page_mask = moea64_large_page_size - 1;
 780 }
 781
 782 static void
 783 moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
 784 {
 785         struct slb *cache;
 786         struct slb entry;
 787         uint64_t esid, slbe;
 788         uint64_t i;
 789
 790         cache = PCPU_GET(aim.slb);
 791         esid = va >> ADDR_SR_SHFT;
 792         slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
 793
 794         for (i = 0; i < 64; i++) {
 795                 if (cache[i].slbe == (slbe | i))
 796                         return;
 797         }
 798
 799         entry.slbe = slbe;
 800         entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
 801         if (large)
 802                 entry.slbv |= SLBV_L;
 803
 804         slb_insert_kernel(entry.slbe, entry.slbv);
 805 }
 806 #endif
 807
 808 static int
 809 moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap)
 810 {
 811         struct pvo_entry *pvo;
 812         uint64_t pte_lo;
 813         int error;
 814
 815         pte_lo = LPTE_M;
 816         pte_lo |= attr;
 817
 818         pvo = alloc_pvo_entry(bootstrap);
 819         pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE;
 820         init_pvo_entry(pvo, kernel_pmap, va);
 821
 822         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE |
 823             VM_PROT_EXECUTE;
 824         pvo->pvo_pte.pa = pa | pte_lo;
 825         error = moea64_pvo_enter(pvo, NULL, NULL);
 826         if (error != 0)
 827                 panic("Error %d inserting large page\n", error);
 828         return (0);
 829 }
 830
 831 static void
 832 moea64_setup_direct_map(vm_offset_t kernelstart,
 833     vm_offset_t kernelend)
 834 {
 835         register_t msr;
 836         vm_paddr_t pa, pkernelstart, pkernelend;
 837         vm_offset_t size, off;
 838         uint64_t pte_lo;
 839         int i;
 840
 841         if (moea64_large_page_size == 0)
 842                 hw_direct_map = 0;
 843
 844         DISABLE_TRANS(msr);
 845         if (hw_direct_map) {
 846                 PMAP_LOCK(kernel_pmap);
 847                 for (i = 0; i < pregions_sz; i++) {
 848                   for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
 849                      pregions[i].mr_size; pa += moea64_large_page_size) {
 850                         pte_lo = LPTE_M;
 851                         if (pa & moea64_large_page_mask) {
 852                                 pa &= moea64_large_page_mask;
 853                                 pte_lo |= LPTE_G;
 854                         }
 855                         if (pa + moea64_large_page_size >
 856                             pregions[i].mr_start + pregions[i].mr_size)
 857                                 pte_lo |= LPTE_G;
 858
 859                         moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1);
 860                   }
 861                 }
 862                 PMAP_UNLOCK(kernel_pmap);
 863         }
 864
 865         /*
 866          * Make sure the kernel and BPVO pool stay mapped on systems either
 867          * without a direct map or on which the kernel is not already executing
 868          * out of the direct-mapped region.
 869          */
 870         if (kernelstart < DMAP_BASE_ADDRESS) {
 871                 /*
 872                  * For pre-dmap execution, we need to use identity mapping
 873                  * because we will be operating with the mmu on but in the
 874                  * wrong address configuration until we __restartkernel().
 875                  */
 876                 for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
 877                     pa += PAGE_SIZE)
 878                         moea64_kenter(pa, pa);
 879         } else if (!hw_direct_map) {
 880                 pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS;
 881                 pkernelend = kernelend & ~DMAP_BASE_ADDRESS;
 882                 for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend;
 883                     pa += PAGE_SIZE)
 884                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
 885         }
 886
 887         if (!hw_direct_map) {
 888                 size = moea64_bpvo_pool_size*sizeof(struct pvo_entry);
 889                 off = (vm_offset_t)(moea64_bpvo_pool);
 890                 for (pa = off; pa < off + size; pa += PAGE_SIZE)
 891                         moea64_kenter(pa, pa);
 892
 893                 /* Map exception vectors */
 894                 for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE)
 895                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
 896         }
 897         ENABLE_TRANS(msr);
 898
 899         /*
 900          * Allow user to override unmapped_buf_allowed for testing.
 901          * XXXKIB Only direct map implementation was tested.
 902          */
 903         if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
 904             &unmapped_buf_allowed))
 905                 unmapped_buf_allowed = hw_direct_map;
 906 }
 907
 908 /* Quick sort callout for comparing physical addresses. */
 909 static int
 910 pa_cmp(const void *a, const void *b)
 911 {
 912         const vm_paddr_t *pa = a, *pb = b;
 913
 914         if (*pa < *pb)
 915                 return (-1);
 916         else if (*pa > *pb)
 917                 return (1);
 918         else
 919                 return (0);
 920 }
 921
 922 void
 923 moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 924 {
 925         int             i, j;
 926         vm_size_t       physsz, hwphyssz;
 927         vm_paddr_t      kernelphysstart, kernelphysend;
 928         int             rm_pavail;
 929
 930         /* Level 0 reservations consist of 4096 pages (16MB superpage). */
 931         vm_level_0_order = 12;
 932
 933 #ifndef __powerpc64__
 934         /* We don't have a direct map since there is no BAT */
 935         hw_direct_map = 0;
 936
 937         /* Make sure battable is zero, since we have no BAT */
 938         for (i = 0; i < 16; i++) {
 939                 battable[i].batu = 0;
 940                 battable[i].batl = 0;
 941         }
 942 #else
 943         /* Install trap handlers for SLBs */
 944         bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap);
 945         bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap);
 946         __syncicache((void *)EXC_DSE, 0x80);
 947         __syncicache((void *)EXC_ISE, 0x80);
 948 #endif
 949
 950         kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS;
 951         kernelphysend = kernelend & ~DMAP_BASE_ADDRESS;
 952
 953         /* Get physical memory regions from firmware */
 954         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
 955         CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
 956
 957         if (PHYS_AVAIL_ENTRIES < regions_sz)
 958                 panic("moea64_bootstrap: phys_avail too small");
 959
 960         phys_avail_count = 0;
 961         physsz = 0;
 962         hwphyssz = 0;
 963         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
 964         for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
 965                 CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)",
 966                     regions[i].mr_start, regions[i].mr_start +
 967                     regions[i].mr_size, regions[i].mr_size);
 968                 if (hwphyssz != 0 &&
 969                     (physsz + regions[i].mr_size) >= hwphyssz) {
 970                         if (physsz < hwphyssz) {
 971                                 phys_avail[j] = regions[i].mr_start;
 972                                 phys_avail[j + 1] = regions[i].mr_start +
 973                                     hwphyssz - physsz;
 974                                 physsz = hwphyssz;
 975                                 phys_avail_count++;
 976                                 dump_avail[j] = phys_avail[j];
 977                                 dump_avail[j + 1] = phys_avail[j + 1];
 978                         }
 979                         break;
 980                 }
 981                 phys_avail[j] = regions[i].mr_start;
 982                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
 983                 phys_avail_count++;
 984                 physsz += regions[i].mr_size;
 985                 dump_avail[j] = phys_avail[j];
 986                 dump_avail[j + 1] = phys_avail[j + 1];
 987         }
 988
 989         /* Check for overlap with the kernel and exception vectors */
 990         rm_pavail = 0;
 991         for (j = 0; j < 2*phys_avail_count; j+=2) {
 992                 if (phys_avail[j] < EXC_LAST)
 993                         phys_avail[j] += EXC_LAST;
 994
 995                 if (phys_avail[j] >= kernelphysstart &&
 996                     phys_avail[j+1] <= kernelphysend) {
 997                         phys_avail[j] = phys_avail[j+1] = ~0;
 998                         rm_pavail++;
 999                         continue;
1000                 }
1001
1002                 if (kernelphysstart >= phys_avail[j] &&
1003                     kernelphysstart < phys_avail[j+1]) {
1004                         if (kernelphysend < phys_avail[j+1]) {
1005                                 phys_avail[2*phys_avail_count] =
1006                                     (kernelphysend & ~PAGE_MASK) + PAGE_SIZE;
1007                                 phys_avail[2*phys_avail_count + 1] =
1008                                     phys_avail[j+1];
1009                                 phys_avail_count++;
1010                         }
1011
1012                         phys_avail[j+1] = kernelphysstart & ~PAGE_MASK;
1013                 }
1014
1015                 if (kernelphysend >= phys_avail[j] &&
1016                     kernelphysend < phys_avail[j+1]) {
1017                         if (kernelphysstart > phys_avail[j]) {
1018                                 phys_avail[2*phys_avail_count] = phys_avail[j];
1019                                 phys_avail[2*phys_avail_count + 1] =
1020                                     kernelphysstart & ~PAGE_MASK;
1021                                 phys_avail_count++;
1022                         }
1023
1024                         phys_avail[j] = (kernelphysend & ~PAGE_MASK) +
1025                             PAGE_SIZE;
1026                 }
1027         }
1028
1029         /* Remove physical available regions marked for removal (~0) */
1030         if (rm_pavail) {
1031                 qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]),
1032                         pa_cmp);
1033                 phys_avail_count -= rm_pavail;
1034                 for (i = 2*phys_avail_count;
1035                      i < 2*(phys_avail_count + rm_pavail); i+=2)
1036                         phys_avail[i] = phys_avail[i+1] = 0;
1037         }
1038
1039         physmem = btoc(physsz);
1040
1041 #ifdef PTEGCOUNT
1042         moea64_pteg_count = PTEGCOUNT;
1043 #else
1044         moea64_pteg_count = 0x1000;
1045
1046         while (moea64_pteg_count < physmem)
1047                 moea64_pteg_count <<= 1;
1048
1049         moea64_pteg_count >>= 1;
1050 #endif /* PTEGCOUNT */
1051 }
1052
1053 void
1054 moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
1055 {
1056         int             i;
1057
1058         /*
1059          * Set PTEG mask
1060          */
1061         moea64_pteg_mask = moea64_pteg_count - 1;
1062
1063         /*
1064          * Initialize SLB table lock and page locks
1065          */
1066         mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
1067         for (i = 0; i < PV_LOCK_COUNT; i++)
1068                 mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF);
1069
1070         /*
1071          * Initialise the bootstrap pvo pool.
1072          */
1073         TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
1074         if (moea64_bpvo_pool_size == 0) {
1075                 if (!hw_direct_map)
1076                         moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) /
1077                             (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR;
1078                 else
1079                         moea64_bpvo_pool_size = BPVO_POOL_SIZE;
1080         }
1081
1082         if (boothowto & RB_VERBOSE) {
1083                 printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n",
1084                     moea64_bpvo_pool_size,
1085                     moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576);
1086         }
1087
1088         moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
1089                 moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE);
1090         moea64_bpvo_pool_index = 0;
1091
1092         /* Place at address usable through the direct map */
1093         if (hw_direct_map)
1094                 moea64_bpvo_pool = (struct pvo_entry *)
1095                     PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool);
1096
1097         /*
1098          * Make sure kernel vsid is allocated as well as VSID 0.
1099          */
1100         #ifndef __powerpc64__
1101         moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
1102                 |= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
1103         moea64_vsid_bitmap[0] |= 1;
1104         #endif
1105
1106         /*
1107          * Initialize the kernel pmap (which is statically allocated).
1108          */
1109         #ifdef __powerpc64__
1110         for (i = 0; i < 64; i++) {
1111                 pcpup->pc_aim.slb[i].slbv = 0;
1112                 pcpup->pc_aim.slb[i].slbe = 0;
1113         }
1114         #else
1115         for (i = 0; i < 16; i++)
1116                 kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
1117         #endif
1118
1119         kernel_pmap->pmap_phys = kernel_pmap;
1120         CPU_FILL(&kernel_pmap->pm_active);
1121         RB_INIT(&kernel_pmap->pmap_pvo);
1122
1123         PMAP_LOCK_INIT(kernel_pmap);
1124
1125         /*
1126          * Now map in all the other buffers we allocated earlier
1127          */
1128
1129         moea64_setup_direct_map(kernelstart, kernelend);
1130 }
1131
1132 void
1133 moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
1134 {
1135         ihandle_t       mmui;
1136         phandle_t       chosen;
1137         phandle_t       mmu;
1138         ssize_t         sz;
1139         int             i;
1140         vm_offset_t     pa, va;
1141         void            *dpcpu;
1142
1143         /*
1144          * Set up the Open Firmware pmap and add its mappings if not in real
1145          * mode.
1146          */
1147
1148         chosen = OF_finddevice("/chosen");
1149         if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) {
1150                 mmu = OF_instance_to_package(mmui);
1151                 if (mmu == -1 ||
1152                     (sz = OF_getproplen(mmu, "translations")) == -1)
1153                         sz = 0;
1154                 if (sz > 6144 /* tmpstksz - 2 KB headroom */)
1155                         panic("moea64_bootstrap: too many ofw translations");
1156
1157                 if (sz > 0)
1158                         moea64_add_ofw_mappings(mmu, sz);
1159         }
1160
1161         /*
1162          * Calculate the last available physical address.
1163          */
1164         Maxmem = 0;
1165         for (i = 0; phys_avail[i + 1] != 0; i += 2)
1166                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
1167
1168         /*
1169          * Initialize MMU.
1170          */
1171         pmap_cpu_bootstrap(0);
1172         mtmsr(mfmsr() | PSL_DR | PSL_IR);
1173         pmap_bootstrapped++;
1174
1175         /*
1176          * Set the start and end of kva.
1177          */
1178         virtual_avail = VM_MIN_KERNEL_ADDRESS;
1179         virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
1180
1181         /*
1182          * Map the entire KVA range into the SLB. We must not fault there.
1183          */
1184         #ifdef __powerpc64__
1185         for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
1186                 moea64_bootstrap_slb_prefault(va, 0);
1187         #endif
1188
1189         /*
1190          * Remap any early IO mappings (console framebuffer, etc.)
1191          */
1192         bs_remap_earlyboot();
1193
1194         /*
1195          * Figure out how far we can extend virtual_end into segment 16
1196          * without running into existing mappings. Segment 16 is guaranteed
1197          * to contain neither RAM nor devices (at least on Apple hardware),
1198          * but will generally contain some OFW mappings we should not
1199          * step on.
1200          */
1201
1202         #ifndef __powerpc64__   /* KVA is in high memory on PPC64 */
1203         PMAP_LOCK(kernel_pmap);
1204         while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
1205             moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
1206                 virtual_end += PAGE_SIZE;
1207         PMAP_UNLOCK(kernel_pmap);
1208         #endif
1209
1210         /*
1211          * Allocate a kernel stack with a guard page for thread0 and map it
1212          * into the kernel page map.
1213          */
1214         pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
1215         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
1216         virtual_avail = va + kstack_pages * PAGE_SIZE;
1217         CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
1218         thread0.td_kstack = va;
1219         thread0.td_kstack_pages = kstack_pages;
1220         for (i = 0; i < kstack_pages; i++) {
1221                 moea64_kenter(va, pa);
1222                 pa += PAGE_SIZE;
1223                 va += PAGE_SIZE;
1224         }
1225
1226         /*
1227          * Allocate virtual address space for the message buffer.
1228          */
1229         pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
1230         msgbufp = (struct msgbuf *)virtual_avail;
1231         va = virtual_avail;
1232         virtual_avail += round_page(msgbufsize);
1233         while (va < virtual_avail) {
1234                 moea64_kenter(va, pa);
1235                 pa += PAGE_SIZE;
1236                 va += PAGE_SIZE;
1237         }
1238
1239         /*
1240          * Allocate virtual address space for the dynamic percpu area.
1241          */
1242         pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
1243         dpcpu = (void *)virtual_avail;
1244         va = virtual_avail;
1245         virtual_avail += DPCPU_SIZE;
1246         while (va < virtual_avail) {
1247                 moea64_kenter(va, pa);
1248                 pa += PAGE_SIZE;
1249                 va += PAGE_SIZE;
1250         }
1251         dpcpu_init(dpcpu, curcpu);
1252
1253         crashdumpmap = (caddr_t)virtual_avail;
1254         virtual_avail += MAXDUMPPGS * PAGE_SIZE;
1255
1256         /*
1257          * Allocate some things for page zeroing. We put this directly
1258          * in the page table and use MOEA64_PTE_REPLACE to avoid any
1259          * of the PVO book-keeping or other parts of the VM system
1260          * from even knowing that this hack exists.
1261          */
1262
1263         if (!hw_direct_map) {
1264                 mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
1265                     MTX_DEF);
1266                 for (i = 0; i < 2; i++) {
1267                         moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
1268                         virtual_end -= PAGE_SIZE;
1269
1270                         moea64_kenter(moea64_scratchpage_va[i], 0);
1271
1272                         PMAP_LOCK(kernel_pmap);
1273                         moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
1274                             kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
1275                         PMAP_UNLOCK(kernel_pmap);
1276                 }
1277         }
1278
1279         numa_mem_regions(&numa_pregions, &numapregions_sz);
1280 }
1281
1282 static void
1283 moea64_pmap_init_qpages(void)
1284 {
1285         struct pcpu *pc;
1286         int i;
1287
1288         if (hw_direct_map)
1289                 return;
1290
1291         CPU_FOREACH(i) {
1292                 pc = pcpu_find(i);
1293                 pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
1294                 if (pc->pc_qmap_addr == 0)
1295                         panic("pmap_init_qpages: unable to allocate KVA");
1296                 PMAP_LOCK(kernel_pmap);
1297                 pc->pc_aim.qmap_pvo =
1298                     moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr);
1299                 PMAP_UNLOCK(kernel_pmap);
1300                 mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
1301         }
1302 }
1303
1304 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);
1305
1306 /*
1307  * Activate a user pmap.  This mostly involves setting some non-CPU
1308  * state.
1309  */
1310 void
1311 moea64_activate(struct thread *td)
1312 {
1313         pmap_t  pm;
1314
1315         pm = &td->td_proc->p_vmspace->vm_pmap;
1316         CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1317
1318         #ifdef __powerpc64__
1319         PCPU_SET(aim.userslb, pm->pm_slb);
1320         __asm __volatile("slbmte %0, %1; isync" ::
1321             "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
1322         #else
1323         PCPU_SET(curpmap, pm->pmap_phys);
1324         mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
1325         #endif
1326 }
1327
1328 void
1329 moea64_deactivate(struct thread *td)
1330 {
1331         pmap_t  pm;
1332
1333         __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR));
1334
1335         pm = &td->td_proc->p_vmspace->vm_pmap;
1336         CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1337         #ifdef __powerpc64__
1338         PCPU_SET(aim.userslb, NULL);
1339         #else
1340         PCPU_SET(curpmap, NULL);
1341         #endif
1342 }
1343
1344 void
1345 moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1346 {
1347         struct  pvo_entry key, *pvo;
1348         vm_page_t m;
1349         int64_t refchg;
1350
1351         key.pvo_vaddr = sva;
1352         PMAP_LOCK(pm);
1353         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1354             pvo != NULL && PVO_VADDR(pvo) < eva;
1355             pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1356                 if (PVO_IS_SP(pvo)) {
1357                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
1358                                 pvo = moea64_sp_unwire(pvo);
1359                                 continue;
1360                         } else {
1361                                 CTR1(KTR_PMAP, "%s: demote before unwire",
1362                                     __func__);
1363                                 moea64_sp_demote(pvo);
1364                         }
1365                 }
1366
1367                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1368                         panic("moea64_unwire: pvo %p is missing PVO_WIRED",
1369                             pvo);
1370                 pvo->pvo_vaddr &= ~PVO_WIRED;
1371                 refchg = moea64_pte_replace(pvo, 0 /* No invalidation */);
1372                 if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1373                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1374                         if (refchg < 0)
1375                                 refchg = LPTE_CHG;
1376                         m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1377
1378                         refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
1379                         if (refchg & LPTE_CHG)
1380                                 vm_page_dirty(m);
1381                         if (refchg & LPTE_REF)
1382                                 vm_page_aflag_set(m, PGA_REFERENCED);
1383                 }
1384                 pm->pm_stats.wired_count--;
1385         }
1386         PMAP_UNLOCK(pm);
1387 }
1388
1389 static int
1390 moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
1391 {
1392         struct pvo_entry *pvo;
1393         vm_paddr_t pa;
1394         vm_page_t m;
1395         int val;
1396         bool managed;
1397
1398         PMAP_LOCK(pmap);
1399
1400         pvo = moea64_pvo_find_va(pmap, addr);
1401         if (pvo != NULL) {
1402                 pa = PVO_PADDR(pvo);
1403                 m = PHYS_TO_VM_PAGE(pa);
1404                 managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED;
1405                 if (PVO_IS_SP(pvo))
1406                         val = MINCORE_INCORE | MINCORE_PSIND(1);
1407                 else
1408                         val = MINCORE_INCORE;
1409         } else {
1410                 PMAP_UNLOCK(pmap);
1411                 return (0);
1412         }
1413
1414         PMAP_UNLOCK(pmap);
1415
1416         if (m == NULL)
1417                 return (0);
1418
1419         if (managed) {
1420                 if (moea64_is_modified(m))
1421                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
1422
1423                 if (moea64_is_referenced(m))
1424                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
1425         }
1426
1427         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
1428             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
1429             managed) {
1430                 *pap = pa;
1431         }
1432
1433         return (val);
1434 }
1435
1436 /*
1437  * This goes through and sets the physical address of our
1438  * special scratch PTE to the PA we want to zero or copy. Because
1439  * of locking issues (this can get called in pvo_enter() by
1440  * the UMA allocator), we can't use most other utility functions here
1441  */
1442
1443 static __inline
1444 void moea64_set_scratchpage_pa(int which, vm_paddr_t pa)
1445 {
1446         struct pvo_entry *pvo;
1447
1448         KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
1449         mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
1450
1451         pvo = moea64_scratchpage_pvo[which];
1452         PMAP_LOCK(pvo->pvo_pmap);
1453         pvo->pvo_pte.pa =
1454             moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
1455         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1456         PMAP_UNLOCK(pvo->pvo_pmap);
1457         isync();
1458 }
1459
1460 void
1461 moea64_copy_page(vm_page_t msrc, vm_page_t mdst)
1462 {
1463         mtx_lock(&moea64_scratchpage_mtx);
1464
1465         moea64_set_scratchpage_pa(0, VM_PAGE_TO_PHYS(msrc));
1466         moea64_set_scratchpage_pa(1, VM_PAGE_TO_PHYS(mdst));
1467
1468         bcopy((void *)moea64_scratchpage_va[0],
1469             (void *)moea64_scratchpage_va[1], PAGE_SIZE);
1470
1471         mtx_unlock(&moea64_scratchpage_mtx);
1472 }
1473
1474 void
1475 moea64_copy_page_dmap(vm_page_t msrc, vm_page_t mdst)
1476 {
1477         vm_offset_t     dst;
1478         vm_offset_t     src;
1479
1480         dst = VM_PAGE_TO_PHYS(mdst);
1481         src = VM_PAGE_TO_PHYS(msrc);
1482
1483         bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst),
1484             PAGE_SIZE);
1485 }
1486
1487 inline void
1488 moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
1489     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1490 {
1491         void *a_cp, *b_cp;
1492         vm_offset_t a_pg_offset, b_pg_offset;
1493         int cnt;
1494
1495         while (xfersize > 0) {
1496                 a_pg_offset = a_offset & PAGE_MASK;
1497                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1498                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1499                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
1500                     a_pg_offset;
1501                 b_pg_offset = b_offset & PAGE_MASK;
1502                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1503                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1504                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
1505                     b_pg_offset;
1506                 bcopy(a_cp, b_cp, cnt);
1507                 a_offset += cnt;
1508                 b_offset += cnt;
1509                 xfersize -= cnt;
1510         }
1511 }
1512
1513 void
1514 moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
1515     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1516 {
1517         void *a_cp, *b_cp;
1518         vm_offset_t a_pg_offset, b_pg_offset;
1519         int cnt;
1520
1521         mtx_lock(&moea64_scratchpage_mtx);
1522         while (xfersize > 0) {
1523                 a_pg_offset = a_offset & PAGE_MASK;
1524                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1525                 moea64_set_scratchpage_pa(0,
1526                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
1527                 a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
1528                 b_pg_offset = b_offset & PAGE_MASK;
1529                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1530                 moea64_set_scratchpage_pa(1,
1531                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
1532                 b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
1533                 bcopy(a_cp, b_cp, cnt);
1534                 a_offset += cnt;
1535                 b_offset += cnt;
1536                 xfersize -= cnt;
1537         }
1538         mtx_unlock(&moea64_scratchpage_mtx);
1539 }
1540
1541 void
1542 moea64_zero_page_area(vm_page_t m, int off, int size)
1543 {
1544         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1545
1546         if (size + off > PAGE_SIZE)
1547                 panic("moea64_zero_page: size + off > PAGE_SIZE");
1548
1549         if (hw_direct_map) {
1550                 bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size);
1551         } else {
1552                 mtx_lock(&moea64_scratchpage_mtx);
1553                 moea64_set_scratchpage_pa(0, pa);
1554                 bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
1555                 mtx_unlock(&moea64_scratchpage_mtx);
1556         }
1557 }
1558
1559 /*
1560  * Zero a page of physical memory by temporarily mapping it
1561  */
1562 void
1563 moea64_zero_page(vm_page_t m)
1564 {
1565         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1566         vm_offset_t va, off;
1567
1568         mtx_lock(&moea64_scratchpage_mtx);
1569
1570         moea64_set_scratchpage_pa(0, pa);
1571         va = moea64_scratchpage_va[0];
1572
1573         for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1574                 __asm __volatile("dcbz 0,%0" :: "r"(va + off));
1575
1576         mtx_unlock(&moea64_scratchpage_mtx);
1577 }
1578
1579 void
1580 moea64_zero_page_dmap(vm_page_t m)
1581 {
1582         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1583         vm_offset_t va, off;
1584
1585         va = PHYS_TO_DMAP(pa);
1586         for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1587                 __asm __volatile("dcbz 0,%0" :: "r"(va + off));
1588 }
1589
1590 vm_offset_t
1591 moea64_quick_enter_page(vm_page_t m)
1592 {
1593         struct pvo_entry *pvo;
1594         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1595
1596         /*
1597          * MOEA64_PTE_REPLACE does some locking, so we can't just grab
1598          * a critical section and access the PCPU data like on i386.
1599          * Instead, pin the thread and grab the PCPU lock to prevent
1600          * a preempting thread from using the same PCPU data.
1601          */
1602         sched_pin();
1603
1604         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED);
1605         pvo = PCPU_GET(aim.qmap_pvo);
1606
1607         mtx_lock(PCPU_PTR(aim.qmap_lock));
1608         pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) |
1609             (uint64_t)pa;
1610         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1611         isync();
1612
1613         return (PCPU_GET(qmap_addr));
1614 }
1615
1616 vm_offset_t
1617 moea64_quick_enter_page_dmap(vm_page_t m)
1618 {
1619
1620         return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
1621 }
1622
1623 void
1624 moea64_quick_remove_page(vm_offset_t addr)
1625 {
1626
1627         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED);
1628         KASSERT(PCPU_GET(qmap_addr) == addr,
1629             ("moea64_quick_remove_page: invalid address"));
1630         mtx_unlock(PCPU_PTR(aim.qmap_lock));
1631         sched_unpin();
1632 }
1633
1634 boolean_t
1635 moea64_page_is_mapped(vm_page_t m)
1636 {
1637         return (!LIST_EMPTY(&(m)->md.mdpg_pvoh));
1638 }
1639
1640 /*
1641  * Map the given physical page at the specified virtual address in the
1642  * target pmap with the protection requested.  If specified the page
1643  * will be wired down.
1644  */
1645
1646 int
1647 moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
1648     vm_prot_t prot, u_int flags, int8_t psind)
1649 {
1650         struct          pvo_entry *pvo, *oldpvo, *tpvo;
1651         struct          pvo_head *pvo_head;
1652         uint64_t        pte_lo;
1653         int             error;
1654         vm_paddr_t      pa;
1655
1656         if ((m->oflags & VPO_UNMANAGED) == 0) {
1657                 if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
1658                         VM_PAGE_OBJECT_BUSY_ASSERT(m);
1659                 else
1660                         VM_OBJECT_ASSERT_LOCKED(m->object);
1661         }
1662
1663         if (psind > 0)
1664                 return (moea64_sp_enter(pmap, va, m, prot, flags, psind));
1665
1666         pvo = alloc_pvo_entry(0);
1667         if (pvo == NULL)
1668                 return (KERN_RESOURCE_SHORTAGE);
1669         pvo->pvo_pmap = NULL; /* to be filled in later */
1670         pvo->pvo_pte.prot = prot;
1671
1672         pa = VM_PAGE_TO_PHYS(m);
1673         pte_lo = moea64_calc_wimg(pa, pmap_page_get_memattr(m));
1674         pvo->pvo_pte.pa = pa | pte_lo;
1675
1676         if ((flags & PMAP_ENTER_WIRED) != 0)
1677                 pvo->pvo_vaddr |= PVO_WIRED;
1678
1679         if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) {
1680                 pvo_head = NULL;
1681         } else {
1682                 pvo_head = &m->md.mdpg_pvoh;
1683                 pvo->pvo_vaddr |= PVO_MANAGED;
1684         }
1685
1686         PV_LOCK(pa);
1687         PMAP_LOCK(pmap);
1688         if (pvo->pvo_pmap == NULL)
1689                 init_pvo_entry(pvo, pmap, va);
1690
1691         if (moea64_ps_enabled(pmap) &&
1692             (tpvo = moea64_pvo_find_va(pmap, va & ~HPT_SP_MASK)) != NULL &&
1693             PVO_IS_SP(tpvo)) {
1694                 /* Demote SP before entering a regular page */
1695                 CTR2(KTR_PMAP, "%s: demote before enter: va=%#jx",
1696                     __func__, (uintmax_t)va);
1697                 moea64_sp_demote_aligned(tpvo);
1698         }
1699
1700         if (prot & VM_PROT_WRITE)
1701                 if (pmap_bootstrapped &&
1702                     (m->oflags & VPO_UNMANAGED) == 0)
1703                         vm_page_aflag_set(m, PGA_WRITEABLE);
1704
1705         error = moea64_pvo_enter(pvo, pvo_head, &oldpvo);
1706         if (error == EEXIST) {
1707                 if (oldpvo->pvo_vaddr == pvo->pvo_vaddr &&
1708                     oldpvo->pvo_pte.pa == pvo->pvo_pte.pa &&
1709                     oldpvo->pvo_pte.prot == prot) {
1710                         /* Identical mapping already exists */
1711                         error = 0;
1712
1713                         /* If not in page table, reinsert it */
1714                         if (moea64_pte_synch(oldpvo) < 0) {
1715                                 STAT_MOEA64(moea64_pte_overflow--);
1716                                 moea64_pte_insert(oldpvo);
1717                         }
1718
1719                         /* Then just clean up and go home */
1720                         PMAP_UNLOCK(pmap);
1721                         PV_UNLOCK(pa);
1722                         free_pvo_entry(pvo);
1723                         pvo = NULL;
1724                         goto out;
1725                 } else {
1726                         /* Otherwise, need to kill it first */
1727                         KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old "
1728                             "mapping does not match new mapping"));
1729                         moea64_pvo_remove_from_pmap(oldpvo);
1730                         moea64_pvo_enter(pvo, pvo_head, NULL);
1731                 }
1732         }
1733         PMAP_UNLOCK(pmap);
1734         PV_UNLOCK(pa);
1735
1736         /* Free any dead pages */
1737         if (error == EEXIST) {
1738                 moea64_pvo_remove_from_page(oldpvo);
1739                 free_pvo_entry(oldpvo);
1740         }
1741
1742 out:
1743         /*
1744          * Flush the page from the instruction cache if this page is
1745          * mapped executable and cacheable.
1746          */
1747         if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 &&
1748             (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1749                 vm_page_aflag_set(m, PGA_EXECUTABLE);
1750                 moea64_syncicache(pmap, va, pa, PAGE_SIZE);
1751         }
1752
1753 #if VM_NRESERVLEVEL > 0
1754         /*
1755          * Try to promote pages.
1756          *
1757          * If the VA of the entered page is not aligned with its PA,
1758          * don't try page promotion as it is not possible.
1759          * This reduces the number of promotion failures dramatically.
1760          */
1761         if (moea64_ps_enabled(pmap) && pmap != kernel_pmap && pvo != NULL &&
1762             (pvo->pvo_vaddr & PVO_MANAGED) != 0 &&
1763             (va & HPT_SP_MASK) == (pa & HPT_SP_MASK) &&
1764             (m->flags & PG_FICTITIOUS) == 0 &&
1765             vm_reserv_level_iffullpop(m) == 0)
1766                 moea64_sp_promote(pmap, va, m);
1767 #endif
1768
1769         return (KERN_SUCCESS);
1770 }
1771
1772 static void
1773 moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1774     vm_size_t sz)
1775 {
1776
1777         /*
1778          * This is much trickier than on older systems because
1779          * we can't sync the icache on physical addresses directly
1780          * without a direct map. Instead we check a couple of cases
1781          * where the memory is already mapped in and, failing that,
1782          * use the same trick we use for page zeroing to create
1783          * a temporary mapping for this physical address.
1784          */
1785
1786         if (!pmap_bootstrapped) {
1787                 /*
1788                  * If PMAP is not bootstrapped, we are likely to be
1789                  * in real mode.
1790                  */
1791                 __syncicache((void *)(uintptr_t)pa, sz);
1792         } else if (pmap == kernel_pmap) {
1793                 __syncicache((void *)va, sz);
1794         } else if (hw_direct_map) {
1795                 __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz);
1796         } else {
1797                 /* Use the scratch page to set up a temp mapping */
1798
1799                 mtx_lock(&moea64_scratchpage_mtx);
1800
1801                 moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF);
1802                 __syncicache((void *)(moea64_scratchpage_va[1] +
1803                     (va & ADDR_POFF)), sz);
1804
1805                 mtx_unlock(&moea64_scratchpage_mtx);
1806         }
1807 }
1808
1809 /*
1810  * Maps a sequence of resident pages belonging to the same object.
1811  * The sequence begins with the given page m_start.  This page is
1812  * mapped at the given virtual address start.  Each subsequent page is
1813  * mapped at a virtual address that is offset from start by the same
1814  * amount as the page is offset from m_start within the object.  The
1815  * last page in the sequence is the page with the largest offset from
1816  * m_start that can be mapped at a virtual address less than the given
1817  * virtual address end.  Not every virtual page between start and end
1818  * is mapped; only those for which a resident page exists with the
1819  * corresponding offset from m_start are mapped.
1820  */
1821 void
1822 moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
1823     vm_page_t m_start, vm_prot_t prot)
1824 {
1825         vm_page_t m;
1826         vm_pindex_t diff, psize;
1827         vm_offset_t va;
1828         int8_t psind;
1829
1830         VM_OBJECT_ASSERT_LOCKED(m_start->object);
1831
1832         psize = atop(end - start);
1833         m = m_start;
1834         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1835                 va = start + ptoa(diff);
1836                 if ((va & HPT_SP_MASK) == 0 && va + HPT_SP_SIZE <= end &&
1837                     m->psind == 1 && moea64_ps_enabled(pm))
1838                         psind = 1;
1839                 else
1840                         psind = 0;
1841                 moea64_enter(pm, va, m, prot &
1842                     (VM_PROT_READ | VM_PROT_EXECUTE),
1843                     PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, psind);
1844                 if (psind == 1)
1845                         m = &m[HPT_SP_SIZE / PAGE_SIZE - 1];
1846                 m = TAILQ_NEXT(m, listq);
1847         }
1848 }
1849
1850 void
1851 moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m,
1852     vm_prot_t prot)
1853 {
1854
1855         moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1856             PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0);
1857 }
1858
1859 vm_paddr_t
1860 moea64_extract(pmap_t pm, vm_offset_t va)
1861 {
1862         struct  pvo_entry *pvo;
1863         vm_paddr_t pa;
1864
1865         PMAP_LOCK(pm);
1866         pvo = moea64_pvo_find_va(pm, va);
1867         if (pvo == NULL)
1868                 pa = 0;
1869         else
1870                 pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
1871         PMAP_UNLOCK(pm);
1872
1873         return (pa);
1874 }
1875
1876 /*
1877  * Atomically extract and hold the physical page with the given
1878  * pmap and virtual address pair if that mapping permits the given
1879  * protection.
1880  */
1881 vm_page_t
1882 moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1883 {
1884         struct  pvo_entry *pvo;
1885         vm_page_t m;
1886
1887         m = NULL;
1888         PMAP_LOCK(pmap);
1889         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1890         if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
1891                 m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1892                 if (!vm_page_wire_mapped(m))
1893                         m = NULL;
1894         }
1895         PMAP_UNLOCK(pmap);
1896         return (m);
1897 }
1898
1899 static void *
1900 moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
1901     uint8_t *flags, int wait)
1902 {
1903         struct pvo_entry *pvo;
1904         vm_offset_t va;
1905         vm_page_t m;
1906         int needed_lock;
1907
1908         /*
1909          * This entire routine is a horrible hack to avoid bothering kmem
1910          * for new KVA addresses. Because this can get called from inside
1911          * kmem allocation routines, calling kmem for a new address here
1912          * can lead to multiply locking non-recursive mutexes.
1913          */
1914
1915         *flags = UMA_SLAB_PRIV;
1916         needed_lock = !PMAP_LOCKED(kernel_pmap);
1917
1918         m = vm_page_alloc_noobj_domain(domain, malloc2vm_flags(wait) |
1919             VM_ALLOC_WIRED);
1920         if (m == NULL)
1921                 return (NULL);
1922
1923         va = VM_PAGE_TO_PHYS(m);
1924
1925         pvo = alloc_pvo_entry(1 /* bootstrap */);
1926
1927         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE;
1928         pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M;
1929
1930         if (needed_lock)
1931                 PMAP_LOCK(kernel_pmap);
1932
1933         init_pvo_entry(pvo, kernel_pmap, va);
1934         pvo->pvo_vaddr |= PVO_WIRED;
1935
1936         moea64_pvo_enter(pvo, NULL, NULL);
1937
1938         if (needed_lock)
1939                 PMAP_UNLOCK(kernel_pmap);
1940
1941         return (void *)va;
1942 }
1943
1944 extern int elf32_nxstack;
1945
1946 void
1947 moea64_init()
1948 {
1949
1950         CTR0(KTR_PMAP, "moea64_init");
1951
1952         moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1953             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1954             UMA_ZONE_VM | UMA_ZONE_NOFREE);
1955
1956         /*
1957          * Are large page mappings enabled?
1958          *
1959          * While HPT superpages are not better tested, leave it disabled by
1960          * default.
1961          */
1962         superpages_enabled = 0;
1963         TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1964         if (superpages_enabled) {
1965                 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1966                     ("moea64_init: can't assign to pagesizes[1]"));
1967
1968                 if (moea64_large_page_size == 0) {
1969                         printf("mmu_oea64: HW does not support large pages. "
1970                                         "Disabling superpages...\n");
1971                         superpages_enabled = 0;
1972                 } else if (!moea64_has_lp_4k_16m) {
1973                         printf("mmu_oea64: "
1974                             "HW does not support mixed 4KB/16MB page sizes. "
1975                             "Disabling superpages...\n");
1976                         superpages_enabled = 0;
1977                 } else
1978                         pagesizes[1] = HPT_SP_SIZE;
1979         }
1980
1981         if (!hw_direct_map) {
1982                 uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
1983         }
1984
1985 #ifdef COMPAT_FREEBSD32
1986         elf32_nxstack = 1;
1987 #endif
1988
1989         moea64_initialized = TRUE;
1990 }
1991
1992 boolean_t
1993 moea64_is_referenced(vm_page_t m)
1994 {
1995
1996         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1997             ("moea64_is_referenced: page %p is not managed", m));
1998
1999         return (moea64_query_bit(m, LPTE_REF));
2000 }
2001
2002 boolean_t
2003 moea64_is_modified(vm_page_t m)
2004 {
2005
2006         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2007             ("moea64_is_modified: page %p is not managed", m));
2008
2009         /*
2010          * If the page is not busied then this check is racy.
2011          */
2012         if (!pmap_page_is_write_mapped(m))
2013                 return (FALSE);
2014
2015         return (moea64_query_bit(m, LPTE_CHG));
2016 }
2017
2018 boolean_t
2019 moea64_is_prefaultable(pmap_t pmap, vm_offset_t va)
2020 {
2021         struct pvo_entry *pvo;
2022         boolean_t rv = TRUE;
2023
2024         PMAP_LOCK(pmap);
2025         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
2026         if (pvo != NULL)
2027                 rv = FALSE;
2028         PMAP_UNLOCK(pmap);
2029         return (rv);
2030 }
2031
2032 void
2033 moea64_clear_modify(vm_page_t m)
2034 {
2035
2036         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2037             ("moea64_clear_modify: page %p is not managed", m));
2038         vm_page_assert_busied(m);
2039
2040         if (!pmap_page_is_write_mapped(m))
2041                 return;
2042         moea64_clear_bit(m, LPTE_CHG);
2043 }
2044
2045 /*
2046  * Clear the write and modified bits in each of the given page's mappings.
2047  */
2048 void
2049 moea64_remove_write(vm_page_t m)
2050 {
2051         struct  pvo_entry *pvo;
2052         int64_t refchg, ret;
2053         pmap_t  pmap;
2054
2055         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2056             ("moea64_remove_write: page %p is not managed", m));
2057         vm_page_assert_busied(m);
2058
2059         if (!pmap_page_is_write_mapped(m))
2060                 return;
2061
2062         powerpc_sync();
2063         PV_PAGE_LOCK(m);
2064         refchg = 0;
2065         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2066                 pmap = pvo->pvo_pmap;
2067                 PMAP_LOCK(pmap);
2068                 if (!(pvo->pvo_vaddr & PVO_DEAD) &&
2069                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2070                         if (PVO_IS_SP(pvo)) {
2071                                 CTR1(KTR_PMAP, "%s: demote before remwr",
2072                                     __func__);
2073                                 moea64_sp_demote(pvo);
2074                         }
2075                         pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
2076                         ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
2077                         if (ret < 0)
2078                                 ret = LPTE_CHG;
2079                         refchg |= ret;
2080                         if (pvo->pvo_pmap == kernel_pmap)
2081                                 isync();
2082                 }
2083                 PMAP_UNLOCK(pmap);
2084         }
2085         if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG)
2086                 vm_page_dirty(m);
2087         vm_page_aflag_clear(m, PGA_WRITEABLE);
2088         PV_PAGE_UNLOCK(m);
2089 }
2090
2091 /*
2092  *      moea64_ts_referenced:
2093  *
2094  *      Return a count of reference bits for a page, clearing those bits.
2095  *      It is not necessary for every reference bit to be cleared, but it
2096  *      is necessary that 0 only be returned when there are truly no
2097  *      reference bits set.
2098  *
2099  *      XXX: The exact number of bits to check and clear is a matter that
2100  *      should be tested and standardized at some point in the future for
2101  *      optimal aging of shared pages.
2102  */
2103 int
2104 moea64_ts_referenced(vm_page_t m)
2105 {
2106
2107         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2108             ("moea64_ts_referenced: page %p is not managed", m));
2109         return (moea64_clear_bit(m, LPTE_REF));
2110 }
2111
2112 /*
2113  * Modify the WIMG settings of all mappings for a page.
2114  */
2115 void
2116 moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
2117 {
2118         struct  pvo_entry *pvo;
2119         int64_t refchg;
2120         pmap_t  pmap;
2121         uint64_t lo;
2122
2123         CTR3(KTR_PMAP, "%s: pa=%#jx, ma=%#x",
2124             __func__, (uintmax_t)VM_PAGE_TO_PHYS(m), ma);
2125
2126         if ((m->oflags & VPO_UNMANAGED) != 0) {
2127                 m->md.mdpg_cache_attrs = ma;
2128                 return;
2129         }
2130
2131         lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
2132
2133         PV_PAGE_LOCK(m);
2134         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2135                 pmap = pvo->pvo_pmap;
2136                 PMAP_LOCK(pmap);
2137                 if (!(pvo->pvo_vaddr & PVO_DEAD)) {
2138                         if (PVO_IS_SP(pvo)) {
2139                                 CTR1(KTR_PMAP,
2140                                     "%s: demote before set_memattr", __func__);
2141                                 moea64_sp_demote(pvo);
2142                         }
2143                         pvo->pvo_pte.pa &= ~LPTE_WIMG;
2144                         pvo->pvo_pte.pa |= lo;
2145                         refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
2146                         if (refchg < 0)
2147                                 refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ?
2148                                     LPTE_CHG : 0;
2149                         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2150                             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2151                                 refchg |=
2152                                     atomic_readandclear_32(&m->md.mdpg_attrs);
2153                                 if (refchg & LPTE_CHG)
2154                                         vm_page_dirty(m);
2155                                 if (refchg & LPTE_REF)
2156                                         vm_page_aflag_set(m, PGA_REFERENCED);
2157                         }
2158                         if (pvo->pvo_pmap == kernel_pmap)
2159                                 isync();
2160                 }
2161                 PMAP_UNLOCK(pmap);
2162         }
2163         m->md.mdpg_cache_attrs = ma;
2164         PV_PAGE_UNLOCK(m);
2165 }
2166
2167 /*
2168  * Map a wired page into kernel virtual address space.
2169  */
2170 void
2171 moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
2172 {
2173         int             error;
2174         struct pvo_entry *pvo, *oldpvo;
2175
2176         do {
2177                 pvo = alloc_pvo_entry(0);
2178                 if (pvo == NULL)
2179                         vm_wait(NULL);
2180         } while (pvo == NULL);
2181         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
2182         pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma);
2183         pvo->pvo_vaddr |= PVO_WIRED;
2184
2185         PMAP_LOCK(kernel_pmap);
2186         oldpvo = moea64_pvo_find_va(kernel_pmap, va);
2187         if (oldpvo != NULL)
2188                 moea64_pvo_remove_from_pmap(oldpvo);
2189         init_pvo_entry(pvo, kernel_pmap, va);
2190         error = moea64_pvo_enter(pvo, NULL, NULL);
2191         PMAP_UNLOCK(kernel_pmap);
2192
2193         /* Free any dead pages */
2194         if (oldpvo != NULL) {
2195                 moea64_pvo_remove_from_page(oldpvo);
2196                 free_pvo_entry(oldpvo);
2197         }
2198
2199         if (error != 0)
2200                 panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va,
2201                     (uintmax_t)pa, error);
2202 }
2203
2204 void
2205 moea64_kenter(vm_offset_t va, vm_paddr_t pa)
2206 {
2207
2208         moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
2209 }
2210
2211 /*
2212  * Extract the physical page address associated with the given kernel virtual
2213  * address.
2214  */
2215 vm_paddr_t
2216 moea64_kextract(vm_offset_t va)
2217 {
2218         struct          pvo_entry *pvo;
2219         vm_paddr_t pa;
2220
2221         /*
2222          * Shortcut the direct-mapped case when applicable.  We never put
2223          * anything but 1:1 (or 62-bit aliased) mappings below
2224          * VM_MIN_KERNEL_ADDRESS.
2225          */
2226         if (va < VM_MIN_KERNEL_ADDRESS)
2227                 return (va & ~DMAP_BASE_ADDRESS);
2228
2229         PMAP_LOCK(kernel_pmap);
2230         pvo = moea64_pvo_find_va(kernel_pmap, va);
2231         KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
2232             va));
2233         pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
2234         PMAP_UNLOCK(kernel_pmap);
2235         return (pa);
2236 }
2237
2238 /*
2239  * Remove a wired page from kernel virtual address space.
2240  */
2241 void
2242 moea64_kremove(vm_offset_t va)
2243 {
2244         moea64_remove(kernel_pmap, va, va + PAGE_SIZE);
2245 }
2246
2247 /*
2248  * Provide a kernel pointer corresponding to a given userland pointer.
2249  * The returned pointer is valid until the next time this function is
2250  * called in this thread. This is used internally in copyin/copyout.
2251  */
2252 static int
2253 moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr,
2254     void **kaddr, size_t ulen, size_t *klen)
2255 {
2256         size_t l;
2257 #ifdef __powerpc64__
2258         struct slb *slb;
2259 #endif
2260         register_t slbv;
2261
2262         *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
2263         l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
2264         if (l > ulen)
2265                 l = ulen;
2266         if (klen)
2267                 *klen = l;
2268         else if (l != ulen)
2269                 return (EFAULT);
2270
2271 #ifdef __powerpc64__
2272         /* Try lockless look-up first */
2273         slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr);
2274
2275         if (slb == NULL) {
2276                 /* If it isn't there, we need to pre-fault the VSID */
2277                 PMAP_LOCK(pm);
2278                 slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT;
2279                 PMAP_UNLOCK(pm);
2280         } else {
2281                 slbv = slb->slbv;
2282         }
2283
2284         /* Mark segment no-execute */
2285         slbv |= SLBV_N;
2286 #else
2287         slbv = va_to_vsid(pm, (vm_offset_t)uaddr);
2288
2289         /* Mark segment no-execute */
2290         slbv |= SR_N;
2291 #endif
2292
2293         /* If we have already set this VSID, we can just return */
2294         if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv)
2295                 return (0);
2296
2297         __asm __volatile("isync");
2298         curthread->td_pcb->pcb_cpu.aim.usr_segm =
2299             (uintptr_t)uaddr >> ADDR_SR_SHFT;
2300         curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv;
2301 #ifdef __powerpc64__
2302         __asm __volatile ("slbie %0; slbmte %1, %2; isync" ::
2303             "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE));
2304 #else
2305         __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv));
2306 #endif
2307
2308         return (0);
2309 }
2310
2311 /*
2312  * Figure out where a given kernel pointer (usually in a fault) points
2313  * to from the VM's perspective, potentially remapping into userland's
2314  * address space.
2315  */
2316 static int
2317 moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user,
2318     vm_offset_t *decoded_addr)
2319 {
2320         vm_offset_t user_sr;
2321
2322         if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
2323                 user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
2324                 addr &= ADDR_PIDX | ADDR_POFF;
2325                 addr |= user_sr << ADDR_SR_SHFT;
2326                 *decoded_addr = addr;
2327                 *is_user = 1;
2328         } else {
2329                 *decoded_addr = addr;
2330                 *is_user = 0;
2331         }
2332
2333         return (0);
2334 }
2335
2336 /*
2337  * Map a range of physical addresses into kernel virtual address space.
2338  *
2339  * The value passed in *virt is a suggested virtual address for the mapping.
2340  * Architectures which can support a direct-mapped physical to virtual region
2341  * can return the appropriate address within that region, leaving '*virt'
2342  * unchanged.  Other architectures should map the pages starting at '*virt' and
2343  * update '*virt' with the first usable address after the mapped region.
2344  */
2345 vm_offset_t
2346 moea64_map(vm_offset_t *virt, vm_paddr_t pa_start,
2347     vm_paddr_t pa_end, int prot)
2348 {
2349         vm_offset_t     sva, va;
2350
2351         if (hw_direct_map) {
2352                 /*
2353                  * Check if every page in the region is covered by the direct
2354                  * map. The direct map covers all of physical memory. Use
2355                  * moea64_calc_wimg() as a shortcut to see if the page is in
2356                  * physical memory as a way to see if the direct map covers it.
2357                  */
2358                 for (va = pa_start; va < pa_end; va += PAGE_SIZE)
2359                         if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M)
2360                                 break;
2361                 if (va == pa_end)
2362                         return (PHYS_TO_DMAP(pa_start));
2363         }
2364         sva = *virt;
2365         va = sva;
2366         /* XXX respect prot argument */
2367         for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
2368                 moea64_kenter(va, pa_start);
2369         *virt = va;
2370
2371         return (sva);
2372 }
2373
2374 /*
2375  * Returns true if the pmap's pv is one of the first
2376  * 16 pvs linked to from this page.  This count may
2377  * be changed upwards or downwards in the future; it
2378  * is only necessary that true be returned for a small
2379  * subset of pmaps for proper page aging.
2380  */
2381 boolean_t
2382 moea64_page_exists_quick(pmap_t pmap, vm_page_t m)
2383 {
2384         int loops;
2385         struct pvo_entry *pvo;
2386         boolean_t rv;
2387
2388         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2389             ("moea64_page_exists_quick: page %p is not managed", m));
2390         loops = 0;
2391         rv = FALSE;
2392         PV_PAGE_LOCK(m);
2393         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2394                 if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) {
2395                         rv = TRUE;
2396                         break;
2397                 }
2398                 if (++loops >= 16)
2399                         break;
2400         }
2401         PV_PAGE_UNLOCK(m);
2402         return (rv);
2403 }
2404
2405 void
2406 moea64_page_init(vm_page_t m)
2407 {
2408
2409         m->md.mdpg_attrs = 0;
2410         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
2411         LIST_INIT(&m->md.mdpg_pvoh);
2412 }
2413
2414 /*
2415  * Return the number of managed mappings to the given physical page
2416  * that are wired.
2417  */
2418 int
2419 moea64_page_wired_mappings(vm_page_t m)
2420 {
2421         struct pvo_entry *pvo;
2422         int count;
2423
2424         count = 0;
2425         if ((m->oflags & VPO_UNMANAGED) != 0)
2426                 return (count);
2427         PV_PAGE_LOCK(m);
2428         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
2429                 if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED)
2430                         count++;
2431         PV_PAGE_UNLOCK(m);
2432         return (count);
2433 }
2434
2435 static uintptr_t        moea64_vsidcontext;
2436
2437 uintptr_t
2438 moea64_get_unique_vsid(void) {
2439         u_int entropy;
2440         register_t hash;
2441         uint32_t mask;
2442         int i;
2443
2444         entropy = 0;
2445         __asm __volatile("mftb %0" : "=r"(entropy));
2446
2447         mtx_lock(&moea64_slb_mutex);
2448         for (i = 0; i < NVSIDS; i += VSID_NBPW) {
2449                 u_int   n;
2450
2451                 /*
2452                  * Create a new value by multiplying by a prime and adding in
2453                  * entropy from the timebase register.  This is to make the
2454                  * VSID more random so that the PT hash function collides
2455                  * less often.  (Note that the prime casues gcc to do shifts
2456                  * instead of a multiply.)
2457                  */
2458                 moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
2459                 hash = moea64_vsidcontext & (NVSIDS - 1);
2460                 if (hash == 0)          /* 0 is special, avoid it */
2461                         continue;
2462                 n = hash >> 5;
2463                 mask = 1 << (hash & (VSID_NBPW - 1));
2464                 hash = (moea64_vsidcontext & VSID_HASHMASK);
2465                 if (moea64_vsid_bitmap[n] & mask) {     /* collision? */
2466                         /* anything free in this bucket? */
2467                         if (moea64_vsid_bitmap[n] == 0xffffffff) {
2468                                 entropy = (moea64_vsidcontext >> 20);
2469                                 continue;
2470                         }
2471                         i = ffs(~moea64_vsid_bitmap[n]) - 1;
2472                         mask = 1 << i;
2473                         hash &= rounddown2(VSID_HASHMASK, VSID_NBPW);
2474                         hash |= i;
2475                 }
2476                 if (hash == VSID_VRMA)  /* also special, avoid this too */
2477                         continue;
2478                 KASSERT(!(moea64_vsid_bitmap[n] & mask),
2479                     ("Allocating in-use VSID %#zx\n", hash));
2480                 moea64_vsid_bitmap[n] |= mask;
2481                 mtx_unlock(&moea64_slb_mutex);
2482                 return (hash);
2483         }
2484
2485         mtx_unlock(&moea64_slb_mutex);
2486         panic("%s: out of segments",__func__);
2487 }
2488
2489 #ifdef __powerpc64__
2490 int
2491 moea64_pinit(pmap_t pmap)
2492 {
2493
2494         RB_INIT(&pmap->pmap_pvo);
2495
2496         pmap->pm_slb_tree_root = slb_alloc_tree();
2497         pmap->pm_slb = slb_alloc_user_cache();
2498         pmap->pm_slb_len = 0;
2499
2500         return (1);
2501 }
2502 #else
2503 int
2504 moea64_pinit(pmap_t pmap)
2505 {
2506         int     i;
2507         uint32_t hash;
2508
2509         RB_INIT(&pmap->pmap_pvo);
2510
2511         if (pmap_bootstrapped)
2512                 pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap);
2513         else
2514                 pmap->pmap_phys = pmap;
2515
2516         /*
2517          * Allocate some segment registers for this pmap.
2518          */
2519         hash = moea64_get_unique_vsid();
2520
2521         for (i = 0; i < 16; i++)
2522                 pmap->pm_sr[i] = VSID_MAKE(i, hash);
2523
2524         KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
2525
2526         return (1);
2527 }
2528 #endif
2529
2530 /*
2531  * Initialize the pmap associated with process 0.
2532  */
2533 void
2534 moea64_pinit0(pmap_t pm)
2535 {
2536
2537         PMAP_LOCK_INIT(pm);
2538         moea64_pinit(pm);
2539         bzero(&pm->pm_stats, sizeof(pm->pm_stats));
2540 }
2541
2542 /*
2543  * Set the physical protection on the specified range of this map as requested.
2544  */
2545 static void
2546 moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
2547 {
2548         struct vm_page *pg;
2549         vm_prot_t oldprot;
2550         int32_t refchg;
2551
2552         PMAP_LOCK_ASSERT(pm, MA_OWNED);
2553
2554         /*
2555          * Change the protection of the page.
2556          */
2557         oldprot = pvo->pvo_pte.prot;
2558         pvo->pvo_pte.prot = prot;
2559         pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2560
2561         /*
2562          * If the PVO is in the page table, update mapping
2563          */
2564         refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
2565         if (refchg < 0)
2566                 refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0;
2567
2568         if (pm != kernel_pmap && pg != NULL &&
2569             (pg->a.flags & PGA_EXECUTABLE) == 0 &&
2570             (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
2571                 if ((pg->oflags & VPO_UNMANAGED) == 0)
2572                         vm_page_aflag_set(pg, PGA_EXECUTABLE);
2573                 moea64_syncicache(pm, PVO_VADDR(pvo),
2574                     PVO_PADDR(pvo), PAGE_SIZE);
2575         }
2576
2577         /*
2578          * Update vm about the REF/CHG bits if the page is managed and we have
2579          * removed write access.
2580          */
2581         if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) &&
2582             (oldprot & VM_PROT_WRITE)) {
2583                 refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2584                 if (refchg & LPTE_CHG)
2585                         vm_page_dirty(pg);
2586                 if (refchg & LPTE_REF)
2587                         vm_page_aflag_set(pg, PGA_REFERENCED);
2588         }
2589 }
2590
2591 void
2592 moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2593     vm_prot_t prot)
2594 {
2595         struct  pvo_entry *pvo, key;
2596
2597         CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
2598             sva, eva, prot);
2599
2600         KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
2601             ("moea64_protect: non current pmap"));
2602
2603         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2604                 moea64_remove(pm, sva, eva);
2605                 return;
2606         }
2607
2608         PMAP_LOCK(pm);
2609         key.pvo_vaddr = sva;
2610         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2611             pvo != NULL && PVO_VADDR(pvo) < eva;
2612             pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
2613                 if (PVO_IS_SP(pvo)) {
2614                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
2615                                 pvo = moea64_sp_protect(pvo, prot);
2616                                 continue;
2617                         } else {
2618                                 CTR1(KTR_PMAP, "%s: demote before protect",
2619                                     __func__);
2620                                 moea64_sp_demote(pvo);
2621                         }
2622                 }
2623                 moea64_pvo_protect(pm, pvo, prot);
2624         }
2625         PMAP_UNLOCK(pm);
2626 }
2627
2628 /*
2629  * Map a list of wired pages into kernel virtual address space.  This is
2630  * intended for temporary mappings which do not need page modification or
2631  * references recorded.  Existing mappings in the region are overwritten.
2632  */
2633 void
2634 moea64_qenter(vm_offset_t va, vm_page_t *m, int count)
2635 {
2636         while (count-- > 0) {
2637                 moea64_kenter(va, VM_PAGE_TO_PHYS(*m));
2638                 va += PAGE_SIZE;
2639                 m++;
2640         }
2641 }
2642
2643 /*
2644  * Remove page mappings from kernel virtual address space.  Intended for
2645  * temporary mappings entered by moea64_qenter.
2646  */
2647 void
2648 moea64_qremove(vm_offset_t va, int count)
2649 {
2650         while (count-- > 0) {
2651                 moea64_kremove(va);
2652                 va += PAGE_SIZE;
2653         }
2654 }
2655
2656 void
2657 moea64_release_vsid(uint64_t vsid)
2658 {
2659         int idx, mask;
2660
2661         mtx_lock(&moea64_slb_mutex);
2662         idx = vsid & (NVSIDS-1);
2663         mask = 1 << (idx % VSID_NBPW);
2664         idx /= VSID_NBPW;
2665         KASSERT(moea64_vsid_bitmap[idx] & mask,
2666             ("Freeing unallocated VSID %#jx", vsid));
2667         moea64_vsid_bitmap[idx] &= ~mask;
2668         mtx_unlock(&moea64_slb_mutex);
2669 }
2670
2671 void
2672 moea64_release(pmap_t pmap)
2673 {
2674
2675         /*
2676          * Free segment registers' VSIDs
2677          */
2678     #ifdef __powerpc64__
2679         slb_free_tree(pmap);
2680         slb_free_user_cache(pmap->pm_slb);
2681     #else
2682         KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
2683
2684         moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
2685     #endif
2686 }
2687
2688 /*
2689  * Remove all pages mapped by the specified pmap
2690  */
2691 void
2692 moea64_remove_pages(pmap_t pm)
2693 {
2694         struct pvo_entry *pvo, *tpvo;
2695         struct pvo_dlist tofree;
2696
2697         SLIST_INIT(&tofree);
2698
2699         PMAP_LOCK(pm);
2700         RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
2701                 if (pvo->pvo_vaddr & PVO_WIRED)
2702                         continue;
2703
2704                 /*
2705                  * For locking reasons, remove this from the page table and
2706                  * pmap, but save delinking from the vm_page for a second
2707                  * pass
2708                  */
2709                 moea64_pvo_remove_from_pmap(pvo);
2710                 SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
2711         }
2712         PMAP_UNLOCK(pm);
2713
2714         while (!SLIST_EMPTY(&tofree)) {
2715                 pvo = SLIST_FIRST(&tofree);
2716                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2717                 moea64_pvo_remove_from_page(pvo);
2718                 free_pvo_entry(pvo);
2719         }
2720 }
2721
2722 static void
2723 moea64_remove_locked(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2724     struct pvo_dlist *tofree)
2725 {
2726         struct pvo_entry *pvo, *tpvo, key;
2727
2728         PMAP_LOCK_ASSERT(pm, MA_OWNED);
2729
2730         key.pvo_vaddr = sva;
2731         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2732             pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2733                 if (PVO_IS_SP(pvo)) {
2734                         if (moea64_sp_pvo_in_range(pvo, sva, eva)) {
2735                                 tpvo = moea64_sp_remove(pvo, tofree);
2736                                 continue;
2737                         } else {
2738                                 CTR1(KTR_PMAP, "%s: demote before remove",
2739                                     __func__);
2740                                 moea64_sp_demote(pvo);
2741                         }
2742                 }
2743                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2744
2745                 /*
2746                  * For locking reasons, remove this from the page table and
2747                  * pmap, but save delinking from the vm_page for a second
2748                  * pass
2749                  */
2750                 moea64_pvo_remove_from_pmap(pvo);
2751                 SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
2752         }
2753 }
2754
2755 /*
2756  * Remove the given range of addresses from the specified map.
2757  */
2758 void
2759 moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
2760 {
2761         struct pvo_entry *pvo;
2762         struct pvo_dlist tofree;
2763
2764         /*
2765          * Perform an unsynchronized read.  This is, however, safe.
2766          */
2767         if (pm->pm_stats.resident_count == 0)
2768                 return;
2769
2770         SLIST_INIT(&tofree);
2771         PMAP_LOCK(pm);
2772         moea64_remove_locked(pm, sva, eva, &tofree);
2773         PMAP_UNLOCK(pm);
2774
2775         while (!SLIST_EMPTY(&tofree)) {
2776                 pvo = SLIST_FIRST(&tofree);
2777                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2778                 moea64_pvo_remove_from_page(pvo);
2779                 free_pvo_entry(pvo);
2780         }
2781 }
2782
2783 /*
2784  * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
2785  * will reflect changes in pte's back to the vm_page.
2786  */
2787 void
2788 moea64_remove_all(vm_page_t m)
2789 {
2790         struct  pvo_entry *pvo, *next_pvo;
2791         struct  pvo_head freequeue;
2792         int     wasdead;
2793         pmap_t  pmap;
2794
2795         LIST_INIT(&freequeue);
2796
2797         PV_PAGE_LOCK(m);
2798         LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
2799                 pmap = pvo->pvo_pmap;
2800                 PMAP_LOCK(pmap);
2801                 wasdead = (pvo->pvo_vaddr & PVO_DEAD);
2802                 if (!wasdead) {
2803                         if (PVO_IS_SP(pvo)) {
2804                                 CTR1(KTR_PMAP, "%s: demote before remove_all",
2805                                     __func__);
2806                                 moea64_sp_demote(pvo);
2807                         }
2808                         moea64_pvo_remove_from_pmap(pvo);
2809                 }
2810                 moea64_pvo_remove_from_page_locked(pvo, m);
2811                 if (!wasdead)
2812                         LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
2813                 PMAP_UNLOCK(pmap);
2814
2815         }
2816         KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings"));
2817         KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable"));
2818         PV_PAGE_UNLOCK(m);
2819
2820         /* Clean up UMA allocations */
2821         LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo)
2822                 free_pvo_entry(pvo);
2823 }
2824
2825 /*
2826  * Allocate a physical page of memory directly from the phys_avail map.
2827  * Can only be called from moea64_bootstrap before avail start and end are
2828  * calculated.
2829  */
2830 vm_offset_t
2831 moea64_bootstrap_alloc(vm_size_t size, vm_size_t align)
2832 {
2833         vm_offset_t     s, e;
2834         int             i, j;
2835
2836         size = round_page(size);
2837         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2838                 if (align != 0)
2839                         s = roundup2(phys_avail[i], align);
2840                 else
2841                         s = phys_avail[i];
2842                 e = s + size;
2843
2844                 if (s < phys_avail[i] || e > phys_avail[i + 1])
2845                         continue;
2846
2847                 if (s + size > platform_real_maxaddr())
2848                         continue;
2849
2850                 if (s == phys_avail[i]) {
2851                         phys_avail[i] += size;
2852                 } else if (e == phys_avail[i + 1]) {
2853                         phys_avail[i + 1] -= size;
2854                 } else {
2855                         for (j = phys_avail_count * 2; j > i; j -= 2) {
2856                                 phys_avail[j] = phys_avail[j - 2];
2857                                 phys_avail[j + 1] = phys_avail[j - 1];
2858                         }
2859
2860                         phys_avail[i + 3] = phys_avail[i + 1];
2861                         phys_avail[i + 1] = s;
2862                         phys_avail[i + 2] = e;
2863                         phys_avail_count++;
2864                 }
2865
2866                 return (s);
2867         }
2868         panic("moea64_bootstrap_alloc: could not allocate memory");
2869 }
2870
2871 static int
2872 moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head,
2873     struct pvo_entry **oldpvop)
2874 {
2875         struct pvo_entry *old_pvo;
2876         int err;
2877
2878         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2879
2880         STAT_MOEA64(moea64_pvo_enter_calls++);
2881
2882         /*
2883          * Add to pmap list
2884          */
2885         old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2886
2887         if (old_pvo != NULL) {
2888                 if (oldpvop != NULL)
2889                         *oldpvop = old_pvo;
2890                 return (EEXIST);
2891         }
2892
2893         if (pvo_head != NULL) {
2894                 LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2895         }
2896
2897         if (pvo->pvo_vaddr & PVO_WIRED)
2898                 pvo->pvo_pmap->pm_stats.wired_count++;
2899         pvo->pvo_pmap->pm_stats.resident_count++;
2900
2901         /*
2902          * Insert it into the hardware page table
2903          */
2904         err = moea64_pte_insert(pvo);
2905         if (err != 0) {
2906                 panic("moea64_pvo_enter: overflow");
2907         }
2908
2909         STAT_MOEA64(moea64_pvo_entries++);
2910
2911         if (pvo->pvo_pmap == kernel_pmap)
2912                 isync();
2913
2914 #ifdef __powerpc64__
2915         /*
2916          * Make sure all our bootstrap mappings are in the SLB as soon
2917          * as virtual memory is switched on.
2918          */
2919         if (!pmap_bootstrapped)
2920                 moea64_bootstrap_slb_prefault(PVO_VADDR(pvo),
2921                     pvo->pvo_vaddr & PVO_LARGE);
2922 #endif
2923
2924         return (0);
2925 }
2926
2927 static void
2928 moea64_pvo_remove_from_pmap(struct pvo_entry *pvo)
2929 {
2930         struct  vm_page *pg;
2931         int32_t refchg;
2932
2933         KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap"));
2934         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2935         KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO"));
2936
2937         /*
2938          * If there is an active pte entry, we need to deactivate it
2939          */
2940         refchg = moea64_pte_unset(pvo);
2941         if (refchg < 0) {
2942                 /*
2943                  * If it was evicted from the page table, be pessimistic and
2944                  * dirty the page.
2945                  */
2946                 if (pvo->pvo_pte.prot & VM_PROT_WRITE)
2947                         refchg = LPTE_CHG;
2948                 else
2949                         refchg = 0;
2950         }
2951
2952         /*
2953          * Update our statistics.
2954          */
2955         pvo->pvo_pmap->pm_stats.resident_count--;
2956         if (pvo->pvo_vaddr & PVO_WIRED)
2957                 pvo->pvo_pmap->pm_stats.wired_count--;
2958
2959         /*
2960          * Remove this PVO from the pmap list.
2961          */
2962         RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2963
2964         /*
2965          * Mark this for the next sweep
2966          */
2967         pvo->pvo_vaddr |= PVO_DEAD;
2968
2969         /* Send RC bits to VM */
2970         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2971             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2972                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2973                 if (pg != NULL) {
2974                         refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2975                         if (refchg & LPTE_CHG)
2976                                 vm_page_dirty(pg);
2977                         if (refchg & LPTE_REF)
2978                                 vm_page_aflag_set(pg, PGA_REFERENCED);
2979                 }
2980         }
2981 }
2982
2983 static inline void
2984 moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo,
2985     vm_page_t m)
2986 {
2987
2988         KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page"));
2989
2990         /* Use NULL pmaps as a sentinel for races in page deletion */
2991         if (pvo->pvo_pmap == NULL)
2992                 return;
2993         pvo->pvo_pmap = NULL;
2994
2995         /*
2996          * Update vm about page writeability/executability if managed
2997          */
2998         PV_LOCKASSERT(PVO_PADDR(pvo));
2999         if (pvo->pvo_vaddr & PVO_MANAGED) {
3000                 if (m != NULL) {
3001                         LIST_REMOVE(pvo, pvo_vlink);
3002                         if (LIST_EMPTY(vm_page_to_pvoh(m)))
3003                                 vm_page_aflag_clear(m,
3004                                     PGA_WRITEABLE | PGA_EXECUTABLE);
3005                 }
3006         }
3007
3008         STAT_MOEA64(moea64_pvo_entries--);
3009         STAT_MOEA64(moea64_pvo_remove_calls++);
3010 }
3011
3012 static void
3013 moea64_pvo_remove_from_page(struct pvo_entry *pvo)
3014 {
3015         vm_page_t pg = NULL;
3016
3017         if (pvo->pvo_vaddr & PVO_MANAGED)
3018                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
3019
3020         PV_LOCK(PVO_PADDR(pvo));
3021         moea64_pvo_remove_from_page_locked(pvo, pg);
3022         PV_UNLOCK(PVO_PADDR(pvo));
3023 }
3024
3025 static struct pvo_entry *
3026 moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
3027 {
3028         struct pvo_entry key;
3029
3030         PMAP_LOCK_ASSERT(pm, MA_OWNED);
3031
3032         key.pvo_vaddr = va & ~ADDR_POFF;
3033         return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
3034 }
3035
3036 static boolean_t
3037 moea64_query_bit(vm_page_t m, uint64_t ptebit)
3038 {
3039         struct  pvo_entry *pvo;
3040         int64_t ret;
3041         boolean_t rv;
3042         vm_page_t sp;
3043
3044         /*
3045          * See if this bit is stored in the page already.
3046          *
3047          * For superpages, the bit is stored in the first vm page.
3048          */
3049         if ((m->md.mdpg_attrs & ptebit) != 0 ||
3050             ((sp = PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK)) != NULL &&
3051              (sp->md.mdpg_attrs & (ptebit | MDPG_ATTR_SP)) ==
3052              (ptebit | MDPG_ATTR_SP)))
3053                 return (TRUE);
3054
3055         /*
3056          * Examine each PTE.  Sync so that any pending REF/CHG bits are
3057          * flushed to the PTEs.
3058          */
3059         rv = FALSE;
3060         powerpc_sync();
3061         PV_PAGE_LOCK(m);
3062         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
3063                 if (PVO_IS_SP(pvo)) {
3064                         ret = moea64_sp_query(pvo, ptebit);
3065                         /*
3066                          * If SP was not demoted, check its REF/CHG bits here.
3067                          */
3068                         if (ret != -1) {
3069                                 if ((ret & ptebit) != 0) {
3070                                         rv = TRUE;
3071                                         break;
3072                                 }
3073                                 continue;
3074                         }
3075                         /* else, fallthrough */
3076                 }
3077
3078                 ret = 0;
3079
3080                 /*
3081                  * See if this pvo has a valid PTE.  if so, fetch the
3082                  * REF/CHG bits from the valid PTE.  If the appropriate
3083                  * ptebit is set, return success.
3084                  */
3085                 PMAP_LOCK(pvo->pvo_pmap);
3086                 if (!(pvo->pvo_vaddr & PVO_DEAD))
3087                         ret = moea64_pte_synch(pvo);
3088                 PMAP_UNLOCK(pvo->pvo_pmap);
3089
3090                 if (ret > 0) {
3091                         atomic_set_32(&m->md.mdpg_attrs,
3092                             ret & (LPTE_CHG | LPTE_REF));
3093                         if (ret & ptebit) {
3094                                 rv = TRUE;
3095                                 break;
3096                         }
3097                 }
3098         }
3099         PV_PAGE_UNLOCK(m);
3100
3101         return (rv);
3102 }
3103
3104 static u_int
3105 moea64_clear_bit(vm_page_t m, u_int64_t ptebit)
3106 {
3107         u_int   count;
3108         struct  pvo_entry *pvo;
3109         int64_t ret;
3110
3111         /*
3112          * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
3113          * we can reset the right ones).
3114          */
3115         powerpc_sync();
3116
3117         /*
3118          * For each pvo entry, clear the pte's ptebit.
3119          */
3120         count = 0;
3121         PV_PAGE_LOCK(m);
3122         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
3123                 if (PVO_IS_SP(pvo)) {
3124                         if ((ret = moea64_sp_clear(pvo, m, ptebit)) != -1) {
3125                                 count += ret;
3126                                 continue;
3127                         }
3128                 }
3129                 ret = 0;
3130
3131                 PMAP_LOCK(pvo->pvo_pmap);
3132                 if (!(pvo->pvo_vaddr & PVO_DEAD))
3133                         ret = moea64_pte_clear(pvo, ptebit);
3134                 PMAP_UNLOCK(pvo->pvo_pmap);
3135
3136                 if (ret > 0 && (ret & ptebit))
3137                         count++;
3138         }
3139         atomic_clear_32(&m->md.mdpg_attrs, ptebit);
3140         PV_PAGE_UNLOCK(m);
3141
3142         return (count);
3143 }
3144
3145 boolean_t
3146 moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
3147 {
3148         struct pvo_entry *pvo, key;
3149         vm_offset_t ppa;
3150         int error = 0;
3151
3152         if (hw_direct_map && mem_valid(pa, size) == 0)
3153                 return (0);
3154
3155         PMAP_LOCK(kernel_pmap);
3156         ppa = pa & ~ADDR_POFF;
3157         key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa;
3158         for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
3159             ppa < pa + size; ppa += PAGE_SIZE,
3160             pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
3161                 if (pvo == NULL || PVO_PADDR(pvo) != ppa) {
3162                         error = EFAULT;
3163                         break;
3164                 }
3165         }
3166         PMAP_UNLOCK(kernel_pmap);
3167
3168         return (error);
3169 }
3170
3171 /*
3172  * Map a set of physical memory pages into the kernel virtual
3173  * address space. Return a pointer to where it is mapped. This
3174  * routine is intended to be used for mapping device memory,
3175  * NOT real memory.
3176  */
3177 void *
3178 moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
3179 {
3180         vm_offset_t va, tmpva, ppa, offset;
3181
3182         ppa = trunc_page(pa);
3183         offset = pa & PAGE_MASK;
3184         size = roundup2(offset + size, PAGE_SIZE);
3185
3186         va = kva_alloc(size);
3187
3188         if (!va)
3189                 panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
3190
3191         for (tmpva = va; size > 0;) {
3192                 moea64_kenter_attr(tmpva, ppa, ma);
3193                 size -= PAGE_SIZE;
3194                 tmpva += PAGE_SIZE;
3195                 ppa += PAGE_SIZE;
3196         }
3197
3198         return ((void *)(va + offset));
3199 }
3200
3201 void *
3202 moea64_mapdev(vm_paddr_t pa, vm_size_t size)
3203 {
3204
3205         return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT);
3206 }
3207
3208 void
3209 moea64_unmapdev(vm_offset_t va, vm_size_t size)
3210 {
3211         vm_offset_t base, offset;
3212
3213         base = trunc_page(va);
3214         offset = va & PAGE_MASK;
3215         size = roundup2(offset + size, PAGE_SIZE);
3216
3217         moea64_qremove(base, atop(size));
3218         kva_free(base, size);
3219 }
3220
3221 void
3222 moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
3223 {
3224         struct pvo_entry *pvo;
3225         vm_offset_t lim;
3226         vm_paddr_t pa;
3227         vm_size_t len;
3228
3229         if (__predict_false(pm == NULL))
3230                 pm = &curthread->td_proc->p_vmspace->vm_pmap;
3231
3232         PMAP_LOCK(pm);
3233         while (sz > 0) {
3234                 lim = round_page(va+1);
3235                 len = MIN(lim - va, sz);
3236                 pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
3237                 if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
3238                         pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
3239                         moea64_syncicache(pm, va, pa, len);
3240                 }
3241                 va += len;
3242                 sz -= len;
3243         }
3244         PMAP_UNLOCK(pm);
3245 }
3246
3247 void
3248 moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
3249 {
3250
3251         *va = (void *)(uintptr_t)pa;
3252 }
3253
3254 extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
3255
3256 void
3257 moea64_scan_init()
3258 {
3259         struct pvo_entry *pvo;
3260         vm_offset_t va;
3261         int i;
3262
3263         if (!do_minidump) {
3264                 /* Initialize phys. segments for dumpsys(). */
3265                 memset(&dump_map, 0, sizeof(dump_map));
3266                 mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
3267                 for (i = 0; i < pregions_sz; i++) {
3268                         dump_map[i].pa_start = pregions[i].mr_start;
3269                         dump_map[i].pa_size = pregions[i].mr_size;
3270                 }
3271                 return;
3272         }
3273
3274         /* Virtual segments for minidumps: */
3275         memset(&dump_map, 0, sizeof(dump_map));
3276
3277         /* 1st: kernel .data and .bss. */
3278         dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
3279         dump_map[0].pa_size = round_page((uintptr_t)_end) -
3280             dump_map[0].pa_start;
3281
3282         /* 2nd: msgbuf and tables (see pmap_bootstrap()). */
3283         dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr;
3284         dump_map[1].pa_size = round_page(msgbufp->msg_size);
3285
3286         /* 3rd: kernel VM. */
3287         va = dump_map[1].pa_start + dump_map[1].pa_size;
3288         /* Find start of next chunk (from va). */
3289         while (va < virtual_end) {
3290                 /* Don't dump the buffer cache. */
3291                 if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
3292                         va = kmi.buffer_eva;
3293                         continue;
3294                 }
3295                 pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
3296                 if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD))
3297                         break;
3298                 va += PAGE_SIZE;
3299         }
3300         if (va < virtual_end) {
3301                 dump_map[2].pa_start = va;
3302                 va += PAGE_SIZE;
3303                 /* Find last page in chunk. */
3304                 while (va < virtual_end) {
3305                         /* Don't run into the buffer cache. */
3306                         if (va == kmi.buffer_sva)
3307                                 break;
3308                         pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
3309                         if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD))
3310                                 break;
3311                         va += PAGE_SIZE;
3312                 }
3313                 dump_map[2].pa_size = va - dump_map[2].pa_start;
3314         }
3315 }
3316
3317 #ifdef __powerpc64__
3318
3319 static size_t
3320 moea64_scan_pmap(struct bitset *dump_bitset)
3321 {
3322         struct pvo_entry *pvo;
3323         vm_paddr_t pa, pa_end;
3324         vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp;
3325         uint64_t lpsize;
3326
3327         lpsize = moea64_large_page_size;
3328         kstart = trunc_page((vm_offset_t)_etext);
3329         kend = round_page((vm_offset_t)_end);
3330         kstart_lp = kstart & ~moea64_large_page_mask;
3331         kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask;
3332
3333         CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, "
3334             "kstart_lp=0x%016lx, kend_lp=0x%016lx",
3335             kstart, kend, kstart_lp, kend_lp);
3336
3337         PMAP_LOCK(kernel_pmap);
3338         RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) {
3339                 va = pvo->pvo_vaddr;
3340
3341                 if (va & PVO_DEAD)
3342                         continue;
3343
3344                 /* Skip DMAP (except kernel area) */
3345                 if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) {
3346                         if (va & PVO_LARGE) {
3347                                 pgva = va & ~moea64_large_page_mask;
3348                                 if (pgva < kstart_lp || pgva >= kend_lp)
3349                                         continue;
3350                         } else {
3351                                 pgva = trunc_page(va);
3352                                 if (pgva < kstart || pgva >= kend)
3353                                         continue;
3354                         }
3355                 }
3356
3357                 pa = PVO_PADDR(pvo);
3358
3359                 if (va & PVO_LARGE) {
3360                         pa_end = pa + lpsize;
3361                         for (; pa < pa_end; pa += PAGE_SIZE) {
3362                                 if (vm_phys_is_dumpable(pa))
3363                                         vm_page_dump_add(dump_bitset, pa);
3364                         }
3365                 } else {
3366                         if (vm_phys_is_dumpable(pa))
3367                                 vm_page_dump_add(dump_bitset, pa);
3368                 }
3369         }
3370         PMAP_UNLOCK(kernel_pmap);
3371
3372         return (sizeof(struct lpte) * moea64_pteg_count * 8);
3373 }
3374
3375 static struct dump_context dump_ctx;
3376
3377 static void *
3378 moea64_dump_pmap_init(unsigned blkpgs)
3379 {
3380         dump_ctx.ptex = 0;
3381         dump_ctx.ptex_end = moea64_pteg_count * 8;
3382         dump_ctx.blksz = blkpgs * PAGE_SIZE;
3383         return (&dump_ctx);
3384 }
3385
3386 #else
3387
/*
 * Variant compiled when __powerpc64__ is not defined: the bitset is left
 * untouched and 0 is returned.
 */
static size_t
moea64_scan_pmap(struct bitset *dump_bitset __unused)
{
        return (0);
}
3393
/*
 * Variant compiled when __powerpc64__ is not defined: no dump context is
 * set up, 'blkpgs' is ignored, and NULL is returned.
 */
static void *
moea64_dump_pmap_init(unsigned blkpgs)
{
        void *no_ctx = NULL;

        return (no_ctx);
}
3399
3400 #endif
3401
3402 #ifdef __powerpc64__
3403 static void
3404 moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages)
3405 {
3406
3407         for (; npages > 0; --npages) {
3408                 if (moea64_large_page_size != 0 &&
3409                     (pa & moea64_large_page_mask) == 0 &&
3410                     (va & moea64_large_page_mask) == 0 &&
3411                     npages >= (moea64_large_page_size >> PAGE_SHIFT)) {
3412                         PMAP_LOCK(kernel_pmap);
3413                         moea64_kenter_large(va, pa, 0, 0);
3414                         PMAP_UNLOCK(kernel_pmap);
3415                         pa += moea64_large_page_size;
3416                         va += moea64_large_page_size;
3417                         npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1;
3418                 } else {
3419                         moea64_kenter(va, pa);
3420                         pa += PAGE_SIZE;
3421                         va += PAGE_SIZE;
3422                 }
3423         }
3424 }
3425
3426 static void
3427 moea64_page_array_startup(long pages)
3428 {
3429         long dom_pages[MAXMEMDOM];
3430         vm_paddr_t pa;
3431         vm_offset_t va, vm_page_base;
3432         vm_size_t needed, size;
3433         int domain;
3434         int i;
3435
3436         vm_page_base = 0xd000000000000000ULL;
3437
3438         /* Short-circuit single-domain systems. */
3439         if (vm_ndomains == 1) {
3440                 size = round_page(pages * sizeof(struct vm_page));
3441                 pa = vm_phys_early_alloc(0, size);
3442                 vm_page_base = moea64_map(&vm_page_base,
3443                     pa, pa + size, VM_PROT_READ | VM_PROT_WRITE);
3444                 vm_page_array_size = pages;
3445                 vm_page_array = (vm_page_t)vm_page_base;
3446                 return;
3447         }
3448
3449         for (i = 0; i < MAXMEMDOM; i++)
3450                 dom_pages[i] = 0;
3451
3452         /* Now get the number of pages required per domain. */
3453         for (i = 0; i < vm_phys_nsegs; i++) {
3454                 domain = vm_phys_segs[i].domain;
3455                 KASSERT(domain < MAXMEMDOM,
3456                     ("Invalid vm_phys_segs NUMA domain %d!\n", domain));
3457                 /* Get size of vm_page_array needed for this segment. */
3458                 size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start);
3459                 dom_pages[domain] += size;
3460         }
3461
3462         for (i = 0; phys_avail[i + 1] != 0; i+= 2) {
3463                 domain = vm_phys_domain(phys_avail[i]);
3464                 KASSERT(domain < MAXMEMDOM,
3465                     ("Invalid phys_avail NUMA domain %d!\n", domain));
3466                 size = btoc(phys_avail[i + 1] - phys_avail[i]);
3467                 dom_pages[domain] += size;
3468         }
3469
3470         /*
3471          * Map in chunks that can get us all 16MB pages.  There will be some
3472          * overlap between domains, but that's acceptable for now.
3473          */
3474         vm_page_array_size = 0;
3475         va = vm_page_base;
3476         for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) {
3477                 if (dom_pages[i] == 0)
3478                         continue;
3479                 size = ulmin(pages - vm_page_array_size, dom_pages[i]);
3480                 size = round_page(size * sizeof(struct vm_page));
3481                 needed = size;
3482                 size = roundup2(size, moea64_large_page_size);
3483                 pa = vm_phys_early_alloc(i, size);
3484                 vm_page_array_size += size / sizeof(struct vm_page);
3485                 moea64_map_range(va, pa, size >> PAGE_SHIFT);
3486                 /* Scoot up domain 0, to reduce the domain page overlap. */
3487                 if (i == 0)
3488                         vm_page_base += size - needed;
3489                 va += size;
3490         }
3491         vm_page_array = (vm_page_t)vm_page_base;
3492         vm_page_array_size = pages;
3493 }
3494 #endif
3495
/*
 * Do-nothing method that always returns 0.  It is the fallback handed to
 * DEFINE_OEA64_IFUNC for operations without a specific default.
 */
static int64_t
moea64_null_method(void)
{
        const int64_t nothing = 0;

        return (nothing);
}
3501
/*
 * Default PTE replacement: remove the PTE backing pvo and insert it again.
 *
 * The flags argument is not consulted.  Returns whatever moea64_pte_unset()
 * returned; the result of moea64_pte_insert() is ignored.
 */
static int64_t
moea64_pte_replace_default(struct pvo_entry *pvo, int flags)
{
        int64_t old_refchg = moea64_pte_unset(pvo);

        moea64_pte_insert(pvo);
        return (old_refchg);
}
3511
/* Table of PTE-level method overrides consulted by the resolvers below. */
struct moea64_funcs *moea64_ops;

/*
 * Define an ifunc named moea64_<func>.  Its resolver picks moea64_ops-><func>
 * when moea64_ops is set and that slot is non-NULL, and otherwise falls back
 * to def cast to the moea64_<func>_t function pointer type.
 */
#define DEFINE_OEA64_IFUNC(ret, func, args, def)                \
        DEFINE_IFUNC(, ret, moea64_##func, args) {              \
                moea64_##func##_t impl = NULL;                  \
                if (moea64_ops != NULL)                         \
                        impl = moea64_ops->func;                \
                if (impl == NULL)                               \
                        impl = (moea64_##func##_t)def;          \
                return (impl);                                  \
        }
3522
3523 void
3524 moea64_install(void)
3525 {
3526 #ifdef __powerpc64__
3527         if (hw_direct_map == -1) {
3528                 moea64_probe_large_page();
3529
3530                 /* Use a direct map if we have large page support */
3531                 if (moea64_large_page_size > 0)
3532                         hw_direct_map = 1;
3533                 else
3534                         hw_direct_map = 0;
3535         }
3536 #endif
3537
3538         /*
3539          * Default to non-DMAP, and switch over to DMAP functions once we know
3540          * we have DMAP.
3541          */
3542         if (hw_direct_map) {
3543                 moea64_methods.quick_enter_page = moea64_quick_enter_page_dmap;
3544                 moea64_methods.quick_remove_page = NULL;
3545                 moea64_methods.copy_page = moea64_copy_page_dmap;
3546                 moea64_methods.zero_page = moea64_zero_page_dmap;
3547                 moea64_methods.copy_pages = moea64_copy_pages_dmap;
3548         }
3549 }
3550
3551 DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int),
3552     moea64_pte_replace_default)
3553 DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method)
3554 DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method)
3555 DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
3556     moea64_null_method)
3557 DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)
3558 DEFINE_OEA64_IFUNC(int64_t, pte_insert_sp, (struct pvo_entry *), moea64_null_method)
3559 DEFINE_OEA64_IFUNC(int64_t, pte_unset_sp, (struct pvo_entry *), moea64_null_method)
3560 DEFINE_OEA64_IFUNC(int64_t, pte_replace_sp, (struct pvo_entry *), moea64_null_method)
3561
3562 /* Superpage functions */
3563
3564 /* MMU interface */
3565
3566 static bool
3567 moea64_ps_enabled(pmap_t pmap)
3568 {
3569         return (superpages_enabled);
3570 }
3571
3572 static void
3573 moea64_align_superpage(vm_object_t object, vm_ooffset_t offset,
3574     vm_offset_t *addr, vm_size_t size)
3575 {
3576         vm_offset_t sp_offset;
3577
3578         if (size < HPT_SP_SIZE)
3579                 return;
3580
3581         CTR4(KTR_PMAP, "%s: offs=%#jx, addr=%p, size=%#jx",
3582             __func__, (uintmax_t)offset, addr, (uintmax_t)size);
3583
3584         if (object != NULL && (object->flags & OBJ_COLORED) != 0)
3585                 offset += ptoa(object->pg_color);
3586         sp_offset = offset & HPT_SP_MASK;
3587         if (size - ((HPT_SP_SIZE - sp_offset) & HPT_SP_MASK) < HPT_SP_SIZE ||
3588             (*addr & HPT_SP_MASK) == sp_offset)
3589                 return;
3590         if ((*addr & HPT_SP_MASK) < sp_offset)
3591                 *addr = (*addr & ~HPT_SP_MASK) + sp_offset;
3592         else
3593                 *addr = ((*addr + HPT_SP_MASK) & ~HPT_SP_MASK) + sp_offset;
3594 }
3595
3596 /* Helpers */
3597
3598 static __inline void
3599 moea64_pvo_cleanup(struct pvo_dlist *tofree)
3600 {
3601         struct pvo_entry *pvo;
3602
3603         /* clean up */
3604         while (!SLIST_EMPTY(tofree)) {
3605                 pvo = SLIST_FIRST(tofree);
3606                 SLIST_REMOVE_HEAD(tofree, pvo_dlink);
3607                 if (pvo->pvo_vaddr & PVO_DEAD)
3608                         moea64_pvo_remove_from_page(pvo);
3609                 free_pvo_entry(pvo);
3610         }
3611 }
3612
3613 static __inline uint16_t
3614 pvo_to_vmpage_flags(struct pvo_entry *pvo)
3615 {
3616         uint16_t flags;
3617
3618         flags = 0;
3619         if ((pvo->pvo_pte.prot & VM_PROT_WRITE) != 0)
3620                 flags |= PGA_WRITEABLE;
3621         if ((pvo->pvo_pte.prot & VM_PROT_EXECUTE) != 0)
3622                 flags |= PGA_EXECUTABLE;
3623
3624         return (flags);
3625 }
3626
3627 /*
3628  * Check if the given pvo and its superpage are in sva-eva range.
3629  */
3630 static __inline bool
3631 moea64_sp_pvo_in_range(struct pvo_entry *pvo, vm_offset_t sva, vm_offset_t eva)
3632 {
3633         vm_offset_t spva;
3634
3635         spva = PVO_VADDR(pvo) & ~HPT_SP_MASK;
3636         if (spva >= sva && spva + HPT_SP_SIZE <= eva) {
3637                 /*
3638                  * Because this function is intended to be called from loops
3639                  * that iterate over ordered pvo entries, if the condition
3640                  * above is true then the pvo must be the first of its
3641                  * superpage.
3642                  */
3643                 KASSERT(PVO_VADDR(pvo) == spva,
3644                     ("%s: unexpected unaligned superpage pvo", __func__));
3645                 return (true);
3646         }
3647         return (false);
3648 }
3649
3650 /*
3651  * Update vm about the REF/CHG bits if the superpage is managed and
3652  * has (or had) write access.
3653  */
3654 static void
3655 moea64_sp_refchg_process(struct pvo_entry *sp, vm_page_t m,
3656     int64_t sp_refchg, vm_prot_t prot)
3657 {
3658         vm_page_t m_end;
3659         int64_t refchg;
3660
3661         if ((sp->pvo_vaddr & PVO_MANAGED) != 0 && (prot & VM_PROT_WRITE) != 0) {
3662                 for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++) {
3663                         refchg = sp_refchg |
3664                             atomic_readandclear_32(&m->md.mdpg_attrs);
3665                         if (refchg & LPTE_CHG)
3666                                 vm_page_dirty(m);
3667                         if (refchg & LPTE_REF)
3668                                 vm_page_aflag_set(m, PGA_REFERENCED);
3669                 }
3670         }
3671 }
3672
3673 /* Superpage ops */
3674
3675 static int
3676 moea64_sp_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
3677     vm_prot_t prot, u_int flags, int8_t psind)
3678 {
3679         struct pvo_entry *pvo, **pvos;
3680         struct pvo_head *pvo_head;
3681         vm_offset_t sva;
3682         vm_page_t sm;
3683         vm_paddr_t pa, spa;
3684         bool sync;
3685         struct pvo_dlist tofree;
3686         int error __diagused, i;
3687         uint16_t aflags;
3688
3689         KASSERT((va & HPT_SP_MASK) == 0, ("%s: va %#jx unaligned",
3690             __func__, (uintmax_t)va));
3691         KASSERT(psind == 1, ("%s: invalid psind: %d", __func__, psind));
3692         KASSERT(m->psind == 1, ("%s: invalid m->psind: %d",
3693             __func__, m->psind));
3694         KASSERT(pmap != kernel_pmap,
3695             ("%s: function called with kernel pmap", __func__));
3696
3697         CTR5(KTR_PMAP, "%s: va=%#jx, pa=%#jx, prot=%#x, flags=%#x, psind=1",
3698             __func__, (uintmax_t)va, (uintmax_t)VM_PAGE_TO_PHYS(m),
3699             prot, flags);
3700
3701         SLIST_INIT(&tofree);
3702
3703         sva = va;
3704         sm = m;
3705         spa = pa = VM_PAGE_TO_PHYS(sm);
3706
3707         /* Try to allocate all PVOs first, to make failure handling easier. */
3708         pvos = malloc(HPT_SP_PAGES * sizeof(struct pvo_entry *), M_TEMP,
3709             M_NOWAIT);
3710         if (pvos == NULL) {
3711                 CTR1(KTR_PMAP, "%s: failed to alloc pvo array", __func__);
3712                 return (KERN_RESOURCE_SHORTAGE);
3713         }
3714
3715         for (i = 0; i < HPT_SP_PAGES; i++) {
3716                 pvos[i] = alloc_pvo_entry(0);
3717                 if (pvos[i] == NULL) {
3718                         CTR1(KTR_PMAP, "%s: failed to alloc pvo", __func__);
3719                         for (i = i - 1; i >= 0; i--)
3720                                 free_pvo_entry(pvos[i]);
3721                         free(pvos, M_TEMP);
3722                         return (KERN_RESOURCE_SHORTAGE);
3723                 }
3724         }
3725
3726         SP_PV_LOCK_ALIGNED(spa);
3727         PMAP_LOCK(pmap);
3728
3729         /* Note: moea64_remove_locked() also clears cached REF/CHG bits. */
3730         moea64_remove_locked(pmap, va, va + HPT_SP_SIZE, &tofree);
3731
3732         /* Enter pages */
3733         for (i = 0; i < HPT_SP_PAGES;
3734             i++, va += PAGE_SIZE, pa += PAGE_SIZE, m++) {
3735                 pvo = pvos[i];
3736
3737                 pvo->pvo_pte.prot = prot;
3738                 pvo->pvo_pte.pa = (pa & ~HPT_SP_MASK) | LPTE_LP_4K_16M |
3739                     moea64_calc_wimg(pa, pmap_page_get_memattr(m));
3740
3741                 if ((flags & PMAP_ENTER_WIRED) != 0)
3742                         pvo->pvo_vaddr |= PVO_WIRED;
3743                 pvo->pvo_vaddr |= PVO_LARGE;
3744
3745                 if ((m->oflags & VPO_UNMANAGED) != 0)
3746                         pvo_head = NULL;
3747                 else {
3748                         pvo_head = &m->md.mdpg_pvoh;
3749                         pvo->pvo_vaddr |= PVO_MANAGED;
3750                 }
3751
3752                 init_pvo_entry(pvo, pmap, va);
3753
3754                 error = moea64_pvo_enter(pvo, pvo_head, NULL);
3755                 /*
3756                  * All superpage PVOs were previously removed, so no errors
3757                  * should occur while inserting the new ones.
3758                  */
3759                 KASSERT(error == 0, ("%s: unexpected error "
3760                             "when inserting superpage PVO: %d",
3761                             __func__, error));
3762         }
3763
3764         PMAP_UNLOCK(pmap);
3765         SP_PV_UNLOCK_ALIGNED(spa);
3766
3767         sync = (sm->a.flags & PGA_EXECUTABLE) == 0;
3768         /* Note: moea64_pvo_cleanup() also clears page prot. flags. */
3769         moea64_pvo_cleanup(&tofree);
3770         pvo = pvos[0];
3771
3772         /* Set vm page flags */
3773         aflags = pvo_to_vmpage_flags(pvo);
3774         if (aflags != 0)
3775                 for (m = sm; m < &sm[HPT_SP_PAGES]; m++)
3776                         vm_page_aflag_set(m, aflags);
3777
3778         /*
3779          * Flush the page from the instruction cache if this page is
3780          * mapped executable and cacheable.
3781          */
3782         if (sync && (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0)
3783                 moea64_syncicache(pmap, sva, spa, HPT_SP_SIZE);
3784
3785         atomic_add_long(&sp_mappings, 1);
3786         CTR3(KTR_PMAP, "%s: SP success for va %#jx in pmap %p",
3787             __func__, (uintmax_t)sva, pmap);
3788
3789         free(pvos, M_TEMP);
3790         return (KERN_SUCCESS);
3791 }
3792
3793 static void
3794 moea64_sp_promote(pmap_t pmap, vm_offset_t va, vm_page_t m)
3795 {
3796         struct pvo_entry *first, *pvo;
3797         vm_paddr_t pa, pa_end;
3798         vm_offset_t sva, va_end;
3799         int64_t sp_refchg;
3800
3801         /* This CTR may generate a lot of output. */
3802         /* CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)va); */
3803
3804         va &= ~HPT_SP_MASK;
3805         sva = va;
3806         /* Get superpage */
3807         pa = VM_PAGE_TO_PHYS(m) & ~HPT_SP_MASK;
3808         m = PHYS_TO_VM_PAGE(pa);
3809
3810         PMAP_LOCK(pmap);
3811
3812         /*
3813          * Check if all pages meet promotion criteria.
3814          *
3815          * XXX In some cases the loop below may be executed for each or most
3816          * of the entered pages of a superpage, which can be expensive
3817          * (although it was not profiled) and need some optimization.
3818          *
3819          * Some cases where this seems to happen are:
3820          * - When a superpage is first entered read-only and later becomes
3821          *   read-write.
3822          * - When some of the superpage's virtual addresses map to previously
3823          *   wired/cached pages while others map to pages allocated from a
3824          *   different physical address range. A common scenario where this
3825          *   happens is when mmap'ing a file that is already present in FS
3826          *   block cache and doesn't fill a superpage.
3827          */
3828         first = pvo = moea64_pvo_find_va(pmap, sva);
3829         for (pa_end = pa + HPT_SP_SIZE;
3830             pa < pa_end; pa += PAGE_SIZE, va += PAGE_SIZE) {
3831                 if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
3832                         CTR3(KTR_PMAP,
3833                             "%s: NULL or dead PVO: pmap=%p, va=%#jx",
3834                             __func__, pmap, (uintmax_t)va);
3835                         goto error;
3836                 }
3837                 if (PVO_PADDR(pvo) != pa) {
3838                         CTR5(KTR_PMAP, "%s: PAs don't match: "
3839                             "pmap=%p, va=%#jx, pvo_pa=%#jx, exp_pa=%#jx",
3840                             __func__, pmap, (uintmax_t)va,
3841                             (uintmax_t)PVO_PADDR(pvo), (uintmax_t)pa);
3842                         atomic_add_long(&sp_p_fail_pa, 1);
3843                         goto error;
3844                 }
3845                 if ((first->pvo_vaddr & PVO_FLAGS_PROMOTE) !=
3846                     (pvo->pvo_vaddr & PVO_FLAGS_PROMOTE)) {
3847                         CTR5(KTR_PMAP, "%s: PVO flags don't match: "
3848                             "pmap=%p, va=%#jx, pvo_flags=%#jx, exp_flags=%#jx",
3849                             __func__, pmap, (uintmax_t)va,
3850                             (uintmax_t)(pvo->pvo_vaddr & PVO_FLAGS_PROMOTE),
3851                             (uintmax_t)(first->pvo_vaddr & PVO_FLAGS_PROMOTE));
3852                         atomic_add_long(&sp_p_fail_flags, 1);
3853                         goto error;
3854                 }
3855                 if (first->pvo_pte.prot != pvo->pvo_pte.prot) {
3856                         CTR5(KTR_PMAP, "%s: PVO protections don't match: "
3857                             "pmap=%p, va=%#jx, pvo_prot=%#x, exp_prot=%#x",
3858                             __func__, pmap, (uintmax_t)va,
3859                             pvo->pvo_pte.prot, first->pvo_pte.prot);
3860                         atomic_add_long(&sp_p_fail_prot, 1);
3861                         goto error;
3862                 }
3863                 if ((first->pvo_pte.pa & LPTE_WIMG) !=
3864                     (pvo->pvo_pte.pa & LPTE_WIMG)) {
3865                         CTR5(KTR_PMAP, "%s: WIMG bits don't match: "
3866                             "pmap=%p, va=%#jx, pvo_wimg=%#jx, exp_wimg=%#jx",
3867                             __func__, pmap, (uintmax_t)va,
3868                             (uintmax_t)(pvo->pvo_pte.pa & LPTE_WIMG),
3869                             (uintmax_t)(first->pvo_pte.pa & LPTE_WIMG));
3870                         atomic_add_long(&sp_p_fail_wimg, 1);
3871                         goto error;
3872                 }
3873
3874                 pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo);
3875         }
3876
3877         /* All OK, promote. */
3878
3879         /*
3880          * Handle superpage REF/CHG bits. If REF or CHG is set in
3881          * any page, then it must be set in the superpage.
3882          *
3883          * Instead of querying each page, we take advantage of two facts:
3884          * 1- If a page is being promoted, it was referenced.
3885          * 2- If promoted pages are writable, they were modified.
3886          */
3887         sp_refchg = LPTE_REF |
3888             ((first->pvo_pte.prot & VM_PROT_WRITE) != 0 ? LPTE_CHG : 0);
3889
3890         /* Promote pages */
3891
3892         for (pvo = first, va_end = PVO_VADDR(pvo) + HPT_SP_SIZE;
3893             pvo != NULL && PVO_VADDR(pvo) < va_end;
3894             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
3895                 pvo->pvo_pte.pa &= ADDR_POFF | ~HPT_SP_MASK;
3896                 pvo->pvo_pte.pa |= LPTE_LP_4K_16M;
3897                 pvo->pvo_vaddr |= PVO_LARGE;
3898         }
3899         moea64_pte_replace_sp(first);
3900
3901         /* Send REF/CHG bits to VM */
3902         moea64_sp_refchg_process(first, m, sp_refchg, first->pvo_pte.prot);
3903
3904         /* Use first page to cache REF/CHG bits */
3905         atomic_set_32(&m->md.mdpg_attrs, sp_refchg | MDPG_ATTR_SP);
3906
3907         PMAP_UNLOCK(pmap);
3908
3909         atomic_add_long(&sp_mappings, 1);
3910         atomic_add_long(&sp_promotions, 1);
3911         CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
3912             __func__, (uintmax_t)sva, pmap);
3913         return;
3914
3915 error:
3916         atomic_add_long(&sp_p_failures, 1);
3917         PMAP_UNLOCK(pmap);
3918 }
3919
3920 static void
3921 moea64_sp_demote_aligned(struct pvo_entry *sp)
3922 {
3923         struct pvo_entry *pvo;
3924         vm_offset_t va, va_end;
3925         vm_paddr_t pa;
3926         vm_page_t m;
3927         pmap_t pmap __diagused;
3928         int64_t refchg;
3929
3930         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
3931
3932         pmap = sp->pvo_pmap;
3933         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3934
3935         pvo = sp;
3936
3937         /* Demote pages */
3938
3939         va = PVO_VADDR(pvo);
3940         pa = PVO_PADDR(pvo);
3941         m = PHYS_TO_VM_PAGE(pa);
3942
3943         for (pvo = sp, va_end = va + HPT_SP_SIZE;
3944             pvo != NULL && PVO_VADDR(pvo) < va_end;
3945             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo),
3946             va += PAGE_SIZE, pa += PAGE_SIZE) {
3947                 KASSERT(pvo && PVO_VADDR(pvo) == va,
3948                     ("%s: missing PVO for va %#jx", __func__, (uintmax_t)va));
3949
3950                 pvo->pvo_vaddr &= ~PVO_LARGE;
3951                 pvo->pvo_pte.pa &= ~LPTE_RPGN;
3952                 pvo->pvo_pte.pa |= pa;
3953
3954         }
3955         refchg = moea64_pte_replace_sp(sp);
3956
3957         /*
3958          * Clear SP flag
3959          *
3960          * XXX It is possible that another pmap has this page mapped as
3961          *     part of a superpage, but as the SP flag is used only for
3962          *     caching SP REF/CHG bits, that will be queried if not set
3963          *     in cache, it should be ok to clear it here.
3964          */
3965         atomic_clear_32(&m->md.mdpg_attrs, MDPG_ATTR_SP);
3966
3967         /*
3968          * Handle superpage REF/CHG bits. A bit set in the superpage
3969          * means all pages should consider it set.
3970          */
3971         moea64_sp_refchg_process(sp, m, refchg, sp->pvo_pte.prot);
3972
3973         atomic_add_long(&sp_demotions, 1);
3974         CTR3(KTR_PMAP, "%s: success for va %#jx in pmap %p",
3975             __func__, (uintmax_t)PVO_VADDR(sp), pmap);
3976 }
3977
3978 static void
3979 moea64_sp_demote(struct pvo_entry *pvo)
3980 {
3981         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
3982
3983         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
3984                 pvo = moea64_pvo_find_va(pvo->pvo_pmap,
3985                     PVO_VADDR(pvo) & ~HPT_SP_MASK);
3986                 KASSERT(pvo != NULL, ("%s: missing PVO for va %#jx",
3987                      __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
3988         }
3989         moea64_sp_demote_aligned(pvo);
3990 }
3991
3992 static struct pvo_entry *
3993 moea64_sp_unwire(struct pvo_entry *sp)
3994 {
3995         struct pvo_entry *pvo, *prev;
3996         vm_offset_t eva;
3997         pmap_t pm;
3998         int64_t ret, refchg;
3999
4000         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
4001
4002         pm = sp->pvo_pmap;
4003         PMAP_LOCK_ASSERT(pm, MA_OWNED);
4004
4005         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4006         refchg = 0;
4007         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4008             prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
4009                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
4010                         panic("%s: pvo %p is missing PVO_WIRED",
4011                             __func__, pvo);
4012                 pvo->pvo_vaddr &= ~PVO_WIRED;
4013
4014                 ret = moea64_pte_replace(pvo, 0 /* No invalidation */);
4015                 if (ret < 0)
4016                         refchg |= LPTE_CHG;
4017                 else
4018                         refchg |= ret;
4019
4020                 pm->pm_stats.wired_count--;
4021         }
4022
4023         /* Send REF/CHG bits to VM */
4024         moea64_sp_refchg_process(sp, PHYS_TO_VM_PAGE(PVO_PADDR(sp)),
4025             refchg, sp->pvo_pte.prot);
4026
4027         return (prev);
4028 }
4029
4030 static struct pvo_entry *
4031 moea64_sp_protect(struct pvo_entry *sp, vm_prot_t prot)
4032 {
4033         struct pvo_entry *pvo, *prev;
4034         vm_offset_t eva;
4035         pmap_t pm;
4036         vm_page_t m, m_end;
4037         int64_t ret, refchg;
4038         vm_prot_t oldprot;
4039
4040         CTR3(KTR_PMAP, "%s: va=%#jx, prot=%x",
4041             __func__, (uintmax_t)PVO_VADDR(sp), prot);
4042
4043         pm = sp->pvo_pmap;
4044         PMAP_LOCK_ASSERT(pm, MA_OWNED);
4045
4046         oldprot = sp->pvo_pte.prot;
4047         m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4048         KASSERT(m != NULL, ("%s: missing vm page for pa %#jx",
4049             __func__, (uintmax_t)PVO_PADDR(sp)));
4050         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4051         refchg = 0;
4052
4053         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4054             prev = pvo, pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
4055                 pvo->pvo_pte.prot = prot;
4056                 /*
4057                  * If the PVO is in the page table, update mapping
4058                  */
4059                 ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
4060                 if (ret < 0)
4061                         refchg |= LPTE_CHG;
4062                 else
4063                         refchg |= ret;
4064         }
4065
4066         /* Send REF/CHG bits to VM */
4067         moea64_sp_refchg_process(sp, m, refchg, oldprot);
4068
4069         /* Handle pages that became executable */
4070         if ((m->a.flags & PGA_EXECUTABLE) == 0 &&
4071             (sp->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
4072                 if ((m->oflags & VPO_UNMANAGED) == 0)
4073                         for (m_end = &m[HPT_SP_PAGES]; m < m_end; m++)
4074                                 vm_page_aflag_set(m, PGA_EXECUTABLE);
4075                 moea64_syncicache(pm, PVO_VADDR(sp), PVO_PADDR(sp),
4076                     HPT_SP_SIZE);
4077         }
4078
4079         return (prev);
4080 }
4081
4082 static struct pvo_entry *
4083 moea64_sp_remove(struct pvo_entry *sp, struct pvo_dlist *tofree)
4084 {
4085         struct pvo_entry *pvo, *tpvo;
4086         vm_offset_t eva;
4087         pmap_t pm __diagused;
4088
4089         CTR2(KTR_PMAP, "%s: va=%#jx", __func__, (uintmax_t)PVO_VADDR(sp));
4090
4091         pm = sp->pvo_pmap;
4092         PMAP_LOCK_ASSERT(pm, MA_OWNED);
4093
4094         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4095         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
4096                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
4097
4098                 /*
4099                  * For locking reasons, remove this from the page table and
4100                  * pmap, but save delinking from the vm_page for a second
4101                  * pass
4102                  */
4103                 moea64_pvo_remove_from_pmap(pvo);
4104                 SLIST_INSERT_HEAD(tofree, pvo, pvo_dlink);
4105         }
4106
4107         /*
4108          * Clear SP bit
4109          *
4110          * XXX See comment in moea64_sp_demote_aligned() for why it's
4111          *     ok to always clear the SP bit on remove/demote.
4112          */
4113         atomic_clear_32(&PHYS_TO_VM_PAGE(PVO_PADDR(sp))->md.mdpg_attrs,
4114             MDPG_ATTR_SP);
4115
4116         return (tpvo);
4117 }
4118
4119 static int64_t
4120 moea64_sp_query_locked(struct pvo_entry *pvo, uint64_t ptebit)
4121 {
4122         int64_t refchg, ret;
4123         vm_offset_t eva;
4124         vm_page_t m;
4125         pmap_t pmap;
4126         struct pvo_entry *sp;
4127
4128         pmap = pvo->pvo_pmap;
4129         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4130
4131         /* Get first SP PVO */
4132         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
4133                 sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
4134                 KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
4135                      __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
4136         } else
4137                 sp = pvo;
4138         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4139
4140         refchg = 0;
4141         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4142             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
4143                 ret = moea64_pte_synch(pvo);
4144                 if (ret > 0) {
4145                         refchg |= ret & (LPTE_CHG | LPTE_REF);
4146                         if ((refchg & ptebit) != 0)
4147                                 break;
4148                 }
4149         }
4150
4151         /* Save results */
4152         if (refchg != 0) {
4153                 m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4154                 atomic_set_32(&m->md.mdpg_attrs, refchg | MDPG_ATTR_SP);
4155         }
4156
4157         return (refchg);
4158 }
4159
4160 static int64_t
4161 moea64_sp_query(struct pvo_entry *pvo, uint64_t ptebit)
4162 {
4163         int64_t refchg;
4164         pmap_t pmap;
4165
4166         pmap = pvo->pvo_pmap;
4167         PMAP_LOCK(pmap);
4168
4169         /*
4170          * Check if SP was demoted/removed before pmap lock was acquired.
4171          */
4172         if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4173                 CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4174                     __func__, (uintmax_t)PVO_PADDR(pvo));
4175                 PMAP_UNLOCK(pmap);
4176                 return (-1);
4177         }
4178
4179         refchg = moea64_sp_query_locked(pvo, ptebit);
4180         PMAP_UNLOCK(pmap);
4181
4182         CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
4183             __func__, (uintmax_t)PVO_VADDR(pvo),
4184             (uintmax_t)PVO_PADDR(pvo), (uintmax_t)refchg);
4185
4186         return (refchg);
4187 }
4188
4189 static int64_t
4190 moea64_sp_pvo_clear(struct pvo_entry *pvo, uint64_t ptebit)
4191 {
4192         int64_t refchg, ret;
4193         pmap_t pmap;
4194         struct pvo_entry *sp;
4195         vm_offset_t eva;
4196         vm_page_t m;
4197
4198         pmap = pvo->pvo_pmap;
4199         PMAP_LOCK(pmap);
4200
4201         /*
4202          * Check if SP was demoted/removed before pmap lock was acquired.
4203          */
4204         if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4205                 CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4206                     __func__, (uintmax_t)PVO_PADDR(pvo));
4207                 PMAP_UNLOCK(pmap);
4208                 return (-1);
4209         }
4210
4211         /* Get first SP PVO */
4212         if ((PVO_VADDR(pvo) & HPT_SP_MASK) != 0) {
4213                 sp = moea64_pvo_find_va(pmap, PVO_VADDR(pvo) & ~HPT_SP_MASK);
4214                 KASSERT(sp != NULL, ("%s: missing PVO for va %#jx",
4215                      __func__, (uintmax_t)(PVO_VADDR(pvo) & ~HPT_SP_MASK)));
4216         } else
4217                 sp = pvo;
4218         eva = PVO_VADDR(sp) + HPT_SP_SIZE;
4219
4220         refchg = 0;
4221         for (pvo = sp; pvo != NULL && PVO_VADDR(pvo) < eva;
4222             pvo = RB_NEXT(pvo_tree, &pmap->pmap_pvo, pvo)) {
4223                 ret = moea64_pte_clear(pvo, ptebit);
4224                 if (ret > 0)
4225                         refchg |= ret & (LPTE_CHG | LPTE_REF);
4226         }
4227
4228         m = PHYS_TO_VM_PAGE(PVO_PADDR(sp));
4229         atomic_clear_32(&m->md.mdpg_attrs, ptebit);
4230         PMAP_UNLOCK(pmap);
4231
4232         CTR4(KTR_PMAP, "%s: va=%#jx, pa=%#jx: refchg=%#jx",
4233             __func__, (uintmax_t)PVO_VADDR(sp),
4234             (uintmax_t)PVO_PADDR(sp), (uintmax_t)refchg);
4235
4236         return (refchg);
4237 }
4238
4239 static int64_t
4240 moea64_sp_clear(struct pvo_entry *pvo, vm_page_t m, uint64_t ptebit)
4241 {
4242         int64_t count, ret;
4243         pmap_t pmap;
4244
4245         count = 0;
4246         pmap = pvo->pvo_pmap;
4247
4248         /*
4249          * Since this reference bit is shared by 4096 4KB pages, it
4250          * should not be cleared every time it is tested. Apply a
4251          * simple "hash" function on the physical page number, the
4252          * virtual superpage number, and the pmap address to select
4253          * one 4KB page out of the 4096 on which testing the
4254          * reference bit will result in clearing that reference bit.
4255          * This function is designed to avoid the selection of the
4256          * same 4KB page for every 16MB page mapping.
4257          *
4258          * Always leave the reference bit of a wired mapping set, as
4259          * the current state of its reference bit won't affect page
4260          * replacement.
4261          */
4262         if (ptebit == LPTE_REF && (((VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT) ^
4263             (PVO_VADDR(pvo) >> HPT_SP_SHIFT) ^ (uintptr_t)pmap) &
4264             (HPT_SP_PAGES - 1)) == 0 && (pvo->pvo_vaddr & PVO_WIRED) == 0) {
4265                 if ((ret = moea64_sp_pvo_clear(pvo, ptebit)) == -1)
4266                         return (-1);
4267
4268                 if ((ret & ptebit) != 0)
4269                         count++;
4270
4271         /*
4272          * If this page was not selected by the hash function, then assume
4273          * its REF bit was set.
4274          */
4275         } else if (ptebit == LPTE_REF) {
4276                 count++;
4277
4278         /*
4279          * To clear the CHG bit of a single SP page, first it must be demoted.
4280          * But if no CHG bit is set, no bit clear and thus no SP demotion is
4281          * needed.
4282          */
4283         } else {
4284                 CTR4(KTR_PMAP, "%s: ptebit=%#jx, va=%#jx, pa=%#jx",
4285                     __func__, (uintmax_t)ptebit, (uintmax_t)PVO_VADDR(pvo),
4286                     (uintmax_t)PVO_PADDR(pvo));
4287
4288                 PMAP_LOCK(pmap);
4289
4290                 /*
4291                  * Make sure SP wasn't demoted/removed before pmap lock
4292                  * was acquired.
4293                  */
4294                 if (!PVO_IS_SP(pvo) || (pvo->pvo_vaddr & PVO_DEAD) != 0) {
4295                         CTR2(KTR_PMAP, "%s: demoted/removed: pa=%#jx",
4296                             __func__, (uintmax_t)PVO_PADDR(pvo));
4297                         PMAP_UNLOCK(pmap);
4298                         return (-1);
4299                 }
4300
4301                 ret = moea64_sp_query_locked(pvo, ptebit);
4302                 if ((ret & ptebit) != 0)
4303                         count++;
4304                 else {
4305                         PMAP_UNLOCK(pmap);
4306                         return (0);
4307                 }
4308
4309                 moea64_sp_demote(pvo);
4310                 moea64_pte_clear(pvo, ptebit);
4311
4312                 /*
4313                  * Write protect the mapping to a single page so that a
4314                  * subsequent write access may repromote.
4315                  */
4316                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
4317                         moea64_pvo_protect(pmap, pvo,
4318                             pvo->pvo_pte.prot & ~VM_PROT_WRITE);
4319
4320                 PMAP_UNLOCK(pmap);
4321         }
4322
4323         return (count);
4324 }