sys/powerpc/aim/mmu_oea64.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2008-2015 Nathan Whitehorn
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 /*
  33  * Manages physical address maps.
  34  *
  35  * Since the information managed by this module is also stored by the
  36  * logical address mapping module, this module may throw away valid virtual
  37  * to physical mappings at almost any time.  However, invalidations of
  38  * mappings must be done as requested.
  39  *
  40  * In order to cope with hardware architectures which make virtual to
  41  * physical map invalidates expensive, this module may delay invalidate
  42  * reduced protection operations until such time as they are actually
  43  * necessary.  This module is given full information as to which processors
  44  * are currently using which maps, and to when physical maps must be made
  45  * correct.
  46  */
  47
  48 #include "opt_kstack_pages.h"
  49
  50 #include <sys/param.h>
  51 #include <sys/kernel.h>
  52 #include <sys/conf.h>
  53 #include <sys/queue.h>
  54 #include <sys/cpuset.h>
  55 #include <sys/kerneldump.h>
  56 #include <sys/ktr.h>
  57 #include <sys/lock.h>
  58 #include <sys/msgbuf.h>
  59 #include <sys/malloc.h>
  60 #include <sys/mman.h>
  61 #include <sys/mutex.h>
  62 #include <sys/proc.h>
  63 #include <sys/rwlock.h>
  64 #include <sys/sched.h>
  65 #include <sys/sysctl.h>
  66 #include <sys/systm.h>
  67 #include <sys/vmmeter.h>
  68 #include <sys/smp.h>
  69 #include <sys/reboot.h>
  70
  71 #include <sys/kdb.h>
  72
  73 #include <dev/ofw/openfirm.h>
  74
  75 #include <vm/vm.h>
  76 #include <vm/pmap.h>
  77 #include <vm/vm_param.h>
  78 #include <vm/vm_kern.h>
  79 #include <vm/vm_page.h>
  80 #include <vm/vm_phys.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_object.h>
  83 #include <vm/vm_extern.h>
  84 #include <vm/vm_pageout.h>
  85 #include <vm/uma.h>
  86
  87 #include <machine/_inttypes.h>
  88 #include <machine/cpu.h>
  89 #include <machine/ifunc.h>
  90 #include <machine/platform.h>
  91 #include <machine/frame.h>
  92 #include <machine/md_var.h>
  93 #include <machine/psl.h>
  94 #include <machine/bat.h>
  95 #include <machine/hid.h>
  96 #include <machine/pte.h>
  97 #include <machine/sr.h>
  98 #include <machine/trap.h>
  99 #include <machine/mmuvar.h>
 100
 101 #include "mmu_oea64.h"
 102
/*
 * VSID management entry points.  They are declared without `static`, so
 * they have external linkage; their definitions are not in this part of
 * the file.
 */
void moea64_release_vsid(uint64_t vsid);
uintptr_t moea64_get_unique_vsid(void);
 105
 106 #define DISABLE_TRANS(msr)      msr = mfmsr(); mtmsr(msr & ~PSL_DR)
 107 #define ENABLE_TRANS(msr)       mtmsr(msr)
 108
/* Combine `sr` with the low 20 bits of `hash`, placed at bits 4..23. */
#define VSID_MAKE(sr, hash)     ((sr) | (((hash) & 0xfffff) << 4))
/* Extract the 20-bit field stored at bits 4..23 of `vsid`. */
#define VSID_TO_HASH(vsid)      (((vsid) >> 4) & 0xfffff)
/*
 * Low 39 bits; applied to the VSID when init_pvo_entry() computes the PTEG
 * hash.  Not to be confused with VSID_HASHMASK defined further below.
 */
#define VSID_HASH_MASK          0x0000007fffffffffULL

/* Get physical address from PVO. */
#define PVO_PADDR(pvo)          ((pvo)->pvo_pte.pa & LPTE_RPGN)
 115
 116 /*
 117  * Locking semantics:
 118  *
 119  * There are two locks of interest: the page locks and the pmap locks, which
 120  * protect their individual PVO lists and are locked in that order. The contents
 121  * of all PVO entries are protected by the locks of their respective pmaps.
 122  * The pmap of any PVO is guaranteed not to change so long as the PVO is linked
 123  * into any list.
 124  *
 125  */
 126
/* The PV locks form a fixed array of PA_LOCK_COUNT padded mutexes. */
#define PV_LOCK_COUNT   PA_LOCK_COUNT
static struct mtx_padalign pv_lock[PV_LOCK_COUNT];

/*
 * Cheap NUMA-izing of the pv locks, to reduce contention across domains.
 * NUMA domains on POWER9 appear to be indexed as sparse memory spaces, with the
 * index at (N << 45).
 */
#ifdef __powerpc64__
/* Scale the page index by (pa >> 45) + 1 before reducing modulo the count. */
#define PV_LOCK_IDX(pa) ((pa_index(pa) * (((pa) >> 45) + 1)) % PV_LOCK_COUNT)
#else
#define PV_LOCK_IDX(pa) (pa_index(pa) % PV_LOCK_COUNT)
#endif
/* Lock covering physical address `pa`, viewed as a plain struct mtx. */
#define PV_LOCKPTR(pa)  ((struct mtx *)(&pv_lock[PV_LOCK_IDX(pa)]))
#define PV_LOCK(pa)             mtx_lock(PV_LOCKPTR(pa))
#define PV_UNLOCK(pa)           mtx_unlock(PV_LOCKPTR(pa))
#define PV_LOCKASSERT(pa)       mtx_assert(PV_LOCKPTR(pa), MA_OWNED)
/* Same operations, keyed by vm_page_t instead of physical address. */
#define PV_PAGE_LOCK(m)         PV_LOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_UNLOCK(m)       PV_UNLOCK(VM_PAGE_TO_PHYS(m))
#define PV_PAGE_LOCKASSERT(m)   PV_LOCKASSERT(VM_PAGE_TO_PHYS(m))
 147
/*
 * One decoded entry of the Open Firmware "translations" property, as
 * filled in by moea64_add_ofw_mappings().
 */
struct ofw_map {
        cell_t  om_va;          /* virtual address of the mapping */
        cell_t  om_len;         /* length; walked in PAGE_SIZE steps */
        uint64_t om_pa;         /* physical address (one or two cells) */
        cell_t  om_mode;        /* mode cell, stored as read */
};
 154
/* Externally defined symbols; presumably linker-provided image bounds. */
extern unsigned char _etext[];
extern unsigned char _end[];

/*
 * Bounds of the SLB trap handler code; moea64_early_bootstrap() copies
 * this range to EXC_DSE and EXC_ISE.
 */
extern void *slbtrap, *slbtrapend;

/*
 * Map of physical memory regions.
 *
 * regions/pregions (and their *_sz counts) are filled in by mem_regions()
 * in moea64_early_bootstrap().  phys_avail_count counts the start/end
 * pairs stored in phys_avail[].
 */
static struct   mem_region *regions;
static struct   mem_region *pregions;
static struct   numa_mem_region *numa_pregions;
static u_int    phys_avail_count;
static int      regions_sz, pregions_sz, numapregions_sz;

extern void bs_remap_earlyboot(void);
 170
 171 /*
 172  * Lock for the SLB tables.
 173  */
 174 struct mtx      moea64_slb_mutex;
 175
 176 /*
 177  * PTEG data.
 178  */
 179 u_long          moea64_pteg_count;
 180 u_long          moea64_pteg_mask;
 181
 182 /*
 183  * PVO data.
 184  */
 185
 186 uma_zone_t      moea64_pvo_zone; /* zone for pvo entries */
 187
 188 static struct   pvo_entry *moea64_bpvo_pool;
 189 static int      moea64_bpvo_pool_index = 0;
 190 static int      moea64_bpvo_pool_size = 0;
 191 SYSCTL_INT(_machdep, OID_AUTO, moea64_allocated_bpvo_entries, CTLFLAG_RD,
 192     &moea64_bpvo_pool_index, 0, "");
 193
 194 #define BPVO_POOL_SIZE  327680 /* Sensible historical default value */
 195 #define BPVO_POOL_EXPANSION_FACTOR      3
 196 #define VSID_NBPW       (sizeof(u_int32_t) * 8)
 197 #ifdef __powerpc64__
 198 #define NVSIDS          (NPMAPS * 16)
 199 #define VSID_HASHMASK   0xffffffffUL
 200 #else
 201 #define NVSIDS          NPMAPS
 202 #define VSID_HASHMASK   0xfffffUL
 203 #endif
 204 static u_int    moea64_vsid_bitmap[NVSIDS / VSID_NBPW];
 205
 206 static boolean_t moea64_initialized = FALSE;
 207
#ifdef MOEA64_STATS
/*
 * Statistics.
 *
 * Only compiled in when MOEA64_STATS is defined; each counter is exported
 * read-only under the machdep sysctl node.
 */
u_int   moea64_pte_valid = 0;
u_int   moea64_pte_overflow = 0;
u_int   moea64_pvo_entries = 0;
u_int   moea64_pvo_enter_calls = 0;
u_int   moea64_pvo_remove_calls = 0;
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_valid, CTLFLAG_RD,
    &moea64_pte_valid, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pte_overflow, CTLFLAG_RD,
    &moea64_pte_overflow, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_entries, CTLFLAG_RD,
    &moea64_pvo_entries, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_enter_calls, CTLFLAG_RD,
    &moea64_pvo_enter_calls, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, moea64_pvo_remove_calls, CTLFLAG_RD,
    &moea64_pvo_remove_calls, 0, "");
#endif
 228
/* Two scratch page slots (VA and backing PVO) and the mutex guarding them. */
vm_offset_t     moea64_scratchpage_va[2];
struct pvo_entry *moea64_scratchpage_pvo[2];
struct  mtx     moea64_scratchpage_mtx;

/*
 * Large page geometry.  A size of 0 means "not configured";
 * moea64_probe_large_page() then falls back to 16 MB (shift 24) and sets
 * the mask to size - 1.
 */
uint64_t        moea64_large_page_mask = 0;
uint64_t        moea64_large_page_size = 0;
int             moea64_large_page_shift = 0;
 236
/*
 * PVO calls.
 */
static int      moea64_pvo_enter(struct pvo_entry *pvo,
                    struct pvo_head *pvo_head, struct pvo_entry **oldpvo);
static void     moea64_pvo_remove_from_pmap(struct pvo_entry *pvo);
static void     moea64_pvo_remove_from_page(struct pvo_entry *pvo);
static void     moea64_pvo_remove_from_page_locked(
                    struct pvo_entry *pvo, vm_page_t m);
static struct   pvo_entry *moea64_pvo_find_va(pmap_t, vm_offset_t);

/*
 * Utility routines.
 */
static boolean_t        moea64_query_bit(vm_page_t, uint64_t);
static u_int            moea64_clear_bit(vm_page_t, uint64_t);
static void             moea64_kremove(vm_offset_t);
static void             moea64_syncicache(pmap_t pmap, vm_offset_t va,
                            vm_paddr_t pa, vm_size_t sz);
static void             moea64_pmap_init_qpages(void);

/*
 * Kernel MMU interface
 *
 * Forward declarations for the routines installed in the moea64_methods
 * table below.
 */
void moea64_clear_modify(vm_page_t);
void moea64_copy_page(vm_page_t, vm_page_t);
void moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
    vm_page_t *mb, vm_offset_t b_offset, int xfersize);
int moea64_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
    u_int flags, int8_t psind);
void moea64_enter_object(pmap_t, vm_offset_t, vm_offset_t, vm_page_t,
    vm_prot_t);
void moea64_enter_quick(pmap_t, vm_offset_t, vm_page_t, vm_prot_t);
vm_paddr_t moea64_extract(pmap_t, vm_offset_t);
vm_page_t moea64_extract_and_hold(pmap_t, vm_offset_t, vm_prot_t);
void moea64_init(void);
boolean_t moea64_is_modified(vm_page_t);
boolean_t moea64_is_prefaultable(pmap_t, vm_offset_t);
boolean_t moea64_is_referenced(vm_page_t);
int moea64_ts_referenced(vm_page_t);
vm_offset_t moea64_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
boolean_t moea64_page_exists_quick(pmap_t, vm_page_t);
void moea64_page_init(vm_page_t);
int moea64_page_wired_mappings(vm_page_t);
int moea64_pinit(pmap_t);
void moea64_pinit0(pmap_t);
void moea64_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
void moea64_qenter(vm_offset_t, vm_page_t *, int);
void moea64_qremove(vm_offset_t, int);
void moea64_release(pmap_t);
void moea64_remove(pmap_t, vm_offset_t, vm_offset_t);
void moea64_remove_pages(pmap_t);
void moea64_remove_all(vm_page_t);
void moea64_remove_write(vm_page_t);
void moea64_unwire(pmap_t, vm_offset_t, vm_offset_t);
void moea64_zero_page(vm_page_t);
void moea64_zero_page_area(vm_page_t, int, int);
void moea64_activate(struct thread *);
void moea64_deactivate(struct thread *);
void *moea64_mapdev(vm_paddr_t, vm_size_t);
void *moea64_mapdev_attr(vm_paddr_t, vm_size_t, vm_memattr_t);
void moea64_unmapdev(vm_offset_t, vm_size_t);
vm_paddr_t moea64_kextract(vm_offset_t);
void moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void moea64_kenter_attr(vm_offset_t, vm_paddr_t, vm_memattr_t ma);
void moea64_kenter(vm_offset_t, vm_paddr_t);
boolean_t moea64_dev_direct_mapped(vm_paddr_t, vm_size_t);
static void moea64_sync_icache(pmap_t, vm_offset_t, vm_size_t);
void moea64_dumpsys_map(vm_paddr_t pa, size_t sz,
    void **va);
void moea64_scan_init(void);
vm_offset_t moea64_quick_enter_page(vm_page_t m);
void moea64_quick_remove_page(vm_offset_t addr);
boolean_t moea64_page_is_mapped(vm_page_t m);
static int moea64_map_user_ptr(pmap_t pm,
    volatile const void *uaddr, void **kaddr, size_t ulen, size_t *klen);
static int moea64_decode_kernel_ptr(vm_offset_t addr,
    int *is_user, vm_offset_t *decoded_addr);
static size_t moea64_scan_pmap(void);
static void *moea64_dump_pmap_init(unsigned blkpgs);
#ifdef __powerpc64__
/* Only declared (and installed in the method table) on 64-bit builds. */
static void moea64_page_array_startup(long);
#endif
static int moea64_mincore(pmap_t, vm_offset_t, vm_paddr_t *);
 321
/*
 * Method table binding each pmap operation to its moea64 implementation.
 * It is registered under the name "mmu_oea64_base" by MMU_DEF() below.
 */
static struct pmap_funcs moea64_methods = {
        .clear_modify = moea64_clear_modify,
        .copy_page = moea64_copy_page,
        .copy_pages = moea64_copy_pages,
        .enter = moea64_enter,
        .enter_object = moea64_enter_object,
        .enter_quick = moea64_enter_quick,
        .extract = moea64_extract,
        .extract_and_hold = moea64_extract_and_hold,
        .init = moea64_init,
        .is_modified = moea64_is_modified,
        .is_prefaultable = moea64_is_prefaultable,
        .is_referenced = moea64_is_referenced,
        .ts_referenced = moea64_ts_referenced,
        .map =                  moea64_map,
        .mincore = moea64_mincore,
        .page_exists_quick = moea64_page_exists_quick,
        .page_init = moea64_page_init,
        .page_wired_mappings = moea64_page_wired_mappings,
        .pinit = moea64_pinit,
        .pinit0 = moea64_pinit0,
        .protect = moea64_protect,
        .qenter = moea64_qenter,
        .qremove = moea64_qremove,
        .release = moea64_release,
        .remove = moea64_remove,
        .remove_pages = moea64_remove_pages,
        .remove_all =           moea64_remove_all,
        .remove_write = moea64_remove_write,
        .sync_icache = moea64_sync_icache,
        .unwire = moea64_unwire,
        .zero_page =            moea64_zero_page,
        .zero_page_area = moea64_zero_page_area,
        .activate = moea64_activate,
        .deactivate =           moea64_deactivate,
        .page_set_memattr = moea64_page_set_memattr,
        .quick_enter_page =  moea64_quick_enter_page,
        .quick_remove_page =  moea64_quick_remove_page,
        .page_is_mapped = moea64_page_is_mapped,
#ifdef __powerpc64__
        /* Only provided on 64-bit builds. */
        .page_array_startup = moea64_page_array_startup,
#endif

        /* Internal interfaces */
        .mapdev = moea64_mapdev,
        .mapdev_attr = moea64_mapdev_attr,
        .unmapdev = moea64_unmapdev,
        .kextract = moea64_kextract,
        .kenter = moea64_kenter,
        .kenter_attr = moea64_kenter_attr,
        .dev_direct_mapped = moea64_dev_direct_mapped,
        .dumpsys_pa_init = moea64_scan_init,
        .dumpsys_scan_pmap = moea64_scan_pmap,
        .dumpsys_dump_pmap_init =    moea64_dump_pmap_init,
        .dumpsys_map_chunk = moea64_dumpsys_map,
        .map_user_ptr = moea64_map_user_ptr,
        .decode_kernel_ptr =  moea64_decode_kernel_ptr,
};

MMU_DEF(oea64_mmu, "mmu_oea64_base", moea64_methods);
 382
 383 static struct pvo_head *
 384 vm_page_to_pvoh(vm_page_t m)
 385 {
 386
 387         mtx_assert(PV_LOCKPTR(VM_PAGE_TO_PHYS(m)), MA_OWNED);
 388         return (&m->md.mdpg_pvoh);
 389 }
 390
 391 static struct pvo_entry *
 392 alloc_pvo_entry(int bootstrap)
 393 {
 394         struct pvo_entry *pvo;
 395
 396         if (!moea64_initialized || bootstrap) {
 397                 if (moea64_bpvo_pool_index >= moea64_bpvo_pool_size) {
 398                         panic("%s: bpvo pool exhausted, index=%d, size=%d, bytes=%zd."
 399                             "Try setting machdep.moea64_bpvo_pool_size tunable",
 400                             __func__, moea64_bpvo_pool_index,
 401                             moea64_bpvo_pool_size,
 402                             moea64_bpvo_pool_size * sizeof(struct pvo_entry));
 403                 }
 404                 pvo = &moea64_bpvo_pool[
 405                     atomic_fetchadd_int(&moea64_bpvo_pool_index, 1)];
 406                 bzero(pvo, sizeof(*pvo));
 407                 pvo->pvo_vaddr = PVO_BOOTSTRAP;
 408         } else
 409                 pvo = uma_zalloc(moea64_pvo_zone, M_NOWAIT | M_ZERO);
 410
 411         return (pvo);
 412 }
 413
 414 static void
 415 init_pvo_entry(struct pvo_entry *pvo, pmap_t pmap, vm_offset_t va)
 416 {
 417         uint64_t vsid;
 418         uint64_t hash;
 419         int shift;
 420
 421         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 422
 423         pvo->pvo_pmap = pmap;
 424         va &= ~ADDR_POFF;
 425         pvo->pvo_vaddr |= va;
 426         vsid = va_to_vsid(pmap, va);
 427         pvo->pvo_vpn = (uint64_t)((va & ADDR_PIDX) >> ADDR_PIDX_SHFT)
 428             | (vsid << 16);
 429
 430         shift = (pvo->pvo_vaddr & PVO_LARGE) ? moea64_large_page_shift :
 431             ADDR_PIDX_SHFT;
 432         hash = (vsid & VSID_HASH_MASK) ^ (((uint64_t)va & ADDR_PIDX) >> shift);
 433         pvo->pvo_pte.slot = (hash & moea64_pteg_mask) << 3;
 434 }
 435
 436 static void
 437 free_pvo_entry(struct pvo_entry *pvo)
 438 {
 439
 440         if (!(pvo->pvo_vaddr & PVO_BOOTSTRAP))
 441                 uma_zfree(moea64_pvo_zone, pvo);
 442 }
 443
 444 void
 445 moea64_pte_from_pvo(const struct pvo_entry *pvo, struct lpte *lpte)
 446 {
 447
 448         lpte->pte_hi = moea64_pte_vpn_from_pvo_vpn(pvo);
 449         lpte->pte_hi |= LPTE_VALID;
 450
 451         if (pvo->pvo_vaddr & PVO_LARGE)
 452                 lpte->pte_hi |= LPTE_BIG;
 453         if (pvo->pvo_vaddr & PVO_WIRED)
 454                 lpte->pte_hi |= LPTE_WIRED;
 455         if (pvo->pvo_vaddr & PVO_HID)
 456                 lpte->pte_hi |= LPTE_HID;
 457
 458         lpte->pte_lo = pvo->pvo_pte.pa; /* Includes WIMG bits */
 459         if (pvo->pvo_pte.prot & VM_PROT_WRITE)
 460                 lpte->pte_lo |= LPTE_BW;
 461         else
 462                 lpte->pte_lo |= LPTE_BR;
 463
 464         if (!(pvo->pvo_pte.prot & VM_PROT_EXECUTE))
 465                 lpte->pte_lo |= LPTE_NOEXEC;
 466 }
 467
 468 static __inline uint64_t
 469 moea64_calc_wimg(vm_paddr_t pa, vm_memattr_t ma)
 470 {
 471         uint64_t pte_lo;
 472         int i;
 473
 474         if (ma != VM_MEMATTR_DEFAULT) {
 475                 switch (ma) {
 476                 case VM_MEMATTR_UNCACHEABLE:
 477                         return (LPTE_I | LPTE_G);
 478                 case VM_MEMATTR_CACHEABLE:
 479                         return (LPTE_M);
 480                 case VM_MEMATTR_WRITE_COMBINING:
 481                 case VM_MEMATTR_WRITE_BACK:
 482                 case VM_MEMATTR_PREFETCHABLE:
 483                         return (LPTE_I);
 484                 case VM_MEMATTR_WRITE_THROUGH:
 485                         return (LPTE_W | LPTE_M);
 486                 }
 487         }
 488
 489         /*
 490          * Assume the page is cache inhibited and access is guarded unless
 491          * it's in our available memory array.
 492          */
 493         pte_lo = LPTE_I | LPTE_G;
 494         for (i = 0; i < pregions_sz; i++) {
 495                 if ((pa >= pregions[i].mr_start) &&
 496                     (pa < (pregions[i].mr_start + pregions[i].mr_size))) {
 497                         pte_lo &= ~(LPTE_I | LPTE_G);
 498                         pte_lo |= LPTE_M;
 499                         break;
 500                 }
 501         }
 502
 503         return pte_lo;
 504 }
 505
 506 /*
 507  * Quick sort callout for comparing memory regions.
 508  */
 509 static int      om_cmp(const void *a, const void *b);
 510
 511 static int
 512 om_cmp(const void *a, const void *b)
 513 {
 514         const struct    ofw_map *mapa;
 515         const struct    ofw_map *mapb;
 516
 517         mapa = a;
 518         mapb = b;
 519         if (mapa->om_pa < mapb->om_pa)
 520                 return (-1);
 521         else if (mapa->om_pa > mapb->om_pa)
 522                 return (1);
 523         else
 524                 return (0);
 525 }
 526
 527 static void
 528 moea64_add_ofw_mappings(phandle_t mmu, size_t sz)
 529 {
 530         struct ofw_map  translations[sz/(4*sizeof(cell_t))]; /*>= 4 cells per */
 531         pcell_t         acells, trans_cells[sz/sizeof(cell_t)];
 532         struct pvo_entry *pvo;
 533         register_t      msr;
 534         vm_offset_t     off;
 535         vm_paddr_t      pa_base;
 536         int             i, j;
 537
 538         bzero(translations, sz);
 539         OF_getencprop(OF_finddevice("/"), "#address-cells", &acells,
 540             sizeof(acells));
 541         if (OF_getencprop(mmu, "translations", trans_cells, sz) == -1)
 542                 panic("moea64_bootstrap: can't get ofw translations");
 543
 544         CTR0(KTR_PMAP, "moea64_add_ofw_mappings: translations");
 545         sz /= sizeof(cell_t);
 546         for (i = 0, j = 0; i < sz; j++) {
 547                 translations[j].om_va = trans_cells[i++];
 548                 translations[j].om_len = trans_cells[i++];
 549                 translations[j].om_pa = trans_cells[i++];
 550                 if (acells == 2) {
 551                         translations[j].om_pa <<= 32;
 552                         translations[j].om_pa |= trans_cells[i++];
 553                 }
 554                 translations[j].om_mode = trans_cells[i++];
 555         }
 556         KASSERT(i == sz, ("Translations map has incorrect cell count (%d/%zd)",
 557             i, sz));
 558
 559         sz = j;
 560         qsort(translations, sz, sizeof (*translations), om_cmp);
 561
 562         for (i = 0; i < sz; i++) {
 563                 pa_base = translations[i].om_pa;
 564               #ifndef __powerpc64__
 565                 if ((translations[i].om_pa >> 32) != 0)
 566                         panic("OFW translations above 32-bit boundary!");
 567               #endif
 568
 569                 if (pa_base % PAGE_SIZE)
 570                         panic("OFW translation not page-aligned (phys)!");
 571                 if (translations[i].om_va % PAGE_SIZE)
 572                         panic("OFW translation not page-aligned (virt)!");
 573
 574                 CTR3(KTR_PMAP, "translation: pa=%#zx va=%#x len=%#x",
 575                     pa_base, translations[i].om_va, translations[i].om_len);
 576
 577                 /* Now enter the pages for this mapping */
 578
 579                 DISABLE_TRANS(msr);
 580                 for (off = 0; off < translations[i].om_len; off += PAGE_SIZE) {
 581                         /* If this address is direct-mapped, skip remapping */
 582                         if (hw_direct_map &&
 583                             translations[i].om_va == PHYS_TO_DMAP(pa_base) &&
 584                             moea64_calc_wimg(pa_base + off, VM_MEMATTR_DEFAULT)
 585                             == LPTE_M)
 586                                 continue;
 587
 588                         PMAP_LOCK(kernel_pmap);
 589                         pvo = moea64_pvo_find_va(kernel_pmap,
 590                             translations[i].om_va + off);
 591                         PMAP_UNLOCK(kernel_pmap);
 592                         if (pvo != NULL)
 593                                 continue;
 594
 595                         moea64_kenter(translations[i].om_va + off,
 596                             pa_base + off);
 597                 }
 598                 ENABLE_TRANS(msr);
 599         }
 600 }
 601
 602 #ifdef __powerpc64__
 603 static void
 604 moea64_probe_large_page(void)
 605 {
 606         uint16_t pvr = mfpvr() >> 16;
 607
 608         switch (pvr) {
 609         case IBM970:
 610         case IBM970FX:
 611         case IBM970MP:
 612                 powerpc_sync(); isync();
 613                 mtspr(SPR_HID4, mfspr(SPR_HID4) & ~HID4_970_DISABLE_LG_PG);
 614                 powerpc_sync(); isync();
 615
 616                 /* FALLTHROUGH */
 617         default:
 618                 if (moea64_large_page_size == 0) {
 619                         moea64_large_page_size = 0x1000000; /* 16 MB */
 620                         moea64_large_page_shift = 24;
 621                 }
 622         }
 623
 624         moea64_large_page_mask = moea64_large_page_size - 1;
 625 }
 626
 627 static void
 628 moea64_bootstrap_slb_prefault(vm_offset_t va, int large)
 629 {
 630         struct slb *cache;
 631         struct slb entry;
 632         uint64_t esid, slbe;
 633         uint64_t i;
 634
 635         cache = PCPU_GET(aim.slb);
 636         esid = va >> ADDR_SR_SHFT;
 637         slbe = (esid << SLBE_ESID_SHIFT) | SLBE_VALID;
 638
 639         for (i = 0; i < 64; i++) {
 640                 if (cache[i].slbe == (slbe | i))
 641                         return;
 642         }
 643
 644         entry.slbe = slbe;
 645         entry.slbv = KERNEL_VSID(esid) << SLBV_VSID_SHIFT;
 646         if (large)
 647                 entry.slbv |= SLBV_L;
 648
 649         slb_insert_kernel(entry.slbe, entry.slbv);
 650 }
 651 #endif
 652
 653 static int
 654 moea64_kenter_large(vm_offset_t va, vm_paddr_t pa, uint64_t attr, int bootstrap)
 655 {
 656         struct pvo_entry *pvo;
 657         uint64_t pte_lo;
 658         int error;
 659
 660         pte_lo = LPTE_M;
 661         pte_lo |= attr;
 662
 663         pvo = alloc_pvo_entry(bootstrap);
 664         pvo->pvo_vaddr |= PVO_WIRED | PVO_LARGE;
 665         init_pvo_entry(pvo, kernel_pmap, va);
 666
 667         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE |
 668             VM_PROT_EXECUTE;
 669         pvo->pvo_pte.pa = pa | pte_lo;
 670         error = moea64_pvo_enter(pvo, NULL, NULL);
 671         if (error != 0)
 672                 panic("Error %d inserting large page\n", error);
 673         return (0);
 674 }
 675
 676 static void
 677 moea64_setup_direct_map(vm_offset_t kernelstart,
 678     vm_offset_t kernelend)
 679 {
 680         register_t msr;
 681         vm_paddr_t pa, pkernelstart, pkernelend;
 682         vm_offset_t size, off;
 683         uint64_t pte_lo;
 684         int i;
 685
 686         if (moea64_large_page_size == 0)
 687                 hw_direct_map = 0;
 688
 689         DISABLE_TRANS(msr);
 690         if (hw_direct_map) {
 691                 PMAP_LOCK(kernel_pmap);
 692                 for (i = 0; i < pregions_sz; i++) {
 693                   for (pa = pregions[i].mr_start; pa < pregions[i].mr_start +
 694                      pregions[i].mr_size; pa += moea64_large_page_size) {
 695                         pte_lo = LPTE_M;
 696                         if (pa & moea64_large_page_mask) {
 697                                 pa &= moea64_large_page_mask;
 698                                 pte_lo |= LPTE_G;
 699                         }
 700                         if (pa + moea64_large_page_size >
 701                             pregions[i].mr_start + pregions[i].mr_size)
 702                                 pte_lo |= LPTE_G;
 703
 704                         moea64_kenter_large(PHYS_TO_DMAP(pa), pa, pte_lo, 1);
 705                   }
 706                 }
 707                 PMAP_UNLOCK(kernel_pmap);
 708         }
 709
 710         /*
 711          * Make sure the kernel and BPVO pool stay mapped on systems either
 712          * without a direct map or on which the kernel is not already executing
 713          * out of the direct-mapped region.
 714          */
 715         if (kernelstart < DMAP_BASE_ADDRESS) {
 716                 /*
 717                  * For pre-dmap execution, we need to use identity mapping
 718                  * because we will be operating with the mmu on but in the
 719                  * wrong address configuration until we __restartkernel().
 720                  */
 721                 for (pa = kernelstart & ~PAGE_MASK; pa < kernelend;
 722                     pa += PAGE_SIZE)
 723                         moea64_kenter(pa, pa);
 724         } else if (!hw_direct_map) {
 725                 pkernelstart = kernelstart & ~DMAP_BASE_ADDRESS;
 726                 pkernelend = kernelend & ~DMAP_BASE_ADDRESS;
 727                 for (pa = pkernelstart & ~PAGE_MASK; pa < pkernelend;
 728                     pa += PAGE_SIZE)
 729                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
 730         }
 731
 732         if (!hw_direct_map) {
 733                 size = moea64_bpvo_pool_size*sizeof(struct pvo_entry);
 734                 off = (vm_offset_t)(moea64_bpvo_pool);
 735                 for (pa = off; pa < off + size; pa += PAGE_SIZE)
 736                         moea64_kenter(pa, pa);
 737
 738                 /* Map exception vectors */
 739                 for (pa = EXC_RSVD; pa < EXC_LAST; pa += PAGE_SIZE)
 740                         moea64_kenter(pa | DMAP_BASE_ADDRESS, pa);
 741         }
 742         ENABLE_TRANS(msr);
 743
 744         /*
 745          * Allow user to override unmapped_buf_allowed for testing.
 746          * XXXKIB Only direct map implementation was tested.
 747          */
 748         if (!TUNABLE_INT_FETCH("vfs.unmapped_buf_allowed",
 749             &unmapped_buf_allowed))
 750                 unmapped_buf_allowed = hw_direct_map;
 751 }
 752
 753 /* Quick sort callout for comparing physical addresses. */
 754 static int
 755 pa_cmp(const void *a, const void *b)
 756 {
 757         const vm_paddr_t *pa = a, *pb = b;
 758
 759         if (*pa < *pb)
 760                 return (-1);
 761         else if (*pa > *pb)
 762                 return (1);
 763         else
 764                 return (0);
 765 }
 766
 767 void
 768 moea64_early_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 769 {
 770         int             i, j;
 771         vm_size_t       physsz, hwphyssz;
 772         vm_paddr_t      kernelphysstart, kernelphysend;
 773         int             rm_pavail;
 774
 775 #ifndef __powerpc64__
 776         /* We don't have a direct map since there is no BAT */
 777         hw_direct_map = 0;
 778
 779         /* Make sure battable is zero, since we have no BAT */
 780         for (i = 0; i < 16; i++) {
 781                 battable[i].batu = 0;
 782                 battable[i].batl = 0;
 783         }
 784 #else
 785         moea64_probe_large_page();
 786
 787         /* Use a direct map if we have large page support */
 788         if (moea64_large_page_size > 0)
 789                 hw_direct_map = 1;
 790         else
 791                 hw_direct_map = 0;
 792
 793         /* Install trap handlers for SLBs */
 794         bcopy(&slbtrap, (void *)EXC_DSE,(size_t)&slbtrapend - (size_t)&slbtrap);
 795         bcopy(&slbtrap, (void *)EXC_ISE,(size_t)&slbtrapend - (size_t)&slbtrap);
 796         __syncicache((void *)EXC_DSE, 0x80);
 797         __syncicache((void *)EXC_ISE, 0x80);
 798 #endif
 799
 800         kernelphysstart = kernelstart & ~DMAP_BASE_ADDRESS;
 801         kernelphysend = kernelend & ~DMAP_BASE_ADDRESS;
 802
 803         /* Get physical memory regions from firmware */
 804         mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
 805         CTR0(KTR_PMAP, "moea64_bootstrap: physical memory");
 806
 807         if (PHYS_AVAIL_ENTRIES < regions_sz)
 808                 panic("moea64_bootstrap: phys_avail too small");
 809
 810         phys_avail_count = 0;
 811         physsz = 0;
 812         hwphyssz = 0;
 813         TUNABLE_ULONG_FETCH("hw.physmem", (u_long *) &hwphyssz);
 814         for (i = 0, j = 0; i < regions_sz; i++, j += 2) {
 815                 CTR3(KTR_PMAP, "region: %#zx - %#zx (%#zx)",
 816                     regions[i].mr_start, regions[i].mr_start +
 817                     regions[i].mr_size, regions[i].mr_size);
 818                 if (hwphyssz != 0 &&
 819                     (physsz + regions[i].mr_size) >= hwphyssz) {
 820                         if (physsz < hwphyssz) {
 821                                 phys_avail[j] = regions[i].mr_start;
 822                                 phys_avail[j + 1] = regions[i].mr_start +
 823                                     hwphyssz - physsz;
 824                                 physsz = hwphyssz;
 825                                 phys_avail_count++;
 826                                 dump_avail[j] = phys_avail[j];
 827                                 dump_avail[j + 1] = phys_avail[j + 1];
 828                         }
 829                         break;
 830                 }
 831                 phys_avail[j] = regions[i].mr_start;
 832                 phys_avail[j + 1] = regions[i].mr_start + regions[i].mr_size;
 833                 phys_avail_count++;
 834                 physsz += regions[i].mr_size;
 835                 dump_avail[j] = phys_avail[j];
 836                 dump_avail[j + 1] = phys_avail[j + 1];
 837         }
 838
 839         /* Check for overlap with the kernel and exception vectors */
 840         rm_pavail = 0;
 841         for (j = 0; j < 2*phys_avail_count; j+=2) {
 842                 if (phys_avail[j] < EXC_LAST)
 843                         phys_avail[j] += EXC_LAST;
 844
 845                 if (phys_avail[j] >= kernelphysstart &&
 846                     phys_avail[j+1] <= kernelphysend) {
 847                         phys_avail[j] = phys_avail[j+1] = ~0;
 848                         rm_pavail++;
 849                         continue;
 850                 }
 851
 852                 if (kernelphysstart >= phys_avail[j] &&
 853                     kernelphysstart < phys_avail[j+1]) {
 854                         if (kernelphysend < phys_avail[j+1]) {
 855                                 phys_avail[2*phys_avail_count] =
 856                                     (kernelphysend & ~PAGE_MASK) + PAGE_SIZE;
 857                                 phys_avail[2*phys_avail_count + 1] =
 858                                     phys_avail[j+1];
 859                                 phys_avail_count++;
 860                         }
 861
 862                         phys_avail[j+1] = kernelphysstart & ~PAGE_MASK;
 863                 }
 864
 865                 if (kernelphysend >= phys_avail[j] &&
 866                     kernelphysend < phys_avail[j+1]) {
 867                         if (kernelphysstart > phys_avail[j]) {
 868                                 phys_avail[2*phys_avail_count] = phys_avail[j];
 869                                 phys_avail[2*phys_avail_count + 1] =
 870                                     kernelphysstart & ~PAGE_MASK;
 871                                 phys_avail_count++;
 872                         }
 873
 874                         phys_avail[j] = (kernelphysend & ~PAGE_MASK) +
 875                             PAGE_SIZE;
 876                 }
 877         }
 878
 879         /* Remove physical available regions marked for removal (~0) */
 880         if (rm_pavail) {
 881                 qsort(phys_avail, 2*phys_avail_count, sizeof(phys_avail[0]),
 882                         pa_cmp);
 883                 phys_avail_count -= rm_pavail;
 884                 for (i = 2*phys_avail_count;
 885                      i < 2*(phys_avail_count + rm_pavail); i+=2)
 886                         phys_avail[i] = phys_avail[i+1] = 0;
 887         }
 888
 889         physmem = btoc(physsz);
 890
 891 #ifdef PTEGCOUNT
 892         moea64_pteg_count = PTEGCOUNT;
 893 #else
 894         moea64_pteg_count = 0x1000;
 895
 896         while (moea64_pteg_count < physmem)
 897                 moea64_pteg_count <<= 1;
 898
 899         moea64_pteg_count >>= 1;
 900 #endif /* PTEGCOUNT */
 901 }
 902
 903 void
 904 moea64_mid_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 905 {
 906         int             i;
 907
 908         /*
 909          * Set PTEG mask
 910          */
 911         moea64_pteg_mask = moea64_pteg_count - 1;
 912
 913         /*
 914          * Initialize SLB table lock and page locks
 915          */
 916         mtx_init(&moea64_slb_mutex, "SLB table", NULL, MTX_DEF);
 917         for (i = 0; i < PV_LOCK_COUNT; i++)
 918                 mtx_init(&pv_lock[i], "page pv", NULL, MTX_DEF);
 919
 920         /*
 921          * Initialise the bootstrap pvo pool.
 922          */
 923         TUNABLE_INT_FETCH("machdep.moea64_bpvo_pool_size", &moea64_bpvo_pool_size);
 924         if (moea64_bpvo_pool_size == 0) {
 925                 if (!hw_direct_map)
 926                         moea64_bpvo_pool_size = ((ptoa((uintmax_t)physmem) * sizeof(struct vm_page)) /
 927                             (PAGE_SIZE * PAGE_SIZE)) * BPVO_POOL_EXPANSION_FACTOR;
 928                 else
 929                         moea64_bpvo_pool_size = BPVO_POOL_SIZE;
 930         }
 931
 932         if (boothowto & RB_VERBOSE) {
 933                 printf("mmu_oea64: bpvo pool entries = %d, bpvo pool size = %zu MB\n",
 934                     moea64_bpvo_pool_size,
 935                     moea64_bpvo_pool_size*sizeof(struct pvo_entry) / 1048576);
 936         }
 937
 938         moea64_bpvo_pool = (struct pvo_entry *)moea64_bootstrap_alloc(
 939                 moea64_bpvo_pool_size*sizeof(struct pvo_entry), PAGE_SIZE);
 940         moea64_bpvo_pool_index = 0;
 941
 942         /* Place at address usable through the direct map */
 943         if (hw_direct_map)
 944                 moea64_bpvo_pool = (struct pvo_entry *)
 945                     PHYS_TO_DMAP((uintptr_t)moea64_bpvo_pool);
 946
 947         /*
 948          * Make sure kernel vsid is allocated as well as VSID 0.
 949          */
 950         #ifndef __powerpc64__
 951         moea64_vsid_bitmap[(KERNEL_VSIDBITS & (NVSIDS - 1)) / VSID_NBPW]
 952                 |= 1 << (KERNEL_VSIDBITS % VSID_NBPW);
 953         moea64_vsid_bitmap[0] |= 1;
 954         #endif
 955
 956         /*
 957          * Initialize the kernel pmap (which is statically allocated).
 958          */
 959         #ifdef __powerpc64__
 960         for (i = 0; i < 64; i++) {
 961                 pcpup->pc_aim.slb[i].slbv = 0;
 962                 pcpup->pc_aim.slb[i].slbe = 0;
 963         }
 964         #else
 965         for (i = 0; i < 16; i++)
 966                 kernel_pmap->pm_sr[i] = EMPTY_SEGMENT + i;
 967         #endif
 968
 969         kernel_pmap->pmap_phys = kernel_pmap;
 970         CPU_FILL(&kernel_pmap->pm_active);
 971         RB_INIT(&kernel_pmap->pmap_pvo);
 972
 973         PMAP_LOCK_INIT(kernel_pmap);
 974
 975         /*
 976          * Now map in all the other buffers we allocated earlier
 977          */
 978
 979         moea64_setup_direct_map(kernelstart, kernelend);
 980 }
 981
 982 void
 983 moea64_late_bootstrap(vm_offset_t kernelstart, vm_offset_t kernelend)
 984 {
 985         ihandle_t       mmui;
 986         phandle_t       chosen;
 987         phandle_t       mmu;
 988         ssize_t         sz;
 989         int             i;
 990         vm_offset_t     pa, va;
 991         void            *dpcpu;
 992
 993         /*
 994          * Set up the Open Firmware pmap and add its mappings if not in real
 995          * mode.
 996          */
 997
 998         chosen = OF_finddevice("/chosen");
 999         if (chosen != -1 && OF_getencprop(chosen, "mmu", &mmui, 4) != -1) {
1000                 mmu = OF_instance_to_package(mmui);
1001                 if (mmu == -1 ||
1002                     (sz = OF_getproplen(mmu, "translations")) == -1)
1003                         sz = 0;
1004                 if (sz > 6144 /* tmpstksz - 2 KB headroom */)
1005                         panic("moea64_bootstrap: too many ofw translations");
1006
1007                 if (sz > 0)
1008                         moea64_add_ofw_mappings(mmu, sz);
1009         }
1010
1011         /*
1012          * Calculate the last available physical address.
1013          */
1014         Maxmem = 0;
1015         for (i = 0; phys_avail[i + 2] != 0; i += 2)
1016                 Maxmem = MAX(Maxmem, powerpc_btop(phys_avail[i + 1]));
1017
1018         /*
1019          * Initialize MMU.
1020          */
1021         pmap_cpu_bootstrap(0);
1022         mtmsr(mfmsr() | PSL_DR | PSL_IR);
1023         pmap_bootstrapped++;
1024
1025         /*
1026          * Set the start and end of kva.
1027          */
1028         virtual_avail = VM_MIN_KERNEL_ADDRESS;
1029         virtual_end = VM_MAX_SAFE_KERNEL_ADDRESS;
1030
1031         /*
1032          * Map the entire KVA range into the SLB. We must not fault there.
1033          */
1034         #ifdef __powerpc64__
1035         for (va = virtual_avail; va < virtual_end; va += SEGMENT_LENGTH)
1036                 moea64_bootstrap_slb_prefault(va, 0);
1037         #endif
1038
1039         /*
1040          * Remap any early IO mappings (console framebuffer, etc.)
1041          */
1042         bs_remap_earlyboot();
1043
1044         /*
1045          * Figure out how far we can extend virtual_end into segment 16
1046          * without running into existing mappings. Segment 16 is guaranteed
1047          * to contain neither RAM nor devices (at least on Apple hardware),
1048          * but will generally contain some OFW mappings we should not
1049          * step on.
1050          */
1051
1052         #ifndef __powerpc64__   /* KVA is in high memory on PPC64 */
1053         PMAP_LOCK(kernel_pmap);
1054         while (virtual_end < VM_MAX_KERNEL_ADDRESS &&
1055             moea64_pvo_find_va(kernel_pmap, virtual_end+1) == NULL)
1056                 virtual_end += PAGE_SIZE;
1057         PMAP_UNLOCK(kernel_pmap);
1058         #endif
1059
1060         /*
1061          * Allocate a kernel stack with a guard page for thread0 and map it
1062          * into the kernel page map.
1063          */
1064         pa = moea64_bootstrap_alloc(kstack_pages * PAGE_SIZE, PAGE_SIZE);
1065         va = virtual_avail + KSTACK_GUARD_PAGES * PAGE_SIZE;
1066         virtual_avail = va + kstack_pages * PAGE_SIZE;
1067         CTR2(KTR_PMAP, "moea64_bootstrap: kstack0 at %#x (%#x)", pa, va);
1068         thread0.td_kstack = va;
1069         thread0.td_kstack_pages = kstack_pages;
1070         for (i = 0; i < kstack_pages; i++) {
1071                 moea64_kenter(va, pa);
1072                 pa += PAGE_SIZE;
1073                 va += PAGE_SIZE;
1074         }
1075
1076         /*
1077          * Allocate virtual address space for the message buffer.
1078          */
1079         pa = msgbuf_phys = moea64_bootstrap_alloc(msgbufsize, PAGE_SIZE);
1080         msgbufp = (struct msgbuf *)virtual_avail;
1081         va = virtual_avail;
1082         virtual_avail += round_page(msgbufsize);
1083         while (va < virtual_avail) {
1084                 moea64_kenter(va, pa);
1085                 pa += PAGE_SIZE;
1086                 va += PAGE_SIZE;
1087         }
1088
1089         /*
1090          * Allocate virtual address space for the dynamic percpu area.
1091          */
1092         pa = moea64_bootstrap_alloc(DPCPU_SIZE, PAGE_SIZE);
1093         dpcpu = (void *)virtual_avail;
1094         va = virtual_avail;
1095         virtual_avail += DPCPU_SIZE;
1096         while (va < virtual_avail) {
1097                 moea64_kenter(va, pa);
1098                 pa += PAGE_SIZE;
1099                 va += PAGE_SIZE;
1100         }
1101         dpcpu_init(dpcpu, curcpu);
1102
1103         crashdumpmap = (caddr_t)virtual_avail;
1104         virtual_avail += MAXDUMPPGS * PAGE_SIZE;
1105
1106         /*
1107          * Allocate some things for page zeroing. We put this directly
1108          * in the page table and use MOEA64_PTE_REPLACE to avoid any
1109          * of the PVO book-keeping or other parts of the VM system
1110          * from even knowing that this hack exists.
1111          */
1112
1113         if (!hw_direct_map) {
1114                 mtx_init(&moea64_scratchpage_mtx, "pvo zero page", NULL,
1115                     MTX_DEF);
1116                 for (i = 0; i < 2; i++) {
1117                         moea64_scratchpage_va[i] = (virtual_end+1) - PAGE_SIZE;
1118                         virtual_end -= PAGE_SIZE;
1119
1120                         moea64_kenter(moea64_scratchpage_va[i], 0);
1121
1122                         PMAP_LOCK(kernel_pmap);
1123                         moea64_scratchpage_pvo[i] = moea64_pvo_find_va(
1124                             kernel_pmap, (vm_offset_t)moea64_scratchpage_va[i]);
1125                         PMAP_UNLOCK(kernel_pmap);
1126                 }
1127         }
1128
1129         numa_mem_regions(&numa_pregions, &numapregions_sz);
1130 }
1131
1132 static void
1133 moea64_pmap_init_qpages(void)
1134 {
1135         struct pcpu *pc;
1136         int i;
1137
1138         if (hw_direct_map)
1139                 return;
1140
1141         CPU_FOREACH(i) {
1142                 pc = pcpu_find(i);
1143                 pc->pc_qmap_addr = kva_alloc(PAGE_SIZE);
1144                 if (pc->pc_qmap_addr == 0)
1145                         panic("pmap_init_qpages: unable to allocate KVA");
1146                 PMAP_LOCK(kernel_pmap);
1147                 pc->pc_aim.qmap_pvo =
1148                     moea64_pvo_find_va(kernel_pmap, pc->pc_qmap_addr);
1149                 PMAP_UNLOCK(kernel_pmap);
1150                 mtx_init(&pc->pc_aim.qmap_lock, "qmap lock", NULL, MTX_DEF);
1151         }
1152 }
1153
1154 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, moea64_pmap_init_qpages, NULL);
1155
1156 /*
1157  * Activate a user pmap.  This mostly involves setting some non-CPU
1158  * state.
1159  */
1160 void
1161 moea64_activate(struct thread *td)
1162 {
1163         pmap_t  pm;
1164
1165         pm = &td->td_proc->p_vmspace->vm_pmap;
1166         CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
1167
1168         #ifdef __powerpc64__
1169         PCPU_SET(aim.userslb, pm->pm_slb);
1170         __asm __volatile("slbmte %0, %1; isync" ::
1171             "r"(td->td_pcb->pcb_cpu.aim.usr_vsid), "r"(USER_SLB_SLBE));
1172         #else
1173         PCPU_SET(curpmap, pm->pmap_phys);
1174         mtsrin(USER_SR << ADDR_SR_SHFT, td->td_pcb->pcb_cpu.aim.usr_vsid);
1175         #endif
1176 }
1177
1178 void
1179 moea64_deactivate(struct thread *td)
1180 {
1181         pmap_t  pm;
1182
1183         __asm __volatile("isync; slbie %0" :: "r"(USER_ADDR));
1184
1185         pm = &td->td_proc->p_vmspace->vm_pmap;
1186         CPU_CLR(PCPU_GET(cpuid), &pm->pm_active);
1187         #ifdef __powerpc64__
1188         PCPU_SET(aim.userslb, NULL);
1189         #else
1190         PCPU_SET(curpmap, NULL);
1191         #endif
1192 }
1193
1194 void
1195 moea64_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
1196 {
1197         struct  pvo_entry key, *pvo;
1198         vm_page_t m;
1199         int64_t refchg;
1200
1201         key.pvo_vaddr = sva;
1202         PMAP_LOCK(pm);
1203         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
1204             pvo != NULL && PVO_VADDR(pvo) < eva;
1205             pvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo)) {
1206                 if ((pvo->pvo_vaddr & PVO_WIRED) == 0)
1207                         panic("moea64_unwire: pvo %p is missing PVO_WIRED",
1208                             pvo);
1209                 pvo->pvo_vaddr &= ~PVO_WIRED;
1210                 refchg = moea64_pte_replace(pvo, 0 /* No invalidation */);
1211                 if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1212                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1213                         if (refchg < 0)
1214                                 refchg = LPTE_CHG;
1215                         m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1216
1217                         refchg |= atomic_readandclear_32(&m->md.mdpg_attrs);
1218                         if (refchg & LPTE_CHG)
1219                                 vm_page_dirty(m);
1220                         if (refchg & LPTE_REF)
1221                                 vm_page_aflag_set(m, PGA_REFERENCED);
1222                 }
1223                 pm->pm_stats.wired_count--;
1224         }
1225         PMAP_UNLOCK(pm);
1226 }
1227
1228 static int
1229 moea64_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
1230 {
1231         struct pvo_entry *pvo;
1232         vm_paddr_t pa;
1233         vm_page_t m;
1234         int val;
1235         bool managed;
1236
1237         PMAP_LOCK(pmap);
1238
1239         /* XXX Add support for superpages */
1240         pvo = moea64_pvo_find_va(pmap, addr);
1241         if (pvo != NULL) {
1242                 pa = PVO_PADDR(pvo);
1243                 m = PHYS_TO_VM_PAGE(pa);
1244                 managed = (pvo->pvo_vaddr & PVO_MANAGED) == PVO_MANAGED;
1245                 val = MINCORE_INCORE;
1246         } else {
1247                 PMAP_UNLOCK(pmap);
1248                 return (0);
1249         }
1250
1251         PMAP_UNLOCK(pmap);
1252
1253         if (m == NULL)
1254                 return (0);
1255
1256         if (managed) {
1257                 if (moea64_is_modified(m))
1258                         val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
1259
1260                 if (moea64_is_referenced(m))
1261                         val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
1262         }
1263
1264         if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
1265             (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
1266             managed) {
1267                 *pap = pa;
1268         }
1269
1270         return (val);
1271 }
1272
1273 /*
1274  * This goes through and sets the physical address of our
1275  * special scratch PTE to the PA we want to zero or copy. Because
1276  * of locking issues (this can get called in pvo_enter() by
1277  * the UMA allocator), we can't use most other utility functions here
1278  */
1279
1280 static __inline
1281 void moea64_set_scratchpage_pa(int which, vm_paddr_t pa)
1282 {
1283         struct pvo_entry *pvo;
1284
1285         KASSERT(!hw_direct_map, ("Using OEA64 scratchpage with a direct map!"));
1286         mtx_assert(&moea64_scratchpage_mtx, MA_OWNED);
1287
1288         pvo = moea64_scratchpage_pvo[which];
1289         PMAP_LOCK(pvo->pvo_pmap);
1290         pvo->pvo_pte.pa =
1291             moea64_calc_wimg(pa, VM_MEMATTR_DEFAULT) | (uint64_t)pa;
1292         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1293         PMAP_UNLOCK(pvo->pvo_pmap);
1294         isync();
1295 }
1296
1297 void
1298 moea64_copy_page(vm_page_t msrc, vm_page_t mdst)
1299 {
1300         vm_offset_t     dst;
1301         vm_offset_t     src;
1302
1303         dst = VM_PAGE_TO_PHYS(mdst);
1304         src = VM_PAGE_TO_PHYS(msrc);
1305
1306         if (hw_direct_map) {
1307                 bcopy((void *)PHYS_TO_DMAP(src), (void *)PHYS_TO_DMAP(dst),
1308                     PAGE_SIZE);
1309         } else {
1310                 mtx_lock(&moea64_scratchpage_mtx);
1311
1312                 moea64_set_scratchpage_pa(0, src);
1313                 moea64_set_scratchpage_pa(1, dst);
1314
1315                 bcopy((void *)moea64_scratchpage_va[0],
1316                     (void *)moea64_scratchpage_va[1], PAGE_SIZE);
1317
1318                 mtx_unlock(&moea64_scratchpage_mtx);
1319         }
1320 }
1321
1322 static inline void
1323 moea64_copy_pages_dmap(vm_page_t *ma, vm_offset_t a_offset,
1324     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1325 {
1326         void *a_cp, *b_cp;
1327         vm_offset_t a_pg_offset, b_pg_offset;
1328         int cnt;
1329
1330         while (xfersize > 0) {
1331                 a_pg_offset = a_offset & PAGE_MASK;
1332                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1333                 a_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1334                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT])) +
1335                     a_pg_offset;
1336                 b_pg_offset = b_offset & PAGE_MASK;
1337                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1338                 b_cp = (char *)(uintptr_t)PHYS_TO_DMAP(
1339                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT])) +
1340                     b_pg_offset;
1341                 bcopy(a_cp, b_cp, cnt);
1342                 a_offset += cnt;
1343                 b_offset += cnt;
1344                 xfersize -= cnt;
1345         }
1346 }
1347
1348 static inline void
1349 moea64_copy_pages_nodmap(vm_page_t *ma, vm_offset_t a_offset,
1350     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1351 {
1352         void *a_cp, *b_cp;
1353         vm_offset_t a_pg_offset, b_pg_offset;
1354         int cnt;
1355
1356         mtx_lock(&moea64_scratchpage_mtx);
1357         while (xfersize > 0) {
1358                 a_pg_offset = a_offset & PAGE_MASK;
1359                 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
1360                 moea64_set_scratchpage_pa(0,
1361                     VM_PAGE_TO_PHYS(ma[a_offset >> PAGE_SHIFT]));
1362                 a_cp = (char *)moea64_scratchpage_va[0] + a_pg_offset;
1363                 b_pg_offset = b_offset & PAGE_MASK;
1364                 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
1365                 moea64_set_scratchpage_pa(1,
1366                     VM_PAGE_TO_PHYS(mb[b_offset >> PAGE_SHIFT]));
1367                 b_cp = (char *)moea64_scratchpage_va[1] + b_pg_offset;
1368                 bcopy(a_cp, b_cp, cnt);
1369                 a_offset += cnt;
1370                 b_offset += cnt;
1371                 xfersize -= cnt;
1372         }
1373         mtx_unlock(&moea64_scratchpage_mtx);
1374 }
1375
1376 void
1377 moea64_copy_pages(vm_page_t *ma, vm_offset_t a_offset,
1378     vm_page_t *mb, vm_offset_t b_offset, int xfersize)
1379 {
1380
1381         if (hw_direct_map) {
1382                 moea64_copy_pages_dmap(ma, a_offset, mb, b_offset,
1383                     xfersize);
1384         } else {
1385                 moea64_copy_pages_nodmap(ma, a_offset, mb, b_offset,
1386                     xfersize);
1387         }
1388 }
1389
1390 void
1391 moea64_zero_page_area(vm_page_t m, int off, int size)
1392 {
1393         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1394
1395         if (size + off > PAGE_SIZE)
1396                 panic("moea64_zero_page: size + off > PAGE_SIZE");
1397
1398         if (hw_direct_map) {
1399                 bzero((caddr_t)(uintptr_t)PHYS_TO_DMAP(pa) + off, size);
1400         } else {
1401                 mtx_lock(&moea64_scratchpage_mtx);
1402                 moea64_set_scratchpage_pa(0, pa);
1403                 bzero((caddr_t)moea64_scratchpage_va[0] + off, size);
1404                 mtx_unlock(&moea64_scratchpage_mtx);
1405         }
1406 }
1407
1408 /*
1409  * Zero a page of physical memory by temporarily mapping it
1410  */
1411 void
1412 moea64_zero_page(vm_page_t m)
1413 {
1414         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1415         vm_offset_t va, off;
1416
1417         if (!hw_direct_map) {
1418                 mtx_lock(&moea64_scratchpage_mtx);
1419
1420                 moea64_set_scratchpage_pa(0, pa);
1421                 va = moea64_scratchpage_va[0];
1422         } else {
1423                 va = PHYS_TO_DMAP(pa);
1424         }
1425
1426         for (off = 0; off < PAGE_SIZE; off += cacheline_size)
1427                 __asm __volatile("dcbz 0,%0" :: "r"(va + off));
1428
1429         if (!hw_direct_map)
1430                 mtx_unlock(&moea64_scratchpage_mtx);
1431 }
1432
1433 vm_offset_t
1434 moea64_quick_enter_page(vm_page_t m)
1435 {
1436         struct pvo_entry *pvo;
1437         vm_paddr_t pa = VM_PAGE_TO_PHYS(m);
1438
1439         if (hw_direct_map)
1440                 return (PHYS_TO_DMAP(pa));
1441
1442         /*
1443          * MOEA64_PTE_REPLACE does some locking, so we can't just grab
1444          * a critical section and access the PCPU data like on i386.
1445          * Instead, pin the thread and grab the PCPU lock to prevent
1446          * a preempting thread from using the same PCPU data.
1447          */
1448         sched_pin();
1449
1450         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_NOTOWNED);
1451         pvo = PCPU_GET(aim.qmap_pvo);
1452
1453         mtx_lock(PCPU_PTR(aim.qmap_lock));
1454         pvo->pvo_pte.pa = moea64_calc_wimg(pa, pmap_page_get_memattr(m)) |
1455             (uint64_t)pa;
1456         moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1457         isync();
1458
1459         return (PCPU_GET(qmap_addr));
1460 }
1461
1462 void
1463 moea64_quick_remove_page(vm_offset_t addr)
1464 {
1465         if (hw_direct_map)
1466                 return;
1467
1468         mtx_assert(PCPU_PTR(aim.qmap_lock), MA_OWNED);
1469         KASSERT(PCPU_GET(qmap_addr) == addr,
1470             ("moea64_quick_remove_page: invalid address"));
1471         mtx_unlock(PCPU_PTR(aim.qmap_lock));
1472         sched_unpin();
1473 }
1474
1475 boolean_t
1476 moea64_page_is_mapped(vm_page_t m)
1477 {
1478         return (!LIST_EMPTY(&(m)->md.mdpg_pvoh));
1479 }
1480
1481 /*
1482  * Map the given physical page at the specified virtual address in the
1483  * target pmap with the protection requested.  If specified the page
1484  * will be wired down.
1485  */
1486
1487 int
1488 moea64_enter(pmap_t pmap, vm_offset_t va, vm_page_t m,
1489     vm_prot_t prot, u_int flags, int8_t psind)
1490 {
1491         struct          pvo_entry *pvo, *oldpvo;
1492         struct          pvo_head *pvo_head;
1493         uint64_t        pte_lo;
1494         int             error;
1495
1496         if ((m->oflags & VPO_UNMANAGED) == 0) {
1497                 if ((flags & PMAP_ENTER_QUICK_LOCKED) == 0)
1498                         VM_PAGE_OBJECT_BUSY_ASSERT(m);
1499                 else
1500                         VM_OBJECT_ASSERT_LOCKED(m->object);
1501         }
1502
1503         pvo = alloc_pvo_entry(0);
1504         if (pvo == NULL)
1505                 return (KERN_RESOURCE_SHORTAGE);
1506         pvo->pvo_pmap = NULL; /* to be filled in later */
1507         pvo->pvo_pte.prot = prot;
1508
1509         pte_lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), pmap_page_get_memattr(m));
1510         pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | pte_lo;
1511
1512         if ((flags & PMAP_ENTER_WIRED) != 0)
1513                 pvo->pvo_vaddr |= PVO_WIRED;
1514
1515         if ((m->oflags & VPO_UNMANAGED) != 0 || !moea64_initialized) {
1516                 pvo_head = NULL;
1517         } else {
1518                 pvo_head = &m->md.mdpg_pvoh;
1519                 pvo->pvo_vaddr |= PVO_MANAGED;
1520         }
1521
1522         PV_PAGE_LOCK(m);
1523         PMAP_LOCK(pmap);
1524         if (pvo->pvo_pmap == NULL)
1525                 init_pvo_entry(pvo, pmap, va);
1526         if (prot & VM_PROT_WRITE)
1527                 if (pmap_bootstrapped &&
1528                     (m->oflags & VPO_UNMANAGED) == 0)
1529                         vm_page_aflag_set(m, PGA_WRITEABLE);
1530
1531         error = moea64_pvo_enter(pvo, pvo_head, &oldpvo);
1532         if (error == EEXIST) {
1533                 if (oldpvo->pvo_vaddr == pvo->pvo_vaddr &&
1534                     oldpvo->pvo_pte.pa == pvo->pvo_pte.pa &&
1535                     oldpvo->pvo_pte.prot == prot) {
1536                         /* Identical mapping already exists */
1537                         error = 0;
1538
1539                         /* If not in page table, reinsert it */
1540                         if (moea64_pte_synch(oldpvo) < 0) {
1541                                 STAT_MOEA64(moea64_pte_overflow--);
1542                                 moea64_pte_insert(oldpvo);
1543                         }
1544
1545                         /* Then just clean up and go home */
1546                         PV_PAGE_UNLOCK(m);
1547                         PMAP_UNLOCK(pmap);
1548                         free_pvo_entry(pvo);
1549                         goto out;
1550                 } else {
1551                         /* Otherwise, need to kill it first */
1552                         KASSERT(oldpvo->pvo_pmap == pmap, ("pmap of old "
1553                             "mapping does not match new mapping"));
1554                         moea64_pvo_remove_from_pmap(oldpvo);
1555                         moea64_pvo_enter(pvo, pvo_head, NULL);
1556                 }
1557         }
1558         PMAP_UNLOCK(pmap);
1559         PV_PAGE_UNLOCK(m);
1560
1561         /* Free any dead pages */
1562         if (error == EEXIST) {
1563                 moea64_pvo_remove_from_page(oldpvo);
1564                 free_pvo_entry(oldpvo);
1565         }
1566
1567 out:
1568         /*
1569          * Flush the page from the instruction cache if this page is
1570          * mapped executable and cacheable.
1571          */
1572         if (pmap != kernel_pmap && (m->a.flags & PGA_EXECUTABLE) == 0 &&
1573             (pte_lo & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
1574                 vm_page_aflag_set(m, PGA_EXECUTABLE);
1575                 moea64_syncicache(pmap, va, VM_PAGE_TO_PHYS(m), PAGE_SIZE);
1576         }
1577         return (KERN_SUCCESS);
1578 }
1579
1580 static void
1581 moea64_syncicache(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1582     vm_size_t sz)
1583 {
1584
1585         /*
1586          * This is much trickier than on older systems because
1587          * we can't sync the icache on physical addresses directly
1588          * without a direct map. Instead we check a couple of cases
1589          * where the memory is already mapped in and, failing that,
1590          * use the same trick we use for page zeroing to create
1591          * a temporary mapping for this physical address.
1592          */
1593
1594         if (!pmap_bootstrapped) {
1595                 /*
1596                  * If PMAP is not bootstrapped, we are likely to be
1597                  * in real mode.
1598                  */
1599                 __syncicache((void *)(uintptr_t)pa, sz);
1600         } else if (pmap == kernel_pmap) {
1601                 __syncicache((void *)va, sz);
1602         } else if (hw_direct_map) {
1603                 __syncicache((void *)(uintptr_t)PHYS_TO_DMAP(pa), sz);
1604         } else {
1605                 /* Use the scratch page to set up a temp mapping */
1606
1607                 mtx_lock(&moea64_scratchpage_mtx);
1608
1609                 moea64_set_scratchpage_pa(1, pa & ~ADDR_POFF);
1610                 __syncicache((void *)(moea64_scratchpage_va[1] +
1611                     (va & ADDR_POFF)), sz);
1612
1613                 mtx_unlock(&moea64_scratchpage_mtx);
1614         }
1615 }
1616
1617 /*
1618  * Maps a sequence of resident pages belonging to the same object.
1619  * The sequence begins with the given page m_start.  This page is
1620  * mapped at the given virtual address start.  Each subsequent page is
1621  * mapped at a virtual address that is offset from start by the same
1622  * amount as the page is offset from m_start within the object.  The
1623  * last page in the sequence is the page with the largest offset from
1624  * m_start that can be mapped at a virtual address less than the given
1625  * virtual address end.  Not every virtual page between start and end
1626  * is mapped; only those for which a resident page exists with the
1627  * corresponding offset from m_start are mapped.
1628  */
1629 void
1630 moea64_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
1631     vm_page_t m_start, vm_prot_t prot)
1632 {
1633         vm_page_t m;
1634         vm_pindex_t diff, psize;
1635
1636         VM_OBJECT_ASSERT_LOCKED(m_start->object);
1637
1638         psize = atop(end - start);
1639         m = m_start;
1640         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1641                 moea64_enter(pm, start + ptoa(diff), m, prot &
1642                     (VM_PROT_READ | VM_PROT_EXECUTE), PMAP_ENTER_NOSLEEP |
1643                     PMAP_ENTER_QUICK_LOCKED, 0);
1644                 m = TAILQ_NEXT(m, listq);
1645         }
1646 }
1647
1648 void
1649 moea64_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m,
1650     vm_prot_t prot)
1651 {
1652
1653         moea64_enter(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
1654             PMAP_ENTER_NOSLEEP | PMAP_ENTER_QUICK_LOCKED, 0);
1655 }
1656
1657 vm_paddr_t
1658 moea64_extract(pmap_t pm, vm_offset_t va)
1659 {
1660         struct  pvo_entry *pvo;
1661         vm_paddr_t pa;
1662
1663         PMAP_LOCK(pm);
1664         pvo = moea64_pvo_find_va(pm, va);
1665         if (pvo == NULL)
1666                 pa = 0;
1667         else
1668                 pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
1669         PMAP_UNLOCK(pm);
1670
1671         return (pa);
1672 }
1673
1674 /*
1675  * Atomically extract and hold the physical page with the given
1676  * pmap and virtual address pair if that mapping permits the given
1677  * protection.
1678  */
1679 vm_page_t
1680 moea64_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1681 {
1682         struct  pvo_entry *pvo;
1683         vm_page_t m;
1684
1685         m = NULL;
1686         PMAP_LOCK(pmap);
1687         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1688         if (pvo != NULL && (pvo->pvo_pte.prot & prot) == prot) {
1689                 m = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
1690                 if (!vm_page_wire_mapped(m))
1691                         m = NULL;
1692         }
1693         PMAP_UNLOCK(pmap);
1694         return (m);
1695 }
1696
1697 static void *
1698 moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
1699     uint8_t *flags, int wait)
1700 {
1701         struct pvo_entry *pvo;
1702         vm_offset_t va;
1703         vm_page_t m;
1704         int needed_lock;
1705
1706         /*
1707          * This entire routine is a horrible hack to avoid bothering kmem
1708          * for new KVA addresses. Because this can get called from inside
1709          * kmem allocation routines, calling kmem for a new address here
1710          * can lead to multiply locking non-recursive mutexes.
1711          */
1712
1713         *flags = UMA_SLAB_PRIV;
1714         needed_lock = !PMAP_LOCKED(kernel_pmap);
1715
1716         m = vm_page_alloc_domain(NULL, 0, domain,
1717             malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1718         if (m == NULL)
1719                 return (NULL);
1720
1721         va = VM_PAGE_TO_PHYS(m);
1722
1723         pvo = alloc_pvo_entry(1 /* bootstrap */);
1724
1725         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE;
1726         pvo->pvo_pte.pa = VM_PAGE_TO_PHYS(m) | LPTE_M;
1727
1728         if (needed_lock)
1729                 PMAP_LOCK(kernel_pmap);
1730
1731         init_pvo_entry(pvo, kernel_pmap, va);
1732         pvo->pvo_vaddr |= PVO_WIRED;
1733
1734         moea64_pvo_enter(pvo, NULL, NULL);
1735
1736         if (needed_lock)
1737                 PMAP_UNLOCK(kernel_pmap);
1738
1739         if ((wait & M_ZERO) && (m->flags & PG_ZERO) == 0)
1740                 bzero((void *)va, PAGE_SIZE);
1741
1742         return (void *)va;
1743 }
1744
1745 extern int elf32_nxstack;
1746
1747 void
1748 moea64_init()
1749 {
1750
1751         CTR0(KTR_PMAP, "moea64_init");
1752
1753         moea64_pvo_zone = uma_zcreate("UPVO entry", sizeof (struct pvo_entry),
1754             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
1755             UMA_ZONE_VM | UMA_ZONE_NOFREE);
1756
1757         if (!hw_direct_map) {
1758                 uma_zone_set_allocf(moea64_pvo_zone, moea64_uma_page_alloc);
1759         }
1760
1761 #ifdef COMPAT_FREEBSD32
1762         elf32_nxstack = 1;
1763 #endif
1764
1765         moea64_initialized = TRUE;
1766 }
1767
1768 boolean_t
1769 moea64_is_referenced(vm_page_t m)
1770 {
1771
1772         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1773             ("moea64_is_referenced: page %p is not managed", m));
1774
1775         return (moea64_query_bit(m, LPTE_REF));
1776 }
1777
1778 boolean_t
1779 moea64_is_modified(vm_page_t m)
1780 {
1781
1782         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1783             ("moea64_is_modified: page %p is not managed", m));
1784
1785         /*
1786          * If the page is not busied then this check is racy.
1787          */
1788         if (!pmap_page_is_write_mapped(m))
1789                 return (FALSE);
1790
1791         return (moea64_query_bit(m, LPTE_CHG));
1792 }
1793
1794 boolean_t
1795 moea64_is_prefaultable(pmap_t pmap, vm_offset_t va)
1796 {
1797         struct pvo_entry *pvo;
1798         boolean_t rv = TRUE;
1799
1800         PMAP_LOCK(pmap);
1801         pvo = moea64_pvo_find_va(pmap, va & ~ADDR_POFF);
1802         if (pvo != NULL)
1803                 rv = FALSE;
1804         PMAP_UNLOCK(pmap);
1805         return (rv);
1806 }
1807
1808 void
1809 moea64_clear_modify(vm_page_t m)
1810 {
1811
1812         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1813             ("moea64_clear_modify: page %p is not managed", m));
1814         vm_page_assert_busied(m);
1815
1816         if (!pmap_page_is_write_mapped(m))
1817                 return;
1818         moea64_clear_bit(m, LPTE_CHG);
1819 }
1820
1821 /*
1822  * Clear the write and modified bits in each of the given page's mappings.
1823  */
1824 void
1825 moea64_remove_write(vm_page_t m)
1826 {
1827         struct  pvo_entry *pvo;
1828         int64_t refchg, ret;
1829         pmap_t  pmap;
1830
1831         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1832             ("moea64_remove_write: page %p is not managed", m));
1833         vm_page_assert_busied(m);
1834
1835         if (!pmap_page_is_write_mapped(m))
1836                 return
1837
1838         powerpc_sync();
1839         PV_PAGE_LOCK(m);
1840         refchg = 0;
1841         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1842                 pmap = pvo->pvo_pmap;
1843                 PMAP_LOCK(pmap);
1844                 if (!(pvo->pvo_vaddr & PVO_DEAD) &&
1845                     (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1846                         pvo->pvo_pte.prot &= ~VM_PROT_WRITE;
1847                         ret = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
1848                         if (ret < 0)
1849                                 ret = LPTE_CHG;
1850                         refchg |= ret;
1851                         if (pvo->pvo_pmap == kernel_pmap)
1852                                 isync();
1853                 }
1854                 PMAP_UNLOCK(pmap);
1855         }
1856         if ((refchg | atomic_readandclear_32(&m->md.mdpg_attrs)) & LPTE_CHG)
1857                 vm_page_dirty(m);
1858         vm_page_aflag_clear(m, PGA_WRITEABLE);
1859         PV_PAGE_UNLOCK(m);
1860 }
1861
1862 /*
1863  *      moea64_ts_referenced:
1864  *
1865  *      Return a count of reference bits for a page, clearing those bits.
1866  *      It is not necessary for every reference bit to be cleared, but it
1867  *      is necessary that 0 only be returned when there are truly no
1868  *      reference bits set.
1869  *
1870  *      XXX: The exact number of bits to check and clear is a matter that
1871  *      should be tested and standardized at some point in the future for
1872  *      optimal aging of shared pages.
1873  */
1874 int
1875 moea64_ts_referenced(vm_page_t m)
1876 {
1877
1878         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1879             ("moea64_ts_referenced: page %p is not managed", m));
1880         return (moea64_clear_bit(m, LPTE_REF));
1881 }
1882
1883 /*
1884  * Modify the WIMG settings of all mappings for a page.
1885  */
1886 void
1887 moea64_page_set_memattr(vm_page_t m, vm_memattr_t ma)
1888 {
1889         struct  pvo_entry *pvo;
1890         int64_t refchg;
1891         pmap_t  pmap;
1892         uint64_t lo;
1893
1894         if ((m->oflags & VPO_UNMANAGED) != 0) {
1895                 m->md.mdpg_cache_attrs = ma;
1896                 return;
1897         }
1898
1899         lo = moea64_calc_wimg(VM_PAGE_TO_PHYS(m), ma);
1900
1901         PV_PAGE_LOCK(m);
1902         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
1903                 pmap = pvo->pvo_pmap;
1904                 PMAP_LOCK(pmap);
1905                 if (!(pvo->pvo_vaddr & PVO_DEAD)) {
1906                         pvo->pvo_pte.pa &= ~LPTE_WIMG;
1907                         pvo->pvo_pte.pa |= lo;
1908                         refchg = moea64_pte_replace(pvo, MOEA64_PTE_INVALIDATE);
1909                         if (refchg < 0)
1910                                 refchg = (pvo->pvo_pte.prot & VM_PROT_WRITE) ?
1911                                     LPTE_CHG : 0;
1912                         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
1913                             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
1914                                 refchg |=
1915                                     atomic_readandclear_32(&m->md.mdpg_attrs);
1916                                 if (refchg & LPTE_CHG)
1917                                         vm_page_dirty(m);
1918                                 if (refchg & LPTE_REF)
1919                                         vm_page_aflag_set(m, PGA_REFERENCED);
1920                         }
1921                         if (pvo->pvo_pmap == kernel_pmap)
1922                                 isync();
1923                 }
1924                 PMAP_UNLOCK(pmap);
1925         }
1926         m->md.mdpg_cache_attrs = ma;
1927         PV_PAGE_UNLOCK(m);
1928 }
1929
1930 /*
1931  * Map a wired page into kernel virtual address space.
1932  */
1933 void
1934 moea64_kenter_attr(vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma)
1935 {
1936         int             error;
1937         struct pvo_entry *pvo, *oldpvo;
1938
1939         do {
1940                 pvo = alloc_pvo_entry(0);
1941                 if (pvo == NULL)
1942                         vm_wait(NULL);
1943         } while (pvo == NULL);
1944         pvo->pvo_pte.prot = VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
1945         pvo->pvo_pte.pa = (pa & ~ADDR_POFF) | moea64_calc_wimg(pa, ma);
1946         pvo->pvo_vaddr |= PVO_WIRED;
1947
1948         PMAP_LOCK(kernel_pmap);
1949         oldpvo = moea64_pvo_find_va(kernel_pmap, va);
1950         if (oldpvo != NULL)
1951                 moea64_pvo_remove_from_pmap(oldpvo);
1952         init_pvo_entry(pvo, kernel_pmap, va);
1953         error = moea64_pvo_enter(pvo, NULL, NULL);
1954         PMAP_UNLOCK(kernel_pmap);
1955
1956         /* Free any dead pages */
1957         if (oldpvo != NULL) {
1958                 moea64_pvo_remove_from_page(oldpvo);
1959                 free_pvo_entry(oldpvo);
1960         }
1961
1962         if (error != 0)
1963                 panic("moea64_kenter: failed to enter va %#zx pa %#jx: %d", va,
1964                     (uintmax_t)pa, error);
1965 }
1966
1967 void
1968 moea64_kenter(vm_offset_t va, vm_paddr_t pa)
1969 {
1970
1971         moea64_kenter_attr(va, pa, VM_MEMATTR_DEFAULT);
1972 }
1973
1974 /*
1975  * Extract the physical page address associated with the given kernel virtual
1976  * address.
1977  */
1978 vm_paddr_t
1979 moea64_kextract(vm_offset_t va)
1980 {
1981         struct          pvo_entry *pvo;
1982         vm_paddr_t pa;
1983
1984         /*
1985          * Shortcut the direct-mapped case when applicable.  We never put
1986          * anything but 1:1 (or 62-bit aliased) mappings below
1987          * VM_MIN_KERNEL_ADDRESS.
1988          */
1989         if (va < VM_MIN_KERNEL_ADDRESS)
1990                 return (va & ~DMAP_BASE_ADDRESS);
1991
1992         PMAP_LOCK(kernel_pmap);
1993         pvo = moea64_pvo_find_va(kernel_pmap, va);
1994         KASSERT(pvo != NULL, ("moea64_kextract: no addr found for %#" PRIxPTR,
1995             va));
1996         pa = PVO_PADDR(pvo) | (va - PVO_VADDR(pvo));
1997         PMAP_UNLOCK(kernel_pmap);
1998         return (pa);
1999 }
2000
2001 /*
2002  * Remove a wired page from kernel virtual address space.
2003  */
2004 void
2005 moea64_kremove(vm_offset_t va)
2006 {
2007         moea64_remove(kernel_pmap, va, va + PAGE_SIZE);
2008 }
2009
2010 /*
2011  * Provide a kernel pointer corresponding to a given userland pointer.
2012  * The returned pointer is valid until the next time this function is
2013  * called in this thread. This is used internally in copyin/copyout.
2014  */
2015 static int
2016 moea64_map_user_ptr(pmap_t pm, volatile const void *uaddr,
2017     void **kaddr, size_t ulen, size_t *klen)
2018 {
2019         size_t l;
2020 #ifdef __powerpc64__
2021         struct slb *slb;
2022 #endif
2023         register_t slbv;
2024
2025         *kaddr = (char *)USER_ADDR + ((uintptr_t)uaddr & ~SEGMENT_MASK);
2026         l = ((char *)USER_ADDR + SEGMENT_LENGTH) - (char *)(*kaddr);
2027         if (l > ulen)
2028                 l = ulen;
2029         if (klen)
2030                 *klen = l;
2031         else if (l != ulen)
2032                 return (EFAULT);
2033
2034 #ifdef __powerpc64__
2035         /* Try lockless look-up first */
2036         slb = user_va_to_slb_entry(pm, (vm_offset_t)uaddr);
2037
2038         if (slb == NULL) {
2039                 /* If it isn't there, we need to pre-fault the VSID */
2040                 PMAP_LOCK(pm);
2041                 slbv = va_to_vsid(pm, (vm_offset_t)uaddr) << SLBV_VSID_SHIFT;
2042                 PMAP_UNLOCK(pm);
2043         } else {
2044                 slbv = slb->slbv;
2045         }
2046
2047         /* Mark segment no-execute */
2048         slbv |= SLBV_N;
2049 #else
2050         slbv = va_to_vsid(pm, (vm_offset_t)uaddr);
2051
2052         /* Mark segment no-execute */
2053         slbv |= SR_N;
2054 #endif
2055
2056         /* If we have already set this VSID, we can just return */
2057         if (curthread->td_pcb->pcb_cpu.aim.usr_vsid == slbv)
2058                 return (0);
2059
2060         __asm __volatile("isync");
2061         curthread->td_pcb->pcb_cpu.aim.usr_segm =
2062             (uintptr_t)uaddr >> ADDR_SR_SHFT;
2063         curthread->td_pcb->pcb_cpu.aim.usr_vsid = slbv;
2064 #ifdef __powerpc64__
2065         __asm __volatile ("slbie %0; slbmte %1, %2; isync" ::
2066             "r"(USER_ADDR), "r"(slbv), "r"(USER_SLB_SLBE));
2067 #else
2068         __asm __volatile("mtsr %0,%1; isync" :: "n"(USER_SR), "r"(slbv));
2069 #endif
2070
2071         return (0);
2072 }
2073
2074 /*
2075  * Figure out where a given kernel pointer (usually in a fault) points
2076  * to from the VM's perspective, potentially remapping into userland's
2077  * address space.
2078  */
2079 static int
2080 moea64_decode_kernel_ptr(vm_offset_t addr, int *is_user,
2081     vm_offset_t *decoded_addr)
2082 {
2083         vm_offset_t user_sr;
2084
2085         if ((addr >> ADDR_SR_SHFT) == (USER_ADDR >> ADDR_SR_SHFT)) {
2086                 user_sr = curthread->td_pcb->pcb_cpu.aim.usr_segm;
2087                 addr &= ADDR_PIDX | ADDR_POFF;
2088                 addr |= user_sr << ADDR_SR_SHFT;
2089                 *decoded_addr = addr;
2090                 *is_user = 1;
2091         } else {
2092                 *decoded_addr = addr;
2093                 *is_user = 0;
2094         }
2095
2096         return (0);
2097 }
2098
2099 /*
2100  * Map a range of physical addresses into kernel virtual address space.
2101  *
2102  * The value passed in *virt is a suggested virtual address for the mapping.
2103  * Architectures which can support a direct-mapped physical to virtual region
2104  * can return the appropriate address within that region, leaving '*virt'
2105  * unchanged.  Other architectures should map the pages starting at '*virt' and
2106  * update '*virt' with the first usable address after the mapped region.
2107  */
2108 vm_offset_t
2109 moea64_map(vm_offset_t *virt, vm_paddr_t pa_start,
2110     vm_paddr_t pa_end, int prot)
2111 {
2112         vm_offset_t     sva, va;
2113
2114         if (hw_direct_map) {
2115                 /*
2116                  * Check if every page in the region is covered by the direct
2117                  * map. The direct map covers all of physical memory. Use
2118                  * moea64_calc_wimg() as a shortcut to see if the page is in
2119                  * physical memory as a way to see if the direct map covers it.
2120                  */
2121                 for (va = pa_start; va < pa_end; va += PAGE_SIZE)
2122                         if (moea64_calc_wimg(va, VM_MEMATTR_DEFAULT) != LPTE_M)
2123                                 break;
2124                 if (va == pa_end)
2125                         return (PHYS_TO_DMAP(pa_start));
2126         }
2127         sva = *virt;
2128         va = sva;
2129         /* XXX respect prot argument */
2130         for (; pa_start < pa_end; pa_start += PAGE_SIZE, va += PAGE_SIZE)
2131                 moea64_kenter(va, pa_start);
2132         *virt = va;
2133
2134         return (sva);
2135 }
2136
2137 /*
2138  * Returns true if the pmap's pv is one of the first
2139  * 16 pvs linked to from this page.  This count may
2140  * be changed upwards or downwards in the future; it
2141  * is only necessary that true be returned for a small
2142  * subset of pmaps for proper page aging.
2143  */
2144 boolean_t
2145 moea64_page_exists_quick(pmap_t pmap, vm_page_t m)
2146 {
2147         int loops;
2148         struct pvo_entry *pvo;
2149         boolean_t rv;
2150
2151         KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2152             ("moea64_page_exists_quick: page %p is not managed", m));
2153         loops = 0;
2154         rv = FALSE;
2155         PV_PAGE_LOCK(m);
2156         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2157                 if (!(pvo->pvo_vaddr & PVO_DEAD) && pvo->pvo_pmap == pmap) {
2158                         rv = TRUE;
2159                         break;
2160                 }
2161                 if (++loops >= 16)
2162                         break;
2163         }
2164         PV_PAGE_UNLOCK(m);
2165         return (rv);
2166 }
2167
2168 void
2169 moea64_page_init(vm_page_t m)
2170 {
2171
2172         m->md.mdpg_attrs = 0;
2173         m->md.mdpg_cache_attrs = VM_MEMATTR_DEFAULT;
2174         LIST_INIT(&m->md.mdpg_pvoh);
2175 }
2176
2177 /*
2178  * Return the number of managed mappings to the given physical page
2179  * that are wired.
2180  */
2181 int
2182 moea64_page_wired_mappings(vm_page_t m)
2183 {
2184         struct pvo_entry *pvo;
2185         int count;
2186
2187         count = 0;
2188         if ((m->oflags & VPO_UNMANAGED) != 0)
2189                 return (count);
2190         PV_PAGE_LOCK(m);
2191         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink)
2192                 if ((pvo->pvo_vaddr & (PVO_DEAD | PVO_WIRED)) == PVO_WIRED)
2193                         count++;
2194         PV_PAGE_UNLOCK(m);
2195         return (count);
2196 }
2197
2198 static uintptr_t        moea64_vsidcontext;
2199
2200 uintptr_t
2201 moea64_get_unique_vsid(void) {
2202         u_int entropy;
2203         register_t hash;
2204         uint32_t mask;
2205         int i;
2206
2207         entropy = 0;
2208         __asm __volatile("mftb %0" : "=r"(entropy));
2209
2210         mtx_lock(&moea64_slb_mutex);
2211         for (i = 0; i < NVSIDS; i += VSID_NBPW) {
2212                 u_int   n;
2213
2214                 /*
2215                  * Create a new value by mutiplying by a prime and adding in
2216                  * entropy from the timebase register.  This is to make the
2217                  * VSID more random so that the PT hash function collides
2218                  * less often.  (Note that the prime casues gcc to do shifts
2219                  * instead of a multiply.)
2220                  */
2221                 moea64_vsidcontext = (moea64_vsidcontext * 0x1105) + entropy;
2222                 hash = moea64_vsidcontext & (NVSIDS - 1);
2223                 if (hash == 0)          /* 0 is special, avoid it */
2224                         continue;
2225                 n = hash >> 5;
2226                 mask = 1 << (hash & (VSID_NBPW - 1));
2227                 hash = (moea64_vsidcontext & VSID_HASHMASK);
2228                 if (moea64_vsid_bitmap[n] & mask) {     /* collision? */
2229                         /* anything free in this bucket? */
2230                         if (moea64_vsid_bitmap[n] == 0xffffffff) {
2231                                 entropy = (moea64_vsidcontext >> 20);
2232                                 continue;
2233                         }
2234                         i = ffs(~moea64_vsid_bitmap[n]) - 1;
2235                         mask = 1 << i;
2236                         hash &= rounddown2(VSID_HASHMASK, VSID_NBPW);
2237                         hash |= i;
2238                 }
2239                 if (hash == VSID_VRMA)  /* also special, avoid this too */
2240                         continue;
2241                 KASSERT(!(moea64_vsid_bitmap[n] & mask),
2242                     ("Allocating in-use VSID %#zx\n", hash));
2243                 moea64_vsid_bitmap[n] |= mask;
2244                 mtx_unlock(&moea64_slb_mutex);
2245                 return (hash);
2246         }
2247
2248         mtx_unlock(&moea64_slb_mutex);
2249         panic("%s: out of segments",__func__);
2250 }
2251
2252 #ifdef __powerpc64__
2253 int
2254 moea64_pinit(pmap_t pmap)
2255 {
2256
2257         RB_INIT(&pmap->pmap_pvo);
2258
2259         pmap->pm_slb_tree_root = slb_alloc_tree();
2260         pmap->pm_slb = slb_alloc_user_cache();
2261         pmap->pm_slb_len = 0;
2262
2263         return (1);
2264 }
2265 #else
2266 int
2267 moea64_pinit(pmap_t pmap)
2268 {
2269         int     i;
2270         uint32_t hash;
2271
2272         RB_INIT(&pmap->pmap_pvo);
2273
2274         if (pmap_bootstrapped)
2275                 pmap->pmap_phys = (pmap_t)moea64_kextract((vm_offset_t)pmap);
2276         else
2277                 pmap->pmap_phys = pmap;
2278
2279         /*
2280          * Allocate some segment registers for this pmap.
2281          */
2282         hash = moea64_get_unique_vsid();
2283
2284         for (i = 0; i < 16; i++)
2285                 pmap->pm_sr[i] = VSID_MAKE(i, hash);
2286
2287         KASSERT(pmap->pm_sr[0] != 0, ("moea64_pinit: pm_sr[0] = 0"));
2288
2289         return (1);
2290 }
2291 #endif
2292
2293 /*
2294  * Initialize the pmap associated with process 0.
2295  */
2296 void
2297 moea64_pinit0(pmap_t pm)
2298 {
2299
2300         PMAP_LOCK_INIT(pm);
2301         moea64_pinit(pm);
2302         bzero(&pm->pm_stats, sizeof(pm->pm_stats));
2303 }
2304
2305 /*
2306  * Set the physical protection on the specified range of this map as requested.
2307  */
2308 static void
2309 moea64_pvo_protect( pmap_t pm, struct pvo_entry *pvo, vm_prot_t prot)
2310 {
2311         struct vm_page *pg;
2312         vm_prot_t oldprot;
2313         int32_t refchg;
2314
2315         PMAP_LOCK_ASSERT(pm, MA_OWNED);
2316
2317         /*
2318          * Change the protection of the page.
2319          */
2320         oldprot = pvo->pvo_pte.prot;
2321         pvo->pvo_pte.prot = prot;
2322         pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2323
2324         /*
2325          * If the PVO is in the page table, update mapping
2326          */
2327         refchg = moea64_pte_replace(pvo, MOEA64_PTE_PROT_UPDATE);
2328         if (refchg < 0)
2329                 refchg = (oldprot & VM_PROT_WRITE) ? LPTE_CHG : 0;
2330
2331         if (pm != kernel_pmap && pg != NULL &&
2332             (pg->a.flags & PGA_EXECUTABLE) == 0 &&
2333             (pvo->pvo_pte.pa & (LPTE_I | LPTE_G | LPTE_NOEXEC)) == 0) {
2334                 if ((pg->oflags & VPO_UNMANAGED) == 0)
2335                         vm_page_aflag_set(pg, PGA_EXECUTABLE);
2336                 moea64_syncicache(pm, PVO_VADDR(pvo),
2337                     PVO_PADDR(pvo), PAGE_SIZE);
2338         }
2339
2340         /*
2341          * Update vm about the REF/CHG bits if the page is managed and we have
2342          * removed write access.
2343          */
2344         if (pg != NULL && (pvo->pvo_vaddr & PVO_MANAGED) &&
2345             (oldprot & VM_PROT_WRITE)) {
2346                 refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2347                 if (refchg & LPTE_CHG)
2348                         vm_page_dirty(pg);
2349                 if (refchg & LPTE_REF)
2350                         vm_page_aflag_set(pg, PGA_REFERENCED);
2351         }
2352 }
2353
2354 void
2355 moea64_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva,
2356     vm_prot_t prot)
2357 {
2358         struct  pvo_entry *pvo, *tpvo, key;
2359
2360         CTR4(KTR_PMAP, "moea64_protect: pm=%p sva=%#x eva=%#x prot=%#x", pm,
2361             sva, eva, prot);
2362
2363         KASSERT(pm == &curproc->p_vmspace->vm_pmap || pm == kernel_pmap,
2364             ("moea64_protect: non current pmap"));
2365
2366         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2367                 moea64_remove(pm, sva, eva);
2368                 return;
2369         }
2370
2371         PMAP_LOCK(pm);
2372         key.pvo_vaddr = sva;
2373         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2374             pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2375                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2376                 moea64_pvo_protect(pm, pvo, prot);
2377         }
2378         PMAP_UNLOCK(pm);
2379 }
2380
2381 /*
2382  * Map a list of wired pages into kernel virtual address space.  This is
2383  * intended for temporary mappings which do not need page modification or
2384  * references recorded.  Existing mappings in the region are overwritten.
2385  */
2386 void
2387 moea64_qenter(vm_offset_t va, vm_page_t *m, int count)
2388 {
2389         while (count-- > 0) {
2390                 moea64_kenter(va, VM_PAGE_TO_PHYS(*m));
2391                 va += PAGE_SIZE;
2392                 m++;
2393         }
2394 }
2395
2396 /*
2397  * Remove page mappings from kernel virtual address space.  Intended for
2398  * temporary mappings entered by moea64_qenter.
2399  */
2400 void
2401 moea64_qremove(vm_offset_t va, int count)
2402 {
2403         while (count-- > 0) {
2404                 moea64_kremove(va);
2405                 va += PAGE_SIZE;
2406         }
2407 }
2408
2409 void
2410 moea64_release_vsid(uint64_t vsid)
2411 {
2412         int idx, mask;
2413
2414         mtx_lock(&moea64_slb_mutex);
2415         idx = vsid & (NVSIDS-1);
2416         mask = 1 << (idx % VSID_NBPW);
2417         idx /= VSID_NBPW;
2418         KASSERT(moea64_vsid_bitmap[idx] & mask,
2419             ("Freeing unallocated VSID %#jx", vsid));
2420         moea64_vsid_bitmap[idx] &= ~mask;
2421         mtx_unlock(&moea64_slb_mutex);
2422 }
2423
2424 void
2425 moea64_release(pmap_t pmap)
2426 {
2427
2428         /*
2429          * Free segment registers' VSIDs
2430          */
2431     #ifdef __powerpc64__
2432         slb_free_tree(pmap);
2433         slb_free_user_cache(pmap->pm_slb);
2434     #else
2435         KASSERT(pmap->pm_sr[0] != 0, ("moea64_release: pm_sr[0] = 0"));
2436
2437         moea64_release_vsid(VSID_TO_HASH(pmap->pm_sr[0]));
2438     #endif
2439 }
2440
2441 /*
2442  * Remove all pages mapped by the specified pmap
2443  */
2444 void
2445 moea64_remove_pages(pmap_t pm)
2446 {
2447         struct pvo_entry *pvo, *tpvo;
2448         struct pvo_dlist tofree;
2449
2450         SLIST_INIT(&tofree);
2451
2452         PMAP_LOCK(pm);
2453         RB_FOREACH_SAFE(pvo, pvo_tree, &pm->pmap_pvo, tpvo) {
2454                 if (pvo->pvo_vaddr & PVO_WIRED)
2455                         continue;
2456
2457                 /*
2458                  * For locking reasons, remove this from the page table and
2459                  * pmap, but save delinking from the vm_page for a second
2460                  * pass
2461                  */
2462                 moea64_pvo_remove_from_pmap(pvo);
2463                 SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
2464         }
2465         PMAP_UNLOCK(pm);
2466
2467         while (!SLIST_EMPTY(&tofree)) {
2468                 pvo = SLIST_FIRST(&tofree);
2469                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2470                 moea64_pvo_remove_from_page(pvo);
2471                 free_pvo_entry(pvo);
2472         }
2473 }
2474
2475 /*
2476  * Remove the given range of addresses from the specified map.
2477  */
2478 void
2479 moea64_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
2480 {
2481         struct  pvo_entry *pvo, *tpvo, key;
2482         struct pvo_dlist tofree;
2483
2484         /*
2485          * Perform an unsynchronized read.  This is, however, safe.
2486          */
2487         if (pm->pm_stats.resident_count == 0)
2488                 return;
2489
2490         key.pvo_vaddr = sva;
2491
2492         SLIST_INIT(&tofree);
2493
2494         PMAP_LOCK(pm);
2495         for (pvo = RB_NFIND(pvo_tree, &pm->pmap_pvo, &key);
2496             pvo != NULL && PVO_VADDR(pvo) < eva; pvo = tpvo) {
2497                 tpvo = RB_NEXT(pvo_tree, &pm->pmap_pvo, pvo);
2498
2499                 /*
2500                  * For locking reasons, remove this from the page table and
2501                  * pmap, but save delinking from the vm_page for a second
2502                  * pass
2503                  */
2504                 moea64_pvo_remove_from_pmap(pvo);
2505                 SLIST_INSERT_HEAD(&tofree, pvo, pvo_dlink);
2506         }
2507         PMAP_UNLOCK(pm);
2508
2509         while (!SLIST_EMPTY(&tofree)) {
2510                 pvo = SLIST_FIRST(&tofree);
2511                 SLIST_REMOVE_HEAD(&tofree, pvo_dlink);
2512                 moea64_pvo_remove_from_page(pvo);
2513                 free_pvo_entry(pvo);
2514         }
2515 }
2516
2517 /*
2518  * Remove physical page from all pmaps in which it resides. moea64_pvo_remove()
2519  * will reflect changes in pte's back to the vm_page.
2520  */
2521 void
2522 moea64_remove_all(vm_page_t m)
2523 {
2524         struct  pvo_entry *pvo, *next_pvo;
2525         struct  pvo_head freequeue;
2526         int     wasdead;
2527         pmap_t  pmap;
2528
2529         LIST_INIT(&freequeue);
2530
2531         PV_PAGE_LOCK(m);
2532         LIST_FOREACH_SAFE(pvo, vm_page_to_pvoh(m), pvo_vlink, next_pvo) {
2533                 pmap = pvo->pvo_pmap;
2534                 PMAP_LOCK(pmap);
2535                 wasdead = (pvo->pvo_vaddr & PVO_DEAD);
2536                 if (!wasdead)
2537                         moea64_pvo_remove_from_pmap(pvo);
2538                 moea64_pvo_remove_from_page_locked(pvo, m);
2539                 if (!wasdead)
2540                         LIST_INSERT_HEAD(&freequeue, pvo, pvo_vlink);
2541                 PMAP_UNLOCK(pmap);
2542
2543         }
2544         KASSERT(!pmap_page_is_mapped(m), ("Page still has mappings"));
2545         KASSERT((m->a.flags & PGA_WRITEABLE) == 0, ("Page still writable"));
2546         PV_PAGE_UNLOCK(m);
2547
2548         /* Clean up UMA allocations */
2549         LIST_FOREACH_SAFE(pvo, &freequeue, pvo_vlink, next_pvo)
2550                 free_pvo_entry(pvo);
2551 }
2552
2553 /*
2554  * Allocate a physical page of memory directly from the phys_avail map.
2555  * Can only be called from moea64_bootstrap before avail start and end are
2556  * calculated.
2557  */
2558 vm_offset_t
2559 moea64_bootstrap_alloc(vm_size_t size, vm_size_t align)
2560 {
2561         vm_offset_t     s, e;
2562         int             i, j;
2563
2564         size = round_page(size);
2565         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
2566                 if (align != 0)
2567                         s = roundup2(phys_avail[i], align);
2568                 else
2569                         s = phys_avail[i];
2570                 e = s + size;
2571
2572                 if (s < phys_avail[i] || e > phys_avail[i + 1])
2573                         continue;
2574
2575                 if (s + size > platform_real_maxaddr())
2576                         continue;
2577
2578                 if (s == phys_avail[i]) {
2579                         phys_avail[i] += size;
2580                 } else if (e == phys_avail[i + 1]) {
2581                         phys_avail[i + 1] -= size;
2582                 } else {
2583                         for (j = phys_avail_count * 2; j > i; j -= 2) {
2584                                 phys_avail[j] = phys_avail[j - 2];
2585                                 phys_avail[j + 1] = phys_avail[j - 1];
2586                         }
2587
2588                         phys_avail[i + 3] = phys_avail[i + 1];
2589                         phys_avail[i + 1] = s;
2590                         phys_avail[i + 2] = e;
2591                         phys_avail_count++;
2592                 }
2593
2594                 return (s);
2595         }
2596         panic("moea64_bootstrap_alloc: could not allocate memory");
2597 }
2598
2599 static int
2600 moea64_pvo_enter(struct pvo_entry *pvo, struct pvo_head *pvo_head,
2601     struct pvo_entry **oldpvop)
2602 {
2603         struct pvo_entry *old_pvo;
2604         int err;
2605
2606         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2607
2608         STAT_MOEA64(moea64_pvo_enter_calls++);
2609
2610         /*
2611          * Add to pmap list
2612          */
2613         old_pvo = RB_INSERT(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2614
2615         if (old_pvo != NULL) {
2616                 if (oldpvop != NULL)
2617                         *oldpvop = old_pvo;
2618                 return (EEXIST);
2619         }
2620
2621         if (pvo_head != NULL) {
2622                 LIST_INSERT_HEAD(pvo_head, pvo, pvo_vlink);
2623         }
2624
2625         if (pvo->pvo_vaddr & PVO_WIRED)
2626                 pvo->pvo_pmap->pm_stats.wired_count++;
2627         pvo->pvo_pmap->pm_stats.resident_count++;
2628
2629         /*
2630          * Insert it into the hardware page table
2631          */
2632         err = moea64_pte_insert(pvo);
2633         if (err != 0) {
2634                 panic("moea64_pvo_enter: overflow");
2635         }
2636
2637         STAT_MOEA64(moea64_pvo_entries++);
2638
2639         if (pvo->pvo_pmap == kernel_pmap)
2640                 isync();
2641
2642 #ifdef __powerpc64__
2643         /*
2644          * Make sure all our bootstrap mappings are in the SLB as soon
2645          * as virtual memory is switched on.
2646          */
2647         if (!pmap_bootstrapped)
2648                 moea64_bootstrap_slb_prefault(PVO_VADDR(pvo),
2649                     pvo->pvo_vaddr & PVO_LARGE);
2650 #endif
2651
2652         return (0);
2653 }
2654
2655 static void
2656 moea64_pvo_remove_from_pmap(struct pvo_entry *pvo)
2657 {
2658         struct  vm_page *pg;
2659         int32_t refchg;
2660
2661         KASSERT(pvo->pvo_pmap != NULL, ("Trying to remove PVO with no pmap"));
2662         PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
2663         KASSERT(!(pvo->pvo_vaddr & PVO_DEAD), ("Trying to remove dead PVO"));
2664
2665         /*
2666          * If there is an active pte entry, we need to deactivate it
2667          */
2668         refchg = moea64_pte_unset(pvo);
2669         if (refchg < 0) {
2670                 /*
2671                  * If it was evicted from the page table, be pessimistic and
2672                  * dirty the page.
2673                  */
2674                 if (pvo->pvo_pte.prot & VM_PROT_WRITE)
2675                         refchg = LPTE_CHG;
2676                 else
2677                         refchg = 0;
2678         }
2679
2680         /*
2681          * Update our statistics.
2682          */
2683         pvo->pvo_pmap->pm_stats.resident_count--;
2684         if (pvo->pvo_vaddr & PVO_WIRED)
2685                 pvo->pvo_pmap->pm_stats.wired_count--;
2686
2687         /*
2688          * Remove this PVO from the pmap list.
2689          */
2690         RB_REMOVE(pvo_tree, &pvo->pvo_pmap->pmap_pvo, pvo);
2691
2692         /*
2693          * Mark this for the next sweep
2694          */
2695         pvo->pvo_vaddr |= PVO_DEAD;
2696
2697         /* Send RC bits to VM */
2698         if ((pvo->pvo_vaddr & PVO_MANAGED) &&
2699             (pvo->pvo_pte.prot & VM_PROT_WRITE)) {
2700                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2701                 if (pg != NULL) {
2702                         refchg |= atomic_readandclear_32(&pg->md.mdpg_attrs);
2703                         if (refchg & LPTE_CHG)
2704                                 vm_page_dirty(pg);
2705                         if (refchg & LPTE_REF)
2706                                 vm_page_aflag_set(pg, PGA_REFERENCED);
2707                 }
2708         }
2709 }
2710
2711 static inline void
2712 moea64_pvo_remove_from_page_locked(struct pvo_entry *pvo,
2713     vm_page_t m)
2714 {
2715
2716         KASSERT(pvo->pvo_vaddr & PVO_DEAD, ("Trying to delink live page"));
2717
2718         /* Use NULL pmaps as a sentinel for races in page deletion */
2719         if (pvo->pvo_pmap == NULL)
2720                 return;
2721         pvo->pvo_pmap = NULL;
2722
2723         /*
2724          * Update vm about page writeability/executability if managed
2725          */
2726         PV_LOCKASSERT(PVO_PADDR(pvo));
2727         if (pvo->pvo_vaddr & PVO_MANAGED) {
2728                 if (m != NULL) {
2729                         LIST_REMOVE(pvo, pvo_vlink);
2730                         if (LIST_EMPTY(vm_page_to_pvoh(m)))
2731                                 vm_page_aflag_clear(m,
2732                                     PGA_WRITEABLE | PGA_EXECUTABLE);
2733                 }
2734         }
2735
2736         STAT_MOEA64(moea64_pvo_entries--);
2737         STAT_MOEA64(moea64_pvo_remove_calls++);
2738 }
2739
2740 static void
2741 moea64_pvo_remove_from_page(struct pvo_entry *pvo)
2742 {
2743         vm_page_t pg = NULL;
2744
2745         if (pvo->pvo_vaddr & PVO_MANAGED)
2746                 pg = PHYS_TO_VM_PAGE(PVO_PADDR(pvo));
2747
2748         PV_LOCK(PVO_PADDR(pvo));
2749         moea64_pvo_remove_from_page_locked(pvo, pg);
2750         PV_UNLOCK(PVO_PADDR(pvo));
2751 }
2752
2753 static struct pvo_entry *
2754 moea64_pvo_find_va(pmap_t pm, vm_offset_t va)
2755 {
2756         struct pvo_entry key;
2757
2758         PMAP_LOCK_ASSERT(pm, MA_OWNED);
2759
2760         key.pvo_vaddr = va & ~ADDR_POFF;
2761         return (RB_FIND(pvo_tree, &pm->pmap_pvo, &key));
2762 }
2763
2764 static boolean_t
2765 moea64_query_bit(vm_page_t m, uint64_t ptebit)
2766 {
2767         struct  pvo_entry *pvo;
2768         int64_t ret;
2769         boolean_t rv;
2770
2771         /*
2772          * See if this bit is stored in the page already.
2773          */
2774         if (m->md.mdpg_attrs & ptebit)
2775                 return (TRUE);
2776
2777         /*
2778          * Examine each PTE.  Sync so that any pending REF/CHG bits are
2779          * flushed to the PTEs.
2780          */
2781         rv = FALSE;
2782         powerpc_sync();
2783         PV_PAGE_LOCK(m);
2784         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2785                 ret = 0;
2786
2787                 /*
2788                  * See if this pvo has a valid PTE.  if so, fetch the
2789                  * REF/CHG bits from the valid PTE.  If the appropriate
2790                  * ptebit is set, return success.
2791                  */
2792                 PMAP_LOCK(pvo->pvo_pmap);
2793                 if (!(pvo->pvo_vaddr & PVO_DEAD))
2794                         ret = moea64_pte_synch(pvo);
2795                 PMAP_UNLOCK(pvo->pvo_pmap);
2796
2797                 if (ret > 0) {
2798                         atomic_set_32(&m->md.mdpg_attrs,
2799                             ret & (LPTE_CHG | LPTE_REF));
2800                         if (ret & ptebit) {
2801                                 rv = TRUE;
2802                                 break;
2803                         }
2804                 }
2805         }
2806         PV_PAGE_UNLOCK(m);
2807
2808         return (rv);
2809 }
2810
2811 static u_int
2812 moea64_clear_bit(vm_page_t m, u_int64_t ptebit)
2813 {
2814         u_int   count;
2815         struct  pvo_entry *pvo;
2816         int64_t ret;
2817
2818         /*
2819          * Sync so that any pending REF/CHG bits are flushed to the PTEs (so
2820          * we can reset the right ones).
2821          */
2822         powerpc_sync();
2823
2824         /*
2825          * For each pvo entry, clear the pte's ptebit.
2826          */
2827         count = 0;
2828         PV_PAGE_LOCK(m);
2829         LIST_FOREACH(pvo, vm_page_to_pvoh(m), pvo_vlink) {
2830                 ret = 0;
2831
2832                 PMAP_LOCK(pvo->pvo_pmap);
2833                 if (!(pvo->pvo_vaddr & PVO_DEAD))
2834                         ret = moea64_pte_clear(pvo, ptebit);
2835                 PMAP_UNLOCK(pvo->pvo_pmap);
2836
2837                 if (ret > 0 && (ret & ptebit))
2838                         count++;
2839         }
2840         atomic_clear_32(&m->md.mdpg_attrs, ptebit);
2841         PV_PAGE_UNLOCK(m);
2842
2843         return (count);
2844 }
2845
2846 boolean_t
2847 moea64_dev_direct_mapped(vm_paddr_t pa, vm_size_t size)
2848 {
2849         struct pvo_entry *pvo, key;
2850         vm_offset_t ppa;
2851         int error = 0;
2852
2853         if (hw_direct_map && mem_valid(pa, size) == 0)
2854                 return (0);
2855
2856         PMAP_LOCK(kernel_pmap);
2857         ppa = pa & ~ADDR_POFF;
2858         key.pvo_vaddr = DMAP_BASE_ADDRESS + ppa;
2859         for (pvo = RB_FIND(pvo_tree, &kernel_pmap->pmap_pvo, &key);
2860             ppa < pa + size; ppa += PAGE_SIZE,
2861             pvo = RB_NEXT(pvo_tree, &kernel_pmap->pmap_pvo, pvo)) {
2862                 if (pvo == NULL || PVO_PADDR(pvo) != ppa) {
2863                         error = EFAULT;
2864                         break;
2865                 }
2866         }
2867         PMAP_UNLOCK(kernel_pmap);
2868
2869         return (error);
2870 }
2871
2872 /*
2873  * Map a set of physical memory pages into the kernel virtual
2874  * address space. Return a pointer to where it is mapped. This
2875  * routine is intended to be used for mapping device memory,
2876  * NOT real memory.
2877  */
2878 void *
2879 moea64_mapdev_attr(vm_paddr_t pa, vm_size_t size, vm_memattr_t ma)
2880 {
2881         vm_offset_t va, tmpva, ppa, offset;
2882
2883         ppa = trunc_page(pa);
2884         offset = pa & PAGE_MASK;
2885         size = roundup2(offset + size, PAGE_SIZE);
2886
2887         va = kva_alloc(size);
2888
2889         if (!va)
2890                 panic("moea64_mapdev: Couldn't alloc kernel virtual memory");
2891
2892         for (tmpva = va; size > 0;) {
2893                 moea64_kenter_attr(tmpva, ppa, ma);
2894                 size -= PAGE_SIZE;
2895                 tmpva += PAGE_SIZE;
2896                 ppa += PAGE_SIZE;
2897         }
2898
2899         return ((void *)(va + offset));
2900 }
2901
2902 void *
2903 moea64_mapdev(vm_paddr_t pa, vm_size_t size)
2904 {
2905
2906         return moea64_mapdev_attr(pa, size, VM_MEMATTR_DEFAULT);
2907 }
2908
2909 void
2910 moea64_unmapdev(vm_offset_t va, vm_size_t size)
2911 {
2912         vm_offset_t base, offset;
2913
2914         base = trunc_page(va);
2915         offset = va & PAGE_MASK;
2916         size = roundup2(offset + size, PAGE_SIZE);
2917
2918         moea64_qremove(base, atop(size));
2919         kva_free(base, size);
2920 }
2921
2922 void
2923 moea64_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
2924 {
2925         struct pvo_entry *pvo;
2926         vm_offset_t lim;
2927         vm_paddr_t pa;
2928         vm_size_t len;
2929
2930         if (__predict_false(pm == NULL))
2931                 pm = &curthread->td_proc->p_vmspace->vm_pmap;
2932
2933         PMAP_LOCK(pm);
2934         while (sz > 0) {
2935                 lim = round_page(va+1);
2936                 len = MIN(lim - va, sz);
2937                 pvo = moea64_pvo_find_va(pm, va & ~ADDR_POFF);
2938                 if (pvo != NULL && !(pvo->pvo_pte.pa & LPTE_I)) {
2939                         pa = PVO_PADDR(pvo) | (va & ADDR_POFF);
2940                         moea64_syncicache(pm, va, pa, len);
2941                 }
2942                 va += len;
2943                 sz -= len;
2944         }
2945         PMAP_UNLOCK(pm);
2946 }
2947
2948 void
2949 moea64_dumpsys_map(vm_paddr_t pa, size_t sz, void **va)
2950 {
2951
2952         *va = (void *)(uintptr_t)pa;
2953 }
2954
2955 extern struct dump_pa dump_map[PHYS_AVAIL_SZ + 1];
2956
2957 void
2958 moea64_scan_init()
2959 {
2960         struct pvo_entry *pvo;
2961         vm_offset_t va;
2962         int i;
2963
2964         if (!do_minidump) {
2965                 /* Initialize phys. segments for dumpsys(). */
2966                 memset(&dump_map, 0, sizeof(dump_map));
2967                 mem_regions(&pregions, &pregions_sz, &regions, &regions_sz);
2968                 for (i = 0; i < pregions_sz; i++) {
2969                         dump_map[i].pa_start = pregions[i].mr_start;
2970                         dump_map[i].pa_size = pregions[i].mr_size;
2971                 }
2972                 return;
2973         }
2974
2975         /* Virtual segments for minidumps: */
2976         memset(&dump_map, 0, sizeof(dump_map));
2977
2978         /* 1st: kernel .data and .bss. */
2979         dump_map[0].pa_start = trunc_page((uintptr_t)_etext);
2980         dump_map[0].pa_size = round_page((uintptr_t)_end) -
2981             dump_map[0].pa_start;
2982
2983         /* 2nd: msgbuf and tables (see pmap_bootstrap()). */
2984         dump_map[1].pa_start = (vm_paddr_t)(uintptr_t)msgbufp->msg_ptr;
2985         dump_map[1].pa_size = round_page(msgbufp->msg_size);
2986
2987         /* 3rd: kernel VM. */
2988         va = dump_map[1].pa_start + dump_map[1].pa_size;
2989         /* Find start of next chunk (from va). */
2990         while (va < virtual_end) {
2991                 /* Don't dump the buffer cache. */
2992                 if (va >= kmi.buffer_sva && va < kmi.buffer_eva) {
2993                         va = kmi.buffer_eva;
2994                         continue;
2995                 }
2996                 pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
2997                 if (pvo != NULL && !(pvo->pvo_vaddr & PVO_DEAD))
2998                         break;
2999                 va += PAGE_SIZE;
3000         }
3001         if (va < virtual_end) {
3002                 dump_map[2].pa_start = va;
3003                 va += PAGE_SIZE;
3004                 /* Find last page in chunk. */
3005                 while (va < virtual_end) {
3006                         /* Don't run into the buffer cache. */
3007                         if (va == kmi.buffer_sva)
3008                                 break;
3009                         pvo = moea64_pvo_find_va(kernel_pmap, va & ~ADDR_POFF);
3010                         if (pvo == NULL || (pvo->pvo_vaddr & PVO_DEAD))
3011                                 break;
3012                         va += PAGE_SIZE;
3013                 }
3014                 dump_map[2].pa_size = va - dump_map[2].pa_start;
3015         }
3016 }
3017
3018 #ifdef __powerpc64__
3019
3020 static size_t
3021 moea64_scan_pmap()
3022 {
3023         struct pvo_entry *pvo;
3024         vm_paddr_t pa, pa_end;
3025         vm_offset_t va, pgva, kstart, kend, kstart_lp, kend_lp;
3026         uint64_t lpsize;
3027
3028         lpsize = moea64_large_page_size;
3029         kstart = trunc_page((vm_offset_t)_etext);
3030         kend = round_page((vm_offset_t)_end);
3031         kstart_lp = kstart & ~moea64_large_page_mask;
3032         kend_lp = (kend + moea64_large_page_mask) & ~moea64_large_page_mask;
3033
3034         CTR4(KTR_PMAP, "moea64_scan_pmap: kstart=0x%016lx, kend=0x%016lx, "
3035             "kstart_lp=0x%016lx, kend_lp=0x%016lx",
3036             kstart, kend, kstart_lp, kend_lp);
3037
3038         PMAP_LOCK(kernel_pmap);
3039         RB_FOREACH(pvo, pvo_tree, &kernel_pmap->pmap_pvo) {
3040                 va = pvo->pvo_vaddr;
3041
3042                 if (va & PVO_DEAD)
3043                         continue;
3044
3045                 /* Skip DMAP (except kernel area) */
3046                 if (va >= DMAP_BASE_ADDRESS && va <= DMAP_MAX_ADDRESS) {
3047                         if (va & PVO_LARGE) {
3048                                 pgva = va & ~moea64_large_page_mask;
3049                                 if (pgva < kstart_lp || pgva >= kend_lp)
3050                                         continue;
3051                         } else {
3052                                 pgva = trunc_page(va);
3053                                 if (pgva < kstart || pgva >= kend)
3054                                         continue;
3055                         }
3056                 }
3057
3058                 pa = PVO_PADDR(pvo);
3059
3060                 if (va & PVO_LARGE) {
3061                         pa_end = pa + lpsize;
3062                         for (; pa < pa_end; pa += PAGE_SIZE) {
3063                                 if (is_dumpable(pa))
3064                                         dump_add_page(pa);
3065                         }
3066                 } else {
3067                         if (is_dumpable(pa))
3068                                 dump_add_page(pa);
3069                 }
3070         }
3071         PMAP_UNLOCK(kernel_pmap);
3072
3073         return (sizeof(struct lpte) * moea64_pteg_count * 8);
3074 }
3075
3076 static struct dump_context dump_ctx;
3077
3078 static void *
3079 moea64_dump_pmap_init(unsigned blkpgs)
3080 {
3081         dump_ctx.ptex = 0;
3082         dump_ctx.ptex_end = moea64_pteg_count * 8;
3083         dump_ctx.blksz = blkpgs * PAGE_SIZE;
3084         return (&dump_ctx);
3085 }
3086
3087 #else
3088
/*
 * Variant used when __powerpc64__ is not defined: performs no scan and
 * reports a size of 0.
 */
static size_t
moea64_scan_pmap(void)
{
	return (0);
}
3094
/*
 * Variant used when __powerpc64__ is not defined: there is no dump context
 * to set up, so NULL is returned whatever blkpgs is.
 */
static void *
moea64_dump_pmap_init(unsigned blkpgs)
{
	void *no_ctx = NULL;

	return (no_ctx);
}
3100
3101 #endif
3102
3103 #ifdef __powerpc64__
3104 static void
3105 moea64_map_range(vm_offset_t va, vm_paddr_t pa, vm_size_t npages)
3106 {
3107
3108         for (; npages > 0; --npages) {
3109                 if (moea64_large_page_size != 0 &&
3110                     (pa & moea64_large_page_mask) == 0 &&
3111                     (va & moea64_large_page_mask) == 0 &&
3112                     npages >= (moea64_large_page_size >> PAGE_SHIFT)) {
3113                         PMAP_LOCK(kernel_pmap);
3114                         moea64_kenter_large(va, pa, 0, 0);
3115                         PMAP_UNLOCK(kernel_pmap);
3116                         pa += moea64_large_page_size;
3117                         va += moea64_large_page_size;
3118                         npages -= (moea64_large_page_size >> PAGE_SHIFT) - 1;
3119                 } else {
3120                         moea64_kenter(va, pa);
3121                         pa += PAGE_SIZE;
3122                         va += PAGE_SIZE;
3123                 }
3124         }
3125 }
3126
3127 static void
3128 moea64_page_array_startup(long pages)
3129 {
3130         long dom_pages[MAXMEMDOM];
3131         vm_paddr_t pa;
3132         vm_offset_t va, vm_page_base;
3133         vm_size_t needed, size;
3134         long page;
3135         int domain;
3136         int i;
3137
3138         vm_page_base = 0xd000000000000000ULL;
3139
3140         /* Short-circuit single-domain systems. */
3141         if (vm_ndomains == 1) {
3142                 size = round_page(pages * sizeof(struct vm_page));
3143                 pa = vm_phys_early_alloc(0, size);
3144                 vm_page_base = moea64_map(&vm_page_base,
3145                     pa, pa + size, VM_PROT_READ | VM_PROT_WRITE);
3146                 vm_page_array_size = pages;
3147                 vm_page_array = (vm_page_t)vm_page_base;
3148                 return;
3149         }
3150
3151         page = 0;
3152         for (i = 0; i < MAXMEMDOM; i++)
3153                 dom_pages[i] = 0;
3154
3155         /* Now get the number of pages required per domain. */
3156         for (i = 0; i < vm_phys_nsegs; i++) {
3157                 domain = vm_phys_segs[i].domain;
3158                 KASSERT(domain < MAXMEMDOM,
3159                     ("Invalid vm_phys_segs NUMA domain %d!\n", domain));
3160                 /* Get size of vm_page_array needed for this segment. */
3161                 size = btoc(vm_phys_segs[i].end - vm_phys_segs[i].start);
3162                 dom_pages[domain] += size;
3163         }
3164
3165         for (i = 0; phys_avail[i + 1] != 0; i+= 2) {
3166                 domain = _vm_phys_domain(phys_avail[i]);
3167                 KASSERT(domain < MAXMEMDOM,
3168                     ("Invalid phys_avail NUMA domain %d!\n", domain));
3169                 size = btoc(phys_avail[i + 1] - phys_avail[i]);
3170                 dom_pages[domain] += size;
3171         }
3172
3173         /*
3174          * Map in chunks that can get us all 16MB pages.  There will be some
3175          * overlap between domains, but that's acceptable for now.
3176          */
3177         vm_page_array_size = 0;
3178         va = vm_page_base;
3179         for (i = 0; i < MAXMEMDOM && vm_page_array_size < pages; i++) {
3180                 if (dom_pages[i] == 0)
3181                         continue;
3182                 size = ulmin(pages - vm_page_array_size, dom_pages[i]);
3183                 size = round_page(size * sizeof(struct vm_page));
3184                 needed = size;
3185                 size = roundup2(size, moea64_large_page_size);
3186                 pa = vm_phys_early_alloc(i, size);
3187                 vm_page_array_size += size / sizeof(struct vm_page);
3188                 moea64_map_range(va, pa, size >> PAGE_SHIFT);
3189                 /* Scoot up domain 0, to reduce the domain page overlap. */
3190                 if (i == 0)
3191                         vm_page_base += size - needed;
3192                 va += size;
3193         }
3194         vm_page_array = (vm_page_t)vm_page_base;
3195         vm_page_array_size = pages;
3196 }
3197 #endif
3198
/*
 * Do-nothing method: takes no arguments and always yields 0.  It is the
 * fallback passed to DEFINE_OEA64_IFUNC for several of the PTE methods.
 */
static int64_t
moea64_null_method(void)
{
	const int64_t nothing = 0;

	return (nothing);
}
3204
/*
 * Fallback "replace PTE" method: unset the PVO's PTE and insert it again.
 * Returns the value produced by moea64_pte_unset(); the result of
 * moea64_pte_insert() is not examined and flags is not used.
 */
static int64_t
moea64_pte_replace_default(struct pvo_entry *pvo, int flags)
{
	const int64_t old_bits = moea64_pte_unset(pvo);

	moea64_pte_insert(pvo);

	return (old_bits);
}
3214
struct moea64_funcs *moea64_ops;

/*
 * Define the ifunc moea64_<func> with return type 'ret' and argument list
 * 'args'.  Its resolver picks moea64_ops->func when moea64_ops is set and
 * that slot is non-NULL, and falls back to 'def' (cast to the method's
 * function type) otherwise.
 */
#define DEFINE_OEA64_IFUNC(ret, func, args, def)                \
        DEFINE_IFUNC(, ret, moea64_##func, args) {              \
                moea64_##func##_t impl = NULL;                  \
                if (moea64_ops != NULL)                         \
                        impl = moea64_ops->func;                \
                if (impl == NULL)                               \
                        impl = (moea64_##func##_t)def;          \
                return (impl);                                  \
        }
3225
3226 DEFINE_OEA64_IFUNC(int64_t, pte_replace, (struct pvo_entry *, int),
3227     moea64_pte_replace_default)
3228 DEFINE_OEA64_IFUNC(int64_t, pte_insert, (struct pvo_entry *), moea64_null_method)
3229 DEFINE_OEA64_IFUNC(int64_t, pte_unset, (struct pvo_entry *), moea64_null_method)
3230 DEFINE_OEA64_IFUNC(int64_t, pte_clear, (struct pvo_entry *, uint64_t),
3231     moea64_null_method)
3232 DEFINE_OEA64_IFUNC(int64_t, pte_synch, (struct pvo_entry *), moea64_null_method)