1 /*-
2  * Copyright (c) 2006 Kip Macy <kmacy@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include "opt_kstack_pages.h"
32 #include "opt_msgbuf.h"
33 #include "opt_pmap.h"
34 #include "opt_trap_trace.h"
35
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/kdb.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/msgbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/smp.h>
45 #include <sys/sched.h>
46 #include <sys/sysctl.h>
47 #include <sys/systm.h>
48 #include <sys/vmmeter.h>
49
50 #include <dev/ofw/openfirm.h>
51
52 #include <vm/vm.h> 
53 #include <vm/vm_page.h>
54 #include <vm/vm_param.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_extern.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_pager.h>
61 #include <vm/vm_phys.h>
62 #include <vm/uma.h>
63
64 #include <machine/cpu.h>
65 #include <machine/frame.h>
66 #include <machine/instr.h>
67 #include <machine/md_var.h>
68 #include <machine/metadata.h>
69 #include <machine/ofw_mem.h>
70 #include <machine/mmu.h>
71 #include <machine/smp.h>
72 #include <machine/tlb.h>
73 #include <machine/tte.h>
74 #include <machine/tte_hash.h>
75 #include <machine/pcb.h>
76 #include <machine/pstate.h>
77 #include <machine/tsb.h>
78
79 #include <machine/hypervisorvar.h>
80 #include <machine/hv_api.h>
81
82 #ifdef TRAP_TRACING
83 void trap_trace_report(int);
84 #endif
85
86 #if 1
87 #define PMAP_DEBUG
88 #endif
89 #ifndef PMAP_SHPGPERPROC
90 #define PMAP_SHPGPERPROC        200
91 #endif
92
93 /*
94  * Virtual and physical address of message buffer.
95  */
96 struct msgbuf *msgbufp;
97 vm_paddr_t msgbuf_phys;
98
99 /*
100  * Map of physical memory regions.
101  */
102 vm_paddr_t phys_avail[128];
103 vm_paddr_t phys_avail_tmp[128];
104 static struct ofw_mem_region mra[128];
105 static struct ofw_map translations[128];
106 static int translations_size;
107
108
109 struct ofw_mem_region sparc64_memreg[128];
110 int sparc64_nmemreg;
111
112 extern vm_paddr_t mmu_fault_status_area;
113
114 /*
115  * First and last available kernel virtual addresses.
116  */
117 vm_offset_t virtual_avail;
118 vm_offset_t virtual_end;
119 vm_offset_t kernel_vm_end;
120 vm_offset_t vm_max_kernel_address;
121
122 #ifndef PMAP_SHPGPERPROC
123 #define PMAP_SHPGPERPROC 200
124 #endif
125 /*
126  * Data for the pv entry allocation mechanism
127  */
128 static uma_zone_t pvzone;
129 static struct vm_object pvzone_obj;
130 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
131 int pmap_debug = 0;
132 static int pmap_debug_range = 1;
133 static int use_256M_pages = 1;
134
135 static struct mtx pmap_ctx_lock;
136 static uint16_t ctx_stack[PMAP_CONTEXT_MAX];
137 static int ctx_stack_top; 
138
139 static int permanent_mappings = 0;
140 static uint64_t nucleus_memory;
141 static uint64_t nucleus_mappings[4];
142 /*
143  * Kernel pmap.
144  */
145 struct pmap kernel_pmap_store;
146
147 hv_tsb_info_t kernel_td[MAX_TSB_INFO];
148
149 /*
150  * This should be determined at boot time: with tiny TLBs it does not
151  * make sense to try to selectively invalidate more than this many
152  * entries.
153  */
154 #define MAX_INVALIDATES   32
155 #define MAX_TSB_CLEARS   128
156
157 /*
158  * Allocate physical memory for use in pmap_bootstrap.
159  */
160 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
161
162 /*
163  * If a user pmap is processed with pmap_remove and the
164  * resident count drops to 0, there are no more pages to remove, so we
165  * need not continue.
166  */
167 #define PMAP_REMOVE_DONE(pm) \
168         ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
169
170 /*
171  * Kernel MMU interface
172  */
173 #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) 
174
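/*
 * Debug printf helpers: KDPRINTF prints only when pmap_debug is set, while
 * DPRINTF additionally checks that the current user pmap (context != 0) is
 * active on this CPU, panicking if it is not, before printing.
 */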
175 #ifdef PMAP_DEBUG
176 #define KDPRINTF if (pmap_debug) printf
177 #define DPRINTF \
178         if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \
179         panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n",  \
180               PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \
181 if (pmap_debug) printf
182
183
184 #else
185 #define DPRINTF(...)
186 #define KDPRINTF(...)
187 #endif
188
189
190 static void free_pv_entry(pv_entry_t pv);
191 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
192
193 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
194 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
195 static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va);
196 static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot);
197 static void pmap_tsb_reset(pmap_t pmap);
198 static void pmap_tsb_resize(pmap_t pmap);
199 static void pmap_tte_hash_resize(pmap_t pmap);
200
201 void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap);
202
203 struct tsb_resize_info {
204         uint64_t tri_tsbscratch;
205         uint64_t tri_tsb_ra;
206 };
207
208 /*
209  * Quick sort callbacks for comparing memory regions and OFW mappings.
210  */
211 static int mr_cmp(const void *a, const void *b);
212 static int om_cmp(const void *a, const void *b);
213 static int
214 mr_cmp(const void *a, const void *b)
215 {
216         const struct ofw_mem_region *mra;
217         const struct ofw_mem_region *mrb;
218
219         mra = a;
220         mrb = b;
221         if (mra->mr_start < mrb->mr_start)
222                 return (-1);
223         else if (mra->mr_start > mrb->mr_start)
224                 return (1);
225         else
226                 return (0);
227 }
228 static int
229 om_cmp(const void *a, const void *b)
230 {
231         const struct ofw_map *oma;
232         const struct ofw_map *omb;
233
234         oma = a;
235         omb = b;
236         if (oma->om_start < omb->om_start)
237                 return (-1);
238         else if (oma->om_start > omb->om_start)
239                 return (1);
240         else
241                 return (0);
242 }
243
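/*
 * MMU context numbers are recycled through a small stack protected by
 * pmap_ctx_lock: free_context() pushes a released context number and
 * get_context() pops a free one.
 */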
244 static __inline void
245 free_context(uint16_t ctx)
246 {
247         mtx_lock_spin(&pmap_ctx_lock);
248         ctx_stack[ctx_stack_top++] = ctx;
249         mtx_unlock_spin(&pmap_ctx_lock);
250
251         KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX, 
252                 ("context stack overrun - system error"));
253 }
254
255 static __inline uint16_t
256 get_context(void)
257 {
258         uint16_t ctx;
259
260         mtx_lock_spin(&pmap_ctx_lock);
261         ctx = ctx_stack[--ctx_stack_top];
262         mtx_unlock_spin(&pmap_ctx_lock);
263
264         KASSERT(ctx_stack_top > 0,
265                 ("context stack underrun - need to implement context stealing"));
266
267         return ctx;
268 }
269
270 static __inline void
271 free_pv_entry(pv_entry_t pv)
272 {
273         pv_entry_count--;
274         uma_zfree(pvzone, pv);
275 }
276
277 /*
278  * Get a new pv_entry, reclaiming existing mappings from inactive (and,
279  * if necessary, active) pages when the zone allocation fails.
280  */
281 static pv_entry_t
282 get_pv_entry(pmap_t locked_pmap)
283 {
284         static const struct timeval printinterval = { 60, 0 };
285         static struct timeval lastprint;
286         struct vpgqueues *vpq;
287         uint64_t tte_data;
288         pmap_t pmap;
289         pv_entry_t allocated_pv, next_pv, pv;
290         vm_offset_t va;
291         vm_page_t m;
292
293         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
294         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
295         allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
296         if (allocated_pv != NULL) {
297                 pv_entry_count++;
298                 if (pv_entry_count > pv_entry_high_water)
299                         pagedaemon_wakeup();
300                 else
301                         return (allocated_pv);
302         }
303
304         /*
305          * Reclaim pv entries: At first, destroy mappings to inactive
306          * pages.  After that, if a pv entry is still needed, destroy
307          * mappings to active pages.
308          */
309         if (ratecheck(&lastprint, &printinterval))
310                 printf("Approaching the limit on PV entries, "
311                     "increase the vm.pmap.shpgperproc tunable.\n");
312
313         vpq = &vm_page_queues[PQ_INACTIVE];
314 retry:
315         TAILQ_FOREACH(m, &vpq->pl, pageq) {
316                 if (m->hold_count || m->busy)
317                         continue;
318                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
319                         va = pv->pv_va;
320                         pmap = pv->pv_pmap;
321                         /* Avoid deadlock and lock recursion. */
322                         if (pmap > locked_pmap)
323                                 PMAP_LOCK(pmap);
324                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
325                                 continue;
326                         pmap->pm_stats.resident_count--;
327
328                         tte_data = tte_hash_delete(pmap->pm_hash, va);
329
330                         KASSERT((tte_data & VTD_WIRED) == 0,
331                             ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data));
332                         if (tte_data & VTD_REF)
333                                 vm_page_flag_set(m, PG_REFERENCED);
334                         if (tte_data & VTD_W) {
335                                 KASSERT((tte_data & VTD_SW_W),
336                                 ("get_pv_entry: modified page not writable: va: %lx, tte: %lx",
337                                     va, tte_data));
338                                 vm_page_dirty(m);
339                         }
340
341                         pmap_invalidate_page(pmap, va, TRUE);
342                         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
343                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
344                         if (TAILQ_EMPTY(&m->md.pv_list))
345                                 vm_page_flag_clear(m, PG_WRITEABLE);
346                         m->md.pv_list_count--;
347
348                         if (pmap != locked_pmap)
349                                 PMAP_UNLOCK(pmap);
350                         if (allocated_pv == NULL)
351                                 allocated_pv = pv;
352                         else
353                                 free_pv_entry(pv);
354                 }
355         }
356         if (allocated_pv == NULL) {
357                 if (vpq == &vm_page_queues[PQ_INACTIVE]) {
358                         vpq = &vm_page_queues[PQ_ACTIVE];
359                         goto retry;
360                 }
361                 panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
362         }
363         return (allocated_pv);
364 }
365
366 /*
367  * Allocate physical memory directly from the phys_avail map.
368  * Can only be called from pmap_bootstrap before avail start and end are
369  * calculated.
370  */
371 static vm_paddr_t
372 pmap_bootstrap_alloc(vm_size_t size)
373 {
374         vm_paddr_t pa;
375         int i;
376
377         size = round_page(size);
378
379         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
380                 if (phys_avail[i + 1] - phys_avail[i] < size)
381                         continue;
382                 pa = phys_avail[i];
383                 phys_avail[i] += size;
384                 pmap_scrub_pages(pa, size);
385                 return (pa);
386         }
387         panic("pmap_bootstrap_alloc");
388 }
389
390 /*
391  * Activate a user pmap.  The pmap must be activated before its address space
392  * can be accessed in any way.
393  */
394 void
395 pmap_activate(struct thread *td)
396 {
397         pmap_t pmap, oldpmap;
398         int err;
399         
400         critical_enter();
401         pmap = vmspace_pmap(td->td_proc->p_vmspace);
402         oldpmap = PCPU_GET(curpmap);
403 #if defined(SMP)
404         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
405         atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask));
406         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
407 #else
408         oldpmap->pm_active &= ~1;
409         pmap->pm_active |= 1;
410         pmap->pm_tlbactive |= 1;
411 #endif
412
413         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
414         pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
415         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
416
417         PCPU_SET(curpmap, pmap);
418         if (pmap->pm_context != 0)
419                 if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK)
420                         panic("failed to set TSB 0x%lx - context == %ld\n", 
421                               pmap->pm_tsb_ra, pmap->pm_context);
422         stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context);
423         membar(Sync);
424         critical_exit();
425 }
426
427 void
428 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
429 {
430 }
431
432 /*
433  *      Increase the starting virtual address of the given mapping if a
434  *      different alignment might result in more superpage mappings.
435  */
436 void
437 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
438     vm_offset_t *addr, vm_size_t size)
439 {
440 }
441
442 /*
443  * Bootstrap the system enough to run with virtual memory.
444  */
445 void
446 pmap_bootstrap(vm_offset_t ekva)
447 {
448         struct pmap *pm;
449         vm_offset_t off, va;
450         vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start;
451         vm_size_t physsz, virtsz, kernel_hash_shift;
452         ihandle_t pmem, vmem;
453         int i, j, k, sz;
454         uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable;
455         vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds;
456         
457
458         if ((vmem = OF_finddevice("/virtual-memory")) == -1)
459                 panic("pmap_bootstrap: finddevice /virtual-memory");
460         if ((sz = OF_getproplen(vmem, "translations")) == -1)
461                 panic("pmap_bootstrap: getproplen translations");
462         if (sizeof(translations) < sz)
463                 panic("pmap_bootstrap: translations too small");
464         bzero(translations, sz);
465         if (OF_getprop(vmem, "translations", translations, sz) == -1)
466                 panic("pmap_bootstrap: getprop /virtual-memory/translations");
467         sz /= sizeof(*translations);
468         translations_size = sz;
469         nucleus_memory_start = 0;
470         CTR0(KTR_PMAP, "pmap_bootstrap: translations");
471         qsort(translations, sz, sizeof (*translations), om_cmp);
472
473         for (i = 0; i < sz; i++) {
474                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
475                         translations[i].om_size, translations[i].om_start, 
476                         translations[i].om_tte);
477                 if ((translations[i].om_start >= KERNBASE) && 
478                     (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) {
479                         for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) {
480                                 KDPRINTF("mapping permanent translation\n");
481                                 pa = TTE_GET_PA(translations[i].om_tte) + j;
482                                 va = translations[i].om_start + j;
483                                 error = hv_mmu_map_perm_addr(va, KCONTEXT, 
484                                                              pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB);
485                                 if (error != H_EOK)
486                                         panic("map_perm_addr returned error=%ld", error);
487                                 
488                                 if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start))
489                                         nucleus_memory_start = pa;
490                                 printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa);
491                                 nucleus_mappings[permanent_mappings++] = pa;
492                                 nucleus_memory += PAGE_SIZE_4M;
493 #ifdef SMP
494                                 mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M);
495 #endif
496                         }
497                 }  
498         }
499
500         /*
501          * Find out what physical memory is available from the prom and
502          * initialize the phys_avail array.  This must be done before
503          * pmap_bootstrap_alloc is called.
504          */
505         if ((pmem = OF_finddevice("/memory")) == -1)
506                 panic("pmap_bootstrap: finddevice /memory");
507         if ((sz = OF_getproplen(pmem, "available")) == -1)
508                 panic("pmap_bootstrap: getproplen /memory/available");
509         if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */
510                 panic("pmap_bootstrap: phys_avail too small");
511         if (sizeof(mra) < sz)
512                 panic("pmap_bootstrap: mra too small");
513         bzero(mra, sz);
514         if (OF_getprop(pmem, "available", mra, sz) == -1)
515                 panic("pmap_bootstrap: getprop /memory/available");
516
517         sz /= sizeof(*mra);
518         CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
519
520         qsort(mra, sz, sizeof (*mra), mr_cmp);
521         physmemstart_tunable = physmem_tunable = physmem = physsz = 0;
522         
523         if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) {
524                 KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable);
525         }
526         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) {
527                 physmem = atop(physmem_tunable);
528                 KDPRINTF("desired physmem=0x%lx\n", physmem_tunable);
529         }
530         if ((physmem_tunable != 0) && (physmemstart_tunable != 0))
531                 physmem_tunable += physmemstart_tunable;
532         
533         bzero(real_phys_avail, sizeof(real_phys_avail));
534         bzero(tmp_phys_avail, sizeof(tmp_phys_avail));
535
536         for (i = 0, j = 0; i < sz; i++) {
537                 uint64_t size;
538                 KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size);
539                 if (mra[i].mr_size < PAGE_SIZE_4M)
540                         continue;
541
542                 if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) {
543                         uint64_t newstart, roundup;
544                         newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M);
545                         roundup = newstart - mra[i].mr_start;
546                         size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M;
547                         mra[i].mr_start = newstart;
548                         if (size < PAGE_SIZE_4M)
549                                 continue;
550                         mra[i].mr_size = size;
551                 }
552                 real_phys_avail[j] = mra[i].mr_start;
553                 if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) {
554                         mra[i].mr_size = physmem_tunable - physsz;
555                         physsz = physmem_tunable;
556                         real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
557                         break;
558                 }
559                 physsz += mra[i].mr_size;
560                 real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
561                 j += 2;
562         }
563         physmem = btoc(physsz - physmemstart_tunable);
564
565         /*
566          * This is needed for versions of OFW that would allocate us memory
567  * and then forget to remove it from the available ranges, as well as
568  * to compensate for the nucleus memory claimed above.
569          */
570         for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) {
571                 vm_paddr_t start = real_phys_avail[i];
572                 uint64_t end = real_phys_avail[i + 1];
573                 CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end);
574                 KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end);
575                 /* 
576                  * Is kernel memory at the beginning of range?
577                  */
578                 if (nucleus_memory_start == start) {
579                         start += nucleus_memory;
580                 }
581                 /* 
582                  * Is kernel memory at the end of range?
583                  */
584                 if (nucleus_memory_start == (end - nucleus_memory)) 
585                         end -= nucleus_memory;
586
587                 if (physmemstart_tunable != 0 && 
588                     (end < physmemstart_tunable))
589                         continue;
590
591                 if (physmemstart_tunable != 0 && 
592                     ((start < physmemstart_tunable))) {
593                         start = physmemstart_tunable;
594                 }
595
596                 /* 
597                  * Is kernel memory in the middle somewhere?             
598                  */
599                 if ((nucleus_memory_start > start) && 
600                     (nucleus_memory_start < end)) {
601                         phys_avail[j] = start;
602                         phys_avail[j+1] = nucleus_memory_start;
603                         start =  nucleus_memory_start + nucleus_memory;
604                         j += 2;
605                 }
606                 /*
607                  * Break phys_avail up on 4GB boundaries to try to work
608                  * around a PCI-e allocation bug.  We rely on the fact
609                  * that kernel memory is allocated from the first 4GB of
610                  * physical memory.
611                  */ 
612                 while (bounds < start)
613                         bounds += (1UL<<32);
614
615                 while (bounds < end) {
616                         phys_avail[j] = start;
617                         phys_avail[j + 1] = bounds;
618                         start = bounds;
619                         bounds += (1UL<<32);
620                         j += 2;
621                 }
622                 phys_avail[j] = start; 
623                 phys_avail[j + 1] = end;
624                 j += 2;
625         }
626
627         /*
628          * Merge nucleus memory into real_phys_avail.
629          *
630          */
631         for (i = 0; real_phys_avail[i] != 0; i += 2) {
632                 if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory)
633                         real_phys_avail[i] -= nucleus_memory;
634                 
635                 if (real_phys_avail[i + 1] == nucleus_memory_start)
636                         real_phys_avail[i + 1] += nucleus_memory;
637                 
638                 if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) {
639                         real_phys_avail[i + 1] = real_phys_avail[i + 3];
640                         for (k = i + 2; real_phys_avail[k] != 0; k += 2) {
641                                 real_phys_avail[k] = real_phys_avail[k + 2];
642                                 real_phys_avail[k + 1] = real_phys_avail[k + 3];
643                         }
644                 }
645         }
646         for (i = 0; phys_avail[i] != 0; i += 2)
647                 if (pmap_debug_range || pmap_debug)
648                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
649                         i, phys_avail[i], i+1, phys_avail[i+1]);
650
651         /*
652          * Shuffle the memory range containing the 256MB page with 
653          * nucleus_memory to the beginning of the phys_avail array
654          * so that physical memory from that page is preferentially
655          * allocated.
656          */
657         for (j = 0; phys_avail[j] != 0; j += 2) 
658                 if (nucleus_memory_start < phys_avail[j])
659                         break;
660         /*
661          * Don't shuffle unless we have a full 256M page in the range;
662          * our kernel malloc appears to be horribly brittle.
663          */
664         if ((phys_avail[j + 1] - phys_avail[j]) < 
665             (PAGE_SIZE_256M - nucleus_memory))
666                 goto skipshuffle;
667
668         for (i = j, k = 0; phys_avail[i] != 0; k++, i++)
669                 tmp_phys_avail[k] = phys_avail[i];
670         for (i = 0; i < j; i++)
671                 tmp_phys_avail[k + i] = phys_avail[i];
672         for (i = 0; i < 128; i++)
673                 phys_avail[i] = tmp_phys_avail[i];
674
675 skipshuffle:
676         for (i = 0; real_phys_avail[i] != 0; i += 2)
677                 if (pmap_debug_range || pmap_debug)
678                         printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n",
679                         i, real_phys_avail[i], i+1, real_phys_avail[i+1]);
680
681         for (i = 0; phys_avail[i] != 0; i += 2)
682                 if (pmap_debug_range || pmap_debug)
683                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
684                         i, phys_avail[i], i+1, phys_avail[i+1]);
685         /*
686          * Calculate the size of kernel virtual memory, and the size and mask
687          * for the kernel tsb.
688          */
689         virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
690         vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
691
692         /*
693          * Set the start and end of kva.  The kernel is loaded at the first
694          * available 4 meg super page, so round up to the end of the page.
695          */
696         virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
697         virtual_end = vm_max_kernel_address;
698         kernel_vm_end = vm_max_kernel_address;
699
700         /*
701          * Allocate and map memory for the kernel TTE hash table.
702          *
703          */
704 #ifndef SIMULATOR
705         kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */
706 #else
707         kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */
708 #endif
709
710         kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT)));
711         if (kernel_hash_pa & PAGE_MASK_4M)
712                 panic("pmap_bootstrap: hashtable pa unaligned\n");
713         /*
714          * Set up TSB descriptors for the hypervisor
715          *
716          */
717 #ifdef notyet
718         tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
719 #else
720         /* avoid alignment complaints from the hypervisor */
721         tsb_8k_size = PAGE_SIZE_4M;
722 #endif
723
724         tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size);
725         if (tsb_8k_pa & PAGE_MASK_4M)
726                 panic("pmap_bootstrap: tsb unaligned\n");
727         KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa);
728
729         tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3;
730         tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size);
731
732         kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K;
733         kernel_td[TSB8K_INDEX].hti_assoc = 1;
734         kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT);
735         kernel_td[TSB8K_INDEX].hti_ctx_index = 0;
736         kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K;
737         kernel_td[TSB8K_INDEX].hti_rsvd = 0;
738         kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa;
739
740         /*
741          * Initialize kernel's private TSB from 8K page TSB
742          *
743          */
744         kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K;
745         kernel_pmap->pm_tsb.hti_assoc = 1;
746         kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT);
747         kernel_pmap->pm_tsb.hti_ctx_index = 0;
748         kernel_pmap->pm_tsb.hti_pgszs = TSB8K;
749         kernel_pmap->pm_tsb.hti_rsvd = 0;
750         kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa;
751         
752         kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb);
753         tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb);
754         
755         /*
756          * Initialize the kernel TSB for 4M pages; it is currently (not
757          * by design) also used for permanent mappings.
758          */
759         
760
761         KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size);
762         kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M;
763         kernel_td[TSB4M_INDEX].hti_assoc = 1;
764         kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT);
765         kernel_td[TSB4M_INDEX].hti_ctx_index = 0;
766         kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M;
767         kernel_td[TSB4M_INDEX].hti_rsvd = 0;
768         kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa;
769         /*
770          * Allocate MMU fault status areas for all CPUs.
771          */
772         mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
773
774         /*
775          * Allocate and map the dynamic per-CPU area for the BSP.
776          */
777         dpcpu0 = (void *)TLB_PHYS_TO_DIRECT(pmap_bootstrap_alloc(DPCPU_SIZE));
778
779         /*
780          * Allocate and map the message buffer.
781          */
782         msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
783         msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
784
785         /*
786          * Allocate a kernel stack with guard page for thread0 and map it into
787          * the kernel tsb.  
788          */
789         pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE);
790         kstack0_phys = pa;
791         virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
792         kstack0 = virtual_avail;
793         virtual_avail += KSTACK_PAGES * PAGE_SIZE;
794         for (i = 0; i < KSTACK_PAGES; i++) {
795                 pa = kstack0_phys + i * PAGE_SIZE;
796                 va = kstack0 + i * PAGE_SIZE;
797                 tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va,
798                             pa | TTE_KERNEL | VTD_8K, 0);
799         }
800         /*
801          * Calculate the last available physical address.
802          */
803         for (i = 0; phys_avail[i + 2] != 0; i += 2)
804                 KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
805                         i, phys_avail[i], i+1, phys_avail[i+1]);
806         KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
807                         i, phys_avail[i], i+1, phys_avail[i+1]);
808
809         Maxmem = sparc64_btop(phys_avail[i + 1]);
810         
811         /*
812          * Add the prom mappings to the kernel tsb.
813          */
814         for (i = 0; i < sz; i++) {
815                 CTR3(KTR_PMAP,
816                     "translation: start=%#lx size=%#lx tte=%#lx",
817                     translations[i].om_start, translations[i].om_size,
818                     translations[i].om_tte);
819                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
820                        translations[i].om_size, translations[i].om_start, 
821                        translations[i].om_tte);
822
823                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
824                     translations[i].om_start > VM_MAX_PROM_ADDRESS) 
825                         continue;
826
827                 for (off = 0; off < translations[i].om_size;
828                      off += PAGE_SIZE) {
829                         va = translations[i].om_start + off;
830                         pa = TTE_GET_PA(translations[i].om_tte) + off;
831                         tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
832                         tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | 
833                                     TTE_KERNEL | VTD_8K, 0);
834                 }
835         }
836
837         if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, 
838                                      vtophys((vm_offset_t)kernel_td))) != H_EOK)
839                 panic("failed to set ctx0 TSBs error: %ld", error);
840
841 #ifdef SMP
842         mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td));
843 #endif
844         /*
845          * Set up direct mappings of physical memory, using 256M pages
846          * where possible and 4M pages otherwise.
847          */
848         for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) {
849                 vm_paddr_t tag_pa = 0, next_pa = 0;
850                 uint64_t size_bits = VTD_4M;
851                 while (pa < real_phys_avail[i + 1]) {
852                         if (use_256M_pages &&
853                             (pa & PAGE_MASK_256M) == 0 && 
854                             ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) {
855                                 tag_pa = pa;
856                                 size_bits = VTD_256M;
857                                 next_pa = pa + PAGE_SIZE_256M;
858                         } else if (next_pa <= pa) {
859                                 tag_pa = pa;
860                                 size_bits = VTD_4M;
861                         }
862                         tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa));
863                         tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), 
864                                          TLB_PHYS_TO_DIRECT(pa), 
865                                          tag_pa | TTE_KERNEL | size_bits, 0);
866                         pa += PAGE_SIZE_4M;
867                 }
868         }
869
870         /*
871          * Get the available physical memory ranges from /memory/reg. These
872          * are only used for kernel dumps, but it may not be wise to do prom
873          * calls in that situation.
874          */
875         if ((sz = OF_getproplen(pmem, "reg")) == -1)
876                 panic("pmap_bootstrap: getproplen /memory/reg");
877         if (sizeof(sparc64_memreg) < sz)
878                 panic("pmap_bootstrap: sparc64_memreg too small");
879         if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
880                 panic("pmap_bootstrap: getprop /memory/reg");
881         sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
882
883         pm = kernel_pmap;
884         pm->pm_active = ~0;
885         pm->pm_tlbactive = ~0;
886
887         PMAP_LOCK_INIT(kernel_pmap);
888
889         TAILQ_INIT(&kernel_pmap->pm_pvlist);
890
891         /* 
892          * This could happen earlier, but it is done here to avoid
893          * attempts to update the hash before doing so is legal.
894          */
895         pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, 
896                                              pmap_bootstrap_alloc(PAGE_SIZE));
897         pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash);
898
899         for (i = 0; i < translations_size; i++) {
900                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
901                        translations[i].om_size, translations[i].om_start, 
902                        translations[i].om_tte);
903
904                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
905                     translations[i].om_start > VM_MAX_PROM_ADDRESS) {
906                         KDPRINTF("skipping\n");
907                         continue;
908                 }
909                 for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) {
910                         va = translations[i].om_start + off;
911                         pa = TTE_GET_PA(translations[i].om_tte) + off;
912                         tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
913                 }
914                 KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", 
915                        translations[i].om_size, translations[i].om_start, 
916                        translations[i].om_tte);
917         }
918         for (i = 0; i < KSTACK_PAGES; i++) {
919                 pa = kstack0_phys + i * PAGE_SIZE;
920                 va = kstack0 + i * PAGE_SIZE;
921                 tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
922         }
923         /*
924          * Add direct mappings to hash
925          *
926          */
927 #ifdef notyet
928         /* hash only supports 8k pages */
929         for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M)
930                 tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), 
931                                 pa | TTE_KERNEL | VTD_4M);
932 #endif
933
934
935         if (bootverbose)
936                 printf("pmap_bootstrap done\n");
937 }
938
939
940
941 /*
942  *      Routine:        pmap_change_wiring
943  *      Function:       Change the wiring attribute for a map/virtual-address
944  *                      pair.
945  *      In/out conditions:
946  *                      The mapping must already exist in the pmap.
947  */
948 void
949 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
950 {
951         boolean_t iswired;
952         PMAP_LOCK(pmap);
953         iswired = tte_get_virt_bit(pmap, va, VTD_WIRED);
954
955         if (wired && !iswired) {
956                 pmap->pm_stats.wired_count++;
957                 tte_set_virt_bit(pmap, va, VTD_WIRED);
958         } else if (!wired && iswired) {
959                 pmap->pm_stats.wired_count--;
960                 tte_clear_virt_bit(pmap, va, VTD_WIRED);
961         }
962         PMAP_UNLOCK(pmap);
963 }
964
965 void
966 pmap_clear_modify(vm_page_t m)
967 {
968         KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m));
969         tte_clear_phys_bit(m, VTD_W);
970 }
971
972 void
973 pmap_clear_reference(vm_page_t m)
974 {
975         KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m));
976         tte_clear_phys_bit(m, VTD_REF);
977 }
978
979 void
980 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
981           vm_size_t len, vm_offset_t src_addr)
982 {
983         vm_offset_t addr, end_addr;
984
985         end_addr = src_addr + len;
986         /*
987          * Don't let optional prefaulting of pages make us go
988          * way below the low water mark of free pages or way
989          * above the high water mark of used pv entries.
990          */
991         if (cnt.v_free_count < cnt.v_free_reserved ||
992             pv_entry_count > pv_entry_high_water)
993                 return;
994         
995
996         vm_page_lock_queues();
997         if (dst_pmap < src_pmap) {
998                 PMAP_LOCK(dst_pmap);
999                 PMAP_LOCK(src_pmap);
1000         } else {
1001                 PMAP_LOCK(src_pmap);
1002                 PMAP_LOCK(dst_pmap);
1003         }
1004         for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) {
1005                 tte_t tte_data;
1006                 vm_page_t m;
1007
1008                 tte_data = tte_hash_lookup(src_pmap->pm_hash, addr);
1009
1010                 if ((tte_data & VTD_MANAGED) != 0) {
1011                         if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) {
1012                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1013
1014                                 tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED));
1015                                 dst_pmap->pm_stats.resident_count++;
1016                                 pmap_insert_entry(dst_pmap, addr, m);
1017                         } 
1018                 }               
1019         }
1020         vm_page_unlock_queues();
1021         PMAP_UNLOCK(src_pmap);
1022         PMAP_UNLOCK(dst_pmap);
1023 }
1024
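/*
 * Copy the contents of one physical page to another through the direct map.
 */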
1025 void
1026 pmap_copy_page(vm_page_t src, vm_page_t dst)
1027 {
1028         vm_paddr_t srcpa, dstpa;
1029         srcpa = VM_PAGE_TO_PHYS(src);
1030         dstpa = VM_PAGE_TO_PHYS(dst);
1031
1032         novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE);
1033
1034
1035 }
1036
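/*
 * Helper for pmap_enter(): account for wiring and, for managed pages,
 * create a pv entry and mark the new TTE as managed.
 */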
1037 static __inline void
1038 pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired)
1039 {
1040
1041         if (wired)
1042                 pmap->pm_stats.wired_count++;
1043         
1044         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1045                 pmap_insert_entry(pmap, va, m);
1046                 *tte_data |= VTD_MANAGED;
1047         }
1048 }
1049
1050 /*
1051  * Map the given physical page at the specified virtual address in the
1052  * target pmap with the protection requested.  If specified the page
1053  * will be wired down.
1054  */
1055 void
1056 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
1057     vm_prot_t prot, boolean_t wired)
1058 {
1059         vm_paddr_t pa, opa;
1060         uint64_t tte_data, otte_data;
1061         vm_page_t om;
1062         int invlva;
1063
1064         if (pmap->pm_context)
1065                 DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, 
1066                         VM_PAGE_TO_PHYS(m), prot);
1067
1068         om = NULL;
1069         
1070         vm_page_lock_queues();
1071         PMAP_LOCK(pmap);
1072
1073         tte_data = pa = VM_PAGE_TO_PHYS(m);
1074         otte_data = tte_hash_delete(pmap->pm_hash, va);
1075         opa = TTE_GET_PA(otte_data);
1076
1077         if (opa == 0) {
1078                 /*
1079                  * This is a new mapping
1080                  */
1081                 pmap->pm_stats.resident_count++;
1082                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1083
1084         } else if (pa != opa) {
1085                 /*
1086                  * Mapping has changed, handle validating new mapping.
1087                  * 
1088                  */
1089                 if (otte_data & VTD_WIRED)
1090                         pmap->pm_stats.wired_count--;
1091
1092                 if (otte_data & VTD_MANAGED) {
1093                         om = PHYS_TO_VM_PAGE(opa);
1094                         pmap_remove_entry(pmap, om, va);
1095                 }
1096
1097                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1098
1099         } else /* (pa == opa) */ {
1100                 /*
1101                  * Mapping has not changed, must be protection or wiring change.
1102                  */
1103
1104                 /*
1105                  * Wiring change, just update stats.  There are no page-
1106                  * table pages to keep resident here; the mapping itself
1107                  * lives in the TTE hash and the TSB, so only the wired
1108                  * count needs to change.
1109                  */
1110                 if (wired && ((otte_data & VTD_WIRED) == 0))
1111                         pmap->pm_stats.wired_count++;
1112                 else if (!wired && (otte_data & VTD_WIRED))
1113                         pmap->pm_stats.wired_count--;
1114
1115                 /*
1116                  * We might be turning off write access to the page,
1117                  * so we go ahead and sense modify status.
1118                  */
1119                 if (otte_data & VTD_MANAGED) {
1120                         om = m;
1121                         tte_data |= VTD_MANAGED;
1122                 }
1123         } 
1124
1125         /*
1126          * Now validate mapping with desired protection/wiring.
1127          */
1128         if ((prot & VM_PROT_WRITE) != 0) {
1129                 tte_data |= VTD_SW_W; 
1130                 vm_page_flag_set(m, PG_WRITEABLE);
1131         }
1132         if ((prot & VM_PROT_EXECUTE) != 0)
1133                 tte_data |= VTD_X;
1134         if (wired)
1135                 tte_data |= VTD_WIRED;
1136         if (pmap == kernel_pmap)
1137                 tte_data |= VTD_P;
1138         
1139         invlva = FALSE;
1140         if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
1141                 if (otte_data & VTD_V) {
1142                         if (otte_data & VTD_REF) {
1143                                 if (otte_data & VTD_MANAGED) 
1144                                         vm_page_flag_set(om, PG_REFERENCED);
1145                                 if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X)))
1146                                         invlva = TRUE;
1147                         }
1148                         if (otte_data & VTD_W) {
1149                                 if (otte_data & VTD_MANAGED) 
1150                                         vm_page_dirty(om);
1151                                 if ((pa & VTD_SW_W) != 0) 
1152                                         invlva = TRUE;
1153                         }
1154                         if (invlva)
1155                                 pmap_invalidate_page(pmap, va, TRUE);
1156                 }
1157         } 
1158
1159
1160         tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF);
1161         /*
1162          * XXX this needs to be locked for the threaded / kernel case 
1163          */
1164         tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, 
1165                     pmap->pm_context);
1166
1167         if (tte_hash_needs_resize(pmap->pm_hash))
1168                 pmap_tte_hash_resize(pmap);
1169
1170         /*
1171          * 512 is an arbitrary number of tsb misses
1172          */
1173         if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512)
1174                 pmap_tsb_resize(pmap);
1175
1176         vm_page_unlock_queues();
1177
1178         PMAP_UNLOCK(pmap);
1179 }
1180
1181 /*
1182  * Maps a sequence of resident pages belonging to the same object.
1183  * The sequence begins with the given page m_start.  This page is
1184  * mapped at the given virtual address start.  Each subsequent page is
1185  * mapped at a virtual address that is offset from start by the same
1186  * amount as the page is offset from m_start within the object.  The
1187  * last page in the sequence is the page with the largest offset from
1188  * m_start that can be mapped at a virtual address less than the given
1189  * virtual address end.  Not every virtual page between start and end
1190  * is mapped; only those for which a resident page exists with the
1191  * corresponding offset from m_start are mapped.
1192  */
1193 void
1194 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 
1195                   vm_page_t m_start, vm_prot_t prot)
1196 {
1197         vm_page_t m;
1198         vm_pindex_t diff, psize;
1199
1200         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
1201         psize = atop(end - start);
1202         m = m_start;
1203         PMAP_LOCK(pmap);
1204         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1205                 pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot);
1206                 m = TAILQ_NEXT(m, listq);
1207         }
1208         PMAP_UNLOCK(pmap);
1209 }
1210
1211 void
1212 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1213 {
1214         PMAP_LOCK(pmap);
1215         pmap_enter_quick_locked(pmap, va, m, prot);
1216         PMAP_UNLOCK(pmap);
1217 }
1218
1219 static void
1220 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1221 {
1222
1223         tte_t tte_data;
1224
1225         if (pmap->pm_context)
1226                 KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", 
1227                         pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot);
1228
1229         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1230         if (tte_hash_lookup(pmap->pm_hash, va))
1231                 return;
1232                 
1233         tte_data = VM_PAGE_TO_PHYS(m);
1234         /*
1235          * Enter on the PV list if part of our managed memory. Note that we
1236          * raise IPL while manipulating pv_table since pmap_enter can be
1237          * called at interrupt time.
1238          */
1239         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1240                 pmap_insert_entry(pmap, va, m);
1241                 tte_data |= VTD_MANAGED;
1242         }
1243
1244         pmap->pm_stats.resident_count++;
1245
1246         if ((prot & VM_PROT_EXECUTE) != 0)
1247                 tte_data |= VTD_X;
1248
1249         tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS);
1250 }
1251
1252 /*
1253  * Extract the physical page address associated with the given
1254  * map/virtual_address pair.
1255  */
1256 vm_paddr_t
1257 pmap_extract(pmap_t pmap, vm_offset_t va)
1258 {
1259         vm_paddr_t pa;
1260         tte_t tte_data;
1261
1262         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1263         pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1264
1265         return (pa);
1266 }
1267
1268 /*
1269  * Atomically extract and hold the physical page with the given
1270  * pmap and virtual address pair if that mapping permits the given
1271  * protection.
1272  */
1273 vm_page_t
1274 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1275 {
1276         tte_t tte_data;
1277         vm_page_t m;
1278
1279         m = NULL;
1280         vm_page_lock_queues();
1281         PMAP_LOCK(pmap);
1282         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1283         if (tte_data != 0 && 
1284             ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) {
1285                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1286                 vm_page_hold(m);
1287         }
1288         vm_page_unlock_queues();
1289         PMAP_UNLOCK(pmap);
1290
1291         return (m);
1292 }
1293
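/*
 * Allocate npages of physically contiguous, zeroed, wired memory with the
 * requested alignment and return its address in the direct map, sleeping
 * and retrying if vm_phys_alloc_contig() cannot currently satisfy the
 * request.
 */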
1294 void *
1295 pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment)
1296 {
1297         vm_page_t m, tm;
1298         int i;
1299         void *ptr;
1300         
1301         m = NULL;
1302         while (m == NULL) {     
1303                 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1304                         m = vm_phys_alloc_contig(npages, phys_avail[i], 
1305                             phys_avail[i + 1], alignment, (1UL<<34));
1306                         if (m)
1307                                 goto found;
1308                 }
1309                 if (m == NULL) {
1310                         printf("vm_phys_alloc_contig failed - waiting to retry\n");
1311                         VM_WAIT;
1312                 }
1313         }
1314 found:
1315         for (i = 0, tm = m; i < npages; i++, tm++) {
1316                 tm->wire_count++;
1317                 if ((tm->flags & PG_ZERO) == 0)
1318                         pmap_zero_page(tm);
1319         }
1320         ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
1321         
1322         return (ptr);
1323 }
1324
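/*
 * Unwire and free pages previously obtained from
 * pmap_alloc_zeroed_contig_pages().
 */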
1325 void
1326 pmap_free_contig_pages(void *ptr, int npages)
1327 {
1328         int i;
1329         vm_page_t m;
1330
1331         m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr));
1332         for (i = 0; i < npages; i++, m++) {
1333                 m->wire_count--;
1334                 atomic_subtract_int(&cnt.v_wire_count, 1);
1335                 vm_page_free(m);
1336         }
1337 }
1338
1339 void 
1340 pmap_growkernel(vm_offset_t addr)
1341 {
1342         return;
1343 }
1344
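/*
 * Initialize the pmap module once the VM system is up: seed the context
 * number stack, create the pv entry zone with its high water mark, and
 * initialize the TTE hash subsystem.
 */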
1345 void 
1346 pmap_init(void)
1347 {
1348
1349         /* allocate pv_entry zones */
1350         int shpgperproc = PMAP_SHPGPERPROC;
1351
1352         for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) 
1353                 ctx_stack[ctx_stack_top] = ctx_stack_top;
1354
1355         mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN);
1356
1357         /*
1358          * Initialize the address space (zone) for the pv entries.  Set a
1359          * high water mark so that the system can recover from excessive
1360          * numbers of pv entries.
1361          */
1362         pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
1363             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
1364         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1365         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
1366         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
1367         pv_entry_high_water = 9 * (pv_entry_max / 10);
1368         uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
1369
1370         tte_hash_init();
1371
1372 }
1373
1374 /*
1375  * Create a pv entry for page at pa for
1376  * (pmap, va).
1377  */
1378 static void
1379 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1380 {
1381         pv_entry_t pv;
1382
1383         KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1384         pv = get_pv_entry(pmap);
1385         pv->pv_va = va;
1386         pv->pv_pmap = pmap;
1387
1388         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1389         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1390         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1391         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1392         m->md.pv_list_count++;
1393 }
1394
1395 #ifdef TRAP_TRACING
1396 static int trap_trace_report_done;
1397 #endif
1398
1399 #ifdef SMP
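/*
 * Cross-call helper for TLB shootdowns: deliver "func" with two arguments
 * to the other CPUs and spin until every target acknowledges, dumping
 * diagnostics and retrying once if a CPU fails to respond.  Returns the
 * mask of CPUs that were signalled.
 */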
1400 static cpumask_t
1401 pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2)
1402 {
1403
1404         int i, cpu_count, retried;
1405         u_int cpus;
1406         cpumask_t cpumask, active, curactive;
1407         cpumask_t active_total, ackmask;
1408         uint16_t *cpulist;
1409
1410         retried = 0;
1411
1412         if (!smp_started)
1413                 return (0);
1414
1415         cpumask = PCPU_GET(cpumask);
1416         cpulist = PCPU_GET(cpulist);
1417         curactive = 0;
1418
1419         if (rdpr(pil) != 14)
1420                 panic("pil %ld != 14", rdpr(pil));
1421
1422 #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED
1423         /* by definition cpumask should have curcpu's bit set */
1424         if (cpumask != (1 << curcpu)) 
1425                 panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", 
1426                       cpumask, (1 << curcpu));
1427
1428 #endif
1429 #ifdef notyet
1430         if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0)
1431                 goto done;
1432
1433         if (pmap->pm_context != 0)
1434                 active_total = active = (pmap->pm_tlbactive & ~cpumask);
1435         else 
1436 #endif
1437                 active_total = active = PCPU_GET(other_cpus);
1438
1439         if (active == 0)
1440                 goto done;
1441         
1442  retry:
1443         
1444         for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) {
1445                 if ((cpus & 0x1) == 0)
1446                         continue;
1447                 
1448                 curactive |= (1 << i);
1449                 cpulist[cpu_count] = (uint16_t)i;
1450                 cpu_count++;
1451         }
1452
1453         ackmask = 0;
1454         cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, 
1455                          (uint64_t)arg2, (uint64_t *)&ackmask);
1456
1457         while (ackmask != curactive) {
1458                 membar(Sync);
1459                 i++;
1460                 if (i > 10000000) {
1461 #ifdef TRAP_TRACING
1462                         int j;
1463 #endif
1464                         uint64_t cpu_state;
1465                         printf("cpu with cpumask=0x%x appears to not be responding to ipis\n",
1466                                curactive & ~ackmask);
1467
1468 #ifdef TRAP_TRACING
1469                         if (!trap_trace_report_done) {
1470                                 trap_trace_report_done = 1;
1471                                 for (j = 0; j < MAXCPU; j++)
1472                                         if (((1 << j) & curactive & ~ackmask) != 0) {
1473                                                 struct pcpu *pc = pcpu_find(j);
1474                                                 printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n",
1475                                                     pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3],
1476                                                     pc->pad[4], pc->pad[5], pc->pad[6]);
1477                                                 trap_trace_report(j);
1478                                         }
1479                         }
1480 #endif
1481
1482                         hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state);
1483                         printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state);
1484                         if (!retried) {
1485                                 printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n"
1486                                "and then I'm going to panic\n");
1487
1488                                 retried = 1;
1489                                 goto retry;
1490                         }
1491
1492                         panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive);
1493                 }
1494         }
1495
1496         active_total |= curactive;
1497         if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) {
1498                 printf("pmap_ipi: retrying\n");
1499                 goto retry;
1500         }
1501  done:
1502         return (active_total);
1503 }
1504 #endif
1505
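/*
 * Invalidate a single page mapping: optionally clear the TSB entry, flush
 * the local TLB, and, under SMP, notify the other CPUs via pmap_ipi().
 */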
1506 void
1507 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb)
1508 {
1509
1510         if (cleartsb == TRUE)
1511                 tsb_clear_tte(&pmap->pm_tsb, va);
1512
1513         DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va);
1514         spinlock_enter();
1515         invlpg(va, pmap->pm_context);
1516 #ifdef SMP
1517         pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context);
1518 #endif
1519         spinlock_exit();
1520 }
1521
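/*
 * Invalidate a range of mappings.  Single pages are handed off to
 * pmap_invalidate_page(); small ranges are flushed page by page, larger
 * ranges by demapping the whole context (or the whole TLB for context 0).
 * For the cross-call, the page-aligned start address and the page count
 * are packed into a single argument for the remote handler.
 */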
1522 void
1523 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb)
1524 {
1525         vm_offset_t tva, invlrngva;
1526         char *func;
1527 #ifdef SMP
1528         cpumask_t active;
1529 #endif
1530         if ((eva - sva) == PAGE_SIZE) {
1531                 pmap_invalidate_page(pmap, sva, cleartsb);
1532                 return;
1533         }
1534         
1535
1536         KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva));
1537
1538         if (cleartsb == TRUE) 
1539                 tsb_clear_range(&pmap->pm_tsb, sva, eva);
1540
1541         spinlock_enter();
1542         if ((eva - sva) < PAGE_SIZE*64) {
1543                 for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
1544                         invlpg(tva, pmap->pm_context);
1545                 func = tl_invlrng;
1546         } else if (pmap->pm_context) {
1547                 func = tl_invlctx;
1548                 invlctx(pmap->pm_context);
1549
1550         } else {
1551                 func = tl_invltlb;
1552                 invltlb();
1553         }
1554 #ifdef SMP
1555         invlrngva = sva | ((eva - sva) >> PAGE_SHIFT);
1556         active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva);
1557         active &= ~pmap->pm_active;
1558         atomic_clear_int(&pmap->pm_tlbactive, active);
1559 #endif
1560         spinlock_exit();
1561 }
1562
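/*
 * Invalidate all mappings belonging to the pmap by clearing its TSB and
 * demapping its entire context, locally and on the other CPUs.  Never
 * called for the kernel pmap.
 */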
1563 void
1564 pmap_invalidate_all(pmap_t pmap)
1565 {
1566
1567         KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap"));
1568
1569         tsb_clear(&pmap->pm_tsb);
1570
1571         spinlock_enter();
1572         invlctx(pmap->pm_context);
1573 #ifdef SMP
1574         pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0);
1575         pmap->pm_tlbactive = pmap->pm_active;
1576 #endif
1577         spinlock_exit();
1578 }
1579
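/*
 * Return whether any mapping of the page has the hardware modified (W)
 * bit set.
 */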
1580 boolean_t
1581 pmap_is_modified(vm_page_t m)
1582 {
1583
1584         return (tte_get_phys_bit(m, VTD_W));
1585 }
1586
1587
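/*
 * Return TRUE if the address currently has no entry in the pmap's TTE
 * hash and may therefore be prefaulted.
 */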
1588 boolean_t 
1589 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
1590 {
1591         return (tte_hash_lookup(pmap->pm_hash, va) == 0);
1592 }
1593
1594 /*
1595  * Extract the physical page address associated with the given kernel virtual
1596  * address.
1597  */
1598
1599 vm_paddr_t
1600 pmap_kextract(vm_offset_t va)
1601 {
1602         tte_t tte_data;
1603         vm_paddr_t pa;
1604
1605         pa = 0;
1606         if (va > KERNBASE && va < KERNBASE + nucleus_memory) {
1607                 uint64_t offset;
1608                 offset = va - KERNBASE; 
1609                 pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M);
1610         }
1611         if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0)
1612                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1613
1614         if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0)
1615                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1616
1617         return (pa);
1618 }
1619
1620 /*
1621  * Map a range of physical addresses into kernel virtual address space.
1622  *
1623  * The value passed in *virt is a suggested virtual address for the mapping.
1624  * Architectures which can support a direct-mapped physical to virtual region
1625  * can return the appropriate address within that region, leaving '*virt'
1626  * unchanged.
1627  */
1628 vm_offset_t
1629 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1630 {
1631         return (TLB_PHYS_TO_DIRECT(start));
1632 }
1633
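/*
 * pmap_mincore() is not implemented on sun4v; always return 0 for the
 * address.
 */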
1634 int 
1635 pmap_mincore(pmap_t pmap, vm_offset_t addr)
1636 {
1637         return (0);
1638 }
1639
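/*
 * No page-table preloading is done on sun4v; this routine only logs the
 * call.
 */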
1640 void 
1641 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 
1642                     vm_pindex_t index, vm_size_t size)
1643 {
1644         printf("pmap_object_init_pt\n");
1645         return;
1646 }
1647
1648 /*
1649  * Returns true if the pmap's pv is one of the first
1650  * 16 pvs linked to from this page.  This count may
1651  * be changed upwards or downwards in the future; it
1652  * is only necessary that true be returned for a small
1653  * subset of pmaps for proper page aging.
1654  */
1655 boolean_t
1656 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
1657 {
1658         pv_entry_t pv;
1659         int loops = 0;
1660
1661         if (m->flags & PG_FICTITIOUS)
1662                 return (FALSE);
1663
1664         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1665         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1666                 if (pv->pv_pmap == pmap) {
1667                         return (TRUE);
1668                 }
1669                 loops++;
1670                 if (loops >= 16)
1671                         break;
1672         }       
1673         return (FALSE);
1674 }
1675
1676 /*
1677  * Initialize a vm_page's machine-dependent fields.
1678  */
1679 void
1680 pmap_page_init(vm_page_t m)
1681 {
1682
1683         TAILQ_INIT(&m->md.pv_list);
1684         m->md.pv_list_count = 0;
1685 }
1686
1687 /*
1688  * Return the number of managed mappings to the given physical page
1689  * that are wired.
1690  */
1691 int
1692 pmap_page_wired_mappings(vm_page_t m)
1693 {
1694         pmap_t pmap;
1695         pv_entry_t pv;
1696         uint64_t tte_data;
1697         int count;
1698
1699         count = 0;
1700         if ((m->flags & PG_FICTITIOUS) != 0)
1701                 return (count);
1702         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1703         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1704                 pmap = pv->pv_pmap;
1705                 PMAP_LOCK(pmap);
1706                 tte_data = tte_hash_lookup(pmap->pm_hash, pv->pv_va);
1707                 if ((tte_data & VTD_WIRED) != 0)
1708                         count++;
1709                 PMAP_UNLOCK(pmap);
1710         }
1711         return (count);
1712 }
1713
1714 /*
1715  * Lower the permission for all mappings to a given page.
1716  */
1717 void
1718 pmap_remove_write(vm_page_t m)
1719 {
1720         if ((m->flags & PG_WRITEABLE) == 0)
1721                 return;
1722         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1723         tte_clear_phys_bit(m, VTD_SW_W|VTD_W);
1724         vm_page_flag_clear(m, PG_WRITEABLE);
1725 }

1726 /*
1727  * Initialize the pmap associated with process 0.
1728  */
1729 void
1730 pmap_pinit0(pmap_t pmap)
1731 {
1732         PMAP_LOCK_INIT(pmap);
1733         pmap->pm_active = pmap->pm_tlbactive = ~0;
1734         pmap->pm_context = 0;
1735         pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra;
1736         pmap->pm_hash = kernel_pmap->pm_hash;
1737         critical_enter();
1738         PCPU_SET(curpmap, pmap);
1739         critical_exit();
1740         TAILQ_INIT(&pmap->pm_pvlist);
1741         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1742 }
1743
1744 /*
1745  * Initialize a preallocated and zeroed pmap structure, such as one in a
1746  * vmspace structure.
1747  */
1748 int
1749 pmap_pinit(pmap_t pmap)
1750 {
1751         int i;
1752
1753         pmap->pm_context = get_context();
1754         pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb);
1755
1756         vm_page_lock_queues();
1757         pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch);
1758         tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT);
1759         vm_page_unlock_queues();
1760         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
1761         pmap->pm_active = pmap->pm_tlbactive = 0;
1762         for (i = 0; i < TSB_MAX_RESIZE; i++)
1763                 pmap->pm_old_tsb_ra[i] = 0;
1764
1765         TAILQ_INIT(&pmap->pm_pvlist);
1766         PMAP_LOCK_INIT(pmap);
1767         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1768         return (1);
1769 }
1770
1771 /*
1772  * Set the physical protection on the specified range of this map as requested.
1773  */
1774 void
1775 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1776 {
1777
1778         int anychanged;
1779         vm_offset_t tva;
1780         uint64_t clearbits;
1781
1782         DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot);
1783         
1784         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1785                 pmap_remove(pmap, sva, eva);
1786                 return;
1787         }
1788         
1789         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 
1790             (VM_PROT_WRITE|VM_PROT_EXECUTE))
1791                 return;
1792
1793         clearbits = anychanged = 0;
1794         
1795         if ((prot & VM_PROT_WRITE) == 0)
1796                 clearbits |= (VTD_W|VTD_SW_W);
1797         if ((prot & VM_PROT_EXECUTE) == 0)
1798                 clearbits |= VTD_X;
1799
1800         vm_page_lock_queues();
1801         PMAP_LOCK(pmap);
1802         for (tva = sva; tva < eva; tva += PAGE_SIZE) {
1803                 uint64_t otte_data;
1804                 vm_page_t m;
1805
1806                 if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, 
1807                                                      clearbits)) == 0)
1808                         continue;
1809                 /*
1810                  * XXX technically we should do a shootdown if it 
1811                  * was referenced and was executable - but is not now
1812                  */
1813                 if (!anychanged && (otte_data & VTD_W))
1814                         anychanged = 1;
1815                 
1816                 if (otte_data & VTD_MANAGED) {
1817                         m = NULL;
1818
1819                         if (otte_data & VTD_REF) {
1820                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1821                                 vm_page_flag_set(m, PG_REFERENCED);
1822                         }
1823                         if (otte_data & VTD_W) {
1824                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1825                                 vm_page_dirty(m);
1826                         }
1827                 } 
1828         }
1829
1830         vm_page_unlock_queues();
1831         if (anychanged)
1832                 pmap_invalidate_range(pmap, sva, eva, TRUE);
1833         PMAP_UNLOCK(pmap);
1834 }
1835
1836 /*
1837  * Map a list of wired pages into kernel virtual address space.  This is
1838  * intended for temporary mappings which do not need page modification or
1839  * references recorded.  Existing mappings in the region are overwritten.
1840  */
1841 void
1842 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
1843 {
1844         vm_offset_t va;
1845         tte_t otte;
1846         
1847         otte = 0;
1848         va = sva;
1849         while (count-- > 0) {
1850                 otte |= tte_hash_update(kernel_pmap->pm_hash, va,  
1851                                         VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K);
1852                 va += PAGE_SIZE;
1853                 m++;
1854         }
1855         if ((otte & VTD_REF) != 0)
1856                 pmap_invalidate_range(kernel_pmap, sva, va, FALSE);
1857 }
1858
1859 /*
1860  * Remove page mappings from kernel virtual address space.  Intended for
1861  * temporary mappings entered by pmap_qenter.
1862  */
1863 void
1864 pmap_qremove(vm_offset_t sva, int count)
1865 {
1866         vm_offset_t va;
1867         tte_t otte;
1868
1869         va = sva;
1870
1871         otte = 0;
1872         while (count-- > 0) {
1873                 otte |= tte_hash_delete(kernel_pmap->pm_hash, va);
1874                 va += PAGE_SIZE;
1875         }
1876         if ((otte & VTD_REF) != 0)
1877                 pmap_invalidate_range(kernel_pmap, sva, va, TRUE);
1878 }
1879
1880 /*
1881  * Release any resources held by the given physical map.
1882  * Called when a pmap initialized by pmap_pinit is being released.
1883  * Should only be called if the map contains no valid mappings.
1884  */
1885 void
1886 pmap_release(pmap_t pmap)
1887 {
1888         KASSERT(pmap->pm_stats.resident_count == 0,
1889             ("pmap_release: pmap resident count %ld != 0",
1890             pmap->pm_stats.resident_count));
1891
1892         tsb_deinit(&pmap->pm_tsb);
1893         tte_hash_destroy(pmap->pm_hash);
1894         free_context(pmap->pm_context);
1895         PMAP_LOCK_DESTROY(pmap);
1896 }
1897
1898 /*
1899  * Remove the given range of addresses from the specified map.
1900  */
1901 void
1902 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1903 {
1904         int invlva;
1905         vm_offset_t tva;
1906         uint64_t tte_data;
1907         /*
1908          * Perform an unsynchronized read.  This is, however, safe.
1909          */
1910         if (pmap->pm_stats.resident_count == 0)
1911                 return;
1912         
1913         DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", 
1914                 start, end);
1915         invlva = 0;
1916         vm_page_lock_queues();
1917         PMAP_LOCK(pmap);
1918         for (tva = start; tva < end; tva += PAGE_SIZE) {
1919                 if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0)
1920                         continue;
1921                 pmap_remove_tte(pmap, tte_data, tva);
1922                 if (tte_data & (VTD_REF|VTD_W))
1923                         invlva = 1;
1924         }
1925         vm_page_unlock_queues();
1926         if (invlva)
1927                 pmap_invalidate_range(pmap, start, end, TRUE);
1928         PMAP_UNLOCK(pmap);
1929 }
1930
1931 /*
1932  *      Routine:        pmap_remove_all
1933  *      Function:
1934  *              Removes this physical page from
1935  *              all physical maps in which it resides.
1936  *              Reflects back modify bits to the pager.
1937  *
1938  *      Notes:
1939  *              Original versions of this routine were very
1940  *              inefficient because they iteratively called
1941  *              pmap_remove (slow...)
1942  */
1943
1944 void
1945 pmap_remove_all(vm_page_t m)
1946 {
1947         pv_entry_t pv;
1948         uint64_t tte_data;
1949         DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m));
1950
1951         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1952         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1953                 PMAP_LOCK(pv->pv_pmap);
1954                 pv->pv_pmap->pm_stats.resident_count--;
1955
1956                 tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va);
1957
1958                 if (tte_data & VTD_WIRED)
1959                         pv->pv_pmap->pm_stats.wired_count--;
1960                 if (tte_data & VTD_REF)
1961                         vm_page_flag_set(m, PG_REFERENCED);
1962                 
1963                 /*
1964                  * Update the vm_page_t clean and reference bits.
1965                  */
1966                 if (tte_data & VTD_W) {
1967                         KASSERT((tte_data & VTD_SW_W),
1968         ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx",
1969                             pv->pv_va, tte_data));
1970                         vm_page_dirty(m);
1971                 }
1972         
1973                 pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE);
1974                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1975                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1976                 m->md.pv_list_count--;
1977                 PMAP_UNLOCK(pv->pv_pmap);
1978                 free_pv_entry(pv);
1979         }
1980         vm_page_flag_clear(m, PG_WRITEABLE);
1981 }
1982
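/*
 * Remove the pv entry for (pmap, va) from both the page's pv list and the
 * pmap's pv list, clearing PG_WRITEABLE once the page has no mappings
 * left.  The shorter of the two lists is searched.
 */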
1983 static void
1984 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1985 {
1986         pv_entry_t pv;
1987         if (pmap != kernel_pmap)
1988                 DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1989         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1990         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1991         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1992                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1993                         if (pmap == pv->pv_pmap && va == pv->pv_va) 
1994                                 break;
1995                 }
1996         } else {
1997                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1998                         if (va == pv->pv_va) 
1999                                 break;
2000                 }
2001         }
2002         KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m)));
2003         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2004         m->md.pv_list_count--;
2005         if (TAILQ_EMPTY(&m->md.pv_list))
2006                 vm_page_flag_clear(m, PG_WRITEABLE);
2007         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2008         free_pv_entry(pv);
2009 }
2010
2011
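/*
 * Remove every mapping recorded in the pmap's pv list.  Rather than
 * tearing entries down one at a time, the TTE hash is reset and the
 * entire context is invalidated at the end.
 */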
2012 void
2013 pmap_remove_pages(pmap_t pmap)
2014 {
2015         
2016         vm_page_t m;
2017         pv_entry_t pv, npv;
2018         tte_t tte_data;
2019         
2020         DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context);
2021         vm_page_lock_queues();
2022         PMAP_LOCK(pmap);
2023         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2024                 tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va);
2025
2026                 if (tte_data == 0) {
2027                         printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va);
2028                         panic("bad tte");
2029                 }
2030                 if (tte_data & VTD_WIRED) {
2031                         panic("wired page in process not handled correctly");
2032                         pmap->pm_stats.wired_count--;
2033                 }
2034                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2035
2036                 pmap->pm_stats.resident_count--;
2037                 
2038                 if (tte_data & VTD_W) {
2039                         vm_page_dirty(m);
2040                 }
2041                 
2042                 npv = TAILQ_NEXT(pv, pv_plist);
2043                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2044                 
2045                 m->md.pv_list_count--;
2046                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2047                 if (TAILQ_EMPTY(&m->md.pv_list))
2048                         vm_page_flag_clear(m, PG_WRITEABLE);
2049
2050                 free_pv_entry(pv);
2051         }
2052         pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch);
2053         if (0)
2054                 pmap_tsb_reset(pmap);
2055
2056         vm_page_unlock_queues();
2057         pmap_invalidate_all(pmap);
2058         PMAP_UNLOCK(pmap);
2059 }
2060
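/*
 * Free the TSBs left over from earlier resizes and restore the pmap's TSB
 * to its initial size.
 */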
2061 static void
2062 pmap_tsb_reset(pmap_t pmap)
2063 {
2064         int i;
2065
2066         for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) {
2067                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), 
2068                                        (1 << (TSB_INIT_SHIFT + i)));
2069                 pmap->pm_old_tsb_ra[i] = 0;
2070         }
2071         if (pmap->pm_old_tsb_ra[0] != 0) {
2072                 vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra;
2073                 int size = tsb_size(&pmap->pm_tsb);
2074                 pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT));
2075                 pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0];
2076                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size);
2077                 pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT;
2078                 pmap->pm_old_tsb_ra[0] = 0;
2079         }
2080 }
2081
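/*
 * Zero a range of physical memory using the hypervisor's memory scrub
 * service.  hv_mem_scrub() may scrub only part of the request, so loop
 * until the whole range has been covered.
 */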
2082 void
2083 pmap_scrub_pages(vm_paddr_t pa, int64_t size)
2084 {
2085         uint64_t bytes_zeroed;
2086         while (size > 0) {
2087                 hv_mem_scrub(pa, size, &bytes_zeroed);
2088                 pa += bytes_zeroed;
2089                 size -= bytes_zeroed;
2090         }
2091 }
2092
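/*
 * Bookkeeping for a mapping that has already been deleted from the TTE
 * hash: update the wired and resident counts, propagate the modified and
 * referenced bits to the vm_page, and free the pv entry for managed
 * pages.
 */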
2093 static void
2094 pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va)
2095 {
2096         
2097         vm_page_t m;
2098
2099         if (pmap != kernel_pmap)
2100                 DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data));
2101
2102         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2103         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2104         if (tte_data & VTD_WIRED)
2105                 pmap->pm_stats.wired_count--;
2106
2107         pmap->pm_stats.resident_count--;
2108         
2109         if (tte_data & VTD_MANAGED) {
2110                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2111                 if (tte_data & VTD_W) {
2112                         vm_page_dirty(m);       
2113                 }
2114                 if (tte_data & VTD_REF) 
2115                         vm_page_flag_set(m, PG_REFERENCED);
2116                 pmap_remove_entry(pmap, m, va);
2117         }
2118 }
2119
2120 /*
2121  * Resize the TSB if capacity misses exceed half of all TSB misses.
2122  */
2123 static void
2124 pmap_tsb_resize(pmap_t pmap)
2125 {
2126         uint32_t miss_count, cap_miss_count;
2127         struct tsb_resize_info info;
2128         hv_tsb_info_t hvtsb;
2129         uint64_t tsbscratch;
2130         int npages_shift;
2131
2132         KASSERT(pmap == curthread_pmap, ("operating on non-current pmap"));
2133         miss_count = pmap->pm_tsb_miss_count;
2134         cap_miss_count = pmap->pm_tsb_cap_miss_count;
2135         npages_shift = tsb_page_shift(pmap);
2136
2137         if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && 
2138             cap_miss_count > (miss_count >> 1)) {
2139                 DPRINTF("resizing tsb for proc=%s pid=%d\n", 
2140                         curthread->td_proc->p_comm, curthread->td_proc->p_pid);
2141                 pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra;
2142
2143                 /* double TSB size */
2144                 tsb_init(&hvtsb, &tsbscratch, npages_shift + 1);
2145 #ifdef SMP
2146                 spinlock_enter();
2147                 /* reset tsb */
2148                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t));
2149                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2150
2151                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2152                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2153                               pmap->pm_tsb_ra, pmap->pm_context);
2154                 info.tri_tsbscratch = pmap->pm_tsbscratch;
2155                 info.tri_tsb_ra = pmap->pm_tsb_ra;
2156                 pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info));
2157                 pmap->pm_tlbactive = pmap->pm_active;
2158                 spinlock_exit();
2159 #else 
2160                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb));
2161                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2162                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2163                               pmap->pm_tsb_ra, pmap->pm_context);
2164                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2165 #endif          
2166         }
2167         pmap->pm_tsb_miss_count = 0;
2168         pmap->pm_tsb_cap_miss_count = 0;
2169 }
2170
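/*
 * Replace the pmap's TTE hash with a larger one.  Other CPUs running
 * threads of this process are told to switch to the new hash before the
 * old one is destroyed.
 */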
2171 static void
2172 pmap_tte_hash_resize(pmap_t pmap)
2173 {
2174         tte_hash_t old_th = pmap->pm_hash;
2175         
2176         pmap->pm_hash = tte_hash_resize(pmap->pm_hash);
2177         spinlock_enter();
2178         if (curthread->td_proc->p_numthreads != 1) 
2179                 pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch);
2180
2181         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);   
2182         spinlock_exit();
2183         tte_hash_destroy(old_th);
2184 }
2185
2186 /*
2187  *      pmap_ts_referenced:
2188  *
2189  *      Return a count of reference bits for a page, clearing those bits.
2190  *      It is not necessary for every reference bit to be cleared, but it
2191  *      is necessary that 0 only be returned when there are truly no
2192  *      reference bits set.
2193  *
2194  *      XXX: The exact number of bits to check and clear is a matter that
2195  *      should be tested and standardized at some point in the future for
2196  *      optimal aging of shared pages.
2197  */
2198
2199 int
2200 pmap_ts_referenced(vm_page_t m)
2201 {
2202         
2203         int rv;
2204         pv_entry_t pv, pvf, pvn;
2205         pmap_t pmap;
2206         tte_t otte_data;
2207
2208         rv = 0;
2209         if (m->flags & PG_FICTITIOUS)
2210                 return (rv);
2211
2212         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2213         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2214                 
2215                 pvf = pv;
2216
2217                 do {
2218                         pvn = TAILQ_NEXT(pv, pv_list);
2219                         
2220                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2221                         
2222                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2223                         
2224                         pmap = pv->pv_pmap;
2225                         PMAP_LOCK(pmap);
2226                         otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF);
2227                         if ((otte_data & VTD_REF) != 0) {
2228                                 pmap_invalidate_page(pmap, pv->pv_va, TRUE);
2229                                 
2230                                 rv++;
2231                                 if (rv > 4) {
2232                                         PMAP_UNLOCK(pmap);
2233                                         break;
2234                                 }
2235                         }
2236                 
2237                         PMAP_UNLOCK(pmap);
2238                 } while ((pv = pvn) != NULL && pv != pvf);
2239         }
2240         return (rv);
2241 }
2242
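/*
 * Zero a full page through the direct map using the block-clear routine.
 */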
2243 void
2244 pmap_zero_page(vm_page_t m)
2245 {
2246         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2247 }
2248
2249 void
2250 pmap_zero_page_area(vm_page_t m, int off, int size)
2251 {
2252         vm_paddr_t pa;
2253         vm_offset_t va;
2254                 
2255         pa = VM_PAGE_TO_PHYS(m);
2256         va = TLB_PHYS_TO_DIRECT(pa);
2257         if (off == 0 && size == PAGE_SIZE)
2258                 hwblkclr((void *)va, PAGE_SIZE);
2259         else
2260                 bzero((char *)(va + off), size);
2261
2262 }
2263
2264 void
2265 pmap_zero_page_idle(vm_page_t m)
2266 {
2267         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2268 }
2269
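/*
 * Panic with diagnostic information when setting the non-zero-context TSB
 * fails.
 */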
2270 void
2271 pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap)
2272 {
2273         panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx",
2274               pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error);
2275         
2276 }