FreeBSD releng/8.0: sys/sun4v/sun4v/pmap.c
1 /*-
2  * Copyright (c) 2006 Kip Macy <kmacy@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include "opt_kstack_pages.h"
32 #include "opt_msgbuf.h"
33 #include "opt_pmap.h"
34 #include "opt_trap_trace.h"
35
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/kdb.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/msgbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/smp.h>
45 #include <sys/sched.h>
46 #include <sys/sysctl.h>
47 #include <sys/systm.h>
48 #include <sys/vmmeter.h>
49
50 #include <dev/ofw/openfirm.h>
51
52 #include <vm/vm.h> 
53 #include <vm/vm_page.h>
54 #include <vm/vm_param.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_extern.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_pager.h>
61 #include <vm/vm_phys.h>
62 #include <vm/uma.h>
63
64 #include <machine/cpu.h>
65 #include <machine/frame.h>
66 #include <machine/instr.h>
67 #include <machine/md_var.h>
68 #include <machine/metadata.h>
69 #include <machine/ofw_mem.h>
70 #include <machine/mmu.h>
71 #include <machine/smp.h>
72 #include <machine/tlb.h>
73 #include <machine/tte.h>
74 #include <machine/tte_hash.h>
75 #include <machine/pcb.h>
76 #include <machine/pstate.h>
77 #include <machine/tsb.h>
78
79 #include <machine/hypervisorvar.h>
80 #include <machine/hv_api.h>
81
82 #ifdef TRAP_TRACING
83 void trap_trace_report(int);
84 #endif
85
86 #if 1
87 #define PMAP_DEBUG
88 #endif
89 #ifndef PMAP_SHPGPERPROC
90 #define PMAP_SHPGPERPROC        200
91 #endif
92
93 /*
94  * Virtual and physical address of message buffer.
95  */
96 struct msgbuf *msgbufp;
97 vm_paddr_t msgbuf_phys;
98
99 /*
100  * Map of physical memory regions.
101  */
102 vm_paddr_t phys_avail[128];
103 vm_paddr_t phys_avail_tmp[128];
104 static struct ofw_mem_region mra[128];
105 static struct ofw_map translations[128];
106 static int translations_size;
107
108
109 struct ofw_mem_region sparc64_memreg[128];
110 int sparc64_nmemreg;
111
112 extern vm_paddr_t mmu_fault_status_area;
113
114 /*
115  * First and last available kernel virtual addresses.
116  */
117 vm_offset_t virtual_avail;
118 vm_offset_t virtual_end;
119 vm_offset_t kernel_vm_end;
120 vm_offset_t vm_max_kernel_address;
121
122 #ifndef PMAP_SHPGPERPROC
123 #define PMAP_SHPGPERPROC 200
124 #endif
125 /*
126  * Data for the pv entry allocation mechanism
127  */
128 static uma_zone_t pvzone;
129 static struct vm_object pvzone_obj;
130 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
131 int pmap_debug = 0;
132 static int pmap_debug_range = 1;
133 static int use_256M_pages = 1;
134
135 static struct mtx pmap_ctx_lock;
136 static uint16_t ctx_stack[PMAP_CONTEXT_MAX];
137 static int ctx_stack_top; 
138
139 static int permanent_mappings = 0;
140 static uint64_t nucleus_memory;
141 static uint64_t nucleus_mappings[4];
142 /*
143  * Kernel pmap.
144  */
145 struct pmap kernel_pmap_store;
146
147 hv_tsb_info_t kernel_td[MAX_TSB_INFO];
148
149 /*
150  * These limits should be determined at boot time; with tiny TLBs it
151  * doesn't make sense to try to selectively invalidate more than this
152  * many entries.
153  */
154 #define MAX_INVALIDATES   32
155 #define MAX_TSB_CLEARS   128
156
157 /*
158  * Allocate physical memory for use in pmap_bootstrap.
159  */
160 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
161
162 /*
163  * If a user pmap is processed with pmap_remove and the
164  * resident count drops to 0, there are no more pages to remove, so we
165  * need not continue.
166  */
167 #define PMAP_REMOVE_DONE(pm) \
168         ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
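
/*
 * Illustrative sketch only (the loop below is not code from this file):
 * a caller tearing down a VA range can use PMAP_REMOVE_DONE to stop as
 * soon as a user pmap has no resident pages left, e.g.
 *
 *      for (va = sva; va < eva; va += PAGE_SIZE) {
 *              if (PMAP_REMOVE_DONE(pm))
 *                      break;
 *              ...remove any mapping at va...
 *      }
 */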
169
170 /*
171  * Kernel MMU interface
172  */
173 #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) 
174
175 #ifdef PMAP_DEBUG
176 #define KDPRINTF if (pmap_debug) printf
177 #define DPRINTF \
178         if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \
179         panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n",  \
180               PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \
181 if (pmap_debug) printf
182
183
184 #else
185 #define DPRINTF(...)
186 #define KDPRINTF(...)
187 #endif
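
/*
 * Note: KDPRINTF prints only when pmap_debug is nonzero.  DPRINTF does the
 * same, but first panics if the current pmap has a non-zero context and is
 * not marked active on this CPU, which catches stale-context bugs before
 * any output is produced.
 */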
188
189
190 static void free_pv_entry(pv_entry_t pv);
191 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
192
193 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
194 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
195 static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va);
196 static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot);
197 static void pmap_tsb_reset(pmap_t pmap);
198 static void pmap_tsb_resize(pmap_t pmap);
199 static void pmap_tte_hash_resize(pmap_t pmap);
200
201 void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap);
202
203 struct tsb_resize_info {
204         uint64_t tri_tsbscratch;
205         uint64_t tri_tsb_ra;
206 };
207
208 /*
209  * Quick sort callout for comparing memory regions.
210  */
211 static int mr_cmp(const void *a, const void *b);
212 static int om_cmp(const void *a, const void *b);
213 static int
214 mr_cmp(const void *a, const void *b)
215 {
216         const struct ofw_mem_region *mra;
217         const struct ofw_mem_region *mrb;
218
219         mra = a;
220         mrb = b;
221         if (mra->mr_start < mrb->mr_start)
222                 return (-1);
223         else if (mra->mr_start > mrb->mr_start)
224                 return (1);
225         else
226                 return (0);
227 }
228 static int
229 om_cmp(const void *a, const void *b)
230 {
231         const struct ofw_map *oma;
232         const struct ofw_map *omb;
233
234         oma = a;
235         omb = b;
236         if (oma->om_start < omb->om_start)
237                 return (-1);
238         else if (oma->om_start > omb->om_start)
239                 return (1);
240         else
241                 return (0);
242 }
243
244 static __inline void
245 free_context(uint16_t ctx)
246 {
247         mtx_lock_spin(&pmap_ctx_lock);
248         ctx_stack[ctx_stack_top++] = ctx;
249         mtx_unlock_spin(&pmap_ctx_lock);
250
251         KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX, 
252                 ("context stack overrun - system error"));
253 }
254
255 static __inline uint16_t
256 get_context(void)
257 {
258         uint16_t ctx;
259
260         mtx_lock_spin(&pmap_ctx_lock);
261         ctx = ctx_stack[--ctx_stack_top];
262         mtx_unlock_spin(&pmap_ctx_lock);
263
264         KASSERT(ctx_stack_top > 0,
265                 ("context stack underrun - need to implement context stealing"));
266
267         return ctx;
268 }
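
/*
 * Usage sketch (assumed callers; pmap_pinit/pmap_release are not part of
 * this excerpt): a user pmap takes a context number when it is created and
 * gives it back when it is destroyed, roughly:
 *
 *      pm->pm_context = get_context();         at pmap creation
 *      ...
 *      free_context(pm->pm_context);           at pmap destruction
 */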
269
270 static __inline void
271 free_pv_entry(pv_entry_t pv)
272 {
273         pv_entry_count--;
274         uma_zfree(pvzone, pv);
275 }
276
277 /*
278  * get a new pv_entry, allocating a block from the system
279  * when needed.
280  */
281 static pv_entry_t
282 get_pv_entry(pmap_t locked_pmap)
283 {
284         static const struct timeval printinterval = { 60, 0 };
285         static struct timeval lastprint;
286         struct vpgqueues *vpq;
287         uint64_t tte_data;
288         pmap_t pmap;
289         pv_entry_t allocated_pv, next_pv, pv;
290         vm_offset_t va;
291         vm_page_t m;
292
293         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
294         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
295         allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
296         if (allocated_pv != NULL) {
297                 pv_entry_count++;
298                 if (pv_entry_count > pv_entry_high_water)
299                         pagedaemon_wakeup();
300                 else
301                         return (allocated_pv);
302         }
303
304         /*
305          * Reclaim pv entries: At first, destroy mappings to inactive
306          * pages.  After that, if a pv entry is still needed, destroy
307          * mappings to active pages.
308          */
309         if (ratecheck(&lastprint, &printinterval))
310                 printf("Approaching the limit on PV entries, "
311                     "increase the vm.pmap.shpgperproc tunable.\n");
312
313         vpq = &vm_page_queues[PQ_INACTIVE];
314 retry:
315         TAILQ_FOREACH(m, &vpq->pl, pageq) {
316                 if (m->hold_count || m->busy)
317                         continue;
318                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
319                         va = pv->pv_va;
320                         pmap = pv->pv_pmap;
321                         /* Avoid deadlock and lock recursion. */
322                         if (pmap > locked_pmap)
323                                 PMAP_LOCK(pmap);
324                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
325                                 continue;
326                         pmap->pm_stats.resident_count--;
327
328                         tte_data = tte_hash_delete(pmap->pm_hash, va);
329
330                         KASSERT((tte_data & VTD_WIRED) == 0,
331                             ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data));
332                         if (tte_data & VTD_REF)
333                                 vm_page_flag_set(m, PG_REFERENCED);
334                         if (tte_data & VTD_W) {
335                                 KASSERT((tte_data & VTD_SW_W),
336                                 ("get_pv_entry: modified page not writable: va: %lx, tte: %lx",
337                                     va, tte_data));
338                                 vm_page_dirty(m);
339                         }
340
341                         pmap_invalidate_page(pmap, va, TRUE);
342                         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
343                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
344                         if (TAILQ_EMPTY(&m->md.pv_list))
345                                 vm_page_flag_clear(m, PG_WRITEABLE);
346                         m->md.pv_list_count--;
347
348                         if (pmap != locked_pmap)
349                                 PMAP_UNLOCK(pmap);
350                         if (allocated_pv == NULL)
351                                 allocated_pv = pv;
352                         else
353                                 free_pv_entry(pv);
354                 }
355         }
356         if (allocated_pv == NULL) {
357                 if (vpq == &vm_page_queues[PQ_INACTIVE]) {
358                         vpq = &vm_page_queues[PQ_ACTIVE];
359                         goto retry;
360                 }
361                 panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
362         }
363         return (allocated_pv);
364 }
365
366 /*
367  * Allocate a physical page of memory directly from the phys_avail map.
368  * Can only be called from pmap_bootstrap before avail start and end are
369  * calculated.
370  */
371 static vm_paddr_t
372 pmap_bootstrap_alloc(vm_size_t size)
373 {
374         vm_paddr_t pa;
375         int i;
376
377         size = round_page(size);
378
379         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
380                 if (phys_avail[i + 1] - phys_avail[i] < size)
381                         continue;
382                 pa = phys_avail[i];
383                 phys_avail[i] += size;
384                 pmap_scrub_pages(pa, size);
385                 return (pa);
386         }
387         panic("pmap_bootstrap_alloc");
388 }
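
/*
 * Worked example: with phys_avail[0] == 0x10000000 and phys_avail[1] ==
 * 0x20000000, pmap_bootstrap_alloc(PAGE_SIZE_4M) returns 0x10000000 and
 * bumps phys_avail[0] to 0x10400000.  The allocation is not recorded
 * anywhere else, which is why this is only usable before avail start/end
 * are computed from phys_avail.
 */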
389
390 /*
391  * Activate a user pmap.  The pmap must be activated before its address space
392  * can be accessed in any way.
393  */
394 void
395 pmap_activate(struct thread *td)
396 {
397         pmap_t pmap, oldpmap;
398         int err;
399         
400         critical_enter();
401         pmap = vmspace_pmap(td->td_proc->p_vmspace);
402         oldpmap = PCPU_GET(curpmap);
403 #if defined(SMP)
404         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
405         atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask));
406         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
407 #else
408         oldpmap->pm_active &= ~1;
409         pmap->pm_active |= 1;
410         pmap->pm_tlbactive |= 1;
411 #endif
412
413         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
414         pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
415         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
416
417         PCPU_SET(curpmap, pmap);
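        /*
         * Point the hypervisor at this pmap's TSB for non-nucleus contexts,
         * then load the pmap's context number into the MMU context register.
         */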
418         if (pmap->pm_context != 0)
419                 if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK)
420                         panic("failed to set TSB 0x%lx - context == %ld\n", 
421                               pmap->pm_tsb_ra, pmap->pm_context);
422         stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context);
423         membar(Sync);
424         critical_exit();
425 }
426
427 /*
428  *      Increase the starting virtual address of the given mapping if a
429  *      different alignment might result in more superpage mappings.
430  */
431 void
432 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
433     vm_offset_t *addr, vm_size_t size)
434 {
435 }
436
437 /*
438  * Bootstrap the system enough to run with virtual memory.
439  */
440 void
441 pmap_bootstrap(vm_offset_t ekva)
442 {
443         struct pmap *pm;
444         vm_offset_t off, va;
445         vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start;
446         vm_size_t physsz, virtsz, kernel_hash_shift;
447         ihandle_t pmem, vmem;
448         int i, j, k, sz;
449         uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable;
450         vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds;
451         
452
453         if ((vmem = OF_finddevice("/virtual-memory")) == -1)
454                 panic("pmap_bootstrap: finddevice /virtual-memory");
455         if ((sz = OF_getproplen(vmem, "translations")) == -1)
456                 panic("pmap_bootstrap: getproplen translations");
457         if (sizeof(translations) < sz)
458                 panic("pmap_bootstrap: translations too small");
459         bzero(translations, sz);
460         if (OF_getprop(vmem, "translations", translations, sz) == -1)
461                 panic("pmap_bootstrap: getprop /virtual-memory/translations");
462         sz /= sizeof(*translations);
463         translations_size = sz;
464         nucleus_memory_start = 0;
465         CTR0(KTR_PMAP, "pmap_bootstrap: translations");
466         qsort(translations, sz, sizeof (*translations), om_cmp);
467
468         for (i = 0; i < sz; i++) {
469                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
470                         translations[i].om_size, translations[i].om_start, 
471                         translations[i].om_tte);
472                 if ((translations[i].om_start >= KERNBASE) && 
473                     (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) {
474                         for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) {
475                                 KDPRINTF("mapping permanent translation\n");
476                                 pa = TTE_GET_PA(translations[i].om_tte) + j;
477                                 va = translations[i].om_start + j;
478                                 error = hv_mmu_map_perm_addr(va, KCONTEXT, 
479                                                              pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB);
480                                 if (error != H_EOK)
481                                         panic("map_perm_addr returned error=%ld", error);
482                                 
483                                 if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start))
484                                         nucleus_memory_start = pa;
485                                 printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa);
486                                 nucleus_mappings[permanent_mappings++] = pa;
487                                 nucleus_memory += PAGE_SIZE_4M;
488 #ifdef SMP
489                                 mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M);
490 #endif
491                         }
492                 }  
493         }
494
495         /*
496          * Find out what physical memory is available from the prom and
497          * initialize the phys_avail array.  This must be done before
498          * pmap_bootstrap_alloc is called.
499          */
500         if ((pmem = OF_finddevice("/memory")) == -1)
501                 panic("pmap_bootstrap: finddevice /memory");
502         if ((sz = OF_getproplen(pmem, "available")) == -1)
503                 panic("pmap_bootstrap: getproplen /memory/available");
504         if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */
505                 panic("pmap_bootstrap: phys_avail too small");
506         if (sizeof(mra) < sz)
507                 panic("pmap_bootstrap: mra too small");
508         bzero(mra, sz);
509         if (OF_getprop(pmem, "available", mra, sz) == -1)
510                 panic("pmap_bootstrap: getprop /memory/available");
511
512         sz /= sizeof(*mra);
513         CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
514
515         qsort(mra, sz, sizeof (*mra), mr_cmp);
516         physmemstart_tunable = physmem_tunable = physmem = physsz = 0;
517         
518         if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) {
519                 KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable);
520         }
521         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) {
522                 physmem = atop(physmem_tunable);
523                 KDPRINTF("desired physmem=0x%lx\n", physmem_tunable);
524         }
525         if ((physmem_tunable != 0) && (physmemstart_tunable != 0))
526                 physmem_tunable += physmemstart_tunable;
527         
528         bzero(real_phys_avail, sizeof(real_phys_avail));
529         bzero(tmp_phys_avail, sizeof(tmp_phys_avail));
530
531         for (i = 0, j = 0; i < sz; i++) {
532                 uint64_t size;
533                 KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size);
534                 if (mra[i].mr_size < PAGE_SIZE_4M)
535                         continue;
536
537                 if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) {
538                         uint64_t newstart, roundup;
539                         newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M);
540                         roundup = newstart - mra[i].mr_start;
541                         size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M;
542                         mra[i].mr_start = newstart;
543                         if (size < PAGE_SIZE_4M)
544                                 continue;
545                         mra[i].mr_size = size;
546                 }
547                 real_phys_avail[j] = mra[i].mr_start;
548                 if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) {
549                         mra[i].mr_size = physmem_tunable - physsz;
550                         physsz = physmem_tunable;
551                         real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
552                         break;
553                 }
554                 physsz += mra[i].mr_size;
555                 real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
556                 j += 2;
557         }
558         physmem = btoc(physsz - physmemstart_tunable);
559
560         /*
561          * This is needed for versions of OFW that would allocate us memory
562          * and then forget to remove it from the available ranges, as well
563          * as to compensate for the handling of the nucleus pages above.
564          */
565         for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) {
566                 vm_paddr_t start = real_phys_avail[i];
567                 uint64_t end = real_phys_avail[i + 1];
568                 CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end);
569                 KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end);
570                 /* 
571                  * Is kernel memory at the beginning of range?
572                  */
573                 if (nucleus_memory_start == start) {
574                         start += nucleus_memory;
575                 }
576                 /* 
577                  * Is kernel memory at the end of range?
578                  */
579                 if (nucleus_memory_start == (end - nucleus_memory)) 
580                         end -= nucleus_memory;
581
582                 if (physmemstart_tunable != 0 && 
583                     (end < physmemstart_tunable))
584                         continue;
585
586                 if (physmemstart_tunable != 0 && 
587                     ((start < physmemstart_tunable))) {
588                         start = physmemstart_tunable;
589                 }
590
591                 /* 
592                  * Is kernel memory in the middle somewhere?             
593                  */
594                 if ((nucleus_memory_start > start) && 
595                     (nucleus_memory_start < end)) {
596                         phys_avail[j] = start;
597                         phys_avail[j+1] = nucleus_memory_start;
598                         start =  nucleus_memory_start + nucleus_memory;
599                         j += 2;
600                 }
601                 /*
602                  * Break phys_avail up on 4GB boundaries to try to work
603                  * around a PCI-e allocation bug; we rely on the fact that
604                  * kernel memory is allocated from the first 4GB of
605                  * physical memory.
606                  */ 
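                /*
                 * For example, an available range [3GB, 6GB) is emitted as two
                 * phys_avail entries, [3GB, 4GB) and [4GB, 6GB), so no single
                 * entry straddles a 4GB boundary.
                 */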
607                 while (bounds < start)
608                         bounds += (1UL<<32);
609
610                 while (bounds < end) {
611                         phys_avail[j] = start;
612                         phys_avail[j + 1] = bounds;
613                         start = bounds;
614                         bounds += (1UL<<32);
615                         j += 2;
616                 }
617                 phys_avail[j] = start; 
618                 phys_avail[j + 1] = end;
619                 j += 2;
620         }
621
622         /*
623          * Merge nucleus memory in to real_phys_avail
624          *
625          */
626         for (i = 0; real_phys_avail[i] != 0; i += 2) {
627                 if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory)
628                         real_phys_avail[i] -= nucleus_memory;
629                 
630                 if (real_phys_avail[i + 1] == nucleus_memory_start)
631                         real_phys_avail[i + 1] += nucleus_memory;
632                 
633                 if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) {
634                         real_phys_avail[i + 1] = real_phys_avail[i + 3];
635                         for (k = i + 2; real_phys_avail[k] != 0; k += 2) {
636                                 real_phys_avail[k] = real_phys_avail[k + 2];
637                                 real_phys_avail[k + 1] = real_phys_avail[k + 3];
638                         }
639                 }
640         }
641         for (i = 0; phys_avail[i] != 0; i += 2)
642                 if (pmap_debug_range || pmap_debug)
643                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
644                         i, phys_avail[i], i+1, phys_avail[i+1]);
645
646         /*
647          * Shuffle the memory range containing the 256MB page that holds
648          * nucleus_memory to the beginning of the phys_avail array so
649          * that physical memory from that page is preferentially
650          * allocated.
651          */
652         for (j = 0; phys_avail[j] != 0; j += 2) 
653                 if (nucleus_memory_start < phys_avail[j])
654                         break;
655         /*
656          * Don't shuffle unless we have a full 256M page in the range;
657          * our kernel malloc appears to be horribly brittle.
658          */
659         if ((phys_avail[j + 1] - phys_avail[j]) < 
660             (PAGE_SIZE_256M - nucleus_memory))
661                 goto skipshuffle;
662
663         for (i = j, k = 0; phys_avail[i] != 0; k++, i++)
664                 tmp_phys_avail[k] = phys_avail[i];
665         for (i = 0; i < j; i++)
666                 tmp_phys_avail[k + i] = phys_avail[i];
667         for (i = 0; i < 128; i++)
668                 phys_avail[i] = tmp_phys_avail[i];
669
670 skipshuffle:
671         for (i = 0; real_phys_avail[i] != 0; i += 2)
672                 if (pmap_debug_range || pmap_debug)
673                         printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n",
674                         i, real_phys_avail[i], i+1, real_phys_avail[i+1]);
675
676         for (i = 0; phys_avail[i] != 0; i += 2)
677                 if (pmap_debug_range || pmap_debug)
678                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
679                         i, phys_avail[i], i+1, phys_avail[i+1]);
680         /*
681          * Calculate the size of kernel virtual memory, and the size and mask
682          * for the kernel tsb.
683          */
684         virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
685         vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
686
687         /*
688          * Set the start and end of kva.  The kernel is loaded at the first
689          * available 4 meg super page, so round up to the end of the page.
690          */
691         virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
692         virtual_end = vm_max_kernel_address;
693         kernel_vm_end = vm_max_kernel_address;
694
695         /*
696          * Allocate and map a 4MB page for the kernel hashtable 
697          *
698          */
699 #ifndef SIMULATOR
700         kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */
701 #else
702         kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */
703 #endif
704
705         kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT)));
706         if (kernel_hash_pa & PAGE_MASK_4M)
707                 panic("pmap_bootstrap: hashtable pa unaligned\n");
708         /*
709          * Set up TSB descriptors for the hypervisor
710          *
711          */
712 #ifdef notyet
713         tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
714 #else
715         /* avoid alignment complaints from the hypervisor */
716         tsb_8k_size = PAGE_SIZE_4M;
717 #endif
718
719         tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size);
720         if (tsb_8k_pa & PAGE_MASK_4M)
721                 panic("pmap_bootstrap: tsb unaligned\n");
722         KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa);
723
724         tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3;
725         tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size);
726
727         kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K;
728         kernel_td[TSB8K_INDEX].hti_assoc = 1;
729         kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT);
730         kernel_td[TSB8K_INDEX].hti_ctx_index = 0;
731         kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K;
732         kernel_td[TSB8K_INDEX].hti_rsvd = 0;
733         kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa;
734
735         /*
736          * Initialize kernel's private TSB from 8K page TSB
737          *
738          */
739         kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K;
740         kernel_pmap->pm_tsb.hti_assoc = 1;
741         kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT);
742         kernel_pmap->pm_tsb.hti_ctx_index = 0;
743         kernel_pmap->pm_tsb.hti_pgszs = TSB8K;
744         kernel_pmap->pm_tsb.hti_rsvd = 0;
745         kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa;
746         
747         kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb);
748         tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb);
749         
750         /*
751          * Initialize kernel TSB for 4M pages
752          * currently (not by design) used for permanent mappings
753          */
754         
755
756         KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size);
757         kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M;
758         kernel_td[TSB4M_INDEX].hti_assoc = 1;
759         kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT);
760         kernel_td[TSB4M_INDEX].hti_ctx_index = 0;
761         kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M;
762         kernel_td[TSB4M_INDEX].hti_rsvd = 0;
763         kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa;
764         /*
765          * allocate MMU fault status areas for all CPUS
766          */
767         mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
768
769         /*
770          * Allocate and map the dynamic per-CPU area for the BSP.
771          */
772         dpcpu0 = (void *)TLB_PHYS_TO_DIRECT(pmap_bootstrap_alloc(DPCPU_SIZE));
773
774         /*
775          * Allocate and map the message buffer.
776          */
777         msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
778         msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
779
780         /*
781          * Allocate a kernel stack with guard page for thread0 and map it into
782          * the kernel tsb.  
783          */
784         pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE);
785         kstack0_phys = pa;
786         virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
787         kstack0 = virtual_avail;
788         virtual_avail += KSTACK_PAGES * PAGE_SIZE;
789         for (i = 0; i < KSTACK_PAGES; i++) {
790                 pa = kstack0_phys + i * PAGE_SIZE;
791                 va = kstack0 + i * PAGE_SIZE;
792                 tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va,
793                             pa | TTE_KERNEL | VTD_8K, 0);
794         }
795         /*
796          * Calculate the last available physical address.
797          */
798         for (i = 0; phys_avail[i + 2] != 0; i += 2)
799                 KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
800                         i, phys_avail[i], i+1, phys_avail[i+1]);
801         KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
802                         i, phys_avail[i], i+1, phys_avail[i+1]);
803
804         Maxmem = sparc64_btop(phys_avail[i + 1]);
805         
806         /*
807          * Add the prom mappings to the kernel tsb.
808          */
809         for (i = 0; i < sz; i++) {
810                 CTR3(KTR_PMAP,
811                     "translation: start=%#lx size=%#lx tte=%#lx",
812                     translations[i].om_start, translations[i].om_size,
813                     translations[i].om_tte);
814                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
815                        translations[i].om_size, translations[i].om_start, 
816                        translations[i].om_tte);
817
818                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
819                     translations[i].om_start > VM_MAX_PROM_ADDRESS) 
820                         continue;
821
822                 for (off = 0; off < translations[i].om_size;
823                      off += PAGE_SIZE) {
824                         va = translations[i].om_start + off;
825                         pa = TTE_GET_PA(translations[i].om_tte) + off;
826                         tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
827                         tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | 
828                                     TTE_KERNEL | VTD_8K, 0);
829                 }
830         }
831
832         if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, 
833                                      vtophys((vm_offset_t)kernel_td))) != H_EOK)
834                 panic("failed to set ctx0 TSBs error: %ld", error);
835
836 #ifdef SMP
837         mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td));
838 #endif
839         /*
840          * Set up the direct map of physical memory: use 256M pages where
841          * an aligned 256M chunk is available, otherwise fall back to 4M pages.
842          */
843         for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) {
844                 vm_paddr_t tag_pa = 0, next_pa = 0;
845                 uint64_t size_bits = VTD_4M;
846                 while (pa < real_phys_avail[i + 1]) {
847                         if (use_256M_pages &&
848                             (pa & PAGE_MASK_256M) == 0 && 
849                             ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) {
850                                 tag_pa = pa;
851                                 size_bits = VTD_256M;
852                                 next_pa = pa + PAGE_SIZE_256M;
853                         } else if (next_pa <= pa) {
854                                 tag_pa = pa;
855                                 size_bits = VTD_4M;
856                         }
857                         tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa));
858                         tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), 
859                                          TLB_PHYS_TO_DIRECT(pa), 
860                                          tag_pa | TTE_KERNEL | size_bits, 0);
861                         pa += PAGE_SIZE_4M;
862                 }
863         }
864
865         /*
866          * Get the available physical memory ranges from /memory/reg. These
867          * are only used for kernel dumps, but it may not be wise to do prom
868          * calls in that situation.
869          */
870         if ((sz = OF_getproplen(pmem, "reg")) == -1)
871                 panic("pmap_bootstrap: getproplen /memory/reg");
872         if (sizeof(sparc64_memreg) < sz)
873                 panic("pmap_bootstrap: sparc64_memreg too small");
874         if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
875                 panic("pmap_bootstrap: getprop /memory/reg");
876         sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
877
878         pm = kernel_pmap;
879         pm->pm_active = ~0;
880         pm->pm_tlbactive = ~0;
881
882         PMAP_LOCK_INIT(kernel_pmap);
883
884         TAILQ_INIT(&kernel_pmap->pm_pvlist);
885
886         /* 
887          * This could happen earlier - but I put it here to avoid 
888          * attempts to do updates until they're legal
889          */
890         pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, 
891                                              pmap_bootstrap_alloc(PAGE_SIZE));
892         pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash);
893
894         for (i = 0; i < translations_size; i++) {
895                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
896                        translations[i].om_size, translations[i].om_start, 
897                        translations[i].om_tte);
898
899                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
900                     translations[i].om_start > VM_MAX_PROM_ADDRESS) {
901                         KDPRINTF("skipping\n");
902                         continue;
903                 }
904                 for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) {
905                         va = translations[i].om_start + off;
906                         pa = TTE_GET_PA(translations[i].om_tte) + off;
907                         tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
908                 }
909                 KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", 
910                        translations[i].om_size, translations[i].om_start, 
911                        translations[i].om_tte);
912         }
913         for (i = 0; i < KSTACK_PAGES; i++) {
914                 pa = kstack0_phys + i * PAGE_SIZE;
915                 va = kstack0 + i * PAGE_SIZE;
916                 tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
917         }
918         /*
919          * Add direct mappings to hash
920          *
921          */
922 #ifdef notyet
923         /* hash only supports 8k pages */
924         for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M)
925                 tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), 
926                                 pa | TTE_KERNEL | VTD_4M);
927 #endif
928
929
930         if (bootverbose)
931                 printf("pmap_bootstrap done\n");
932 }
933
934
935
936 /*
937  *      Routine:        pmap_change_wiring
938  *      Function:       Change the wiring attribute for a map/virtual-address
939  *                      pair.
940  *      In/out conditions:
941  *                      The mapping must already exist in the pmap.
942  */
943 void
944 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
945 {
946         boolean_t iswired;
947         PMAP_LOCK(pmap);
948         iswired = tte_get_virt_bit(pmap, va, VTD_WIRED);
949
950         if (wired && !iswired) {
951                 pmap->pm_stats.wired_count++;
952                 tte_set_virt_bit(pmap, va, VTD_WIRED);
953         } else if (!wired && iswired) {
954                 pmap->pm_stats.wired_count--;
955                 tte_clear_virt_bit(pmap, va, VTD_WIRED);
956         }
957         PMAP_UNLOCK(pmap);
958 }
959
960 void
961 pmap_clear_modify(vm_page_t m)
962 {
963         KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m));
964         tte_clear_phys_bit(m, VTD_W);
965 }
966
967 void
968 pmap_clear_reference(vm_page_t m)
969 {
970         KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m));
971         tte_clear_phys_bit(m, VTD_REF);
972 }
973
974 void
975 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
976           vm_size_t len, vm_offset_t src_addr)
977 {
978         vm_offset_t addr, end_addr;
979
980         end_addr = src_addr + len;
981         /*
982          * Don't let optional prefaulting of pages make us go
983          * way below the low water mark of free pages or way
984          * above high water mark of used pv entries.
985          */
986         if (cnt.v_free_count < cnt.v_free_reserved ||
987             pv_entry_count > pv_entry_high_water)
988                 return;
989         
990
991         vm_page_lock_queues();
992         if (dst_pmap < src_pmap) {
993                 PMAP_LOCK(dst_pmap);
994                 PMAP_LOCK(src_pmap);
995         } else {
996                 PMAP_LOCK(src_pmap);
997                 PMAP_LOCK(dst_pmap);
998         }
999         for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) {
1000                 tte_t tte_data;
1001                 vm_page_t m;
1002
1003                 tte_data = tte_hash_lookup(src_pmap->pm_hash, addr);
1004
1005                 if ((tte_data & VTD_MANAGED) != 0) {
1006                         if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) {
1007                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1008
1009                                 tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED));
1010                                 dst_pmap->pm_stats.resident_count++;
1011                                 pmap_insert_entry(dst_pmap, addr, m);
1012                         } 
1013                 }               
1014         }
1015         vm_page_unlock_queues();
1016         PMAP_UNLOCK(src_pmap);
1017         PMAP_UNLOCK(dst_pmap);
1018 }
1019
1020 void
1021 pmap_copy_page(vm_page_t src, vm_page_t dst)
1022 {
1023         vm_paddr_t srcpa, dstpa;
1024         srcpa = VM_PAGE_TO_PHYS(src);
1025         dstpa = VM_PAGE_TO_PHYS(dst);
1026
1027         novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE);
1028
1029
1030 }
1031
1032 static __inline void
1033 pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired)
1034 {
1035
1036         if (wired)
1037                 pmap->pm_stats.wired_count++;
1038         
1039         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1040                 pmap_insert_entry(pmap, va, m);
1041                 *tte_data |= VTD_MANAGED;
1042         }
1043 }
1044
1045 /*
1046  * Map the given physical page at the specified virtual address in the
1047  * target pmap with the protection requested.  If specified the page
1048  * will be wired down.
1049  */
1050 void
1051 pmap_enter(pmap_t pmap, vm_offset_t va, vm_prot_t access, vm_page_t m,
1052     vm_prot_t prot, boolean_t wired)
1053 {
1054         vm_paddr_t pa, opa;
1055         uint64_t tte_data, otte_data;
1056         vm_page_t om;
1057         int invlva;
1058
1059         if (pmap->pm_context)
1060                 DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, 
1061                         VM_PAGE_TO_PHYS(m), prot);
1062
1063         om = NULL;
1064         
1065         vm_page_lock_queues();
1066         PMAP_LOCK(pmap);
1067
1068         tte_data = pa = VM_PAGE_TO_PHYS(m);
1069         otte_data = tte_hash_delete(pmap->pm_hash, va);
1070         opa = TTE_GET_PA(otte_data);
1071
1072         if (opa == 0) {
1073                 /*
1074                  * This is a new mapping
1075                  */
1076                 pmap->pm_stats.resident_count++;
1077                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1078
1079         } else if (pa != opa) {
1080                 /*
1081                  * Mapping has changed, handle validating new mapping.
1082                  * 
1083                  */
1084                 if (otte_data & VTD_WIRED)
1085                         pmap->pm_stats.wired_count--;
1086
1087                 if (otte_data & VTD_MANAGED) {
1088                         om = PHYS_TO_VM_PAGE(opa);
1089                         pmap_remove_entry(pmap, om, va);
1090                 }
1091
1092                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1093
1094         } else /* (pa == opa) */ {
1095                 /*
1096                  * Mapping has not changed, must be protection or wiring change.
1097                  */
1098
1099                 /*
1100                  * Wiring change, just update stats. We don't worry about
1101                  * wiring PT pages as they remain resident as long as there
1102                  * are valid mappings in them. Hence, if a user page is wired,
1103                  * the PT page will be also.
1104                  */
1105                 if (wired && ((otte_data & VTD_WIRED) == 0))
1106                         pmap->pm_stats.wired_count++;
1107                 else if (!wired && (otte_data & VTD_WIRED))
1108                         pmap->pm_stats.wired_count--;
1109
1110                 /*
1111                  * We might be turning off write access to the page,
1112                  * so we go ahead and sense modify status.
1113                  */
1114                 if (otte_data & VTD_MANAGED) {
1115                         om = m;
1116                         tte_data |= VTD_MANAGED;
1117                 }
1118         } 
1119
1120         /*
1121          * Now validate mapping with desired protection/wiring.
1122          */
1123         if ((prot & VM_PROT_WRITE) != 0) {
1124                 tte_data |= VTD_SW_W; 
1125                 vm_page_flag_set(m, PG_WRITEABLE);
1126         }
1127         if ((prot & VM_PROT_EXECUTE) != 0)
1128                 tte_data |= VTD_X;
1129         if (wired)
1130                 tte_data |= VTD_WIRED;
1131         if (pmap == kernel_pmap)
1132                 tte_data |= VTD_P;
1133         
1134         invlva = FALSE;
1135         if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
1136                 if (otte_data & VTD_V) {
1137                         if (otte_data & VTD_REF) {
1138                                 if (otte_data & VTD_MANAGED) 
1139                                         vm_page_flag_set(om, PG_REFERENCED);
1140                                 if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X)))
1141                                         invlva = TRUE;
1142                         }
1143                         if (otte_data & VTD_W) {
1144                                 if (otte_data & VTD_MANAGED) 
1145                                         vm_page_dirty(om);
1146                                 if ((pa & VTD_SW_W) != 0) 
1147                                         invlva = TRUE;
1148                         }
1149                         if (invlva)
1150                                 pmap_invalidate_page(pmap, va, TRUE);
1151                 }
1152         } 
1153
1154
1155         tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF);
1156         /*
1157          * XXX this needs to be locked for the threaded / kernel case 
1158          */
1159         tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, 
1160                     pmap->pm_context);
1161
1162         if (tte_hash_needs_resize(pmap->pm_hash))
1163                 pmap_tte_hash_resize(pmap);
1164
1165         /*
1166          * 512 is an arbitrary number of tsb misses
1167          */
1168         if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512)
1169                 pmap_tsb_resize(pmap);
1170
1171         vm_page_unlock_queues();
1172
1173         PMAP_UNLOCK(pmap);
1174 }
1175
1176 /*
1177  * Maps a sequence of resident pages belonging to the same object.
1178  * The sequence begins with the given page m_start.  This page is
1179  * mapped at the given virtual address start.  Each subsequent page is
1180  * mapped at a virtual address that is offset from start by the same
1181  * amount as the page is offset from m_start within the object.  The
1182  * last page in the sequence is the page with the largest offset from
1183  * m_start that can be mapped at a virtual address less than the given
1184  * virtual address end.  Not every virtual page between start and end
1185  * is mapped; only those for which a resident page exists with the
1186  * corresponding offset from m_start are mapped.
1187  */
1188 void
1189 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 
1190                   vm_page_t m_start, vm_prot_t prot)
1191 {
1192         vm_page_t m;
1193         vm_pindex_t diff, psize;
1194
1195         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
1196         psize = atop(end - start);
1197         m = m_start;
1198         PMAP_LOCK(pmap);
1199         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1200                 pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot);
1201                 m = TAILQ_NEXT(m, listq);
1202         }
1203         PMAP_UNLOCK(pmap);
1204 }
1205
1206 void
1207 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1208 {
1209         PMAP_LOCK(pmap);
1210         pmap_enter_quick_locked(pmap, va, m, prot);
1211         PMAP_UNLOCK(pmap);
1212 }
1213
1214 static void
1215 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1216 {
1217
1218         tte_t tte_data;
1219
1220         if (pmap->pm_context)
1221                 KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", 
1222                         pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot);
1223
1224         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1225         if (tte_hash_lookup(pmap->pm_hash, va))
1226                 return;
1227                 
1228         tte_data = VM_PAGE_TO_PHYS(m);
1229         /*
1230          * Enter on the PV list if part of our managed memory. Note that we
1231          * raise IPL while manipulating pv_table since pmap_enter can be
1232          * called at interrupt time.
1233          */
1234         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1235                 pmap_insert_entry(pmap, va, m);
1236                 tte_data |= VTD_MANAGED;
1237         }
1238
1239         pmap->pm_stats.resident_count++;
1240
1241         if ((prot & VM_PROT_EXECUTE) != 0)
1242                 tte_data |= VTD_X;
1243
1244         tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS);
1245 }
1246
1247 /*
1248  * Extract the physical page address associated with the given
1249  * map/virtual_address pair.
1250  */
1251 vm_paddr_t
1252 pmap_extract(pmap_t pmap, vm_offset_t va)
1253 {
1254         vm_paddr_t pa;
1255         tte_t tte_data;
1256
1257         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1258         pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1259
1260         return (pa);
1261 }
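
/*
 * Note: TTE_GET_PAGE_MASK() is assumed here to yield the offset mask for
 * the mapping's page size (e.g. 0x1fff for an 8K mapping, 0x3fffff for a
 * 4M mapping), so the low bits of va supply the offset within the page.
 * The result is only meaningful if a mapping actually exists for va.
 */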
1262
1263 /*
1264  * Atomically extract and hold the physical page with the given
1265  * pmap and virtual address pair if that mapping permits the given
1266  * protection.
1267  */
1268 vm_page_t
1269 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1270 {
1271         tte_t tte_data;
1272         vm_page_t m;
1273
1274         m = NULL;
1275         vm_page_lock_queues();
1276         PMAP_LOCK(pmap);
1277         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1278         if (tte_data != 0 && 
1279             ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) {
1280                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1281                 vm_page_hold(m);
1282         }
1283         vm_page_unlock_queues();
1284         PMAP_UNLOCK(pmap);
1285
1286         return (m);
1287 }
1288
1289 void *
1290 pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment)
1291 {
1292         vm_page_t m, tm;
1293         int i;
1294         void *ptr;
1295         
1296         m = NULL;
1297         while (m == NULL) {     
1298                 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1299                         m = vm_phys_alloc_contig(npages, phys_avail[i], 
1300                             phys_avail[i + 1], alignment, (1UL<<34));
1301                         if (m)
1302                                 goto found;
1303                 }
1304                 if (m == NULL) {
1305                         printf("vm_phys_alloc_contig failed - waiting to retry\n");
1306                         VM_WAIT;
1307                 }
1308         }
1309 found:
1310         for (i = 0, tm = m; i < npages; i++, tm++) {
1311                 tm->wire_count++;
1312                 if ((tm->flags & PG_ZERO) == 0)
1313                         pmap_zero_page(tm);
1314         }
1315         ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
1316         
1317         return (ptr);
1318 }
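
/*
 * The (1UL << 34) argument is the boundary constraint handed to
 * vm_phys_alloc_contig(); as used here it should keep any returned run of
 * pages from crossing a 16GB physical boundary.  The alignment argument is
 * passed straight through from the caller.
 */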
1319
1320 void
1321 pmap_free_contig_pages(void *ptr, int npages)
1322 {
1323         int i;
1324         vm_page_t m;
1325
1326         m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr));
1327         for (i = 0; i < npages; i++, m++) {
1328                 m->wire_count--;
1329                 atomic_subtract_int(&cnt.v_wire_count, 1);
1330                 vm_page_free(m);
1331         }
1332 }
1333
1334 void 
1335 pmap_growkernel(vm_offset_t addr)
1336 {
1337         return;
1338 }
1339
1340 void 
1341 pmap_init(void)
1342 {
1343
1344         /* allocate pv_entry zones */
1345         int shpgperproc = PMAP_SHPGPERPROC;
1346
1347         for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) 
1348                 ctx_stack[ctx_stack_top] = ctx_stack_top;
1349
1350         mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN);
1351
1352         /*
1353          * Initialize the address space (zone) for the pv entries.  Set a
1354          * high water mark so that the system can recover from excessive
1355          * numbers of pv entries.
1356          */
1357         pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
1358             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
1359         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1360         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
1361         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
1362         pv_entry_high_water = 9 * (pv_entry_max / 10);
1363         uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
1364
1365         tte_hash_init();
1366
1367 }
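
/*
 * The PV entry limits above come from loader tunables.  A sketch of how an
 * administrator might raise them (the values are only examples):
 *
 *      # /boot/loader.conf
 *      vm.pmap.shpgperproc="300"
 *      vm.pmap.pv_entries="2000000"
 */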
1368
1369 /*
1370  * Create a pv entry for page at pa for
1371  * (pmap, va).
1372  */
1373 static void
1374 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1375 {
1376         pv_entry_t pv;
1377
1378         KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1379         pv = get_pv_entry(pmap);
1380         pv->pv_va = va;
1381         pv->pv_pmap = pmap;
1382
1383         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1384         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1385         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1386         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1387         m->md.pv_list_count++;
1388 }
1389
1390 #ifdef TRAP_TRACING
1391 static int trap_trace_report_done;
1392 #endif
1393
1394 #ifdef SMP
1395 static cpumask_t
1396 pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2)
1397 {
1398
1399         int i, cpu_count, retried;
1400         u_int cpus;
1401         cpumask_t cpumask, active, curactive;
1402         cpumask_t active_total, ackmask;
1403         uint16_t *cpulist;
1404
1405         retried = 0;
1406
1407         if (!smp_started)
1408                 return (0);
1409
1410         cpumask = PCPU_GET(cpumask);
1411         cpulist = PCPU_GET(cpulist);
1412         curactive = 0;
1413
1414         if (rdpr(pil) != 14)
1415                 panic("pil %ld != 14", rdpr(pil));
1416
1417 #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED
1418         /* by definition cpumask should have curcpu's bit set */
1419         if (cpumask != (1 << curcpu)) 
1420                 panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", 
1421                       cpumask, (1 << curcpu));
1422
1423 #endif
1424 #ifdef notyet
1425         if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0)
1426                 goto done;
1427
1428         if (pmap->pm_context != 0)
1429                 active_total = active = (pmap->pm_tlbactive & ~cpumask);
1430         else 
1431 #endif
1432                 active_total = active = PCPU_GET(other_cpus);
1433
1434         if (active == 0)
1435                 goto done;
1436         
1437  retry:
1438         
1439         for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) {
1440                 if ((cpus & 0x1) == 0)
1441                         continue;
1442                 
1443                 curactive |= (1 << i);
1444                 cpulist[cpu_count] = (uint16_t)i;
1445                 cpu_count++;
1446         }
1447
1448         ackmask = 0;
1449         cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, 
1450                          (uint64_t)arg2, (uint64_t *)&ackmask);
1451
1452         while (ackmask != curactive) {
1453                 membar(Sync);
1454                 i++;
1455                 if (i > 10000000) {
1456 #ifdef TRAP_TRACING
1457                         int j;
1458 #endif
1459                         uint64_t cpu_state;
1460                         printf("cpu with cpumask=0x%x appears not to be responding to IPIs\n",
1461                                curactive & ~ackmask);
1462
1463 #ifdef TRAP_TRACING
1464                         if (!trap_trace_report_done) {
1465                                 trap_trace_report_done = 1;
1466                                 for (j = 0; j < MAXCPU; j++)
1467                                         if (((1 << j) & curactive & ~ackmask) != 0) {
1468                                                 struct pcpu *pc = pcpu_find(j);
1469                                                 printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n",
1470                                                     pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3],
1471                                                     pc->pad[4], pc->pad[5], pc->pad[6]);
1472                                                 trap_trace_report(j);
1473                                         }
1474                         }
1475 #endif
1476
1477                         hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state);
1478                         printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state);
1479                         if (!retried) {
1480                                 printf("resending the IPI once to rule out a memory barrier bug\n"
1481                                     "before panicking\n");
1482
1483                                 retried = 1;
1484                                 goto retry;
1485                         }
1486
1487                         panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive);
1488                 }
1489         }
1490
1491         active_total |= curactive;
1492         if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) {
1493                 printf("pmap_ipi: retrying");
1494                 goto retry;
1495         }
1496  done:
1497         return (active_total);
1498 }
1499 #endif
1500
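/*
 * Invalidate a single page mapping in the given pmap, optionally clearing
 * its TSB entry first, and notify the other CPUs via IPI on SMP kernels.
 */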
1501 void
1502 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb)
1503 {
1504
1505         if (cleartsb == TRUE)
1506                 tsb_clear_tte(&pmap->pm_tsb, va);
1507
1508         DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va);
1509         spinlock_enter();
1510         invlpg(va, pmap->pm_context);
1511 #ifdef SMP
1512         pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context);
1513 #endif
1514         spinlock_exit();
1515 }
1516
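/*
 * Invalidate a range of mappings.  Small ranges are flushed page by page;
 * larger ranges fall back to a context or full TLB flush.  The same
 * operation is requested from the other CPUs via IPI on SMP kernels.
 */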
1517 void
1518 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb)
1519 {
1520         vm_offset_t tva, invlrngva;
1521         char *func;
1522 #ifdef SMP
1523         cpumask_t active;
1524 #endif
1525         if ((eva - sva) == PAGE_SIZE) {
1526                 pmap_invalidate_page(pmap, sva, cleartsb);
1527                 return;
1528         }
1529         
1530
1531         KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva));
1532
1533         if (cleartsb == TRUE) 
1534                 tsb_clear_range(&pmap->pm_tsb, sva, eva);
1535
1536         spinlock_enter();
1537         if ((eva - sva) < PAGE_SIZE*64) {
1538                 for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
1539                         invlpg(tva, pmap->pm_context);
1540                 func = tl_invlrng;
1541         } else if (pmap->pm_context) {
1542                 func = tl_invlctx;
1543                 invlctx(pmap->pm_context);
1544
1545         } else {
1546                 func = tl_invltlb;
1547                 invltlb();
1548         }
1549 #ifdef SMP
1550         invlrngva = sva | ((eva - sva) >> PAGE_SHIFT);
1551         active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva);
1552         active &= ~pmap->pm_active;
1553         atomic_clear_int(&pmap->pm_tlbactive, active);
1554 #endif
1555         spinlock_exit();
1556 }
1557
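/*
 * Invalidate every mapping belonging to the pmap by flushing its context,
 * both locally and on the other CPUs.
 */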
1558 void
1559 pmap_invalidate_all(pmap_t pmap)
1560 {
1561
1562         KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap"));
1563
1564         tsb_clear(&pmap->pm_tsb);
1565
1566         spinlock_enter();
1567         invlctx(pmap->pm_context);
1568 #ifdef SMP
1569         pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0);
1570         pmap->pm_tlbactive = pmap->pm_active;
1571 #endif
1572         spinlock_exit();
1573 }
1574
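/*
 * Return whether any mapping of the page has the modified (write) bit set.
 */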
1575 boolean_t
1576 pmap_is_modified(vm_page_t m)
1577 {
1578
1579         return (tte_get_phys_bit(m, VTD_W));
1580 }
1581
1582
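/*
 * Return TRUE if the given virtual address has no mapping yet in this pmap
 * and may therefore be prefaulted.
 */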
1583 boolean_t 
1584 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
1585 {
1586         return (tte_hash_lookup(pmap->pm_hash, va) == 0);
1587 }
1588
1589 /*
1590  * Extract the physical page address associated with the given kernel virtual
1591  * address.
1592  */
1593
1594 vm_paddr_t
1595 pmap_kextract(vm_offset_t va)
1596 {
1597         tte_t tte_data;
1598         vm_paddr_t pa;
1599
1600         pa = 0;
1601         if (va > KERNBASE && va < KERNBASE + nucleus_memory) {
1602                 uint64_t offset;
1603                 offset = va - KERNBASE; 
1604                 pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M);
1605         }
1606         if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0)
1607                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1608
1609         if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0)
1610                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1611
1612         return (pa);
1613 }
1614
1615 /*
1616  * Map a range of physical addresses into kernel virtual address space.
1617  *
1618  * The value passed in *virt is a suggested virtual address for the mapping.
1619  * Architectures which can support a direct-mapped physical to virtual region
1620  * can return the appropriate address within that region, leaving '*virt'
1621  * unchanged.
1622  */
1623 vm_offset_t
1624 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1625 {
1626         return (TLB_PHYS_TO_DIRECT(start));
1627 }
1628
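/*
 * Determine residency information for the given address; not implemented,
 * so always report 0.
 */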
1629 int 
1630 pmap_mincore(pmap_t pmap, vm_offset_t addr)
1631 {
1632         return (0);
1633 }
1634
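/*
 * Pre-populate page table entries for a backing object; effectively a
 * no-op stub on sun4v.
 */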
1635 void 
1636 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 
1637                     vm_pindex_t index, vm_size_t size)
1638 {
1639         printf("pmap_object_init_pt\n");
1640         return;
1641 }
1642
1643 /*
1644  * Returns true if the pmap's pv is one of the first
1645  * 16 pvs linked to from this page.  This count may
1646  * be changed upwards or downwards in the future; it
1647  * is only necessary that true be returned for a small
1648  * subset of pmaps for proper page aging.
1649  */
1650 boolean_t
1651 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
1652 {
1653         pv_entry_t pv;
1654         int loops = 0;
1655
1656         if (m->flags & PG_FICTITIOUS)
1657                 return (FALSE);
1658
1659         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1660         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1661                 if (pv->pv_pmap == pmap) {
1662                         return (TRUE);
1663                 }
1664                 loops++;
1665                 if (loops >= 16)
1666                         break;
1667         }       
1668         return (FALSE);
1669 }
1670
1671 /*
1672  * Initialize a vm_page's machine-dependent fields.
1673  */
1674 void
1675 pmap_page_init(vm_page_t m)
1676 {
1677
1678         TAILQ_INIT(&m->md.pv_list);
1679         m->md.pv_list_count = 0;
1680 }
1681
1682 /*
1683  * Return the number of managed mappings to the given physical page
1684  * that are wired.
1685  */
1686 int
1687 pmap_page_wired_mappings(vm_page_t m)
1688 {
1689         pmap_t pmap;
1690         pv_entry_t pv;
1691         uint64_t tte_data;
1692         int count;
1693
1694         count = 0;
1695         if ((m->flags & PG_FICTITIOUS) != 0)
1696                 return (count);
1697         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1698         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1699                 pmap = pv->pv_pmap;
1700                 PMAP_LOCK(pmap);
1701                 tte_data = tte_hash_lookup(pmap->pm_hash, pv->pv_va);
1702                 if ((tte_data & VTD_WIRED) != 0)
1703                         count++;
1704                 PMAP_UNLOCK(pmap);
1705         }
1706         return (count);
1707 }
1708
1709 /*
1710  * Lower the permission for all mappings to a given page.
1711  */
1712 void
1713 pmap_remove_write(vm_page_t m)
1714 {
1715         if ((m->flags & PG_WRITEABLE) == 0)
1716                 return;
1717         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1718         tte_clear_phys_bit(m, VTD_SW_W|VTD_W);
1719         vm_page_flag_clear(m, PG_WRITEABLE);
1720 }
1721 /*
1722  * Initialize the pmap associated with process 0.
1723  */
1724 void
1725 pmap_pinit0(pmap_t pmap)
1726 {
1727         PMAP_LOCK_INIT(pmap);
1728         pmap->pm_active = pmap->pm_tlbactive = ~0;
1729         pmap->pm_context = 0;
1730         pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra;
1731         pmap->pm_hash = kernel_pmap->pm_hash;
1732         critical_enter();
1733         PCPU_SET(curpmap, pmap);
1734         critical_exit();
1735         TAILQ_INIT(&pmap->pm_pvlist);
1736         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1737 }
1738
1739 /*
1740  * Initialize a preallocated and zeroed pmap structure, such as one in a
1741  * vmspace structure.
1742  */
1743 int
1744 pmap_pinit(pmap_t pmap)
1745 {
1746         int i;
1747
1748         pmap->pm_context = get_context();
1749         pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb);
1750
1751         vm_page_lock_queues();
1752         pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch);
1753         tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT);
1754         vm_page_unlock_queues();
1755         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
1756         pmap->pm_active = pmap->pm_tlbactive = 0;
1757         for (i = 0; i < TSB_MAX_RESIZE; i++)
1758                 pmap->pm_old_tsb_ra[i] = 0;
1759
1760         TAILQ_INIT(&pmap->pm_pvlist);
1761         PMAP_LOCK_INIT(pmap);
1762         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1763         return (1);
1764 }
1765
1766 /*
1767  * Set the physical protection on the specified range of this map as requested.
1768  */
1769 void
1770 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1771 {
1772
1773         int anychanged;
1774         vm_offset_t tva;
1775         uint64_t clearbits;
1776
1777         DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot);
1778         
1779         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1780                 pmap_remove(pmap, sva, eva);
1781                 return;
1782         }
1783         
1784         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 
1785             (VM_PROT_WRITE|VM_PROT_EXECUTE))
1786                 return;
1787
1788         clearbits = anychanged = 0;
1789         
1790         if ((prot & VM_PROT_WRITE) == 0)
1791                 clearbits |= (VTD_W|VTD_SW_W);
1792         if ((prot & VM_PROT_EXECUTE) == 0)
1793                 clearbits |= VTD_X;
1794
1795         vm_page_lock_queues();
1796         PMAP_LOCK(pmap);
1797         for (tva = sva; tva < eva; tva += PAGE_SIZE) {
1798                 uint64_t otte_data;
1799                 vm_page_t m;
1800
1801                 if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, 
1802                                                      clearbits)) == 0)
1803                         continue;
1804                 /*
1805                  * XXX technically we should do a TLB shootdown if the
1806                  * mapping was referenced and executable but no longer is.
1807                  */
1808                 if (!anychanged && (otte_data & VTD_W))
1809                         anychanged = 1;
1810                 
1811                 if (otte_data & VTD_MANAGED) {
1812                         m = NULL;
1813
1814                         if (otte_data & VTD_REF) {
1815                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1816                                 vm_page_flag_set(m, PG_REFERENCED);
1817                         }
1818                         if (otte_data & VTD_W) {
1819                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1820                                 vm_page_dirty(m);
1821                         }
1822                 } 
1823         }
1824
1825         vm_page_unlock_queues();
1826         if (anychanged)
1827                 pmap_invalidate_range(pmap, sva, eva, TRUE);
1828         PMAP_UNLOCK(pmap);
1829 }
1830
1831 /*
1832  * Map a list of wired pages into kernel virtual address space.  This is
1833  * intended for temporary mappings which do not need page modification or
1834  * references recorded.  Existing mappings in the region are overwritten.
1835  */
1836 void
1837 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
1838 {
1839         vm_offset_t va;
1840         tte_t otte;
1841         
1842         otte = 0;
1843         va = sva;
1844         while (count-- > 0) {
1845                 otte |= tte_hash_update(kernel_pmap->pm_hash, va,  
1846                                         VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K);
1847                 va += PAGE_SIZE;
1848                 m++;
1849         }
1850         if ((otte & VTD_REF) != 0)
1851                 pmap_invalidate_range(kernel_pmap, sva, va, FALSE);
1852 }
1853
1854 /*
1855  * Remove page mappings from kernel virtual address space.  Intended for
1856  * temporary mappings entered by pmap_qenter.
1857  */
1858 void
1859 pmap_qremove(vm_offset_t sva, int count)
1860 {
1861         vm_offset_t va;
1862         tte_t otte;
1863
1864         va = sva;
1865
1866         otte = 0;
1867         while (count-- > 0) {
1868                 otte |= tte_hash_delete(kernel_pmap->pm_hash, va);
1869                 va += PAGE_SIZE;
1870         }
1871         if ((otte & VTD_REF) != 0)
1872                 pmap_invalidate_range(kernel_pmap, sva, va, TRUE);
1873 }
1874
1875 /*
1876  * Release any resources held by the given physical map.
1877  * Called when a pmap initialized by pmap_pinit is being released.
1878  * Should only be called if the map contains no valid mappings.
1879  */
1880 void
1881 pmap_release(pmap_t pmap)
1882 {
1883         KASSERT(pmap->pm_stats.resident_count == 0,
1884             ("pmap_release: pmap resident count %ld != 0",
1885             pmap->pm_stats.resident_count));
1886
1887         tsb_deinit(&pmap->pm_tsb);
1888         tte_hash_destroy(pmap->pm_hash);
1889         free_context(pmap->pm_context);
1890         PMAP_LOCK_DESTROY(pmap);
1891 }
1892
1893 /*
1894  * Remove the given range of addresses from the specified map.
1895  */
1896 void
1897 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1898 {
1899         int invlva;
1900         vm_offset_t tva;
1901         uint64_t tte_data;
1902         /*
1903          * Perform an unsynchronized read.  This is, however, safe.
1904          */
1905         if (pmap->pm_stats.resident_count == 0)
1906                 return;
1907         
1908         DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", 
1909                 start, end);
1910         invlva = 0;
1911         vm_page_lock_queues();
1912         PMAP_LOCK(pmap);
1913         for (tva = start; tva < end; tva += PAGE_SIZE) {
1914                 if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0)
1915                         continue;
1916                 pmap_remove_tte(pmap, tte_data, tva);
1917                 if (tte_data & (VTD_REF|VTD_W))
1918                         invlva = 1;
1919         }
1920         vm_page_unlock_queues();
1921         if (invlva)
1922                 pmap_invalidate_range(pmap, start, end, TRUE);
1923         PMAP_UNLOCK(pmap);
1924 }
1925
1926 /*
1927  *      Routine:        pmap_remove_all
1928  *      Function:
1929  *              Removes this physical page from
1930  *              all physical maps in which it resides.
1931  *              Reflects back modify bits to the pager.
1932  *
1933  *      Notes:
1934  *              Original versions of this routine were very
1935  *              inefficient because they iteratively called
1936  *              pmap_remove (slow...)
1937  */
1938
1939 void
1940 pmap_remove_all(vm_page_t m)
1941 {
1942         pv_entry_t pv;
1943         uint64_t tte_data;
1944         DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m));
1945
1946         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1947         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1948                 PMAP_LOCK(pv->pv_pmap);
1949                 pv->pv_pmap->pm_stats.resident_count--;
1950
1951                 tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va);
1952
1953                 if (tte_data & VTD_WIRED)
1954                         pv->pv_pmap->pm_stats.wired_count--;
1955                 if (tte_data & VTD_REF)
1956                         vm_page_flag_set(m, PG_REFERENCED);
1957                 
1958                 /*
1959                  * Update the vm_page_t clean and reference bits.
1960                  */
1961                 if (tte_data & VTD_W) {
1962                         KASSERT((tte_data & VTD_SW_W),
1963         ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx",
1964                             pv->pv_va, tte_data));
1965                         vm_page_dirty(m);
1966                 }
1967         
1968                 pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE);
1969                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1970                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1971                 m->md.pv_list_count--;
1972                 PMAP_UNLOCK(pv->pv_pmap);
1973                 free_pv_entry(pv);
1974         }
1975         vm_page_flag_clear(m, PG_WRITEABLE);
1976 }
1977
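/*
 * Remove the pv entry for (pmap, va) from both the page's pv list and the
 * pmap's pv list, searching whichever list is expected to be shorter.
 */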
1978 static void
1979 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1980 {
1981         pv_entry_t pv;
1982         if (pmap != kernel_pmap)
1983                 DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1984         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1985         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1986         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1987                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1988                         if (pmap == pv->pv_pmap && va == pv->pv_va) 
1989                                 break;
1990                 }
1991         } else {
1992                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1993                         if (va == pv->pv_va) 
1994                                 break;
1995                 }
1996         }
1997         KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m)));
1998         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1999         m->md.pv_list_count--;
2000         if (TAILQ_EMPTY(&m->md.pv_list))
2001                 vm_page_flag_clear(m, PG_WRITEABLE);
2002         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2003         free_pv_entry(pv);
2004 }
2005
2006
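/*
 * Tear down all of the pmap's managed mappings at process exit, then reset
 * the TTE hash and invalidate the pmap's TLB entries.
 */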
2007 void
2008 pmap_remove_pages(pmap_t pmap)
2009 {
2010         
2011         vm_page_t m;
2012         pv_entry_t pv, npv;
2013         tte_t tte_data;
2014         
2015         DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context);
2016         vm_page_lock_queues();
2017         PMAP_LOCK(pmap);
2018         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
2019                 tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va);
2020
2021                 if (tte_data == 0) {
2022                         printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va);
2023                         panic("bad tte");
2024                 }
2025                 if (tte_data & VTD_WIRED) {
2026                         panic("wired page in process not handled correctly");
2027                         pmap->pm_stats.wired_count--;
2028                 }
2029                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2030
2031                 pmap->pm_stats.resident_count--;
2032                 
2033                 if (tte_data & VTD_W) {
2034                         vm_page_dirty(m);
2035                 }
2036                 
2037                 npv = TAILQ_NEXT(pv, pv_plist);
2038                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2039                 
2040                 m->md.pv_list_count--;
2041                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2042                 if (TAILQ_EMPTY(&m->md.pv_list))
2043                         vm_page_flag_clear(m, PG_WRITEABLE);
2044
2045                 free_pv_entry(pv);
2046         }
2047         pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch);
2048         if (0)
2049                 pmap_tsb_reset(pmap);
2050
2051         vm_page_unlock_queues();
2052         pmap_invalidate_all(pmap);
2053         PMAP_UNLOCK(pmap);
2054 }
2055
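/*
 * Free the TSBs left over from earlier resizes and return the pmap to its
 * initially sized TSB.
 */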
2056 static void
2057 pmap_tsb_reset(pmap_t pmap)
2058 {
2059         int i;
2060
2061         for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) {
2062                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), 
2063                                        (1 << (TSB_INIT_SHIFT + i)));
2064                 pmap->pm_old_tsb_ra[i] = 0;
2065         }
2066         if (pmap->pm_old_tsb_ra[0] != 0) {
2067                 vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra;
2068                 int size = tsb_size(&pmap->pm_tsb);
2069                 pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT));
2070                 pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0];
2071                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size);
2072                 pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT;
2073                 pmap->pm_old_tsb_ra[0] = 0;
2074         }
2075 }
2076
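/*
 * Zero a physical range through the hypervisor memory scrub service,
 * looping until the requested size has been cleared.
 */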
2077 void
2078 pmap_scrub_pages(vm_paddr_t pa, int64_t size)
2079 {
2080         uint64_t bytes_zeroed;
2081         while (size > 0) {
2082                 hv_mem_scrub(pa, size, &bytes_zeroed);
2083                 pa += bytes_zeroed;
2084                 size -= bytes_zeroed;
2085         }
2086 }
2087
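/*
 * Bookkeeping for a mapping that has already been deleted from the TTE
 * hash: update the wired and resident counts and, for managed pages,
 * propagate the modified/referenced bits and drop the pv entry.
 */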
2088 static void
2089 pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va)
2090 {
2091         
2092         vm_page_t m;
2093
2094         if (pmap != kernel_pmap)
2095                 DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data));
2096
2097         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2098         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2099         if (tte_data & VTD_WIRED)
2100                 pmap->pm_stats.wired_count--;
2101
2102         pmap->pm_stats.resident_count--;
2103         
2104         if (tte_data & VTD_MANAGED) {
2105                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2106                 if (tte_data & VTD_W) {
2107                         vm_page_dirty(m);       
2108                 }
2109                 if (tte_data & VTD_REF) 
2110                         vm_page_flag_set(m, PG_REFERENCED);
2111                 pmap_remove_entry(pmap, m, va);
2112         }
2113 }
2114
2115 /*
2116  * Resize the TSB if capacity misses exceed half of the total TSB misses.
2117  */
2118 static void
2119 pmap_tsb_resize(pmap_t pmap)
2120 {
2121         uint32_t miss_count, cap_miss_count;
2122         struct tsb_resize_info info;
2123         hv_tsb_info_t hvtsb;
2124         uint64_t tsbscratch;
2125         int npages_shift;
2126
2127         KASSERT(pmap == curthread_pmap, ("operating on non-current pmap"));
2128         miss_count = pmap->pm_tsb_miss_count;
2129         cap_miss_count = pmap->pm_tsb_cap_miss_count;
2130         npages_shift = tsb_page_shift(pmap);
2131
2132         if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && 
2133             cap_miss_count > (miss_count >> 1)) {
2134                 DPRINTF("resizing tsb for proc=%s pid=%d\n", 
2135                         curthread->td_proc->p_comm, curthread->td_proc->p_pid);
2136                 pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra;
2137
2138                 /* double TSB size */
2139                 tsb_init(&hvtsb, &tsbscratch, npages_shift + 1);
2140 #ifdef SMP
2141                 spinlock_enter();
2142                 /* reset tsb */
2143                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t));
2144                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2145
2146                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2147                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2148                               pmap->pm_tsb_ra, pmap->pm_context);
2149                 info.tri_tsbscratch = pmap->pm_tsbscratch;
2150                 info.tri_tsb_ra = pmap->pm_tsb_ra;
2151                 pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info));
2152                 pmap->pm_tlbactive = pmap->pm_active;
2153                 spinlock_exit();
2154 #else 
2155                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb));
2156                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2157                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2158                               pmap->pm_tsb_ra, pmap->pm_context);
2159                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2160 #endif          
2161         }
2162         pmap->pm_tsb_miss_count = 0;
2163         pmap->pm_tsb_cap_miss_count = 0;
2164 }
2165
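/*
 * Grow the pmap's TTE hash, propagating the new hash to any other threads
 * of the process via IPI before destroying the old one.
 */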
2166 static void
2167 pmap_tte_hash_resize(pmap_t pmap)
2168 {
2169         tte_hash_t old_th = pmap->pm_hash;
2170         
2171         pmap->pm_hash = tte_hash_resize(pmap->pm_hash);
2172         spinlock_enter();
2173         if (curthread->td_proc->p_numthreads != 1) 
2174                 pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch);
2175
2176         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);   
2177         spinlock_exit();
2178         tte_hash_destroy(old_th);
2179 }
2180
2181 /*
2182  *      pmap_ts_referenced:
2183  *
2184  *      Return a count of reference bits for a page, clearing those bits.
2185  *      It is not necessary for every reference bit to be cleared, but it
2186  *      is necessary that 0 only be returned when there are truly no
2187  *      reference bits set.
2188  *
2189  *      XXX: The exact number of bits to check and clear is a matter that
2190  *      should be tested and standardized at some point in the future for
2191  *      optimal aging of shared pages.
2192  */
2193
2194 int
2195 pmap_ts_referenced(vm_page_t m)
2196 {
2197         
2198         int rv;
2199         pv_entry_t pv, pvf, pvn;
2200         pmap_t pmap;
2201         tte_t otte_data;
2202
2203         rv = 0;
2204         if (m->flags & PG_FICTITIOUS)
2205                 return (rv);
2206
2207         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2208         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2209                 
2210                 pvf = pv;
2211
2212                 do {
2213                         pvn = TAILQ_NEXT(pv, pv_list);
2214                         
2215                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2216                         
2217                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2218                         
2219                         pmap = pv->pv_pmap;
2220                         PMAP_LOCK(pmap);
2221                         otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF);
2222                         if ((otte_data & VTD_REF) != 0) {
2223                                 pmap_invalidate_page(pmap, pv->pv_va, TRUE);
2224                                 
2225                                 rv++;
2226                                 if (rv > 4) {
2227                                         PMAP_UNLOCK(pmap);
2228                                         break;
2229                                 }
2230                         }
2231                 
2232                         PMAP_UNLOCK(pmap);
2233                 } while ((pv = pvn) != NULL && pv != pvf);
2234         }
2235         return (rv);
2236 }
2237
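/*
 * Zero a full page through its direct-mapped address.
 */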
2238 void
2239 pmap_zero_page(vm_page_t m)
2240 {
2241         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2242 }
2243
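/*
 * Zero a sub-range of a page; a full-page request uses the block-clear
 * routine, anything smaller falls back to bzero.
 */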
2244 void
2245 pmap_zero_page_area(vm_page_t m, int off, int size)
2246 {
2247         vm_paddr_t pa;
2248         vm_offset_t va;
2249                 
2250         pa = VM_PAGE_TO_PHYS(m);
2251         va = TLB_PHYS_TO_DIRECT(pa);
2252         if (off == 0 && size == PAGE_SIZE)
2253                 hwblkclr((void *)va, PAGE_SIZE);
2254         else
2255                 bzero((char *)(va + off), size);
2256
2257 }
2258
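/*
 * Variant of pmap_zero_page() used by the page zeroing idle thread.
 */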
2259 void
2260 pmap_zero_page_idle(vm_page_t m)
2261 {
2262         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2263 }
2264
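/*
 * Panic helper invoked when setting the non-zero context TSB fails; reports
 * the pmap's context, TSB address, and the hypervisor error.
 */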
2265 void
2266 pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap)
2267 {
2268         panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx",
2269               pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error);
2270         
2271 }