1 /*-
2  * Copyright (c) 2006 Kip Macy <kmacy@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include "opt_kstack_pages.h"
32 #include "opt_msgbuf.h"
33 #include "opt_pmap.h"
34 #include "opt_trap_trace.h"
35
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/kdb.h>
39 #include <sys/ktr.h>
40 #include <sys/lock.h>
41 #include <sys/msgbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/proc.h>
44 #include <sys/smp.h>
45 #include <sys/sched.h>
46 #include <sys/sysctl.h>
47 #include <sys/systm.h>
48 #include <sys/vmmeter.h>
49
50 #include <dev/ofw/openfirm.h>
51
52 #include <vm/vm.h> 
53 #include <vm/vm_page.h>
54 #include <vm/vm_param.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_extern.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/vm_pager.h>
61 #include <vm/vm_phys.h>
62 #include <vm/uma.h>
63
64 #include <machine/cpu.h>
65 #include <machine/frame.h>
66 #include <machine/instr.h>
67 #include <machine/md_var.h>
68 #include <machine/metadata.h>
69 #include <machine/ofw_mem.h>
70 #include <machine/mmu.h>
71 #include <machine/smp.h>
72 #include <machine/tlb.h>
73 #include <machine/tte.h>
74 #include <machine/tte_hash.h>
75 #include <machine/pcb.h>
76 #include <machine/pstate.h>
77 #include <machine/tsb.h>
78
79 #include <machine/hypervisorvar.h>
80 #include <machine/hv_api.h>
81
82 #ifdef TRAP_TRACING
83 void trap_trace_report(int);
84 #endif
85
86 #if 1
87 #define PMAP_DEBUG
88 #endif
89 #ifndef PMAP_SHPGPERPROC
90 #define PMAP_SHPGPERPROC        200
91 #endif
92
93 /*
94  * Virtual and physical address of message buffer.
95  */
96 struct msgbuf *msgbufp;
97 vm_paddr_t msgbuf_phys;
98
99 /*
100  * Map of physical memory regions.
101  */
102 vm_paddr_t phys_avail[128];
103 vm_paddr_t phys_avail_tmp[128];
104 static struct ofw_mem_region mra[128];
105 static struct ofw_map translations[128];
106 static int translations_size;
107
108
109 struct ofw_mem_region sparc64_memreg[128];
110 int sparc64_nmemreg;
111
112 extern vm_paddr_t mmu_fault_status_area;
113
114 /*
115  * First and last available kernel virtual addresses.
116  */
117 vm_offset_t virtual_avail;
118 vm_offset_t virtual_end;
119 vm_offset_t kernel_vm_end;
120 vm_offset_t vm_max_kernel_address;
121
122 #ifndef PMAP_SHPGPERPROC
123 #define PMAP_SHPGPERPROC 200
124 #endif
125 /*
126  * Data for the pv entry allocation mechanism
127  */
128 static uma_zone_t pvzone;
129 static struct vm_object pvzone_obj;
130 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
131 int pmap_debug = 0;
132 static int pmap_debug_range = 1;
133 static int use_256M_pages = 1;
134
135 static struct mtx pmap_ctx_lock;
136 static uint16_t ctx_stack[PMAP_CONTEXT_MAX];
137 static int ctx_stack_top; 
138
139 static int permanent_mappings = 0;
140 static uint64_t nucleus_memory;
141 static uint64_t nucleus_mappings[4];
142 /*
143  * Kernel pmap.
144  */
145 struct pmap kernel_pmap_store;
146
147 hv_tsb_info_t kernel_td[MAX_TSB_INFO];
148
149 /*
150  * This should be determined at boot time;
151  * with tiny TLBs it doesn't make sense to try to selectively
152  * invalidate more than this.
153  */
154 #define MAX_INVALIDATES   32
155 #define MAX_TSB_CLEARS   128
156
157 /*
158  * Allocate physical memory for use in pmap_bootstrap.
159  */
160 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
161
162 /*
163  * If a user pmap is processed with pmap_remove and the
164  * resident count drops to 0, there are no more pages to remove, so we
165  * need not continue.
166  */
167 #define PMAP_REMOVE_DONE(pm) \
168         ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
169
170 /*
171  * Kernel MMU interface
172  */
173 #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) 
174
175 #ifdef PMAP_DEBUG
176 #define KDPRINTF if (pmap_debug) printf
177 #define DPRINTF \
178         if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \
179         panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n",  \
180               PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \
181 if (pmap_debug) printf
182
183
184 #else
185 #define DPRINTF(...)
186 #define KDPRINTF(...)
187 #endif
188
189
190 static void free_pv_entry(pv_entry_t pv);
191 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
192
193 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
194 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
195 static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va);
196 static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot);
197 static void pmap_tsb_reset(pmap_t pmap);
198 static void pmap_tsb_resize(pmap_t pmap);
199 static void pmap_tte_hash_resize(pmap_t pmap);
200
201 void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap);
202
203 struct tsb_resize_info {
204         uint64_t tri_tsbscratch;
205         uint64_t tri_tsb_ra;
206 };
207
208 /*
209  * Quick sort callout for comparing memory regions.
210  */
211 static int mr_cmp(const void *a, const void *b);
212 static int om_cmp(const void *a, const void *b);
213 static int
214 mr_cmp(const void *a, const void *b)
215 {
216         const struct ofw_mem_region *mra;
217         const struct ofw_mem_region *mrb;
218
219         mra = a;
220         mrb = b;
221         if (mra->mr_start < mrb->mr_start)
222                 return (-1);
223         else if (mra->mr_start > mrb->mr_start)
224                 return (1);
225         else
226                 return (0);
227 }
228 static int
229 om_cmp(const void *a, const void *b)
230 {
231         const struct ofw_map *oma;
232         const struct ofw_map *omb;
233
234         oma = a;
235         omb = b;
236         if (oma->om_start < omb->om_start)
237                 return (-1);
238         else if (oma->om_start > omb->om_start)
239                 return (1);
240         else
241                 return (0);
242 }
243
244 static __inline void
245 free_context(uint16_t ctx)
246 {
247         mtx_lock_spin(&pmap_ctx_lock);
248         ctx_stack[ctx_stack_top++] = ctx;
249         mtx_unlock_spin(&pmap_ctx_lock);
250
251         KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX, 
252                 ("context stack overrun - system error"));
253 }
254
255 static __inline uint16_t
256 get_context(void)
257 {
258         uint16_t ctx;
259
260         mtx_lock_spin(&pmap_ctx_lock);
261         ctx = ctx_stack[--ctx_stack_top];
262         mtx_unlock_spin(&pmap_ctx_lock);
263
264         KASSERT(ctx_stack_top > 0,
265                 ("context stack underrun - need to implement context stealing"));
266
267         return ctx;
268 }
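
/*
 * Context allocation in brief: pmap_init() pushes context numbers
 * 1 through PMAP_CONTEXT_MAX - 1 onto ctx_stack, so get_context() and
 * free_context() form a simple LIFO allocator for hardware context IDs,
 * serialized by the pmap_ctx_lock spin mutex; context 0 is left to the
 * kernel.  A minimal usage sketch for a hypothetical pmap
 * creation/teardown path (not code from this file):
 *
 *	pm->pm_context = get_context();		claim an ID
 *	...
 *	free_context(pm->pm_context);		give it back
 *
 * As the KASSERT in get_context() notes, context stealing is not
 * implemented, so exhausting the stack is fatal.
 */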
269
270 static __inline void
271 free_pv_entry(pv_entry_t pv)
272 {
273         pv_entry_count--;
274         uma_zfree(pvzone, pv);
275 }
276
277 /*
278  * get a new pv_entry, allocating a block from the system
279  * when needed.
280  */
281 static pv_entry_t
282 get_pv_entry(pmap_t locked_pmap)
283 {
284         static const struct timeval printinterval = { 60, 0 };
285         static struct timeval lastprint;
286         struct vpgqueues *vpq;
287         uint64_t tte_data;
288         pmap_t pmap;
289         pv_entry_t allocated_pv, next_pv, pv;
290         vm_offset_t va;
291         vm_page_t m;
292
293         PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
294         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
295         allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
296         if (allocated_pv != NULL) {
297                 pv_entry_count++;
298                 if (pv_entry_count > pv_entry_high_water)
299                         pagedaemon_wakeup();
300                 else
301                         return (allocated_pv);
302         }
303
304         /*
305          * Reclaim pv entries: At first, destroy mappings to inactive
306          * pages.  After that, if a pv entry is still needed, destroy
307          * mappings to active pages.
308          */
309         if (ratecheck(&lastprint, &printinterval))
310                 printf("Approaching the limit on PV entries, "
311                     "increase the vm.pmap.shpgperproc tunable.\n");
312
313         vpq = &vm_page_queues[PQ_INACTIVE];
314 retry:
315         TAILQ_FOREACH(m, &vpq->pl, pageq) {
316                 if (m->hold_count || m->busy)
317                         continue;
318                 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
319                         va = pv->pv_va;
320                         pmap = pv->pv_pmap;
321                         /* Avoid deadlock and lock recursion. */
322                         if (pmap > locked_pmap)
323                                 PMAP_LOCK(pmap);
324                         else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
325                                 continue;
326                         pmap->pm_stats.resident_count--;
327
328                         tte_data = tte_hash_delete(pmap->pm_hash, va);
329
330                         KASSERT((tte_data & VTD_WIRED) == 0,
331                             ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data));
332                         if (tte_data & VTD_REF)
333                                 vm_page_flag_set(m, PG_REFERENCED);
334                         if (tte_data & VTD_W) {
335                                 KASSERT((tte_data & VTD_SW_W),
336                                 ("get_pv_entry: modified page not writable: va: %lx, tte: %lx",
337                                     va, tte_data));
338                                 vm_page_dirty(m);
339                         }
340
341                         pmap_invalidate_page(pmap, va, TRUE);
342                         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
343                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
344                         if (TAILQ_EMPTY(&m->md.pv_list))
345                                 vm_page_flag_clear(m, PG_WRITEABLE);
346                         m->md.pv_list_count--;
347
348                         if (pmap != locked_pmap)
349                                 PMAP_UNLOCK(pmap);
350                         if (allocated_pv == NULL)
351                                 allocated_pv = pv;
352                         else
353                                 free_pv_entry(pv);
354                 }
355         }
356         if (allocated_pv == NULL) {
357                 if (vpq == &vm_page_queues[PQ_INACTIVE]) {
358                         vpq = &vm_page_queues[PQ_ACTIVE];
359                         goto retry;
360                 }
361                 panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
362         }
363         return (allocated_pv);
364 }
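
/*
 * Reclaim path in brief: when the M_NOWAIT zone allocation above fails,
 * get_pv_entry() walks the inactive page queue first and then, if still
 * empty-handed, the active queue, tearing down managed mappings and
 * recycling their pv entries.  Reference and modify bits from each
 * deleted TTE are folded back into the vm_page before the mapping is
 * invalidated, so nothing is lost by the reclaim.
 */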
365
366 /*
367  * Allocate a physical page of memory directly from the phys_avail map.
368  * Can only be called from pmap_bootstrap before avail start and end are
369  * calculated.
370  */
371 static vm_paddr_t
372 pmap_bootstrap_alloc(vm_size_t size)
373 {
374         vm_paddr_t pa;
375         int i;
376
377         size = round_page(size);
378
379         for (i = 0; phys_avail[i + 1] != 0; i += 2) {
380                 if (phys_avail[i + 1] - phys_avail[i] < size)
381                         continue;
382                 pa = phys_avail[i];
383                 phys_avail[i] += size;
384                 pmap_scrub_pages(pa, size);
385                 return (pa);
386         }
387         panic("pmap_bootstrap_alloc");
388 }
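
/*
 * Rough usage sketch: pmap_bootstrap() below calls this repeatedly to
 * carve boot-time structures (kernel hash table, TSBs, MMU fault status
 * areas, msgbuf, thread0's stack) out of the first phys_avail range that
 * is large enough.  Alignment is not requested explicitly; the callers
 * that need 4M alignment simply check the returned address against
 * PAGE_MASK_4M and panic, relying on phys_avail having been rounded to
 * 4M boundaries beforehand.
 */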
389
390 /*
391  * Activate a user pmap.  The pmap must be activated before its address space
392  * can be accessed in any way.
393  */
394 void
395 pmap_activate(struct thread *td)
396 {
397         pmap_t pmap, oldpmap;
398         int err;
399         
400         critical_enter();
401         pmap = vmspace_pmap(td->td_proc->p_vmspace);
402         oldpmap = PCPU_GET(curpmap);
403 #if defined(SMP)
404         atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
405         atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask));
406         atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
407 #else
408         oldpmap->pm_active &= ~1;
409         pmap->pm_active |= 1;
410         pmap->pm_tlbactive |= 1;
411 #endif
412
413         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
414         pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
415         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
416
417         PCPU_SET(curpmap, pmap);
418         if (pmap->pm_context != 0)
419                 if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK)
420                         panic("failed to set TSB 0x%lx - context == %ld\n", 
421                               pmap->pm_tsb_ra, pmap->pm_context);
422         stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context);
423         membar(Sync);
424         critical_exit();
425 }
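
/*
 * Activation sequence, summarized: clear this CPU's bit in the old
 * pmap's active mask, set it in the new pmap's active and tlbactive
 * masks, reload the TTE-hash and TSB scratchpad registers, register the
 * user TSB with the hypervisor for a non-zero context, and only then
 * write the MMU context ID register.  The trailing membar(Sync) is
 * presumably there to order the ASI store against any instructions
 * issued under the new context.
 */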
426
427 vm_offset_t 
428 pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size)
429 {
430         return (va);
431 }
432
433 /*
434  * Bootstrap the system enough to run with virtual memory.
435  */
436 void
437 pmap_bootstrap(vm_offset_t ekva)
438 {
439         struct pmap *pm;
440         vm_offset_t off, va;
441         vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start;
442         vm_size_t physsz, virtsz, kernel_hash_shift;
443         ihandle_t pmem, vmem;
444         int i, j, k, sz;
445         uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable;
446         vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds;
447         
448
449         if ((vmem = OF_finddevice("/virtual-memory")) == -1)
450                 panic("pmap_bootstrap: finddevice /virtual-memory");
451         if ((sz = OF_getproplen(vmem, "translations")) == -1)
452                 panic("pmap_bootstrap: getproplen translations");
453         if (sizeof(translations) < sz)
454                 panic("pmap_bootstrap: translations too small");
455         bzero(translations, sz);
456         if (OF_getprop(vmem, "translations", translations, sz) == -1)
457                 panic("pmap_bootstrap: getprop /virtual-memory/translations");
458         sz /= sizeof(*translations);
459         translations_size = sz;
460         nucleus_memory_start = 0;
461         CTR0(KTR_PMAP, "pmap_bootstrap: translations");
462         qsort(translations, sz, sizeof (*translations), om_cmp);
463
464         for (i = 0; i < sz; i++) {
465                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
466                         translations[i].om_size, translations[i].om_start, 
467                         translations[i].om_tte);
468                 if ((translations[i].om_start >= KERNBASE) && 
469                     (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) {
470                         for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) {
471                                 KDPRINTF("mapping permanent translation\n");
472                                 pa = TTE_GET_PA(translations[i].om_tte) + j;
473                                 va = translations[i].om_start + j;
474                                 error = hv_mmu_map_perm_addr(va, KCONTEXT, 
475                                                              pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB);
476                                 if (error != H_EOK)
477                                         panic("map_perm_addr returned error=%ld", error);
478                                 
479                                 if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start))
480                                         nucleus_memory_start = pa;
481                                 printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa);
482                                 nucleus_mappings[permanent_mappings++] = pa;
483                                 nucleus_memory += PAGE_SIZE_4M;
484 #ifdef SMP
485                                 mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M);
486 #endif
487                         }
488                 }  
489         }
490
491         /*
492          * Find out what physical memory is available from the prom and
493          * initialize the phys_avail array.  This must be done before
494          * pmap_bootstrap_alloc is called.
495          */
496         if ((pmem = OF_finddevice("/memory")) == -1)
497                 panic("pmap_bootstrap: finddevice /memory");
498         if ((sz = OF_getproplen(pmem, "available")) == -1)
499                 panic("pmap_bootstrap: getproplen /memory/available");
500         if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */
501                 panic("pmap_bootstrap: phys_avail too small");
502         if (sizeof(mra) < sz)
503                 panic("pmap_bootstrap: mra too small");
504         bzero(mra, sz);
505         if (OF_getprop(pmem, "available", mra, sz) == -1)
506                 panic("pmap_bootstrap: getprop /memory/available");
507
508         sz /= sizeof(*mra);
509         CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
510
511         qsort(mra, sz, sizeof (*mra), mr_cmp);
512         physmemstart_tunable = physmem_tunable = physmem = physsz = 0;
513         
514         if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) {
515                 KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable);
516         }
517         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) {
518                 physmem = atop(physmem_tunable);
519                 KDPRINTF("desired physmem=0x%lx\n", physmem_tunable);
520         }
521         if ((physmem_tunable != 0) && (physmemstart_tunable != 0))
522                 physmem_tunable += physmemstart_tunable;
523         
524         bzero(real_phys_avail, sizeof(real_phys_avail));
525         bzero(tmp_phys_avail, sizeof(tmp_phys_avail));
526
527         for (i = 0, j = 0; i < sz; i++) {
528                 uint64_t size;
529                 KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size);
530                 if (mra[i].mr_size < PAGE_SIZE_4M)
531                         continue;
532
533                 if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) {
534                         uint64_t newstart, roundup;
535                         newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M);
536                         roundup = newstart - mra[i].mr_start;
537                         size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M;
538                         mra[i].mr_start = newstart;
539                         if (size < PAGE_SIZE_4M)
540                                 continue;
541                         mra[i].mr_size = size;
542                 }
543                 real_phys_avail[j] = mra[i].mr_start;
544                 if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) {
545                         mra[i].mr_size = physmem_tunable - physsz;
546                         physsz = physmem_tunable;
547                         real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
548                         break;
549                 }
550                 physsz += mra[i].mr_size;
551                 real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
552                 j += 2;
553         }
554         physmem = btoc(physsz - physmemstart_tunable);
555
556         /*
557          * This is needed for versions of OFW that would allocate us memory
558          * and then forget to remove it from the available ranges ...
559          * as well as to compensate for the handling of the nucleus pages above.
560          */
561         for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) {
562                 vm_paddr_t start = real_phys_avail[i];
563                 uint64_t end = real_phys_avail[i + 1];
564                 CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end);
565                 KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end);
566                 /* 
567                  * Is kernel memory at the beginning of range?
568                  */
569                 if (nucleus_memory_start == start) {
570                         start += nucleus_memory;
571                 }
572                 /* 
573                  * Is kernel memory at the end of range?
574                  */
575                 if (nucleus_memory_start == (end - nucleus_memory)) 
576                         end -= nucleus_memory;
577
578                 if (physmemstart_tunable != 0 && 
579                     (end < physmemstart_tunable))
580                         continue;
581
582                 if (physmemstart_tunable != 0 && 
583                     ((start < physmemstart_tunable))) {
584                         start = physmemstart_tunable;
585                 }
586
587                 /* 
588                  * Is kernel memory in the middle somewhere?             
589                  */
590                 if ((nucleus_memory_start > start) && 
591                     (nucleus_memory_start < end)) {
592                         phys_avail[j] = start;
593                         phys_avail[j+1] = nucleus_memory_start;
594                         start =  nucleus_memory_start + nucleus_memory;
595                         j += 2;
596                 }
597                 /*
598                  * Break phys_avail up on 4GB boundaries to try
599                  * to work around a PCI-e allocation bug.
600                  * We rely on the fact that kernel memory is allocated
601                  * from the first 4GB of physical memory.
602                  */ 
603                 while (bounds < start)
604                         bounds += (1UL<<32);
605
606                 while (bounds < end) {
607                         phys_avail[j] = start;
608                         phys_avail[j + 1] = bounds;
609                         start = bounds;
610                         bounds += (1UL<<32);
611                         j += 2;
612                 }
613                 phys_avail[j] = start; 
614                 phys_avail[j + 1] = end;
615                 j += 2;
616         }
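
        /*
         * Worked example of the 4GB splitting above (illustrative
         * numbers only): a range [0xf0000000, 0x110000000) straddles
         * the 4GB boundary, so it is emitted as two phys_avail pairs,
         * [0xf0000000, 0x100000000) and [0x100000000, 0x110000000),
         * keeping every phys_avail segment inside a single 4GB window.
         */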
617
618         /*
619          * Merge nucleus memory into real_phys_avail.
620          *
621          */
622         for (i = 0; real_phys_avail[i] != 0; i += 2) {
623                 if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory)
624                         real_phys_avail[i] -= nucleus_memory;
625                 
626                 if (real_phys_avail[i + 1] == nucleus_memory_start)
627                         real_phys_avail[i + 1] += nucleus_memory;
628                 
629                 if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) {
630                         real_phys_avail[i + 1] = real_phys_avail[i + 3];
631                         for (k = i + 2; real_phys_avail[k] != 0; k += 2) {
632                                 real_phys_avail[k] = real_phys_avail[k + 2];
633                                 real_phys_avail[k + 1] = real_phys_avail[k + 3];
634                         }
635                 }
636         }
637         for (i = 0; phys_avail[i] != 0; i += 2)
638                 if (pmap_debug_range || pmap_debug)
639                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
640                         i, phys_avail[i], i+1, phys_avail[i+1]);
641
642         /*
643          * Shuffle the memory range containing the 256MB page with 
644          * nucleus_memory to the beginning of the phys_avail array
645          * so that physical memory from that page is
646          * preferentially allocated.
647          */
648         for (j = 0; phys_avail[j] != 0; j += 2) 
649                 if (nucleus_memory_start < phys_avail[j])
650                         break;
651         /*
652          * Don't shuffle unless we have a full 256M page in the range;
653          * our kernel malloc appears to be horribly brittle.
654          */
655         if ((phys_avail[j + 1] - phys_avail[j]) < 
656             (PAGE_SIZE_256M - nucleus_memory))
657                 goto skipshuffle;
658
659         for (i = j, k = 0; phys_avail[i] != 0; k++, i++)
660                 tmp_phys_avail[k] = phys_avail[i];
661         for (i = 0; i < j; i++)
662                 tmp_phys_avail[k + i] = phys_avail[i];
663         for (i = 0; i < 128; i++)
664                 phys_avail[i] = tmp_phys_avail[i];
665
666 skipshuffle:
667         for (i = 0; real_phys_avail[i] != 0; i += 2)
668                 if (pmap_debug_range || pmap_debug)
669                         printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n",
670                         i, real_phys_avail[i], i+1, real_phys_avail[i+1]);
671
672         for (i = 0; phys_avail[i] != 0; i += 2)
673                 if (pmap_debug_range || pmap_debug)
674                         printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
675                         i, phys_avail[i], i+1, phys_avail[i+1]);
676         /*
677          * Calculate the size of kernel virtual memory, and the size and mask
678          * for the kernel tsb.
679          */
680         virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
681         vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
682
683         /*
684          * Set the start and end of kva.  The kernel is loaded at the first
685          * available 4 meg super page, so round up to the end of the page.
686          */
687         virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
688         virtual_end = vm_max_kernel_address;
689         kernel_vm_end = vm_max_kernel_address;
690
691         /*
692          * Allocate and map a 4MB page for the kernel hashtable 
693          *
694          */
695 #ifndef SIMULATOR
696         kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */
697 #else
698         kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */
699 #endif
700
701         kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT)));
702         if (kernel_hash_pa & PAGE_MASK_4M)
703                 panic("pmap_bootstrap: hashtable pa unaligned\n");
704         /*
705          * Set up TSB descriptors for the hypervisor
706          *
707          */
708 #ifdef notyet
709         tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
710 #else
711         /* avoid alignment complaints from the hypervisor */
712         tsb_8k_size = PAGE_SIZE_4M;
713 #endif
714
715         tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size);
716         if (tsb_8k_pa & PAGE_MASK_4M)
717                 panic("pmap_bootstrap: tsb unaligned\n");
718         KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa);
719
720         tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3;
721         tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size);
722
723         kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K;
724         kernel_td[TSB8K_INDEX].hti_assoc = 1;
725         kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT);
726         kernel_td[TSB8K_INDEX].hti_ctx_index = 0;
727         kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K;
728         kernel_td[TSB8K_INDEX].hti_rsvd = 0;
729         kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa;
730
731         /*
732          * Initialize kernel's private TSB from 8K page TSB
733          *
734          */
735         kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K;
736         kernel_pmap->pm_tsb.hti_assoc = 1;
737         kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT);
738         kernel_pmap->pm_tsb.hti_ctx_index = 0;
739         kernel_pmap->pm_tsb.hti_pgszs = TSB8K;
740         kernel_pmap->pm_tsb.hti_rsvd = 0;
741         kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa;
742         
743         kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb);
744         tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb);
745         
746         /*
747          * Initialize the kernel TSB for 4M pages;
748          * it is currently (not by design) also used for permanent mappings.
749          */
750         
751
752         KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size);
753         kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M;
754         kernel_td[TSB4M_INDEX].hti_assoc = 1;
755         kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT);
756         kernel_td[TSB4M_INDEX].hti_ctx_index = 0;
757         kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M;
758         kernel_td[TSB4M_INDEX].hti_rsvd = 0;
759         kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa;
760         /*
761          * Allocate MMU fault status areas for all CPUs.
762          */
763         mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
764
765         /*
766          * Allocate and map the message buffer.
767          */
768         msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
769         msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
770
771         /*
772          * Allocate a kernel stack with guard page for thread0 and map it into
773          * the kernel tsb.  
774          */
775         pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE);
776         kstack0_phys = pa;
777         virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
778         kstack0 = virtual_avail;
779         virtual_avail += KSTACK_PAGES * PAGE_SIZE;
780         for (i = 0; i < KSTACK_PAGES; i++) {
781                 pa = kstack0_phys + i * PAGE_SIZE;
782                 va = kstack0 + i * PAGE_SIZE;
783                 tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va,
784                             pa | TTE_KERNEL | VTD_8K, 0);
785         }
786         /*
787          * Calculate the last available physical address.
788          */
789         for (i = 0; phys_avail[i + 2] != 0; i += 2)
790                 KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
791                         i, phys_avail[i], i+1, phys_avail[i+1]);
792         KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
793                         i, phys_avail[i], i+1, phys_avail[i+1]);
794
795         Maxmem = sparc64_btop(phys_avail[i + 1]);
796         
797         /*
798          * Add the prom mappings to the kernel tsb.
799          */
800         for (i = 0; i < sz; i++) {
801                 CTR3(KTR_PMAP,
802                     "translation: start=%#lx size=%#lx tte=%#lx",
803                     translations[i].om_start, translations[i].om_size,
804                     translations[i].om_tte);
805                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
806                        translations[i].om_size, translations[i].om_start, 
807                        translations[i].om_tte);
808
809                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
810                     translations[i].om_start > VM_MAX_PROM_ADDRESS) 
811                         continue;
812
813                 for (off = 0; off < translations[i].om_size;
814                      off += PAGE_SIZE) {
815                         va = translations[i].om_start + off;
816                         pa = TTE_GET_PA(translations[i].om_tte) + off;
817                         tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
818                         tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | 
819                                     TTE_KERNEL | VTD_8K, 0);
820                 }
821         }
822
823         if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, 
824                                      vtophys((vm_offset_t)kernel_td))) != H_EOK)
825                 panic("failed to set ctx0 TSBs error: %ld", error);
826
827 #ifdef SMP
828         mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td));
829 #endif
830         /*
831          * Set up direct mappings.
832          * 
833          */
834         for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) {
835                 vm_paddr_t tag_pa = 0, next_pa = 0;
836                 uint64_t size_bits = VTD_4M;
837                 while (pa < real_phys_avail[i + 1]) {
838                         if (use_256M_pages &&
839                             (pa & PAGE_MASK_256M) == 0 && 
840                             ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) {
841                                 tag_pa = pa;
842                                 size_bits = VTD_256M;
843                                 next_pa = pa + PAGE_SIZE_256M;
844                         } else if (next_pa <= pa) {
845                                 tag_pa = pa;
846                                 size_bits = VTD_4M;
847                         }
848                         tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa));
849                         tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), 
850                                          TLB_PHYS_TO_DIRECT(pa), 
851                                          tag_pa | TTE_KERNEL | size_bits, 0);
852                         pa += PAGE_SIZE_4M;
853                 }
854         }
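
        /*
         * Direct map sizing, in brief: the loop above walks each
         * physical range in 4M steps and installs TTEs in the 4M
         * kernel TSB.  When use_256M_pages is set and the current
         * address is 256M-aligned with at least 256M left in the
         * range, the next 64 entries reuse the same 256M-aligned tag
         * with VTD_256M, presumably letting the MMU satisfy them with
         * a single 256M TLB entry; otherwise plain 4M mappings are
         * used.
         */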
855
856         /*
857          * Get the available physical memory ranges from /memory/reg. These
858          * are only used for kernel dumps, but it may not be wise to do prom
859          * calls in that situation.
860          */
861         if ((sz = OF_getproplen(pmem, "reg")) == -1)
862                 panic("pmap_bootstrap: getproplen /memory/reg");
863         if (sizeof(sparc64_memreg) < sz)
864                 panic("pmap_bootstrap: sparc64_memreg too small");
865         if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
866                 panic("pmap_bootstrap: getprop /memory/reg");
867         sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
868
869         pm = kernel_pmap;
870         pm->pm_active = ~0;
871         pm->pm_tlbactive = ~0;
872
873         PMAP_LOCK_INIT(kernel_pmap);
874
875         TAILQ_INIT(&kernel_pmap->pm_pvlist);
876
877         /* 
878          * This could happen earlier - but I put it here to avoid
879          * attempting updates before they are legal.
880          */
881         pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, 
882                                              pmap_bootstrap_alloc(PAGE_SIZE));
883         pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash);
884
885         for (i = 0; i < translations_size; i++) {
886                 KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
887                        translations[i].om_size, translations[i].om_start, 
888                        translations[i].om_tte);
889
890                 if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
891                     translations[i].om_start > VM_MAX_PROM_ADDRESS) {
892                         KDPRINTF("skipping\n");
893                         continue;
894                 }
895                 for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) {
896                         va = translations[i].om_start + off;
897                         pa = TTE_GET_PA(translations[i].om_tte) + off;
898                         tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
899                 }
900                 KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", 
901                        translations[i].om_size, translations[i].om_start, 
902                        translations[i].om_tte);
903         }
904         for (i = 0; i < KSTACK_PAGES; i++) {
905                 pa = kstack0_phys + i * PAGE_SIZE;
906                 va = kstack0 + i * PAGE_SIZE;
907                 tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
908         }
909         /*
910          * Add direct mappings to hash
911          *
912          */
913 #ifdef notyet
914         /* hash only supports 8k pages */
915         for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M)
916                 tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), 
917                                 pa | TTE_KERNEL | VTD_4M);
918 #endif
919
920
921         if (bootverbose)
922                 printf("pmap_bootstrap done\n");
923 }
924
925
926
927 /*
928  *      Routine:        pmap_change_wiring
929  *      Function:       Change the wiring attribute for a map/virtual-address
930  *                      pair.
931  *      In/out conditions:
932  *                      The mapping must already exist in the pmap.
933  */
934 void
935 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
936 {
937         boolean_t iswired;
938         PMAP_LOCK(pmap);
939         iswired = tte_get_virt_bit(pmap, va, VTD_WIRED);
940
941         if (wired && !iswired) {
942                 pmap->pm_stats.wired_count++;
943                 tte_set_virt_bit(pmap, va, VTD_WIRED);
944         } else if (!wired && iswired) {
945                 pmap->pm_stats.wired_count--;
946                 tte_clear_virt_bit(pmap, va, VTD_WIRED);
947         }
948         PMAP_UNLOCK(pmap);
949 }
950
951 void
952 pmap_clear_modify(vm_page_t m)
953 {
954         KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m));
955         tte_clear_phys_bit(m, VTD_W);
956 }
957
958 void
959 pmap_clear_reference(vm_page_t m)
960 {
961         KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m));
962         tte_clear_phys_bit(m, VTD_REF);
963 }
964
965 void
966 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
967           vm_size_t len, vm_offset_t src_addr)
968 {
969         vm_offset_t addr, end_addr;
970
971         end_addr = src_addr + len;
972         /*
973          * Don't let optional prefaulting of pages make us go
974          * way below the low water mark of free pages or way
975          * above the high water mark of used pv entries.
976          */
977         if (cnt.v_free_count < cnt.v_free_reserved ||
978             pv_entry_count > pv_entry_high_water)
979                 return;
980         
981
982         vm_page_lock_queues();
983         if (dst_pmap < src_pmap) {
984                 PMAP_LOCK(dst_pmap);
985                 PMAP_LOCK(src_pmap);
986         } else {
987                 PMAP_LOCK(src_pmap);
988                 PMAP_LOCK(dst_pmap);
989         }
990         for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) {
991                 tte_t tte_data;
992                 vm_page_t m;
993
994                 tte_data = tte_hash_lookup(src_pmap->pm_hash, addr);
995
996                 if ((tte_data & VTD_MANAGED) != 0) {
997                         if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) {
998                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
999
1000                                 tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED));
1001                                 dst_pmap->pm_stats.resident_count++;
1002                                 pmap_insert_entry(dst_pmap, addr, m);
1003                         } 
1004                 }               
1005         }
1006         vm_page_unlock_queues();
1007         PMAP_UNLOCK(src_pmap);
1008         PMAP_UNLOCK(dst_pmap);
1009 }
1010
1011 void
1012 pmap_copy_page(vm_page_t src, vm_page_t dst)
1013 {
1014         vm_paddr_t srcpa, dstpa;
1015         srcpa = VM_PAGE_TO_PHYS(src);
1016         dstpa = VM_PAGE_TO_PHYS(dst);
1017
1018         novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE);
1019
1020
1021 }
1022
1023 static __inline void
1024 pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired)
1025 {
1026
1027         if (wired)
1028                 pmap->pm_stats.wired_count++;
1029         
1030         if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
1031                 pmap_insert_entry(pmap, va, m);
1032                 *tte_data |= VTD_MANAGED;
1033         }
1034 }
1035
1036 /*
1037  * Map the given physical page at the specified virtual address in the
1038  * target pmap with the protection requested.  If specified the page
1039  * will be wired down.
1040  */
1041 void
1042 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
1043            boolean_t wired)
1044 {
1045         vm_paddr_t pa, opa;
1046         uint64_t tte_data, otte_data;
1047         vm_page_t om;
1048         int invlva;
1049
1050         if (pmap->pm_context)
1051                 DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, 
1052                         VM_PAGE_TO_PHYS(m), prot);
1053
1054         om = NULL;
1055         
1056         vm_page_lock_queues();
1057         PMAP_LOCK(pmap);
1058
1059         tte_data = pa = VM_PAGE_TO_PHYS(m);
1060         otte_data = tte_hash_delete(pmap->pm_hash, va);
1061         opa = TTE_GET_PA(otte_data);
1062
1063         if (opa == 0) {
1064                 /*
1065                  * This is a new mapping
1066                  */
1067                 pmap->pm_stats.resident_count++;
1068                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1069
1070         } else if (pa != opa) {
1071                 /*
1072                  * Mapping has changed, handle validating new mapping.
1073                  * 
1074                  */
1075                 if (otte_data & VTD_WIRED)
1076                         pmap->pm_stats.wired_count--;
1077
1078                 if (otte_data & VTD_MANAGED) {
1079                         om = PHYS_TO_VM_PAGE(opa);
1080                         pmap_remove_entry(pmap, om, va);
1081                 }
1082
1083                 pmap_add_tte(pmap, va, m, &tte_data, wired);
1084
1085         } else /* (pa == opa) */ {
1086                 /*
1087                  * Mapping has not changed, must be protection or wiring change.
1088                  */
1089
1090                 /*
1091                  * Wiring change, just update stats. We don't worry about
1092                  * wiring PT pages as they remain resident as long as there
1093                  * are valid mappings in them. Hence, if a user page is wired,
1094                  * the PT page will be also.
1095                  */
1096                 if (wired && ((otte_data & VTD_WIRED) == 0))
1097                         pmap->pm_stats.wired_count++;
1098                 else if (!wired && (otte_data & VTD_WIRED))
1099                         pmap->pm_stats.wired_count--;
1100
1101                 /*
1102                  * We might be turning off write access to the page,
1103                  * so we go ahead and sense modify status.
1104                  */
1105                 if (otte_data & VTD_MANAGED) {
1106                         om = m;
1107                         tte_data |= VTD_MANAGED;
1108                 }
1109         } 
1110
1111         /*
1112          * Now validate mapping with desired protection/wiring.
1113          */
1114         if ((prot & VM_PROT_WRITE) != 0) {
1115                 tte_data |= VTD_SW_W; 
1116                 vm_page_flag_set(m, PG_WRITEABLE);
1117         }
1118         if ((prot & VM_PROT_EXECUTE) != 0)
1119                 tte_data |= VTD_X;
1120         if (wired)
1121                 tte_data |= VTD_WIRED;
1122         if (pmap == kernel_pmap)
1123                 tte_data |= VTD_P;
1124         
1125         invlva = FALSE;
1126         if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
1127                 if (otte_data & VTD_V) {
1128                         if (otte_data & VTD_REF) {
1129                                 if (otte_data & VTD_MANAGED) 
1130                                         vm_page_flag_set(om, PG_REFERENCED);
1131                                 if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X)))
1132                                         invlva = TRUE;
1133                         }
1134                         if (otte_data & VTD_W) {
1135                                 if (otte_data & VTD_MANAGED) 
1136                                         vm_page_dirty(om);
1137                                 if ((pa & VTD_SW_W) != 0) 
1138                                         invlva = TRUE;
1139                         }
1140                         if (invlva)
1141                                 pmap_invalidate_page(pmap, va, TRUE);
1142                 }
1143         } 
1144
1145
1146         tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF);
1147         /*
1148          * XXX this needs to be locked for the threaded / kernel case 
1149          */
1150         tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, 
1151                     pmap->pm_context);
1152
1153         if (tte_hash_needs_resize(pmap->pm_hash))
1154                 pmap_tte_hash_resize(pmap);
1155
1156         /*
1157          * 512 is an arbitrary number of tsb misses
1158          */
1159         if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512)
1160                 pmap_tsb_resize(pmap);
1161
1162         vm_page_unlock_queues();
1163
1164         PMAP_UNLOCK(pmap);
1165 }
1166
1167 /*
1168  * Maps a sequence of resident pages belonging to the same object.
1169  * The sequence begins with the given page m_start.  This page is
1170  * mapped at the given virtual address start.  Each subsequent page is
1171  * mapped at a virtual address that is offset from start by the same
1172  * amount as the page is offset from m_start within the object.  The
1173  * last page in the sequence is the page with the largest offset from
1174  * m_start that can be mapped at a virtual address less than the given
1175  * virtual address end.  Not every virtual page between start and end
1176  * is mapped; only those for which a resident page exists with the
1177  * corresponding offset from m_start are mapped.
1178  */
1179 void
1180 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 
1181                   vm_page_t m_start, vm_prot_t prot)
1182 {
1183         vm_page_t m;
1184         vm_pindex_t diff, psize;
1185
1186         VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
1187         psize = atop(end - start);
1188         m = m_start;
1189         PMAP_LOCK(pmap);
1190         while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
1191                 pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot);
1192                 m = TAILQ_NEXT(m, listq);
1193         }
1194         PMAP_UNLOCK(pmap);
1195 }
1196
1197 void
1198 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1199 {
1200         PMAP_LOCK(pmap);
1201         pmap_enter_quick_locked(pmap, va, m, prot);
1202         PMAP_UNLOCK(pmap);
1203 }
1204
1205 static void
1206 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
1207 {
1208
1209         tte_t tte_data;
1210
1211         if (pmap->pm_context)
1212                 KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", 
1213                         pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot);
1214
1215         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1216         if (tte_hash_lookup(pmap->pm_hash, va))
1217                 return;
1218                 
1219         tte_data = VM_PAGE_TO_PHYS(m);
1220         /*
1221          * Enter on the PV list if part of our managed memory. Note that we
1222          * raise IPL while manipulating pv_table since pmap_enter can be
1223          * called at interrupt time.
1224          */
1225         if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
1226                 pmap_insert_entry(pmap, va, m);
1227                 tte_data |= VTD_MANAGED;
1228         }
1229
1230         pmap->pm_stats.resident_count++;
1231
1232         if ((prot & VM_PROT_EXECUTE) != 0)
1233                 tte_data |= VTD_X;
1234
1235         tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS);
1236 }
1237
1238 /*
1239  * Extract the physical page address associated with the given
1240  * map/virtual_address pair.
1241  */
1242 vm_paddr_t
1243 pmap_extract(pmap_t pmap, vm_offset_t va)
1244 {
1245         vm_paddr_t pa;
1246         tte_t tte_data;
1247
1248         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1249         pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1250
1251         return (pa);
1252 }
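
/*
 * Composition of the return value above, with an illustrative 8K
 * example (assuming the usual mask semantics of TTE_GET_PA() and
 * TTE_GET_PAGE_MASK()): for va = 0x40002abc mapped by a TTE whose
 * physical base is 0x12340000, the result is
 * 0x12340000 | (0x40002abc & 0x1fff) == 0x12340abc.  A va with no
 * hash entry yields tte_data == 0 and hence a meaningless pa, so
 * callers presumably know that the mapping exists.
 */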
1253
1254 /*
1255  * Atomically extract and hold the physical page with the given
1256  * pmap and virtual address pair if that mapping permits the given
1257  * protection.
1258  */
1259 vm_page_t
1260 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1261 {
1262         tte_t tte_data;
1263         vm_page_t m;
1264
1265         m = NULL;
1266         vm_page_lock_queues();
1267         PMAP_LOCK(pmap);
1268         tte_data = tte_hash_lookup(pmap->pm_hash, va);
1269         if (tte_data != 0 && 
1270             ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) {
1271                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1272                 vm_page_hold(m);
1273         }
1274         vm_page_unlock_queues();
1275         PMAP_UNLOCK(pmap);
1276
1277         return (m);
1278 }
1279
1280 void *
1281 pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment)
1282 {
1283         vm_page_t m, tm;
1284         int i;
1285         void *ptr;
1286         
1287         m = NULL;
1288         while (m == NULL) {     
1289                 for (i = 0; phys_avail[i + 1] != 0; i += 2) {
1290                         m = vm_phys_alloc_contig(npages, phys_avail[i], 
1291                                                  phys_avail[i + 1], alignment, (1UL<<34));
1292                         if (m)
1293                                 goto found;
1294                 }
1295                 if (m == NULL) {
1296                         printf("vm_phys_alloc_contig failed - waiting to retry\n");
1297                         VM_WAIT;
1298                 }
1299         }
1300 found:
1301         for (i = 0, tm = m; i < npages; i++, tm++) {
1302                 tm->wire_count++;
1303                 if ((tm->flags & PG_ZERO) == 0)
1304                         pmap_zero_page(tm);
1305         }
1306         ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
1307         
1308         return (ptr);
1309 }
1310
1311 void
1312 pmap_free_contig_pages(void *ptr, int npages)
1313 {
1314         int i;
1315         vm_page_t m;
1316
1317         m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr));
1318         for (i = 0; i < npages; i++, m++) {
1319                 m->wire_count--;
1320                 atomic_subtract_int(&cnt.v_wire_count, 1);
1321                 vm_page_free(m);
1322         }
1323 }
1324
1325 void 
1326 pmap_growkernel(vm_offset_t addr)
1327 {
1328         return;
1329 }
1330
1331 void 
1332 pmap_init(void)
1333 {
1334
1335         /* allocate pv_entry zones */
1336         int shpgperproc = PMAP_SHPGPERPROC;
1337
1338         for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) 
1339                 ctx_stack[ctx_stack_top] = ctx_stack_top;
1340
1341         mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN);
1342
1343         /*
1344          * Initialize the address space (zone) for the pv entries.  Set a
1345          * high water mark so that the system can recover from excessive
1346          * numbers of pv entries.
1347          */
1348         pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
1349             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
1350         TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
1351         pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
1352         TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
1353         pv_entry_high_water = 9 * (pv_entry_max / 10);
1354         uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
1355
1356         tte_hash_init();
1357
1358 }
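
/*
 * PV entry limits computed above, for reference:
 *
 *	pv_entry_max        = shpgperproc * maxproc + cnt.v_page_count
 *	pv_entry_high_water = 9 * (pv_entry_max / 10)
 *
 * shpgperproc defaults to PMAP_SHPGPERPROC (200) and both values may
 * be overridden via the vm.pmap.shpgperproc and vm.pmap.pv_entries
 * tunables; get_pv_entry() starts waking the page daemon once the
 * allocation count crosses the 90% high-water mark.
 */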
1359
1360 /*
1361  * Create a pv entry for page at pa for
1362  * (pmap, va).
1363  */
1364 static void
1365 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1366 {
1367         pv_entry_t pv;
1368
1369         KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1370         pv = get_pv_entry(pmap);
1371         pv->pv_va = va;
1372         pv->pv_pmap = pmap;
1373
1374         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1375         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1376         TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
1377         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1378         m->md.pv_list_count++;
1379 }
1380
1381 #ifdef TRAP_TRACING
1382 static int trap_trace_report_done;
1383 #endif
1384
1385 #ifdef SMP
1386 static cpumask_t
1387 pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2)
1388 {
1389
1390         int i, cpu_count, retried;
1391         u_int cpus;
1392         cpumask_t cpumask, active, curactive;
1393         cpumask_t active_total, ackmask;
1394         uint16_t *cpulist;
1395
1396         retried = 0;
1397
1398         if (!smp_started)
1399                 return (0);
1400
1401         cpumask = PCPU_GET(cpumask);
1402         cpulist = PCPU_GET(cpulist);
1403         curactive = 0;
1404
1405         if (rdpr(pil) != 14)
1406                 panic("pil %ld != 14", rdpr(pil));
1407
1408 #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED
1409         /* by definition cpumask should have curcpu's bit set */
1410         if (cpumask != (1 << curcpu)) 
1411                 panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", 
1412                       cpumask, (1 << curcpu));
1413
1414 #endif
1415 #ifdef notyet
1416         if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0)
1417                 goto done;
1418
1419         if (pmap->pm_context != 0)
1420                 active_total = active = (pmap->pm_tlbactive & ~cpumask);
1421         else 
1422 #endif
1423                 active_total = active = PCPU_GET(other_cpus);
1424
1425         if (active == 0)
1426                 goto done;
1427         
1428  retry:
1429         
1430         for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) {
1431                 if ((cpus & 0x1) == 0)
1432                         continue;
1433                 
1434                 curactive |= (1 << i);
1435                 cpulist[cpu_count] = (uint16_t)i;
1436                 cpu_count++;
1437         }
1438
1439         ackmask = 0;
1440         cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, 
1441                          (uint64_t)arg2, (uint64_t *)&ackmask);
1442
1443         while (ackmask != curactive) {
1444                 membar(Sync);
1445                 i++;
1446                 if (i > 10000000) {
1447 #ifdef TRAP_TRACING
1448                         int j;
1449 #endif
1450                         uint64_t cpu_state;
1451                         printf("cpu with cpumask=0x%x appears to not be responding to ipis\n",
1452                                curactive & ~ackmask);
1453
1454 #ifdef TRAP_TRACING
1455                         if (!trap_trace_report_done) {
1456                                 trap_trace_report_done = 1;
1457                                 for (j = 0; j < MAXCPU; j++)
1458                                         if (((1 << j) & curactive & ~ackmask) != 0) {
1459                                                 struct pcpu *pc = pcpu_find(j);
1460                                                 printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n",
1461                                                     pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3],
1462                                                     pc->pad[4], pc->pad[5], pc->pad[6]);
1463                                                 trap_trace_report(j);
1464                                         }
1465                         }
1466 #endif
1467
1468                         hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state);
1469                         printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state);
1470                         if (!retried) {
1471                                 printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n"
1472                                "and then I'm going to panic\n");
1473
1474                                 retried = 1;
1475                                 goto retry;
1476                         }
1477
1478                         panic("ackmask=0x%x active=0x%x", ackmask, curactive);
1479                 }
1480         }
1481
1482         active_total |= curactive;
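        /*
         * If additional CPUs started using this pmap while the shootdown was
         * in flight, go around again so they are interrupted as well.
         */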
1483         if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) {
1484                 printf("pmap_ipi: retrying\n");
1485                 goto retry;
1486         }
1487  done:
1488         return (active_total);
1489 }
1490 #endif
1491
1492 void
1493 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb)
1494 {
1495
1496         if (cleartsb == TRUE)
1497                 tsb_clear_tte(&pmap->pm_tsb, va);
1498
1499         DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va);
1500         spinlock_enter();
1501         invlpg(va, pmap->pm_context);
1502 #ifdef SMP
1503         pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context);
1504 #endif
1505         spinlock_exit();
1506 }
1507
1508 void
1509 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb)
1510 {
1511         vm_offset_t tva, invlrngva;
1512         char *func;
1513 #ifdef SMP
1514         cpumask_t active;
1515 #endif
1516         if ((eva - sva) == PAGE_SIZE) {
1517                 pmap_invalidate_page(pmap, sva, cleartsb);
1518                 return;
1519         }
1520         
1521
1522         KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva));
1523
1524         if (cleartsb == TRUE) 
1525                 tsb_clear_range(&pmap->pm_tsb, sva, eva);
1526
1527         spinlock_enter();
1528         if ((eva - sva) < PAGE_SIZE*64) {
1529                 for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
1530                         invlpg(tva, pmap->pm_context);
1531                 func = tl_invlrng;
1532         } else if (pmap->pm_context) {
1533                 func = tl_invlctx;
1534                 invlctx(pmap->pm_context);
1535
1536         } else {
1537                 func = tl_invltlb;
1538                 invltlb();
1539         }
1540 #ifdef SMP
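        /*
         * Pack the range into a single 64-bit argument for the remote
         * handler: sva is page aligned, so its low bits are free to carry
         * the page count of the range for the tl_invlrng handler.
         */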
1541         invlrngva = sva | ((eva - sva) >> PAGE_SHIFT);
1542         active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva);
1543         active &= ~pmap->pm_active;
1544         atomic_clear_int(&pmap->pm_tlbactive, active);
1545 #endif
1546         spinlock_exit();
1547 }
1548
1549 void
1550 pmap_invalidate_all(pmap_t pmap)
1551 {
1552
1553         KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap"));
1554
1555         tsb_clear(&pmap->pm_tsb);
1556
1557         spinlock_enter();
1558         invlctx(pmap->pm_context);
1559 #ifdef SMP
1560         pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0);
1561         pmap->pm_tlbactive = pmap->pm_active;
1562 #endif
1563         spinlock_exit();
1564 }
1565
1566 boolean_t
1567 pmap_is_modified(vm_page_t m)
1568 {
1569
1570         return (tte_get_phys_bit(m, VTD_W));
1571 }
1572
1573
1574 boolean_t 
1575 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
1576 {
1577         return (tte_hash_lookup(pmap->pm_hash, va) == 0);
1578 }
1579
1580 /*
1581  * Extract the physical page address associated with the given kernel virtual
1582  * address.
1583  */
1584
1585 vm_paddr_t
1586 pmap_kextract(vm_offset_t va)
1587 {
1588         tte_t tte_data;
1589         vm_paddr_t pa;
1590
1591         pa = 0;
1592         if (va > KERNBASE && va < KERNBASE + nucleus_memory) {
1593                 uint64_t offset;
1594                 offset = va - KERNBASE; 
1595                 pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M);
1596         }
1597         if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0)
1598                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1599
1600         if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0)
1601                 pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data));
1602
1603         return (pa);
1604 }
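
/*
 * Minimal usage sketch, not compiled in: pmap_kextract() is what a kernel
 * helper would call to learn the physical address backing a kernel virtual
 * address, e.g. before handing a buffer to hardware.  The function name
 * below is hypothetical.
 */
#if 0
static vm_paddr_t
example_kva_to_phys(void *kva)
{
        vm_paddr_t pa;

        pa = pmap_kextract((vm_offset_t)kva);
        KASSERT(pa != 0, ("example_kva_to_phys: unmapped KVA %p", kva));
        return (pa);
}
#endif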
1605
1606 /*
1607  * Map a range of physical addresses into kernel virtual address space.
1608  *
1609  * The value passed in *virt is a suggested virtual address for the mapping.
1610  * Architectures which can support a direct-mapped physical to virtual region
1611  * can return the appropriate address within that region, leaving '*virt'
1612  * unchanged.
1613  */
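/*
 * On sun4v all of physical memory is covered by the direct map, so this
 * implementation simply returns TLB_PHYS_TO_DIRECT(start) and leaves both
 * '*virt' and 'prot' untouched.
 */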
1614 vm_offset_t
1615 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1616 {
1617         return (TLB_PHYS_TO_DIRECT(start));
1618 }
1619
1620 int 
1621 pmap_mincore(pmap_t pmap, vm_offset_t addr)
1622 {
1623         return (0);
1624 }
1625
1626 void 
1627 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 
1628                     vm_pindex_t index, vm_size_t size)
1629 {
1630         printf("pmap_object_init_pt\n");
1631         return;
1632 }
1633
1634 /*
1635  * Returns true if the pmap's pv is one of the first
1636  * 16 pvs linked to from this page.  This count may
1637  * be changed upwards or downwards in the future; it
1638  * is only necessary that true be returned for a small
1639  * subset of pmaps for proper page aging.
1640  */
1641 boolean_t
1642 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
1643 {
1644         pv_entry_t pv;
1645         int loops = 0;
1646
1647         if (m->flags & PG_FICTITIOUS)
1648                 return (FALSE);
1649
1650         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1651         TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1652                 if (pv->pv_pmap == pmap) {
1653                         return (TRUE);
1654                 }
1655                 loops++;
1656                 if (loops >= 16)
1657                         break;
1658         }       
1659         return (FALSE);
1660 }
1661
1662 /*
1663  * Initialize a vm_page's machine-dependent fields.
1664  */
1665 void
1666 pmap_page_init(vm_page_t m)
1667 {
1668
1669         TAILQ_INIT(&m->md.pv_list);
1670         m->md.pv_list_count = 0;
1671 }
1672 /*
1673  * Lower the permission for all mappings to a given page.
1674  */
1675 void
1676 pmap_remove_write(vm_page_t m)
1677 {
1678         if ((m->flags & PG_WRITEABLE) == 0)
1679                 return;
1680         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1681         tte_clear_phys_bit(m, VTD_SW_W|VTD_W);
1682         vm_page_flag_clear(m, PG_WRITEABLE);
1683 }
1684 /*
1685  * Initialize the pmap associated with process 0.
1686  */
1687 void
1688 pmap_pinit0(pmap_t pmap)
1689 {
1690         PMAP_LOCK_INIT(pmap);
1691         pmap->pm_active = pmap->pm_tlbactive = ~0;
1692         pmap->pm_context = 0;
1693         pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra;
1694         pmap->pm_hash = kernel_pmap->pm_hash;
1695         critical_enter();
1696         PCPU_SET(curpmap, pmap);
1697         critical_exit();
1698         TAILQ_INIT(&pmap->pm_pvlist);
1699         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1700 }
1701
1702 /*
1703  * Initialize a preallocated and zeroed pmap structure, such as one in a
1704  * vmspace structure.
1705  */
1706 void
1707 pmap_pinit(pmap_t pmap)
1708 {
1709         int i;
1710
1711         pmap->pm_context = get_context();
1712         pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb);
1713
1714         vm_page_lock_queues();
1715         pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch);
1716         tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT);
1717         vm_page_unlock_queues();
1718         pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
1719         pmap->pm_active = pmap->pm_tlbactive = 0;
1720         for (i = 0; i < TSB_MAX_RESIZE; i++)
1721                 pmap->pm_old_tsb_ra[i] = 0;
1722
1723         TAILQ_INIT(&pmap->pm_pvlist);
1724         PMAP_LOCK_INIT(pmap);
1725         bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1726 }
1727
1728 /*
1729  * Set the physical protection on the specified range of this map as requested.
1730  */
1731 void
1732 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
1733 {
1734
1735         int anychanged;
1736         vm_offset_t tva;
1737         uint64_t clearbits;
1738
1739         DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot);
1740         
1741         if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
1742                 pmap_remove(pmap, sva, eva);
1743                 return;
1744         }
1745         
1746         if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 
1747             (VM_PROT_WRITE|VM_PROT_EXECUTE))
1748                 return;
1749
1750         clearbits = anychanged = 0;
1751         
1752         if ((prot & VM_PROT_WRITE) == 0)
1753                 clearbits |= (VTD_W|VTD_SW_W);
1754         if ((prot & VM_PROT_EXECUTE) == 0)
1755                 clearbits |= VTD_X;
1756
1757         vm_page_lock_queues();
1758         PMAP_LOCK(pmap);
1759         for (tva = sva; tva < eva; tva += PAGE_SIZE) {
1760                 uint64_t otte_data;
1761                 vm_page_t m;
1762
1763                 if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, 
1764                                                      clearbits)) == 0)
1765                         continue;
1766                 /*
1767                  * XXX technically we should also do a shootdown if the
1768                  * mapping was referenced and executable but no longer is.
1769                  */
1770                 if (!anychanged && (otte_data & VTD_W))
1771                         anychanged = 1;
1772                 
1773                 if (otte_data & VTD_MANAGED) {
1774                         m = NULL;
1775
1776                         if (otte_data & VTD_REF) {
1777                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1778                                 vm_page_flag_set(m, PG_REFERENCED);
1779                         }
1780                         if (otte_data & VTD_W) {
1781                                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
1782                                 vm_page_dirty(m);
1783                         }
1784                 } 
1785         }
1786
1787         vm_page_unlock_queues();
1788         if (anychanged)
1789                 pmap_invalidate_range(pmap, sva, eva, TRUE);
1790         PMAP_UNLOCK(pmap);
1791 }
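
/*
 * Illustrative sketch, not compiled in: how a caller's protection request
 * maps onto this routine -- dropping VM_PROT_WRITE clears VTD_W/VTD_SW_W in
 * every mapped TTE of the range and finishes with at most one ranged
 * shootdown instead of a per-page IPI.  The wrapper name is hypothetical.
 */
#if 0
static void
example_make_readonly(pmap_t pm, vm_offset_t start, vm_offset_t end)
{
        /* Read and execute are retained; write permission is removed. */
        pmap_protect(pm, start, end, VM_PROT_READ | VM_PROT_EXECUTE);
}
#endif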
1792
1793 /*
1794  * Map a list of wired pages into kernel virtual address space.  This is
1795  * intended for temporary mappings which do not need page modification or
1796  * references recorded.  Existing mappings in the region are overwritten.
1797  */
1798 void
1799 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
1800 {
1801         vm_offset_t va;
1802         tte_t otte;
1803         
1804         otte = 0;
1805         va = sva;
1806         while (count-- > 0) {
1807                 otte |= tte_hash_update(kernel_pmap->pm_hash, va,  
1808                                         VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K);
1809                 va += PAGE_SIZE;
1810                 m++;
1811         }
1812         if ((otte & VTD_REF) != 0)
1813                 pmap_invalidate_range(kernel_pmap, sva, va, FALSE);
1814 }
1815
1816 /*
1817  * Remove page mappings from kernel virtual address space.  Intended for
1818  * temporary mappings entered by pmap_qenter.
1819  */
1820 void
1821 pmap_qremove(vm_offset_t sva, int count)
1822 {
1823         vm_offset_t va;
1824         tte_t otte;
1825
1826         va = sva;
1827
1828         otte = 0;
1829         while (count-- > 0) {
1830                 otte |= tte_hash_delete(kernel_pmap->pm_hash, va);
1831                 va += PAGE_SIZE;
1832         }
1833         if ((otte & VTD_REF) != 0)
1834                 pmap_invalidate_range(kernel_pmap, sva, va, TRUE);
1835 }
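
/*
 * Illustrative sketch, not compiled in: the usual pairing of pmap_qenter()
 * and pmap_qremove() for a short-lived kernel mapping of a set of pages.
 * 'va' must come from a kernel virtual address range the caller has already
 * reserved (for instance with kmem_alloc_nofault()); the function name is
 * hypothetical.
 */
#if 0
static void
example_temporary_mapping(vm_offset_t va, vm_page_t *pages, int npages)
{
        pmap_qenter(va, pages, npages);
        /* ... access the pages through 'va' ... */
        pmap_qremove(va, npages);
}
#endif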
1836
1837 /*
1838  * Release any resources held by the given physical map.
1839  * Called when a pmap initialized by pmap_pinit is being released.
1840  * Should only be called if the map contains no valid mappings.
1841  */
1842 void
1843 pmap_release(pmap_t pmap)
1844 {
1845         KASSERT(pmap->pm_stats.resident_count == 0,
1846             ("pmap_release: pmap resident count %ld != 0",
1847             pmap->pm_stats.resident_count));
1848
1849         tsb_deinit(&pmap->pm_tsb);
1850         tte_hash_destroy(pmap->pm_hash);
1851         free_context(pmap->pm_context);
1852         PMAP_LOCK_DESTROY(pmap);
1853 }
1854
1855 /*
1856  * Remove the given range of addresses from the specified map.
1857  */
1858 void
1859 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
1860 {
1861         int invlva;
1862         vm_offset_t tva;
1863         uint64_t tte_data;
1864         /*
1865          * Perform an unsynchronized read.  This is, however, safe.
1866          */
1867         if (pmap->pm_stats.resident_count == 0)
1868                 return;
1869         
1870         DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", 
1871                 start, end);
1872         invlva = 0;
1873         vm_page_lock_queues();
1874         PMAP_LOCK(pmap);
1875         for (tva = start; tva < end; tva += PAGE_SIZE) {
1876                 if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0)
1877                         continue;
1878                 pmap_remove_tte(pmap, tte_data, tva);
1879                 if (tte_data & (VTD_REF|VTD_W))
1880                         invlva = 1;
1881         }
1882         vm_page_unlock_queues();
1883         if (invlva)
1884                 pmap_invalidate_range(pmap, start, end, TRUE);
1885         PMAP_UNLOCK(pmap);
1886 }
1887
1888 /*
1889  *      Routine:        pmap_remove_all
1890  *      Function:
1891  *              Removes this physical page from
1892  *              all physical maps in which it resides.
1893  *              Reflects back modify bits to the pager.
1894  *
1895  *      Notes:
1896  *              Original versions of this routine were very
1897  *              inefficient because they iteratively called
1898  *              pmap_remove (slow...)
1899  */
1900
1901 void
1902 pmap_remove_all(vm_page_t m)
1903 {
1904         pv_entry_t pv;
1905         uint64_t tte_data;
1906         DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m));
1907
1908         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1909         while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
1910                 PMAP_LOCK(pv->pv_pmap);
1911                 pv->pv_pmap->pm_stats.resident_count--;
1912
1913                 tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va);
1914
1915                 if (tte_data & VTD_WIRED)
1916                         pv->pv_pmap->pm_stats.wired_count--;
1917                 if (tte_data & VTD_REF)
1918                         vm_page_flag_set(m, PG_REFERENCED);
1919                 
1920                 /*
1921                  * Update the vm_page_t clean and reference bits.
1922                  */
1923                 if (tte_data & VTD_W) {
1924                         KASSERT((tte_data & VTD_SW_W),
1925         ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx",
1926                             pv->pv_va, tte_data));
1927                         vm_page_dirty(m);
1928                 }
1929         
1930                 pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE);
1931                 TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
1932                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1933                 m->md.pv_list_count--;
1934                 PMAP_UNLOCK(pv->pv_pmap);
1935                 free_pv_entry(pv);
1936         }
1937         vm_page_flag_clear(m, PG_WRITEABLE);
1938 }
1939
1940 static void
1941 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1942 {
1943         pv_entry_t pv;
1944         if (pmap != kernel_pmap)
1945                 DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
1946         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1947         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1948         if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
1949                 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1950                         if (pmap == pv->pv_pmap && va == pv->pv_va) 
1951                                 break;
1952                 }
1953         } else {
1954                 TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
1955                         if (va == pv->pv_va) 
1956                                 break;
1957                 }
1958         }
1959         KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m)));
1960         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1961         m->md.pv_list_count--;
1962         if (TAILQ_EMPTY(&m->md.pv_list))
1963                 vm_page_flag_clear(m, PG_WRITEABLE);
1964         TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
1965         free_pv_entry(pv);
1966 }
1967
1968
1969 void
1970 pmap_remove_pages(pmap_t pmap)
1971 {
1972         
1973         vm_page_t m;
1974         pv_entry_t pv, npv;
1975         tte_t tte_data;
1976         
1977         DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context);
1978         vm_page_lock_queues();
1979         PMAP_LOCK(pmap);
1980         for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
1981                 tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va);
1982
1983                 if (tte_data == 0) {
1984                         printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va);
1985                         panic("bad tte");
1986                 }
1987                 if (tte_data & VTD_WIRED) {
1988                         panic("wired page in process not handled correctly");
1989                         pmap->pm_stats.wired_count--;
1990                 }
1991                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
1992
1993                 pmap->pm_stats.resident_count--;
1994                 
1995                 if (tte_data & VTD_W) {
1996                         vm_page_dirty(m);
1997                 }
1998                 
1999                 npv = TAILQ_NEXT(pv, pv_plist);
2000                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
2001                 
2002                 m->md.pv_list_count--;
2003                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2004                 if (TAILQ_EMPTY(&m->md.pv_list))
2005                         vm_page_flag_clear(m, PG_WRITEABLE);
2006
2007                 free_pv_entry(pv);
2008         }
2009         pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch);
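        /*
         * pmap_tsb_reset() would free any grown TSB copies and shrink the
         * TSB back to its initial size here, but that path is currently
         * disabled.
         */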
2010         if (0)
2011                 pmap_tsb_reset(pmap);
2012
2013         vm_page_unlock_queues();
2014         pmap_invalidate_all(pmap);
2015         PMAP_UNLOCK(pmap);
2016 }
2017
2018 static void
2019 pmap_tsb_reset(pmap_t pmap)
2020 {
2021         int i;
2022
2023         for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) {
2024                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), 
2025                                        (1 << (TSB_INIT_SHIFT + i)));
2026                 pmap->pm_old_tsb_ra[i] = 0;
2027         }
2028         if (pmap->pm_old_tsb_ra[0] != 0) {
2029                 vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra;
2030                 int size = tsb_size(&pmap->pm_tsb);
2031                 pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT));
2032                 pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0];
2033                 pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size);
2034                 pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT;
2035                 pmap->pm_old_tsb_ra[0] = 0;
2036         }
2037 }
2038
2039 void
2040 pmap_scrub_pages(vm_paddr_t pa, int64_t size)
2041 {
2042         uint64_t bytes_zeroed;
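        /*
         * The hypervisor may scrub only a leading portion of the request in
         * a single call and reports how many bytes it actually zeroed, so
         * keep advancing until the whole range has been covered.
         */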
2043         while (size > 0) {
2044                 hv_mem_scrub(pa, size, &bytes_zeroed);
2045                 pa += bytes_zeroed;
2046                 size -= bytes_zeroed;
2047         }
2048 }
2049
2050 static void
2051 pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va)
2052 {
2053         
2054         vm_page_t m;
2055
2056         if (pmap != kernel_pmap)
2057                 DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data));
2058
2059         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2060         PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2061         if (tte_data & VTD_WIRED)
2062                 pmap->pm_stats.wired_count--;
2063
2064         pmap->pm_stats.resident_count--;
2065         
2066         if (tte_data & VTD_MANAGED) {
2067                 m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
2068                 if (tte_data & VTD_W) {
2069                         vm_page_dirty(m);       
2070                 }
2071                 if (tte_data & VTD_REF) 
2072                         vm_page_flag_set(m, PG_REFERENCED);
2073                 pmap_remove_entry(pmap, m, va);
2074         }
2075 }
2076
2077 /* resize the tsb if the number of capacity misses is greater than half of
2078  * the total misses
2079  */
2080 static void
2081 pmap_tsb_resize(pmap_t pmap)
2082 {
2083         uint32_t miss_count;
2084         uint32_t cap_miss_count;
2085         struct tsb_resize_info info;
2086         hv_tsb_info_t hvtsb;
2087         uint64_t tsbscratch;
2088
2089         KASSERT(pmap == curthread_pmap, ("operating on non-current pmap"));
2090         miss_count = pmap->pm_tsb_miss_count;
2091         cap_miss_count = pmap->pm_tsb_cap_miss_count;
2092         int npages_shift = tsb_page_shift(pmap);
2093
2094         if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && 
2095             cap_miss_count > (miss_count >> 1)) {
2096                 DPRINTF("resizing tsb for proc=%s pid=%d\n", 
2097                         curthread->td_proc->p_comm, curthread->td_proc->p_pid);
2098                 pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra;
2099
2100                 /* double TSB size */
2101                 tsb_init(&hvtsb, &tsbscratch, npages_shift + 1);
2102 #ifdef SMP
2103                 spinlock_enter();
2104                 /* reset tsb */
2105                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t));
2106                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2107
2108                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2109                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2110                               pmap->pm_tsb_ra, pmap->pm_context);
2111                 info.tri_tsbscratch = pmap->pm_tsbscratch;
2112                 info.tri_tsb_ra = pmap->pm_tsb_ra;
2113                 pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info));
2114                 pmap->pm_tlbactive = pmap->pm_active;
2115                 spinlock_exit();
2116 #else 
2117                 bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb));
2118                 if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
2119                         panic("failed to set TSB 0x%lx - context == %ld\n", 
2120                               pmap->pm_tsb_ra, pmap->pm_context);
2121                 pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
2122 #endif          
2123         }
2124         pmap->pm_tsb_miss_count = 0;
2125         pmap->pm_tsb_cap_miss_count = 0;
2126 }
2127
2128 static void
2129 pmap_tte_hash_resize(pmap_t pmap)
2130 {
2131         tte_hash_t old_th = pmap->pm_hash;
2132         
2133         pmap->pm_hash = tte_hash_resize(pmap->pm_hash);
2134         spinlock_enter();
2135         if (curthread->td_proc->p_numthreads != 1) 
2136                 pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch);
2137
2138         pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);   
2139         spinlock_exit();
2140         tte_hash_destroy(old_th);
2141 }
2142
2143 /*
2144  *      pmap_ts_referenced:
2145  *
2146  *      Return a count of reference bits for a page, clearing those bits.
2147  *      It is not necessary for every reference bit to be cleared, but it
2148  *      is necessary that 0 only be returned when there are truly no
2149  *      reference bits set.
2150  *
2151  *      XXX: The exact number of bits to check and clear is a matter that
2152  *      should be tested and standardized at some point in the future for
2153  *      optimal aging of shared pages.
2154  */
2155
2156 int
2157 pmap_ts_referenced(vm_page_t m)
2158 {
2159         
2160         int rv;
2161         pv_entry_t pv, pvf, pvn;
2162         pmap_t pmap;
2163         tte_t otte_data;
2164
2165         rv = 0;
2166         if (m->flags & PG_FICTITIOUS)
2167                 return (rv);
2168
2169         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2170         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2171                 
2172                 pvf = pv;
2173
2174                 do {
2175                         pvn = TAILQ_NEXT(pv, pv_list);
2176                         
2177                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2178                         
2179                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
2180                         
2181                         pmap = pv->pv_pmap;
2182                         PMAP_LOCK(pmap);
2183                         otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF);
2184                         if ((otte_data & VTD_REF) != 0) {
2185                                 pmap_invalidate_page(pmap, pv->pv_va, TRUE);
2186                                 
2187                                 rv++;
2188                                 if (rv > 4) {
2189                                         PMAP_UNLOCK(pmap);
2190                                         break;
2191                                 }
2192                         }
2193                 
2194                         PMAP_UNLOCK(pmap);
2195                 } while ((pv = pvn) != NULL && pv != pvf);
2196         }
2197         return (rv);
2198 }
2199
2200 void
2201 pmap_zero_page(vm_page_t m)
2202 {
2203         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2204 }
2205
2206 void
2207 pmap_zero_page_area(vm_page_t m, int off, int size)
2208 {
2209         vm_paddr_t pa;
2210         vm_offset_t va;
2211                 
2212         pa = VM_PAGE_TO_PHYS(m);
2213         va = TLB_PHYS_TO_DIRECT(pa);
2214         if (off == 0 && size == PAGE_SIZE)
2215                 hwblkclr((void *)va, PAGE_SIZE);
2216         else
2217                 bzero((char *)(va + off), size);
2218
2219 }
2220
2221 void
2222 pmap_zero_page_idle(vm_page_t m)
2223 {
2224         hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
2225 }
2226
2227 void
2228 pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap)
2229 {
2230         panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx",
2231               pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error);
2232         
2233 }