3 * Copyright (c) 2004 Christian Limpach.
4 * Copyright (c) 2004-2006,2008 Kip Macy
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed by Christian Limpach.
18 * 4. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
36 #include <sys/param.h>
37 #include <sys/systm.h>
39 #include <sys/mount.h>
40 #include <sys/malloc.h>
41 #include <sys/kernel.h>
42 #include <sys/reboot.h>
43 #include <sys/sysproto.h>
45 #include <machine/xen/xen-os.h>
49 #include <machine/segments.h>
50 #include <machine/pcb.h>
51 #include <machine/stdarg.h>
52 #include <machine/vmparam.h>
53 #include <machine/cpu.h>
54 #include <machine/intr_machdep.h>
55 #include <machine/md_var.h>
56 #include <machine/asmacros.h>
60 #include <machine/xen/hypervisor.h>
61 #include <machine/xen/xenvar.h>
62 #include <machine/xen/xenfunc.h>
63 #include <machine/xen/xenpmap.h>
64 #include <machine/xen/xenbus.h>
65 #include <machine/xen/xenfunc.h>
66 #include <xen/interface/memory.h>
67 #include <machine/xen/features.h>
69 #include <machine/privatespace.h>
73 #include <vm/vm_page.h>
/*
 * IDTVEC(name) pastes an X prefix onto a vector name, so IDTVEC(div) is Xdiv.
 * The list below names the handler symbols; trap_table[] later in this file
 * takes the address of most of them.
 */
76 #define IDTVEC(name) __CONCAT(X,name)
79 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
80 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
81 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
82 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
83 IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
87 start_info_t *xen_start_info;
88 shared_info_t *HYPERVISOR_shared_info;
89 xen_pfn_t *xen_machine_phys = machine_to_phys_mapping;
90 xen_pfn_t *xen_phys_machine;
91 int preemptable, init_first;
92 extern unsigned int avail_space;
101 __asm__("pushl %edx;"
114 __asm__("pushl %edx;"
126 * Modify the cmd_line by converting each ',' to a NUL character so that it
127 * is in a format suitable for the static env vars.
/*
 * Advances cmd_line past leading blanks, logs what remains through printk(),
 * then walks the string with strsep() using "," as the delimiter until
 * strsep() returns NULL.
 */
130 xen_setbootenv(char *cmd_line)
134 /* Skip leading spaces */
135 for (; *cmd_line == ' '; cmd_line++);
137 printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line);
/* Each strsep() call replaces the next ',' with a string terminator. */
139 for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;);
148 {"boot_askname", RB_ASKNAME},
149 {"boot_single", RB_SINGLE},
150 {"boot_nosync", RB_NOSYNC},
151 {"boot_halt", RB_ASKNAME},
152 {"boot_serial", RB_SERIAL},
153 {"boot_cdrom", RB_CDROM},
154 {"boot_gdb", RB_GDB},
155 {"boot_gdb_pause", RB_RESERVED1},
156 {"boot_verbose", RB_VERBOSE},
157 {"boot_multicons", RB_MULTIPLE},
/*
 * Builds a boothowto bit mask from the environment: for every howto_names[]
 * entry whose variable is present (getenv() returns non-NULL), the entry's
 * mask is OR-ed into howto.  The envp argument is not referenced by the
 * lines shown here.
 */
162 xen_boothowto(char *envp)
166 /* get equivalents from the environment */
167 for (i = 0; howto_names[i].ev != NULL; i++)
168 if (getenv(howto_names[i].ev) != NULL)
169 howto |= howto_names[i].mask;
#define PRINTK_BUFSIZE 1024

/*
 * Format a message into a static buffer and pass it to the Xen console
 * through HYPERVISOR_console_write().
 *
 * vsnprintf() reports the length the output would have had without
 * truncation, so the value is clamped to what the buffer actually holds
 * before it is used as the write length.  Without the clamp, a message
 * longer than the buffer made the hypercall read past the end of buf.
 *
 * Returns the number of bytes handed to the console (at most
 * PRINTK_BUFSIZE - 1).
 */
int
printk(const char *fmt, ...)
{
	static char buf[PRINTK_BUFSIZE];
	va_list ap;
	int retval;

	va_start(ap, fmt);
	retval = vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);

	if (retval < 0)
		retval = 0;
	else if (retval >= (int)sizeof(buf))
		retval = (int)sizeof(buf) - 1;	/* output was truncated */

	(void)HYPERVISOR_console_write(buf, retval);
	return (retval);
}
/*
 * Storage and accessor macros for the queue of pending MMU updates.
 * Two layouts appear below: arrays with one row per virtual CPU (indexed by
 * the local variable vcpu that SET_VCPU() declares) and plain single arrays.
 * NOTE(review): the preprocessor conditionals that select between the two
 * layouts are not present in this text.
 */
191 #define XPQUEUE_SIZE 128
199 /* per-cpu queues and indices */
201 static struct mmu_log xpq_queue_log[MAX_VIRT_CPUS][XPQUEUE_SIZE];
204 static int xpq_idx[MAX_VIRT_CPUS];
205 static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE];
207 #define XPQ_QUEUE xpq_queue[vcpu]
208 #define XPQ_IDX xpq_idx[vcpu]
209 #define SET_VCPU() int vcpu = smp_processor_id()
211 #define XPQ_QUEUE_LOG xpq_queue_log[vcpu]
214 static mmu_update_t xpq_queue[XPQUEUE_SIZE];
215 static struct mmu_log xpq_queue_log[XPQUEUE_SIZE];
216 static int xpq_idx = 0;
218 #define XPQ_QUEUE_LOG xpq_queue_log
219 #define XPQ_QUEUE xpq_queue
220 #define XPQ_IDX xpq_idx
/* Note: this expansion already ends in ';'. */
224 #define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1);
230 int _xpq_idx = XPQ_IDX;
236 printk("xen_dump_queue(): %u entries\n", _xpq_idx);
237 for (i = 0; i < _xpq_idx; i++) {
238 printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr);
/*
 * Submits the queued MMU updates to the hypervisor.
 *
 * The entry count is snapshotted from XPQ_IDX, then that many entries of
 * XPQ_QUEUE are passed to HYPERVISOR_mmu_update() for DOMID_SELF.  The CTR*
 * statements trace the queued val/ptr pairs (low 32 bits) under KTR_PMAP.
 * If the hypercall returns a negative value, every queued pair is printed
 * and the kernel panics.
 */
245 _xen_flush_queue(void)
248 int _xpq_idx = XPQ_IDX;
250 /* window of vulnerability here? */
252 if (__predict_true(gdtset))
255 /* Make sure index is cleared first to avoid double updates. */
256 error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE,
257 _xpq_idx, NULL, DOMID_SELF);
260 if (__predict_true(gdtset))
261 for (i = _xpq_idx; i > 0;) {
263 CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx "
264 "ptr: %lx val: %lx ptr: %lx",
265 (XPQ_QUEUE[i-1].val & 0xffffffff),
266 (XPQ_QUEUE[i-1].ptr & 0xffffffff),
267 (XPQ_QUEUE[i-2].val & 0xffffffff),
268 (XPQ_QUEUE[i-2].ptr & 0xffffffff),
269 (XPQ_QUEUE[i-3].val & 0xffffffff),
270 (XPQ_QUEUE[i-3].ptr & 0xffffffff));
273 CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx",
274 (XPQ_QUEUE[i-1].val & 0xffffffff),
275 (XPQ_QUEUE[i-1].ptr & 0xffffffff),
276 (XPQ_QUEUE[i-2].val & 0xffffffff),
277 (XPQ_QUEUE[i-2].ptr & 0xffffffff));
280 CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx",
281 (XPQ_QUEUE[i-1].val & 0xffffffff),
282 (XPQ_QUEUE[i-1].ptr & 0xffffffff));
287 if (__predict_true(gdtset))
289 if (__predict_false(error < 0)) {
290 for (i = 0; i < _xpq_idx; i++)
291 printf("val: %llx ptr: %llx\n",
292 XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr);
293 panic("Failed to execute MMU updates: %d", error);
/*
 * Flushes the pending MMU-update queue, skipping the call to
 * _xen_flush_queue() when the queue index is zero.
 */
299 xen_flush_queue(void)
302 if (XPQ_IDX != 0) _xen_flush_queue();
/*
 * Queue-index bookkeeping: tests whether XPQ_IDX has reached XPQUEUE_SIZE,
 * i.e. whether the queue is full.
 * NOTE(review): the statement executed when the queue is full is not present
 * in this text; presumably it flushes the queue - confirm.
 */
306 xen_increment_idx(void)
311 if (__predict_false(XPQ_IDX == XPQUEUE_SIZE))
/*
 * Asserts (KASSERT) that no MMU updates are pending, i.e. XPQ_IDX is zero.
 */
316 xen_check_queue(void)
321 KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
326 xen_invlpg(vm_offset_t va)
329 op.cmd = MMUEXT_INVLPG_ALL;
330 op.arg1.linear_addr = va & ~PAGE_MASK;
331 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
/*
 * Installs a new page-table base.  Asserts that the MMU-update queue is
 * empty, translates val with xpmap_ptom(), and passes the resulting frame
 * number to the hypervisor as an MMUEXT_NEW_BASEPTR operation.  PANIC_IF
 * fires when the hypercall returns a negative value.
 */
335 xen_load_cr3(u_int val)
341 KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
343 op.cmd = MMUEXT_NEW_BASEPTR;
344 op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT;
345 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
/*
 * Restores an interrupt-enable state through __restore_flags().
 * The eflags value is reduced to 1 when PSL_I is clear and 0 when it is set
 * before being handed on.
 * NOTE(review): lines between the header and this assignment are missing
 * from this text, so the assignment may be conditional - confirm.
 */
349 xen_restore_flags(u_int eflags)
353 eflags = ((eflags & PSL_I) == 0);
355 __restore_flags(eflags);
/*
 * Thin wrapper around the __save_and_cli() macro, which fills in the local
 * eflags.  Going by the macro name it saves the current interrupt state and
 * disables interrupts - confirm against the macro definition.
 */
359 xen_save_and_cli(void)
363 __save_and_cli(eflags);
383 return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2);
/*
 * Queues a machine-to-physical table update: the next free XPQ_QUEUE slot
 * gets ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE and val = pfn.
 * The caller's file and line are stored in the parallel XPQ_QUEUE_LOG slot.
 * NOTE(review): the statements guarded by the gdtset tests, and the index
 * increment, are not present in this text.
 */
387 _xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line)
391 if (__predict_true(gdtset))
393 XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
394 XPQ_QUEUE[XPQ_IDX].val = pfn;
396 XPQ_QUEUE_LOG[XPQ_IDX].file = file;
397 XPQ_QUEUE_LOG[XPQ_IDX].line = line;
400 if (__predict_true(gdtset))
/*
 * Queues a page-table entry write: the next free XPQ_QUEUE slot gets
 * ptr = ptr | MMU_NORMAL_PT_UPDATE and val = val, both widened to 64 bits.
 * ptr must be 8-byte aligned (KASSERT).  Once gdtset is true the caller must
 * hold vm_page_queue_mtx.  The caller's file and line are stored in the
 * parallel XPQ_QUEUE_LOG slot.
 * NOTE(review): the statements guarded by the second and third gdtset tests
 * are not present in this text.
 */
405 _xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line)
409 if (__predict_true(gdtset))
410 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
412 KASSERT((ptr & 7) == 0, ("misaligned update"));
414 if (__predict_true(gdtset))
417 XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE;
418 XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val;
420 XPQ_QUEUE_LOG[XPQ_IDX].file = file;
421 XPQ_QUEUE_LOG[XPQ_IDX].line = line;
424 if (__predict_true(gdtset))
429 xen_pgdpt_pin(vm_paddr_t ma)
432 op.cmd = MMUEXT_PIN_L3_TABLE;
433 op.arg1.mfn = ma >> PAGE_SHIFT;
435 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
439 xen_pgd_pin(vm_paddr_t ma)
442 op.cmd = MMUEXT_PIN_L2_TABLE;
443 op.arg1.mfn = ma >> PAGE_SHIFT;
445 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
449 xen_pgd_unpin(vm_paddr_t ma)
452 op.cmd = MMUEXT_UNPIN_TABLE;
453 op.arg1.mfn = ma >> PAGE_SHIFT;
455 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
459 xen_pt_pin(vm_paddr_t ma)
462 op.cmd = MMUEXT_PIN_L1_TABLE;
463 op.arg1.mfn = ma >> PAGE_SHIFT;
464 printk("xen_pt_pin(): mfn=%x\n", op.arg1.mfn);
466 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
470 xen_pt_unpin(vm_paddr_t ma)
473 op.cmd = MMUEXT_UNPIN_TABLE;
474 op.arg1.mfn = ma >> PAGE_SHIFT;
476 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
480 xen_set_ldt(vm_paddr_t ptr, unsigned long len)
483 op.cmd = MMUEXT_SET_LDT;
484 op.arg1.linear_addr = ptr;
485 op.arg2.nr_ents = len;
487 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
490 void xen_tlb_flush(void)
493 op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
495 PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
499 xen_update_descriptor(union descriptor *table, union descriptor *entry)
504 ptp = vtopte((vm_offset_t)table);
505 pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK);
506 if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry))
507 panic("HYPERVISOR_update_descriptor failed\n");
513 * Bitmap is indexed by page number. If bit is set, the page is part of a
514 * xen_create_contiguous_region() area of memory.
516 unsigned long *contiguous_bitmap;
519 contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages)
521 unsigned long start_off, end_off, curr_idx, end_idx;
523 curr_idx = first_page / BITS_PER_LONG;
524 start_off = first_page & (BITS_PER_LONG-1);
525 end_idx = (first_page + nr_pages) / BITS_PER_LONG;
526 end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
528 if (curr_idx == end_idx) {
529 contiguous_bitmap[curr_idx] |=
530 ((1UL<<end_off)-1) & -(1UL<<start_off);
532 contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
533 while ( ++curr_idx < end_idx )
534 contiguous_bitmap[curr_idx] = ~0UL;
535 contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
540 contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages)
542 unsigned long start_off, end_off, curr_idx, end_idx;
544 curr_idx = first_page / BITS_PER_LONG;
545 start_off = first_page & (BITS_PER_LONG-1);
546 end_idx = (first_page + nr_pages) / BITS_PER_LONG;
547 end_off = (first_page + nr_pages) & (BITS_PER_LONG-1);
549 if (curr_idx == end_idx) {
550 contiguous_bitmap[curr_idx] &=
551 -(1UL<<end_off) | ((1UL<<start_off)-1);
553 contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
554 while ( ++curr_idx != end_idx )
555 contiguous_bitmap[curr_idx] = 0;
556 contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
561 /* Ensure multi-page extents are contiguous in machine memory. */
/*
 * Replaces the machine frames behind pages[0 .. npages) with one
 * machine-contiguous extent.
 *
 * npages must be a power of two (PANIC_IF otherwise).  Each frame is
 * released with XENMEM_decrease_reservation and its P2M slot invalidated.
 * A single extent of 2^order frames is then requested with
 * XENMEM_increase_reservation (address_bits = 31).  The new frames are
 * recorded in the machine-to-physical and physical-to-machine tables, and
 * the range is marked in contiguous_bitmap.
 *
 * The trailing section requests order-0 frames one at a time.
 * NOTE(review): it looks like the fallback for a failed contiguous
 * allocation, but its label and the branch to it are not present in this
 * text - confirm.
 */
563 xen_create_contiguous_region(vm_page_t pages, int npages)
565 unsigned long mfn, i, flags;
567 struct xen_memory_reservation reservation = {
572 set_xen_guest_handle(reservation.extent_start, &mfn);
576 /* can currently only handle power of two allocation */
577 PANIC_IF(ffs(npages) != fls(npages));
579 /* 0. determine order */
/* ffs == fls is guaranteed by the check above, so this is fls(npages) - 1. */
580 order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages);
582 /* 1. give away machine pages. */
583 for (i = 0; i < (1 << order); i++) {
585 pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
587 PFNTOMFN(pfn) = INVALID_P2M_ENTRY;
588 PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1);
592 /* 2. Get a new contiguous memory extent. */
593 reservation.extent_order = order;
594 /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not
595 * running with a broxen driver XXXEN
597 reservation.address_bits = 31;
598 if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1)
601 /* 3. Map the new extent in place of old pages. */
602 for (i = 0; i < (1 << order); i++) {
604 pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
605 xen_machphys_update(mfn+i, pfn);
606 PFNTOMFN(pfn) = mfn+i;
612 contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order);
615 balloon_unlock(flags);
620 reservation.extent_order = 0;
621 reservation.address_bits = 0;
623 for (i = 0; i < (1 << order); i++) {
625 pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
626 PANIC_IF(HYPERVISOR_memory_op(
627 XENMEM_increase_reservation, &reservation) != 1);
628 xen_machphys_update(mfn, pfn);
634 balloon_unlock(flags);
/*
 * Undoes xen_create_contiguous_region() for the npages-page region mapped
 * at addr.
 *
 * npages must be a power of two (PANIC_IF otherwise).  The range is cleared
 * in contiguous_bitmap.  Each page's mapping is then zapped with
 * HYPERVISOR_update_va_mapping() and its frame released with
 * XENMEM_decrease_reservation.  Finally a fresh frame is requested per page
 * with XENMEM_increase_reservation, mapped at the same virtual address with
 * PG_KERNEL, and recorded with xen_machphys_update().
 *
 * NOTE(review): scrub_pages() below uses vstart and order before order is
 * assigned; presumably that call is compiled out by a conditional that is
 * not present in this text - confirm.
 */
640 xen_destroy_contiguous_region(void *addr, int npages)
642 unsigned long mfn, i, flags, order, pfn0;
643 struct xen_memory_reservation reservation = {
648 set_xen_guest_handle(reservation.extent_start, &mfn);
650 pfn0 = vtophys(addr) >> PAGE_SHIFT;
652 scrub_pages(vstart, 1 << order);
654 /* can currently only handle power of two allocation */
655 PANIC_IF(ffs(npages) != fls(npages));
657 /* 0. determine order */
658 order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages);
663 contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order);
666 /* 1. Zap current PTEs, giving away the underlying pages. */
667 for (i = 0; i < (1 << order); i++) {
669 uint64_t new_val = 0;
670 pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT;
672 PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0));
673 PFNTOMFN(pfn) = INVALID_P2M_ENTRY;
674 PANIC_IF(HYPERVISOR_memory_op(
675 XENMEM_decrease_reservation, &reservation) != 1);
678 /* 2. Map new pages in place of old pages. */
679 for (i = 0; i < (1 << order); i++) {
683 PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1);
685 new_val = mfn << PAGE_SHIFT;
686 PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE),
687 new_val, PG_KERNEL));
688 xen_machphys_update(mfn, pfn);
694 balloon_unlock(flags);
697 extern unsigned long cpu0prvpage;
698 extern unsigned long *SMPpt;
699 extern struct user *proc0uarea;
700 extern vm_offset_t proc0kstack;
701 extern int vm86paddr, vm86phystk;
702 char *bootmem_start, *bootmem_current, *bootmem_end;
704 pteinfo_t *pteinfo_list;
705 void initvalues(start_info_t *startinfo);
707 struct ringbuf_head *xen_store; /* XXX move me */
/*
 * Bump allocator over the region [bootmem_start, bootmem_end).
 * Takes the current position as the allocation, panics (PANIC_IF) if
 * 'size' bytes would run past bootmem_end, then advances bootmem_current
 * by 'size'.
 */
711 bootmem_alloc(unsigned int size)
715 retptr = bootmem_current;
716 PANIC_IF(retptr + size > bootmem_end);
717 bootmem_current += size;
/*
 * Releases the most recent bootmem allocation by moving bootmem_current
 * back by 'size'.  Only last-in-first-out release is accepted: PANIC_IF
 * fires unless tptr equals bootmem_current - size, or if stepping back
 * would go below bootmem_start.
 */
723 bootmem_free(void *ptr, unsigned int size)
728 PANIC_IF(tptr != bootmem_current - size ||
729 bootmem_current - size < bootmem_start);
731 bootmem_current -= size;
/*
 * Translates a machine address to a physical address: the frame number is
 * looked up in machine_to_phys_mapping[] and the bits of mpa outside
 * PG_FRAME are carried over unchanged.
 */
736 xpmap_mtop2(vm_paddr_t mpa)
738 return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT)
739 ) | (mpa & ~PG_FRAME);
/*
 * Returns the page-directory entry for 'va' from the boot page directory at
 * xen_start_info->pt_base; the index is va >> 22.
 */
743 xpmap_get_bootpde(vm_paddr_t va)
746 return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22];
/*
 * Returns the boot page-directory entry for 'va' with its frame rewritten
 * as a kernel virtual address: the machine frame is converted with
 * xpmap_mtop2() and KERNBASE is added.  If the entry is not valid (PG_V
 * clear), only its non-frame bits are returned.
 */
750 xpmap_get_vbootpde(vm_paddr_t va)
754 pde = xpmap_get_bootpde(va);
755 if ((pde & PG_V) == 0)
756 return (pde & ~PG_FRAME);
757 return (pde & ~PG_FRAME) |
758 (xpmap_mtop2(pde & PG_FRAME) + KERNBASE);
/*
 * Returns a pointer to the boot page-table entry for 'va'.  The page table
 * is located through xpmap_get_vbootpde() and indexed with the PT_MASK bits
 * of va.
 * NOTE(review): the statement taken when the PDE is not valid is not
 * present in this text.
 */
762 xpmap_get_bootptep(vm_paddr_t va)
766 pde = xpmap_get_vbootpde(va);
767 if ((pde & PG_V) == 0)
769 #define PT_MASK 0x003ff000 /* page table address bits */
770 return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]);
/*
 * Returns the boot page-table entry for 'va' by dereferencing the pointer
 * obtained from xpmap_get_bootptep().
 */
774 xpmap_get_bootpte(vm_paddr_t va)
777 return xpmap_get_bootptep(va)[0];
/*
 * Reorders the phys_machine table one page at a time, using a scratch page
 * from bootmem_alloc() as swap space: walking down from the last page, each
 * page is exchanged with the page below it.  Afterwards every entry i is
 * re-registered with xen_machphys_update(), and the first page of the table
 * is filled by memset() with INVALID_P2M_ENTRY.
 * NOTE(review): memset() stores only the low byte of its value argument, so
 * this yields INVALID_P2M_ENTRY words only if all of its bytes are equal -
 * confirm.
 */
784 shift_phys_machine(unsigned long *phys_machine, int nr_pages)
787 unsigned long *tmp_page, *current_page, *next_page;
790 tmp_page = bootmem_alloc(PAGE_SIZE);
791 current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long));
792 next_page = current_page - (PAGE_SIZE/sizeof(unsigned long));
793 bcopy(phys_machine, tmp_page, PAGE_SIZE);
795 while (current_page > phys_machine) {
797 bcopy(next_page, tmp_page, PAGE_SIZE);
798 /* shift down page */
799 bcopy(current_page, next_page, PAGE_SIZE);
801 bcopy(tmp_page, current_page, PAGE_SIZE);
803 current_page -= (PAGE_SIZE/sizeof(unsigned long));
804 next_page -= (PAGE_SIZE/sizeof(unsigned long));
806 bootmem_free(tmp_page, PAGE_SIZE);
808 for (i = 0; i < nr_pages; i++) {
809 xen_machphys_update(phys_machine[i], i);
811 memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE);
814 #endif /* ADD_ISA_HOLE */
816 extern unsigned long physfree;
/*
 * Early boot setup driven by the start_info page handed over by Xen.
 *
 * Visible steps: records startinfo in xen_start_info and the P2M list in
 * xen_phys_machine; carves boot-time regions (bootmem pool, gdt, ldt,
 * shared info, xenstore ring, console page, proc0 kstack, vm86 area) out of
 * the space following the initial page tables by advancing cur_space;
 * builds a new PDPT with four page directories, loads it with
 * xen_load_cr3() and pins it; allocates and pins the remaining nkpt
 * page-table pages; maps the shared-info, xenstore and console frames; and
 * finally publishes physfree and init_first.
 * NOTE(review): many lines, including the conditionals that select between
 * the alternative code paths, are not present in this text.
 */
822 initvalues(start_info_t *startinfo)
824 int l3_pages, l2_pages, l1_pages, offset;
825 vm_offset_t cur_space, cur_space_pt;
826 struct physdev_set_iopl set_iopl;
828 vm_paddr_t KPTphys, IdlePTDma;
829 vm_paddr_t console_page_ma, xen_store_ma;
830 vm_offset_t KPTphysoff, tmpva;
833 vm_paddr_t IdlePDPTma, IdlePDPTnewma;
834 vm_paddr_t IdlePTDnewma[4];
835 pd_entry_t *IdlePDPTnew, *IdlePTDnew;
837 vm_paddr_t pdir_shadow_ma;
844 max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt),
845 NPGPTD*NPDEPG - KPTDI),
846 (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT);
854 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
857 * need to install handler
859 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
861 xen_start_info = startinfo;
862 xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list;
864 /* number of pages allocated after the pts + 1*/;
865 cur_space = xen_start_info->pt_base +
866 ((xen_start_info->nr_pt_frames) + 3 )*PAGE_SIZE;
867 printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space);
869 printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n",
870 KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base),
871 xen_start_info->nr_pt_frames);
872 xendebug_flags = 0; /* 0xffffffff; */
874 /* allocate 4 pages for bootmem allocator */
875 bootmem_start = bootmem_current = (char *)cur_space;
876 cur_space += (4 * PAGE_SIZE);
877 bootmem_end = (char *)cur_space;
879 /* allocate page for gdt */
880 gdt = (union descriptor *)cur_space;
881 cur_space += PAGE_SIZE*ncpus;
883 /* allocate page for ldt */
884 ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE;
885 cur_space += PAGE_SIZE;
/* Reserve one page each for the shared-info, xenstore and console mappings. */
887 HYPERVISOR_shared_info = (shared_info_t *)cur_space;
888 cur_space += PAGE_SIZE;
890 xen_store = (struct ringbuf_head *)cur_space;
891 cur_space += PAGE_SIZE;
893 console_page = (char *)cur_space;
894 cur_space += PAGE_SIZE;
897 shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages);
900 * pre-zero unused mapped pages - mapped on 4MB boundary
903 IdlePDPT = (pd_entry_t *)startinfo->pt_base;
904 IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base));
906 * Note that only one page directory has been allocated at this point.
909 IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE);
910 IdlePTDma = xpmap_ptom(VTOP(IdlePTD));
913 IdlePTD = (pd_entry_t *)startinfo->pt_base;
914 IdlePTDma = xpmap_ptom(VTOP(startinfo->pt_base));
918 l1_pages = xen_start_info->nr_pt_frames - l2_pages - l3_pages;
920 KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE;
922 KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff));
923 XENPRINTF("IdlePTD %p\n", IdlePTD);
924 XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx "
925 "mod_start: 0x%lx mod_len: 0x%lx\n",
926 xen_start_info->nr_pages, xen_start_info->shared_info,
927 xen_start_info->flags, xen_start_info->pt_base,
928 xen_start_info->mod_start, xen_start_info->mod_len);
929 /* Map proc0's KSTACK */
931 proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE);
932 printk("proc0kstack=%u\n", proc0kstack);
934 /* vm86/bios stack */
935 cur_space += PAGE_SIZE;
937 /* Map space for the vm86 region */
938 vm86paddr = (vm_offset_t)cur_space;
939 cur_space += (PAGE_SIZE * 3);
942 IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE;
943 bzero(IdlePDPTnew, PAGE_SIZE);
945 IdlePDPTnewma = xpmap_ptom(VTOP(IdlePDPTnew));
946 IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE;
947 bzero(IdlePTDnew, 4*PAGE_SIZE);
949 for (i = 0; i < 4; i++)
951 xpmap_ptom(VTOP((uint8_t *)IdlePTDnew + i*PAGE_SIZE));
955 * Copy the 4 machine addresses of the new PTDs in to the PDPT
958 for (i = 0; i < 4; i++)
959 IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V;
964 * re-map the new PDPT read-only
966 PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V);
969 * Unpin the current PDPT
971 xen_pt_unpin(IdlePDPTma);
973 for (i = 0; i < 20; i++) {
974 int startidx = ((KERNBASE >> 18) & PAGE_MASK) >> 3;
976 if (IdlePTD[startidx + i] == 0) {
984 /* unmap remaining pages from initial 4MB chunk
987 for (tmpva = cur_space; (tmpva & ((1<<22)-1)) != 0; tmpva += PAGE_SIZE) {
988 bzero((char *)tmpva, PAGE_SIZE);
989 PT_SET_MA(tmpva, (vm_paddr_t)0);
996 memcpy(((uint8_t *)IdlePTDnew) + ((unsigned int)(KERNBASE >> 18)),
997 ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK),
998 l1_pages*sizeof(pt_entry_t));
1000 for (i = 0; i < 4; i++) {
1001 PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE,
1002 IdlePTDnewma[i] | PG_V);
1004 xen_load_cr3(VTOP(IdlePDPTnew));
1005 xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew)));
1007 /* allocate remainder of nkpt pages */
1008 cur_space_pt = cur_space;
1009 for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt;
1010 i++, cur_space += PAGE_SIZE) {
1011 pdir = (offset + i) / NPDEPG;
1012 curoffset = ((offset + i) % NPDEPG);
1013 if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS)
1017 * make sure that all the initial page table pages
1020 PT_SET_MA(cur_space_pt,
1021 xpmap_ptom(VTOP(cur_space)) | PG_V | PG_RW);
1022 bzero((char *)cur_space_pt, PAGE_SIZE);
1023 PT_SET_MA(cur_space_pt, (vm_paddr_t)0);
1024 xen_pt_pin(xpmap_ptom(VTOP(cur_space)));
1025 xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] +
1026 curoffset*sizeof(vm_paddr_t)),
1027 xpmap_ptom(VTOP(cur_space)) | PG_KERNEL);
1031 for (i = 0; i < 4; i++) {
1032 pdir = (PTDPTDI + i) / NPDEPG;
1033 curoffset = (PTDPTDI + i) % NPDEPG;
1035 xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] +
1036 curoffset*sizeof(vm_paddr_t)),
1037 IdlePTDnewma[i] | PG_V);
1042 IdlePTD = IdlePTDnew;
1043 IdlePDPT = IdlePDPTnew;
1044 IdlePDPTma = IdlePDPTnewma;
1047 * shared_info is an unsigned long so this will randomly break if
1048 * it is allocated above 4GB - I guess people are used to that
1049 * sort of thing with Xen ... sigh
1051 shinfo = xen_start_info->shared_info;
1052 PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL);
1057 xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT);
1058 PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL);
1059 console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT);
1060 PT_SET_MA(console_page, console_page_ma | PG_KERNEL);
1063 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = (unsigned long)xen_phys_machine;
1066 PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl));
1069 /* add page table for KERNBASE */
1070 xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t),
1071 xpmap_ptom(VTOP(cur_space) | PG_KERNEL));
1074 xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t),
1075 xpmap_ptom(VTOP(cur_space) | PG_V | PG_A));
1077 xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t),
1078 xpmap_ptom(VTOP(cur_space) | PG_V | PG_A));
1081 cur_space += PAGE_SIZE;
1085 if (xen_start_info->flags & SIF_INITDOMAIN) {
1086 /* Map first megabyte */
1087 for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE)
1088 PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD);
1093 * re-map kernel text read-only
1096 for (i = (((vm_offset_t)&btext) & ~PAGE_MASK);
1097 i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE)
1098 PT_SET_MA(i, xpmap_ptom(VTOP(i)) | PG_V | PG_A);
/* Everything below cur_space is now in use; record where free memory starts. */
1101 physfree = VTOP(cur_space);
1102 init_first = physfree >> PAGE_SHIFT;
1103 IdlePTD = (pd_entry_t *)VTOP(IdlePTD);
1104 IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT);
1105 setup_xen_features();
1106 printk("#8, proc0kstack=%u\n", proc0kstack);
/*
 * Table of trap handlers, one trap_info_t per vector.
 * Each initializer lists, in order: the vector number, a small integer that
 * is 0 or 3 (presumably the privilege level allowed to raise the vector -
 * confirm against trap_info_t), the kernel code selector, and the address
 * of the IDTVEC() handler.  Vectors 2 and 8 have no entry.
 */
1110 trap_info_t trap_table[] = {
1111 { 0, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)},
1112 { 1, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)},
1113 { 3, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)},
1114 { 4, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)},
1115 /* This is UPL on Linux and KPL on BSD */
1116 { 5, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)},
1117 { 6, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)},
1118 { 7, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)},
1120 * { 8, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)},
1121 * no handler for double fault
1123 { 9, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)},
1124 {10, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)},
1125 {11, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)},
1126 {12, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)},
1127 {13, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)},
1128 {14, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)},
1129 {15, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)},
1130 {16, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)},
1131 {17, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)},
1132 {18, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)},
1133 {19, 0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)},
1134 {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)},
/*
 * xenbus watch callback for the "control/shutdown" node.
 *
 * Inside a xenbus transaction it reads control/shutdown and writes an empty
 * string back to the node.  The transaction is ended with the abort flag if
 * the read failed.  An EAGAIN result from ending the transaction frees the
 * string that was read.  The string is then compared against "reboot",
 * "poweroff", "halt" and "suspend" to choose the action; anything else is
 * reported and ignored.  Suspend is reported as unsupported.  The chosen
 * action is carried out with reboot(), and the string is freed.
 * NOTE(review): several statements (retry label, howto assignments, early
 * returns) are not present in this text.
 */
1140 shutdown_handler(struct xenbus_watch *watch,
1141 const char **vec, unsigned int len)
1144 struct xenbus_transaction xbt;
1146 struct reboot_args uap;
1151 err = xenbus_transaction_start(&xbt);
1154 str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
1155 /* Ignore read errors and empty reads. */
1156 if (XENBUS_IS_ERR_READ(str)) {
1157 xenbus_transaction_end(xbt, 1);
1161 xenbus_write(xbt, "control", "shutdown", "");
1163 err = xenbus_transaction_end(xbt, 0);
1164 if (err == EAGAIN) {
1165 free(str, M_DEVBUF);
1169 if (strcmp(str, "reboot") == 0)
1171 else if (strcmp(str, "poweroff") == 0)
1172 howto |= (RB_POWEROFF | RB_HALT);
1173 else if (strcmp(str, "halt") == 0)
1175 else if (strcmp(str, "suspend") == 0)
1178 printf("Ignoring shutdown request: %s\n", str);
1188 printf("suspend not currently supported\n");
1193 reboot(curthread, &uap);
1195 free(str, M_DEVBUF);
/*
 * Watch descriptor that ties the "control/shutdown" xenstore node to
 * shutdown_handler(); registered by setup_shutdown_watcher().
 */
1198 static struct xenbus_watch shutdown_watch = {
1199 .node = "control/shutdown",
1200 .callback = shutdown_handler
1204 void setup_shutdown_watcher(void *unused);
1208 setup_shutdown_watcher(void *unused)
1210 if (register_xenbus_watch(&shutdown_watch))
1211 printf("Failed to set shutdown watcher\n");
1215 SYSINIT(shutdown, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, setup_shutdown_watcher, NULL);
/*
 * Suspend/resume path for the domain.
 *
 * Visible steps: requires running on CPU 0; unmaps the shared-info page;
 * converts the xenstore and console frame numbers in xen_start_info from
 * machine to pseudo-physical with mfn_to_pfn(); calls HYPERVISOR_suspend();
 * and, once that returns, remaps shared info and rebuilds the
 * pfn_to_mfn_frame_list tables published in HYPERVISOR_shared_info.
 * NOTE(review): the body mixes in Linux-style identifiers (KERN_WARNING,
 * for_each_online_cpu, CONFIG_HOTPLUG_CPU) and contains an #error
 * directive; presumably most of it is compiled out by conditionals that
 * are not present in this text - confirm.
 */
1220 xen_suspend(void *ignore)
1224 extern void time_resume(void);
1225 extern unsigned long max_pfn;
1226 extern unsigned long *pfn_to_mfn_frame_list_list;
1227 extern unsigned long *pfn_to_mfn_frame_list[];
1230 #error "do_suspend must be run cpu 0 - need to create separate thread"
1231 cpumask_t prev_online_cpus;
1232 int vcpu_prepare(int vcpu);
1237 PANIC_IF(smp_processor_id() != 0);
1239 #if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
1240 if (num_online_cpus() > 1) {
1241 printk(KERN_WARNING "Can't suspend SMP guests "
1242 "without CONFIG_HOTPLUG_CPU\n");
1252 * Take all other CPUs offline. We hold the hotplug semaphore to
1253 * avoid other processes bringing up CPUs under our feet.
1255 cpus_clear(prev_online_cpus);
1256 while (num_online_cpus() > 1) {
1257 for_each_online_cpu(i) {
1260 unlock_cpu_hotplug();
1264 printk(KERN_CRIT "Failed to take all CPUs "
1265 "down: %d.\n", err);
1266 goto out_reenable_cpus;
1268 cpu_set(i, prev_online_cpus);
1271 #endif /* CONFIG_SMP */
1279 unlock_cpu_hotplug();
1283 pmap_kremove(HYPERVISOR_shared_info);
1285 xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
1286 xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn);
1289 * We'll stop somewhere inside this hypercall. When it returns,
1290 * we'll start resuming after the restore.
1292 HYPERVISOR_suspend(VTOMFN(xen_start_info));
1294 pmap_kenter_ma(HYPERVISOR_shared_info, xen_start_info->shared_info);
1295 set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);
1298 memset(empty_zero_page, 0, PAGE_SIZE);
1300 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
1301 VTOMFN(pfn_to_mfn_frame_list_list);
1303 fpp = PAGE_SIZE/sizeof(unsigned long);
1304 for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
1305 if ((j % fpp) == 0) {
1307 pfn_to_mfn_frame_list_list[k] =
1308 VTOMFN(pfn_to_mfn_frame_list[k]);
1311 pfn_to_mfn_frame_list[k][j] =
1312 VTOMFN(&phys_to_machine_mapping[i]);
1314 HYPERVISOR_shared_info->arch.max_pfn = max_pfn;
1332 * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
1333 * the VCPU hotplug callback can race with our vcpu_prepare
1339 for_each_cpu_mask(i, prev_online_cpus) {
1341 if ((j != 0) && !cpu_online(i)) {
1342 printk(KERN_CRIT "Failed to bring cpu "
1343 "%d back up (%d).\n",
1353 /********** CODE WORTH KEEPING ABOVE HERE *****************/
/*
 * Failsafe callback: never expected to run, so it panics unconditionally.
 */
1355 void xen_failsafe_handler(void);
1358 xen_failsafe_handler(void)
1361 panic("xen_failsafe_handler called!\n");
/*
 * Batches the hypercalls needed on a thread switch into one multicall:
 * a stack_switch to the kernel data selector with the new pcb as stack
 * pointer, plus an update_descriptor for each per-CPU fsgs_gdt slot whose
 * contents differ from the descriptor saved in the incoming pcb.
 * Descriptors are compared and copied as two 32-bit words / one 64-bit
 * value.
 * NOTE(review): the statements that move a and b on from the fs descriptor
 * to the gs descriptor between the two checks are not present in this text.
 */
1364 void xen_handle_thread_switch(struct pcb *pcb);
1366 /* This is called by cpu_switch() when switching threads. */
1367 /* The pcb arg refers to the process control block of the */
1368 /* next thread which is to run */
1370 xen_handle_thread_switch(struct pcb *pcb)
1372 uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0];
1373 uint32_t *b = (uint32_t *)&pcb->pcb_fsd;
1374 multicall_entry_t mcl[3];
1377 /* Notify Xen of task switch */
1378 mcl[i].op = __HYPERVISOR_stack_switch;
1379 mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL);
1380 mcl[i++].args[1] = (unsigned long)pcb;
1382 /* Check for update of fsd */
1383 if (*a != *b || *(a+1) != *(b+1)) {
1384 mcl[i].op = __HYPERVISOR_update_descriptor;
1385 *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
1386 *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
1392 /* Check for update of gsd */
1393 if (*a != *b || *(a+1) != *(b+1)) {
1394 mcl[i].op = __HYPERVISOR_update_descriptor;
1395 *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
1396 *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
1399 (void)HYPERVISOR_multicall(mcl, i);