/*
 *
 * Copyright (c) 2004 Christian Limpach.
 * Copyright (c) 2004-2006,2008 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Christian Limpach.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/reboot.h>
#include <sys/sysproto.h>

#include <machine/xen/xen-os.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/segments.h>
#include <machine/pcb.h>
#include <machine/stdarg.h>
#include <machine/vmparam.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <machine/md_var.h>
#include <machine/asmacros.h>

#include <machine/xen/hypervisor.h>
#include <machine/xen/xenvar.h>
#include <machine/xen/xenfunc.h>
#include <machine/xen/xenpmap.h>
#include <machine/xen/xenbus.h>
#include <xen/interface/memory.h>
#include <machine/xen/features.h>
#ifdef SMP
#include <machine/privatespace.h>
#endif

#include <vm/vm_page.h>

#define IDTVEC(name)    __CONCAT(X,name)

extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
        IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
        IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
        IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
        IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

int xendebug_flags;
start_info_t *xen_start_info;
shared_info_t *HYPERVISOR_shared_info;
xen_pfn_t *xen_machine_phys = machine_to_phys_mapping;
xen_pfn_t *xen_phys_machine;
int preemptable, init_first;
extern unsigned int avail_space;

void ni_cli(void);
void ni_sti(void);

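/*
 * ni_cli()/ni_sti() disable and enable event delivery while explicitly
 * saving and restoring scratch registers around the inline __cli()/__sti()
 * bodies; presumably they are meant to be callable from contexts (e.g.
 * assembly stubs) that assume those registers survive the call.
 */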
void
ni_cli(void)
{
        __asm__("pushl %edx;"
                "pushl %eax;"
                );
        __cli();
        __asm__("popl %eax;"
                "popl %edx;"
                );
}


void
ni_sti(void)
{
        __asm__("pushl %edx;"
                "pushl %esi;"
                "pushl %eax;"
                );
        __sti();
        __asm__("popl %eax;"
                "popl %esi;"
                "popl %edx;"
                );
}

/*
 * Modify the cmd_line by converting each ',' to a NUL byte so that it is
 * in a format suitable for the static environment variables.
 */
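/*
 * For example (illustrative values, not taken from a real guest config),
 * a Xen command line of
 *      "boot_verbose,kern.hz=100"
 * is rewritten in place to the NUL-separated entries
 *      "boot_verbose\0kern.hz=100\0"
 * that the static kernel environment expects.
 */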
char *
xen_setbootenv(char *cmd_line)
{
        char *cmd_line_next;

        /* Skip leading spaces */
        for (; *cmd_line == ' '; cmd_line++);

        printk("xen_setbootenv(): cmd_line='%s'\n", cmd_line);

        for (cmd_line_next = cmd_line; strsep(&cmd_line_next, ",") != NULL;);
        return cmd_line;
}

static struct
{
        const char      *ev;
        int             mask;
} howto_names[] = {
        {"boot_askname",        RB_ASKNAME},
        {"boot_single",         RB_SINGLE},
        {"boot_nosync",         RB_NOSYNC},
        {"boot_halt",           RB_HALT},
        {"boot_serial",         RB_SERIAL},
        {"boot_cdrom",          RB_CDROM},
        {"boot_gdb",            RB_GDB},
        {"boot_gdb_pause",      RB_RESERVED1},
        {"boot_verbose",        RB_VERBOSE},
        {"boot_multicons",      RB_MULTIPLE},
        {NULL,  0}
};

int
xen_boothowto(char *envp)
{
        int i, howto = 0;

        /* get equivalents from the environment */
        for (i = 0; howto_names[i].ev != NULL; i++)
                if (getenv(howto_names[i].ev) != NULL)
                        howto |= howto_names[i].mask;
        return howto;
}

#define PRINTK_BUFSIZE 1024
void
printk(const char *fmt, ...)
{
        __va_list ap;
        int retval;
        static char buf[PRINTK_BUFSIZE];

        /*
         * Console output is currently disabled; remove this return to
         * emit messages through the hypervisor console.
         */
        return;

        va_start(ap, fmt);
        retval = vsnprintf(buf, PRINTK_BUFSIZE - 1, fmt, ap);
        va_end(ap);
        /* vsnprintf() returns the untruncated length; clamp before use. */
        if (retval >= PRINTK_BUFSIZE - 1)
                retval = PRINTK_BUFSIZE - 2;
        buf[retval] = 0;
        (void)HYPERVISOR_console_write(buf, retval);
}


#define XPQUEUE_SIZE 128

struct mmu_log {
        char *file;
        int line;
};

#ifdef SMP
/* per-cpu queues and indices */
#ifdef INVARIANTS
static struct mmu_log xpq_queue_log[MAX_VIRT_CPUS][XPQUEUE_SIZE];
#endif

static int xpq_idx[MAX_VIRT_CPUS];
static mmu_update_t xpq_queue[MAX_VIRT_CPUS][XPQUEUE_SIZE];

#define XPQ_QUEUE xpq_queue[vcpu]
#define XPQ_IDX xpq_idx[vcpu]
#define SET_VCPU() int vcpu = smp_processor_id()

#define XPQ_QUEUE_LOG xpq_queue_log[vcpu]
#else

static mmu_update_t xpq_queue[XPQUEUE_SIZE];
static struct mmu_log xpq_queue_log[XPQUEUE_SIZE];
static int xpq_idx = 0;

#define XPQ_QUEUE_LOG xpq_queue_log
#define XPQ_QUEUE xpq_queue
#define XPQ_IDX xpq_idx
#define SET_VCPU()
#endif /* !SMP */

#define XPQ_IDX_INC atomic_add_int(&XPQ_IDX, 1);
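
/*
 * Page-table writes are not issued one hypercall at a time: they are
 * appended to the per-vcpu xpq_queue and handed to the hypervisor in a
 * single HYPERVISOR_mmu_update() batch when the queue fills up, or when
 * xen_flush_queue() is called ahead of operations such as table pins that
 * must see all prior updates (xen_load_cr3() instead asserts that the
 * queue is already empty).
 */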

#if 0
static void
xen_dump_queue(void)
{
        int _xpq_idx = XPQ_IDX;
        int i;

        if (_xpq_idx <= 1)
                return;

        printk("xen_dump_queue(): %u entries\n", _xpq_idx);
        for (i = 0; i < _xpq_idx; i++) {
                printk(" val: %llx ptr: %llx\n", XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr);
        }
}
#endif


static __inline void
_xen_flush_queue(void)
{
        SET_VCPU();
        int _xpq_idx = XPQ_IDX;
        int error, i;
        /* window of vulnerability here? */

        if (__predict_true(gdtset))
                critical_enter();
        XPQ_IDX = 0;
        /* Make sure index is cleared first to avoid double updates. */
        error = HYPERVISOR_mmu_update((mmu_update_t *)&XPQ_QUEUE,
                                      _xpq_idx, NULL, DOMID_SELF);

#if 0
        if (__predict_true(gdtset))
                for (i = _xpq_idx; i > 0;) {
                        if (i >= 3) {
                                CTR6(KTR_PMAP, "mmu:val: %lx ptr: %lx val: %lx "
                                    "ptr: %lx val: %lx ptr: %lx",
                                    (XPQ_QUEUE[i-1].val & 0xffffffff),
                                    (XPQ_QUEUE[i-1].ptr & 0xffffffff),
                                    (XPQ_QUEUE[i-2].val & 0xffffffff),
                                    (XPQ_QUEUE[i-2].ptr & 0xffffffff),
                                    (XPQ_QUEUE[i-3].val & 0xffffffff),
                                    (XPQ_QUEUE[i-3].ptr & 0xffffffff));
                                i -= 3;
                        } else if (i == 2) {
                                CTR4(KTR_PMAP, "mmu: val: %lx ptr: %lx val: %lx ptr: %lx",
                                    (XPQ_QUEUE[i-1].val & 0xffffffff),
                                    (XPQ_QUEUE[i-1].ptr & 0xffffffff),
                                    (XPQ_QUEUE[i-2].val & 0xffffffff),
                                    (XPQ_QUEUE[i-2].ptr & 0xffffffff));
                                i = 0;
                        } else {
                                CTR2(KTR_PMAP, "mmu: val: %lx ptr: %lx",
                                    (XPQ_QUEUE[i-1].val & 0xffffffff),
                                    (XPQ_QUEUE[i-1].ptr & 0xffffffff));
                                i = 0;
                        }
                }
#endif
        if (__predict_true(gdtset))
                critical_exit();
        if (__predict_false(error < 0)) {
                for (i = 0; i < _xpq_idx; i++)
                        printf("val: %llx ptr: %llx\n",
                            XPQ_QUEUE[i].val, XPQ_QUEUE[i].ptr);
                panic("Failed to execute MMU updates: %d", error);
        }

}

void
xen_flush_queue(void)
{
        SET_VCPU();

        if (XPQ_IDX != 0)
                _xen_flush_queue();
}

static __inline void
xen_increment_idx(void)
{
        SET_VCPU();

        XPQ_IDX++;
        if (__predict_false(XPQ_IDX == XPQUEUE_SIZE))
                xen_flush_queue();
}

void
xen_check_queue(void)
{
#ifdef INVARIANTS
        SET_VCPU();

        KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
#endif
}

void
xen_invlpg(vm_offset_t va)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_INVLPG_ALL;
        op.arg1.linear_addr = va & ~PAGE_MASK;
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_load_cr3(u_int val)
{
        struct mmuext_op op;
#ifdef INVARIANTS
        SET_VCPU();

        KASSERT(XPQ_IDX == 0, ("pending operations XPQ_IDX=%d", XPQ_IDX));
#endif
        op.cmd = MMUEXT_NEW_BASEPTR;
        op.arg1.mfn = xpmap_ptom(val) >> PAGE_SHIFT;
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_restore_flags(u_int eflags)
{

        /*
         * Accept either a saved %eflags value or Xen's 0/1 event-mask
         * token: a real %eflags value (always > 1) is reduced to the
         * "events masked" boolean derived from PSL_I.
         */
        if (eflags > 1)
                eflags = ((eflags & PSL_I) == 0);

        __restore_flags(eflags);
}

int
xen_save_and_cli(void)
{
        int eflags;

        __save_and_cli(eflags);
        return (eflags);
}

void
xen_cli(void)
{
        __cli();
}

void
xen_sti(void)
{
        __sti();
}

u_int
xen_rcr2(void)
{

        return (HYPERVISOR_shared_info->vcpu_info[curcpu].arch.cr2);
}

void
_xen_machphys_update(vm_paddr_t mfn, vm_paddr_t pfn, char *file, int line)
{
        SET_VCPU();

        if (__predict_true(gdtset))
                critical_enter();
        XPQ_QUEUE[XPQ_IDX].ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
        XPQ_QUEUE[XPQ_IDX].val = pfn;
#ifdef INVARIANTS
        XPQ_QUEUE_LOG[XPQ_IDX].file = file;
        XPQ_QUEUE_LOG[XPQ_IDX].line = line;
#endif
        xen_increment_idx();
        if (__predict_true(gdtset))
                critical_exit();
}

void
_xen_queue_pt_update(vm_paddr_t ptr, vm_paddr_t val, char *file, int line)
{
        SET_VCPU();

        if (__predict_true(gdtset))
                mtx_assert(&vm_page_queue_mtx, MA_OWNED);

        KASSERT((ptr & 7) == 0, ("misaligned update"));

        if (__predict_true(gdtset))
                critical_enter();

        XPQ_QUEUE[XPQ_IDX].ptr = ((uint64_t)ptr) | MMU_NORMAL_PT_UPDATE;
        XPQ_QUEUE[XPQ_IDX].val = (uint64_t)val;
#ifdef INVARIANTS
        XPQ_QUEUE_LOG[XPQ_IDX].file = file;
        XPQ_QUEUE_LOG[XPQ_IDX].line = line;
#endif
        xen_increment_idx();
        if (__predict_true(gdtset))
                critical_exit();
}

void
xen_pgdpt_pin(vm_paddr_t ma)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_PIN_L3_TABLE;
        op.arg1.mfn = ma >> PAGE_SHIFT;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_pgd_pin(vm_paddr_t ma)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_PIN_L2_TABLE;
        op.arg1.mfn = ma >> PAGE_SHIFT;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_pgd_unpin(vm_paddr_t ma)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_UNPIN_TABLE;
        op.arg1.mfn = ma >> PAGE_SHIFT;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_pt_pin(vm_paddr_t ma)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_PIN_L1_TABLE;
        op.arg1.mfn = ma >> PAGE_SHIFT;
        printk("xen_pt_pin(): mfn=%lx\n", op.arg1.mfn);
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_pt_unpin(vm_paddr_t ma)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_UNPIN_TABLE;
        op.arg1.mfn = ma >> PAGE_SHIFT;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_set_ldt(vm_paddr_t ptr, unsigned long len)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_SET_LDT;
        op.arg1.linear_addr = ptr;
        op.arg2.nr_ents = len;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

void
xen_tlb_flush(void)
{
        struct mmuext_op op;
        op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
        xen_flush_queue();
        PANIC_IF(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
}

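/*
 * Under Xen, descriptor-table pages are mapped read-only in the guest, so
 * a GDT/LDT slot cannot be stored to directly; the entry's machine address
 * is computed and the write is delegated to the hypervisor.
 */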
void
xen_update_descriptor(union descriptor *table, union descriptor *entry)
{
        vm_paddr_t pa;
        pt_entry_t *ptp;

        ptp = vtopte((vm_offset_t)table);
        pa = (*ptp & PG_FRAME) | ((vm_offset_t)table & PAGE_MASK);
        if (HYPERVISOR_update_descriptor(pa, *(uint64_t *)entry))
                panic("HYPERVISOR_update_descriptor failed\n");
}


#if 0
/*
 * Bitmap is indexed by page number. If bit is set, the page is part of a
 * xen_create_contiguous_region() area of memory.
 */
unsigned long *contiguous_bitmap;

static void
contiguous_bitmap_set(unsigned long first_page, unsigned long nr_pages)
{
        unsigned long start_off, end_off, curr_idx, end_idx;

        curr_idx  = first_page / BITS_PER_LONG;
        start_off = first_page & (BITS_PER_LONG-1);
        end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
        end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);

        if (curr_idx == end_idx) {
                contiguous_bitmap[curr_idx] |=
                        ((1UL<<end_off)-1) & -(1UL<<start_off);
        } else {
                contiguous_bitmap[curr_idx] |= -(1UL<<start_off);
                while ( ++curr_idx < end_idx )
                        contiguous_bitmap[curr_idx] = ~0UL;
                contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1;
        }
}

static void
contiguous_bitmap_clear(unsigned long first_page, unsigned long nr_pages)
{
        unsigned long start_off, end_off, curr_idx, end_idx;

        curr_idx  = first_page / BITS_PER_LONG;
        start_off = first_page & (BITS_PER_LONG-1);
        end_idx   = (first_page + nr_pages) / BITS_PER_LONG;
        end_off   = (first_page + nr_pages) & (BITS_PER_LONG-1);

        if (curr_idx == end_idx) {
                contiguous_bitmap[curr_idx] &=
                        -(1UL<<end_off) | ((1UL<<start_off)-1);
        } else {
                contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1;
                while ( ++curr_idx != end_idx )
                        contiguous_bitmap[curr_idx] = 0;
                contiguous_bitmap[curr_idx] &= -(1UL<<end_off);
        }
}
#endif

/* Ensure multi-page extents are contiguous in machine memory. */
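/*
 * In outline: give each machine page backing the range back to Xen
 * (XENMEM_decrease_reservation), ask for a single machine-contiguous
 * extent of the same order (XENMEM_increase_reservation), then rewire
 * the P2M/M2P translations so the original pseudo-physical pages now
 * point at the new contiguous machine frames.
 */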
int
xen_create_contiguous_region(vm_page_t pages, int npages)
{
        unsigned long  mfn, i, flags;
        int order;
        struct xen_memory_reservation reservation = {
                .nr_extents   = 1,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
        set_xen_guest_handle(reservation.extent_start, &mfn);

        balloon_lock(flags);

        /* can currently only handle power of two allocation */
        PANIC_IF(ffs(npages) != fls(npages));

        /* 0. determine order */
        order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages);

        /* 1. give away machine pages. */
        for (i = 0; i < (1 << order); i++) {
                int pfn;
                pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
                mfn = PFNTOMFN(pfn);
                PFNTOMFN(pfn) = INVALID_P2M_ENTRY;
                PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != 1);
        }

        /* 2. Get a new contiguous memory extent. */
        reservation.extent_order = order;
        /* xenlinux hardcodes this because of aacraid - maybe set to 0 if we're not
         * running with a broken driver XXXEN
         */
        reservation.address_bits = 31;
        if (HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1)
                goto fail;

        /* 3. Map the new extent in place of old pages. */
        for (i = 0; i < (1 << order); i++) {
                int pfn;
                pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
                xen_machphys_update(mfn+i, pfn);
                PFNTOMFN(pfn) = mfn+i;
        }

        xen_tlb_flush();

#if 0
        contiguous_bitmap_set(VM_PAGE_TO_PHYS(&pages[0]) >> PAGE_SHIFT, 1UL << order);
#endif

        balloon_unlock(flags);

        return 0;

 fail:
        reservation.extent_order = 0;
        reservation.address_bits = 0;

        for (i = 0; i < (1 << order); i++) {
                int pfn;
                pfn = VM_PAGE_TO_PHYS(&pages[i]) >> PAGE_SHIFT;
                PANIC_IF(HYPERVISOR_memory_op(
                        XENMEM_increase_reservation, &reservation) != 1);
                xen_machphys_update(mfn, pfn);
                PFNTOMFN(pfn) = mfn;
        }

        xen_tlb_flush();

        balloon_unlock(flags);

        return ENOMEM;
}

void
xen_destroy_contiguous_region(void *addr, int npages)
{
        unsigned long  mfn, i, flags, order, pfn0;
        struct xen_memory_reservation reservation = {
                .nr_extents   = 1,
                .extent_order = 0,
                .domid        = DOMID_SELF
        };
        set_xen_guest_handle(reservation.extent_start, &mfn);

        pfn0 = vtophys(addr) >> PAGE_SHIFT;
#if 0
        scrub_pages(vstart, 1 << order);
#endif
        /* can currently only handle power of two allocation */
        PANIC_IF(ffs(npages) != fls(npages));

        /* 0. determine order */
        order = (ffs(npages) == fls(npages)) ? fls(npages) - 1 : fls(npages);

        balloon_lock(flags);

#if 0
        contiguous_bitmap_clear(vtophys(addr) >> PAGE_SHIFT, 1UL << order);
#endif

        /* 1. Zap current PTEs, giving away the underlying pages. */
        for (i = 0; i < (1 << order); i++) {
                int pfn;
                uint64_t new_val = 0;
                pfn = vtomach((char *)addr + i*PAGE_SIZE) >> PAGE_SHIFT;

                PANIC_IF(HYPERVISOR_update_va_mapping(
                        (vm_offset_t)((char *)addr + (i * PAGE_SIZE)), new_val, 0));
                PFNTOMFN(pfn) = INVALID_P2M_ENTRY;
                PANIC_IF(HYPERVISOR_memory_op(
                        XENMEM_decrease_reservation, &reservation) != 1);
        }

        /* 2. Map new pages in place of old pages. */
        for (i = 0; i < (1 << order); i++) {
                int pfn;
                uint64_t new_val;
                pfn = pfn0 + i;
                PANIC_IF(HYPERVISOR_memory_op(XENMEM_increase_reservation, &reservation) != 1);

                new_val = mfn << PAGE_SHIFT;
                PANIC_IF(HYPERVISOR_update_va_mapping((vm_offset_t)addr + (i * PAGE_SIZE),
                                                      new_val, PG_KERNEL));
                xen_machphys_update(mfn, pfn);
                PFNTOMFN(pfn) = mfn;
        }

        xen_tlb_flush();

        balloon_unlock(flags);
}

extern unsigned long cpu0prvpage;
extern unsigned long *SMPpt;
extern  struct user     *proc0uarea;
extern  vm_offset_t     proc0kstack;
extern int vm86paddr, vm86phystk;
char *bootmem_start, *bootmem_current, *bootmem_end;

pteinfo_t *pteinfo_list;
void initvalues(start_info_t *startinfo);

struct ringbuf_head *xen_store; /* XXX move me */
char *console_page;

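/*
 * Trivial bump allocator over the boot-time window
 * [bootmem_start, bootmem_end); frees are honored only in LIFO order.
 */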
void *
bootmem_alloc(unsigned int size)
{
        char *retptr;

        retptr = bootmem_current;
        PANIC_IF(retptr + size > bootmem_end);
        bootmem_current += size;

        return retptr;
}

void
bootmem_free(void *ptr, unsigned int size)
{
        char *tptr;

        tptr = ptr;
        PANIC_IF(tptr != bootmem_current - size ||
                bootmem_current - size < bootmem_start);

        bootmem_current -= size;
}

#if 0
static vm_paddr_t
xpmap_mtop2(vm_paddr_t mpa)
{
        return ((machine_to_phys_mapping[mpa >> PAGE_SHIFT] << PAGE_SHIFT)
            ) | (mpa & ~PG_FRAME);
}

static pd_entry_t
xpmap_get_bootpde(vm_paddr_t va)
{

        return ((pd_entry_t *)xen_start_info->pt_base)[va >> 22];
}

static pd_entry_t
xpmap_get_vbootpde(vm_paddr_t va)
{
        pd_entry_t pde;

        pde = xpmap_get_bootpde(va);
        if ((pde & PG_V) == 0)
                return (pde & ~PG_FRAME);
        return (pde & ~PG_FRAME) |
                (xpmap_mtop2(pde & PG_FRAME) + KERNBASE);
}

static pt_entry_t *
xpmap_get_bootptep(vm_paddr_t va)
{
        pd_entry_t pde;

        pde = xpmap_get_vbootpde(va);
        if ((pde & PG_V) == 0)
                return (void *)-1;
#define PT_MASK         0x003ff000      /* page table address bits */
        return &(((pt_entry_t *)(pde & PG_FRAME))[(va & PT_MASK) >> PAGE_SHIFT]);
}

static pt_entry_t
xpmap_get_bootpte(vm_paddr_t va)
{

        return xpmap_get_bootptep(va)[0];
}
#endif

#ifdef ADD_ISA_HOLE
static void
shift_phys_machine(unsigned long *phys_machine, int nr_pages)
{

        unsigned long *tmp_page, *current_page, *next_page;
        int i;

        tmp_page = bootmem_alloc(PAGE_SIZE);
        current_page = phys_machine + nr_pages - (PAGE_SIZE/sizeof(unsigned long));
        next_page = current_page - (PAGE_SIZE/sizeof(unsigned long));
        bcopy(phys_machine, tmp_page, PAGE_SIZE);

        while (current_page > phys_machine) {
                /* save next page */
                bcopy(next_page, tmp_page, PAGE_SIZE);
                /* shift down page */
                bcopy(current_page, next_page, PAGE_SIZE);
                /* finish swap */
                bcopy(tmp_page, current_page, PAGE_SIZE);

                current_page -= (PAGE_SIZE/sizeof(unsigned long));
                next_page -= (PAGE_SIZE/sizeof(unsigned long));
        }
        bootmem_free(tmp_page, PAGE_SIZE);

        for (i = 0; i < nr_pages; i++) {
                xen_machphys_update(phys_machine[i], i);
        }
        memset(phys_machine, INVALID_P2M_ENTRY, PAGE_SIZE);

}
#endif /* ADD_ISA_HOLE */

extern unsigned long physfree;

int pdir, curoffset;
extern int nkpt;

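/*
 * Boot-time setup run before the generic i386 initialization: consume the
 * start_info page provided by Xen, carve early allocations (bootmem window,
 * GDT/LDT, shared_info, xenstore and console rings, proc0's stack) out of
 * the space after the bootstrap page tables, rebuild the bootstrap page
 * tables in their final (PAE) form, and map the hypervisor-provided pages
 * into the kernel.
 */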
void
initvalues(start_info_t *startinfo)
{
        int l3_pages, l2_pages, l1_pages, offset;
        vm_offset_t cur_space, cur_space_pt;
        struct physdev_set_iopl set_iopl;

        vm_paddr_t KPTphys, IdlePTDma;
        vm_paddr_t console_page_ma, xen_store_ma;
        vm_offset_t KPTphysoff, tmpva;
        vm_paddr_t shinfo;
#ifdef PAE
        vm_paddr_t IdlePDPTma, IdlePDPTnewma;
        vm_paddr_t IdlePTDnewma[4];
        pd_entry_t *IdlePDPTnew, *IdlePTDnew;
#else
        vm_paddr_t pdir_shadow_ma;
#endif
        unsigned long i;
        int ncpus;

        nkpt = min(
                min(
                        max((startinfo->nr_pages >> NPGPTD_SHIFT), nkpt),
                    NPGPTD*NPDEPG - KPTDI),
                    (HYPERVISOR_VIRT_START - KERNBASE) >> PDRSHIFT);

#ifdef SMP
        ncpus = MAXCPU;
#else
        ncpus = 1;
#endif

        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
#ifdef notyet
        /*
         * need to install handler
         */
        HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
#endif
        xen_start_info = startinfo;
        xen_phys_machine = (xen_pfn_t *)startinfo->mfn_list;

        /* number of pages allocated after the pts + 1 */
        cur_space = xen_start_info->pt_base +
            ((xen_start_info->nr_pt_frames) + 3 )*PAGE_SIZE;
        printk("initvalues(): wooh - availmem=%x,%x\n", avail_space, cur_space);

        printk("KERNBASE=%x,pt_base=%x, VTOPFN(base)=%x, nr_pt_frames=%x\n",
            KERNBASE,xen_start_info->pt_base, VTOPFN(xen_start_info->pt_base),
            xen_start_info->nr_pt_frames);
        xendebug_flags = 0; /* 0xffffffff; */

        /* allocate 4 pages for bootmem allocator */
        bootmem_start = bootmem_current = (char *)cur_space;
        cur_space += (4 * PAGE_SIZE);
        bootmem_end = (char *)cur_space;

        /* allocate pages for gdt */
        gdt = (union descriptor *)cur_space;
        cur_space += PAGE_SIZE*ncpus;

        /* allocate page for ldt */
        ldt = (union descriptor *)cur_space; cur_space += PAGE_SIZE;
        cur_space += PAGE_SIZE;

        HYPERVISOR_shared_info = (shared_info_t *)cur_space;
        cur_space += PAGE_SIZE;

        xen_store = (struct ringbuf_head *)cur_space;
        cur_space += PAGE_SIZE;

        console_page = (char *)cur_space;
        cur_space += PAGE_SIZE;

#ifdef ADD_ISA_HOLE
        shift_phys_machine(xen_phys_machine, xen_start_info->nr_pages);
#endif
        /*
         * pre-zero unused mapped pages - mapped on 4MB boundary
         */
#ifdef PAE
        IdlePDPT = (pd_entry_t *)startinfo->pt_base;
        IdlePDPTma = xpmap_ptom(VTOP(startinfo->pt_base));
        /*
         * Note that only one page directory has been allocated at this point.
         * Thus, if KERNBASE
         */
        IdlePTD = (pd_entry_t *)((uint8_t *)startinfo->pt_base + PAGE_SIZE);
        IdlePTDma = xpmap_ptom(VTOP(IdlePTD));
        l3_pages = 1;
#else
        IdlePTD = (pd_entry_t *)startinfo->pt_base;
        IdlePTDma = xpmap_ptom(VTOP(startinfo->pt_base));
        l3_pages = 0;
#endif
        l2_pages = 1;
        l1_pages = xen_start_info->nr_pt_frames - l2_pages - l3_pages;

        KPTphysoff = (l2_pages + l3_pages)*PAGE_SIZE;

        KPTphys = xpmap_ptom(VTOP(startinfo->pt_base + KPTphysoff));
        XENPRINTF("IdlePTD %p\n", IdlePTD);
        XENPRINTF("nr_pages: %ld shared_info: 0x%lx flags: 0x%lx pt_base: 0x%lx "
                  "mod_start: 0x%lx mod_len: 0x%lx\n",
                  xen_start_info->nr_pages, xen_start_info->shared_info,
                  xen_start_info->flags, xen_start_info->pt_base,
                  xen_start_info->mod_start, xen_start_info->mod_len);
        /* Map proc0's KSTACK */

        proc0kstack = cur_space; cur_space += (KSTACK_PAGES * PAGE_SIZE);
        printk("proc0kstack=%u\n", proc0kstack);

        /* vm86/bios stack */
        cur_space += PAGE_SIZE;

        /* Map space for the vm86 region */
        vm86paddr = (vm_offset_t)cur_space;
        cur_space += (PAGE_SIZE * 3);

#ifdef PAE
        IdlePDPTnew = (pd_entry_t *)cur_space; cur_space += PAGE_SIZE;
        bzero(IdlePDPTnew, PAGE_SIZE);

        IdlePDPTnewma =  xpmap_ptom(VTOP(IdlePDPTnew));
        IdlePTDnew = (pd_entry_t *)cur_space; cur_space += 4*PAGE_SIZE;
        bzero(IdlePTDnew, 4*PAGE_SIZE);

        for (i = 0; i < 4; i++)
                IdlePTDnewma[i] =
                    xpmap_ptom(VTOP((uint8_t *)IdlePTDnew + i*PAGE_SIZE));
        /*
         * L3
         *
         * Copy the 4 machine addresses of the new PTDs in to the PDPT
         */
        for (i = 0; i < 4; i++)
                IdlePDPTnew[i] = IdlePTDnewma[i] | PG_V;

        __asm__("nop;");
        /*
         * re-map the new PDPT read-only
         */
        PT_SET_MA(IdlePDPTnew, IdlePDPTnewma | PG_V);
        /*
         * Unpin the current PDPT
         */
        xen_pt_unpin(IdlePDPTma);

        for (i = 0; i < 20; i++) {
                int startidx = ((KERNBASE >> 18) & PAGE_MASK) >> 3;

                if (IdlePTD[startidx + i] == 0) {
                        l1_pages = i;
                        break;
                }
        }

#endif  /* PAE */

        /*
         * unmap remaining pages from initial 4MB chunk
         */
        for (tmpva = cur_space; (tmpva & ((1<<22)-1)) != 0; tmpva += PAGE_SIZE) {
                bzero((char *)tmpva, PAGE_SIZE);
                PT_SET_MA(tmpva, (vm_paddr_t)0);
        }

        PT_UPDATES_FLUSH();

        memcpy(((uint8_t *)IdlePTDnew) + ((unsigned int)(KERNBASE >> 18)),
            ((uint8_t *)IdlePTD) + ((KERNBASE >> 18) & PAGE_MASK),
            l1_pages*sizeof(pt_entry_t));

        for (i = 0; i < 4; i++) {
                PT_SET_MA((uint8_t *)IdlePTDnew + i*PAGE_SIZE,
                    IdlePTDnewma[i] | PG_V);
        }
        xen_load_cr3(VTOP(IdlePDPTnew));
        xen_pgdpt_pin(xpmap_ptom(VTOP(IdlePDPTnew)));

        /* allocate remainder of nkpt pages */
        cur_space_pt = cur_space;
        for (offset = (KERNBASE >> PDRSHIFT), i = l1_pages; i < nkpt;
             i++, cur_space += PAGE_SIZE) {
                pdir = (offset + i) / NPDEPG;
                curoffset = ((offset + i) % NPDEPG);
                if (((offset + i) << PDRSHIFT) == VM_MAX_KERNEL_ADDRESS)
                        break;

                /*
                 * make sure that all the initial page table pages
                 * have been zeroed
                 */
                PT_SET_MA(cur_space_pt,
                    xpmap_ptom(VTOP(cur_space)) | PG_V | PG_RW);
                bzero((char *)cur_space_pt, PAGE_SIZE);
                PT_SET_MA(cur_space_pt, (vm_paddr_t)0);
                xen_pt_pin(xpmap_ptom(VTOP(cur_space)));
                xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] +
                        curoffset*sizeof(vm_paddr_t)),
                    xpmap_ptom(VTOP(cur_space)) | PG_KERNEL);
                PT_UPDATES_FLUSH();
        }

        for (i = 0; i < 4; i++) {
                pdir = (PTDPTDI + i) / NPDEPG;
                curoffset = (PTDPTDI + i) % NPDEPG;

                xen_queue_pt_update((vm_paddr_t)(IdlePTDnewma[pdir] +
                        curoffset*sizeof(vm_paddr_t)),
                    IdlePTDnewma[i] | PG_V);
        }

        PT_UPDATES_FLUSH();

        IdlePTD = IdlePTDnew;
        IdlePDPT = IdlePDPTnew;
        IdlePDPTma = IdlePDPTnewma;

        /*
         * shared_info is an unsigned long so this will randomly break if
         * it is allocated above 4GB - I guess people are used to that
         * sort of thing with Xen ... sigh
         */
        shinfo = xen_start_info->shared_info;
        PT_SET_MA(HYPERVISOR_shared_info, shinfo | PG_KERNEL);

        printk("#4\n");

        xen_store_ma = (((vm_paddr_t)xen_start_info->store_mfn) << PAGE_SHIFT);
        PT_SET_MA(xen_store, xen_store_ma | PG_KERNEL);
        console_page_ma = (((vm_paddr_t)xen_start_info->console.domU.mfn) << PAGE_SHIFT);
        PT_SET_MA(console_page, console_page_ma | PG_KERNEL);

        printk("#5\n");
        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = (unsigned long)xen_phys_machine;

        set_iopl.iopl = 1;
        PANIC_IF(HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl));
        printk("#6\n");
#if 0
        /* add page table for KERNBASE */
        xen_queue_pt_update(IdlePTDma + KPTDI*sizeof(vm_paddr_t),
                            xpmap_ptom(VTOP(cur_space) | PG_KERNEL));
        xen_flush_queue();
#ifdef PAE
        xen_queue_pt_update(pdir_shadow_ma[3] + KPTDI*sizeof(vm_paddr_t),
                            xpmap_ptom(VTOP(cur_space) | PG_V | PG_A));
#else
        xen_queue_pt_update(pdir_shadow_ma + KPTDI*sizeof(vm_paddr_t),
                            xpmap_ptom(VTOP(cur_space) | PG_V | PG_A));
#endif
        xen_flush_queue();
        cur_space += PAGE_SIZE;
        printk("#6\n");
#endif /* 0 */
#ifdef notyet
        if (xen_start_info->flags & SIF_INITDOMAIN) {
                /* Map first megabyte */
                for (i = 0; i < (256 << PAGE_SHIFT); i += PAGE_SIZE)
                        PT_SET_MA(KERNBASE + i, i | PG_KERNEL | PG_NC_PCD);
                xen_flush_queue();
        }
#endif
        /*
         * re-map kernel text read-only
         */
        for (i = (((vm_offset_t)&btext) & ~PAGE_MASK);
             i < (((vm_offset_t)&etext) & ~PAGE_MASK); i += PAGE_SIZE)
                PT_SET_MA(i, xpmap_ptom(VTOP(i)) | PG_V | PG_A);

        printk("#7\n");
        physfree = VTOP(cur_space);
        init_first = physfree >> PAGE_SHIFT;
        IdlePTD = (pd_entry_t *)VTOP(IdlePTD);
        IdlePDPT = (pd_entry_t *)VTOP(IdlePDPT);
        setup_xen_features();
        printk("#8, proc0kstack=%u\n", proc0kstack);
}

trap_info_t trap_table[] = {
        { 0,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(div)},
        { 1,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dbg)},
        { 3,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bpt)},
        { 4,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ofl)},
        /* This is UPL on Linux and KPL on BSD */
        { 5,   3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(bnd)},
        { 6,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(ill)},
        { 7,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(dna)},
        /*
         * { 8,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(XXX)},
         *   no handler for double fault
         */
        { 9,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpusegm)},
        {10,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(tss)},
        {11,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(missing)},
        {12,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(stk)},
        {13,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(prot)},
        {14,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(page)},
        {15,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(rsvd)},
        {16,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(fpu)},
        {17,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(align)},
        {18,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(mchk)},
        {19,   0, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(xmm)},
        {0x80, 3, GSEL(GCODE_SEL, SEL_KPL), (unsigned long) &IDTVEC(int0x80_syscall)},
        {  0, 0,           0, 0 }
};
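
/*
 * Xen PV guests have no IDT of their own; this NULL-terminated table of
 * {vector, privilege, selector, handler} entries is presumably handed to
 * the hypervisor via HYPERVISOR_set_trap_table() during CPU setup
 * elsewhere in the port.
 */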

static void
shutdown_handler(struct xenbus_watch *watch,
                 const char **vec, unsigned int len)
{
        char *str;
        struct xenbus_transaction xbt;
        int err, howto;
        struct reboot_args uap;

        howto = 0;

 again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return;
        str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
        /* Ignore read errors and empty reads. */
        if (XENBUS_IS_ERR_READ(str)) {
                xenbus_transaction_end(xbt, 1);
                return;
        }

        xenbus_write(xbt, "control", "shutdown", "");

        err = xenbus_transaction_end(xbt, 0);
        if (err == EAGAIN) {
                free(str, M_DEVBUF);
                goto again;
        }

        if (strcmp(str, "reboot") == 0)
                howto = 0;
        else if (strcmp(str, "poweroff") == 0)
                howto |= (RB_POWEROFF | RB_HALT);
        else if (strcmp(str, "halt") == 0)
                howto |= RB_HALT;
        else if (strcmp(str, "suspend") == 0)
                howto = -1;
        else {
                printf("Ignoring shutdown request: %s\n", str);
                goto done;
        }
#ifdef notyet
        if (howto == -1) {
                do_suspend(NULL);
                goto done;
        }
#else
        if (howto == -1) {
                printf("suspend not currently supported\n");
                goto done;
        }
#endif
        uap.opt = howto;
        reboot(curthread, &uap);
 done:
        free(str, M_DEVBUF);
}

static struct xenbus_watch shutdown_watch = {
        .node = "control/shutdown",
        .callback = shutdown_handler
};
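
/*
 * The watch fires whenever the toolstack writes a request ("reboot",
 * "poweroff", "halt", "suspend") to the domain's control/shutdown
 * xenstore node, e.g. in response to an "xm shutdown" issued from dom0.
 */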

void setup_shutdown_watcher(void *unused);

void
setup_shutdown_watcher(void *unused)
{
        if (register_xenbus_watch(&shutdown_watch))
                printf("Failed to set shutdown watcher\n");
}

SYSINIT(shutdown, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, setup_shutdown_watcher, NULL);

#ifdef notyet

static int
xen_suspend(void *ignore)
{
        int i, j, k, fpp;

        extern void time_resume(void);
        extern unsigned long max_pfn;
        extern unsigned long *pfn_to_mfn_frame_list_list;
        extern unsigned long *pfn_to_mfn_frame_list[];

#ifdef CONFIG_SMP
#error "do_suspend must be run cpu 0 - need to create separate thread"
        cpumask_t prev_online_cpus;
        int vcpu_prepare(int vcpu);
#endif

        int err = 0;

        PANIC_IF(smp_processor_id() != 0);

#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU)
        if (num_online_cpus() > 1) {
                printk(KERN_WARNING "Can't suspend SMP guests "
                       "without CONFIG_HOTPLUG_CPU\n");
                return -EOPNOTSUPP;
        }
#endif

        xenbus_suspend();

#ifdef CONFIG_SMP
        lock_cpu_hotplug();
        /*
         * Take all other CPUs offline. We hold the hotplug semaphore to
         * avoid other processes bringing up CPUs under our feet.
         */
        cpus_clear(prev_online_cpus);
        while (num_online_cpus() > 1) {
                for_each_online_cpu(i) {
                        if (i == 0)
                                continue;
                        unlock_cpu_hotplug();
                        err = cpu_down(i);
                        lock_cpu_hotplug();
                        if (err != 0) {
                                printk(KERN_CRIT "Failed to take all CPUs "
                                       "down: %d.\n", err);
                                goto out_reenable_cpus;
                        }
                        cpu_set(i, prev_online_cpus);
                }
        }
#endif /* CONFIG_SMP */

        preempt_disable();

        __cli();
        preempt_enable();
#ifdef SMP
        unlock_cpu_hotplug();
#endif
        gnttab_suspend();

        pmap_kremove(HYPERVISOR_shared_info);

        xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
        xen_start_info->console.domU.mfn = mfn_to_pfn(xen_start_info->console.domU.mfn);

        /*
         * We'll stop somewhere inside this hypercall. When it returns,
         * we'll start resuming after the restore.
         */
        HYPERVISOR_suspend(VTOMFN(xen_start_info));

        pmap_kenter_ma(HYPERVISOR_shared_info, xen_start_info->shared_info);
        set_fixmap(FIX_SHARED_INFO, xen_start_info->shared_info);

#if 0
        memset(empty_zero_page, 0, PAGE_SIZE);
#endif
        HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
                VTOMFN(pfn_to_mfn_frame_list_list);

        fpp = PAGE_SIZE/sizeof(unsigned long);
        for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) {
                if ((j % fpp) == 0) {
                        k++;
                        pfn_to_mfn_frame_list_list[k] =
                                VTOMFN(pfn_to_mfn_frame_list[k]);
                        j = 0;
                }
                pfn_to_mfn_frame_list[k][j] =
                        VTOMFN(&phys_to_machine_mapping[i]);
        }
        HYPERVISOR_shared_info->arch.max_pfn = max_pfn;

        gnttab_resume();

        irq_resume();

        time_resume();

        __sti();

        xencons_resume();

#ifdef CONFIG_SMP
        for_each_cpu(i)
                vcpu_prepare(i);

#endif
        /*
         * Only resume xenbus /after/ we've prepared our VCPUs; otherwise
         * the VCPU hotplug callback can race with our vcpu_prepare
         */
        xenbus_resume();

#ifdef CONFIG_SMP
 out_reenable_cpus:
        for_each_cpu_mask(i, prev_online_cpus) {
                j = cpu_up(i);
                if ((j != 0) && !cpu_online(i)) {
                        printk(KERN_CRIT "Failed to bring cpu "
                               "%d back up (%d).\n",
                               i, j);
                        err = j;
                }
        }
#endif
        return err;
}

#endif /* notyet */
/********** CODE WORTH KEEPING ABOVE HERE *****************/

void xen_failsafe_handler(void);

void
xen_failsafe_handler(void)
{

        panic("xen_failsafe_handler called!");
}

void xen_handle_thread_switch(struct pcb *pcb);

/*
 * This is called by cpu_switch() when switching threads; the pcb arg
 * refers to the process control block of the next thread to run.
 */
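/*
 * The stack switch and any needed %fs/%gs descriptor updates are batched
 * into a single multicall, so the switch costs at most one hypervisor
 * entry instead of up to three separate hypercalls.
 */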
void
xen_handle_thread_switch(struct pcb *pcb)
{
        uint32_t *a = (uint32_t *)&PCPU_GET(fsgs_gdt)[0];
        uint32_t *b = (uint32_t *)&pcb->pcb_fsd;
        multicall_entry_t mcl[3];
        int i = 0;

        /* Notify Xen of task switch */
        mcl[i].op = __HYPERVISOR_stack_switch;
        mcl[i].args[0] = GSEL(GDATA_SEL, SEL_KPL);
        mcl[i++].args[1] = (unsigned long)pcb;

        /* Check for update of fsd */
        if (*a != *b || *(a+1) != *(b+1)) {
                mcl[i].op = __HYPERVISOR_update_descriptor;
                *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
                *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
        }

        a += 2;
        b += 2;

        /* Check for update of gsd */
        if (*a != *b || *(a+1) != *(b+1)) {
                mcl[i].op = __HYPERVISOR_update_descriptor;
                *(uint64_t *)&mcl[i].args[0] = vtomach((vm_offset_t)a);
                *(uint64_t *)&mcl[i++].args[2] = *(uint64_t *)b;
        }

        (void)HYPERVISOR_multicall(mcl, i);
}