/*-
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 *
 * This code is derived from software contributed to Berkeley by
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"
#include "opt_swtch.h"
#include "opt_kstack_pages.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/callout.h>
#include <sys/msgbuf.h>
#include <sys/sched.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <sys/eventhandler.h>

#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>

#include <net/netisr.h>

#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/reg.h>
#include <machine/clock.h>
#include <machine/specialreg.h>
#include <machine/bootinfo.h>
#include <machine/md_var.h>
#include <machine/pc/bios.h>
#include <machine/pcb_ext.h>		/* pcb.h included via sys/user.h */
#include <machine/proc.h>
#include <machine/perfmon.h>
#include <machine/privatespace.h>
#include <machine/smp.h>

#include <i386/isa/icu.h>
#include <i386/isa/intr_machdep.h>

#include <pc98/pc98/pc98_machdep.h>
#include <pc98/pc98/pc98.h>

#include <machine/vm86.h>
#include <sys/ptrace.h>
#include <machine/sigframe.h>
extern void init386(int first);
extern void dblfault_handler(void);

extern void printcpuinfo(void);	/* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
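/*
 * Illustrative sketch (not compiled in): CS_SECURE() and EFL_SECURE()
 * gate what a signal handler may hand back to sigreturn().  Assuming the
 * usual i386 encodings from <machine/segments.h> and <machine/psl.h>
 * (ISPL(s) == ((s) & 3), SEL_UPL == 3, PSL_USERCHANGE == the eflags bits
 * user mode may legally flip), the two checks reduce to:
 */
#if 0
static __inline int
checks_sketch(int cs, int ef, int oef)
{
	int cs_ok = (cs & 3) == 3;	/* only ring-3 code selectors */
	int efl_ok = ((ef ^ oef) & ~PSL_USERCHANGE) == 0; /* only user bits differ */

	return (cs_ok && efl_ok);
}
#endif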
#if !defined(CPU_ENABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
#if defined(CPU_DISABLE_SSE)
#undef CPU_ENABLE_SSE
#endif
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
int	need_pre_dma_flush;	/* If 1, use wbinvd before DMA transfer. */
int	need_post_dma_flush;	/* If 1, use invd after DMA transfer. */

int	_udatasel, _ucodesel;
#if defined(SWTCH_OPTIM_STATS)
int stupid_switch;
SYSCTL_INT(_debug, OID_AUTO, stupid_switch,
	CTLFLAG_RW, &stupid_switch, 0, "");
int swtch_optim_stats;
SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
	CTLFLAG_RW, &swtch_optim_stats, 0, "");
int tlb_flush_count;
SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
	CTLFLAG_RW, &tlb_flush_count, 0, "");
int lazy_flush_count;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_count,
	CTLFLAG_RW, &lazy_flush_count, 0, "");
int lazy_flush_fixup;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_fixup,
	CTLFLAG_RW, &lazy_flush_fixup, 0, "");
int lazy_flush_smpfixup;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpfixup,
	CTLFLAG_RW, &lazy_flush_smpfixup, 0, "");
int lazy_flush_smpipi;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpipi,
	CTLFLAG_RW, &lazy_flush_smpipi, 0, "");
int lazy_flush_smpbadcr3;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpbadcr3,
	CTLFLAG_RW, &lazy_flush_smpbadcr3, 0, "");
int lazy_flush_smpmiss;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_smpmiss,
	CTLFLAG_RW, &lazy_flush_smpmiss, 0, "");
#endif

int lazy_flush_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, lazy_flush_enable,
	CTLFLAG_RW, &lazy_flush_enable, 0, "");
#ifdef PC98
static int	ispc98 = 1;
#else
static int	ispc98 = 0;
#endif
SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
static void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code);

#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, int sig, sigset_t *mask,
    u_long code);
#endif
int	Maxmem_under16M = 0;

vm_paddr_t phys_avail[10];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
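/*
 * Sketch (an assumption, mirroring the loop in cpu_startup() below):
 * phys_avail[] holds inclusive-start/exclusive-end pairs and is
 * terminated by a 0/0 pair, which is why the array is sized two
 * entries short above.
 */
#if 0
static void
walk_phys_avail_sketch(void)
{
	int indx;

	for (indx = 0; phys_avail[indx + 1] != 0; indx += 2)
		printf("chunk: 0x%016jx - 0x%016jx\n",
		    (uintmax_t)phys_avail[indx],
		    (uintmax_t)phys_avail[indx + 1] - 1);
}
#endif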
struct kva_md_info kmi;

static struct trapframe proc0_tf;
static struct pcpu __pcpu;
static void
cpu_startup(dummy)
	void *dummy;
{
	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	panicifcpuunsupported();

	printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
	    ptoa((uintmax_t)Maxmem) / 1048576);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)cnt.v_free_count),
	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	vm_pager_bufferinit();

	/* For SMP, we delay the cpu_setregs() until after SMP startup. */
	cpu_setregs();
}
/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
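/*
 * User-mode view of the frame osendsig() builds below (a sketch under
 * the assumption that the sigcode trampoline behaves like this C; field
 * names follow struct osigframe in <machine/sigframe.h>):
 */
#if 0
void
trampoline_equivalent(struct osigframe *fp)
{
	/* The handler runs with the signal number, code, and context... */
	(*fp->sf_ahu.sf_handler)(fp->sf_signum, fp->sf_arg2,
	    (struct osigcontext *)fp->sf_scp);
	/* ...and the trampoline then unwinds via the sigreturn syscall. */
	osigreturn((struct osigcontext *)fp->sf_scp);
}
#endif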
#ifdef COMPAT_43
static void
osendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig;
	sigset_t *mask;
	u_long code;
{
	struct osigframe sf, *fp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Allocate space for the signal handler context. */
	if ((p->p_flag & P_ALTSTACK) && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		fp = (struct osigframe *)(p->p_sigstk.ss_sp +
		    p->p_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		fp = (struct osigframe *)regs->tf_esp - 1;

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
		sf.sf_siginfo.si_signo = sig;
		sf.sf_siginfo.si_code = code;
		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_arg2 = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/* Save most if not all of trap frame. */
	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
	sf.sf_siginfo.si_sc.sc_gs = rgs();
	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;

	/* Build the signal context to be used by osigreturn(). */
	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_siginfo.si_sc.sc_ps =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/* See sendsig() for comments. */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - szosigcode;
	regs->tf_eflags &= ~PSL_T;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	load_gs(_udatasel);
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */

#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig;
	sigset_t *mask;
	u_long code;
{
	struct sigframe4 sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	struct trapframe *regs;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = p->p_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));

	/* Allocate space for the signal handler context. */
	if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sfp = (struct sigframe4 *)(p->p_sigstk.ss_sp +
		    p->p_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sfp = (struct sigframe4 *)regs->tf_esp - 1;

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
	regs->tf_eflags &= ~PSL_T;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
#endif	/* COMPAT_FREEBSD4 */

void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig;
	sigset_t *mask;
	u_long code;
{
	struct sigframe sf, *sfp;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	int oonstack;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
		freebsd4_sendsig(catcher, sig, mask, code);
		return;
	}
#endif
#ifdef COMPAT_43
	if (SIGISMEMBER(psp->ps_osigset, sig)) {
		osendsig(catcher, sig, mask, code);
		return;
	}
#endif
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_esp);

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = p->p_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	sf.sf_uc.uc_mcontext.mc_gs = rgs();
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
	fpstate_drop(td);

	/* Allocate space for the signal handler context. */
	if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = p->p_sigstk.ss_sp +
		    p->p_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);

	/* Translate the signal if appropriate. */
	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];

	/* Build the argument list for the signal handler. */
	sf.sf_signum = sig;
	sf.sf_ucontext = (register_t)&sfp->sf_uc;
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		sf.sf_siginfo = (register_t)&sfp->sf_si;
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si.si_signo = sig;
		sf.sf_si.si_code = code;
		sf.sf_si.si_addr = (void *)regs->tf_err;
	} else {
		/* Old FreeBSD-style arguments. */
		sf.sf_siginfo = code;
		sf.sf_addr = regs->tf_err;
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;

		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;

		if (vm86->vm86_has_vme == 0)
			sf.sf_uc.uc_mcontext.mc_eflags =
			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_esp = (int)sfp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_eflags &= ~PSL_T;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
#ifdef COMPAT_43
int
osigreturn(td, uap)
	struct thread *td;
	struct osigreturn_args /* {
		struct osigcontext *sigcntxp;
	} */ *uap;
{
	struct osigcontext sc;
	struct trapframe *regs;
	struct osigcontext *scp;
	struct proc *p = td->td_proc;
	int eflags, error;

	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
	if (error != 0)
		return (error);
	scp = &sc;
	regs = td->td_frame;
	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(td, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		if (!CS_SECURE(scp->sc_cs)) {
			trapsignal(td, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* Restore remaining registers. */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;

	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (scp->sc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
	SIGSETOLD(td->td_sigmask, scp->sc_mask);
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}
#endif /* COMPAT_43 */
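
/*
 * A note on the EJUSTRETURN convention used by all three sigreturn
 * variants: the trapframe has just been rewritten wholesale, so the
 * generic syscall return path must leave the registers alone instead of
 * applying the usual return-value handling.  Sketch of the dispatcher
 * side (illustrative only, not the literal syscall() code):
 */
#if 0
	switch (error) {
	case 0:
		frame.tf_eax = td->td_retval[0];	/* normal result */
		frame.tf_eflags &= ~PSL_C;		/* success: clear carry */
		break;
	case EJUSTRETURN:
		break;			/* trapframe already set up; keep it */
	default:
		frame.tf_eax = error;	/* errno in %eax, carry set */
		frame.tf_eflags |= PSL_C;
		break;
	}
#endif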

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(td, uap)
	struct thread *td;
	struct freebsd4_sigreturn_args /* {
		const ucontext4 *sigcntxp;
	} */ *uap;
{
	struct ucontext4 uc;
	struct proc *p = td->td_proc;
	struct trapframe *regs;
	const struct ucontext4 *ucp;
	int cs, eflags, error;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(td, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("freebsd4_sigreturn: cs = 0x%x\n", cs);
			trapsignal(td, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}

		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (ucp->uc_mcontext.mc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	td->td_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}
#endif	/* COMPAT_FREEBSD4 */

int
sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct proc *p = td->td_proc;
	struct trapframe *regs;
	const ucontext_t *ucp;
	int cs, eflags, error, ret;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0)
		return (error);
	ucp = &uc;
	regs = td->td_frame;
	eflags = ucp->uc_mcontext.mc_eflags;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (td->td_pcb->pcb_ext == 0)
			return (EINVAL);
		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* Go back to user mode if both flags are set. */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(td, SIGBUS, 0);

		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
		tf->tf_eflags = eflags;
		tf->tf_vm86_ds = tf->tf_ds;
		tf->tf_vm86_es = tf->tf_es;
		tf->tf_vm86_fs = tf->tf_fs;
		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 *
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
			printf("sigreturn: eflags = 0x%x\n", eflags);
			return (EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
		cs = ucp->uc_mcontext.mc_cs;
		if (!CS_SECURE(cs)) {
			printf("sigreturn: cs = 0x%x\n", cs);
			trapsignal(td, SIGBUS, T_PROTFLT);
			return (EINVAL);
		}

		ret = set_fpcontext(td, &ucp->uc_mcontext);
		if (ret != 0)
			return (ret);
		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
	}

	PROC_LOCK(p);
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	if (ucp->uc_mcontext.mc_onstack & 1)
		p->p_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	td->td_sigmask = ucp->uc_sigmask;
	SIG_CANTMASK(td->td_sigmask);
	signotify(td);
	PROC_UNLOCK(p);
	return (EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */

/*
 * Shutdown the CPU as much as possible
 */

/*
 * Hook to idle the CPU when possible.  In the SMP case we default to
 * off because a halted cpu will not currently pick up a new thread in the
 * run queue until the next timer tick.  If turned on this will result in
 * approximately a 4.2% loss in real time performance in buildworld tests
 * (but improves user and sys times oddly enough), and saves approximately
 * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
 *
 * XXX we need to have a cpu mask of idle cpus and generate an IPI or
 * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
 * Then we can have our cake and eat it too.
 *
 * XXX I'm turning it on for SMP as well by default for now.  It seems to
 * help lock contention somewhat, and this is critical for HTT.  -Peter
 */
static int	cpu_idle_hlt = 1;
SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
    &cpu_idle_hlt, 0, "Idle loop HLT enable");

/*
 * Note that we have to be careful here to avoid a race between checking
 * sched_runnable() and actually halting.  If we don't do this, we may waste
 * the time between calling hlt and the next interrupt even though there
 * is a runnable process.
 */
	if (mp_grab_cpu_hlt())
		return;

	disable_intr();
	if (sched_runnable()) {
		enable_intr();
	} else {
		/*
		 * we must absolutely guarantee that hlt is the
		 * absolute next instruction after sti or we
		 * introduce a timing window.
		 */
		__asm __volatile("sti; hlt");
	}
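
/*
 * Why "sti; hlt" must be back to back: STI only enables interrupts
 * after the *next* instruction retires, so pairing it immediately with
 * HLT leaves no window in which a wakeup interrupt could be taken
 * before the halt.  A sketch of the full check-then-halt idiom under
 * that assumption:
 */
#if 0
static void
idle_halt_sketch(void)
{
	disable_intr();
	if (sched_runnable())
		enable_intr();			/* work appeared; don't halt */
	else
		__asm __volatile("sti; hlt");	/* enable and halt, atomically */
}
#endif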

/*
 * Clear registers on exec
 */
void
exec_setregs(td, entry, stack, ps_strings)
	struct thread *td;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe *regs = td->td_frame;
	struct pcb *pcb = td->td_pcb;

	/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
	pcb->pcb_gs = _udatasel;
	load_gs(_udatasel);

	if (td->td_proc->p_md.md_ldt)
		user_ldt_free(td);

	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == PCPU_GET(curpcb)) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		pcb->pcb_flags &= ~PCB_DBREGS;
	}

	/*
	 * Initialize the math emulator (if any) for the current process.
	 * Actually, just clear the bit that says that the emulator has
	 * been initialized.  Initialization is delayed until the process
	 * traps to the emulator (if it is done at all) mainly because
	 * emulators don't provide an entry point for initialization.
	 */
	td->td_pcb->pcb_flags &= ~FP_SOFTFP;

	/*
	 * Arrange to trap the next npx or `fwait' instruction (see npx.c
	 * for why fwait must be trapped at least if there is an npx or an
	 * emulator).  This is mainly to handle the case where npx0 is not
	 * configured, since the npx routines normally set up the trap
	 * otherwise.  It should be done only at boot time, but doing it
	 * here allows modifying `npx_exists' for testing the emulator on
	 * systems with an npx.
	 */
	load_cr0(rcr0() | CR0_MP | CR0_TS);

	/* Initialize the npx (if any) for the current process. */
	/*
	 * XXX the above load_cr0() also initializes it and is a layering
	 * violation if NPX is configured.  It drops the npx partially
	 * and this would be fatal if we were interrupted now, and decided
	 * to force the state to the pcb, and checked the invariant
	 * (CR0_TS clear) if and only if PCPU_GET(fpcurthread) != NULL).
	 * ALL of this can happen except the check.  The check used to
	 * happen and be fatal later when we didn't complete the drop
	 * before returning to user mode.  This should be fixed properly
	 * soon.
	 */
	fpstate_drop(td);

	/*
	 * XXX - Linux emulator
	 * Make sure edx is 0x0 on entry.  Linux binaries depend
	 * on it.
	 */
	td->td_retval[1] = 0;
}
	cr0 |= CR0_NE;			/* Done by npxinit() */
	cr0 |= CR0_MP | CR0_TS;		/* Done at every execve() too. */
	cr0 |= CR0_WP | CR0_AM;
	load_cr0(cr0);
static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
	    req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

u_long bootdev;		/* not a dev_t - encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in dev_t format)");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */

/* table descriptors - used to load tables by microprocessor */
struct region_descriptor r_gdt, r_idt;

int private_tss;			/* flag indicating private tss */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern struct user *proc0uarea;
extern vm_offset_t proc0kstack;

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GDATA_SEL	2 Data Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPRIV_SEL	3 SMP Per-Processor Private Data Descriptor */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPROC0_SEL	4 Proc 0 Tss Descriptor */
{
	0x0,			/* segment base address */
	sizeof(struct i386tss)-1,/* length */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GLDT_SEL	5 LDT Descriptor */
{	(int) ldt,		/* segment base address  */
	sizeof(ldt)-1,		/* length */
	SDT_SYSLDT,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GUSERLDT_SEL	6 User LDT Descriptor per process */
{	(int) ldt,		/* segment base address  */
	(512 * sizeof(union descriptor)-1),	/* length */
	SDT_SYSLDT,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GTGATE_SEL	7 Null Descriptor - Placeholder */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{	0x400,			/* segment base address */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GPANIC_SEL	9 Panic Tss Descriptor */
{	(int) &dblfault_tss,	/* segment base address  */
	sizeof(struct i386tss)-1,/* length */
	SDT_SYS386TSS,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* unused - default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMERA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
/* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */
{	0,			/* segment base address (overwritten)  */
	0xfffff,		/* length */
	SDT_MEMRWA,		/* segment type */
	0,			/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};

static struct soft_segment_descriptor ldt_segs[] = {
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
	/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0			/* limit granularity (byte/page units)*/ },
	/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1			/* limit granularity (byte/page units)*/ },
};

void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ, dpl, selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func) >> 16;
}
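
/*
 * Sketch of the split-offset invariant (an assumption based on the i386
 * gate layout): the handler address is stored as two 16-bit halves, so
 * after setidt() the following always reassembles the original pointer.
 */
#if 0
static __inline void *
gate_offset_sketch(struct gate_descriptor *ip)
{
	return ((void *)((ip->gd_hioffset << 16) | ip->gd_looffset));
}
#endif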

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);

void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}
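
/*
 * The inverse packing done by ssdtosd() (used on gdt_segs[] in init386()
 * below) mirrors the unpacking above: base and limit are scattered back
 * into their split hardware fields.  A sketch, assuming the same bit
 * widths (24/8 for base, 16/4 for limit):
 */
#if 0
static void
ssdtosd_sketch(struct soft_segment_descriptor *ssd,
    struct segment_descriptor *sd)
{
	sd->sd_lobase = ssd->ssd_base & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = ssd->ssd_limit & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}
#endif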

#define PHYSMAP_SIZE	(2 * 8)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * If we cannot accurately determine the physical memory map, then use
 * the value from the 0xE801 call, and failing that, the RTC.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(int first)
{
#ifdef PC98
	int i, physmap_idx, pa_indx, pg_n;
	u_int basemem, extmem, under16;
	vm_offset_t pa, physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
#else
	int i, physmap_idx, pa_indx;
	u_int basemem, extmem;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
	pt_entry_t *pte;
	struct bios_smap *smap;
#endif
	char *cp;

#ifdef PC98
	/* XXX - some of EPSON machines can't use PG_N */
	pg_n = PG_N;
	if (pc98_machine_type & M_EPSON_PC98) {
		switch (epson_machine_id) {
		case 0x34:	/* PC-486HX */
		case 0x35:	/* PC-486HG */
		case 0x3B:	/* PC-486HA */
			pg_n = 0;
			break;
		}
	}

	bzero(physmap, sizeof(physmap));

	/*
	 * Perform "base memory" related probes & setup
	 */
	under16 = pc98_getmemsize(&basemem, &extmem);
	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
			basemem);
		basemem = 640;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.  (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
		pmap_kenter(KERNBASE + pa, pa);

	/*
	 * if basemem != 640, map pages r/w into vm86 page table so
	 * that the bios can scribble on it.
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;

#else /* PC98 */

	bzero(&vmf, sizeof(struct vm86frame));
	bzero(physmap, sizeof(physmap));

	/*
	 * map page 1 R/W into the kernel page table so we can use it
	 * as a buffer.  The kernel will unmap this page later.
	 */
	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);

	/*
	 * get memory map with INT 15:E820
	 */
	vmc.npages = 0;
	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
	vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);

	physmap_idx = 0;
	vmf.vmf_ebx = 0;
	do {
		vmf.vmf_eax = 0xE820;
		vmf.vmf_edx = SMAP_SIG;
		vmf.vmf_ecx = sizeof(struct bios_smap);
		i = vm86_datacall(0x15, &vmf, &vmc);
		if (i || vmf.vmf_eax != SMAP_SIG)
			break;
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016llx len=%016llx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != 0x01)
			continue;

		if (smap->length == 0)
			continue;

		if (smap->base >= 0xffffffff) {
			printf("%uK of memory above 4GB ignored\n",
			    (u_int)(smap->length / 1024));
			continue;
		}

		for (i = 0; i <= physmap_idx; i += 2) {
			if (smap->base < physmap[i + 1]) {
				if (boothowto & RB_VERBOSE)
					printf(
	"Overlapping or non-monotonic memory region, ignoring second region\n");
				continue;
			}
		}

		if (smap->base == physmap[physmap_idx + 1]) {
			physmap[physmap_idx + 1] += smap->length;
			continue;
		}

		physmap_idx += 2;
		if (physmap_idx == PHYSMAP_SIZE) {
			printf(
		"Too many segments in the physical address map, giving up\n");
			break;
		}
		physmap[physmap_idx] = smap->base;
		physmap[physmap_idx + 1] = smap->base + smap->length;
	} while (vmf.vmf_ebx != 0);
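
	/*
	 * Record layout the E820 loop above relies on (a sketch; cf. struct
	 * bios_smap in <machine/pc/bios.h>).  The BIOS fills one record per
	 * vm86_datacall() and returns a continuation cookie in %ebx; a zero
	 * cookie means the map has been fully enumerated, hence the loop
	 * condition above.
	 */
#if 0
	struct bios_smap_sketch {
		u_int64_t	base;	/* first byte of this region */
		u_int64_t	length;	/* size of the region in bytes */
		u_int32_t	type;	/* 0x01 == usable RAM */
	} __packed;
#endif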

	/*
	 * Perform "base memory" related probes & setup
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] == 0x00000000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}

	/* Fall back to the old compatibility function for base memory */
	if (basemem == 0) {
		vm86_intcall(0x12, &vmf);
		basemem = vmf.vmf_ax;
	}

	if (basemem > 640) {
		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
			basemem);
		basemem = 640;
	}

	/*
	 * XXX if biosbasemem is now < 640, there is a `hole'
	 * between the end of base memory and the start of
	 * ISA memory.  The hole may be empty or it may
	 * contain BIOS code or data.  Map it read/write so
	 * that the BIOS can write to it.  (Memory from 0 to
	 * the physical end of the kernel is mapped read-only
	 * to begin with and then parts of it are remapped.
	 * The parts that aren't remapped form holes that
	 * remain read-only and are unused by the kernel.
	 * The base memory area is below the physical end of
	 * the kernel and right now forms a read-only hole.
	 * The part of it from PAGE_SIZE to
	 * (trunc_page(biosbasemem * 1024) - 1) will be
	 * remapped and used by the kernel later.)
	 *
	 * This code is similar to the code used in
	 * pmap_mapdev, but since no memory needs to be
	 * allocated we simply change the mapping.
	 */
	for (pa = trunc_page(basemem * 1024);
	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
		pmap_kenter(KERNBASE + pa, pa);

	/*
	 * if basemem != 640, map pages r/w into vm86 page table so
	 * that the bios can scribble on it.
	 */
	pte = (pt_entry_t *)vm86paddr;
	for (i = basemem / 4; i < 160; i++)
		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;

	if (physmap[1] != 0)
		goto physmap_done;

	/*
	 * If we failed above, try memory map with INT 15:E801
	 */
	vmf.vmf_ax = 0xE801;
	if (vm86_intcall(0x15, &vmf) == 0) {
		/* %cx: 1K blocks below 16M, %dx: 64K blocks above 16M */
		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
	} else {
#if 0
		vm86_intcall(0x15, &vmf);
		extmem = vmf.vmf_ax;
#else
		/*
		 * Prefer the RTC value for extended memory.
		 */
		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
	}

	/*
	 * Special hack for chipsets that still remap the 384k hole when
	 * there's 16MB of memory - this really confuses people that
	 * are trying to use bus mastering ISA controllers with the
	 * "16MB limit"; they only have 16MB, but the remapping puts
	 * them beyond the limit.
	 *
	 * If extended memory is between 15-16MB (16-17MB phys address range),
	 * chop it to 15MB.
	 */
	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
		extmem = 15 * 1024;

	physmap[1] = basemem * 1024;
	physmap_idx = 2;
	physmap[physmap_idx] = 0x100000;
	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;

#ifdef PC98
	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
		/* 15M - 16M region is cut off, so need to divide chunk */
		physmap[physmap_idx + 1] = under16 * 1024;
		physmap_idx += 2;
		physmap[physmap_idx] = 0x1000000;
		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
	}
#endif

physmap_done:
	/*
	 * Now, physmap contains a map of physical memory.
	 */

#ifdef SMP
	/* make hole for AP bootstrap code */
	physmap[1] = mp_bootaddress(physmap[1] / 1024);

	/* look for the MP hardware - needed for apic addresses */
#endif

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
	Maxmem = MAXMEM / 4;
#endif

	/*
	 * hw.physmem is a size in bytes; we also allow k, m, and g suffixes
	 * for the appropriate modifiers.  This overrides MAXMEM.
	 */
	if ((cp = getenv("hw.physmem")) != NULL) {
		u_int64_t AllowMem, sanity;
		char *ep;

		sanity = AllowMem = strtouq(cp, &ep, 0);
		if ((ep != cp) && (*ep != 0)) {
			switch (*ep) {
			/* k/m/g suffix scaling; see the sketch below */
			default:
				AllowMem = sanity = 0;
			}
			if (AllowMem < sanity)
				AllowMem = 0;
		}
		if (AllowMem == 0)
			printf("Ignoring invalid memory size of '%s'\n", cp);
		else
			Maxmem = atop(AllowMem);
		freeenv(cp);
	}
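
	/*
	 * Sketch of the elided suffix handling above (an assumption about
	 * the parse, not the literal code): each of the k/m/g modifiers
	 * scales the parsed byte count by another factor of 1024, falling
	 * through from 'g' to 'm' to 'k'.
	 */
#if 0
	switch (*ep) {
	case 'G': case 'g':
		AllowMem <<= 10;	/* FALLTHROUGH */
	case 'M': case 'm':
		AllowMem <<= 10;	/* FALLTHROUGH */
	case 'K': case 'k':
		AllowMem <<= 10;
		break;
	default:
		AllowMem = sanity = 0;	/* unknown suffix: reject below */
	}
#endif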

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * If Maxmem has been increased beyond what the system has detected,
	 * extend the last memory segment to the new limit.
	 */
	if (atop(physmap[physmap_idx + 1]) < Maxmem)
		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(first, 0);

	/*
	 * Size up each available chunk of physical memory.
	 */
	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
	pa_indx = 0;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad = FALSE;
			int *ptr = (int *)CADDR1;

			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= 0x100000 && pa < first)
				continue;

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
#ifdef PC98
			*pte = pa | PG_V | PG_RW | pg_n;
#else
			*pte = pa | PG_V | PG_RW | PG_N;
#endif
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa) {
				page_bad = TRUE;
			}
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555) {
				page_bad = TRUE;
			}
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff) {
				page_bad = TRUE;
			}
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0) {
				page_bad = TRUE;
			}
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;
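
			/*
			 * Self-contained restatement of the probe above (a
			 * sketch, not the kernel code): write the four classic
			 * patterns through a volatile pointer, fail on the
			 * first mismatch, and always restore the original
			 * word.  The mapping is made non-cacheable first so
			 * the test exercises real DRAM rather than the cache.
			 */
#if 0
static int
page_test_sketch(volatile int *p)
{
	static const int pattern[] = { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0 };
	int i, saved = *p;

	for (i = 0; i < 4; i++) {
		*p = pattern[i];
		if (*p != pattern[i]) {
			*p = saved;
			return (1);	/* bad page */
		}
	}
	*p = saved;		/* restore original value */
	return (0);		/* page tests good */
}
#endif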

			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE) {
				continue;
			}
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid page, making the range >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
					printf(
	"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					break;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
		}
	}
	*pte = 0;
	invltlb();

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);

	avail_end = phys_avail[pa_indx];
}

void
init386(first)
	int first;
{
	struct gate_descriptor *gdp;
	int gsel_tss, metadata_missing, off, x;

	/* table descriptors - used to load tables by microprocessor */
	struct region_descriptor r_gdt, r_idt;
	struct pcpu *pc;

	proc0.p_uarea = proc0uarea;
	thread0.td_kstack = proc0kstack;
	thread0.td_pcb = (struct pcb *)
	   (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
	atdevbase = ISA_HOLE_START + KERNBASE;

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup(&proc0, &ksegrp0, &kse0, &thread0);

	metadata_missing = 0;
	if (bootinfo.bi_modulep) {
		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
		preload_bootstrap_relocate(KERNBASE);
	} else {
		metadata_missing = 1;
	}

	if (envmode == 1)
		kern_envp = static_env;
	else if (bootinfo.bi_envp)
		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;

	/* Init basic tunables, hz etc */
	init_param1();

	/*
	 * make gdt memory segments, the code segment goes up to end of the
	 * page with etext in it, the data segment goes to the end of
	 * the address space
	 */
	/*
	 * XXX text protection is temporarily (?) disabled.  The limit was
	 * i386_btop(round_page(etext)) - 1.
	 */
	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
#ifdef SMP
	pc = &SMP_prvspace[0].pcpu;
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct privatespace) - 1);
#else
	pc = &__pcpu;
	gdt_segs[GPRIV_SEL].ssd_limit =
		atop(sizeof(struct pcpu) - 1);
#endif
	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;

	for (x = 0; x < NGDT; x++)
		ssdtosd(&gdt_segs[x], &gdt[x].sd);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base =  (int) gdt;
	lgdt(&r_gdt);

	pcpu_init(pc, 0, sizeof(struct pcpu));
	PCPU_SET(prvspace, pc);
	PCPU_SET(curthread, &thread0);

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 *	     section, to set pcpu->ipending (etc...) properly, we
	 *	     must be able to get the icu lock, so it can't be
	 *	     under witness.
	 */
	mutex_init();
	mtx_init(&clock_lock, "clk", NULL, MTX_SPIN | MTX_RECURSE);
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);

	/* make ldt memory segments */
	/*
	 * XXX - VM_MAXUSER_ADDRESS is an end address, not a max.  And it
	 * should be spelled ...MAX_USER...
	 */
	ldt_segs[LUCODE_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	ldt_segs[LUDATA_SEL].ssd_limit = atop(VM_MAXUSER_ADDRESS - 1);
	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
		ssdtosd(&ldt_segs[x], &ldt[x].sd);

	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
	lldt(_default_ldt);
	PCPU_SET(currentldt, _default_ldt);
2096 for (x = 0; x < NIDT; x++)
2097 setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
2098 GSEL(GCODE_SEL, SEL_KPL));
	setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(1, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(3, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
	setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(19, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
	    GSEL(GCODE_SEL, SEL_KPL));
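#if 0
	/*
	 * Illustrative sketch only, not part of this file: how userland
	 * reaches the 0x80 trap gate installed above.  The gate's DPL is
	 * SEL_UPL, so "int $0x80" is legal from ring 3; the syscall
	 * number travels in %eax (20, assumed here to be SYS_getpid, a
	 * zero-argument call) and the result comes back in %eax.  Error
	 * reporting via the carry flag is ignored in this sketch.
	 */
	static int
	example_int80_getpid(void)
	{
		int pid;

		__asm __volatile("int $0x80" : "=a" (pid) : "a" (20));
		return (pid);
	}
#endif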
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (int) idt;
	lidt(&r_idt);

	/*
	 * Initialize the console before we print anything out.
	 */
	cninit();
	if (metadata_missing)
		printf("WARNING: loader(8) metadata is missing!\n");

	if (boothowto & RB_KDB)
		Debugger("Boot flags requested debugger");
	finishidentcpu();	/* Final stage of CPU initialization */
	setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
	    GSEL(GCODE_SEL, SEL_KPL));
	initializecpu();	/* Initialize CPU registers */
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
	ltr(gsel_tss);
	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_cr3 = (int)IdlePTD;
	dblfault_tss.tss_eip = (int)dblfault_handler;
	dblfault_tss.tss_eflags = PSL_KERNEL;
	dblfault_tss.tss_ds = dblfault_tss.tss_es =
	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
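	/*
	 * Vector 8 (double fault) was installed above as a task gate
	 * (SDT_SYSTASKGT) through GPANIC_SEL, so a double fault hardware
	 * task-switches into this TSS: known-good stack, %cr3 and segment
	 * registers from which dblfault_handler() can panic cleanly even
	 * when the faulting thread's kernel stack has been corrupted.
	 */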
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */
	/* Map the message buffer. */
	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
		pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);

	msgbufinit(msgbufp, MSGBUF_SIZE);
	/* make a call gate to reenter kernel with */
	gdp = &ldt[LSYS5CALLS_SEL].gd;

	x = (int) &IDTVEC(lcall_syscall);
	gdp->gd_looffset = x;
	gdp->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	gdp->gd_stkcpy = 1;
	gdp->gd_type = SDT_SYS386CGT;
	gdp->gd_dpl = SEL_UPL;
	gdp->gd_p = 1;
	gdp->gd_hioffset = x >> 16;

	/* XXX does this work? */
	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
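#if 0
	/*
	 * Illustrative sketch only, not part of this file: SysV-style
	 * binaries enter the kernel through the call gate built above
	 * instead of int $0x80.  LSYS5CALLS_SEL is assumed to be LDT
	 * slot 0, so the far-call selector is (0 << 3) | 4 (LDT) | 3
	 * (RPL) == 7, giving the classic "lcall $7, $0" system call
	 * (again with an assumed syscall number 20 in %eax).
	 */
	static int
	example_lcall_getpid(void)
	{
		int pid;

		__asm __volatile("lcall $7, $0" : "=a" (pid) : "a" (20));
		return (pid);
	}
#endif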
	/* transfer to user mode */

	_ucodesel = LSEL(LUCODE_SEL, SEL_UPL);
	_udatasel = LSEL(LUDATA_SEL, SEL_UPL);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
	thread0.td_pcb->pcb_ext = 0;
	thread0.td_frame = &proc0_tf;
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
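/*
 * The Pentium "F00F" erratum: the invalid opcode sequence f0 0f c7 c8
 * (lock cmpxchg8b with a register operand) raises #UD while the bus
 * lock is held, which can hang the CPU when it fetches the exception
 * descriptor from the IDT.  The workaround below relocates the IDT so
 * that the descriptors for vectors 0-6 sit at the end of a read-only
 * page, turning the problematic descriptor access into a fault the
 * kernel can recover from.
 */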
static void
f00f_hack(void *unused)
{
	struct gate_descriptor *new_idt;
	struct region_descriptor r_idt;
	vm_offset_t tmp;
2257 printf("Intel Pentium detected, installing workaround for F00F bug\n");
2259 r_idt.rd_limit = sizeof(idt0) - 1;
2261 tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
2263 panic("kmem_alloc returned 0");
2264 if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0)
2265 panic("kmem_alloc returned non-page-aligned memory");
2266 /* Put the first seven entries in the lower page */
2267 new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8));
2268 bcopy(idt, new_idt, sizeof(idt0));
2269 r_idt.rd_base = (int)new_idt;
2272 if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
2273 VM_PROT_READ, FALSE) != KERN_SUCCESS)
2274 panic("vm_map_protect failed");
2277 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
	td->td_frame->tf_eip = addr;
	return (0);
}
int
ptrace_single_step(struct thread *td)
{
	/* PSL_T is the hardware trap flag: trap after every instruction. */
	td->td_frame->tf_eflags |= PSL_T;
	return (0);
}
void
fill_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	regs->r_fs = tp->tf_fs;
	regs->r_es = tp->tf_es;
	regs->r_ds = tp->tf_ds;
	regs->r_edi = tp->tf_edi;
	regs->r_esi = tp->tf_esi;
	regs->r_ebp = tp->tf_ebp;
	regs->r_ebx = tp->tf_ebx;
	regs->r_edx = tp->tf_edx;
	regs->r_ecx = tp->tf_ecx;
	regs->r_eax = tp->tf_eax;
	regs->r_eip = tp->tf_eip;
	regs->r_cs = tp->tf_cs;
	regs->r_eflags = tp->tf_eflags;
	regs->r_esp = tp->tf_esp;
	regs->r_ss = tp->tf_ss;
	pcb = td->td_pcb;
	regs->r_gs = pcb->pcb_gs;
}
int
set_regs(struct thread *td, struct reg *regs)
{
	struct pcb *pcb;
	struct trapframe *tp;

	tp = td->td_frame;
	/* Refuse eflags/%cs values that would elevate the thread's privilege. */
	if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
	    !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_fs = regs->r_fs;
	tp->tf_es = regs->r_es;
	tp->tf_ds = regs->r_ds;
	tp->tf_edi = regs->r_edi;
	tp->tf_esi = regs->r_esi;
	tp->tf_ebp = regs->r_ebp;
	tp->tf_ebx = regs->r_ebx;
	tp->tf_edx = regs->r_edx;
	tp->tf_ecx = regs->r_ecx;
	tp->tf_eax = regs->r_eax;
	tp->tf_eip = regs->r_eip;
	tp->tf_cs = regs->r_cs;
	tp->tf_eflags = regs->r_eflags;
	tp->tf_esp = regs->r_esp;
	tp->tf_ss = regs->r_ss;
	pcb = td->td_pcb;
	pcb->pcb_gs = regs->r_gs;
	return (0);
}
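/*
 * The helpers below translate between the legacy 387 (fnsave) and the
 * SSE (fxsave) floating point save layouts, so fill_fpregs() and
 * set_fpregs() can always present debuggers with a struct save87
 * regardless of which format the hardware actually saved.
 */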
#ifdef CPU_ENABLE_SSE
static void
fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	bzero(sv_87, sizeof(*sv_87));

	/* FPU control/status */
	penv_87->en_cw = penv_xmm->en_cw;
	penv_87->en_sw = penv_xmm->en_sw;
	penv_87->en_tw = penv_xmm->en_tw;
	penv_87->en_fip = penv_xmm->en_fip;
	penv_87->en_fcs = penv_xmm->en_fcs;
	penv_87->en_opcode = penv_xmm->en_opcode;
	penv_87->en_foo = penv_xmm->en_foo;
	penv_87->en_fos = penv_xmm->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
{
	register struct env87 *penv_87 = &sv_87->sv_env;
	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* FPU control/status */
	penv_xmm->en_cw = penv_87->en_cw;
	penv_xmm->en_sw = penv_87->en_sw;
	penv_xmm->en_tw = penv_87->en_tw;
	penv_xmm->en_fip = penv_87->en_fip;
	penv_xmm->en_fcs = penv_87->en_fcs;
	penv_xmm->en_opcode = penv_87->en_opcode;
	penv_xmm->en_foo = penv_87->en_foo;
	penv_xmm->en_fos = penv_87->en_fos;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr) {
		fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm,
		    (struct save87 *)fpregs);
		return (0);
	}
#endif /* CPU_ENABLE_SSE */
	bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
	return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
	if (cpu_fxsr) {
		set_fpregs_xmm((struct save87 *)fpregs,
		    &td->td_pcb->pcb_save.sv_xmm);
		return (0);
	}
#endif /* CPU_ENABLE_SSE */
	bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs);
	return (0);
}
/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
	struct trapframe *tp;

	tp = td->td_frame;

	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_esp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_gs = td->td_pcb->pcb_gs;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_es = tp->tf_es;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_edi = tp->tf_edi;
	mcp->mc_esi = tp->tf_esi;
	mcp->mc_ebp = tp->tf_ebp;
	mcp->mc_isp = tp->tf_isp;
	mcp->mc_ebx = tp->tf_ebx;
	if (clear_ret != 0) {
		/* Syscall-style context: resuming it returns 0. */
		mcp->mc_eax = 0;
		mcp->mc_edx = 0;
	} else {
		mcp->mc_eax = tp->tf_eax;
		mcp->mc_edx = tp->tf_edx;
	}
	mcp->mc_ecx = tp->tf_ecx;
	mcp->mc_eip = tp->tf_eip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_eflags = tp->tf_eflags;
	mcp->mc_esp = tp->tf_esp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp);
	return (0);
}
/*
 * Set machine context.
 *
 * However, we don't set any but the user modifiable flags, and we won't
 * touch the cs selector.
 */
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
	struct trapframe *tp;
	int eflags, ret;

	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp))
		return (EINVAL);
	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
	    (tp->tf_eflags & ~PSL_USERCHANGE);
	if ((ret = set_fpcontext(td, mcp)) == 0) {
		tp->tf_fs = mcp->mc_fs;
		tp->tf_es = mcp->mc_es;
		tp->tf_ds = mcp->mc_ds;
		tp->tf_edi = mcp->mc_edi;
		tp->tf_esi = mcp->mc_esi;
		tp->tf_ebp = mcp->mc_ebp;
		tp->tf_ebx = mcp->mc_ebx;
		tp->tf_edx = mcp->mc_edx;
		tp->tf_ecx = mcp->mc_ecx;
		tp->tf_eax = mcp->mc_eax;
		tp->tf_eip = mcp->mc_eip;
		tp->tf_eflags = eflags;
		tp->tf_esp = mcp->mc_esp;
		tp->tf_ss = mcp->mc_ss;
		td->td_pcb->pcb_gs = mcp->mc_gs;
	}
	return (ret);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifndef DEV_NPX
	mcp->mc_fpformat = _MC_FPFMT_NODEV;
	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
#else
	union savefpu *addr;
	/*
	 * XXX mc_fpstate might be misaligned, since its declaration is not
	 * unportabilized using __attribute__((aligned(16))) like the
	 * declaration of struct savemm, and anyway, alignment doesn't work
	 * for auto variables since we don't use gcc's pessimal stack
	 * alignment.  Work around this by abusing the spare fields after
	 * mcp->mc_fpstate.
	 *
	 * XXX unpessimize most cases by only aligning when fxsave might be
	 * called, although this requires knowing too much about
	 * npxgetregs()'s internals.
	 */
	addr = (union savefpu *)&mcp->mc_fpstate;
	if (td == PCPU_GET(fpcurthread) &&
#ifdef CPU_ENABLE_SSE
	    cpu_fxsr &&
#endif
	    ((uintptr_t)(void *)addr & 0xF)) {
		/* Step through the spare space to the next 16-byte boundary. */
		do
			addr = (void *)((char *)addr + 4);
		while ((uintptr_t)(void *)addr & 0xF);
	}
	mcp->mc_ownedfp = npxgetregs(td, addr);
	if (addr != (union savefpu *)&mcp->mc_fpstate) {
		bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
		bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2));
	}
	mcp->mc_fpformat = npxformat();
#endif /* DEV_NPX */
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
	union savefpu *addr;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
	    mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		/* XXX align as above. */
		addr = (union savefpu *)&mcp->mc_fpstate;
		if (td == PCPU_GET(fpcurthread) &&
#ifdef CPU_ENABLE_SSE
		    cpu_fxsr &&
#endif
		    ((uintptr_t)(void *)addr & 0xF)) {
			do
				addr = (void *)((char *)addr + 4);
			while ((uintptr_t)(void *)addr & 0xF);
			bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate));
		}
		/*
		 * XXX we violate the dubious requirement that npxsetregs()
		 * be called with interrupts disabled.
		 */
		npxsetregs(td, addr);
		/*
		 * Don't bother putting things back where they were in the
		 * misaligned case, since we know that the caller won't use
		 * them again.
		 */
	} else
		return (EINVAL);
	return (0);
}
static void
fpstate_drop(struct thread *td)
{
	register_t s;

	s = intr_disable();
	if (PCPU_GET(fpcurthread) == td)
		npxdrop();
	/*
	 * XXX force a full drop of the npx.  The above only drops it if we
	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
	 *
	 * XXX I don't much like npxgetregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of npxgetregs()... perhaps we just
	 * have too many layers.
	 */
	curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE;
	intr_restore(s);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		/* td == NULL means report the live hardware registers. */
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[4] = rdr4();
		dbregs->dr[5] = rdr5();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[4] = 0;
		dbregs->dr[5] = 0;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
	return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;
	u_int32_t mask1, mask2;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr4(dbregs->dr[4]);
		load_dr5(dbregs->dr[5]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected sequence of events.
		 */
		for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8;
		     i++, mask1 <<= 2, mask2 <<= 2)
			if ((dbregs->dr[7] & mask1) == mask2)
				return (EINVAL);
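		/*
		 * Bits 16-31 of dr7 are four 2-bit R/W fields and four
		 * 2-bit LEN fields, one pair per breakpoint.  The loop
		 * above walks those eight fields and rejects the value
		 * 10b, which is undefined there (for R/W it is only
		 * meaningful for I/O breakpoints when CR4.DE is set).
		 */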
		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * process's address space, unless, perhaps, we were called by
		 * uid 0.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */
		if (suser(td) != 0) {
			if (dbregs->dr[7] & 0x3) {
				/* dr0 is enabled */
				if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<2)) {
				/* dr1 is enabled */
				if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<4)) {
				/* dr2 is enabled */
				if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}

			if (dbregs->dr[7] & (0x3<<6)) {
				/* dr3 is enabled */
				if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
					return (EINVAL);
			}
		}
		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		pcb->pcb_flags |= PCB_DBREGS;
	}

	return (0);
}
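/*
 * Setting PCB_DBREGS above tells the context switch code that this
 * thread has valid debug register state in its pcb which must be
 * reloaded whenever the thread is put back on a CPU.
 */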
/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(void)
{
	u_int32_t dr7, dr6;	/* debug registers dr6 and dr7 */
	u_int32_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * None of the enable bits (L0-L3/G0-G3) in dr7 are set,
		 * so the trap can't have come from the debug registers.
		 */
		return 0;
	}
	nbp = 0;
	dr6 = rdr6();
	bp = dr6 & 0x0000000f;
	if (!bp)
		/* This trap was not caused by any of the debug registers. */
		return 0;

	/*
	 * At least one of the breakpoints was hit; check which ones and
	 * whether any of them are user space addresses.
	 */
	if (bp & 0x01)
		addr[nbp++] = (caddr_t)rdr0();
	if (bp & 0x02)
		addr[nbp++] = (caddr_t)rdr1();
	if (bp & 0x04)
		addr[nbp++] = (caddr_t)rdr2();
	if (bp & 0x08)
		addr[nbp++] = (caddr_t)rdr3();
	for (i = 0; i < nbp; i++)
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS)
			return nbp;	/* addr[i] is in user space */

	/* None of the breakpoints are in user space. */
	return 0;
}
#ifndef DDB
void
Debugger(const char *msg)
{
	printf("Debugger(\"%s\") called.\n", msg);
}
#endif /* no DDB */
/*
 * Provide inb() and outb() as functions.  They are normally only
 * available as macros calling inlined functions, thus cannot be
 * called inside DDB.
 *
 * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
 */

#undef inb
#undef outb

/* silence compiler warnings */
u_char inb(u_int);
void outb(u_int, u_char);
u_char
inb(u_int port)
{
	u_char	data;
	/*
	 * We use %%dx and not %1 here because i/o is done at %dx and not at
	 * %edx, while gcc generates inferior code (movw instead of movl)
	 * if we tell it to load (u_short) port.
	 */
	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
	return (data);
}
void
outb(u_int port, u_char data)
{
	u_char	al;
	/*
	 * Use an unnecessary assignment to help gcc's register allocator.
	 * This makes a large difference for gcc-1.40 and a tiny difference
	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
	 * best results.  gcc-2.6.0 can't handle this.
	 */
	al = data;
	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
}