]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/i386/i386/trap.c
Update to ELF Tool Chain r3490
[FreeBSD/FreeBSD.git] / sys / i386 / i386 / trap.c
1 /*-
2  * Copyright (C) 1994, David Greenman
3  * Copyright (c) 1990, 1993
4  *      The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the University of Utah, and William Jolitz.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *      This product includes software developed by the University of
20  *      California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 /*
44  * 386 Trap and System call handling
45  */
46
47 #include "opt_clock.h"
48 #include "opt_cpu.h"
49 #include "opt_hwpmc_hooks.h"
50 #include "opt_isa.h"
51 #include "opt_kdb.h"
52 #include "opt_npx.h"
53 #include "opt_stack.h"
54 #include "opt_trap.h"
55
56 #include <sys/param.h>
57 #include <sys/bus.h>
58 #include <sys/systm.h>
59 #include <sys/proc.h>
60 #include <sys/pioctl.h>
61 #include <sys/ptrace.h>
62 #include <sys/kdb.h>
63 #include <sys/kernel.h>
64 #include <sys/ktr.h>
65 #include <sys/lock.h>
66 #include <sys/mutex.h>
67 #include <sys/resourcevar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscall.h>
70 #include <sys/sysctl.h>
71 #include <sys/sysent.h>
72 #include <sys/uio.h>
73 #include <sys/vmmeter.h>
74 #ifdef HWPMC_HOOKS
75 #include <sys/pmckern.h>
76 PMC_SOFT_DEFINE( , , page_fault, all);
77 PMC_SOFT_DEFINE( , , page_fault, read);
78 PMC_SOFT_DEFINE( , , page_fault, write);
79 #endif
80 #include <security/audit/audit.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_extern.h>
89
90 #include <machine/cpu.h>
91 #include <machine/intr_machdep.h>
92 #include <x86/mca.h>
93 #include <machine/md_var.h>
94 #include <machine/pcb.h>
95 #ifdef SMP
96 #include <machine/smp.h>
97 #endif
98 #include <machine/stack.h>
99 #include <machine/tss.h>
100 #include <machine/vm86.h>
101
102 #ifdef POWERFAIL_NMI
103 #include <sys/syslog.h>
104 #include <machine/clock.h>
105 #endif
106
107 #ifdef KDTRACE_HOOKS
108 #include <sys/dtrace_bsd.h>
109 #endif
110
/*
 * C-level entry points.  trap() is defined later in this file and is
 * reached from the assembly IDT gate entry routines, which build the
 * trapframe passed in.  syscall() is not defined in the portion of the
 * file reviewed here -- NOTE(review): presumably the system call entry
 * point invoked the same way; confirm against the assembly stubs.
 */
111 extern void trap(struct trapframe *frame);
112 extern void syscall(struct trapframe *frame);
113
/*
 * trap_pfault(frame, usermode, eva) is the page fault worker used by
 * trap().  As consumed there: 0 means the fault was resolved, -1 means
 * return to user mode without posting a signal, -2 (only with the F00F
 * workaround compiled in) means treat the fault as T_PRIVINFLT, and any
 * other value is the signal number to post.  For kernel-mode faults
 * trap() ignores the return value.
 *
 * trap_fatal(frame, eva) is called by trap() for traps it cannot
 * recover from -- NOTE(review): presumably reports the trap and panics;
 * its body is outside the reviewed range.
 *
 * dblfault_handler() is only declared here; NOTE(review): presumably
 * the double fault task entry -- confirm where it is installed.
 */
114 static int trap_pfault(struct trapframe *, int, vm_offset_t);
115 static void trap_fatal(struct trapframe *, vm_offset_t);
116 void dblfault_handler(void);
117
/*
 * Address of the lcall system call entry stub.  trap() compares the
 * faulting %eip against it (and against it + 1) to silently step over
 * the first instruction of that stub when single-stepping enters the
 * kernel through it.
 */
118 extern inthand_t IDTVEC(lcall_syscall);
119
/*
 * Human-readable names for machine traps, indexed by trap number (the
 * T_* value that trap() reads from tf_trapno).
 *
 * MAX_TRAP_MSG is the highest trap number that has a slot in the table;
 * code indexing trap_msg[] is expected to check "type <= MAX_TRAP_MSG"
 * first (NOTE(review): the consumer is outside the reviewed range --
 * confirm).  Trap numbers that are not in use map to the empty string,
 * never to NULL, so any in-range entry can be handed to "%s".
 *
 * Every slot is pinned to its number with a designated initializer so
 * that a deleted, duplicated or reordered line cannot silently shift
 * the names that follow it.  The element type is deliberately left as
 * "char *" so that existing code assigning an entry to a "char *"
 * keeps compiling; the strings must nevertheless be treated as
 * read-only.
 */
#define MAX_TRAP_MSG            32
static char *trap_msg[] = {
        [0]  = "",                                      /* unused */
        [1]  = "privileged instruction fault",          /* T_PRIVINFLT */
        [2]  = "",                                      /* unused */
        [3]  = "breakpoint instruction fault",          /* T_BPTFLT */
        [4]  = "",                                      /* unused */
        [5]  = "",                                      /* unused */
        [6]  = "arithmetic trap",                       /* T_ARITHTRAP */
        [7]  = "",                                      /* unused */
        [8]  = "",                                      /* unused */
        [9]  = "general protection fault",              /* T_PROTFLT */
        [10] = "trace trap",                            /* T_TRCTRAP */
        [11] = "",                                      /* unused */
        [12] = "page fault",                            /* T_PAGEFLT */
        [13] = "",                                      /* unused */
        [14] = "alignment fault",                       /* T_ALIGNFLT */
        [15] = "",                                      /* unused */
        [16] = "",                                      /* unused */
        [17] = "",                                      /* unused */
        [18] = "integer divide fault",                  /* T_DIVIDE */
        [19] = "non-maskable interrupt trap",           /* T_NMI */
        [20] = "overflow trap",                         /* T_OFLOW */
        [21] = "FPU bounds check fault",                /* T_BOUND */
        [22] = "FPU device not available",              /* T_DNA */
        [23] = "double fault",                          /* T_DOUBLEFLT */
        [24] = "FPU operand fetch fault",               /* T_FPOPFLT */
        [25] = "invalid TSS fault",                     /* T_TSSFLT */
        [26] = "segment not present fault",             /* T_SEGNPFLT */
        [27] = "stack fault",                           /* T_STKFLT */
        [28] = "machine check trap",                    /* T_MCHK */
        [29] = "SIMD floating-point exception",         /* T_XMMFLT */
        [30] = "reserved (unknown) fault",              /* T_RESERVED */
        [31] = "",                                      /* unused (reserved) */
        [32] = "DTrace pid return trap",                /* T_DTRACE_RET */
};

/*
 * Fail the build if the table and its advertised bound ever disagree;
 * otherwise a bounds check against MAX_TRAP_MSG could allow a read
 * past the end of trap_msg[].
 */
_Static_assert(sizeof(trap_msg) / sizeof(trap_msg[0]) == MAX_TRAP_MSG + 1,
    "trap_msg[] must have exactly MAX_TRAP_MSG + 1 entries");
156
/*
 * Flag for the Pentium F00F erratum workaround; only present when the
 * workaround is compiled in.  It is not read in trap() itself.
 * NOTE(review): presumably consulted by trap_pfault(), whose -2 return
 * value trap() turns into T_PRIVINFLT -- confirm.
 */
157 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
158 int has_f00f_bug = 0;           /* Initialized so that it can be patched. */
159 #endif
160
/*
 * machdep.kdb_on_nmi: when non-zero, an NMI for which isa_nmi()
 * returns 0 prints "NMI ... going to debugger" and enters the kernel
 * debugger through kdb_trap().  Only exists in KDB kernels.
 */
161 #ifdef KDB
162 static int kdb_on_nmi = 1;
163 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN,
164         &kdb_on_nmi, 0, "Go to KDB on NMI");
165 #endif
/*
 * machdep.panic_on_nmi: governs NMIs for which isa_nmi() returns
 * non-zero.  From user mode, trap() panics when this is set; from
 * kernel mode, trap() returns quietly when it is clear and otherwise
 * falls through to trap_fatal().
 */
166 static int panic_on_nmi = 1;
167 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN,
168         &panic_on_nmi, 0, "Panic on NMI");
/*
 * machdep.prot_fault_translation: selects the signal posted for a user
 * page fault for which trap_pfault() returns a signal other than
 * SIGSEGV:
 *   0      autodetect -- SIGSEGV/SEGV_ACCERR for FreeBSD-ABI processes
 *          with p_osrel >= P_OSREL_SIGSEGV, else SIGBUS/BUS_PAGE_FAULT;
 *   1      always SIGBUS/BUS_PAGE_FAULT (compat mode);
 *   other  always SIGSEGV/SEGV_ACCERR.
 */
169 static int prot_fault_translation = 0;
170 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
171         &prot_fault_translation, 0, "Select signal to deliver on protection fault");
/*
 * machdep.uprintf_signal: when non-zero, trap() uprintf()s a one-line
 * description of the signal it is about to post for a user-mode trap
 * (pid, command, signal, error code, trap type, addresses and the
 * bytes at the faulting %eip).
 */
172 static int uprintf_signal;
173 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
174     &uprintf_signal, 0,
175     "Print debugging information on trap signal to ctty");
176
177 /*
178  * Exception, fault, and trap interface to the FreeBSD kernel.
179  * This common code is called from assembly language IDT gate entry
180  * routines that prepare a suitable stack frame, and restore this
181  * frame after the exception has been processed.
182  */
183
184 void
185 trap(struct trapframe *frame)
186 {
187 #ifdef KDTRACE_HOOKS
188         struct reg regs;
189 #endif
190         struct thread *td = curthread;
191         struct proc *p = td->td_proc;
192         int i = 0, ucode = 0, code;
193         u_int type;
194         register_t addr = 0;
195         vm_offset_t eva;
196         ksiginfo_t ksi;
197 #ifdef POWERFAIL_NMI
198         static int lastalert = 0;
199 #endif
200
201         PCPU_INC(cnt.v_trap);
202         type = frame->tf_trapno;
203
204 #ifdef SMP
205         /* Handler for NMI IPIs used for stopping CPUs. */
206         if (type == T_NMI) {
207                  if (ipi_nmi_handler() == 0)
208                            goto out;
209         }
210 #endif /* SMP */
211
212 #ifdef KDB
213         if (kdb_active) {
214                 kdb_reenter();
215                 goto out;
216         }
217 #endif
218
219         if (type == T_RESERVED) {
220                 trap_fatal(frame, 0);
221                 goto out;
222         }
223
224         if (type == T_NMI) {
225 #ifdef HWPMC_HOOKS
226                 /*
227                  * CPU PMCs interrupt using an NMI so we check for that first.
228                  * If the HWPMC module is active, 'pmc_intr' will point to
229                  * the function to be called.  A non-zero return value from the
230                  * hook means that the NMI was consumed by it and that we can
231                  * return immediately.
232                  */
233                 if (pmc_intr != NULL &&
234                     (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
235                         goto out;
236 #endif
237
238 #ifdef STACK
239                 if (stack_nmi_handler(frame) != 0)
240                         goto out;
241 #endif
242         }
243
244         if (type == T_MCHK) {
245                 mca_intr();
246                 goto out;
247         }
248
249 #ifdef KDTRACE_HOOKS
250         /*
251          * A trap can occur while DTrace executes a probe. Before
252          * executing the probe, DTrace blocks re-scheduling and sets
253          * a flag in its per-cpu flags to indicate that it doesn't
254          * want to fault. On returning from the probe, the no-fault
255          * flag is cleared and finally re-scheduling is enabled.
256          */
257         if ((type == T_PROTFLT || type == T_PAGEFLT) &&
258             dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
259                 goto out;
260 #endif
261
262         if ((frame->tf_eflags & PSL_I) == 0) {
263                 /*
264                  * Buggy application or kernel code has disabled
265                  * interrupts and then trapped.  Enabling interrupts
266                  * now is wrong, but it is better than running with
267                  * interrupts disabled until they are accidentally
268                  * enabled later.
269                  */
270                 if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
271                         uprintf(
272                             "pid %ld (%s): trap %d with interrupts disabled\n",
273                             (long)curproc->p_pid, curthread->td_name, type);
274                 else if (type != T_NMI && type != T_BPTFLT &&
275                     type != T_TRCTRAP &&
276                     frame->tf_eip != (int)cpu_switch_load_gs) {
277                         /*
278                          * XXX not quite right, since this may be for a
279                          * multiple fault in user mode.
280                          */
281                         printf("kernel trap %d with interrupts disabled\n",
282                             type);
283                         /*
284                          * Page faults need interrupts disabled until later,
285                          * and we shouldn't enable interrupts while holding
286                          * a spin lock.
287                          */
288                         if (type != T_PAGEFLT &&
289                             td->td_md.md_spinlock_count == 0)
290                                 enable_intr();
291                 }
292         }
293         eva = 0;
294         code = frame->tf_err;
295         if (type == T_PAGEFLT) {
296                 /*
297                  * For some Cyrix CPUs, %cr2 is clobbered by
298                  * interrupts.  This problem is worked around by using
299                  * an interrupt gate for the pagefault handler.  We
300                  * are finally ready to read %cr2 and conditionally
301                  * reenable interrupts.  If we hold a spin lock, then
302                  * we must not reenable interrupts.  This might be a
303                  * spurious page fault.
304                  */
305                 eva = rcr2();
306                 if (td->td_md.md_spinlock_count == 0)
307                         enable_intr();
308         }
309
310         if ((ISPL(frame->tf_cs) == SEL_UPL) ||
311             ((frame->tf_eflags & PSL_VM) && 
312                 !(curpcb->pcb_flags & PCB_VM86CALL))) {
313                 /* user trap */
314
315                 td->td_pticks = 0;
316                 td->td_frame = frame;
317                 addr = frame->tf_eip;
318                 if (td->td_cowgen != p->p_cowgen)
319                         thread_cow_update(td);
320
321                 switch (type) {
322                 case T_PRIVINFLT:       /* privileged instruction fault */
323                         i = SIGILL;
324                         ucode = ILL_PRVOPC;
325                         break;
326
327                 case T_BPTFLT:          /* bpt instruction fault */
328                 case T_TRCTRAP:         /* trace trap */
329                         enable_intr();
330 #ifdef KDTRACE_HOOKS
331                         if (type == T_BPTFLT) {
332                                 fill_frame_regs(frame, &regs);
333                                 if (dtrace_pid_probe_ptr != NULL &&
334                                     dtrace_pid_probe_ptr(&regs) == 0)
335                                         goto out;
336                         }
337 #endif
338                         frame->tf_eflags &= ~PSL_T;
339                         i = SIGTRAP;
340                         ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
341                         break;
342
343                 case T_ARITHTRAP:       /* arithmetic trap */
344 #ifdef DEV_NPX
345                         ucode = npxtrap_x87();
346                         if (ucode == -1)
347                                 goto userout;
348 #else
349                         ucode = 0;
350 #endif
351                         i = SIGFPE;
352                         break;
353
354                         /*
355                          * The following two traps can happen in
356                          * vm86 mode, and, if so, we want to handle
357                          * them specially.
358                          */
359                 case T_PROTFLT:         /* general protection fault */
360                 case T_STKFLT:          /* stack fault */
361                         if (frame->tf_eflags & PSL_VM) {
362                                 i = vm86_emulate((struct vm86frame *)frame);
363                                 if (i == 0)
364                                         goto user;
365                                 break;
366                         }
367                         i = SIGBUS;
368                         ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
369                         break;
370                 case T_SEGNPFLT:        /* segment not present fault */
371                         i = SIGBUS;
372                         ucode = BUS_ADRERR;
373                         break;
374                 case T_TSSFLT:          /* invalid TSS fault */
375                         i = SIGBUS;
376                         ucode = BUS_OBJERR;
377                         break;
378                 case T_ALIGNFLT:
379                         i = SIGBUS;
380                         ucode = BUS_ADRALN;
381                         break;
382                 case T_DOUBLEFLT:       /* double fault */
383                 default:
384                         i = SIGBUS;
385                         ucode = BUS_OBJERR;
386                         break;
387
388                 case T_PAGEFLT:         /* page fault */
389
390                         i = trap_pfault(frame, TRUE, eva);
391 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
392                         if (i == -2) {
393                                 /*
394                                  * The f00f hack workaround has triggered, so
395                                  * treat the fault as an illegal instruction 
396                                  * (T_PRIVINFLT) instead of a page fault.
397                                  */
398                                 type = frame->tf_trapno = T_PRIVINFLT;
399
400                                 /* Proceed as in that case. */
401                                 ucode = ILL_PRVOPC;
402                                 i = SIGILL;
403                                 break;
404                         }
405 #endif
406                         if (i == -1)
407                                 goto userout;
408                         if (i == 0)
409                                 goto user;
410
411                         if (i == SIGSEGV)
412                                 ucode = SEGV_MAPERR;
413                         else {
414                                 if (prot_fault_translation == 0) {
415                                         /*
416                                          * Autodetect.
417                                          * This check also covers the images
418                                          * without the ABI-tag ELF note.
419                                          */
420                                         if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
421                                             && p->p_osrel >= P_OSREL_SIGSEGV) {
422                                                 i = SIGSEGV;
423                                                 ucode = SEGV_ACCERR;
424                                         } else {
425                                                 i = SIGBUS;
426                                                 ucode = BUS_PAGE_FAULT;
427                                         }
428                                 } else if (prot_fault_translation == 1) {
429                                         /*
430                                          * Always compat mode.
431                                          */
432                                         i = SIGBUS;
433                                         ucode = BUS_PAGE_FAULT;
434                                 } else {
435                                         /*
436                                          * Always SIGSEGV mode.
437                                          */
438                                         i = SIGSEGV;
439                                         ucode = SEGV_ACCERR;
440                                 }
441                         }
442                         addr = eva;
443                         break;
444
445                 case T_DIVIDE:          /* integer divide fault */
446                         ucode = FPE_INTDIV;
447                         i = SIGFPE;
448                         break;
449
450 #ifdef DEV_ISA
451                 case T_NMI:
452 #ifdef POWERFAIL_NMI
453 #ifndef TIMER_FREQ
454 #  define TIMER_FREQ 1193182
455 #endif
456                         if (time_second - lastalert > 10) {
457                                 log(LOG_WARNING, "NMI: power fail\n");
458                                 sysbeep(880, hz);
459                                 lastalert = time_second;
460                         }
461                         goto userout;
462 #else /* !POWERFAIL_NMI */
463                         /* machine/parity/power fail/"kitchen sink" faults */
464                         if (isa_nmi(code) == 0) {
465 #ifdef KDB
466                                 /*
467                                  * NMI can be hooked up to a pushbutton
468                                  * for debugging.
469                                  */
470                                 if (kdb_on_nmi) {
471                                         printf ("NMI ... going to debugger\n");
472                                         kdb_trap(type, 0, frame);
473                                 }
474 #endif /* KDB */
475                                 goto userout;
476                         } else if (panic_on_nmi)
477                                 panic("NMI indicates hardware failure");
478                         break;
479 #endif /* POWERFAIL_NMI */
480 #endif /* DEV_ISA */
481
482                 case T_OFLOW:           /* integer overflow fault */
483                         ucode = FPE_INTOVF;
484                         i = SIGFPE;
485                         break;
486
487                 case T_BOUND:           /* bounds check fault */
488                         ucode = FPE_FLTSUB;
489                         i = SIGFPE;
490                         break;
491
492                 case T_DNA:
493 #ifdef DEV_NPX
494                         KASSERT(PCB_USER_FPU(td->td_pcb),
495                             ("kernel FPU ctx has leaked"));
496                         /* transparent fault (due to context switch "late") */
497                         if (npxdna())
498                                 goto userout;
499 #endif
500                         uprintf("pid %d killed due to lack of floating point\n",
501                                 p->p_pid);
502                         i = SIGKILL;
503                         ucode = 0;
504                         break;
505
506                 case T_FPOPFLT:         /* FPU operand fetch fault */
507                         ucode = ILL_COPROC;
508                         i = SIGILL;
509                         break;
510
511                 case T_XMMFLT:          /* SIMD floating-point exception */
512 #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
513                         ucode = npxtrap_sse();
514                         if (ucode == -1)
515                                 goto userout;
516 #else
517                         ucode = 0;
518 #endif
519                         i = SIGFPE;
520                         break;
521 #ifdef KDTRACE_HOOKS
522                 case T_DTRACE_RET:
523                         enable_intr();
524                         fill_frame_regs(frame, &regs);
525                         if (dtrace_return_probe_ptr != NULL &&
526                             dtrace_return_probe_ptr(&regs) == 0)
527                                 goto out;
528                         break;
529 #endif
530                 }
531         } else {
532                 /* kernel trap */
533
534                 KASSERT(cold || td->td_ucred != NULL,
535                     ("kernel trap doesn't have ucred"));
536                 switch (type) {
537                 case T_PAGEFLT:                 /* page fault */
538                         (void) trap_pfault(frame, FALSE, eva);
539                         goto out;
540
541                 case T_DNA:
542 #ifdef DEV_NPX
543                         if (PCB_USER_FPU(td->td_pcb))
544                                 panic("Unregistered use of FPU in kernel");
545                         if (npxdna())
546                                 goto out;
547 #endif
548                         break;
549
550                 case T_ARITHTRAP:       /* arithmetic trap */
551                 case T_XMMFLT:          /* SIMD floating-point exception */
552                 case T_FPOPFLT:         /* FPU operand fetch fault */
553                         /*
554                          * XXXKIB for now disable any FPU traps in kernel
555                          * handler registration seems to be overkill
556                          */
557                         trap_fatal(frame, 0);
558                         goto out;
559
560                         /*
561                          * The following two traps can happen in
562                          * vm86 mode, and, if so, we want to handle
563                          * them specially.
564                          */
565                 case T_PROTFLT:         /* general protection fault */
566                 case T_STKFLT:          /* stack fault */
567                         if (frame->tf_eflags & PSL_VM) {
568                                 i = vm86_emulate((struct vm86frame *)frame);
569                                 if (i != 0)
570                                         /*
571                                          * returns to original process
572                                          */
573                                         vm86_trap((struct vm86frame *)frame);
574                                 goto out;
575                         }
576                         if (type == T_STKFLT)
577                                 break;
578
579                         /* FALL THROUGH */
580
581                 case T_SEGNPFLT:        /* segment not present fault */
582                         if (curpcb->pcb_flags & PCB_VM86CALL)
583                                 break;
584
585                         /*
586                          * Invalid %fs's and %gs's can be created using
587                          * procfs or PT_SETREGS or by invalidating the
588                          * underlying LDT entry.  This causes a fault
589                          * in kernel mode when the kernel attempts to
590                          * switch contexts.  Lose the bad context
591                          * (XXX) so that we can continue, and generate
592                          * a signal.
593                          */
594                         if (frame->tf_eip == (int)cpu_switch_load_gs) {
595                                 curpcb->pcb_gs = 0;
596 #if 0                           
597                                 PROC_LOCK(p);
598                                 kern_psignal(p, SIGBUS);
599                                 PROC_UNLOCK(p);
600 #endif                          
601                                 goto out;
602                         }
603
604                         if (td->td_intr_nesting_level != 0)
605                                 break;
606
607                         /*
608                          * Invalid segment selectors and out of bounds
609                          * %eip's and %esp's can be set up in user mode.
610                          * This causes a fault in kernel mode when the
611                          * kernel tries to return to user mode.  We want
612                          * to get this fault so that we can fix the
613                          * problem here and not have to check all the
614                          * selectors and pointers when the user changes
615                          * them.
616                          */
617                         if (frame->tf_eip == (int)doreti_iret) {
618                                 frame->tf_eip = (int)doreti_iret_fault;
619                                 goto out;
620                         }
621                         if (frame->tf_eip == (int)doreti_popl_ds) {
622                                 frame->tf_eip = (int)doreti_popl_ds_fault;
623                                 goto out;
624                         }
625                         if (frame->tf_eip == (int)doreti_popl_es) {
626                                 frame->tf_eip = (int)doreti_popl_es_fault;
627                                 goto out;
628                         }
629                         if (frame->tf_eip == (int)doreti_popl_fs) {
630                                 frame->tf_eip = (int)doreti_popl_fs_fault;
631                                 goto out;
632                         }
633                         if (curpcb->pcb_onfault != NULL) {
634                                 frame->tf_eip =
635                                     (int)curpcb->pcb_onfault;
636                                 goto out;
637                         }
638                         break;
639
640                 case T_TSSFLT:
641                         /*
642                          * PSL_NT can be set in user mode and isn't cleared
643                          * automatically when the kernel is entered.  This
644                          * causes a TSS fault when the kernel attempts to
645                          * `iret' because the TSS link is uninitialized.  We
646                          * want to get this fault so that we can fix the
647                          * problem here and not every time the kernel is
648                          * entered.
649                          */
650                         if (frame->tf_eflags & PSL_NT) {
651                                 frame->tf_eflags &= ~PSL_NT;
652                                 goto out;
653                         }
654                         break;
655
656                 case T_TRCTRAP:  /* trace trap */
657                         if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
658                                 /*
659                                  * We've just entered system mode via the
660                                  * syscall lcall.  Continue single stepping
661                                  * silently until the syscall handler has
662                                  * saved the flags.
663                                  */
664                                 goto out;
665                         }
666                         if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
667                                 /*
668                                  * The syscall handler has now saved the
669                                  * flags.  Stop single stepping it.
670                                  */
671                                 frame->tf_eflags &= ~PSL_T;
672                                 goto out;
673                         }
674                         /*
675                          * Ignore debug register trace traps due to
676                          * accesses in the user's address space, which
677                          * can happen under several conditions such as
678                          * if a user sets a watchpoint on a buffer and
679                          * then passes that buffer to a system call.
680                          * We still want to get TRCTRAPS for addresses
681                          * in kernel space because that is useful when
682                          * debugging the kernel.
683                          */
684                         if (user_dbreg_trap() && 
685                            !(curpcb->pcb_flags & PCB_VM86CALL)) {
686                                 /*
687                                  * Reset breakpoint bits because the
688                                  * processor doesn't
689                                  */
690                                 load_dr6(rdr6() & 0xfffffff0);
691                                 goto out;
692                         }
693                         /*
694                          * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
695                          */
696                 case T_BPTFLT:
697                         /*
698                          * If KDB is enabled, let it handle the debugger trap.
699                          * Otherwise, debugger traps "can't happen".
700                          */
701 #ifdef KDB
702                         if (kdb_trap(type, 0, frame))
703                                 goto out;
704 #endif
705                         break;
706
707 #ifdef DEV_ISA
708                 case T_NMI:
709 #ifdef POWERFAIL_NMI
710                         if (time_second - lastalert > 10) {
711                                 log(LOG_WARNING, "NMI: power fail\n");
712                                 sysbeep(880, hz);
713                                 lastalert = time_second;
714                         }
715                         goto out;
716 #else /* !POWERFAIL_NMI */
717                         /* machine/parity/power fail/"kitchen sink" faults */
718                         if (isa_nmi(code) == 0) {
719 #ifdef KDB
720                                 /*
721                                  * NMI can be hooked up to a pushbutton
722                                  * for debugging.
723                                  */
724                                 if (kdb_on_nmi) {
725                                         printf ("NMI ... going to debugger\n");
726                                         kdb_trap(type, 0, frame);
727                                 }
728 #endif /* KDB */
729                                 goto out;
730                         } else if (panic_on_nmi == 0)
731                                 goto out;
732                         /* FALLTHROUGH */
733 #endif /* POWERFAIL_NMI */
734 #endif /* DEV_ISA */
735                 }
736
737                 trap_fatal(frame, eva);
738                 goto out;
739         }
740
741         /* Translate fault for emulators (e.g. Linux) */
742         if (*p->p_sysent->sv_transtrap)
743                 i = (*p->p_sysent->sv_transtrap)(i, type);
744
745         ksiginfo_init_trap(&ksi);
746         ksi.ksi_signo = i;
747         ksi.ksi_code = ucode;
748         ksi.ksi_addr = (void *)addr;
749         ksi.ksi_trapno = type;
750         if (uprintf_signal) {
751                 uprintf("pid %d comm %s: signal %d err %x code %d type %d "
752                     "addr 0x%x esp 0x%08x eip 0x%08x "
753                     "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
754                     p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
755                     frame->tf_esp, frame->tf_eip,
756                     fubyte((void *)(frame->tf_eip + 0)),
757                     fubyte((void *)(frame->tf_eip + 1)),
758                     fubyte((void *)(frame->tf_eip + 2)),
759                     fubyte((void *)(frame->tf_eip + 3)),
760                     fubyte((void *)(frame->tf_eip + 4)),
761                     fubyte((void *)(frame->tf_eip + 5)),
762                     fubyte((void *)(frame->tf_eip + 6)),
763                     fubyte((void *)(frame->tf_eip + 7)));
764         }
765         KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
766         trapsignal(td, &ksi);
767
768 #ifdef DEBUG
769         if (type <= MAX_TRAP_MSG) {
770                 uprintf("fatal process exception: %s",
771                         trap_msg[type]);
772                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
773                         uprintf(", fault VA = 0x%lx", (u_long)eva);
774                 uprintf("\n");
775         }
776 #endif
777
778 user:
779         userret(td, frame);
780         KASSERT(PCB_USER_FPU(td->td_pcb),
781             ("Return from trap with kernel FPU ctx leaked"));
782 userout:
783 out:
784         return;
785 }
786
787 static int
788 trap_pfault(frame, usermode, eva)
789         struct trapframe *frame;
790         int usermode;
791         vm_offset_t eva;
792 {
793         vm_offset_t va;
794         vm_map_t map;
795         int rv = 0;
796         vm_prot_t ftype;
797         struct thread *td = curthread;
798         struct proc *p = td->td_proc;
799
800         if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
801                 /*
802                  * Due to both processor errata and lazy TLB invalidation when
803                  * access restrictions are removed from virtual pages, memory
804                  * accesses that are allowed by the physical mapping layer may
805                  * nonetheless cause one spurious page fault per virtual page. 
806                  * When the thread is executing a "no faulting" section that
807                  * is bracketed by vm_fault_{disable,enable}_pagefaults(),
808                  * every page fault is treated as a spurious page fault,
809                  * unless it accesses the same virtual address as the most
810                  * recent page fault within the same "no faulting" section.
811                  */
812                 if (td->td_md.md_spurflt_addr != eva ||
813                     (td->td_pflags & TDP_RESETSPUR) != 0) {
814                         /*
815                          * Do nothing to the TLB.  A stale TLB entry is
816                          * flushed automatically by a page fault.
817                          */
818                         td->td_md.md_spurflt_addr = eva;
819                         td->td_pflags &= ~TDP_RESETSPUR;
820                         return (0);
821                 }
822         } else {
823                 /*
824                  * If we get a page fault while in a critical section, then
825                  * it is most likely a fatal kernel page fault.  The kernel
826                  * is already going to panic trying to get a sleep lock to
827                  * do the VM lookup, so just consider it a fatal trap so the
828                  * kernel can print out a useful trap message and even get
829                  * to the debugger.
830                  *
831                  * If we get a page fault while holding a non-sleepable
832                  * lock, then it is most likely a fatal kernel page fault.
833                  * If WITNESS is enabled, then it's going to whine about
834                  * bogus LORs with various VM locks, so just skip to the
835                  * fatal trap handling directly.
836                  */
837                 if (td->td_critnest != 0 ||
838                     WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
839                     "Kernel page fault") != 0) {
840                         trap_fatal(frame, eva);
841                         return (-1);
842                 }
843         }
844         va = trunc_page(eva);
845         if (va >= KERNBASE) {
846                 /*
847                  * Don't allow user-mode faults in kernel address space.
848                  * An exception:  if the faulting address is the invalid
849                  * instruction entry in the IDT, then the Intel Pentium
850                  * F00F bug workaround was triggered, and we need to
851                  * treat it is as an illegal instruction, and not a page
852                  * fault.
853                  */
854 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
855                 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
856                         return (-2);
857 #endif
858                 if (usermode)
859                         goto nogo;
860
861                 map = kernel_map;
862         } else {
863                 map = &p->p_vmspace->vm_map;
864
865                 /*
866                  * When accessing a user-space address, kernel must be
867                  * ready to accept the page fault, and provide a
868                  * handling routine.  Since accessing the address
869                  * without the handler is a bug, do not try to handle
870                  * it normally, and panic immediately.
871                  */
872                 if (!usermode && (td->td_intr_nesting_level != 0 ||
873                     curpcb->pcb_onfault == NULL)) {
874                         trap_fatal(frame, eva);
875                         return (-1);
876                 }
877         }
878
879         /*
880          * PGEX_I is defined only if the execute disable bit capability is
881          * supported and enabled.
882          */
883         if (frame->tf_err & PGEX_W)
884                 ftype = VM_PROT_WRITE;
885 #if defined(PAE) || defined(PAE_TABLES)
886         else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
887                 ftype = VM_PROT_EXECUTE;
888 #endif
889         else
890                 ftype = VM_PROT_READ;
891
892         /* Fault in the page. */
893         rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
894         if (rv == KERN_SUCCESS) {
895 #ifdef HWPMC_HOOKS
896                 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
897                         PMC_SOFT_CALL_TF( , , page_fault, all, frame);
898                         if (ftype == VM_PROT_READ)
899                                 PMC_SOFT_CALL_TF( , , page_fault, read,
900                                     frame);
901                         else
902                                 PMC_SOFT_CALL_TF( , , page_fault, write,
903                                     frame);
904                 }
905 #endif
906                 return (0);
907         }
908 nogo:
909         if (!usermode) {
910                 if (td->td_intr_nesting_level == 0 &&
911                     curpcb->pcb_onfault != NULL) {
912                         frame->tf_eip = (int)curpcb->pcb_onfault;
913                         return (0);
914                 }
915                 trap_fatal(frame, eva);
916                 return (-1);
917         }
918         return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
919 }
920
921 static void
922 trap_fatal(frame, eva)
923         struct trapframe *frame;
924         vm_offset_t eva;
925 {
926         int code, ss, esp;
927         u_int type;
928         struct soft_segment_descriptor softseg;
929         char *msg;
930
931         code = frame->tf_err;
932         type = frame->tf_trapno;
933         sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
934
935         if (type <= MAX_TRAP_MSG)
936                 msg = trap_msg[type];
937         else
938                 msg = "UNKNOWN";
939         printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
940             frame->tf_eflags & PSL_VM ? "vm86" :
941             ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
942 #ifdef SMP
943         /* two separate prints in case of a trap on an unmapped page */
944         printf("cpuid = %d; ", PCPU_GET(cpuid));
945         printf("apic id = %02x\n", PCPU_GET(apic_id));
946 #endif
947         if (type == T_PAGEFLT) {
948                 printf("fault virtual address   = 0x%x\n", eva);
949                 printf("fault code              = %s %s, %s\n",
950                         code & PGEX_U ? "user" : "supervisor",
951                         code & PGEX_W ? "write" : "read",
952                         code & PGEX_P ? "protection violation" : "page not present");
953         }
954         printf("instruction pointer     = 0x%x:0x%x\n",
955                frame->tf_cs & 0xffff, frame->tf_eip);
956         if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
957                 ss = frame->tf_ss & 0xffff;
958                 esp = frame->tf_esp;
959         } else {
960                 ss = GSEL(GDATA_SEL, SEL_KPL);
961                 esp = (int)&frame->tf_esp;
962         }
963         printf("stack pointer           = 0x%x:0x%x\n", ss, esp);
964         printf("frame pointer           = 0x%x:0x%x\n", ss, frame->tf_ebp);
965         printf("code segment            = base 0x%x, limit 0x%x, type 0x%x\n",
966                softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
967         printf("                        = DPL %d, pres %d, def32 %d, gran %d\n",
968                softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
969                softseg.ssd_gran);
970         printf("processor eflags        = ");
971         if (frame->tf_eflags & PSL_T)
972                 printf("trace trap, ");
973         if (frame->tf_eflags & PSL_I)
974                 printf("interrupt enabled, ");
975         if (frame->tf_eflags & PSL_NT)
976                 printf("nested task, ");
977         if (frame->tf_eflags & PSL_RF)
978                 printf("resume, ");
979         if (frame->tf_eflags & PSL_VM)
980                 printf("vm86, ");
981         printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
982         printf("current process         = %d (%s)\n",
983             curproc->p_pid, curthread->td_name);
984
985 #ifdef KDB
986         if (debugger_on_panic || kdb_active) {
987                 frame->tf_err = eva;    /* smuggle fault address to ddb */
988                 if (kdb_trap(type, 0, frame)) {
989                         frame->tf_err = code;   /* restore error code */
990                         return;
991                 }
992                 frame->tf_err = code;           /* restore error code */
993         }
994 #endif
995         printf("trap number             = %d\n", type);
996         if (type <= MAX_TRAP_MSG)
997                 panic("%s", trap_msg[type]);
998         else
999                 panic("unknown/reserved trap");
1000 }
1001
1002 /*
1003  * Double fault handler. Called when a fault occurs while writing
1004  * a frame for a trap/exception onto the stack. This usually occurs
1005  * when the stack overflows (such is the case with infinite recursion,
1006  * for example).
1007  *
1008  * XXX Note that the current PTD gets replaced by IdlePTD when the
1009  * task switch occurs. This means that the stack that was active at
1010  * the time of the double fault is not available at <kstack> unless
1011  * the machine was idle when the double fault occurred. The downside
1012  * of this is that "trace <ebp>" in ddb won't work.
1013  */
1014 void
1015 dblfault_handler()
1016 {
1017 #ifdef KDTRACE_HOOKS
1018         if (dtrace_doubletrap_func != NULL)
1019                 (*dtrace_doubletrap_func)();
1020 #endif
1021         printf("\nFatal double fault:\n");
1022         printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
1023         printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
1024         printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
1025 #ifdef SMP
1026         /* two separate prints in case of a trap on an unmapped page */
1027         printf("cpuid = %d; ", PCPU_GET(cpuid));
1028         printf("apic id = %02x\n", PCPU_GET(apic_id));
1029 #endif
1030         panic("double fault");
1031 }
1032
1033 int
1034 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
1035 {
1036         struct proc *p;
1037         struct trapframe *frame;
1038         caddr_t params;
1039         long tmp;
1040         int error;
1041
1042         p = td->td_proc;
1043         frame = td->td_frame;
1044
1045         params = (caddr_t)frame->tf_esp + sizeof(int);
1046         sa->code = frame->tf_eax;
1047
1048         /*
1049          * Need to check if this is a 32 bit or 64 bit syscall.
1050          */
1051         if (sa->code == SYS_syscall) {
1052                 /*
1053                  * Code is first argument, followed by actual args.
1054                  */
1055                 error = fueword(params, &tmp);
1056                 if (error == -1)
1057                         return (EFAULT);
1058                 sa->code = tmp;
1059                 params += sizeof(int);
1060         } else if (sa->code == SYS___syscall) {
1061                 /*
1062                  * Like syscall, but code is a quad, so as to maintain
1063                  * quad alignment for the rest of the arguments.
1064                  */
1065                 error = fueword(params, &tmp);
1066                 if (error == -1)
1067                         return (EFAULT);
1068                 sa->code = tmp;
1069                 params += sizeof(quad_t);
1070         }
1071
1072         if (p->p_sysent->sv_mask)
1073                 sa->code &= p->p_sysent->sv_mask;
1074         if (sa->code >= p->p_sysent->sv_size)
1075                 sa->callp = &p->p_sysent->sv_table[0];
1076         else
1077                 sa->callp = &p->p_sysent->sv_table[sa->code];
1078         sa->narg = sa->callp->sy_narg;
1079
1080         if (params != NULL && sa->narg != 0)
1081                 error = copyin(params, (caddr_t)sa->args,
1082                     (u_int)(sa->narg * sizeof(int)));
1083         else
1084                 error = 0;
1085
1086         if (error == 0) {
1087                 td->td_retval[0] = 0;
1088                 td->td_retval[1] = frame->tf_edx;
1089         }
1090                 
1091         return (error);
1092 }
1093
1094 #include "../../kern/subr_syscall.c"
1095
1096 /*
1097  * syscall - system call request C handler.  A system call is
1098  * essentially treated as a trap by reusing the frame layout.
1099  */
1100 void
1101 syscall(struct trapframe *frame)
1102 {
1103         struct thread *td;
1104         struct syscall_args sa;
1105         register_t orig_tf_eflags;
1106         int error;
1107         ksiginfo_t ksi;
1108
1109 #ifdef DIAGNOSTIC
1110         if (ISPL(frame->tf_cs) != SEL_UPL) {
1111                 panic("syscall");
1112                 /* NOT REACHED */
1113         }
1114 #endif
1115         orig_tf_eflags = frame->tf_eflags;
1116
1117         td = curthread;
1118         td->td_frame = frame;
1119
1120         error = syscallenter(td, &sa);
1121
1122         /*
1123          * Traced syscall.
1124          */
1125         if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1126                 frame->tf_eflags &= ~PSL_T;
1127                 ksiginfo_init_trap(&ksi);
1128                 ksi.ksi_signo = SIGTRAP;
1129                 ksi.ksi_code = TRAP_TRACE;
1130                 ksi.ksi_addr = (void *)frame->tf_eip;
1131                 trapsignal(td, &ksi);
1132         }
1133
1134         KASSERT(PCB_USER_FPU(td->td_pcb),
1135             ("System call %s returning with kernel FPU ctx leaked",
1136              syscallname(td->td_proc, sa.code)));
1137         KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1138             ("System call %s returning with mangled pcb_save",
1139              syscallname(td->td_proc, sa.code)));
1140
1141         syscallret(td, error, &sa);
1142 }