CyberLeo.Net >> Repos - FreeBSD/stable/9.git/blob - sys/i386/i386/trap.c
MFC r363988:
[FreeBSD/stable/9.git] / sys / i386 / i386 / trap.c
1 /*-
2  * Copyright (C) 1994, David Greenman
3  * Copyright (c) 1990, 1993
4  *      The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the University of Utah, and William Jolitz.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *      This product includes software developed by the University of
20  *      California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 /*
44  * 386 Trap and System call handling
45  */
46
47 #include "opt_clock.h"
48 #include "opt_cpu.h"
49 #include "opt_hwpmc_hooks.h"
50 #include "opt_isa.h"
51 #include "opt_kdb.h"
52 #include "opt_kdtrace.h"
53 #include "opt_npx.h"
54 #include "opt_trap.h"
55
56 #include <sys/param.h>
57 #include <sys/bus.h>
58 #include <sys/systm.h>
59 #include <sys/proc.h>
60 #include <sys/pioctl.h>
61 #include <sys/ptrace.h>
62 #include <sys/kdb.h>
63 #include <sys/kernel.h>
64 #include <sys/ktr.h>
65 #include <sys/lock.h>
66 #include <sys/mutex.h>
67 #include <sys/resourcevar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscall.h>
70 #include <sys/sysctl.h>
71 #include <sys/sysent.h>
72 #include <sys/uio.h>
73 #include <sys/vmmeter.h>
74 #ifdef HWPMC_HOOKS
75 #include <sys/pmckern.h>
76 PMC_SOFT_DEFINE( , , page_fault, all);
77 PMC_SOFT_DEFINE( , , page_fault, read);
78 PMC_SOFT_DEFINE( , , page_fault, write);
79 #endif
80 #include <security/audit/audit.h>
81
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_extern.h>
89
90 #include <machine/cpu.h>
91 #include <machine/intr_machdep.h>
92 #include <x86/mca.h>
93 #include <machine/md_var.h>
94 #include <machine/pcb.h>
95 #ifdef SMP
96 #include <machine/smp.h>
97 #endif
98 #include <machine/tss.h>
99 #include <machine/vm86.h>
100
101 #ifdef POWERFAIL_NMI
102 #include <sys/syslog.h>
103 #include <machine/clock.h>
104 #endif
105
106 #ifdef KDTRACE_HOOKS
107 #include <sys/dtrace_bsd.h>
108
109 /*
110  * Hook initialised by the dtrace module to handle traps that occur
111  * during DTrace probe execution.  trap() calls it for T_PROTFLT and
112  * T_PAGEFLT; a non-zero return makes trap() jump straight to "out".
113  */
114 dtrace_trap_func_t      dtrace_trap_func;
115 /* Not referenced in the code visible here; presumably the double-fault hook -- TODO confirm. */
116 dtrace_doubletrap_func_t        dtrace_doubletrap_func;
117
118 /*
119  * This is a hook which is initialised by the systrace module
120  * when it is loaded.  This keeps the DTrace syscall provider
121  * implementation opaque.
122  */
123 systrace_probe_func_t   systrace_probe_func;
124
125 /*
126  * These hooks are necessary for the pid and usdt providers.
     * trap() calls dtrace_pid_probe_ptr for a user-mode T_BPTFLT and
     * dtrace_return_probe_ptr for a user-mode T_DTRACE_RET; when the
     * hook is set and returns 0, trap() jumps straight to "out".
127  */
128 dtrace_pid_probe_ptr_t          dtrace_pid_probe_ptr;
129 dtrace_return_probe_ptr_t       dtrace_return_probe_ptr;
130 #endif
131 /* Entry points that take a trap frame; trap() is defined below. */
132 extern void trap(struct trapframe *frame);
133 extern void syscall(struct trapframe *frame);
134 /*
     * trap_pfault(): trap() passes TRUE as the second argument for user
     * traps and FALSE for kernel traps.  On the user path a result of -2
     * (F00F workaround, when compiled in), -1 or 0 is handled specially;
     * any other result is used to choose the signal to deliver.
     * trap_fatal(): called by trap() for T_RESERVED, for kernel-mode FPU
     * traps, and for kernel traps that no switch case recovered from.
     */
135 static int trap_pfault(struct trapframe *, int, vm_offset_t);
136 static void trap_fatal(struct trapframe *, vm_offset_t);
137 void dblfault_handler(void);
138 /* Address compared with tf_eip in trap()'s kernel-mode T_TRCTRAP case. */
139 extern inthand_t IDTVEC(lcall_syscall);
140 /*
     * Human-readable trap names.  Per the per-entry comments, the array
     * index is the T_* trap number, and MAX_TRAP_MSG (32) is the index of
     * the last entry.  Unused trap numbers map to the empty string.
     * NOTE(review): the code that indexes this table is outside the part
     * of the file shown here -- confirm that it bounds-checks against
     * MAX_TRAP_MSG, and keep the two in sync when adding entries.
     */
141 #define MAX_TRAP_MSG            32
142 static char *trap_msg[] = {
143         "",                                     /*  0 unused */
144         "privileged instruction fault",         /*  1 T_PRIVINFLT */
145         "",                                     /*  2 unused */
146         "breakpoint instruction fault",         /*  3 T_BPTFLT */
147         "",                                     /*  4 unused */
148         "",                                     /*  5 unused */
149         "arithmetic trap",                      /*  6 T_ARITHTRAP */
150         "",                                     /*  7 unused */
151         "",                                     /*  8 unused */
152         "general protection fault",             /*  9 T_PROTFLT */
153         "trace trap",                           /* 10 T_TRCTRAP */
154         "",                                     /* 11 unused */
155         "page fault",                           /* 12 T_PAGEFLT */
156         "",                                     /* 13 unused */
157         "alignment fault",                      /* 14 T_ALIGNFLT */
158         "",                                     /* 15 unused */
159         "",                                     /* 16 unused */
160         "",                                     /* 17 unused */
161         "integer divide fault",                 /* 18 T_DIVIDE */
162         "non-maskable interrupt trap",          /* 19 T_NMI */
163         "overflow trap",                        /* 20 T_OFLOW */
164         "FPU bounds check fault",               /* 21 T_BOUND */
165         "FPU device not available",             /* 22 T_DNA */
166         "double fault",                         /* 23 T_DOUBLEFLT */
167         "FPU operand fetch fault",              /* 24 T_FPOPFLT */
168         "invalid TSS fault",                    /* 25 T_TSSFLT */
169         "segment not present fault",            /* 26 T_SEGNPFLT */
170         "stack fault",                          /* 27 T_STKFLT */
171         "machine check trap",                   /* 28 T_MCHK */
172         "SIMD floating-point exception",        /* 29 T_XMMFLT */
173         "reserved (unknown) fault",             /* 30 T_RESERVED */
174         "",                                     /* 31 unused (reserved) */
175         "DTrace pid return trap",               /* 32 T_DTRACE_RET */
176 };
177 /* Only declared when the F00F workaround is built (I586_CPU without NO_F00F_HACK). */
178 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
179 extern int has_f00f_bug;
180 #endif
181 /*
     * Run-time knobs exported read/write under machdep.*:
     *
     *  kdb_on_nmi (default 1, KDB kernels only; also a loader tunable):
     *      in DEV_ISA builds without POWERFAIL_NMI, when isa_nmi() returns
     *      0, trap() enters the debugger via kdb_trap() if this is non-zero.
     *
     *  panic_on_nmi (default 1; also a loader tunable): in the same builds,
     *      when isa_nmi() returns non-zero, a user-mode NMI panics if this
     *      is set; a kernel-mode NMI is ignored if this is 0 and otherwise
     *      falls through to trap_fatal().
     *
     *  prot_fault_translation (default 0; sysctl only, no TUNABLE_INT):
     *      chooses the signal for a user page fault for which
     *      trap_pfault() returned a signal other than SIGSEGV:
     *        0     - SIGSEGV for FreeBSD-ABI processes with
     *                p_osrel >= P_OSREL_SIGSEGV, SIGBUS otherwise;
     *        1     - always SIGBUS;
     *        other - always SIGSEGV.
     *
     *  uprintf_signal (default 0; sysctl only): not read in the code
     *      visible here; see its sysctl description string.
     */
182 #ifdef KDB
183 static int kdb_on_nmi = 1;
184 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
185         &kdb_on_nmi, 0, "Go to KDB on NMI");
186 TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
187 #endif
188 static int panic_on_nmi = 1;
189 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
190         &panic_on_nmi, 0, "Panic on NMI");
191 TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
192 static int prot_fault_translation = 0;
193 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
194         &prot_fault_translation, 0, "Select signal to deliver on protection fault");
195 static int uprintf_signal;
196 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
197     &uprintf_signal, 0,
198     "Print debugging information on trap signal to ctty");
199
200 /*
201  * Exception, fault, and trap interface to the FreeBSD kernel.
202  * This common code is called from assembly language IDT gate entry
203  * routines that prepare a suitable stack frame, and restore this
204  * frame after the exception has been processed.
205  */
206
207 void
208 trap(struct trapframe *frame)
209 {
210 #ifdef KDTRACE_HOOKS
211         struct reg regs;
212 #endif
213         struct thread *td = curthread;
214         struct proc *p = td->td_proc;
215         int i = 0, ucode = 0, code;
216         u_int type;
217         register_t addr = 0;
218         vm_offset_t eva;
219         ksiginfo_t ksi;
220 #ifdef POWERFAIL_NMI
221         static int lastalert = 0;
222 #endif
223
224         PCPU_INC(cnt.v_trap);
225         type = frame->tf_trapno;
226
227 #ifdef SMP
228         /* Handler for NMI IPIs used for stopping CPUs. */
229         if (type == T_NMI) {
230                  if (ipi_nmi_handler() == 0)
231                            goto out;
232         }
233 #endif /* SMP */
234
235 #ifdef KDB
236         if (kdb_active) {
237                 kdb_reenter();
238                 goto out;
239         }
240 #endif
241
242         if (type == T_RESERVED) {
243                 trap_fatal(frame, 0);
244                 goto out;
245         }
246
247 #ifdef  HWPMC_HOOKS
248         /*
249          * CPU PMCs interrupt using an NMI so we check for that first.
250          * If the HWPMC module is active, 'pmc_hook' will point to
251          * the function to be called.  A return value of '1' from the
252          * hook means that the NMI was handled by it and that we can
253          * return immediately.
254          */
255         if (type == T_NMI && pmc_intr &&
256             (*pmc_intr)(PCPU_GET(cpuid), frame))
257             goto out;
258 #endif
259
260         if (type == T_MCHK) {
261                 mca_intr();
262                 goto out;
263         }
264
265 #ifdef KDTRACE_HOOKS
266         /*
267          * A trap can occur while DTrace executes a probe. Before
268          * executing the probe, DTrace blocks re-scheduling and sets
269          * a flag in its per-cpu flags to indicate that it doesn't
270          * want to fault. On returning from the probe, the no-fault
271          * flag is cleared and finally re-scheduling is enabled.
272          */
273         if ((type == T_PROTFLT || type == T_PAGEFLT) &&
274             dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
275                 goto out;
276 #endif
277
278         if ((frame->tf_eflags & PSL_I) == 0) {
279                 /*
280                  * Buggy application or kernel code has disabled
281                  * interrupts and then trapped.  Enabling interrupts
282                  * now is wrong, but it is better than running with
283                  * interrupts disabled until they are accidentally
284                  * enabled later.
285                  */
286                 if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
287                         uprintf(
288                             "pid %ld (%s): trap %d with interrupts disabled\n",
289                             (long)curproc->p_pid, curthread->td_name, type);
290                 else if (type != T_BPTFLT && type != T_TRCTRAP &&
291                          frame->tf_eip != (int)cpu_switch_load_gs) {
292                         /*
293                          * XXX not quite right, since this may be for a
294                          * multiple fault in user mode.
295                          */
296                         printf("kernel trap %d with interrupts disabled\n",
297                             type);
298                         /*
299                          * Page faults need interrupts disabled until later,
300                          * and we shouldn't enable interrupts while holding
301                          * a spin lock or if servicing an NMI.
302                          */
303                         if (type != T_NMI && type != T_PAGEFLT &&
304                             td->td_md.md_spinlock_count == 0)
305                                 enable_intr();
306                 }
307         }
308         eva = 0;
309         code = frame->tf_err;
310         if (type == T_PAGEFLT) {
311                 /*
312                  * For some Cyrix CPUs, %cr2 is clobbered by
313                  * interrupts.  This problem is worked around by using
314                  * an interrupt gate for the pagefault handler.  We
315                  * are finally ready to read %cr2 and conditionally
316                  * reenable interrupts.  If we hold a spin lock, then
317                  * we must not reenable interrupts.  This might be a
318                  * spurious page fault.
319                  */
320                 eva = rcr2();
321                 if (td->td_md.md_spinlock_count == 0)
322                         enable_intr();
323         }
324
325         if ((ISPL(frame->tf_cs) == SEL_UPL) ||
326             ((frame->tf_eflags & PSL_VM) && 
327                 !(curpcb->pcb_flags & PCB_VM86CALL))) {
328                 /* user trap */
329
330                 td->td_pticks = 0;
331                 td->td_frame = frame;
332                 addr = frame->tf_eip;
333                 if (td->td_ucred != p->p_ucred) 
334                         cred_update_thread(td);
335
336                 switch (type) {
337                 case T_PRIVINFLT:       /* privileged instruction fault */
338                         i = SIGILL;
339                         ucode = ILL_PRVOPC;
340                         break;
341
342                 case T_BPTFLT:          /* bpt instruction fault */
343                 case T_TRCTRAP:         /* trace trap */
344                         enable_intr();
345 #ifdef KDTRACE_HOOKS
346                         if (type == T_BPTFLT) {
347                                 fill_frame_regs(frame, &regs);
348                                 if (dtrace_pid_probe_ptr != NULL &&
349                                     dtrace_pid_probe_ptr(&regs) == 0)
350                                         goto out;
351                         }
352 #endif
353                         frame->tf_eflags &= ~PSL_T;
354                         i = SIGTRAP;
355                         ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
356                         break;
357
358                 case T_ARITHTRAP:       /* arithmetic trap */
359 #ifdef DEV_NPX
360                         ucode = npxtrap_x87();
361                         if (ucode == -1)
362                                 goto userout;
363 #else
364                         ucode = 0;
365 #endif
366                         i = SIGFPE;
367                         break;
368
369                         /*
370                          * The following two traps can happen in
371                          * vm86 mode, and, if so, we want to handle
372                          * them specially.
373                          */
374                 case T_PROTFLT:         /* general protection fault */
375                 case T_STKFLT:          /* stack fault */
376                         if (frame->tf_eflags & PSL_VM) {
377                                 i = vm86_emulate((struct vm86frame *)frame);
378                                 if (i == 0)
379                                         goto user;
380                                 break;
381                         }
382                         i = SIGBUS;
383                         ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
384                         break;
385                 case T_SEGNPFLT:        /* segment not present fault */
386                         i = SIGBUS;
387                         ucode = BUS_ADRERR;
388                         break;
389                 case T_TSSFLT:          /* invalid TSS fault */
390                         i = SIGBUS;
391                         ucode = BUS_OBJERR;
392                         break;
393                 case T_ALIGNFLT:
394                         i = SIGBUS;
395                         ucode = BUS_ADRALN;
396                         break;
397                 case T_DOUBLEFLT:       /* double fault */
398                 default:
399                         i = SIGBUS;
400                         ucode = BUS_OBJERR;
401                         break;
402
403                 case T_PAGEFLT:         /* page fault */
404
405                         i = trap_pfault(frame, TRUE, eva);
406 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
407                         if (i == -2) {
408                                 /*
409                                  * The f00f hack workaround has triggered, so
410                                  * treat the fault as an illegal instruction 
411                                  * (T_PRIVINFLT) instead of a page fault.
412                                  */
413                                 type = frame->tf_trapno = T_PRIVINFLT;
414
415                                 /* Proceed as in that case. */
416                                 ucode = ILL_PRVOPC;
417                                 i = SIGILL;
418                                 break;
419                         }
420 #endif
421                         if (i == -1)
422                                 goto userout;
423                         if (i == 0)
424                                 goto user;
425
426                         if (i == SIGSEGV)
427                                 ucode = SEGV_MAPERR;
428                         else {
429                                 if (prot_fault_translation == 0) {
430                                         /*
431                                          * Autodetect.
432                                          * This check also covers the images
433                                          * without the ABI-tag ELF note.
434                                          */
435                                         if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
436                                             && p->p_osrel >= P_OSREL_SIGSEGV) {
437                                                 i = SIGSEGV;
438                                                 ucode = SEGV_ACCERR;
439                                         } else {
440                                                 i = SIGBUS;
441                                                 ucode = BUS_PAGE_FAULT;
442                                         }
443                                 } else if (prot_fault_translation == 1) {
444                                         /*
445                                          * Always compat mode.
446                                          */
447                                         i = SIGBUS;
448                                         ucode = BUS_PAGE_FAULT;
449                                 } else {
450                                         /*
451                                          * Always SIGSEGV mode.
452                                          */
453                                         i = SIGSEGV;
454                                         ucode = SEGV_ACCERR;
455                                 }
456                         }
457                         addr = eva;
458                         break;
459
460                 case T_DIVIDE:          /* integer divide fault */
461                         ucode = FPE_INTDIV;
462                         i = SIGFPE;
463                         break;
464
465 #ifdef DEV_ISA
466                 case T_NMI:
467 #ifdef POWERFAIL_NMI
468 #ifndef TIMER_FREQ
469 #  define TIMER_FREQ 1193182
470 #endif
471                         if (time_second - lastalert > 10) {
472                                 log(LOG_WARNING, "NMI: power fail\n");
473                                 sysbeep(880, hz);
474                                 lastalert = time_second;
475                         }
476                         goto userout;
477 #else /* !POWERFAIL_NMI */
478                         /* machine/parity/power fail/"kitchen sink" faults */
479                         if (isa_nmi(code) == 0) {
480 #ifdef KDB
481                                 /*
482                                  * NMI can be hooked up to a pushbutton
483                                  * for debugging.
484                                  */
485                                 if (kdb_on_nmi) {
486                                         printf ("NMI ... going to debugger\n");
487                                         kdb_trap(type, 0, frame);
488                                 }
489 #endif /* KDB */
490                                 goto userout;
491                         } else if (panic_on_nmi)
492                                 panic("NMI indicates hardware failure");
493                         break;
494 #endif /* POWERFAIL_NMI */
495 #endif /* DEV_ISA */
496
497                 case T_OFLOW:           /* integer overflow fault */
498                         ucode = FPE_INTOVF;
499                         i = SIGFPE;
500                         break;
501
502                 case T_BOUND:           /* bounds check fault */
503                         ucode = FPE_FLTSUB;
504                         i = SIGFPE;
505                         break;
506
507                 case T_DNA:
508 #ifdef DEV_NPX
509                         KASSERT(PCB_USER_FPU(td->td_pcb),
510                             ("kernel FPU ctx has leaked"));
511                         /* transparent fault (due to context switch "late") */
512                         if (npxdna())
513                                 goto userout;
514 #endif
515                         uprintf("pid %d killed due to lack of floating point\n",
516                                 p->p_pid);
517                         i = SIGKILL;
518                         ucode = 0;
519                         break;
520
521                 case T_FPOPFLT:         /* FPU operand fetch fault */
522                         ucode = ILL_COPROC;
523                         i = SIGILL;
524                         break;
525
526                 case T_XMMFLT:          /* SIMD floating-point exception */
527 #if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
528                         ucode = npxtrap_sse();
529                         if (ucode == -1)
530                                 goto userout;
531 #else
532                         ucode = 0;
533 #endif
534                         i = SIGFPE;
535                         break;
536 #ifdef KDTRACE_HOOKS
537                 case T_DTRACE_RET:
538                         enable_intr();
539                         fill_frame_regs(frame, &regs);
540                         if (dtrace_return_probe_ptr != NULL &&
541                             dtrace_return_probe_ptr(&regs) == 0)
542                                 goto out;
543                         break;
544 #endif
545                 }
546         } else {
547                 /* kernel trap */
548
549                 KASSERT(cold || td->td_ucred != NULL,
550                     ("kernel trap doesn't have ucred"));
551                 switch (type) {
552                 case T_PAGEFLT:                 /* page fault */
553                         (void) trap_pfault(frame, FALSE, eva);
554                         goto out;
555
556                 case T_DNA:
557 #ifdef DEV_NPX
558                         KASSERT(!PCB_USER_FPU(td->td_pcb),
559                             ("Unregistered use of FPU in kernel"));
560                         if (npxdna())
561                                 goto out;
562 #endif
563                         break;
564
565                 case T_ARITHTRAP:       /* arithmetic trap */
566                 case T_XMMFLT:          /* SIMD floating-point exception */
567                 case T_FPOPFLT:         /* FPU operand fetch fault */
568                         /*
569                          * XXXKIB for now disable any FPU traps in kernel
570                          * handler registration seems to be overkill
571                          */
572                         trap_fatal(frame, 0);
573                         goto out;
574
575                         /*
576                          * The following two traps can happen in
577                          * vm86 mode, and, if so, we want to handle
578                          * them specially.
579                          */
580                 case T_PROTFLT:         /* general protection fault */
581                 case T_STKFLT:          /* stack fault */
582                         if (frame->tf_eflags & PSL_VM) {
583                                 i = vm86_emulate((struct vm86frame *)frame);
584                                 if (i != 0)
585                                         /*
586                                          * returns to original process
587                                          */
588                                         vm86_trap((struct vm86frame *)frame);
589                                 goto out;
590                         }
591                         if (type == T_STKFLT)
592                                 break;
593
594                         /* FALL THROUGH */
595
596                 case T_SEGNPFLT:        /* segment not present fault */
597                         if (curpcb->pcb_flags & PCB_VM86CALL)
598                                 break;
599
600                         /*
601                          * Invalid %fs's and %gs's can be created using
602                          * procfs or PT_SETREGS or by invalidating the
603                          * underlying LDT entry.  This causes a fault
604                          * in kernel mode when the kernel attempts to
605                          * switch contexts.  Lose the bad context
606                          * (XXX) so that we can continue, and generate
607                          * a signal.
608                          */
609                         if (frame->tf_eip == (int)cpu_switch_load_gs) {
610                                 curpcb->pcb_gs = 0;
611 #if 0                           
612                                 PROC_LOCK(p);
613                                 kern_psignal(p, SIGBUS);
614                                 PROC_UNLOCK(p);
615 #endif                          
616                                 goto out;
617                         }
618
619                         if (td->td_intr_nesting_level != 0)
620                                 break;
621
622                         /*
623                          * Invalid segment selectors and out of bounds
624                          * %eip's and %esp's can be set up in user mode.
625                          * This causes a fault in kernel mode when the
626                          * kernel tries to return to user mode.  We want
627                          * to get this fault so that we can fix the
628                          * problem here and not have to check all the
629                          * selectors and pointers when the user changes
630                          * them.
631                          */
632                         if (frame->tf_eip == (int)doreti_iret) {
633                                 frame->tf_eip = (int)doreti_iret_fault;
634                                 goto out;
635                         }
636                         if (frame->tf_eip == (int)doreti_popl_ds) {
637                                 frame->tf_eip = (int)doreti_popl_ds_fault;
638                                 goto out;
639                         }
640                         if (frame->tf_eip == (int)doreti_popl_es) {
641                                 frame->tf_eip = (int)doreti_popl_es_fault;
642                                 goto out;
643                         }
644                         if (frame->tf_eip == (int)doreti_popl_fs) {
645                                 frame->tf_eip = (int)doreti_popl_fs_fault;
646                                 goto out;
647                         }
648                         if (curpcb->pcb_onfault != NULL) {
649                                 frame->tf_eip =
650                                     (int)curpcb->pcb_onfault;
651                                 goto out;
652                         }
653                         break;
654
655                 case T_TSSFLT:
656                         /*
657                          * PSL_NT can be set in user mode and isn't cleared
658                          * automatically when the kernel is entered.  This
659                          * causes a TSS fault when the kernel attempts to
660                          * `iret' because the TSS link is uninitialized.  We
661                          * want to get this fault so that we can fix the
662                          * problem here and not every time the kernel is
663                          * entered.
664                          */
665                         if (frame->tf_eflags & PSL_NT) {
666                                 frame->tf_eflags &= ~PSL_NT;
667                                 goto out;
668                         }
669                         break;
670
671                 case T_TRCTRAP:  /* trace trap */
672                         if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
673                                 /*
674                                  * We've just entered system mode via the
675                                  * syscall lcall.  Continue single stepping
676                                  * silently until the syscall handler has
677                                  * saved the flags.
678                                  */
679                                 goto out;
680                         }
681                         if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
682                                 /*
683                                  * The syscall handler has now saved the
684                                  * flags.  Stop single stepping it.
685                                  */
686                                 frame->tf_eflags &= ~PSL_T;
687                                 goto out;
688                         }
689                         /*
690                          * Ignore debug register trace traps due to
691                          * accesses in the user's address space, which
692                          * can happen under several conditions such as
693                          * if a user sets a watchpoint on a buffer and
694                          * then passes that buffer to a system call.
695                          * We still want to get TRCTRAPS for addresses
696                          * in kernel space because that is useful when
697                          * debugging the kernel.
698                          */
699                         if (user_dbreg_trap() && 
700                            !(curpcb->pcb_flags & PCB_VM86CALL)) {
701                                 /*
702                                  * Reset breakpoint bits because the
703                                  * processor doesn't
704                                  */
705                                 load_dr6(rdr6() & 0xfffffff0);
706                                 goto out;
707                         }
708                         /*
709                          * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
710                          */
711                 case T_BPTFLT:
712                         /*
713                          * If KDB is enabled, let it handle the debugger trap.
714                          * Otherwise, debugger traps "can't happen".
715                          */
716 #ifdef KDB
717                         if (kdb_trap(type, 0, frame))
718                                 goto out;
719 #endif
720                         break;
721
722 #ifdef DEV_ISA
723                 case T_NMI:
724 #ifdef POWERFAIL_NMI
725                         if (time_second - lastalert > 10) {
726                                 log(LOG_WARNING, "NMI: power fail\n");
727                                 sysbeep(880, hz);
728                                 lastalert = time_second;
729                         }
730                         goto out;
731 #else /* !POWERFAIL_NMI */
732                         /* machine/parity/power fail/"kitchen sink" faults */
733                         if (isa_nmi(code) == 0) {
734 #ifdef KDB
735                                 /*
736                                  * NMI can be hooked up to a pushbutton
737                                  * for debugging.
738                                  */
739                                 if (kdb_on_nmi) {
740                                         printf ("NMI ... going to debugger\n");
741                                         kdb_trap(type, 0, frame);
742                                 }
743 #endif /* KDB */
744                                 goto out;
745                         } else if (panic_on_nmi == 0)
746                                 goto out;
747                         /* FALLTHROUGH */
748 #endif /* POWERFAIL_NMI */
749 #endif /* DEV_ISA */
750                 }
751
752                 trap_fatal(frame, eva);
753                 goto out;
754         }
755
756         /* Translate fault for emulators (e.g. Linux) */
757         if (*p->p_sysent->sv_transtrap)
758                 i = (*p->p_sysent->sv_transtrap)(i, type);
759
760         ksiginfo_init_trap(&ksi);
761         ksi.ksi_signo = i;
762         ksi.ksi_code = ucode;
763         ksi.ksi_addr = (void *)addr;
764         ksi.ksi_trapno = type;
765         if (uprintf_signal) {
766                 uprintf("pid %d comm %s: signal %d err %x code %d type %d "
767                     "addr 0x%x esp 0x%08x eip 0x%08x "
768                     "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
769                     p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
770                     frame->tf_esp, frame->tf_eip,
771                     fubyte((void *)(frame->tf_eip + 0)),
772                     fubyte((void *)(frame->tf_eip + 1)),
773                     fubyte((void *)(frame->tf_eip + 2)),
774                     fubyte((void *)(frame->tf_eip + 3)),
775                     fubyte((void *)(frame->tf_eip + 4)),
776                     fubyte((void *)(frame->tf_eip + 5)),
777                     fubyte((void *)(frame->tf_eip + 6)),
778                     fubyte((void *)(frame->tf_eip + 7)));
779         }
780         KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
781         trapsignal(td, &ksi);
782
783 #ifdef DEBUG
784         if (type <= MAX_TRAP_MSG) {
785                 uprintf("fatal process exception: %s",
786                         trap_msg[type]);
787                 if ((type == T_PAGEFLT) || (type == T_PROTFLT))
788                         uprintf(", fault VA = 0x%lx", (u_long)eva);
789                 uprintf("\n");
790         }
791 #endif
792
793 user:
794         userret(td, frame);
795         mtx_assert(&Giant, MA_NOTOWNED);
796         KASSERT(PCB_USER_FPU(td->td_pcb),
797             ("Return from trap with kernel FPU ctx leaked"));
798 userout:
799 out:
800         return;
801 }
802
803 static int
804 trap_pfault(frame, usermode, eva)
805         struct trapframe *frame;
806         int usermode;
807         vm_offset_t eva;
808 {
809         vm_offset_t va;
810         struct vmspace *vm = NULL;
811         vm_map_t map;
812         int rv = 0;
813         vm_prot_t ftype;
814         struct thread *td = curthread;
815         struct proc *p = td->td_proc;
816
817         if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
818                 /*
819                  * Due to both processor errata and lazy TLB invalidation when
820                  * access restrictions are removed from virtual pages, memory
821                  * accesses that are allowed by the physical mapping layer may
822                  * nonetheless cause one spurious page fault per virtual page. 
823                  * When the thread is executing a "no faulting" section that
824                  * is bracketed by vm_fault_{disable,enable}_pagefaults(),
825                  * every page fault is treated as a spurious page fault,
826                  * unless it accesses the same virtual address as the most
827                  * recent page fault within the same "no faulting" section.
828                  */
829                 if (td->td_md.md_spurflt_addr != eva ||
830                     (td->td_pflags & TDP_RESETSPUR) != 0) {
831                         /*
832                          * Do nothing to the TLB.  A stale TLB entry is
833                          * flushed automatically by a page fault.
834                          */
835                         td->td_md.md_spurflt_addr = eva;
836                         td->td_pflags &= ~TDP_RESETSPUR;
837                         return (0);
838                 }
839         } else {
840                 /*
841                  * If we get a page fault while in a critical section, then
842                  * it is most likely a fatal kernel page fault.  The kernel
843                  * is already going to panic trying to get a sleep lock to
844                  * do the VM lookup, so just consider it a fatal trap so the
845                  * kernel can print out a useful trap message and even get
846                  * to the debugger.
847                  *
848                  * If we get a page fault while holding a non-sleepable
849                  * lock, then it is most likely a fatal kernel page fault.
850                  * If WITNESS is enabled, then it's going to whine about
851                  * bogus LORs with various VM locks, so just skip to the
852                  * fatal trap handling directly.
853                  */
854                 if (td->td_critnest != 0 ||
855                     WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
856                     "Kernel page fault") != 0) {
857                         trap_fatal(frame, eva);
858                         return (-1);
859                 }
860         }
861         va = trunc_page(eva);
862         if (va >= KERNBASE) {
863                 /*
864                  * Don't allow user-mode faults in kernel address space.
865                  * An exception:  if the faulting address is the invalid
866                  * instruction entry in the IDT, then the Intel Pentium
867                  * F00F bug workaround was triggered, and we need to
868                  * treat it is as an illegal instruction, and not a page
869                  * fault.
870                  */
871 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
872                 if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
873                         return -2;
874 #endif
875                 if (usermode)
876                         goto nogo;
877
878                 map = kernel_map;
879         } else {
880                 /*
881                  * This is a fault on non-kernel virtual memory.
882                  * vm is initialized above to NULL. If curproc is NULL
883                  * or curproc->p_vmspace is NULL the fault is fatal.
884                  */
885                 if (p != NULL)
886                         vm = p->p_vmspace;
887
888                 if (vm == NULL)
889                         goto nogo;
890
891                 map = &vm->vm_map;
892                 if (!usermode && (td->td_intr_nesting_level != 0 ||
893                     curpcb->pcb_onfault == NULL)) {
894                         trap_fatal(frame, eva);
895                         return (-1);
896                 }
897         }
898
899         /*
900          * PGEX_I is defined only if the execute disable bit capability is
901          * supported and enabled.
902          */
903         if (frame->tf_err & PGEX_W)
904                 ftype = VM_PROT_WRITE;
905 #ifdef PAE
906         else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
907                 ftype = VM_PROT_EXECUTE;
908 #endif
909         else
910                 ftype = VM_PROT_READ;
911
912         if (map != kernel_map) {
913                 /*
914                  * Keep swapout from messing with us during this
915                  *      critical time.
916                  */
917                 PROC_LOCK(p);
918                 ++p->p_lock;
919                 PROC_UNLOCK(p);
920
921                 /* Fault in the user page: */
922                 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
923
924                 PROC_LOCK(p);
925                 --p->p_lock;
926                 PROC_UNLOCK(p);
927         } else {
928                 /*
929                  * Don't have to worry about process locking or stacks in the
930                  * kernel.
931                  */
932                 rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
933         }
934         if (rv == KERN_SUCCESS) {
935 #ifdef HWPMC_HOOKS
936                 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
937                         PMC_SOFT_CALL_TF( , , page_fault, all, frame);
938                         if (ftype == VM_PROT_READ)
939                                 PMC_SOFT_CALL_TF( , , page_fault, read,
940                                     frame);
941                         else
942                                 PMC_SOFT_CALL_TF( , , page_fault, write,
943                                     frame);
944                 }
945 #endif
946                 return (0);
947         }
948 nogo:
949         if (!usermode) {
950                 if (td->td_intr_nesting_level == 0 &&
951                     curpcb->pcb_onfault != NULL) {
952                         frame->tf_eip = (int)curpcb->pcb_onfault;
953                         return (0);
954                 }
955                 trap_fatal(frame, eva);
956                 return (-1);
957         }
958
959         return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
960 }
961
962 static void
963 trap_fatal(frame, eva)
964         struct trapframe *frame;
965         vm_offset_t eva;
966 {
967         int code, ss, esp;
968         u_int type;
969         struct soft_segment_descriptor softseg;
970         char *msg;
971
972         code = frame->tf_err;
973         type = frame->tf_trapno;
974         sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
975
976         if (type <= MAX_TRAP_MSG)
977                 msg = trap_msg[type];
978         else
979                 msg = "UNKNOWN";
980         printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
981             frame->tf_eflags & PSL_VM ? "vm86" :
982             ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
983 #ifdef SMP
984         /* two separate prints in case of a trap on an unmapped page */
985         printf("cpuid = %d; ", PCPU_GET(cpuid));
986         printf("apic id = %02x\n", PCPU_GET(apic_id));
987 #endif
988         if (type == T_PAGEFLT) {
989                 printf("fault virtual address   = 0x%x\n", eva);
990                 printf("fault code              = %s %s, %s\n",
991                         code & PGEX_U ? "user" : "supervisor",
992                         code & PGEX_W ? "write" : "read",
993                         code & PGEX_P ? "protection violation" : "page not present");
994         }
995         printf("instruction pointer     = 0x%x:0x%x\n",
996                frame->tf_cs & 0xffff, frame->tf_eip);
997         if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
998                 ss = frame->tf_ss & 0xffff;
999                 esp = frame->tf_esp;
1000         } else {
1001                 ss = GSEL(GDATA_SEL, SEL_KPL);
1002                 esp = (int)&frame->tf_esp;
1003         }
1004         printf("stack pointer           = 0x%x:0x%x\n", ss, esp);
1005         printf("frame pointer           = 0x%x:0x%x\n", ss, frame->tf_ebp);
1006         printf("code segment            = base 0x%x, limit 0x%x, type 0x%x\n",
1007                softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
1008         printf("                        = DPL %d, pres %d, def32 %d, gran %d\n",
1009                softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
1010                softseg.ssd_gran);
1011         printf("processor eflags        = ");
1012         if (frame->tf_eflags & PSL_T)
1013                 printf("trace trap, ");
1014         if (frame->tf_eflags & PSL_I)
1015                 printf("interrupt enabled, ");
1016         if (frame->tf_eflags & PSL_NT)
1017                 printf("nested task, ");
1018         if (frame->tf_eflags & PSL_RF)
1019                 printf("resume, ");
1020         if (frame->tf_eflags & PSL_VM)
1021                 printf("vm86, ");
1022         printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
1023         printf("current process         = %d (%s)\n",
1024             curproc->p_pid, curthread->td_name);
1025
1026 #ifdef KDB
1027         if (debugger_on_panic || kdb_active) {
1028                 frame->tf_err = eva;    /* smuggle fault address to ddb */
1029                 if (kdb_trap(type, 0, frame)) {
1030                         frame->tf_err = code;   /* restore error code */
1031                         return;
1032                 }
1033                 frame->tf_err = code;           /* restore error code */
1034         }
1035 #endif
1036         printf("trap number             = %d\n", type);
1037         if (type <= MAX_TRAP_MSG)
1038                 panic("%s", trap_msg[type]);
1039         else
1040                 panic("unknown/reserved trap");
1041 }
1042
1043 /*
1044  * Double fault handler. Called when a fault occurs while writing
1045  * a frame for a trap/exception onto the stack. This usually occurs
1046  * when the stack overflows (such is the case with infinite recursion,
1047  * for example).
1048  *
1049  * XXX Note that the current PTD gets replaced by IdlePTD when the
1050  * task switch occurs. This means that the stack that was active at
1051  * the time of the double fault is not available at <kstack> unless
1052  * the machine was idle when the double fault occurred. The downside
1053  * of this is that "trace <ebp>" in ddb won't work.
1054  */
1055 void
1056 dblfault_handler()
1057 {
1058 #ifdef KDTRACE_HOOKS
1059         if (dtrace_doubletrap_func != NULL)
1060                 (*dtrace_doubletrap_func)();
1061 #endif
1062         printf("\nFatal double fault:\n");
1063         printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
1064         printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
1065         printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
1066 #ifdef SMP
1067         /* two separate prints in case of a trap on an unmapped page */
1068         printf("cpuid = %d; ", PCPU_GET(cpuid));
1069         printf("apic id = %02x\n", PCPU_GET(apic_id));
1070 #endif
1071         panic("double fault");
1072 }
1073
1074 int
1075 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
1076 {
1077         struct proc *p;
1078         struct trapframe *frame;
1079         caddr_t params;
1080         int error;
1081
1082         p = td->td_proc;
1083         frame = td->td_frame;
1084
1085         params = (caddr_t)frame->tf_esp + sizeof(int);
1086         sa->code = frame->tf_eax;
1087
1088         /*
1089          * Need to check if this is a 32 bit or 64 bit syscall.
1090          */
1091         if (sa->code == SYS_syscall) {
1092                 /*
1093                  * Code is first argument, followed by actual args.
1094                  */
1095                 sa->code = fuword(params);
1096                 params += sizeof(int);
1097         } else if (sa->code == SYS___syscall) {
1098                 /*
1099                  * Like syscall, but code is a quad, so as to maintain
1100                  * quad alignment for the rest of the arguments.
1101                  */
1102                 sa->code = fuword(params);
1103                 params += sizeof(quad_t);
1104         }
1105
1106         if (p->p_sysent->sv_mask)
1107                 sa->code &= p->p_sysent->sv_mask;
1108         if (sa->code >= p->p_sysent->sv_size)
1109                 sa->callp = &p->p_sysent->sv_table[0];
1110         else
1111                 sa->callp = &p->p_sysent->sv_table[sa->code];
1112         sa->narg = sa->callp->sy_narg;
1113
1114         if (params != NULL && sa->narg != 0)
1115                 error = copyin(params, (caddr_t)sa->args,
1116                     (u_int)(sa->narg * sizeof(int)));
1117         else
1118                 error = 0;
1119
1120         if (error == 0) {
1121                 td->td_retval[0] = 0;
1122                 td->td_retval[1] = frame->tf_edx;
1123         }
1124                 
1125         return (error);
1126 }
1127
1128 #include "../../kern/subr_syscall.c"
1129
1130 /*
1131  * syscall - system call request C handler.  A system call is
1132  * essentially treated as a trap by reusing the frame layout.
1133  */
1134 void
1135 syscall(struct trapframe *frame)
1136 {
1137         struct thread *td;
1138         struct syscall_args sa;
1139         register_t orig_tf_eflags;
1140         int error;
1141         ksiginfo_t ksi;
1142
1143 #ifdef DIAGNOSTIC
1144         if (ISPL(frame->tf_cs) != SEL_UPL) {
1145                 panic("syscall");
1146                 /* NOT REACHED */
1147         }
1148 #endif
1149         orig_tf_eflags = frame->tf_eflags;
1150
1151         td = curthread;
1152         td->td_frame = frame;
1153
1154         error = syscallenter(td, &sa);
1155
1156         /*
1157          * Traced syscall.
1158          */
1159         if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1160                 frame->tf_eflags &= ~PSL_T;
1161                 ksiginfo_init_trap(&ksi);
1162                 ksi.ksi_signo = SIGTRAP;
1163                 ksi.ksi_code = TRAP_TRACE;
1164                 ksi.ksi_addr = (void *)frame->tf_eip;
1165                 trapsignal(td, &ksi);
1166         }
1167
1168         KASSERT(PCB_USER_FPU(td->td_pcb),
1169             ("System call %s returning with kernel FPU ctx leaked",
1170              syscallname(td->td_proc, sa.code)));
1171         KASSERT(td->td_pcb->pcb_save == &td->td_pcb->pcb_user_save,
1172             ("System call %s returning with mangled pcb_save",
1173              syscallname(td->td_proc, sa.code)));
1174
1175         syscallret(td, error, &sa);
1176 }