]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/amd64/amd64/trap.c
MFC r310205:
[FreeBSD/FreeBSD.git] / sys / amd64 / amd64 / trap.c
1 /*-
2  * Copyright (C) 1994, David Greenman
3  * Copyright (c) 1990, 1993
4  *      The Regents of the University of California.  All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * the University of Utah, and William Jolitz.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *      This product includes software developed by the University of
20  *      California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  *
37  *      from: @(#)trap.c        7.4 (Berkeley) 5/13/91
38  */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 /*
44  * AMD64 Trap and System call handling
45  */
46
47 #include "opt_clock.h"
48 #include "opt_cpu.h"
49 #include "opt_hwpmc_hooks.h"
50 #include "opt_isa.h"
51 #include "opt_kdb.h"
52 #include "opt_stack.h"
53
54 #include <sys/param.h>
55 #include <sys/bus.h>
56 #include <sys/systm.h>
57 #include <sys/proc.h>
58 #include <sys/pioctl.h>
59 #include <sys/ptrace.h>
60 #include <sys/kdb.h>
61 #include <sys/kernel.h>
62 #include <sys/ktr.h>
63 #include <sys/lock.h>
64 #include <sys/mutex.h>
65 #include <sys/resourcevar.h>
66 #include <sys/signalvar.h>
67 #include <sys/syscall.h>
68 #include <sys/sysctl.h>
69 #include <sys/sysent.h>
70 #include <sys/uio.h>
71 #include <sys/vmmeter.h>
72 #ifdef HWPMC_HOOKS
73 #include <sys/pmckern.h>
74 PMC_SOFT_DEFINE( , , page_fault, all);
75 PMC_SOFT_DEFINE( , , page_fault, read);
76 PMC_SOFT_DEFINE( , , page_fault, write);
77 #endif
78
79 #include <vm/vm.h>
80 #include <vm/vm_param.h>
81 #include <vm/pmap.h>
82 #include <vm/vm_kern.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_page.h>
85 #include <vm/vm_extern.h>
86
87 #include <machine/cpu.h>
88 #include <machine/intr_machdep.h>
89 #include <x86/mca.h>
90 #include <machine/md_var.h>
91 #include <machine/pcb.h>
92 #ifdef SMP
93 #include <machine/smp.h>
94 #endif
95 #include <machine/stack.h>
96 #include <machine/tss.h>
97
98 #ifdef KDTRACE_HOOKS
99 #include <sys/dtrace_bsd.h>
100 #endif
101
/* Handlers with external linkage. */
102 extern void __noinline trap(struct trapframe *frame);
103 extern void trap_check(struct trapframe *frame);
104 extern void syscall(struct trapframe *frame);
105 void dblfault_handler(struct trapframe *frame);
106
/* File-local helpers: page-fault resolution and fatal-trap reporting. */
107 static int trap_pfault(struct trapframe *, int);
108 static void trap_fatal(struct trapframe *, vm_offset_t);
109
/*
 * Human-readable names for trap numbers.  trap_fatal() indexes this table
 * with the frame's tf_trapno when the value is <= MAX_TRAP_MSG and prints
 * "UNKNOWN" otherwise, so MAX_TRAP_MSG must stay equal to the index of the
 * last entry whenever the table is extended.
 */
110 #define MAX_TRAP_MSG            32
111 static char *trap_msg[] = {
112         "",                                     /*  0 unused */
113         "privileged instruction fault",         /*  1 T_PRIVINFLT */
114         "",                                     /*  2 unused */
115         "breakpoint instruction fault",         /*  3 T_BPTFLT */
116         "",                                     /*  4 unused */
117         "",                                     /*  5 unused */
118         "arithmetic trap",                      /*  6 T_ARITHTRAP */
119         "",                                     /*  7 unused */
120         "",                                     /*  8 unused */
121         "general protection fault",             /*  9 T_PROTFLT */
122         "trace trap",                           /* 10 T_TRCTRAP */
123         "",                                     /* 11 unused */
124         "page fault",                           /* 12 T_PAGEFLT */
125         "",                                     /* 13 unused */
126         "alignment fault",                      /* 14 T_ALIGNFLT */
127         "",                                     /* 15 unused */
128         "",                                     /* 16 unused */
129         "",                                     /* 17 unused */
130         "integer divide fault",                 /* 18 T_DIVIDE */
131         "non-maskable interrupt trap",          /* 19 T_NMI */
132         "overflow trap",                        /* 20 T_OFLOW */
133         "FPU bounds check fault",               /* 21 T_BOUND */
134         "FPU device not available",             /* 22 T_DNA */
135         "double fault",                         /* 23 T_DOUBLEFLT */
136         "FPU operand fetch fault",              /* 24 T_FPOPFLT */
137         "invalid TSS fault",                    /* 25 T_TSSFLT */
138         "segment not present fault",            /* 26 T_SEGNPFLT */
139         "stack fault",                          /* 27 T_STKFLT */
140         "machine check trap",                   /* 28 T_MCHK */
141         "SIMD floating-point exception",        /* 29 T_XMMFLT */
142         "reserved (unknown) fault",             /* 30 T_RESERVED */
143         "",                                     /* 31 unused (reserved) */
144         "DTrace pid return trap",               /* 32 T_DTRACE_RET */
145 };
146
/*
 * machdep.prot_fault_translation: selects the signal trap() posts when a
 * user page fault fails for a reason other than an unmapped address.
 *   0    - autodetect: SIGSEGV/SEGV_ACCERR for FreeBSD-ABI processes with
 *          p_osrel >= P_OSREL_SIGSEGV, otherwise SIGBUS/BUS_PAGE_FAULT;
 *   1    - always SIGBUS/BUS_PAGE_FAULT (compat mode);
 *   else - always SIGSEGV/SEGV_ACCERR.
 */
147 static int prot_fault_translation;
148 SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN,
149     &prot_fault_translation, 0,
150     "Select signal to deliver on protection fault");
/*
 * machdep.uprintf_signal: when non-zero, trap() uprintf()s the signal,
 * error code, addresses and the eight bytes at tf_rip before posting a
 * trap signal.
 */
151 static int uprintf_signal;
152 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RWTUN,
153     &uprintf_signal, 0,
154     "Print debugging information on trap signal to ctty");
155
156 /*
157  * Exception, fault, and trap interface to the FreeBSD kernel.
158  * This common code is called from assembly language IDT gate entry
159  * routines that prepare a suitable stack frame, and restore this
160  * frame after the exception has been processed.
     *
     * The NMI IPI handler, KDB re-entry, the PMC and stack NMI hooks and the
     * machine-check handler get the first chance to consume the trap.  A
     * trap taken from user mode (TRAPF_USERMODE) is then translated into a
     * signal number (i) and signal code (ucode) that are posted with
     * trapsignal() before userret() runs.  A trap taken from kernel mode is
     * either recovered in place (tf_rip is redirected to a fault label or
     * to pcb_onfault, or trap_pfault() resolves it) or reported through
     * trap_fatal().
     *
     * A user-mode case that has fully consumed its trap must leave through
     * "out"/"userout" (or "user" when userret() is still wanted).  Falling
     * out of the switch with i == 0 would post signal number 0.
161  */
162
163 void
164 trap(struct trapframe *frame)
165 {
166 #ifdef KDTRACE_HOOKS
167         struct reg regs;
168 #endif
169         struct thread *td = curthread;
170         struct proc *p = td->td_proc;
171 #ifdef KDB
172         register_t dr6;
173 #endif
            /* i: signal to post; ucode: its si_code; addr: reported address. */
174         int i = 0, ucode = 0;
175         u_int type;
176         register_t addr = 0;
177         ksiginfo_t ksi;
178
179         PCPU_INC(cnt.v_trap);
180         type = frame->tf_trapno;
181
182 #ifdef SMP
183         /* Handler for NMI IPIs used for stopping CPUs. */
184         if (type == T_NMI) {
185                  if (ipi_nmi_handler() == 0)
186                            goto out;
187         }
188 #endif /* SMP */
189
190 #ifdef KDB
191         if (kdb_active) {
192                 kdb_reenter();
193                 goto out;
194         }
195 #endif
196
197         if (type == T_RESERVED) {
198                 trap_fatal(frame, 0);
199                 goto out;
200         }
201
202         if (type == T_NMI) {
203 #ifdef HWPMC_HOOKS
204                 /*
205                  * CPU PMCs interrupt using an NMI.  If the PMC module is
206                  * active, pass the 'rip' value to the PMC module's interrupt
207                  * handler.  A non-zero return value from the handler means that
208                  * the NMI was consumed by it and we can return immediately.
209                  */
210                 if (pmc_intr != NULL &&
211                     (*pmc_intr)(PCPU_GET(cpuid), frame) != 0)
212                         goto out;
213 #endif
214
215 #ifdef STACK
216                 if (stack_nmi_handler(frame) != 0)
217                         goto out;
218 #endif
219         }
220
221         if (type == T_MCHK) {
222                 mca_intr();
223                 goto out;
224         }
225
226         if ((frame->tf_rflags & PSL_I) == 0) {
227                 /*
228                  * Buggy application or kernel code has disabled
229                  * interrupts and then trapped.  Enabling interrupts
230                  * now is wrong, but it is better than running with
231                  * interrupts disabled until they are accidentally
232                  * enabled later.
233                  */
234                 if (TRAPF_USERMODE(frame))
235                         uprintf(
236                             "pid %ld (%s): trap %d with interrupts disabled\n",
237                             (long)curproc->p_pid, curthread->td_name, type);
238                 else if (type != T_NMI && type != T_BPTFLT &&
239                     type != T_TRCTRAP) {
240                         /*
241                          * XXX not quite right, since this may be for a
242                          * multiple fault in user mode.
243                          */
244                         printf("kernel trap %d with interrupts disabled\n",
245                             type);
246
247                         /*
248                          * We shouldn't enable interrupts while holding a
249                          * spin lock.
250                          */
251                         if (td->td_md.md_spinlock_count == 0)
252                                 enable_intr();
253                 }
254         }
255
256         if (TRAPF_USERMODE(frame)) {
257                 /* user trap */
258
259                 td->td_pticks = 0;
260                 td->td_frame = frame;
261                 addr = frame->tf_rip;
262                 if (td->td_cowgen != p->p_cowgen)
263                         thread_cow_update(td);
264
265                 switch (type) {
266                 case T_PRIVINFLT:       /* privileged instruction fault */
267                         i = SIGILL;
268                         ucode = ILL_PRVOPC;
269                         break;
270
271                 case T_BPTFLT:          /* bpt instruction fault */
272                 case T_TRCTRAP:         /* trace trap */
273                         enable_intr();
274 #ifdef KDTRACE_HOOKS
275                         if (type == T_BPTFLT) {
276                                 fill_frame_regs(frame, &regs);
277                                 if (dtrace_pid_probe_ptr != NULL &&
278                                     dtrace_pid_probe_ptr(&regs) == 0)
279                                         goto out;
280                         }
281 #endif
282                         frame->tf_rflags &= ~PSL_T;
283                         i = SIGTRAP;
284                         ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
285                         break;
286
287                 case T_ARITHTRAP:       /* arithmetic trap */
288                         ucode = fputrap_x87();
289                         if (ucode == -1)
290                                 goto userout;
291                         i = SIGFPE;
292                         break;
293
294                 case T_PROTFLT:         /* general protection fault */
295                         i = SIGBUS;
296                         ucode = BUS_OBJERR;
297                         break;
298                 case T_STKFLT:          /* stack fault */
299                 case T_SEGNPFLT:        /* segment not present fault */
300                         i = SIGBUS;
301                         ucode = BUS_ADRERR;
302                         break;
303                 case T_TSSFLT:          /* invalid TSS fault */
304                         i = SIGBUS;
305                         ucode = BUS_OBJERR;
306                         break;
307                 case T_ALIGNFLT:
308                         i = SIGBUS;
309                         ucode = BUS_ADRALN;
310                         break;
311                 case T_DOUBLEFLT:       /* double fault */
312                 default:
313                         i = SIGBUS;
314                         ucode = BUS_OBJERR;
315                         break;
316
317                 case T_PAGEFLT:         /* page fault */
318                         /*
319                          * Give the ABI's sv_trap hook, if any, the first
320                          * chance to take care of this trap.
321                          */
322                         if (*p->p_sysent->sv_trap != NULL &&
323                             (*p->p_sysent->sv_trap)(td) == 0)
324                                 goto userout;
325
326                         addr = frame->tf_addr;
327                         i = trap_pfault(frame, TRUE);
                            /* -1: trap_fatal() already ran; 0: fault resolved. */
328                         if (i == -1)
329                                 goto userout;
330                         if (i == 0)
331                                 goto user;
332
333                         if (i == SIGSEGV)
334                                 ucode = SEGV_MAPERR;
335                         else {
336                                 if (prot_fault_translation == 0) {
337                                         /*
338                                          * Autodetect.
339                                          * This check also covers the images
340                                          * without the ABI-tag ELF note.
341                                          */
342                                         if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
343                                             && p->p_osrel >= P_OSREL_SIGSEGV) {
344                                                 i = SIGSEGV;
345                                                 ucode = SEGV_ACCERR;
346                                         } else {
347                                                 i = SIGBUS;
348                                                 ucode = BUS_PAGE_FAULT;
349                                         }
350                                 } else if (prot_fault_translation == 1) {
351                                         /*
352                                          * Always compat mode.
353                                          */
354                                         i = SIGBUS;
355                                         ucode = BUS_PAGE_FAULT;
356                                 } else {
357                                         /*
358                                          * Always SIGSEGV mode.
359                                          */
360                                         i = SIGSEGV;
361                                         ucode = SEGV_ACCERR;
362                                 }
363                         }
364                         break;
365
366                 case T_DIVIDE:          /* integer divide fault */
367                         ucode = FPE_INTDIV;
368                         i = SIGFPE;
369                         break;
370
371 #ifdef DEV_ISA
372                 case T_NMI:
373                         nmi_handle_intr(type, frame);
                            /*
                             * The NMI has been dealt with and no signal was
                             * selected; leave directly instead of falling out
                             * of the switch and posting signal 0.
                             */
374                         goto out;
375 #endif /* DEV_ISA */
376
377                 case T_OFLOW:           /* integer overflow fault */
378                         ucode = FPE_INTOVF;
379                         i = SIGFPE;
380                         break;
381
382                 case T_BOUND:           /* bounds check fault */
383                         ucode = FPE_FLTSUB;
384                         i = SIGFPE;
385                         break;
386
387                 case T_DNA:
388                         /* transparent fault (due to context switch "late") */
389                         KASSERT(PCB_USER_FPU(td->td_pcb),
390                             ("kernel FPU ctx has leaked"));
391                         fpudna();
392                         goto userout;
393
394                 case T_FPOPFLT:         /* FPU operand fetch fault */
395                         ucode = ILL_COPROC;
396                         i = SIGILL;
397                         break;
398
399                 case T_XMMFLT:          /* SIMD floating-point exception */
400                         ucode = fputrap_sse();
401                         if (ucode == -1)
402                                 goto userout;
403                         i = SIGFPE;
404                         break;
405 #ifdef KDTRACE_HOOKS
406                 case T_DTRACE_RET:
407                         enable_intr();
408                         fill_frame_regs(frame, &regs);
                            /*
                             * Whether or not a return probe is registered and
                             * claims the trap, no signal has been selected
                             * here, so never fall out of the switch with
                             * i == 0.
                             */
409                         if (dtrace_return_probe_ptr != NULL)
410                                 (void)dtrace_return_probe_ptr(&regs);
411                         goto out;
412 #endif
413                 }
414         } else {
415                 /* kernel trap */
416
417                 KASSERT(cold || td->td_ucred != NULL,
418                     ("kernel trap doesn't have ucred"));
419                 switch (type) {
420                 case T_PAGEFLT:                 /* page fault */
421                         (void) trap_pfault(frame, FALSE);
422                         goto out;
423
424                 case T_DNA:
425                         if (PCB_USER_FPU(td->td_pcb))
426                                 panic("Unregistered use of FPU in kernel");
427                         fpudna();
428                         goto out;
429
430                 case T_ARITHTRAP:       /* arithmetic trap */
431                 case T_XMMFLT:          /* SIMD floating-point exception */
432                 case T_FPOPFLT:         /* FPU operand fetch fault */
433                         /*
434                          * For now, supporting kernel handler
435                          * registration for FPU traps is overkill.
436                          */
437                         trap_fatal(frame, 0);
438                         goto out;
439
440                 case T_STKFLT:          /* stack fault */
441                 case T_PROTFLT:         /* general protection fault */
442                 case T_SEGNPFLT:        /* segment not present fault */
443                         if (td->td_intr_nesting_level != 0)
444                                 break;
445
446                         /*
447                          * Invalid segment selectors and out of bounds
448                          * %rip's and %rsp's can be set up in user mode.
449                          * This causes a fault in kernel mode when the
450                          * kernel tries to return to user mode.  We want
451                          * to get this fault so that we can fix the
452                          * problem here and not have to check all the
453                          * selectors and pointers when the user changes
454                          * them.
455                          */
456                         if (frame->tf_rip == (long)doreti_iret) {
457                                 frame->tf_rip = (long)doreti_iret_fault;
458                                 goto out;
459                         }
460                         if (frame->tf_rip == (long)ld_ds) {
461                                 frame->tf_rip = (long)ds_load_fault;
462                                 goto out;
463                         }
464                         if (frame->tf_rip == (long)ld_es) {
465                                 frame->tf_rip = (long)es_load_fault;
466                                 goto out;
467                         }
468                         if (frame->tf_rip == (long)ld_fs) {
469                                 frame->tf_rip = (long)fs_load_fault;
470                                 goto out;
471                         }
472                         if (frame->tf_rip == (long)ld_gs) {
473                                 frame->tf_rip = (long)gs_load_fault;
474                                 goto out;
475                         }
476                         if (frame->tf_rip == (long)ld_gsbase) {
477                                 frame->tf_rip = (long)gsbase_load_fault;
478                                 goto out;
479                         }
480                         if (frame->tf_rip == (long)ld_fsbase) {
481                                 frame->tf_rip = (long)fsbase_load_fault;
482                                 goto out;
483                         }
484                         if (curpcb->pcb_onfault != NULL) {
485                                 frame->tf_rip = (long)curpcb->pcb_onfault;
486                                 goto out;
487                         }
488                         break;
489
490                 case T_TSSFLT:
491                         /*
492                          * PSL_NT can be set in user mode and isn't cleared
493                          * automatically when the kernel is entered.  This
494                          * causes a TSS fault when the kernel attempts to
495                          * `iret' because the TSS link is uninitialized.  We
496                          * want to get this fault so that we can fix the
497                          * problem here and not every time the kernel is
498                          * entered.
499                          */
500                         if (frame->tf_rflags & PSL_NT) {
501                                 frame->tf_rflags &= ~PSL_NT;
502                                 goto out;
503                         }
504                         break;
505
506                 case T_TRCTRAP:  /* trace trap */
507                         /*
508                          * Ignore debug register trace traps due to
509                          * accesses in the user's address space, which
510                          * can happen under several conditions such as
511                          * if a user sets a watchpoint on a buffer and
512                          * then passes that buffer to a system call.
513                          * We still want to get TRCTRAPS for addresses
514                          * in kernel space because that is useful when
515                          * debugging the kernel.
516                          */
517                         if (user_dbreg_trap()) {
518                                 /*
519                                  * Reset breakpoint bits because the
520                                  * processor doesn't
521                                  */
522                                 load_dr6(rdr6() & ~0xf);
523                                 goto out;
524                         }
525                         /*
526                          * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
527                          */
528                 case T_BPTFLT:
529                         /*
530                          * If KDB is enabled, let it handle the debugger trap.
531                          * Otherwise, debugger traps "can't happen".
532                          */
533 #ifdef KDB
534                         /* XXX %dr6 is not quite reentrant. */
535                         dr6 = rdr6();
536                         load_dr6(dr6 & ~0x4000);
537                         if (kdb_trap(type, dr6, frame))
538                                 goto out;
539 #endif
540                         break;
541
542 #ifdef DEV_ISA
543                 case T_NMI:
544                         nmi_handle_intr(type, frame);
545                         goto out;
546 #endif /* DEV_ISA */
547                 }
548
                    /* Every kernel-mode case that did not recover ends up here. */
549                 trap_fatal(frame, 0);
550                 goto out;
551         }
552
553         /* Translate fault for emulators (e.g. Linux) */
554         if (*p->p_sysent->sv_transtrap)
555                 i = (*p->p_sysent->sv_transtrap)(i, type);
556
557         ksiginfo_init_trap(&ksi);
558         ksi.ksi_signo = i;
559         ksi.ksi_code = ucode;
560         ksi.ksi_trapno = type;
561         ksi.ksi_addr = (void *)addr;
562         if (uprintf_signal) {
563                 uprintf("pid %d comm %s: signal %d err %lx code %d type %d "
564                     "addr 0x%lx rsp 0x%lx rip 0x%lx "
565                     "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
566                     p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr,
567                     frame->tf_rsp, frame->tf_rip,
568                     fubyte((void *)(frame->tf_rip + 0)),
569                     fubyte((void *)(frame->tf_rip + 1)),
570                     fubyte((void *)(frame->tf_rip + 2)),
571                     fubyte((void *)(frame->tf_rip + 3)),
572                     fubyte((void *)(frame->tf_rip + 4)),
573                     fubyte((void *)(frame->tf_rip + 5)),
574                     fubyte((void *)(frame->tf_rip + 6)),
575                     fubyte((void *)(frame->tf_rip + 7)));
576         }
577         KASSERT((read_rflags() & PSL_I) != 0, ("interrupts disabled"));
578         trapsignal(td, &ksi);
579
    /* user: no signal to post, but userret() processing is still wanted. */
580 user:
581         userret(td, frame);
582         KASSERT(PCB_USER_FPU(td->td_pcb),
583             ("Return from trap with kernel FPU ctx leaked"));
    /* userout/out: return to the assembly stub without calling userret(). */
584 userout:
585 out:
586         return;
587 }
588
589 /*
590  * Front end to trap() that filters out DTrace-induced faults.  Because
591  * this function cannot be instrumented, it cannot generate such faults
592  * itself.
     */
593 void
594 trap_check(struct trapframe *frame)
595 {
596 #ifdef KDTRACE_HOOKS
597         if (dtrace_trap_func != NULL) {
                    /* A non-zero result means trap() must not see this fault. */
598                 if ((*dtrace_trap_func)(frame, frame->tf_trapno) != 0)
599                         return;
600         }
601 #endif
602         trap(frame);
603 }
604
605 static int
606 trap_pfault(frame, usermode)
607         struct trapframe *frame;
608         int usermode;
609 {
610         vm_offset_t va;
611         vm_map_t map;
612         int rv = 0;
613         vm_prot_t ftype;
614         struct thread *td = curthread;
615         struct proc *p = td->td_proc;
616         vm_offset_t eva = frame->tf_addr;
617
618         if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
619                 /*
620                  * Due to both processor errata and lazy TLB invalidation when
621                  * access restrictions are removed from virtual pages, memory
622                  * accesses that are allowed by the physical mapping layer may
623                  * nonetheless cause one spurious page fault per virtual page. 
624                  * When the thread is executing a "no faulting" section that
625                  * is bracketed by vm_fault_{disable,enable}_pagefaults(),
626                  * every page fault is treated as a spurious page fault,
627                  * unless it accesses the same virtual address as the most
628                  * recent page fault within the same "no faulting" section.
629                  */
630                 if (td->td_md.md_spurflt_addr != eva ||
631                     (td->td_pflags & TDP_RESETSPUR) != 0) {
632                         /*
633                          * Do nothing to the TLB.  A stale TLB entry is
634                          * flushed automatically by a page fault.
635                          */
636                         td->td_md.md_spurflt_addr = eva;
637                         td->td_pflags &= ~TDP_RESETSPUR;
638                         return (0);
639                 }
640         } else {
641                 /*
642                  * If we get a page fault while in a critical section, then
643                  * it is most likely a fatal kernel page fault.  The kernel
644                  * is already going to panic trying to get a sleep lock to
645                  * do the VM lookup, so just consider it a fatal trap so the
646                  * kernel can print out a useful trap message and even get
647                  * to the debugger.
648                  *
649                  * If we get a page fault while holding a non-sleepable
650                  * lock, then it is most likely a fatal kernel page fault.
651                  * If WITNESS is enabled, then it's going to whine about
652                  * bogus LORs with various VM locks, so just skip to the
653                  * fatal trap handling directly.
654                  */
655                 if (td->td_critnest != 0 ||
656                     WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
657                     "Kernel page fault") != 0) {
658                         trap_fatal(frame, eva);
659                         return (-1);
660                 }
661         }
662         va = trunc_page(eva);
663         if (va >= VM_MIN_KERNEL_ADDRESS) {
664                 /*
665                  * Don't allow user-mode faults in kernel address space.
666                  */
667                 if (usermode)
668                         goto nogo;
669
670                 map = kernel_map;
671         } else {
672                 map = &p->p_vmspace->vm_map;
673
674                 /*
675                  * When accessing a usermode address, kernel must be
676                  * ready to accept the page fault, and provide a
677                  * handling routine.  Since accessing the address
678                  * without the handler is a bug, do not try to handle
679                  * it normally, and panic immediately.
680                  */
681                 if (!usermode && (td->td_intr_nesting_level != 0 ||
682                     curpcb->pcb_onfault == NULL)) {
683                         trap_fatal(frame, eva);
684                         return (-1);
685                 }
686         }
687
688         /*
689          * If the trap was caused by errant bits in the PTE then panic.
690          */
691         if (frame->tf_err & PGEX_RSV) {
692                 trap_fatal(frame, eva);
693                 return (-1);
694         }
695
696         /*
697          * PGEX_I is defined only if the execute disable bit capability is
698          * supported and enabled.
699          */
700         if (frame->tf_err & PGEX_W)
701                 ftype = VM_PROT_WRITE;
702         else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
703                 ftype = VM_PROT_EXECUTE;
704         else
705                 ftype = VM_PROT_READ;
706
707         /* Fault in the page. */
708         rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
709         if (rv == KERN_SUCCESS) {
710 #ifdef HWPMC_HOOKS
711                 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
712                         PMC_SOFT_CALL_TF( , , page_fault, all, frame);
713                         if (ftype == VM_PROT_READ)
714                                 PMC_SOFT_CALL_TF( , , page_fault, read,
715                                     frame);
716                         else
717                                 PMC_SOFT_CALL_TF( , , page_fault, write,
718                                     frame);
719                 }
720 #endif
721                 return (0);
722         }
723 nogo:
724         if (!usermode) {
725                 if (td->td_intr_nesting_level == 0 &&
726                     curpcb->pcb_onfault != NULL) {
727                         frame->tf_rip = (long)curpcb->pcb_onfault;
728                         return (0);
729                 }
730                 trap_fatal(frame, eva);
731                 return (-1);
732         }
733         return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
734 }
735
736 static void
737 trap_fatal(frame, eva)
738         struct trapframe *frame;
739         vm_offset_t eva;
740 {
741         int code, ss;
742         u_int type;
743         struct soft_segment_descriptor softseg;
744         char *msg;
745
746         code = frame->tf_err;
747         type = frame->tf_trapno;
748         sdtossd(&gdt[NGDT * PCPU_GET(cpuid) + IDXSEL(frame->tf_cs & 0xffff)],
749             &softseg);
750
751         if (type <= MAX_TRAP_MSG)
752                 msg = trap_msg[type];
753         else
754                 msg = "UNKNOWN";
755         printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
756             TRAPF_USERMODE(frame) ? "user" : "kernel");
757 #ifdef SMP
758         /* two separate prints in case of a trap on an unmapped page */
759         printf("cpuid = %d; ", PCPU_GET(cpuid));
760         printf("apic id = %02x\n", PCPU_GET(apic_id));
761 #endif
762         if (type == T_PAGEFLT) {
763                 printf("fault virtual address   = 0x%lx\n", eva);
764                 printf("fault code              = %s %s %s%s, %s\n",
765                         code & PGEX_U ? "user" : "supervisor",
766                         code & PGEX_W ? "write" : "read",
767                         code & PGEX_I ? "instruction" : "data",
768                         code & PGEX_RSV ? " rsv" : "",
769                         code & PGEX_P ? "protection violation" : "page not present");
770         }
771         printf("instruction pointer     = 0x%lx:0x%lx\n",
772                frame->tf_cs & 0xffff, frame->tf_rip);
773         ss = frame->tf_ss & 0xffff;
774         printf("stack pointer           = 0x%x:0x%lx\n", ss, frame->tf_rsp);
775         printf("frame pointer           = 0x%x:0x%lx\n", ss, frame->tf_rbp);
776         printf("code segment            = base 0x%lx, limit 0x%lx, type 0x%x\n",
777                softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
778         printf("                        = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
779                softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
780                softseg.ssd_gran);
781         printf("processor eflags        = ");
782         if (frame->tf_rflags & PSL_T)
783                 printf("trace trap, ");
784         if (frame->tf_rflags & PSL_I)
785                 printf("interrupt enabled, ");
786         if (frame->tf_rflags & PSL_NT)
787                 printf("nested task, ");
788         if (frame->tf_rflags & PSL_RF)
789                 printf("resume, ");
790         printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
791         printf("current process         = %d (%s)\n",
792             curproc->p_pid, curthread->td_name);
793
794 #ifdef KDB
795         if (debugger_on_panic || kdb_active)
796                 if (kdb_trap(type, 0, frame))
797                         return;
798 #endif
799         printf("trap number             = %d\n", type);
800         if (type <= MAX_TRAP_MSG)
801                 panic("%s", trap_msg[type]);
802         else
803                 panic("unknown/reserved trap");
804 }
805
806 /*
807  * Double fault handler. Called when a fault occurs while writing
808  * a frame for a trap/exception onto the stack. This usually occurs
809  * when the stack overflows (such is the case with infinite recursion,
810  * for example).
811  */
812 void
813 dblfault_handler(struct trapframe *frame)
814 {
815 #ifdef KDTRACE_HOOKS
816         if (dtrace_doubletrap_func != NULL)
817                 (*dtrace_doubletrap_func)();
818 #endif
819         printf("\nFatal double fault\n");
820         printf("rip = 0x%lx\n", frame->tf_rip);
821         printf("rsp = 0x%lx\n", frame->tf_rsp);
822         printf("rbp = 0x%lx\n", frame->tf_rbp);
823 #ifdef SMP
824         /* two separate prints in case of a trap on an unmapped page */
825         printf("cpuid = %d; ", PCPU_GET(cpuid));
826         printf("apic id = %02x\n", PCPU_GET(apic_id));
827 #endif
828         panic("double fault");
829 }
830
831 int
832 cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
833 {
834         struct proc *p;
835         struct trapframe *frame;
836         register_t *argp;
837         caddr_t params;
838         int reg, regcnt, error;
839
840         p = td->td_proc;
841         frame = td->td_frame;
842         reg = 0;
843         regcnt = 6;
844
845         params = (caddr_t)frame->tf_rsp + sizeof(register_t);
846         sa->code = frame->tf_rax;
847
848         if (sa->code == SYS_syscall || sa->code == SYS___syscall) {
849                 sa->code = frame->tf_rdi;
850                 reg++;
851                 regcnt--;
852         }
853         if (p->p_sysent->sv_mask)
854                 sa->code &= p->p_sysent->sv_mask;
855
856         if (sa->code >= p->p_sysent->sv_size)
857                 sa->callp = &p->p_sysent->sv_table[0];
858         else
859                 sa->callp = &p->p_sysent->sv_table[sa->code];
860
861         sa->narg = sa->callp->sy_narg;
862         KASSERT(sa->narg <= sizeof(sa->args) / sizeof(sa->args[0]),
863             ("Too many syscall arguments!"));
864         error = 0;
865         argp = &frame->tf_rdi;
866         argp += reg;
867         bcopy(argp, sa->args, sizeof(sa->args[0]) * regcnt);
868         if (sa->narg > regcnt) {
869                 KASSERT(params != NULL, ("copyin args with no params!"));
870                 error = copyin(params, &sa->args[regcnt],
871                     (sa->narg - regcnt) * sizeof(sa->args[0]));
872         }
873
874         if (error == 0) {
875                 td->td_retval[0] = 0;
876                 td->td_retval[1] = frame->tf_rdx;
877         }
878
879         return (error);
880 }
881
882 #include "../../kern/subr_syscall.c"
883
884 /*
885  * System call handler for native binaries.  The trap frame is already
886  * set up by the assembler trampoline and a pointer to it is saved in
887  * td_frame.
888  */
889 void
890 amd64_syscall(struct thread *td, int traced)
891 {
892         struct syscall_args sa;
893         int error;
894         ksiginfo_t ksi;
895
896 #ifdef DIAGNOSTIC
897         if (!TRAPF_USERMODE(td->td_frame)) {
898                 panic("syscall");
899                 /* NOT REACHED */
900         }
901 #endif
902         error = syscallenter(td, &sa);
903
904         /*
905          * Traced syscall.
906          */
907         if (__predict_false(traced)) {
908                 td->td_frame->tf_rflags &= ~PSL_T;
909                 ksiginfo_init_trap(&ksi);
910                 ksi.ksi_signo = SIGTRAP;
911                 ksi.ksi_code = TRAP_TRACE;
912                 ksi.ksi_addr = (void *)td->td_frame->tf_rip;
913                 trapsignal(td, &ksi);
914         }
915
916         KASSERT(PCB_USER_FPU(td->td_pcb),
917             ("System call %s returning with kernel FPU ctx leaked",
918              syscallname(td->td_proc, sa.code)));
919         KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
920             ("System call %s returning with mangled pcb_save",
921              syscallname(td->td_proc, sa.code)));
922         KASSERT(td->td_md.md_invl_gen.gen == 0,
923             ("System call %s returning with leaked invl_gen %lu",
924             syscallname(td->td_proc, sa.code), td->td_md.md_invl_gen.gen));
925
926         syscallret(td, error, &sa);
927
928         /*
929          * If the user-supplied value of %rip is not a canonical
930          * address, then some CPUs will trigger a ring 0 #GP during
931          * the sysret instruction.  However, the fault handler would
932          * execute in ring 0 with the user's %gs and %rsp which would
933          * not be safe.  Instead, use the full return path which
934          * catches the problem safely.
935          */
936         if (td->td_frame->tf_rip >= VM_MAXUSER_ADDRESS)
937                 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
938 }