2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
41 #define __ELF_WORD_SIZE 32
43 #include <sys/param.h>
44 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
86 MODULE_VERSION(linux, 1);
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
90 #define AUXARGS_ENTRY_32(pos, id, val) \
92 suword32(pos++, id); \
93 suword32(pos++, val); \
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC 0x2123 /* #! */
99 #define SHELLMAGIC 0x2321
103 * Allow the sendsig functions to use the ldebug() facility
104 * even though they are not syscalls themselves. Map them
105 * to syscall 0. This is slightly less bogus than using
108 #define LINUX_SYS_linux_rt_sendsig 0
109 #define LINUX_SYS_linux_sendsig 0
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
121 static int elf_linux_fixup(register_t **stack_base,
122 struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
126 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
127 static void exec_linux_setregs(struct thread *td, u_long entry,
128 u_long stack, u_long ps_strings);
129 static void linux32_fixlimit(struct rlimit *rl, int which);
131 static eventhandler_tag linux_exit_tag;
132 static eventhandler_tag linux_schedtail_tag;
133 static eventhandler_tag linux_exec_tag;
136 * Linux syscalls return negative errno's, we do positive and map them
138 * FreeBSD: src/sys/sys/errno.h
139 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
140 * linux-2.6.17.8/include/asm-generic/errno.h
/*
 * Errno translation table, indexed by the FreeBSD errno value (the array
 * has ELAST + 1 slots).  Each entry holds the matching Linux errno already
 * negated, so for example entry 11 is -35 and entry 35 is -11.
 */
142 static int bsd_to_linux_errno[ELAST + 1] = {
143 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
144 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
145 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
146 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
147 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
148 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
149 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
150 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
151 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
/*
 * FreeBSD to Linux signal number table with LINUX_SIGTBLSZ slots.  The
 * signal delivery code in this file reads it through the sv_sigtbl field
 * of the sysentvec, indexing with _SIG_IDX(sig).
 * NOTE(review): the 0 entries presumably mark FreeBSD signals that have no
 * Linux counterpart - confirm against the signal headers.
 */
155 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
156 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
157 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
158 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
159 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
160 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
161 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
162 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
163 0, LINUX_SIGUSR1, LINUX_SIGUSR2
/*
 * Linux to FreeBSD signal number table with LINUX_SIGTBLSZ slots; the
 * reverse direction of bsd_to_linux_signal.  The mapping is not one to one:
 * SIGBUS and SIGURG each appear in two slots.
 */
166 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
167 SIGHUP, SIGINT, SIGQUIT, SIGILL,
168 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
169 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
170 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
171 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
172 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
173 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
174 SIGIO, SIGURG, SIGSYS
177 #define LINUX_T_UNKNOWN 255
/*
 * Trap code translation table.  The array index is the FreeBSD trap type
 * (named in the per-entry comments) and the value is the number stored in
 * the sc_trapno field of the Linux signal context by the sendsig routines.
 * Slots holding LINUX_T_UNKNOWN have no specific translation.
 */
178 static int _bsd_to_linux_trapcode[] = {
179 LINUX_T_UNKNOWN, /* 0 */
180 6, /* 1 T_PRIVINFLT */
181 LINUX_T_UNKNOWN, /* 2 */
183 LINUX_T_UNKNOWN, /* 4 */
184 LINUX_T_UNKNOWN, /* 5 */
185 16, /* 6 T_ARITHTRAP */
186 254, /* 7 T_ASTFLT */
187 LINUX_T_UNKNOWN, /* 8 */
188 13, /* 9 T_PROTFLT */
189 1, /* 10 T_TRCTRAP */
190 LINUX_T_UNKNOWN, /* 11 */
191 14, /* 12 T_PAGEFLT */
192 LINUX_T_UNKNOWN, /* 13 */
193 17, /* 14 T_ALIGNFLT */
194 LINUX_T_UNKNOWN, /* 15 */
195 LINUX_T_UNKNOWN, /* 16 */
196 LINUX_T_UNKNOWN, /* 17 */
202 8, /* 23 T_DOUBLEFLT */
203 9, /* 24 T_FPOPFLT */
204 10, /* 25 T_TSSFLT */
205 11, /* 26 T_SEGNPFLT */
206 12, /* 27 T_STKFLT */
208 19, /* 29 T_XMMFLT */
209 15 /* 30 T_RESERVED */
211 #define bsd_to_linux_trapcode(code) \
212 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
213 _bsd_to_linux_trapcode[(code)]: \
216 struct linux32_ps_strings {
217 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
218 u_int ps_nargvstr; /* the number of argument strings */
219 u_int32_t ps_envstr; /* first of 0 or more environment strings */
220 u_int ps_nenvstr; /* the number of environment strings */
224 * If FreeBSD & Linux have a difference of opinion about what a trap
225 * means, deal with it here.
230 translate_traps(int signal, int trap_code)
232 if (signal != SIGBUS)
/*
 * elf_linux_fixup: write the ELF auxiliary vector for a 32-bit Linux image
 * into user memory and store argc.
 *
 *   stack_base - in/out; on entry the base of the argv/envp pointer area,
 *                rewritten with the final base pointer before returning
 *   imgp       - image parameters; imgp->auxargs supplies the ELF values
 *                and is freed (and set to NULL) once they are written
 *
 * All user memory writes go through suword32(), either directly or via
 * AUXARGS_ENTRY_32, which stores an (id, value) pair and advances pos.
 */
246 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
250 Elf32_Addr *pos, *uplatform;
251 struct linux32_ps_strings *arginfo;
253 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
254 uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
257 KASSERT(curthread->td_proc == imgp->proc,
258 ("unsafe elf_linux_fixup(), should be curproc"));
259 base = (Elf32_Addr *)*stack_base;
260 args = (Elf32_Auxargs *)imgp->auxargs;
261 pos = base + (imgp->args->argc + imgp->args->envc + 2);
/*
 * pos now sits past the argc + envc pointer slots and the two extra slots
 * counted by the + 2; the aux entries are written from there.
 */
263 AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
266 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
267 * as it has appeared in the 2.4.0-rc7 first time.
268 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
269 * glibc falls back to the hard-coded CLK_TCK value when aux entry
271 * Also see linux_times() implementation.
273 if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
274 AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
275 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
276 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
277 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
278 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
279 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
280 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
281 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
282 AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
/*
 * NOTE(review): AT_EUID and AT_EGID are filled from the saved ids
 * (cr_svuid, cr_svgid), not from cr_uid and cr_gid - confirm this is the
 * intended source for the effective ids.
 */
283 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
284 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
285 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
286 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
287 AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
/* AT_EXECFD is emitted only when an exec descriptor was recorded. */
288 if (args->execfd != -1)
289 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
290 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
/* The aux values have been written out; release the kernel copy. */
292 free(imgp->auxargs, M_TEMP);
293 imgp->auxargs = NULL;
296 suword32(base, (uint32_t)imgp->args->argc);
297 *stack_base = (register_t *)base;
301 extern unsigned long linux_sznonrtsigcode;
/*
 * linux_rt_sendsig: deliver a signal through a Linux rt signal frame.
 *
 * Builds a struct l_rt_sigframe in a local copy, copies it out to the user
 * stack (or to the top of the alternate signal stack when TDP_ALTSTACK is
 * set, the thread is not already on that stack and the signal is a member
 * of ps_sigonstack), then points the trapframe at the signal trampoline.
 *
 *   catcher - handler address, stored in the frame as sf_handler
 *   ksi     - signal description; ksi_signo, ksi_code and ksi_addr are read
 *   mask    - signal mask, converted with bsd_to_linux_sigset() into the
 *             uc_sigmask field of the frame
 *
 * The proc lock and psp->ps_mtx are asserted held on entry.  ps_mtx is
 * released before the frame is built and taken again at the end of the
 * visible body.
 */
304 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
306 struct thread *td = curthread;
307 struct proc *p = td->td_proc;
309 struct trapframe *regs;
310 struct l_rt_sigframe *fp, frame;
315 sig = ksi->ksi_signo;
316 code = ksi->ksi_code;
317 PROC_LOCK_ASSERT(p, MA_OWNED);
319 mtx_assert(&psp->ps_mtx, MA_OWNED);
321 oonstack = sigonstack(regs->tf_rsp);
324 if (ldebug(rt_sendsig))
325 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
326 catcher, sig, (void*)mask, code);
329 * Allocate space for the signal handler context.
331 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
332 SIGISMEMBER(psp->ps_sigonstack, sig)) {
333 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
334 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
336 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
337 mtx_unlock(&psp->ps_mtx);
340 * Build the argument list for the signal handler.
/*
 * Translate the FreeBSD signal number through the sv_sigtbl of the ABI
 * when the table exists and the number is within sv_sigsize.
 */
342 if (p->p_sysent->sv_sigtbl)
343 if (sig <= p->p_sysent->sv_sigsize)
344 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
346 bzero(&frame, sizeof(frame));
348 frame.sf_handler = PTROUT(catcher);
/* The pointers stored in the frame refer to its user space copy at fp. */
350 frame.sf_siginfo = PTROUT(&fp->sf_si);
351 frame.sf_ucontext = PTROUT(&fp->sf_sc);
353 /* Fill in POSIX parts */
354 ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
357 * Build the signal context to be used by sigreturn.
359 frame.sf_sc.uc_flags = 0; /* XXX ??? */
360 frame.sf_sc.uc_link = 0; /* XXX ??? */
/*
 * Record the alternate stack settings.  ss_flags is LINUX_SS_DISABLE when
 * TDP_ALTSTACK is clear, otherwise LINUX_SS_ONSTACK only if the thread was
 * already running on that stack.
 */
362 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
363 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
364 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
365 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
368 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
/* Save the interrupted register state from the trapframe. */
370 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
371 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
372 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
373 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
374 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
375 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
376 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
377 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
378 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
379 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
380 frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
381 frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
382 frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
383 frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
384 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
385 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
386 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
387 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
/* sc_cr2 receives ksi_addr narrowed to 32 bits. */
388 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
389 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
392 if (ldebug(rt_sendsig))
393 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
394 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
395 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
398 if (copyout(&frame, fp, sizeof(frame)) != 0) {
400 * Process has trashed its stack; give it an illegal
401 * instruction to halt it in its tracks.
404 if (ldebug(rt_sendsig))
405 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
413 * Build context to run handler in.
415 regs->tf_rsp = PTROUT(fp);
/*
 * Resume inside the signal trampoline that ends at LINUX32_PS_STRINGS,
 * linux_sznonrtsigcode bytes past its start.
 */
416 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
417 linux_sznonrtsigcode;
418 regs->tf_rflags &= ~(PSL_T | PSL_D);
419 regs->tf_cs = _ucode32sel;
420 regs->tf_ss = _udatasel;
421 regs->tf_ds = _udatasel;
422 regs->tf_es = _udatasel;
423 regs->tf_fs = _ufssel;
424 regs->tf_gs = _ugssel;
425 regs->tf_flags = TF_HASSEGS;
426 td->td_pcb->pcb_full_iret = 1;
/* Re-take ps_mtx, which was dropped before the frame was built. */
428 mtx_lock(&psp->ps_mtx);
433 * Send an interrupt to process.
435 * Stack is set up to allow sigcode stored
436 * in u. to call routine, followed by kcall
437 * to sigreturn routine below. After sigreturn
438 * resets the signal mask, the stack, and the
439 * frame pointer, it returns to the user
/*
 * linux_sendsig: deliver a signal through a non-rt Linux signal frame
 * (struct l_sigframe).
 *
 *   catcher - handler address, stored in the frame as sf_handler
 *   ksi     - signal description; ksi_signo, ksi_code and ksi_addr are read
 *   mask    - signal mask; word 0 of its Linux form goes to sc_mask and the
 *             remaining words to sf_extramask
 *
 * Signals that are members of ps_siginfo are handed to linux_rt_sendsig()
 * instead.  The proc lock and psp->ps_mtx are asserted held on entry;
 * ps_mtx is released before the frame is built and taken again at the end
 * of the visible body.
 */
443 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
445 struct thread *td = curthread;
446 struct proc *p = td->td_proc;
448 struct trapframe *regs;
449 struct l_sigframe *fp, frame;
454 sig = ksi->ksi_signo;
455 code = ksi->ksi_code;
456 PROC_LOCK_ASSERT(p, MA_OWNED);
458 mtx_assert(&psp->ps_mtx, MA_OWNED);
459 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
460 /* Signal handler installed with SA_SIGINFO. */
461 linux_rt_sendsig(catcher, ksi, mask);
466 oonstack = sigonstack(regs->tf_rsp);
470 printf(ARGS(sendsig, "%p, %d, %p, %u"),
471 catcher, sig, (void*)mask, code);
475 * Allocate space for the signal handler context.
/*
 * The frame goes at the top of the alternate signal stack when
 * TDP_ALTSTACK is set, the thread is not already on that stack and the
 * signal is in ps_sigonstack; otherwise directly below the current %rsp.
 */
477 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
478 SIGISMEMBER(psp->ps_sigonstack, sig)) {
479 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
480 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
482 fp = (struct l_sigframe *)regs->tf_rsp - 1;
483 mtx_unlock(&psp->ps_mtx);
487 * Build the argument list for the signal handler.
/*
 * Translate the FreeBSD signal number through the sv_sigtbl of the ABI
 * when the table exists and the number is within sv_sigsize.
 */
489 if (p->p_sysent->sv_sigtbl)
490 if (sig <= p->p_sysent->sv_sigsize)
491 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
493 bzero(&frame, sizeof(frame));
495 frame.sf_handler = PTROUT(catcher);
498 bsd_to_linux_sigset(mask, &lmask);
501 * Build the signal context to be used by sigreturn.
503 frame.sf_sc.sc_mask = lmask.__bits[0];
504 frame.sf_sc.sc_gs = regs->tf_gs;
505 frame.sf_sc.sc_fs = regs->tf_fs;
506 frame.sf_sc.sc_es = regs->tf_es;
507 frame.sf_sc.sc_ds = regs->tf_ds;
508 frame.sf_sc.sc_edi = regs->tf_rdi;
509 frame.sf_sc.sc_esi = regs->tf_rsi;
510 frame.sf_sc.sc_ebp = regs->tf_rbp;
511 frame.sf_sc.sc_ebx = regs->tf_rbx;
512 frame.sf_sc.sc_edx = regs->tf_rdx;
513 frame.sf_sc.sc_ecx = regs->tf_rcx;
514 frame.sf_sc.sc_eax = regs->tf_rax;
515 frame.sf_sc.sc_eip = regs->tf_rip;
516 frame.sf_sc.sc_cs = regs->tf_cs;
517 frame.sf_sc.sc_eflags = regs->tf_rflags;
518 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
519 frame.sf_sc.sc_ss = regs->tf_ss;
520 frame.sf_sc.sc_err = regs->tf_err;
/* sc_cr2 receives ksi_addr narrowed to 32 bits. */
521 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
522 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
/* Mask words beyond the first do not fit in sc_mask; keep them here. */
524 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
525 frame.sf_extramask[i] = lmask.__bits[i+1];
527 if (copyout(&frame, fp, sizeof(frame)) != 0) {
529 * Process has trashed its stack; give it an illegal
530 * instruction to halt it in its tracks.
537 * Build context to run handler in.
539 regs->tf_rsp = PTROUT(fp);
/* Resume at the start of the trampoline below LINUX32_PS_STRINGS. */
540 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
541 regs->tf_rflags &= ~(PSL_T | PSL_D);
542 regs->tf_cs = _ucode32sel;
543 regs->tf_ss = _udatasel;
544 regs->tf_ds = _udatasel;
545 regs->tf_es = _udatasel;
546 regs->tf_fs = _ufssel;
547 regs->tf_gs = _ugssel;
548 regs->tf_flags = TF_HASSEGS;
549 td->td_pcb->pcb_full_iret = 1;
/* Re-take ps_mtx, which was dropped before the frame was built. */
551 mtx_lock(&psp->ps_mtx);
555 * System call to cleanup state after a signal
556 * has been taken. Reset signal mask and
557 * stack state from context left by sendsig (above).
558 * Return to previous pc and psl as specified by
559 * context left by sendsig. Check carefully to
560 * make sure that the user has not modified the
561 * psl to gain improper privileges or to cause
/*
 * linux_sigreturn: restore the thread context saved in a struct l_sigframe.
 *
 *   td   - calling thread; its signal mask and pcb_full_iret are rewritten
 *   args - args->sfp is the user address of the frame to restore
 *
 * The frame is copied in from user space.  Its eflags value is checked with
 * EFLAGS_SECURE against regs->tf_rflags (PSL_RF is masked out of both
 * sides) and its %cs with CS_SECURE; a bad %cs posts SIGBUS with code
 * BUS_OBJERR and trap number T_PROTFLT through trapsignal().  The signal
 * mask is then rebuilt from sc_mask plus sf_extramask, passed through
 * SIG_CANTMASK, and the general and segment registers are loaded from the
 * frame into regs.
 *
 * Returns EJUSTRETURN once the context has been restored.
 * NOTE(review): the assignment of regs and the return values used when the
 * copyin or a check fails are not visible in this listing - confirm against
 * the full source.
 */
565 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
567 struct proc *p = td->td_proc;
568 struct l_sigframe frame;
569 struct trapframe *regs;
577 if (ldebug(sigreturn))
578 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
581 * The trampoline code hands us the sigframe.
582 * It is unsafe to keep track of it ourselves, in the event that a
583 * program jumps out of a signal handler.
585 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
589 * Check for security violations.
591 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
592 eflags = frame.sf_sc.sc_eflags;
594 * XXX do allow users to change the privileged flag PSL_RF. The
595 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
596 * sometimes set it there too. tf_eflags is kept in the signal
597 * context during signal handling and there is no other place
598 * to remember it, so the PSL_RF bit may be corrupted by the
599 * signal handler without us knowing. Corruption of the PSL_RF
600 * bit at worst causes one more or one less debugger trap, so
601 * allowing it is fairly harmless.
603 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
607 * Don't allow users to load a valid privileged %cs. Let the
608 * hardware check for invalid selectors, excess privilege in
609 * other selectors, invalid %eip's and invalid %esp's.
611 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
612 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
613 ksiginfo_init_trap(&ksi);
614 ksi.ksi_signo = SIGBUS;
615 ksi.ksi_code = BUS_OBJERR;
616 ksi.ksi_trapno = T_PROTFLT;
617 ksi.ksi_addr = (void *)regs->tf_rip;
618 trapsignal(td, &ksi);
622 lmask.__bits[0] = frame.sf_sc.sc_mask;
623 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
624 lmask.__bits[i+1] = frame.sf_extramask[i];
626 linux_to_bsd_sigset(&lmask, &td->td_sigmask);
627 SIG_CANTMASK(td->td_sigmask);
632 * Restore signal context.
634 regs->tf_rdi = frame.sf_sc.sc_edi;
635 regs->tf_rsi = frame.sf_sc.sc_esi;
636 regs->tf_rbp = frame.sf_sc.sc_ebp;
637 regs->tf_rbx = frame.sf_sc.sc_ebx;
638 regs->tf_rdx = frame.sf_sc.sc_edx;
639 regs->tf_rcx = frame.sf_sc.sc_ecx;
640 regs->tf_rax = frame.sf_sc.sc_eax;
641 regs->tf_rip = frame.sf_sc.sc_eip;
642 regs->tf_cs = frame.sf_sc.sc_cs;
643 regs->tf_ds = frame.sf_sc.sc_ds;
644 regs->tf_es = frame.sf_sc.sc_es;
645 regs->tf_fs = frame.sf_sc.sc_fs;
646 regs->tf_gs = frame.sf_sc.sc_gs;
647 regs->tf_rflags = eflags;
648 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
649 regs->tf_ss = frame.sf_sc.sc_ss;
650 td->td_pcb->pcb_full_iret = 1;
652 return (EJUSTRETURN);
656 * System call to cleanup state after a signal
657 * has been taken. Reset signal mask and
658 * stack state from context left by rt_sendsig (above).
659 * Return to previous pc and psl as specified by
660 * context left by sendsig. Check carefully to
661 * make sure that the user has not modified the
662 * psl to gain improper privileges or to cause
/*
 * linux_rt_sigreturn: restore the thread context saved in a struct
 * l_ucontext by the rt signal delivery path.
 *
 *   td   - calling thread; its signal mask and pcb_full_iret are rewritten
 *   args - args->ucp is the user address of the ucontext to restore
 *
 * The ucontext is copied in from user space and its uc_mcontext is checked
 * exactly as in linux_sigreturn: eflags through EFLAGS_SECURE (PSL_RF
 * masked out of both sides) and %cs through CS_SECURE, with a bad %cs
 * posting SIGBUS/BUS_OBJERR via trapsignal().  The signal mask is then
 * taken from uc_sigmask, passed through SIG_CANTMASK, the registers are
 * loaded from the context, and finally an alternate stack description is
 * converted from lss and handed to kern_sigaltstack(), whose result is
 * deliberately discarded.
 *
 * Returns EJUSTRETURN once the context has been restored.
 * NOTE(review): the assignments of regs and lss and the return values used
 * when the copyin or a check fails are not visible in this listing; lss
 * presumably points at uc.uc_stack - confirm against the full source.
 */
666 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
668 struct proc *p = td->td_proc;
669 struct l_ucontext uc;
670 struct l_sigcontext *context;
673 struct trapframe *regs;
680 if (ldebug(rt_sigreturn))
681 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
684 * The trampoline code hands us the ucontext.
685 * It is unsafe to keep track of it ourselves, in the event that a
686 * program jumps out of a signal handler.
688 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
691 context = &uc.uc_mcontext;
694 * Check for security violations.
696 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
697 eflags = context->sc_eflags;
699 * XXX do allow users to change the privileged flag PSL_RF. The
700 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
701 * sometimes set it there too. tf_eflags is kept in the signal
702 * context during signal handling and there is no other place
703 * to remember it, so the PSL_RF bit may be corrupted by the
704 * signal handler without us knowing. Corruption of the PSL_RF
705 * bit at worst causes one more or one less debugger trap, so
706 * allowing it is fairly harmless.
708 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
712 * Don't allow users to load a valid privileged %cs. Let the
713 * hardware check for invalid selectors, excess privilege in
714 * other selectors, invalid %eip's and invalid %esp's.
716 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
717 if (!CS_SECURE(context->sc_cs)) {
718 ksiginfo_init_trap(&ksi);
719 ksi.ksi_signo = SIGBUS;
720 ksi.ksi_code = BUS_OBJERR;
721 ksi.ksi_trapno = T_PROTFLT;
722 ksi.ksi_addr = (void *)regs->tf_rip;
723 trapsignal(td, &ksi);
728 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
729 SIG_CANTMASK(td->td_sigmask);
734 * Restore signal context
736 regs->tf_gs = context->sc_gs;
737 regs->tf_fs = context->sc_fs;
738 regs->tf_es = context->sc_es;
739 regs->tf_ds = context->sc_ds;
740 regs->tf_rdi = context->sc_edi;
741 regs->tf_rsi = context->sc_esi;
742 regs->tf_rbp = context->sc_ebp;
743 regs->tf_rbx = context->sc_ebx;
744 regs->tf_rdx = context->sc_edx;
745 regs->tf_rcx = context->sc_ecx;
746 regs->tf_rax = context->sc_eax;
747 regs->tf_rip = context->sc_eip;
748 regs->tf_cs = context->sc_cs;
749 regs->tf_rflags = eflags;
750 regs->tf_rsp = context->sc_esp_at_signal;
751 regs->tf_ss = context->sc_ss;
752 td->td_pcb->pcb_full_iret = 1;
755 * call sigaltstack & ignore results..
758 ss.ss_sp = PTRIN(lss->ss_sp);
759 ss.ss_size = lss->ss_size;
760 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
763 if (ldebug(rt_sigreturn))
764 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
765 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
767 (void)kern_sigaltstack(td, &ss, NULL);
769 return (EJUSTRETURN);
/*
 * linux_prepsyscall: gather the system call arguments from the trapframe.
 *
 *   tf     - trapframe of the thread making the call
 *   args   - output array; slots 0..5 receive the low int-sized part of
 *            %rbx, %rcx, %rdx, %rsi, %rdi and %rbp, in that order
 *   code   - NOTE(review): no store through this pointer is visible in this
 *            listing - confirm against the full source
 *   params - set to NULL, so no argument copyin is requested
 */
776 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
778 args[0] = tf->tf_rbx;
779 args[1] = tf->tf_rcx;
780 args[2] = tf->tf_rdx;
781 args[3] = tf->tf_rsi;
782 args[4] = tf->tf_rdi;
783 args[5] = tf->tf_rbp; /* Unconfirmed */
784 *params = NULL; /* no copyin */
788 * If a linux binary is exec'ing something, try this image activator
789 * first. We override standard shell script execution in order to
790 * be able to modify the interpreter path. We only do this if a linux
791 * binary is doing the exec, so we do not create an EXEC module for it.
793 static int exec_linux_imgact_try(struct image_params *iparams);
/*
 * exec_linux_imgact_try: image activator hook that handles shell scripts
 * for Linux processes.
 *
 *   imgp - image parameters; image_header is inspected and
 *          interpreter_name may be overwritten
 *
 * When the first 16-bit word of the image header equals SHELLMAGIC, the
 * regular exec_shell_imgact() is run.  If it succeeds, the interpreter name
 * is passed to linux_emul_convpath() and, when the resulting rpath
 * (including its terminator) fits in MAXSHELLCMDLEN bytes, rpath replaces
 * imgp->interpreter_name.
 * NOTE(review): the return statements and the handling of rpath afterwards
 * are not visible in this listing.
 */
796 exec_linux_imgact_try(struct image_params *imgp)
798 const char *head = (const char *)imgp->image_header;
803 * The interpreter for shell scripts run from a linux binary needs
804 * to be located in /compat/linux if possible in order to recursively
805 * maintain linux path emulation.
807 if (((const short *)head)[0] == SHELLMAGIC) {
809 * Run our normal shell image activator. If it succeeds attempt
810 * to use the alternate path for the interpreter. If an
811 * alternate * path is found, use our stringspace to store it.
813 if ((error = exec_shell_imgact(imgp)) == 0) {
814 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
815 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
818 len = strlen(rpath) + 1;
820 if (len <= MAXSHELLCMDLEN) {
821 memcpy(imgp->interpreter_name, rpath,
832 * Clear registers on exec
833 * XXX copied from ia32_signal.c.
/*
 * exec_linux_setregs: initialise the register state of a thread at exec
 * time for a 32-bit Linux image.
 *
 *   td         - thread being set up; td_frame and td_pcb are rewritten
 *   entry      - stored in tf_rip
 *   stack      - stored in tf_rsp
 *   ps_strings - stored in tf_rbx
 *
 * The trapframe is zeroed and then filled with the entry point, the stack
 * pointer and the user segment selectors (_ucode32sel for %cs).  The pcb is
 * flagged PCB_FULLCTX | PCB_32BIT with PCB_GS32BIT cleared, and
 * td_retval[1] is reset to 0.
 */
836 exec_linux_setregs(td, entry, stack, ps_strings)
842 struct trapframe *regs = td->td_frame;
843 struct pcb *pcb = td->td_pcb;
846 if (td->td_proc->p_md.md_ldt != NULL)
849 mtx_unlock(&dt_lock);
/* Zero both the FS base and the kernel GS base MSRs. */
852 wrmsr(MSR_FSBASE, 0);
853 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
857 pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
859 bzero((char *)regs, sizeof(struct trapframe));
860 regs->tf_rip = entry;
861 regs->tf_rsp = stack;
/*
 * NOTE(review): regs was zeroed by the bzero() above, so the term
 * (regs->tf_rflags & PSL_T) is always 0 here and tf_rflags ends up as
 * plain PSL_USER.  If preserving the trace flag was intended, it would
 * have to be sampled before the bzero().
 */
862 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
863 regs->tf_gs = _ugssel;
864 regs->tf_fs = _ufssel;
865 regs->tf_es = _udatasel;
866 regs->tf_ds = _udatasel;
867 regs->tf_ss = _udatasel;
868 regs->tf_flags = TF_HASSEGS;
869 regs->tf_cs = _ucode32sel;
870 regs->tf_rbx = ps_strings;
871 td->td_pcb->pcb_full_iret = 1;
/* Set CR0_MP and CR0_TS in %cr0. */
872 load_cr0(rcr0() | CR0_MP | CR0_TS);
875 /* Return via doreti so that we can change to a different %cs */
876 pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT;
877 pcb->pcb_flags &= ~PCB_GS32BIT;
878 td->td_retval[1] = 0;
882 * XXX copied from ia32_sysvec.c.
/*
 * linux_copyout_strings: lay out the top of the user stack for a new
 * 32-bit Linux image.
 *
 *   imgp - image parameters; imgp->args supplies the argument and
 *          environment strings and counts, and imgp->auxarg_size is given
 *          a default of LINUX_AT_COUNT * 2 when it is zero
 *
 * Working down from LINUX32_PS_STRINGS the visible code places the signal
 * trampoline (copied from sv_sigcode), the linux_platform string, and the
 * string area; below the strings it reserves the 32-bit pointer vector.
 * It then copies the strings out, records the vector addresses and counts
 * in the linux32_ps_strings structure, and fills one 32-bit pointer per
 * argument and per environment string, with a zero word after the
 * argument pointers.
 *
 * Returns stack_base cast to register_t *.
 * NOTE(review): none of the copyout() or suword32() results are checked in
 * the lines visible here, so a failed user write goes unnoticed.
 */
885 linux_copyout_strings(struct image_params *imgp)
889 char *stringp, *destp;
890 u_int32_t *stack_base;
891 struct linux32_ps_strings *arginfo;
894 * Calculate string base and vector table pointers.
895 * Also deal with signal trampoline code for this exec type.
897 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
898 destp = (caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
899 linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
/* The trampoline occupies the linux_szsigcode bytes ending at arginfo. */
905 copyout(imgp->proc->p_sysent->sv_sigcode,
906 ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
909 * Install LINUX_PLATFORM
911 copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
912 linux_szplatform), linux_szplatform);
915 * If we have a valid auxargs ptr, prepare some room
920 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
921 * lower compatibility.
923 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
924 (LINUX_AT_COUNT * 2);
926 * The '+ 2' is for the null pointers at the end of each of
927 * the arg and env vector sets,and imgp->auxarg_size is room
928 * for argument of Runtime loader.
930 vectp = (u_int32_t *) (destp - (imgp->args->argc +
931 imgp->args->envc + 2 + imgp->auxarg_size) *
936 * The '+ 2' is for the null pointers at the end of each of
937 * the arg and env vector sets
939 vectp = (u_int32_t *)(destp - (imgp->args->argc +
940 imgp->args->envc + 2) * sizeof(u_int32_t));
943 * vectp also becomes our initial stack base
947 stringp = imgp->args->begin_argv;
948 argc = imgp->args->argc;
949 envc = imgp->args->envc;
951 * Copy out strings - arguments and environment.
/* The used part of the string buffer is ARG_MAX minus the free space. */
953 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
956 * Fill in "ps_strings" struct for ps, w, etc.
958 suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
959 suword32(&arginfo->ps_nargvstr, argc);
962 * Fill in argument portion of vector table.
964 for (; argc > 0; --argc) {
965 suword32(vectp++, (uint32_t)(intptr_t)destp);
966 while (*stringp++ != 0)
971 /* a null vector table pointer separates the argp's from the envp's */
972 suword32(vectp++, 0);
974 suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
975 suword32(&arginfo->ps_nenvstr, envc);
978 * Fill in environment portion of vector table.
980 for (; envc > 0; --envc) {
981 suword32(vectp++, (uint32_t)(intptr_t)destp);
982 while (*stringp++ != 0)
987 /* end of vector table is a null pointer */
990 return ((register_t *)stack_base);
/*
 * The compat.linux32 sysctl node and its three read-write tunables.
 * maxdsiz, maxssiz and maxvmem start at LINUX32_MAXDSIZ, LINUX32_MAXSSIZ
 * and LINUX32_MAXVMEM; linux32_fixlimit() clamps resource limits to them,
 * and maxssiz is also exported through the sv_maxssiz field of the
 * sysentvec.
 */
993 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
994 "32-bit Linux emulation");
996 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
997 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
998 &linux32_maxdsiz, 0, "");
999 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
1000 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
1001 &linux32_maxssiz, 0, "");
1002 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
1003 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1004 &linux32_maxvmem, 0, "");
/*
 * linux32_fixlimit: clamp a resource limit to the compat.linux32 tunables.
 *
 *   rl    - limit to adjust in place; both rlim_cur and rlim_max are
 *           lowered to the tunable when they exceed it
 *   which - NOTE(review): presumably selects which of the three groups
 *           below applies; the case labels are not visible in this listing
 *
 * A tunable value of 0 disables clamping for its group.
 */
1007 linux32_fixlimit(struct rlimit *rl, int which)
/* Clamp against linux32_maxdsiz. */
1012 if (linux32_maxdsiz != 0) {
1013 if (rl->rlim_cur > linux32_maxdsiz)
1014 rl->rlim_cur = linux32_maxdsiz;
1015 if (rl->rlim_max > linux32_maxdsiz)
1016 rl->rlim_max = linux32_maxdsiz;
/* Clamp against linux32_maxssiz. */
1020 if (linux32_maxssiz != 0) {
1021 if (rl->rlim_cur > linux32_maxssiz)
1022 rl->rlim_cur = linux32_maxssiz;
1023 if (rl->rlim_max > linux32_maxssiz)
1024 rl->rlim_max = linux32_maxssiz;
/* Clamp against linux32_maxvmem. */
1028 if (linux32_maxvmem != 0) {
1029 if (rl->rlim_cur > linux32_maxvmem)
1030 rl->rlim_cur = linux32_maxvmem;
1031 if (rl->rlim_max > linux32_maxvmem)
1032 rl->rlim_max = linux32_maxvmem;
/*
 * System entry vector for the Linux ELF32 ABI.  It ties together the
 * pieces defined in this file: the linux_sysent syscall table, the signal
 * and errno translation tables, the signal trampoline and its size, and
 * the fixup, sendsig, prepsyscall, imgact_try, copyout_strings, setregs
 * and fixlimit hooks.  The user address space is described by the
 * LINUX32_USRSTACK and LINUX32_PS_STRINGS constants.
 */
1038 struct sysentvec elf_linux_sysvec = {
1039 .sv_size = LINUX_SYS_MAXSYSCALL,
1040 .sv_table = linux_sysent,
1042 .sv_sigsize = LINUX_SIGTBLSZ,
1043 .sv_sigtbl = bsd_to_linux_signal,
1044 .sv_errsize = ELAST + 1,
1045 .sv_errtbl = bsd_to_linux_errno,
1046 .sv_transtrap = translate_traps,
1047 .sv_fixup = elf_linux_fixup,
1048 .sv_sendsig = linux_sendsig,
1049 .sv_sigcode = linux_sigcode,
1050 .sv_szsigcode = &linux_szsigcode,
1051 .sv_prepsyscall = linux_prepsyscall,
1052 .sv_name = "Linux ELF32",
1053 .sv_coredump = elf32_coredump,
1054 .sv_imgact_try = exec_linux_imgact_try,
1055 .sv_minsigstksz = LINUX_MINSIGSTKSZ,
1056 .sv_pagesize = PAGE_SIZE,
1057 .sv_minuser = VM_MIN_ADDRESS,
1058 .sv_maxuser = LINUX32_USRSTACK,
1059 .sv_usrstack = LINUX32_USRSTACK,
1060 .sv_psstrings = LINUX32_PS_STRINGS,
1061 .sv_stackprot = VM_PROT_ALL,
1062 .sv_copyout_strings = linux_copyout_strings,
1063 .sv_setregs = exec_linux_setregs,
1064 .sv_fixlimit = linux32_fixlimit,
1065 .sv_maxssiz = &linux32_maxssiz,
1066 .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32
1069 static char GNULINUX_ABI_VENDOR[] = "GNU";
1071 static Elf_Brandnote linux32_brandnote = {
1072 .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR),
1075 .vendor = GNULINUX_ABI_VENDOR,
/*
 * ELF brand entry for Linux binaries whose interpreter is
 * /lib/ld-linux.so.1.  It selects elf_linux_sysvec, roots path emulation
 * at /compat/linux and refers to linux32_brandnote.
 */
1079 static Elf32_Brandinfo linux_brand = {
1080 .brand = ELFOSABI_LINUX,
1082 .compat_3_brand = "Linux",
1083 .emul_path = "/compat/linux",
1084 .interp_path = "/lib/ld-linux.so.1",
1085 .sysvec = &elf_linux_sysvec,
1086 .interp_newpath = NULL,
1087 .brand_note = &linux32_brandnote,
1088 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * ELF brand entry for Linux binaries whose interpreter is
 * /lib/ld-linux.so.2.  Apart from interp_path it carries the same visible
 * settings as linux_brand.
 */
1091 static Elf32_Brandinfo linux_glibc2brand = {
1092 .brand = ELFOSABI_LINUX,
1094 .compat_3_brand = "Linux",
1095 .emul_path = "/compat/linux",
1096 .interp_path = "/lib/ld-linux.so.2",
1097 .sysvec = &elf_linux_sysvec,
1098 .interp_newpath = NULL,
1099 .brand_note = &linux32_brandnote,
1100 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
1103 Elf32_Brandinfo *linux_brandlist[] = {
/*
 * linux_elf_modevent: module event handler for the 32-bit Linux ELF
 * emulation.
 *
 *   mod, type, data - module event arguments; NOTE(review): type presumably
 *                     selects between the two paths below, but the case
 *                     labels and the error bookkeeping are not visible in
 *                     this listing
 *
 * Install path (the statements up to the handler installed message):
 * inserts every entry of linux_brandlist with elf32_insert_brand_entry(),
 * registers the ioctl and device handler sets, initialises emul_lock,
 * emul_shared_lock, futex_list and futex_mtx, hooks the process_exit,
 * schedtail and process_exec events, computes linux_szplatform from the
 * length of linux_platform, registers the jail OSD and sets stclohz to
 * stathz, falling back to hz when stathz is 0.
 *
 * Removal path: checks each brand with elf32_brand_inuse(), removes the
 * brands with elf32_remove_brand_entry(), then unregisters the handler
 * sets, destroys the locks, deregisters the three event handlers and the
 * jail OSD.
 */
1110 linux_elf_modevent(module_t mod, int type, void *data)
1112 Elf32_Brandinfo **brandinfo;
1114 struct linux_ioctl_handler **lihp;
1115 struct linux_device_handler **ldhp;
1121 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1123 if (elf32_insert_brand_entry(*brandinfo) < 0)
1126 SET_FOREACH(lihp, linux_ioctl_handler_set)
1127 linux_ioctl_register_handler(*lihp);
1128 SET_FOREACH(ldhp, linux_device_handler_set)
1129 linux_device_register_handler(*ldhp);
1130 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1131 sx_init(&emul_shared_lock, "emuldata->shared lock");
1132 LIST_INIT(&futex_list);
1133 mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1134 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1135 linux_proc_exit, NULL, 1000);
1136 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1137 linux_schedtail, NULL, 1000);
1138 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1139 linux_proc_exec, NULL, 1000);
1140 linux_szplatform = roundup(strlen(linux_platform) + 1,
1142 linux_osd_jail_register();
1143 stclohz = (stathz ? stathz : hz);
1145 printf("Linux ELF exec handler installed\n");
1147 printf("cannot insert Linux ELF brand handler\n");
1150 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1152 if (elf32_brand_inuse(*brandinfo))
1155 for (brandinfo = &linux_brandlist[0];
1156 *brandinfo != NULL; ++brandinfo)
1157 if (elf32_remove_brand_entry(*brandinfo) < 0)
1161 SET_FOREACH(lihp, linux_ioctl_handler_set)
1162 linux_ioctl_unregister_handler(*lihp);
1163 SET_FOREACH(ldhp, linux_device_handler_set)
1164 linux_device_unregister_handler(*ldhp);
1165 mtx_destroy(&emul_lock);
1166 sx_destroy(&emul_shared_lock);
1167 mtx_destroy(&futex_mtx);
1168 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1169 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1170 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1171 linux_osd_jail_deregister();
1173 printf("Linux ELF exec handler removed\n");
1175 printf("Could not deinstall ELF interpreter entry\n");
1183 static moduledata_t linux_elf_mod = {
1189 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);