2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
41 #define __ELF_WORD_SIZE 32
43 #include <sys/param.h>
44 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_ioctl.h>
82 #include <compat/linux/linux_mib.h>
83 #include <compat/linux/linux_misc.h>
84 #include <compat/linux/linux_signal.h>
85 #include <compat/linux/linux_util.h>
86 #include <compat/linux/linux_vdso.h>
/* Module version, and the malloc type M_LINUX declared for this module. */
88 MODULE_VERSION(linux, 1);
90 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
/*
 * Store one (id, val) auxiliary-vector entry as two consecutive 32-bit
 * words through suword32(), advancing pos past both words.
 */
92 #define AUXARGS_ENTRY_32(pos, id, val) \
94 suword32(pos++, id); \
95 suword32(pos++, val); \
98 #if BYTE_ORDER == LITTLE_ENDIAN
/*
 * The script marker bytes read as a single 16-bit value; compared against
 * the first short of the image header in exec_linux_imgact_try().
 */
99 #define SHELLMAGIC 0x2123 /* #! */
101 #define SHELLMAGIC 0x2321
105 * Allow the sendsig functions to use the ldebug() facility
106 * even though they are not syscalls themselves. Map them
107 * to syscall 0. This is slightly less bogus than using
110 #define LINUX_SYS_linux_rt_sendsig 0
111 #define LINUX_SYS_linux_sendsig 0
/*
 * linux_platform is copied out below the ps_strings area and advertised
 * through LINUX_AT_PLATFORM; linux_szplatform is its rounded-up size,
 * computed when the module is loaded.  linux_szsigcode is the size of the
 * linux32_locore object delimited by the two _binary_* symbols.
 */
113 const char *linux_platform = "i686";
114 static int linux_szplatform;
115 static int linux_szsigcode;
116 static vm_object_t linux_shared_page_obj;
117 static char *linux_shared_page_mapping;
118 extern char _binary_linux32_locore_o_start;
119 extern char _binary_linux32_locore_o_end;
121 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
/* Linker sets walked at module load/unload to (un)register handlers. */
123 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
124 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
/* Forward declarations for the hooks and helpers defined later in this file. */
126 static int elf_linux_fixup(register_t **stack_base,
127 struct image_params *iparams);
128 static register_t *linux_copyout_strings(struct image_params *imgp);
129 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
130 static void exec_linux_setregs(struct thread *td,
131 struct image_params *imgp, u_long stack);
132 static void linux32_fixlimit(struct rlimit *rl, int which);
133 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
134 static void linux_vdso_install(void *param);
135 static void linux_vdso_deinstall(void *param);
/*
 * Tags returned by EVENTHANDLER_REGISTER() in linux_elf_modevent(); kept so
 * the same handlers can be passed to EVENTHANDLER_DEREGISTER() later.
 */
137 static eventhandler_tag linux_exit_tag;
138 static eventhandler_tag linux_exec_tag;
139 static eventhandler_tag linux_thread_dtor_tag;
142 * Linux syscalls return negative errno's, we do positive and map them
144 * FreeBSD: src/sys/sys/errno.h
145 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
146 * linux-2.6.17.8/include/asm-generic/errno.h
/*
 * Errno translation table, installed as sv_errtbl in elf_linux_sysvec.
 * It has ELAST + 1 slots, one per FreeBSD errno value; each slot holds the
 * negated Linux errno to report.  Most slots are the identity mapping;
 * the exceptions are where the two numbering schemes differ (for example
 * slot 11 holds -35 and slot 35 holds -11).
 */
148 static int bsd_to_linux_errno[ELAST + 1] = {
149 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
150 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
151 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
152 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
153 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
154 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
155 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
156 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
157 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
/*
 * FreeBSD to Linux signal number table, installed as sv_sigtbl in
 * elf_linux_sysvec.  The sendsig routines index it with _SIG_IDX(sig)
 * to obtain the Linux signal number placed in the signal frame.
 * NOTE(review): the 0 entries presumably mark FreeBSD signals with no
 * Linux counterpart -- confirm.
 */
161 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
162 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
163 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
164 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
165 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
166 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
167 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
168 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
169 0, LINUX_SIGUSR1, LINUX_SIGUSR2
/*
 * Reverse signal table: FreeBSD signal numbers, LINUX_SIGTBLSZ entries.
 * It is not referenced elsewhere in this file.
 * NOTE(review): presumably indexed by the zero-based Linux signal number,
 * mirroring bsd_to_linux_signal -- confirm against the users of this table.
 */
172 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
173 SIGHUP, SIGINT, SIGQUIT, SIGILL,
174 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
175 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
176 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
177 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
178 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
179 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
180 SIGIO, SIGURG, SIGSYS
/*
 * Trap number translation.  The table is indexed by the FreeBSD trap code
 * (the trailing comment on each entry gives the index and, where one
 * exists, the T_* name) and yields the trap number stored in sc_trapno of
 * the Linux signal context.  LINUX_T_UNKNOWN marks codes with no mapping.
 */
183 #define LINUX_T_UNKNOWN 255
184 static int _bsd_to_linux_trapcode[] = {
185 LINUX_T_UNKNOWN, /* 0 */
186 6, /* 1 T_PRIVINFLT */
187 LINUX_T_UNKNOWN, /* 2 */
189 LINUX_T_UNKNOWN, /* 4 */
190 LINUX_T_UNKNOWN, /* 5 */
191 16, /* 6 T_ARITHTRAP */
192 254, /* 7 T_ASTFLT */
193 LINUX_T_UNKNOWN, /* 8 */
194 13, /* 9 T_PROTFLT */
195 1, /* 10 T_TRCTRAP */
196 LINUX_T_UNKNOWN, /* 11 */
197 14, /* 12 T_PAGEFLT */
198 LINUX_T_UNKNOWN, /* 13 */
199 17, /* 14 T_ALIGNFLT */
200 LINUX_T_UNKNOWN, /* 15 */
201 LINUX_T_UNKNOWN, /* 16 */
202 LINUX_T_UNKNOWN, /* 17 */
208 8, /* 23 T_DOUBLEFLT */
209 9, /* 24 T_FPOPFLT */
210 10, /* 25 T_TSSFLT */
211 11, /* 26 T_SEGNPFLT */
212 12, /* 27 T_STKFLT */
214 19, /* 29 T_XMMFLT */
215 15 /* 30 T_RESERVED */
/*
 * Bounds-checked lookup: codes below the element count of the table are
 * translated through it.  The comparison is against a sizeof-derived
 * (unsigned) count.
 */
217 #define bsd_to_linux_trapcode(code) \
218 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
219 _bsd_to_linux_trapcode[(code)]: \
/*
 * 32-bit layout of the ps_strings block located at LINUX32_PS_STRINGS.
 * linux_copyout_strings() fills it with suword32(): the two pointer
 * fields receive the user addresses of the argv and envp vectors, the two
 * counters receive argc and envc.
 */
222 struct linux32_ps_strings {
223 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
224 u_int ps_nargvstr; /* the number of argument strings */
225 u_int32_t ps_envstr; /* first of 0 or more environment strings */
226 u_int ps_nenvstr; /* the number of environment strings */
/*
 * Symbols declared through LINUX_VDSO_SYM_INTPTR.  In this file
 * linux32_sigcode and linux32_rt_sigcode are loaded into tf_rip when a
 * signal handler is started, and linux32_vsyscall is exported to the
 * process as the LINUX_AT_SYSINFO auxiliary-vector value.
 */
229 LINUX_VDSO_SYM_INTPTR(linux32_sigcode);
230 LINUX_VDSO_SYM_INTPTR(linux32_rt_sigcode);
231 LINUX_VDSO_SYM_INTPTR(linux32_vsyscall);
234 * If FreeBSD & Linux have a difference of opinion about what a trap
235 * means, deal with it here.
240 translate_traps(int signal, int trap_code)
242 if (signal != SIGBUS)
/*
 * sv_fixup hook: write the ELF auxiliary vector for a freshly exec-ed
 * 32-bit Linux image and finish the initial stack.
 *
 * stack_base  in/out; on entry points at the argv vector, on return holds
 *             the final stack base value stored from base.
 * imgp        exec parameters; imgp->auxargs is consumed and freed here.
 *
 * The vector is written right after the argv and envp pointer arrays
 * (argc + envc + 2 words past base), one (id, value) pair per
 * AUXARGS_ENTRY_32 use, terminated by AT_NULL.
 */
256 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
260 Elf32_Addr *pos, *uplatform;
261 struct linux32_ps_strings *arginfo;
/* The platform string sits linux_szplatform bytes below ps_strings. */
263 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
264 uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szplatform);
266 KASSERT(curthread->td_proc == imgp->proc,
267 ("unsafe elf_linux_fixup(), should be curproc"));
268 base = (Elf32_Addr *)*stack_base;
269 args = (Elf32_Auxargs *)imgp->auxargs;
270 pos = base + (imgp->args->argc + imgp->args->envc + 2);
272 AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO_EHDR,
273 imgp->proc->p_sysent->sv_shared_page_base);
274 AUXARGS_ENTRY_32(pos, LINUX_AT_SYSINFO, linux32_vsyscall);
275 AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
278 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
279 * as it has appeared in the 2.4.0-rc7 first time.
280 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
281 * glibc falls back to the hard-coded CLK_TCK value when aux entry
283 * Also see linux_times() implementation.
285 if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
286 AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
287 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
288 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
289 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
290 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
291 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
292 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
293 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
294 AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
/*
 * NOTE(review): AT_EUID and AT_EGID are filled from the saved ids
 * (cr_svuid, cr_svgid), not from the effective ids -- confirm this is
 * intended.
 */
295 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
296 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
297 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
298 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
299 AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
300 if (args->execfd != -1)
301 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
302 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
/* The auxargs buffer is no longer needed once it has been copied out. */
304 free(imgp->auxargs, M_TEMP);
305 imgp->auxargs = NULL;
/* argc is stored as a 32-bit word at base. */
308 suword32(base, (uint32_t)imgp->args->argc);
309 *stack_base = (register_t *)base;
/*
 * Deliver a signal to the current thread using the Linux rt frame layout
 * (struct l_rt_sigframe).  linux_sendsig() calls this for handlers that
 * were installed with SA_SIGINFO.
 *
 * catcher  user handler address, stored in the frame as sf_handler.
 * ksi      signal information; supplies the signal number, the code used
 *          for the trap number translation and the fault address.
 * mask     signal mask saved in the frame as uc_sigmask.
 *
 * The frame is built in a kernel copy, copied out to the chosen user
 * stack location, and the trapframe is then redirected so the thread
 * resumes at linux32_rt_sigcode with 32-bit user segment selectors.
 * The process lock and ps_mtx are asserted to be held on entry; ps_mtx is
 * released while the frame is built and taken again before returning.
 */
314 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
316 struct thread *td = curthread;
317 struct proc *p = td->td_proc;
319 struct trapframe *regs;
320 struct l_rt_sigframe *fp, frame;
325 sig = ksi->ksi_signo;
326 code = ksi->ksi_code;
327 PROC_LOCK_ASSERT(p, MA_OWNED);
329 mtx_assert(&psp->ps_mtx, MA_OWNED);
331 oonstack = sigonstack(regs->tf_rsp);
334 if (ldebug(rt_sendsig))
335 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
336 catcher, sig, (void*)mask, code);
339 * Allocate space for the signal handler context.
/*
 * Use the top of the alternate stack when one is enabled, the thread is
 * not already running on it and the signal asked for it; otherwise the
 * frame goes directly below the current stack pointer.
 */
341 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
342 SIGISMEMBER(psp->ps_sigonstack, sig)) {
343 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
344 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
346 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
347 mtx_unlock(&psp->ps_mtx);
350 * Build the argument list for the signal handler.
/* Translate the FreeBSD signal number through the per-ABI signal table. */
352 if (p->p_sysent->sv_sigtbl)
353 if (sig <= p->p_sysent->sv_sigsize)
354 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
356 bzero(&frame, sizeof(frame));
358 frame.sf_handler = PTROUT(catcher);
/* These two pointers refer to the user-space copy of the frame. */
360 frame.sf_siginfo = PTROUT(&fp->sf_si);
361 frame.sf_ucontext = PTROUT(&fp->sf_sc);
363 /* Fill in POSIX parts */
364 ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
367 * Build the signal context to be used by sigreturn
370 frame.sf_sc.uc_flags = 0; /* XXX ??? */
371 frame.sf_sc.uc_link = 0; /* XXX ??? */
373 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
374 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
375 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
376 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
379 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
/* Snapshot of the interrupted register state, restored by rt_sigreturn. */
381 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
382 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
383 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
384 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
385 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
386 frame.sf_sc.uc_mcontext.sc_esp = regs->tf_rsp;
387 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
388 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
389 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
390 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
391 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
392 frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
393 frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
394 frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
395 frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
396 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
397 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
398 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
399 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
400 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
401 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
404 if (ldebug(rt_sendsig))
405 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
406 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
407 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
410 if (copyout(&frame, fp, sizeof(frame)) != 0) {
412 * Process has trashed its stack; give it an illegal
413 * instruction to halt it in its tracks.
416 if (ldebug(rt_sendsig))
417 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
425 * Build context to run handler in.
/*
 * Resume in the rt trampoline on the new frame, with the trace and
 * direction flags cleared and the 32-bit user selectors loaded.
 */
427 regs->tf_rsp = PTROUT(fp);
428 regs->tf_rip = linux32_rt_sigcode;
429 regs->tf_rflags &= ~(PSL_T | PSL_D);
430 regs->tf_cs = _ucode32sel;
431 regs->tf_ss = _udatasel;
432 regs->tf_ds = _udatasel;
433 regs->tf_es = _udatasel;
434 regs->tf_fs = _ufssel;
435 regs->tf_gs = _ugssel;
436 regs->tf_flags = TF_HASSEGS;
437 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
439 mtx_lock(&psp->ps_mtx);
444 * Send an interrupt to process.
446 * Stack is set up to allow sigcode stored
447 * in u. to call routine, followed by kcall
448 * to sigreturn routine below. After sigreturn
449 * resets the signal mask, the stack, and the
450 * frame pointer, it returns to the user
/*
 * sv_sendsig hook: deliver a signal to the current thread using the
 * classic Linux frame layout (struct l_sigframe).
 *
 * catcher  user handler address, stored in the frame as sf_handler.
 * ksi      signal information (number, code, fault address).
 * mask     signal mask; word 0 goes into sc_mask, the remaining words
 *          into sf_extramask.
 *
 * Handlers installed with SA_SIGINFO are handed to linux_rt_sendsig()
 * instead.  Otherwise the frame is built in a kernel copy, copied out, and
 * the trapframe is redirected to linux32_sigcode.  The process lock and
 * ps_mtx are asserted to be held on entry; ps_mtx is released while the
 * frame is built and taken again before returning.
 */
454 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
456 struct thread *td = curthread;
457 struct proc *p = td->td_proc;
459 struct trapframe *regs;
460 struct l_sigframe *fp, frame;
465 sig = ksi->ksi_signo;
466 code = ksi->ksi_code;
467 PROC_LOCK_ASSERT(p, MA_OWNED);
469 mtx_assert(&psp->ps_mtx, MA_OWNED);
470 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
471 /* Signal handler installed with SA_SIGINFO. */
472 linux_rt_sendsig(catcher, ksi, mask);
477 oonstack = sigonstack(regs->tf_rsp);
481 printf(ARGS(sendsig, "%p, %d, %p, %u"),
482 catcher, sig, (void*)mask, code);
486 * Allocate space for the signal handler context.
/*
 * Top of the alternate stack if it is enabled, not already in use and
 * requested for this signal; otherwise directly below the current stack
 * pointer.
 */
488 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
489 SIGISMEMBER(psp->ps_sigonstack, sig)) {
490 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
491 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
493 fp = (struct l_sigframe *)regs->tf_rsp - 1;
494 mtx_unlock(&psp->ps_mtx);
498 * Build the argument list for the signal handler.
/* Translate the FreeBSD signal number through the per-ABI signal table. */
500 if (p->p_sysent->sv_sigtbl)
501 if (sig <= p->p_sysent->sv_sigsize)
502 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
504 bzero(&frame, sizeof(frame));
506 frame.sf_handler = PTROUT(catcher);
509 bsd_to_linux_sigset(mask, &lmask);
512 * Build the signal context to be used by sigreturn.
514 frame.sf_sc.sc_mask = lmask.__bits[0];
515 frame.sf_sc.sc_gs = regs->tf_gs;
516 frame.sf_sc.sc_fs = regs->tf_fs;
517 frame.sf_sc.sc_es = regs->tf_es;
518 frame.sf_sc.sc_ds = regs->tf_ds;
519 frame.sf_sc.sc_edi = regs->tf_rdi;
520 frame.sf_sc.sc_esi = regs->tf_rsi;
521 frame.sf_sc.sc_ebp = regs->tf_rbp;
522 frame.sf_sc.sc_ebx = regs->tf_rbx;
523 frame.sf_sc.sc_esp = regs->tf_rsp;
524 frame.sf_sc.sc_edx = regs->tf_rdx;
525 frame.sf_sc.sc_ecx = regs->tf_rcx;
526 frame.sf_sc.sc_eax = regs->tf_rax;
527 frame.sf_sc.sc_eip = regs->tf_rip;
528 frame.sf_sc.sc_cs = regs->tf_cs;
529 frame.sf_sc.sc_eflags = regs->tf_rflags;
530 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
531 frame.sf_sc.sc_ss = regs->tf_ss;
532 frame.sf_sc.sc_err = regs->tf_err;
533 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
534 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
/* Mask words beyond the first travel in sf_extramask. */
536 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
537 frame.sf_extramask[i] = lmask.__bits[i+1];
539 if (copyout(&frame, fp, sizeof(frame)) != 0) {
541 * Process has trashed its stack; give it an illegal
542 * instruction to halt it in its tracks.
549 * Build context to run handler in.
551 regs->tf_rsp = PTROUT(fp);
552 regs->tf_rip = linux32_sigcode;
553 regs->tf_rflags &= ~(PSL_T | PSL_D);
554 regs->tf_cs = _ucode32sel;
555 regs->tf_ss = _udatasel;
556 regs->tf_ds = _udatasel;
557 regs->tf_es = _udatasel;
558 regs->tf_fs = _ufssel;
559 regs->tf_gs = _ugssel;
560 regs->tf_flags = TF_HASSEGS;
561 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
563 mtx_lock(&psp->ps_mtx);
567 * System call to cleanup state after a signal
568 * has been taken. Reset signal mask and
569 * stack state from context left by sendsig (above).
570 * Return to previous pc and psl as specified by
571 * context left by sendsig. Check carefully to
572 * make sure that the user has not modified the
573 * psl to gain improper privileges or to cause
/*
 * Linux sigreturn(2) for the classic frame: copy in the l_sigframe that
 * the trampoline passes as args->sfp, validate it, restore the signal
 * mask and reload the trapframe from the saved context.
 *
 * Two checks guard the restore: the saved eflags may differ from the
 * current rflags only in PSL_USERCHANGE bits, and the saved %cs must have
 * user privilege level (a bad %cs raises SIGBUS/BUS_OBJERR via
 * trapsignal()).  On success EJUSTRETURN is returned so that the restored
 * registers are not overwritten with a syscall return value.
 */
577 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
579 struct l_sigframe frame;
580 struct trapframe *regs;
589 if (ldebug(sigreturn))
590 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
593 * The trampoline code hands us the sigframe.
594 * It is unsafe to keep track of it ourselves, in the event that a
595 * program jumps out of a signal handler.
597 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
601 * Check for security violations.
603 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
604 eflags = frame.sf_sc.sc_eflags;
605 if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
609 * Don't allow users to load a valid privileged %cs. Let the
610 * hardware check for invalid selectors, excess privilege in
611 * other selectors, invalid %eip's and invalid %esp's.
613 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
614 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
615 ksiginfo_init_trap(&ksi);
616 ksi.ksi_signo = SIGBUS;
617 ksi.ksi_code = BUS_OBJERR;
618 ksi.ksi_trapno = T_PROTFLT;
619 ksi.ksi_addr = (void *)regs->tf_rip;
620 trapsignal(td, &ksi);
/* Reassemble the Linux mask from sc_mask plus sf_extramask and install it. */
624 lmask.__bits[0] = frame.sf_sc.sc_mask;
625 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
626 lmask.__bits[i+1] = frame.sf_extramask[i];
627 linux_to_bsd_sigset(&lmask, &bmask);
628 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
631 * Restore signal context.
633 regs->tf_rdi = frame.sf_sc.sc_edi;
634 regs->tf_rsi = frame.sf_sc.sc_esi;
635 regs->tf_rbp = frame.sf_sc.sc_ebp;
636 regs->tf_rbx = frame.sf_sc.sc_ebx;
637 regs->tf_rdx = frame.sf_sc.sc_edx;
638 regs->tf_rcx = frame.sf_sc.sc_ecx;
639 regs->tf_rax = frame.sf_sc.sc_eax;
640 regs->tf_rip = frame.sf_sc.sc_eip;
641 regs->tf_cs = frame.sf_sc.sc_cs;
642 regs->tf_ds = frame.sf_sc.sc_ds;
643 regs->tf_es = frame.sf_sc.sc_es;
644 regs->tf_fs = frame.sf_sc.sc_fs;
645 regs->tf_gs = frame.sf_sc.sc_gs;
646 regs->tf_rflags = eflags;
647 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
648 regs->tf_ss = frame.sf_sc.sc_ss;
649 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
651 return (EJUSTRETURN);
655 * System call to cleanup state after a signal
656 * has been taken. Reset signal mask and
657 * stack state from context left by rt_sendsig (above).
658 * Return to previous pc and psl as specified by
659 * context left by rt_sendsig. Check carefully to
660 * make sure that the user has not modified the
661 * psl to gain improper privileges or to cause
/*
 * Linux rt_sigreturn(2): copy in the l_ucontext that the trampoline
 * passes as args->ucp, validate it, restore the signal mask from
 * uc_sigmask, reload the trapframe from uc_mcontext and re-apply the
 * saved alternate-stack settings.
 *
 * The same two checks as in linux_sigreturn() apply: eflags may change
 * only in PSL_USERCHANGE bits and %cs must be a user-privilege selector.
 * The result of kern_sigaltstack() is deliberately ignored.  Returns
 * EJUSTRETURN on success.
 */
665 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
667 struct l_ucontext uc;
668 struct l_sigcontext *context;
672 struct trapframe *regs;
679 if (ldebug(rt_sigreturn))
680 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
683 * The trampoline code hands us the ucontext.
684 * It is unsafe to keep track of it ourselves, in the event that a
685 * program jumps out of a signal handler.
687 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
690 context = &uc.uc_mcontext;
693 * Check for security violations.
695 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
696 eflags = context->sc_eflags;
697 if (!EFLAGS_SECURE(eflags, regs->tf_rflags))
701 * Don't allow users to load a valid privileged %cs. Let the
702 * hardware check for invalid selectors, excess privilege in
703 * other selectors, invalid %eip's and invalid %esp's.
705 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
706 if (!CS_SECURE(context->sc_cs)) {
707 ksiginfo_init_trap(&ksi);
708 ksi.ksi_signo = SIGBUS;
709 ksi.ksi_code = BUS_OBJERR;
710 ksi.ksi_trapno = T_PROTFLT;
711 ksi.ksi_addr = (void *)regs->tf_rip;
712 trapsignal(td, &ksi);
716 linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
717 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
720 * Restore signal context
722 regs->tf_gs = context->sc_gs;
723 regs->tf_fs = context->sc_fs;
724 regs->tf_es = context->sc_es;
725 regs->tf_ds = context->sc_ds;
726 regs->tf_rdi = context->sc_edi;
727 regs->tf_rsi = context->sc_esi;
728 regs->tf_rbp = context->sc_ebp;
729 regs->tf_rbx = context->sc_ebx;
730 regs->tf_rdx = context->sc_edx;
731 regs->tf_rcx = context->sc_ecx;
732 regs->tf_rax = context->sc_eax;
733 regs->tf_rip = context->sc_eip;
734 regs->tf_cs = context->sc_cs;
735 regs->tf_rflags = eflags;
736 regs->tf_rsp = context->sc_esp_at_signal;
737 regs->tf_ss = context->sc_ss;
738 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
741 * call sigaltstack & ignore results..
/*
 * Convert the Linux stack description lss into a native one.
 * NOTE(review): lss presumably points at uc.uc_stack -- confirm.
 */
744 ss.ss_sp = PTRIN(lss->ss_sp);
745 ss.ss_size = lss->ss_size;
746 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
749 if (ldebug(rt_sigreturn))
750 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
751 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
753 (void)kern_sigaltstack(td, &ss, NULL);
755 return (EJUSTRETURN);
759 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
762 struct trapframe *frame;
765 frame = td->td_frame;
767 sa->args[0] = frame->tf_rbx;
768 sa->args[1] = frame->tf_rcx;
769 sa->args[2] = frame->tf_rdx;
770 sa->args[3] = frame->tf_rsi;
771 sa->args[4] = frame->tf_rdi;
772 sa->args[5] = frame->tf_rbp; /* Unconfirmed */
773 sa->code = frame->tf_rax;
775 if (sa->code >= p->p_sysent->sv_size)
776 sa->callp = &p->p_sysent->sv_table[0];
778 sa->callp = &p->p_sysent->sv_table[sa->code];
779 sa->narg = sa->callp->sy_narg;
781 td->td_retval[0] = 0;
782 td->td_retval[1] = frame->tf_rdx;
788 * If a linux binary is exec'ing something, try this image activator
789 * first. We override standard shell script execution in order to
790 * be able to modify the interpreter path. We only do this if a linux
791 * binary is doing the exec, so we do not create an EXEC module for it.
/*
 * sv_imgact_try hook.  When the image header starts with SHELLMAGIC the
 * regular shell image activator is run first; if it succeeds, the
 * interpreter path it found is passed through linux_emul_convpath() and
 * the converted path, when one is produced, replaces both
 * imgp->interpreter_name and imgp->args->fname_buf.
 */
793 static int exec_linux_imgact_try(struct image_params *iparams);
796 exec_linux_imgact_try(struct image_params *imgp)
798 const char *head = (const char *)imgp->image_header;
803 * The interpreter for shell scripts run from a linux binary needs
804 * to be located in /compat/linux if possible in order to recursively
805 * maintain linux path emulation.
807 if (((const short *)head)[0] == SHELLMAGIC) {
809 * Run our normal shell image activator. If it succeeds attempt
810 * to use the alternate path for the interpreter. If an
811 * alternate * path is found, use our stringspace to store it.
813 if ((error = exec_shell_imgact(imgp)) == 0) {
814 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
815 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
818 imgp->args->fname_buf =
819 imgp->interpreter_name = rpath;
826 * Clear registers on exec
827 * XXX copied from ia32_signal.c.
830 exec_linux_setregs(struct thread *td, struct image_params *imgp, u_long stack)
832 struct trapframe *regs = td->td_frame;
833 struct pcb *pcb = td->td_pcb;
836 if (td->td_proc->p_md.md_ldt != NULL)
839 mtx_unlock(&dt_lock);
842 wrmsr(MSR_FSBASE, 0);
843 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
847 pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
849 bzero((char *)regs, sizeof(struct trapframe));
850 regs->tf_rip = imgp->entry_addr;
851 regs->tf_rsp = stack;
852 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
853 regs->tf_gs = _ugssel;
854 regs->tf_fs = _ufssel;
855 regs->tf_es = _udatasel;
856 regs->tf_ds = _udatasel;
857 regs->tf_ss = _udatasel;
858 regs->tf_flags = TF_HASSEGS;
859 regs->tf_cs = _ucode32sel;
860 regs->tf_rbx = imgp->ps_strings;
864 /* Do full restore on return so that we can change to a different %cs */
865 set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
866 td->td_retval[1] = 0;
870 * XXX copied from ia32_sysvec.c.
/*
 * sv_copyout_strings hook: lay out the top of the new user stack for a
 * 32-bit Linux image and return the initial stack base.
 *
 * Working down from LINUX32_PS_STRINGS the layout is: the platform
 * string, spare space, the argument and environment strings, and then
 * the vector table (argv pointers, NULL, envp pointers, NULL), with room
 * reserved for the auxiliary vector when imgp->auxargs is set.  The
 * ps_strings block is filled in with the vector addresses and counts.
 *
 * NOTE(review): the results of copyout() and suword32() are not checked
 * anywhere in this function, so a fault while writing the user stack goes
 * unnoticed here.
 */
873 linux_copyout_strings(struct image_params *imgp)
877 char *stringp, *destp;
878 u_int32_t *stack_base;
879 struct linux32_ps_strings *arginfo;
882 * Calculate string base and vector table pointers.
883 * Also deal with signal trampoline code for this exec type.
885 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
886 destp = (caddr_t)arginfo - SPARE_USRSPACE - linux_szplatform -
887 roundup((ARG_MAX - imgp->args->stringspace),
891 * Install LINUX_PLATFORM
893 copyout(linux_platform, ((caddr_t)arginfo - linux_szplatform),
897 * If we have a valid auxargs ptr, prepare some room
902 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
903 * lower compatibility.
905 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
906 (LINUX_AT_COUNT * 2);
908 * The '+ 2' is for the null pointers at the end of each of
909 * the arg and env vector sets,and imgp->auxarg_size is room
910 * for argument of Runtime loader.
912 vectp = (u_int32_t *) (destp - (imgp->args->argc +
913 imgp->args->envc + 2 + imgp->auxarg_size) *
918 * The '+ 2' is for the null pointers at the end of each of
919 * the arg and env vector sets
921 vectp = (u_int32_t *)(destp - (imgp->args->argc +
922 imgp->args->envc + 2) * sizeof(u_int32_t));
925 * vectp also becomes our initial stack base
929 stringp = imgp->args->begin_argv;
930 argc = imgp->args->argc;
931 envc = imgp->args->envc;
933 * Copy out strings - arguments and environment.
935 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
938 * Fill in "ps_strings" struct for ps, w, etc.
940 suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
941 suword32(&arginfo->ps_nargvstr, argc);
944 * Fill in argument portion of vector table.
/* One vector slot per string; stringp is advanced past each terminator. */
946 for (; argc > 0; --argc) {
947 suword32(vectp++, (uint32_t)(intptr_t)destp);
948 while (*stringp++ != 0)
953 /* a null vector table pointer separates the argp's from the envp's */
954 suword32(vectp++, 0);
956 suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
957 suword32(&arginfo->ps_nenvstr, envc);
960 * Fill in environment portion of vector table.
962 for (; envc > 0; --envc) {
963 suword32(vectp++, (uint32_t)(intptr_t)destp);
964 while (*stringp++ != 0)
969 /* end of vector table is a null pointer */
972 return ((register_t *)stack_base);
/*
 * compat.linux32 sysctl node and three read-write unsigned long knobs
 * under it (maxdsiz, maxssiz, maxvmem), initialised from the LINUX32_MAX*
 * constants.  linux32_fixlimit() uses them as upper bounds when clamping
 * resource limits, and treats a value of 0 as no clamping.
 * linux32_maxssiz is also exported through sv_maxssiz.
 */
975 static SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
976 "32-bit Linux emulation");
978 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
979 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
980 &linux32_maxdsiz, 0, "");
981 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
982 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
983 &linux32_maxssiz, 0, "");
984 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
985 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
986 &linux32_maxvmem, 0, "");
989 linux32_fixlimit(struct rlimit *rl, int which)
994 if (linux32_maxdsiz != 0) {
995 if (rl->rlim_cur > linux32_maxdsiz)
996 rl->rlim_cur = linux32_maxdsiz;
997 if (rl->rlim_max > linux32_maxdsiz)
998 rl->rlim_max = linux32_maxdsiz;
1002 if (linux32_maxssiz != 0) {
1003 if (rl->rlim_cur > linux32_maxssiz)
1004 rl->rlim_cur = linux32_maxssiz;
1005 if (rl->rlim_max > linux32_maxssiz)
1006 rl->rlim_max = linux32_maxssiz;
1010 if (linux32_maxvmem != 0) {
1011 if (rl->rlim_cur > linux32_maxvmem)
1012 rl->rlim_cur = linux32_maxvmem;
1013 if (rl->rlim_max > linux32_maxvmem)
1014 rl->rlim_max = linux32_maxvmem;
/*
 * System entry vector for 32-bit Linux ELF binaries.  It ties together
 * the pieces defined in this file: the syscall table, the signal and
 * errno translation tables, the signal delivery, exec fixup, string
 * copyout, register setup and limit clamping hooks, and the shared page
 * holding the signal trampolines.  Both brand descriptors below point
 * at it.
 */
1020 struct sysentvec elf_linux_sysvec = {
1021 .sv_size = LINUX_SYS_MAXSYSCALL,
1022 .sv_table = linux_sysent,
1024 .sv_sigsize = LINUX_SIGTBLSZ,
1025 .sv_sigtbl = bsd_to_linux_signal,
1026 .sv_errsize = ELAST + 1,
1027 .sv_errtbl = bsd_to_linux_errno,
1028 .sv_transtrap = translate_traps,
1029 .sv_fixup = elf_linux_fixup,
1030 .sv_sendsig = linux_sendsig,
/* The size behind sv_szsigcode is filled in by linux_vdso_install(). */
1031 .sv_sigcode = &_binary_linux32_locore_o_start,
1032 .sv_szsigcode = &linux_szsigcode,
1033 .sv_prepsyscall = NULL,
1034 .sv_name = "Linux ELF32",
1035 .sv_coredump = elf32_coredump,
1036 .sv_imgact_try = exec_linux_imgact_try,
1037 .sv_minsigstksz = LINUX_MINSIGSTKSZ,
1038 .sv_pagesize = PAGE_SIZE,
1039 .sv_minuser = VM_MIN_ADDRESS,
1040 .sv_maxuser = LINUX32_MAXUSER,
1041 .sv_usrstack = LINUX32_USRSTACK,
1042 .sv_psstrings = LINUX32_PS_STRINGS,
1043 .sv_stackprot = VM_PROT_ALL,
1044 .sv_copyout_strings = linux_copyout_strings,
1045 .sv_setregs = exec_linux_setregs,
1046 .sv_fixlimit = linux32_fixlimit,
1047 .sv_maxssiz = &linux32_maxssiz,
1048 .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32 | SV_SHP,
1049 .sv_set_syscall_retval = cpu_set_syscall_retval,
1050 .sv_fetch_syscall_args = linux32_fetch_syscall_args,
1051 .sv_syscallnames = NULL,
1052 .sv_shared_page_base = LINUX32_SHAREDPAGE,
1053 .sv_shared_page_len = PAGE_SIZE,
1054 .sv_schedtail = linux_schedtail,
1055 .sv_thread_detach = linux_thread_detach,
/*
 * SYSINIT handler that sets up the shared page for elf_linux_sysvec.
 * It computes the size of the embedded linux32_locore object, panics if
 * that exceeds sv_shared_page_len, runs the vdso fixup, creates the
 * shared page object and its kernel mapping, relocates the vdso for
 * LINUX32_SHAREDPAGE, copies the object into the mapping and records the
 * page object in the sysentvec.
 */
1059 linux_vdso_install(void *param)
1062 linux_szsigcode = (&_binary_linux32_locore_o_end -
1063 &_binary_linux32_locore_o_start);
1065 if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len)
1066 panic("Linux invalid vdso size\n");
1068 __elfN(linux_vdso_fixup)(&elf_linux_sysvec);
1070 linux_shared_page_obj = __elfN(linux_shared_page_init)
1071 (&linux_shared_page_mapping);
1073 __elfN(linux_vdso_reloc)(&elf_linux_sysvec, LINUX32_SHAREDPAGE);
1075 bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping,
1077 elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj;
1079 SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY,
1080 (sysinit_cfunc_t)linux_vdso_install, NULL);
/*
 * SYSUNINIT counterpart of linux_vdso_install(): hands the shared page
 * object created there to linux_shared_page_fini().
 */
1083 linux_vdso_deinstall(void *param)
1086 __elfN(linux_shared_page_fini)(linux_shared_page_obj);
1088 SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST,
1089 (sysinit_cfunc_t)linux_vdso_deinstall, NULL);
/*
 * Vendor string of the ELF brand note matched by linux32_brandnote, and
 * the value that the first word of the note descriptor must hold for
 * linux32_trans_osrel() to accept it.
 */
1091 static char GNU_ABI_VENDOR[] = "GNU";
1092 static int GNULINUX_ABI_DESC = 0;
/*
 * trans_osrel callback of linux32_brandnote: derive the osrel value from
 * an ELF note.
 *
 * note   note header; the descriptor words are located after the header
 *        and the name, whose length is rounded up to sizeof(Elf32_Addr).
 * osrel  out; receives desc[1] * 1000000 + desc[2] * 1000 + desc[3].
 *
 * The first descriptor word is compared against GNULINUX_ABI_DESC before
 * the value is computed.
 */
1095 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1097 const Elf32_Word *desc;
1100 p = (uintptr_t)(note + 1);
1101 p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1103 desc = (const Elf32_Word *)p;
1104 if (desc[0] != GNULINUX_ABI_DESC)
1108 * For linux we encode osrel as follows (see linux_mib.c):
1109 * VVVMMMIII (version, major, minor), see linux_mib.c.
1111 *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
/*
 * Brand note description shared by both Linux brands: a note whose name
 * is GNU_ABI_VENDOR with a 16-byte descriptor, translated into an osrel
 * value by linux32_trans_osrel() (BN_TRANSLATE_OSREL).
 */
1116 static Elf_Brandnote linux32_brandnote = {
1117 .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
1118 .hdr.n_descsz = 16, /* XXX at least 16 */
1120 .vendor = GNU_ABI_VENDOR,
1121 .flags = BN_TRANSLATE_OSREL,
1122 .trans_osrel = linux32_trans_osrel
/*
 * ELF brand for Linux binaries whose interpreter is /lib/ld-linux.so.1.
 * It uses elf_linux_sysvec, the /compat/linux emulation path and the
 * shared brand note.
 */
1125 static Elf32_Brandinfo linux_brand = {
1126 .brand = ELFOSABI_LINUX,
1128 .compat_3_brand = "Linux",
1129 .emul_path = "/compat/linux",
1130 .interp_path = "/lib/ld-linux.so.1",
1131 .sysvec = &elf_linux_sysvec,
1132 .interp_newpath = NULL,
1133 .brand_note = &linux32_brandnote,
1134 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * ELF brand for Linux binaries whose interpreter is /lib/ld-linux.so.2.
 * Apart from the interpreter path it is identical to linux_brand.
 */
1137 static Elf32_Brandinfo linux_glibc2brand = {
1138 .brand = ELFOSABI_LINUX,
1140 .compat_3_brand = "Linux",
1141 .emul_path = "/compat/linux",
1142 .interp_path = "/lib/ld-linux.so.2",
1143 .sysvec = &elf_linux_sysvec,
1144 .interp_newpath = NULL,
1145 .brand_note = &linux32_brandnote,
1146 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * List of brands handled by this module; linux_elf_modevent() walks it
 * until it reaches a NULL entry.
 */
1149 Elf32_Brandinfo *linux_brandlist[] = {
/*
 * Module event handler for the 32-bit Linux ELF emulator.
 *
 * Registration path: insert every brand from linux_brandlist, register
 * the ioctl and device handlers collected in the linker sets, initialise
 * the futex list and mutex, hook the process exit, process exec and
 * thread destructor events, compute linux_szplatform, register the jail
 * OSD support and pick the clock tick rate exported to Linux (stathz if
 * it is non-zero, hz otherwise).
 *
 * Teardown path: checked against elf32_brand_inuse() for each brand
 * first; then the brands are removed and everything registered above is
 * undone.
 *
 * Event types that are not handled yield EOPNOTSUPP.
 */
1156 linux_elf_modevent(module_t mod, int type, void *data)
1158 Elf32_Brandinfo **brandinfo;
1160 struct linux_ioctl_handler **lihp;
1161 struct linux_device_handler **ldhp;
1167 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1169 if (elf32_insert_brand_entry(*brandinfo) < 0)
1172 SET_FOREACH(lihp, linux_ioctl_handler_set)
1173 linux_ioctl_register_handler(*lihp);
1174 SET_FOREACH(ldhp, linux_device_handler_set)
1175 linux_device_register_handler(*ldhp);
1176 LIST_INIT(&futex_list);
1177 mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1178 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1179 linux_proc_exit, NULL, 1000);
1180 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1181 linux_proc_exec, NULL, 1000);
1182 linux_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
1183 linux_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
/* Size of the platform string including its terminator, rounded up. */
1184 linux_szplatform = roundup(strlen(linux_platform) + 1,
1186 linux_osd_jail_register();
1187 stclohz = (stathz ? stathz : hz);
1189 printf("Linux ELF exec handler installed\n");
1191 printf("cannot insert Linux ELF brand handler\n");
1194 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1196 if (elf32_brand_inuse(*brandinfo))
1199 for (brandinfo = &linux_brandlist[0];
1200 *brandinfo != NULL; ++brandinfo)
1201 if (elf32_remove_brand_entry(*brandinfo) < 0)
1205 SET_FOREACH(lihp, linux_ioctl_handler_set)
1206 linux_ioctl_unregister_handler(*lihp);
1207 SET_FOREACH(ldhp, linux_device_handler_set)
1208 linux_device_unregister_handler(*ldhp);
1209 mtx_destroy(&futex_mtx);
1210 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1211 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1212 EVENTHANDLER_DEREGISTER(thread_dtor, linux_thread_dtor_tag);
1213 linux_osd_jail_deregister();
1215 printf("Linux ELF exec handler removed\n");
1217 printf("Could not deinstall ELF interpreter entry\n");
1220 return (EOPNOTSUPP);
/*
 * Module descriptor and its registration as module linuxelf at
 * SI_SUB_EXEC / SI_ORDER_ANY.
 */
1225 static moduledata_t linux_elf_mod = {
1231 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);