2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
41 #define __ELF_WORD_SIZE 32
43 #include <sys/param.h>
44 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_emul.h>
80 #include <compat/linux/linux_futex.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
86 MODULE_VERSION(linux, 1);
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
/*
 * Emit one auxiliary-vector entry: store id and then val with suword32(),
 * post-incrementing pos after each store.  pos is expanded twice, so the
 * argument must be a plain lvalue without side effects of its own.
 */
90 #define AUXARGS_ENTRY_32(pos, id, val) \
92 suword32(pos++, id); \
93 suword32(pos++, val); \
/*
 * SHELLMAGIC is the value exec_linux_imgact_try() compares against the
 * first short of the image header.  Two spellings are provided, selected
 * by byte order: 0x2123 on little-endian hosts and 0x2321 otherwise.
 */
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC 0x2123 /* #! */
99 #define SHELLMAGIC 0x2321
103 * Allow the sendsig functions to use the ldebug() facility
104 * even though they are not syscalls themselves. Map them
105 * to syscall 0. This is slightly less bogus than using
/* Both sendsig paths are mapped to syscall number 0 for ldebug(). */
108 #define LINUX_SYS_linux_rt_sendsig 0
109 #define LINUX_SYS_linux_sendsig 0
/*
 * Platform string copied onto the new process stack by
 * linux_copyout_strings().  linux_szplatform is its rounded-up length,
 * computed in linux_elf_modevent().
 */
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
/* Signal trampoline code and its size; both are wired into elf_linux_sysvec. */
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
/* System call table referenced by elf_linux_sysvec.sv_table. */
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
/* Linker sets walked in linux_elf_modevent() to register and unregister handlers. */
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
/* Forward declarations of functions defined later in this file. */
121 static int elf_linux_fixup(register_t **stack_base,
122 struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
125 static void exec_linux_setregs(struct thread *td, u_long entry,
126 u_long stack, u_long ps_strings);
127 static void linux32_fixlimit(struct rlimit *rl, int which);
128 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
/* Tags returned by EVENTHANDLER_REGISTER(); kept so the handlers can be deregistered. */
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_exec_tag;
134 * Linux syscalls return negative errnos; we use positive ones and map them
136 * FreeBSD: src/sys/sys/errno.h
137 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
138 * linux-2.6.17.8/include/asm-generic/errno.h
/*
 * Errno translation table, installed as .sv_errtbl in elf_linux_sysvec.
 * The entry at index <FreeBSD errno> is the negated Linux errno.  Most
 * low values map to themselves; visible exceptions include index 11,
 * which yields -35, and index 35, which yields -11.
 */
140 static int bsd_to_linux_errno[ELAST + 1] = {
141 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
142 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
143 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
144 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
145 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
146 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
147 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
148 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
149 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
/*
 * FreeBSD-to-Linux signal number table, installed as .sv_sigtbl and
 * indexed with _SIG_IDX(sig) by the sendsig routines in this file.
 * Two slots hold 0 instead of a LINUX_SIG* constant.
 * NOTE(review): presumably 0 marks FreeBSD signals without a Linux
 * counterpart -- confirm against the consumers of this table.
 */
153 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
154 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
155 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
156 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
157 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
158 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
159 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
160 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
161 0, LINUX_SIGUSR1, LINUX_SIGUSR2
/*
 * Linux-to-FreeBSD signal number table, the counterpart of
 * bsd_to_linux_signal.  The mapping is not one-to-one: SIGBUS and SIGURG
 * each appear in two slots.
 */
164 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
165 SIGHUP, SIGINT, SIGQUIT, SIGILL,
166 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
167 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
168 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
169 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
170 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
171 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
172 SIGIO, SIGURG, SIGSYS
/*
 * Translation from the code passed to the sendsig routines (array index;
 * the FreeBSD T_* names are given in the entry comments) to the value
 * stored in the sc_trapno field of the Linux signal context.
 * LINUX_T_UNKNOWN fills the slots that have no mapping.
 */
175 #define LINUX_T_UNKNOWN 255
176 static int _bsd_to_linux_trapcode[] = {
177 LINUX_T_UNKNOWN, /* 0 */
178 6, /* 1 T_PRIVINFLT */
179 LINUX_T_UNKNOWN, /* 2 */
181 LINUX_T_UNKNOWN, /* 4 */
182 LINUX_T_UNKNOWN, /* 5 */
183 16, /* 6 T_ARITHTRAP */
184 254, /* 7 T_ASTFLT */
185 LINUX_T_UNKNOWN, /* 8 */
186 13, /* 9 T_PROTFLT */
187 1, /* 10 T_TRCTRAP */
188 LINUX_T_UNKNOWN, /* 11 */
189 14, /* 12 T_PAGEFLT */
190 LINUX_T_UNKNOWN, /* 13 */
191 17, /* 14 T_ALIGNFLT */
192 LINUX_T_UNKNOWN, /* 15 */
193 LINUX_T_UNKNOWN, /* 16 */
194 LINUX_T_UNKNOWN, /* 17 */
200 8, /* 23 T_DOUBLEFLT */
201 9, /* 24 T_FPOPFLT */
202 10, /* 25 T_TSSFLT */
203 11, /* 26 T_SEGNPFLT */
204 12, /* 27 T_STKFLT */
206 19, /* 29 T_XMMFLT */
207 15 /* 30 T_RESERVED */
/*
 * Bounds-checked lookup: a code that indexes inside the table is
 * translated through it.
 */
209 #define bsd_to_linux_trapcode(code) \
210 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
211 _bsd_to_linux_trapcode[(code)]: \
/*
 * Layout of the ps_strings record that linux_copyout_strings() fills in
 * at LINUX32_PS_STRINGS with suword32().  The two pointer fields are
 * declared as 32-bit integers rather than pointers.
 */
214 struct linux32_ps_strings {
215 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
216 u_int ps_nargvstr; /* the number of argument strings */
217 u_int32_t ps_envstr; /* first of 0 or more environment strings */
218 u_int ps_nenvstr; /* the number of environment strings */
222 * If FreeBSD & Linux have a difference of opinion about what a trap
223 * means, deal with it here.
/*
 * Trap translation hook, installed as .sv_transtrap in elf_linux_sysvec.
 * The first test separates SIGBUS from every other signal.
 */
228 translate_traps(int signal, int trap_code)
230 if (signal != SIGBUS)
/*
 * Stack fixup hook, installed as .sv_fixup.  Writes the ELF auxiliary
 * vector for the new image into user memory above the argv and envp
 * pointer arrays, releases imgp->auxargs, stores argc at the stack base
 * and hands the base back through *stack_base.
 */
244 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
248 Elf32_Addr *pos, *uplatform;
249 struct linux32_ps_strings *arginfo;
/* uplatform is computed relative to the fixed ps_strings address. */
251 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
252 uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
255 KASSERT(curthread->td_proc == imgp->proc,
256 ("unsafe elf_linux_fixup(), should be curproc"));
/* Skip the argv and envp pointers plus two further slots. */
257 base = (Elf32_Addr *)*stack_base;
258 args = (Elf32_Auxargs *)imgp->auxargs;
259 pos = base + (imgp->args->argc + imgp->args->envc + 2);
261 AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
264 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
265 * as it has appeared in the 2.4.0-rc7 first time.
266 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
267 * glibc falls back to the hard-coded CLK_TCK value when aux entry
269 * Also see linux_times() implementation.
271 if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
272 AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
273 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
274 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
275 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
276 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
277 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
278 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
279 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
280 AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
/*
 * NOTE(review): AT_EUID and AT_EGID are filled from the saved ids
 * (cr_svuid, cr_svgid) -- confirm that this is the intended source.
 */
281 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
282 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
283 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
284 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
285 AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
/* AT_EXECFD is emitted only when a descriptor was recorded. */
286 if (args->execfd != -1)
287 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
288 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
/* The auxargs have been consumed; free them and clear the pointer. */
290 free(imgp->auxargs, M_TEMP);
291 imgp->auxargs = NULL;
294 suword32(base, (uint32_t)imgp->args->argc);
295 *stack_base = (register_t *)base;
/*
 * Added to the start of the copied-out signal code to obtain the entry
 * point used for rt signal delivery (see the tf_rip assignment below).
 */
299 extern unsigned long linux_sznonrtsigcode;
/*
 * Deliver a signal using the rt frame format; linux_sendsig() forwards
 * here for handlers installed with SA_SIGINFO.  Builds a struct
 * l_rt_sigframe in a local, copies it out to the selected user stack
 * location and rewrites the trapframe so the thread resumes in the
 * signal trampoline running on the _ucode32sel code segment.
 * Expects the proc lock and ps_mtx to be held on entry.
 */
302 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
304 struct thread *td = curthread;
305 struct proc *p = td->td_proc;
307 struct trapframe *regs;
308 struct l_rt_sigframe *fp, frame;
313 sig = ksi->ksi_signo;
314 code = ksi->ksi_code;
315 PROC_LOCK_ASSERT(p, MA_OWNED);
317 mtx_assert(&psp->ps_mtx, MA_OWNED);
319 oonstack = sigonstack(regs->tf_rsp);
322 if (ldebug(rt_sendsig))
323 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
324 catcher, sig, (void*)mask, code);
327 * Allocate space for the signal handler context.
/*
 * The frame goes at the top of the alternate stack when one is enabled,
 * the thread is not already on it and the signal is in ps_sigonstack;
 * the other assignment places it one frame below the current tf_rsp.
 */
329 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
330 SIGISMEMBER(psp->ps_sigonstack, sig)) {
331 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
332 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
334 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
/* ps_mtx is dropped here and taken again at the end of the function. */
335 mtx_unlock(&psp->ps_mtx);
338 * Build the argument list for the signal handler.
/* Translate the signal number through the ABI signal table, if any. */
340 if (p->p_sysent->sv_sigtbl)
341 if (sig <= p->p_sysent->sv_sigsize)
342 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
344 bzero(&frame, sizeof(frame));
346 frame.sf_handler = PTROUT(catcher);
/* The siginfo and ucontext pointers refer to the user-space copy. */
348 frame.sf_siginfo = PTROUT(&fp->sf_si);
349 frame.sf_ucontext = PTROUT(&fp->sf_sc);
351 /* Fill in POSIX parts */
352 ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
355 * Build the signal context to be used by sigreturn.
357 frame.sf_sc.uc_flags = 0; /* XXX ??? */
358 frame.sf_sc.uc_link = 0; /* XXX ??? */
360 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
361 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
362 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
363 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
366 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
/* Save the interrupted register state from the trapframe. */
368 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
369 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
370 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
371 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
372 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
373 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
374 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
375 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
376 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
377 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
378 frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
379 frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
380 frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
381 frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
382 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
383 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
384 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
385 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
386 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
387 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
390 if (ldebug(rt_sendsig))
391 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
392 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
393 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
396 if (copyout(&frame, fp, sizeof(frame)) != 0) {
398 * Process has trashed its stack; give it an illegal
399 * instruction to halt it in its tracks.
402 if (ldebug(rt_sendsig))
403 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
411 * Build context to run handler in.
/*
 * Resume at the rt entry point of the signal code that
 * linux_copyout_strings() placed just below LINUX32_PS_STRINGS, with
 * the trace and direction flags cleared.
 */
413 regs->tf_rsp = PTROUT(fp);
414 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
415 linux_sznonrtsigcode;
416 regs->tf_rflags &= ~(PSL_T | PSL_D);
417 regs->tf_cs = _ucode32sel;
418 regs->tf_ss = _udatasel;
419 regs->tf_ds = _udatasel;
420 regs->tf_es = _udatasel;
421 regs->tf_fs = _ufssel;
422 regs->tf_gs = _ugssel;
423 regs->tf_flags = TF_HASSEGS;
424 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
426 mtx_lock(&psp->ps_mtx);
431 * Send an interrupt to process.
433 * Stack is set up to allow sigcode stored
434 * in u. to call routine, followed by kcall
435 * to sigreturn routine below. After sigreturn
436 * resets the signal mask, the stack, and the
437 * frame pointer, it returns to the user
/*
 * Signal delivery hook, installed as .sv_sendsig.  Signals whose handler
 * is recorded in ps_siginfo are passed to linux_rt_sendsig(); the code
 * below builds the non-rt struct l_sigframe, copies it out to the user
 * stack and points the trapframe at the start of the signal code.
 * Expects the proc lock and ps_mtx to be held on entry.
 */
441 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
443 struct thread *td = curthread;
444 struct proc *p = td->td_proc;
446 struct trapframe *regs;
447 struct l_sigframe *fp, frame;
452 sig = ksi->ksi_signo;
453 code = ksi->ksi_code;
454 PROC_LOCK_ASSERT(p, MA_OWNED);
456 mtx_assert(&psp->ps_mtx, MA_OWNED);
457 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
458 /* Signal handler installed with SA_SIGINFO. */
459 linux_rt_sendsig(catcher, ksi, mask);
464 oonstack = sigonstack(regs->tf_rsp);
468 printf(ARGS(sendsig, "%p, %d, %p, %u"),
469 catcher, sig, (void*)mask, code);
473 * Allocate space for the signal handler context.
/*
 * Same placement rule as the rt variant: top of the alternate stack
 * when it is enabled, not in use and requested for this signal; the
 * other assignment uses the slot directly below the current tf_rsp.
 */
475 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
476 SIGISMEMBER(psp->ps_sigonstack, sig)) {
477 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
478 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
480 fp = (struct l_sigframe *)regs->tf_rsp - 1;
/* ps_mtx is dropped here and taken again at the end of the function. */
481 mtx_unlock(&psp->ps_mtx);
485 * Build the argument list for the signal handler.
/* Translate the signal number through the ABI signal table, if any. */
487 if (p->p_sysent->sv_sigtbl)
488 if (sig <= p->p_sysent->sv_sigsize)
489 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
491 bzero(&frame, sizeof(frame));
493 frame.sf_handler = PTROUT(catcher);
496 bsd_to_linux_sigset(mask, &lmask);
499 * Build the signal context to be used by sigreturn.
/* Word 0 of the mask lives in sc_mask; the rest go to sf_extramask. */
501 frame.sf_sc.sc_mask = lmask.__bits[0];
502 frame.sf_sc.sc_gs = regs->tf_gs;
503 frame.sf_sc.sc_fs = regs->tf_fs;
504 frame.sf_sc.sc_es = regs->tf_es;
505 frame.sf_sc.sc_ds = regs->tf_ds;
506 frame.sf_sc.sc_edi = regs->tf_rdi;
507 frame.sf_sc.sc_esi = regs->tf_rsi;
508 frame.sf_sc.sc_ebp = regs->tf_rbp;
509 frame.sf_sc.sc_ebx = regs->tf_rbx;
510 frame.sf_sc.sc_edx = regs->tf_rdx;
511 frame.sf_sc.sc_ecx = regs->tf_rcx;
512 frame.sf_sc.sc_eax = regs->tf_rax;
513 frame.sf_sc.sc_eip = regs->tf_rip;
514 frame.sf_sc.sc_cs = regs->tf_cs;
515 frame.sf_sc.sc_eflags = regs->tf_rflags;
516 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
517 frame.sf_sc.sc_ss = regs->tf_ss;
518 frame.sf_sc.sc_err = regs->tf_err;
519 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
520 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
522 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
523 frame.sf_extramask[i] = lmask.__bits[i+1];
525 if (copyout(&frame, fp, sizeof(frame)) != 0) {
527 * Process has trashed its stack; give it an illegal
528 * instruction to halt it in its tracks.
535 * Build context to run handler in.
/*
 * Resume at the start of the signal code located just below
 * LINUX32_PS_STRINGS, with the trace and direction flags cleared.
 */
537 regs->tf_rsp = PTROUT(fp);
538 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
539 regs->tf_rflags &= ~(PSL_T | PSL_D);
540 regs->tf_cs = _ucode32sel;
541 regs->tf_ss = _udatasel;
542 regs->tf_ds = _udatasel;
543 regs->tf_es = _udatasel;
544 regs->tf_fs = _ufssel;
545 regs->tf_gs = _ugssel;
546 regs->tf_flags = TF_HASSEGS;
547 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
549 mtx_lock(&psp->ps_mtx);
553 * System call to cleanup state after a signal
554 * has been taken. Reset signal mask and
555 * stack state from context left by sendsig (above).
556 * Return to previous pc and psl as specified by
557 * context left by sendsig. Check carefully to
558 * make sure that the user has not modified the
559 * psl to gain improper privileges or to cause
/*
 * Return from a handler that was entered through linux_sendsig().
 *
 * Steps visible in the body:
 *  - copy the struct l_sigframe in from args->sfp;
 *  - compare the saved eflags with the live tf_rflags, ignoring PSL_RF,
 *    so that only bits inside PSL_USERCHANGE may differ;
 *  - require the saved %cs to carry user privilege (SEL_UPL); otherwise
 *    a SIGBUS with code BUS_OBJERR and trap number T_PROTFLT is posted
 *    through trapsignal();
 *  - rebuild the signal mask from sc_mask plus sf_extramask and install
 *    it with kern_sigprocmask(SIG_SETMASK);
 *  - reload the general registers, segment registers, flags and stack
 *    pointer from the frame, request a full iret via PCB_FULL_IRET and
 *    return EJUSTRETURN.
 */
563 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
565 struct l_sigframe frame;
566 struct trapframe *regs;
575 if (ldebug(sigreturn))
576 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
579 * The trampoline code hands us the sigframe.
580 * It is unsafe to keep track of it ourselves, in the event that a
581 * program jumps out of a signal handler.
583 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
587 * Check for security violations.
589 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
590 eflags = frame.sf_sc.sc_eflags;
592 * XXX do allow users to change the privileged flag PSL_RF. The
593 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
594 * sometimes set it there too. tf_eflags is kept in the signal
595 * context during signal handling and there is no other place
596 * to remember it, so the PSL_RF bit may be corrupted by the
597 * signal handler without us knowing. Corruption of the PSL_RF
598 * bit at worst causes one more or one less debugger trap, so
599 * allowing it is fairly harmless.
601 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
605 * Don't allow users to load a valid privileged %cs. Let the
606 * hardware check for invalid selectors, excess privilege in
607 * other selectors, invalid %eip's and invalid %esp's.
609 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
610 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
611 ksiginfo_init_trap(&ksi);
612 ksi.ksi_signo = SIGBUS;
613 ksi.ksi_code = BUS_OBJERR;
614 ksi.ksi_trapno = T_PROTFLT;
615 ksi.ksi_addr = (void *)regs->tf_rip;
616 trapsignal(td, &ksi);
620 lmask.__bits[0] = frame.sf_sc.sc_mask;
621 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
622 lmask.__bits[i+1] = frame.sf_extramask[i];
623 linux_to_bsd_sigset(&lmask, &bmask);
624 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
627 * Restore signal context.
629 regs->tf_rdi = frame.sf_sc.sc_edi;
630 regs->tf_rsi = frame.sf_sc.sc_esi;
631 regs->tf_rbp = frame.sf_sc.sc_ebp;
632 regs->tf_rbx = frame.sf_sc.sc_ebx;
633 regs->tf_rdx = frame.sf_sc.sc_edx;
634 regs->tf_rcx = frame.sf_sc.sc_ecx;
635 regs->tf_rax = frame.sf_sc.sc_eax;
636 regs->tf_rip = frame.sf_sc.sc_eip;
637 regs->tf_cs = frame.sf_sc.sc_cs;
638 regs->tf_ds = frame.sf_sc.sc_ds;
639 regs->tf_es = frame.sf_sc.sc_es;
640 regs->tf_fs = frame.sf_sc.sc_fs;
641 regs->tf_gs = frame.sf_sc.sc_gs;
642 regs->tf_rflags = eflags;
643 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
644 regs->tf_ss = frame.sf_sc.sc_ss;
645 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
647 return (EJUSTRETURN);
651 * System call to cleanup state after a signal
652 * has been taken. Reset signal mask and
653 * stack state from context left by rt_sendsig (above).
654 * Return to previous pc and psl as specified by
655 * context left by sendsig. Check carefully to
656 * make sure that the user has not modified the
657 * psl to gain improper privileges or to cause
/*
 * Return from a handler that was entered through linux_rt_sendsig().
 *
 * Steps visible in the body:
 *  - copy the struct l_ucontext in from args->ucp and work on its
 *    uc_mcontext member;
 *  - apply the same eflags and %cs checks as linux_sigreturn(), posting
 *    SIGBUS through trapsignal() when the saved %cs is not at SEL_UPL;
 *  - convert uc_sigmask and install it with
 *    kern_sigprocmask(SIG_SETMASK);
 *  - reload the segment registers, general registers, flags and stack
 *    pointer from the context and request a full iret;
 *  - convert the Linux stack description in lss and pass it to
 *    kern_sigaltstack(), deliberately discarding the result;
 *  - return EJUSTRETURN.
 */
661 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
663 struct l_ucontext uc;
664 struct l_sigcontext *context;
668 struct trapframe *regs;
675 if (ldebug(rt_sigreturn))
676 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
679 * The trampoline code hands us the ucontext.
680 * It is unsafe to keep track of it ourselves, in the event that a
681 * program jumps out of a signal handler.
683 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
686 context = &uc.uc_mcontext;
689 * Check for security violations.
691 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
692 eflags = context->sc_eflags;
694 * XXX do allow users to change the privileged flag PSL_RF. The
695 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
696 * sometimes set it there too. tf_eflags is kept in the signal
697 * context during signal handling and there is no other place
698 * to remember it, so the PSL_RF bit may be corrupted by the
699 * signal handler without us knowing. Corruption of the PSL_RF
700 * bit at worst causes one more or one less debugger trap, so
701 * allowing it is fairly harmless.
703 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
707 * Don't allow users to load a valid privileged %cs. Let the
708 * hardware check for invalid selectors, excess privilege in
709 * other selectors, invalid %eip's and invalid %esp's.
711 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
712 if (!CS_SECURE(context->sc_cs)) {
713 ksiginfo_init_trap(&ksi);
714 ksi.ksi_signo = SIGBUS;
715 ksi.ksi_code = BUS_OBJERR;
716 ksi.ksi_trapno = T_PROTFLT;
717 ksi.ksi_addr = (void *)regs->tf_rip;
718 trapsignal(td, &ksi);
722 linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
723 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
726 * Restore signal context
728 regs->tf_gs = context->sc_gs;
729 regs->tf_fs = context->sc_fs;
730 regs->tf_es = context->sc_es;
731 regs->tf_ds = context->sc_ds;
732 regs->tf_rdi = context->sc_edi;
733 regs->tf_rsi = context->sc_esi;
734 regs->tf_rbp = context->sc_ebp;
735 regs->tf_rbx = context->sc_ebx;
736 regs->tf_rdx = context->sc_edx;
737 regs->tf_rcx = context->sc_ecx;
738 regs->tf_rax = context->sc_eax;
739 regs->tf_rip = context->sc_eip;
740 regs->tf_cs = context->sc_cs;
741 regs->tf_rflags = eflags;
742 regs->tf_rsp = context->sc_esp_at_signal;
743 regs->tf_ss = context->sc_ss;
744 set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
747 * call sigaltstack & ignore results..
750 ss.ss_sp = PTRIN(lss->ss_sp);
751 ss.ss_size = lss->ss_size;
752 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
755 if (ldebug(rt_sigreturn))
756 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
757 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
759 (void)kern_sigaltstack(td, &ss, NULL);
761 return (EJUSTRETURN);
/*
 * System call argument fetch hook, installed as .sv_fetch_syscall_args.
 * The six arguments are read from the trapframe in the order rbx, rcx,
 * rdx, rsi, rdi, rbp and the syscall number from rax.
 */
765 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
768 struct trapframe *frame;
771 frame = td->td_frame;
773 sa->args[0] = frame->tf_rbx;
774 sa->args[1] = frame->tf_rcx;
775 sa->args[2] = frame->tf_rdx;
776 sa->args[3] = frame->tf_rsi;
777 sa->args[4] = frame->tf_rdi;
778 sa->args[5] = frame->tf_rbp; /* Unconfirmed */
779 sa->code = frame->tf_rax;
/* An out-of-range syscall number is dispatched to table entry 0. */
781 if (sa->code >= p->p_sysent->sv_size)
782 sa->callp = &p->p_sysent->sv_table[0];
784 sa->callp = &p->p_sysent->sv_table[sa->code];
785 sa->narg = sa->callp->sy_narg;
/* Preset the return values; the second one starts out as the caller rdx. */
787 td->td_retval[0] = 0;
788 td->td_retval[1] = frame->tf_rdx;
794 * If a linux binary is exec'ing something, try this image activator
795 * first. We override standard shell script execution in order to
796 * be able to modify the interpreter path. We only do this if a linux
797 * binary is doing the exec, so we do not create an EXEC module for it.
/*
 * Image activator hook, installed as .sv_imgact_try.  When the image
 * header starts with SHELLMAGIC, the regular shell activator is run and,
 * on success, the interpreter name is passed through
 * linux_emul_convpath(); the resulting path replaces
 * imgp->interpreter_name when its length, including the terminating
 * NUL, is at most MAXSHELLCMDLEN.
 */
799 static int exec_linux_imgact_try(struct image_params *iparams);
802 exec_linux_imgact_try(struct image_params *imgp)
804 const char *head = (const char *)imgp->image_header;
809 * The interpreter for shell scripts run from a linux binary needs
810 * to be located in /compat/linux if possible in order to recursively
811 * maintain linux path emulation.
813 if (((const short *)head)[0] == SHELLMAGIC) {
815 * Run our normal shell image activator. If it succeeds attempt
816 * to use the alternate path for the interpreter. If an
817 * alternate * path is found, use our stringspace to store it.
819 if ((error = exec_shell_imgact(imgp)) == 0) {
820 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
821 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
/* len counts the terminating NUL. */
824 len = strlen(rpath) + 1;
826 if (len <= MAXSHELLCMDLEN) {
827 memcpy(imgp->interpreter_name, rpath,
838 * Clear registers on exec
839 * XXX copied from ia32_signal.c.
/*
 * Register setup hook for exec, installed as .sv_setregs.  Clears the
 * FS and kernel-GS base MSRs, sets the initial FPU control word, zeroes
 * the trapframe and then fills in the entry point, stack pointer, flags
 * and user segment selectors.  The ps_strings address is handed to the
 * new image in rbx.
 */
842 exec_linux_setregs(td, entry, stack, ps_strings)
848 struct trapframe *regs = td->td_frame;
849 struct pcb *pcb = td->td_pcb;
852 if (td->td_proc->p_md.md_ldt != NULL)
855 mtx_unlock(&dt_lock);
858 wrmsr(MSR_FSBASE, 0);
859 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
863 pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
/*
 * NOTE(review): tf_rflags is read below after the whole trapframe has
 * been zeroed, so the (regs->tf_rflags & PSL_T) term is always 0 here
 * -- confirm whether the trace flag was meant to be preserved.
 */
865 bzero((char *)regs, sizeof(struct trapframe));
866 regs->tf_rip = entry;
867 regs->tf_rsp = stack;
868 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
869 regs->tf_gs = _ugssel;
870 regs->tf_fs = _ufssel;
871 regs->tf_es = _udatasel;
872 regs->tf_ds = _udatasel;
873 regs->tf_ss = _udatasel;
874 regs->tf_flags = TF_HASSEGS;
875 regs->tf_cs = _ucode32sel;
876 regs->tf_rbx = ps_strings;
880 /* Do full restore on return so that we can change to a different %cs */
881 set_pcb_flags(pcb, PCB_32BIT | PCB_FULL_IRET);
882 clear_pcb_flags(pcb, PCB_GS32BIT);
883 td->td_retval[1] = 0;
887 * XXX copied from ia32_sysvec.c.
/*
 * String copy-out hook for exec, installed as .sv_copyout_strings.
 * Lays out the top of the new user stack below LINUX32_PS_STRINGS: the
 * signal code, the platform string, the argument and environment
 * strings, and below those the argv and envp pointer vectors.  Also
 * fills in the ps_strings record and returns the address of the vector
 * table as the initial stack base.
 *
 * NOTE(review): the results of the three copyout() calls are not
 * checked here -- confirm that a failed copy is acceptable or caught
 * elsewhere.
 */
890 linux_copyout_strings(struct image_params *imgp)
894 char *stringp, *destp;
895 u_int32_t *stack_base;
896 struct linux32_ps_strings *arginfo;
899 * Calculate string base and vector table pointers.
900 * Also deal with signal trampoline code for this exec type.
902 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
903 destp = (caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
904 linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
/* The signal code is placed directly below ps_strings. */
910 copyout(imgp->proc->p_sysent->sv_sigcode,
911 ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
914 * Install LINUX_PLATFORM
/* The platform string is placed directly below the signal code. */
916 copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
917 linux_szplatform), linux_szplatform);
920 * If we have a valid auxargs ptr, prepare some room
925 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
926 * lower compatibility.
/* A zero auxarg_size is replaced by the default of LINUX_AT_COUNT * 2. */
928 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
929 (LINUX_AT_COUNT * 2);
931 * The '+ 2' is for the null pointers at the end of each of
932 * the arg and env vector sets,and imgp->auxarg_size is room
933 * for argument of Runtime loader.
935 vectp = (u_int32_t *) (destp - (imgp->args->argc +
936 imgp->args->envc + 2 + imgp->auxarg_size) *
941 * The '+ 2' is for the null pointers at the end of each of
942 * the arg and env vector sets
944 vectp = (u_int32_t *)(destp - (imgp->args->argc +
945 imgp->args->envc + 2) * sizeof(u_int32_t));
948 * vectp also becomes our initial stack base
952 stringp = imgp->args->begin_argv;
953 argc = imgp->args->argc;
954 envc = imgp->args->envc;
956 * Copy out strings - arguments and environment.
/* All strings are copied in one block; the loops below only store pointers. */
958 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
961 * Fill in "ps_strings" struct for ps, w, etc.
963 suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
964 suword32(&arginfo->ps_nargvstr, argc);
967 * Fill in argument portion of vector table.
/* One pointer per argument; stringp is advanced past each NUL terminator. */
969 for (; argc > 0; --argc) {
970 suword32(vectp++, (uint32_t)(intptr_t)destp);
971 while (*stringp++ != 0)
976 /* a null vector table pointer separates the argp's from the envp's */
977 suword32(vectp++, 0);
979 suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
980 suword32(&arginfo->ps_nenvstr, envc);
983 * Fill in environment portion of vector table.
985 for (; envc > 0; --envc) {
986 suword32(vectp++, (uint32_t)(intptr_t)destp);
987 while (*stringp++ != 0)
992 /* end of vector table is a null pointer */
995 return ((register_t *)stack_base);
/*
 * compat.linux32 sysctl node with three read-write tunables: maxdsiz,
 * maxssiz and maxvmem.  They are the ceilings applied by
 * linux32_fixlimit(); a value of 0 makes that function skip the
 * corresponding clamp.  The description strings are empty.
 */
998 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
999 "32-bit Linux emulation");
1001 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
1002 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
1003 &linux32_maxdsiz, 0, "");
1004 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
1005 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
1006 &linux32_maxssiz, 0, "");
1007 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
1008 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1009 &linux32_maxvmem, 0, "");
/*
 * Resource limit hook, installed as .sv_fixlimit.  Lowers rlim_cur and
 * rlim_max in *rl to the matching compat.linux32 tunable when the
 * tunable is non-zero and the limit exceeds it.
 * NOTE(review): presumably the which argument selects which of the
 * three groups below applies -- confirm against the full function.
 */
1012 linux32_fixlimit(struct rlimit *rl, int which)
/* Clamp against maxdsiz. */
1017 if (linux32_maxdsiz != 0) {
1018 if (rl->rlim_cur > linux32_maxdsiz)
1019 rl->rlim_cur = linux32_maxdsiz;
1020 if (rl->rlim_max > linux32_maxdsiz)
1021 rl->rlim_max = linux32_maxdsiz;
/* Clamp against maxssiz. */
1025 if (linux32_maxssiz != 0) {
1026 if (rl->rlim_cur > linux32_maxssiz)
1027 rl->rlim_cur = linux32_maxssiz;
1028 if (rl->rlim_max > linux32_maxssiz)
1029 rl->rlim_max = linux32_maxssiz;
/* Clamp against maxvmem. */
1033 if (linux32_maxvmem != 0) {
1034 if (rl->rlim_cur > linux32_maxvmem)
1035 rl->rlim_cur = linux32_maxvmem;
1036 if (rl->rlim_max > linux32_maxvmem)
1037 rl->rlim_max = linux32_maxvmem;
/*
 * System entry vector for 32-bit Linux ELF binaries.  It ties together
 * the syscall table, the signal and errno translation tables and the
 * hooks defined in this file, and is referenced by both brand
 * descriptions below.
 */
1043 struct sysentvec elf_linux_sysvec = {
1044 .sv_size = LINUX_SYS_MAXSYSCALL,
1045 .sv_table = linux_sysent,
1047 .sv_sigsize = LINUX_SIGTBLSZ,
1048 .sv_sigtbl = bsd_to_linux_signal,
1049 .sv_errsize = ELAST + 1,
1050 .sv_errtbl = bsd_to_linux_errno,
1051 .sv_transtrap = translate_traps,
1052 .sv_fixup = elf_linux_fixup,
1053 .sv_sendsig = linux_sendsig,
1054 .sv_sigcode = linux_sigcode,
1055 .sv_szsigcode = &linux_szsigcode,
1056 .sv_prepsyscall = NULL,
1057 .sv_name = "Linux ELF32",
1058 .sv_coredump = elf32_coredump,
1059 .sv_imgact_try = exec_linux_imgact_try,
1060 .sv_minsigstksz = LINUX_MINSIGSTKSZ,
1061 .sv_pagesize = PAGE_SIZE,
1062 .sv_minuser = VM_MIN_ADDRESS,
/* The user address space ends at the Linux stack top. */
1063 .sv_maxuser = LINUX32_USRSTACK,
1064 .sv_usrstack = LINUX32_USRSTACK,
1065 .sv_psstrings = LINUX32_PS_STRINGS,
1066 .sv_stackprot = VM_PROT_ALL,
1067 .sv_copyout_strings = linux_copyout_strings,
1068 .sv_setregs = exec_linux_setregs,
1069 .sv_fixlimit = linux32_fixlimit,
1070 .sv_maxssiz = &linux32_maxssiz,
1071 .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32,
1072 .sv_set_syscall_retval = cpu_set_syscall_retval,
1073 .sv_fetch_syscall_args = linux32_fetch_syscall_args,
1074 .sv_syscallnames = NULL,
1075 .sv_schedtail = linux_schedtail,
/* Vendor name and first descriptor word expected in the ABI note. */
1078 static char GNU_ABI_VENDOR[] = "GNU";
1079 static int GNULINUX_ABI_DESC = 0;
/*
 * Brand-note callback (see linux32_brandnote).  Locates the descriptor
 * words that follow the note header and the name, the latter rounded up
 * to sizeof(Elf32_Addr), checks desc[0] against GNULINUX_ABI_DESC and
 * packs desc[1], desc[2] and desc[3] into *osrel as
 * desc[1] * 1000000 + desc[2] * 1000 + desc[3].
 */
1082 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1084 const Elf32_Word *desc;
1087 p = (uintptr_t)(note + 1);
1088 p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1090 desc = (const Elf32_Word *)p;
1091 if (desc[0] != GNULINUX_ABI_DESC)
1095 * For linux we encode osrel as follows (see linux_mib.c):
1096 * VVVMMMIII (version, major, minor), see linux_mib.c.
1098 *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
/*
 * Description of the ELF note used for branding: vendor GNU_ABI_VENDOR,
 * a 16-byte descriptor, and linux32_trans_osrel() as the callback that
 * turns the descriptor into an osrel value (BN_TRANSLATE_OSREL).
 */
1103 static Elf_Brandnote linux32_brandnote = {
1104 .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
1105 .hdr.n_descsz = 16, /* XXX at least 16 */
1107 .vendor = GNU_ABI_VENDOR,
1108 .flags = BN_TRANSLATE_OSREL,
1109 .trans_osrel = linux32_trans_osrel
/*
 * Brand description for Linux binaries using the ld-linux.so.1
 * interpreter.  Apart from interp_path, the visible fields are the same
 * as in linux_glibc2brand.
 */
1112 static Elf32_Brandinfo linux_brand = {
1113 .brand = ELFOSABI_LINUX,
1115 .compat_3_brand = "Linux",
1116 .emul_path = "/compat/linux",
1117 .interp_path = "/lib/ld-linux.so.1",
1118 .sysvec = &elf_linux_sysvec,
1119 .interp_newpath = NULL,
1120 .brand_note = &linux32_brandnote,
1121 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/* Brand description for Linux binaries using the ld-linux.so.2 interpreter. */
1124 static Elf32_Brandinfo linux_glibc2brand = {
1125 .brand = ELFOSABI_LINUX,
1127 .compat_3_brand = "Linux",
1128 .emul_path = "/compat/linux",
1129 .interp_path = "/lib/ld-linux.so.2",
1130 .sysvec = &elf_linux_sysvec,
1131 .interp_newpath = NULL,
1132 .brand_note = &linux32_brandnote,
1133 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/* NULL-terminated list iterated by linux_elf_modevent(). */
1136 Elf32_Brandinfo *linux_brandlist[] = {
/*
 * Module event handler for the Linux ELF emulation (used by the
 * linux_elf_mod declaration below).  The body contains an installation
 * path and a removal path, each of which reports its outcome with
 * printf().
 */
1143 linux_elf_modevent(module_t mod, int type, void *data)
1145 Elf32_Brandinfo **brandinfo;
1147 struct linux_ioctl_handler **lihp;
1148 struct linux_device_handler **ldhp;
/*
 * Installation: insert every brand from linux_brandlist, then register
 * the ioctl and device handlers, initialise the emulation and futex
 * locks, hook process exit and exec, and compute linux_szplatform and
 * stclohz.
 */
1154 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1156 if (elf32_insert_brand_entry(*brandinfo) < 0)
1159 SET_FOREACH(lihp, linux_ioctl_handler_set)
1160 linux_ioctl_register_handler(*lihp);
1161 SET_FOREACH(ldhp, linux_device_handler_set)
1162 linux_device_register_handler(*ldhp);
1163 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1164 sx_init(&emul_shared_lock, "emuldata->shared lock");
1165 LIST_INIT(&futex_list);
1166 mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
1167 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1168 linux_proc_exit, NULL, 1000);
1169 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1170 linux_proc_exec, NULL, 1000);
1171 linux_szplatform = roundup(strlen(linux_platform) + 1,
1173 linux_osd_jail_register();
/* Use stathz when it is non-zero, otherwise fall back to hz. */
1174 stclohz = (stathz ? stathz : hz);
1176 printf("Linux ELF exec handler installed\n");
1178 printf("cannot insert Linux ELF brand handler\n");
/*
 * Removal: check each brand with elf32_brand_inuse(), remove the brand
 * entries, then undo the registrations and destroy the locks set up by
 * the installation path.
 */
1181 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1183 if (elf32_brand_inuse(*brandinfo))
1186 for (brandinfo = &linux_brandlist[0];
1187 *brandinfo != NULL; ++brandinfo)
1188 if (elf32_remove_brand_entry(*brandinfo) < 0)
1192 SET_FOREACH(lihp, linux_ioctl_handler_set)
1193 linux_ioctl_unregister_handler(*lihp);
1194 SET_FOREACH(ldhp, linux_device_handler_set)
1195 linux_device_unregister_handler(*ldhp);
1196 mtx_destroy(&emul_lock);
1197 sx_destroy(&emul_shared_lock);
1198 mtx_destroy(&futex_mtx);
1199 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1200 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1201 linux_osd_jail_deregister();
1203 printf("Linux ELF exec handler removed\n");
1205 printf("Could not deinstall ELF interpreter entry\n");
/*
 * Module glue: linux_elf_mod is declared to the kernel under the name
 * linuxelf at SI_SUB_EXEC with SI_ORDER_ANY.
 */
1213 static moduledata_t linux_elf_mod = {
1219 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);