2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
38 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
41 #define __ELF_WORD_SIZE 32
43 #include <sys/param.h>
44 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
86 MODULE_VERSION(linux, 1);
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
/*
 * AUXARGS_ENTRY_32(pos, id, val): store one auxiliary-vector entry as two
 * consecutive 32-bit words with suword32() -- first the id, then the value --
 * post-incrementing pos after each store.  pos is evaluated (and modified)
 * twice, so it must be a plain pointer variable.
 */
90 #define AUXARGS_ENTRY_32(pos, id, val) \
92 suword32(pos++, id); \
93 suword32(pos++, val); \
/*
 * SHELLMAGIC is the 16-bit value that exec_linux_imgact_try() compares with
 * the first short of the image header: the two characters # and ! as they
 * read in the host byte order.
 */
96 #if BYTE_ORDER == LITTLE_ENDIAN
97 #define SHELLMAGIC 0x2123 /* #! */
99 #define SHELLMAGIC 0x2321
/* Both sendsig routines are given syscall number 0 for ldebug()/ARGS() use. */
103 * Allow the sendsig functions to use the ldebug() facility
104 * even though they are not syscalls themselves. Map them
105 * to syscall 0. This is slightly less bogus than using
108 #define LINUX_SYS_linux_rt_sendsig 0
109 #define LINUX_SYS_linux_sendsig 0
/*
 * Platform string copied out to the new image by linux_copyout_strings() and
 * advertised through LINUX_AT_PLATFORM in elf_linux_fixup().
 * linux_szplatform is its rounded-up size, computed in linux_elf_modevent().
 */
111 const char *linux_platform = "i686";
112 static int linux_szplatform;
/* Signal code and its size; declared extern, defined outside this file. */
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
/* Linux system call table, installed as sv_table in elf_linux_sysvec. */
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
/* Handler sets iterated with SET_FOREACH() in linux_elf_modevent(). */
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
/* Forward declarations of static functions defined later in this file. */
121 static int elf_linux_fixup(register_t **stack_base,
122 struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
126 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
127 static void exec_linux_setregs(struct thread *td, u_long entry,
128 u_long stack, u_long ps_strings);
129 static void linux32_fixlimit(struct rlimit *rl, int which);
130 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
/*
 * Tags returned by EVENTHANDLER_REGISTER() in linux_elf_modevent(), kept so
 * the same handlers can be passed to EVENTHANDLER_DEREGISTER() later.
 */
132 static eventhandler_tag linux_exit_tag;
133 static eventhandler_tag linux_schedtail_tag;
134 static eventhandler_tag linux_exec_tag;
/*
 * bsd_to_linux_errno: indexed by FreeBSD errno value (ELAST + 1 slots); each
 * entry is the already-negated Linux errno.  Installed as sv_errtbl in
 * elf_linux_sysvec.
 * NOTE(review): the final rows of the table are not visible in this excerpt.
 */
137 * Linux syscalls return negative errno's, we do positive and map them
139 * FreeBSD: src/sys/sys/errno.h
140 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
141 * linux-2.6.17.8/include/asm-generic/errno.h
143 static int bsd_to_linux_errno[ELAST + 1] = {
144 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
145 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
146 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
147 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
148 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
149 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
150 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
151 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
152 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
/*
 * bsd_to_linux_signal: FreeBSD-to-Linux signal number table with
 * LINUX_SIGTBLSZ slots.  Installed as sv_sigtbl in elf_linux_sysvec and
 * indexed with _SIG_IDX(sig) by the sendsig routines in this file.
 * Slots holding 0 presumably mark FreeBSD signals that have no Linux
 * equivalent -- confirm against the consumers of the table.
 */
156 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
157 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
158 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
159 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
160 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
161 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
162 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
163 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
164 0, LINUX_SIGUSR1, LINUX_SIGUSR2
/*
 * linux_to_bsd_signal: the reverse mapping, Linux signal slot to FreeBSD
 * signal number, also LINUX_SIGTBLSZ slots.  It is not referenced by the
 * visible part of this file; it has external linkage, so it is presumably
 * consumed by other Linux emulation sources -- TODO confirm the indexing
 * convention against those callers.
 */
167 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
168 SIGHUP, SIGINT, SIGQUIT, SIGILL,
169 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
170 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
171 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
172 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
173 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
174 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
175 SIGIO, SIGURG, SIGSYS
/*
 * _bsd_to_linux_trapcode: indexed by FreeBSD trap code (the per-entry
 * comments give the index and the T_* name); each slot holds the Linux trap
 * number that the sendsig routines store in sc_trapno, or LINUX_T_UNKNOWN
 * where no mapping is given.
 * NOTE(review): several rows (indices 3, 18-22 and 28) are not visible in
 * this excerpt.
 */
178 #define LINUX_T_UNKNOWN 255
179 static int _bsd_to_linux_trapcode[] = {
180 LINUX_T_UNKNOWN, /* 0 */
181 6, /* 1 T_PRIVINFLT */
182 LINUX_T_UNKNOWN, /* 2 */
184 LINUX_T_UNKNOWN, /* 4 */
185 LINUX_T_UNKNOWN, /* 5 */
186 16, /* 6 T_ARITHTRAP */
187 254, /* 7 T_ASTFLT */
188 LINUX_T_UNKNOWN, /* 8 */
189 13, /* 9 T_PROTFLT */
190 1, /* 10 T_TRCTRAP */
191 LINUX_T_UNKNOWN, /* 11 */
192 14, /* 12 T_PAGEFLT */
193 LINUX_T_UNKNOWN, /* 13 */
194 17, /* 14 T_ALIGNFLT */
195 LINUX_T_UNKNOWN, /* 15 */
196 LINUX_T_UNKNOWN, /* 16 */
197 LINUX_T_UNKNOWN, /* 17 */
203 8, /* 23 T_DOUBLEFLT */
204 9, /* 24 T_FPOPFLT */
205 10, /* 25 T_TSSFLT */
206 11, /* 26 T_SEGNPFLT */
207 12, /* 27 T_STKFLT */
209 19, /* 29 T_XMMFLT */
210 15 /* 30 T_RESERVED */
/*
 * Bounds-checked lookup: codes below the element count index the table.
 * NOTE(review): the out-of-range arm of the conditional is not visible in
 * this excerpt.
 */
212 #define bsd_to_linux_trapcode(code) \
213 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
214 _bsd_to_linux_trapcode[(code)]: \
/*
 * 32-bit layout of the ps_strings area that linux_copyout_strings() fills in
 * with suword32(); the two pointer fields are stored as 32-bit integers.
 */
217 struct linux32_ps_strings {
218 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
219 u_int ps_nargvstr; /* the number of argument strings */
220 u_int32_t ps_envstr; /* first of 0 or more environment strings */
221 u_int ps_nenvstr; /* the number of environment strings */
/*
 * translate_traps(signal, trap_code): installed as sv_transtrap in
 * elf_linux_sysvec.  The visible code only tests whether signal is SIGBUS.
 * NOTE(review): the return type, the braces and the rest of the body are not
 * visible in this excerpt, so the actual translation is not documented here.
 */
225 * If FreeBSD & Linux have a difference of opinion about what a trap
226 * means, deal with it here.
231 translate_traps(int signal, int trap_code)
233 if (signal != SIGBUS)
/*
 * elf_linux_fixup: sv_fixup hook of elf_linux_sysvec.
 *
 * Writes the ELF auxiliary vector for a new 32-bit Linux image onto the user
 * stack, starting just past the argv/envp pointer area (argc + envc + 2
 * 32-bit slots above the current stack base), then frees imgp->auxargs,
 * stores argc at the base with suword32() and hands the base back through
 * *stack_base.
 *
 * stack_base: in/out pointer to the user stack base.
 * imgp:       image parameters; imgp->auxargs is consumed and set to NULL.
 */
247 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
251 Elf32_Addr *pos, *uplatform;
252 struct linux32_ps_strings *arginfo;
254 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
/*
 * uplatform is the user address later exported as LINUX_AT_PLATFORM; it is
 * computed downward from arginfo (the tail of the expression is not visible
 * in this excerpt).
 */
255 uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
258 KASSERT(curthread->td_proc == imgp->proc,
259 ("unsafe elf_linux_fixup(), should be curproc"));
260 base = (Elf32_Addr *)*stack_base;
261 args = (Elf32_Auxargs *)imgp->auxargs;
262 pos = base + (imgp->args->argc + imgp->args->envc + 2);
264 AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
267 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
268 * as it has appeared in the 2.4.0-rc7 first time.
269 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
270 * glibc falls back to the hard-coded CLK_TCK value when aux entry
272 * Also see linux_times() implementation.
274 if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
275 AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
276 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
277 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
278 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
279 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
280 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
281 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
282 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
283 AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
/*
 * NOTE(review): AT_EUID and AT_EGID are filled from the saved ids
 * (cr_svuid / cr_svgid) rather than from effective-id fields -- confirm
 * that this is intended.
 */
284 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
285 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
286 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
287 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
288 AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
/* AT_EXECFD is emitted only when execfd is not -1. */
289 if (args->execfd != -1)
290 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
291 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
/* The kernel copy of the aux arguments is no longer needed. */
293 free(imgp->auxargs, M_TEMP);
294 imgp->auxargs = NULL;
297 suword32(base, (uint32_t)imgp->args->argc);
298 *stack_base = (register_t *)base;
/*
 * Defined outside this file; linux_rt_sendsig() adds it to the address of
 * the signal code when it sets tf_rip.
 */
302 extern unsigned long linux_sznonrtsigcode;
/*
 * linux_rt_sendsig: deliver a signal using the RT frame layout
 * (struct l_rt_sigframe).  Called by linux_sendsig() for handlers installed
 * with SA_SIGINFO.
 *
 * Builds the frame in a kernel-side copy, copies it out to the user stack
 * (or to the alternate signal stack), and rewrites the trapframe so the
 * thread resumes in the signal code with the user segment selectors loaded.
 * The proc lock and ps_mtx are asserted to be held on entry.
 *
 * catcher: user handler address, stored in sf_handler.
 * ksi:     signal information (number, code, fault address).
 * mask:    signal mask to save in the context for sigreturn.
 */
305 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
307 struct thread *td = curthread;
308 struct proc *p = td->td_proc;
310 struct trapframe *regs;
311 struct l_rt_sigframe *fp, frame;
316 sig = ksi->ksi_signo;
317 code = ksi->ksi_code;
318 PROC_LOCK_ASSERT(p, MA_OWNED);
320 mtx_assert(&psp->ps_mtx, MA_OWNED);
322 oonstack = sigonstack(regs->tf_rsp);
325 if (ldebug(rt_sendsig))
326 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
327 catcher, sig, (void*)mask, code);
/*
 * The frame goes at the top of the alternate stack when one is enabled
 * (TDP_ALTSTACK), the thread is not already on it, and this signal asked
 * for it; otherwise it goes directly below the current user stack pointer.
 */
330 * Allocate space for the signal handler context.
332 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
333 SIGISMEMBER(psp->ps_sigonstack, sig)) {
334 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
335 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
337 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
/* ps_mtx is released here and taken again at the end of the function. */
338 mtx_unlock(&psp->ps_mtx);
341 * Build the argument list for the signal handler.
/* Translate the signal number through the ABI signal table, if there is one. */
343 if (p->p_sysent->sv_sigtbl)
344 if (sig <= p->p_sysent->sv_sigsize)
345 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
347 bzero(&frame, sizeof(frame));
349 frame.sf_handler = PTROUT(catcher);
/* These two pointers refer to fields of the user-space copy of the frame. */
351 frame.sf_siginfo = PTROUT(&fp->sf_si);
352 frame.sf_ucontext = PTROUT(&fp->sf_sc);
354 /* Fill in POSIX parts */
355 ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
358 * Build the signal context to be used by sigreturn.
360 frame.sf_sc.uc_flags = 0; /* XXX ??? */
361 frame.sf_sc.uc_link = 0; /* XXX ??? */
363 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
364 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
365 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
366 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
369 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
/* Save the interrupted register state in the Linux machine context. */
371 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
372 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
373 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
374 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
375 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
376 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
377 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
378 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
379 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
380 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
381 frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
382 frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
383 frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
384 frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
385 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
386 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
387 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
388 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
389 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
390 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
393 if (ldebug(rt_sendsig))
394 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
395 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
396 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
399 if (copyout(&frame, fp, sizeof(frame)) != 0) {
401 * Process has trashed its stack; give it an illegal
402 * instruction to halt it in its tracks.
405 if (ldebug(rt_sendsig))
406 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
414 * Build context to run handler in.
/*
 * The signal code sits immediately below LINUX32_PS_STRINGS; execution
 * resumes linux_sznonrtsigcode bytes into it, with the new frame as stack.
 */
416 regs->tf_rsp = PTROUT(fp);
417 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
418 linux_sznonrtsigcode;
419 regs->tf_rflags &= ~(PSL_T | PSL_D);
420 regs->tf_cs = _ucode32sel;
421 regs->tf_ss = _udatasel;
422 regs->tf_ds = _udatasel;
423 regs->tf_es = _udatasel;
424 regs->tf_fs = _ufssel;
425 regs->tf_gs = _ugssel;
426 regs->tf_flags = TF_HASSEGS;
427 td->td_pcb->pcb_full_iret = 1;
429 mtx_lock(&psp->ps_mtx);
/*
 * linux_sendsig: sv_sendsig hook of elf_linux_sysvec.
 *
 * Handlers installed with SA_SIGINFO are forwarded to linux_rt_sendsig();
 * every other handler gets the non-RT struct l_sigframe built here, copied
 * out to the user stack (or the alternate signal stack), with the trapframe
 * rewritten to resume at the start of the signal code.
 * The proc lock and ps_mtx are asserted to be held on entry.
 *
 * catcher: user handler address, stored in sf_handler.
 * ksi:     signal information (number, code, fault address).
 * mask:    signal mask to save in the frame for sigreturn.
 */
434 * Send an interrupt to process.
436 * Stack is set up to allow sigcode stored
437 * in u. to call routine, followed by kcall
438 * to sigreturn routine below. After sigreturn
439 * resets the signal mask, the stack, and the
440 * frame pointer, it returns to the user
444 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
446 struct thread *td = curthread;
447 struct proc *p = td->td_proc;
449 struct trapframe *regs;
450 struct l_sigframe *fp, frame;
455 sig = ksi->ksi_signo;
456 code = ksi->ksi_code;
457 PROC_LOCK_ASSERT(p, MA_OWNED);
459 mtx_assert(&psp->ps_mtx, MA_OWNED);
460 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
461 /* Signal handler installed with SA_SIGINFO. */
462 linux_rt_sendsig(catcher, ksi, mask);
467 oonstack = sigonstack(regs->tf_rsp);
471 printf(ARGS(sendsig, "%p, %d, %p, %u"),
472 catcher, sig, (void*)mask, code);
/*
 * Same placement rule as the RT path: top of the alternate stack when it is
 * enabled, requested for this signal and not already in use, else directly
 * below the current user stack pointer.
 */
476 * Allocate space for the signal handler context.
478 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
479 SIGISMEMBER(psp->ps_sigonstack, sig)) {
480 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
481 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
483 fp = (struct l_sigframe *)regs->tf_rsp - 1;
/* ps_mtx is released here and taken again at the end of the function. */
484 mtx_unlock(&psp->ps_mtx);
488 * Build the argument list for the signal handler.
/* Translate the signal number through the ABI signal table, if there is one. */
490 if (p->p_sysent->sv_sigtbl)
491 if (sig <= p->p_sysent->sv_sigsize)
492 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
494 bzero(&frame, sizeof(frame));
496 frame.sf_handler = PTROUT(catcher);
/*
 * Word 0 of the converted mask is saved in sc_mask; the remaining words go
 * into sf_extramask further down.
 */
499 bsd_to_linux_sigset(mask, &lmask);
502 * Build the signal context to be used by sigreturn.
504 frame.sf_sc.sc_mask = lmask.__bits[0];
505 frame.sf_sc.sc_gs = regs->tf_gs;
506 frame.sf_sc.sc_fs = regs->tf_fs;
507 frame.sf_sc.sc_es = regs->tf_es;
508 frame.sf_sc.sc_ds = regs->tf_ds;
509 frame.sf_sc.sc_edi = regs->tf_rdi;
510 frame.sf_sc.sc_esi = regs->tf_rsi;
511 frame.sf_sc.sc_ebp = regs->tf_rbp;
512 frame.sf_sc.sc_ebx = regs->tf_rbx;
513 frame.sf_sc.sc_edx = regs->tf_rdx;
514 frame.sf_sc.sc_ecx = regs->tf_rcx;
515 frame.sf_sc.sc_eax = regs->tf_rax;
516 frame.sf_sc.sc_eip = regs->tf_rip;
517 frame.sf_sc.sc_cs = regs->tf_cs;
518 frame.sf_sc.sc_eflags = regs->tf_rflags;
519 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
520 frame.sf_sc.sc_ss = regs->tf_ss;
521 frame.sf_sc.sc_err = regs->tf_err;
522 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
523 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
525 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
526 frame.sf_extramask[i] = lmask.__bits[i+1];
528 if (copyout(&frame, fp, sizeof(frame)) != 0) {
530 * Process has trashed its stack; give it an illegal
531 * instruction to halt it in its tracks.
538 * Build context to run handler in.
/*
 * Resume at the very start of the signal code, which sits immediately below
 * LINUX32_PS_STRINGS, with the new frame as the stack pointer.
 */
540 regs->tf_rsp = PTROUT(fp);
541 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
542 regs->tf_rflags &= ~(PSL_T | PSL_D);
543 regs->tf_cs = _ucode32sel;
544 regs->tf_ss = _udatasel;
545 regs->tf_ds = _udatasel;
546 regs->tf_es = _udatasel;
547 regs->tf_fs = _ufssel;
548 regs->tf_gs = _ugssel;
549 regs->tf_flags = TF_HASSEGS;
550 td->td_pcb->pcb_full_iret = 1;
552 mtx_lock(&psp->ps_mtx);
/*
 * linux_sigreturn: counterpart of linux_sendsig().
 *
 * Copies the struct l_sigframe in from args->sfp, checks the saved eflags
 * and %cs against privilege escalation, restores the thread signal mask from
 * sc_mask plus sf_extramask, reloads the trapframe from the saved context
 * and returns EJUSTRETURN.
 * NOTE(review): the error returns taken when copyin() or the eflags check
 * fails are not visible in this excerpt.
 */
556 * System call to cleanup state after a signal
557 * has been taken. Reset signal mask and
558 * stack state from context left by sendsig (above).
559 * Return to previous pc and psl as specified by
560 * context left by sendsig. Check carefully to
561 * make sure that the user has not modified the
562 * psl to gain improper privileges or to cause
566 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
568 struct proc *p = td->td_proc;
569 struct l_sigframe frame;
570 struct trapframe *regs;
578 if (ldebug(sigreturn))
579 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
582 * The trampoline code hands us the sigframe.
583 * It is unsafe to keep track of it ourselves, in the event that a
584 * program jumps out of a signal handler.
586 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
590 * Check for security violations.
592 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
593 eflags = frame.sf_sc.sc_eflags;
595 * XXX do allow users to change the privileged flag PSL_RF. The
596 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
597 * sometimes set it there too. tf_eflags is kept in the signal
598 * context during signal handling and there is no other place
599 * to remember it, so the PSL_RF bit may be corrupted by the
600 * signal handler without us knowing. Corruption of the PSL_RF
601 * bit at worst causes one more or one less debugger trap, so
602 * allowing it is fairly harmless.
604 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
608 * Don't allow users to load a valid privileged %cs. Let the
609 * hardware check for invalid selectors, excess privilege in
610 * other selectors, invalid %eip's and invalid %esp's.
612 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
/* A non-user %cs in the frame raises SIGBUS (BUS_OBJERR, T_PROTFLT). */
613 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
614 ksiginfo_init_trap(&ksi);
615 ksi.ksi_signo = SIGBUS;
616 ksi.ksi_code = BUS_OBJERR;
617 ksi.ksi_trapno = T_PROTFLT;
618 ksi.ksi_addr = (void *)regs->tf_rip;
619 trapsignal(td, &ksi);
/* Reassemble the Linux mask, convert it, then apply SIG_CANTMASK(). */
623 lmask.__bits[0] = frame.sf_sc.sc_mask;
624 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
625 lmask.__bits[i+1] = frame.sf_extramask[i];
627 linux_to_bsd_sigset(&lmask, &td->td_sigmask);
628 SIG_CANTMASK(td->td_sigmask);
633 * Restore signal context.
635 regs->tf_rdi = frame.sf_sc.sc_edi;
636 regs->tf_rsi = frame.sf_sc.sc_esi;
637 regs->tf_rbp = frame.sf_sc.sc_ebp;
638 regs->tf_rbx = frame.sf_sc.sc_ebx;
639 regs->tf_rdx = frame.sf_sc.sc_edx;
640 regs->tf_rcx = frame.sf_sc.sc_ecx;
641 regs->tf_rax = frame.sf_sc.sc_eax;
642 regs->tf_rip = frame.sf_sc.sc_eip;
643 regs->tf_cs = frame.sf_sc.sc_cs;
644 regs->tf_ds = frame.sf_sc.sc_ds;
645 regs->tf_es = frame.sf_sc.sc_es;
646 regs->tf_fs = frame.sf_sc.sc_fs;
647 regs->tf_gs = frame.sf_sc.sc_gs;
648 regs->tf_rflags = eflags;
649 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
650 regs->tf_ss = frame.sf_sc.sc_ss;
651 td->td_pcb->pcb_full_iret = 1;
/* EJUSTRETURN: the trapframe already holds the complete state to resume. */
653 return (EJUSTRETURN);
/*
 * linux_rt_sigreturn: counterpart of linux_rt_sendsig().
 *
 * Copies the struct l_ucontext in from args->ucp, checks the saved eflags
 * and %cs against privilege escalation, restores the thread signal mask from
 * uc_sigmask, reloads the trapframe from uc_mcontext, re-applies the
 * alternate-stack settings through kern_sigaltstack() and returns
 * EJUSTRETURN.
 * NOTE(review): the error returns taken when copyin() or the eflags check
 * fails are not visible in this excerpt.
 */
657 * System call to cleanup state after a signal
658 * has been taken. Reset signal mask and
659 * stack state from context left by rt_sendsig (above).
660 * Return to previous pc and psl as specified by
661 * context left by sendsig. Check carefully to
662 * make sure that the user has not modified the
663 * psl to gain improper privileges or to cause
667 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
669 struct proc *p = td->td_proc;
670 struct l_ucontext uc;
671 struct l_sigcontext *context;
674 struct trapframe *regs;
681 if (ldebug(rt_sigreturn))
682 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
685 * The trampoline code hands us the ucontext.
686 * It is unsafe to keep track of it ourselves, in the event that a
687 * program jumps out of a signal handler.
689 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
692 context = &uc.uc_mcontext;
695 * Check for security violations.
697 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
698 eflags = context->sc_eflags;
700 * XXX do allow users to change the privileged flag PSL_RF. The
701 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
702 * sometimes set it there too. tf_eflags is kept in the signal
703 * context during signal handling and there is no other place
704 * to remember it, so the PSL_RF bit may be corrupted by the
705 * signal handler without us knowing. Corruption of the PSL_RF
706 * bit at worst causes one more or one less debugger trap, so
707 * allowing it is fairly harmless.
709 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
713 * Don't allow users to load a valid privileged %cs. Let the
714 * hardware check for invalid selectors, excess privilege in
715 * other selectors, invalid %eip's and invalid %esp's.
717 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
/* A non-user %cs in the context raises SIGBUS (BUS_OBJERR, T_PROTFLT). */
718 if (!CS_SECURE(context->sc_cs)) {
719 ksiginfo_init_trap(&ksi);
720 ksi.ksi_signo = SIGBUS;
721 ksi.ksi_code = BUS_OBJERR;
722 ksi.ksi_trapno = T_PROTFLT;
723 ksi.ksi_addr = (void *)regs->tf_rip;
724 trapsignal(td, &ksi);
/* Convert the saved Linux mask, then apply SIG_CANTMASK(). */
729 linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
730 SIG_CANTMASK(td->td_sigmask);
735 * Restore signal context
737 regs->tf_gs = context->sc_gs;
738 regs->tf_fs = context->sc_fs;
739 regs->tf_es = context->sc_es;
740 regs->tf_ds = context->sc_ds;
741 regs->tf_rdi = context->sc_edi;
742 regs->tf_rsi = context->sc_esi;
743 regs->tf_rbp = context->sc_ebp;
744 regs->tf_rbx = context->sc_ebx;
745 regs->tf_rdx = context->sc_edx;
746 regs->tf_rcx = context->sc_ecx;
747 regs->tf_rax = context->sc_eax;
748 regs->tf_rip = context->sc_eip;
749 regs->tf_cs = context->sc_cs;
750 regs->tf_rflags = eflags;
751 regs->tf_rsp = context->sc_esp_at_signal;
752 regs->tf_ss = context->sc_ss;
753 td->td_pcb->pcb_full_iret = 1;
/*
 * lss presumably points at the stack description inside uc -- its
 * assignment is not visible in this excerpt.  The kern_sigaltstack()
 * result is explicitly discarded with a (void) cast.
 */
756 * call sigaltstack & ignore results..
759 ss.ss_sp = PTRIN(lss->ss_sp);
760 ss.ss_size = lss->ss_size;
761 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
764 if (ldebug(rt_sigreturn))
765 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
766 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
768 (void)kern_sigaltstack(td, &ss, NULL);
770 return (EJUSTRETURN);
/*
 * linux_prepsyscall: sv_prepsyscall hook of elf_linux_sysvec.
 *
 * Fills args[0..5] from the trapframe registers rbx, rcx, rdx, rsi, rdi and
 * rbp, in that order, and sets *params to NULL.  The code parameter is not
 * touched by the visible lines.
 */
777 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
779 args[0] = tf->tf_rbx;
780 args[1] = tf->tf_rcx;
781 args[2] = tf->tf_rdx;
782 args[3] = tf->tf_rsi;
783 args[4] = tf->tf_rdi;
784 args[5] = tf->tf_rbp; /* Unconfirmed */
785 *params = NULL; /* no copyin */
/*
 * exec_linux_imgact_try: sv_imgact_try hook of elf_linux_sysvec.
 *
 * When the image header starts with SHELLMAGIC it runs exec_shell_imgact()
 * and, if that succeeds, asks linux_emul_convpath() for an alternate
 * interpreter path.  The alternate path replaces imgp->interpreter_name when
 * its length including the terminator fits in MAXSHELLCMDLEN bytes.
 * NOTE(review): the handling of a missing alternate path, the release of
 * rpath and the return statement are not visible in this excerpt.
 */
789 * If a linux binary is exec'ing something, try this image activator
790 * first. We override standard shell script execution in order to
791 * be able to modify the interpreter path. We only do this if a linux
792 * binary is doing the exec, so we do not create an EXEC module for it.
794 static int exec_linux_imgact_try(struct image_params *iparams);
797 exec_linux_imgact_try(struct image_params *imgp)
799 const char *head = (const char *)imgp->image_header;
804 * The interpreter for shell scripts run from a linux binary needs
805 * to be located in /compat/linux if possible in order to recursively
806 * maintain linux path emulation.
808 if (((const short *)head)[0] == SHELLMAGIC) {
810 * Run our normal shell image activator. If it succeeds attempt
811 * to use the alternate path for the interpreter. If an
812 * alternate * path is found, use our stringspace to store it.
814 if ((error = exec_shell_imgact(imgp)) == 0) {
815 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
816 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
819 len = strlen(rpath) + 1;
821 if (len <= MAXSHELLCMDLEN) {
822 memcpy(imgp->interpreter_name, rpath,
/*
 * exec_linux_setregs: sv_setregs hook of elf_linux_sysvec.
 *
 * Resets the register state of thread td for a newly executed 32-bit Linux
 * image: clears the FS and kernel-GS base MSRs, sets the initial FPU control
 * word to __LINUX_NPXCW__, zeroes the trapframe, then loads the entry point,
 * the stack pointer, the user segment selectors and passes ps_strings in
 * rbx.  Finally it flags the pcb with PCB_FULLCTX | PCB_32BIT, clears
 * PCB_GS32BIT and zeroes td_retval[1].
 * NOTE(review): the old-style parameter declarations and the LDT handling
 * body are not visible in this excerpt.
 */
833 * Clear registers on exec
834 * XXX copied from ia32_signal.c.
837 exec_linux_setregs(td, entry, stack, ps_strings)
843 struct trapframe *regs = td->td_frame;
844 struct pcb *pcb = td->td_pcb;
847 if (td->td_proc->p_md.md_ldt != NULL)
850 mtx_unlock(&dt_lock);
853 wrmsr(MSR_FSBASE, 0);
854 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
858 pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
860 bzero((char *)regs, sizeof(struct trapframe));
861 regs->tf_rip = entry;
862 regs->tf_rsp = stack;
/*
 * NOTE(review): regs was zeroed by the bzero() three lines up, so
 * (regs->tf_rflags & PSL_T) is always 0 here and the trace flag cannot
 * survive the exec.  Confirm whether that is intended.
 */
863 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
864 regs->tf_gs = _ugssel;
865 regs->tf_fs = _ufssel;
866 regs->tf_es = _udatasel;
867 regs->tf_ds = _udatasel;
868 regs->tf_ss = _udatasel;
869 regs->tf_flags = TF_HASSEGS;
870 regs->tf_cs = _ucode32sel;
871 regs->tf_rbx = ps_strings;
872 td->td_pcb->pcb_full_iret = 1;
873 load_cr0(rcr0() | CR0_MP | CR0_TS);
876 /* Return via doreti so that we can change to a different %cs */
877 pcb->pcb_flags |= PCB_FULLCTX | PCB_32BIT;
878 pcb->pcb_flags &= ~PCB_GS32BIT;
879 td->td_retval[1] = 0;
/*
 * linux_copyout_strings: sv_copyout_strings hook of elf_linux_sysvec.
 *
 * Lays out the top of the new image's user stack, working downward from
 * LINUX32_PS_STRINGS: the signal code, the platform string, the argument and
 * environment strings, and below them the 32-bit argv/envp pointer vectors
 * (with extra room for the auxiliary vector in the auxargs case).  It then
 * fills in the 32-bit ps_strings structure and returns the initial stack
 * base.
 *
 * imgp: image parameters; args->begin_argv, argc, envc and stringspace are
 *       read, and auxarg_size is defaulted to LINUX_AT_COUNT * 2 when zero.
 * Returns stack_base cast to register_t *.
 */
883 * XXX copied from ia32_sysvec.c.
886 linux_copyout_strings(struct image_params *imgp)
890 char *stringp, *destp;
891 u_int32_t *stack_base;
892 struct linux32_ps_strings *arginfo;
895 * Calculate string base and vector table pointers.
896 * Also deal with signal trampoline code for this exec type.
898 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
899 destp = (caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
900 linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
/*
 * NOTE(review): the results of the copyout() and suword32() calls in this
 * function are not checked, so a fault on the user stack is not reported
 * from here.
 */
906 copyout(imgp->proc->p_sysent->sv_sigcode,
907 ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
910 * Install LINUX_PLATFORM
912 copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
913 linux_szplatform), linux_szplatform);
916 * If we have a valid auxargs ptr, prepare some room
921 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
922 * lower compatibility.
924 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
925 (LINUX_AT_COUNT * 2);
927 * The '+ 2' is for the null pointers at the end of each of
928 * the arg and env vector sets,and imgp->auxarg_size is room
929 * for argument of Runtime loader.
931 vectp = (u_int32_t *) (destp - (imgp->args->argc +
932 imgp->args->envc + 2 + imgp->auxarg_size) *
937 * The '+ 2' is for the null pointers at the end of each of
938 * the arg and env vector sets
940 vectp = (u_int32_t *)(destp - (imgp->args->argc +
941 imgp->args->envc + 2) * sizeof(u_int32_t));
944 * vectp also becomes our initial stack base
948 stringp = imgp->args->begin_argv;
949 argc = imgp->args->argc;
950 envc = imgp->args->envc;
952 * Copy out strings - arguments and environment.
954 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
957 * Fill in "ps_strings" struct for ps, w, etc.
959 suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
960 suword32(&arginfo->ps_nargvstr, argc);
/*
 * Each vector slot receives the 32-bit user address of its string; stringp
 * walks the kernel copy to find where each string ends.  The lines that
 * advance destp are not visible in this excerpt.
 */
963 * Fill in argument portion of vector table.
965 for (; argc > 0; --argc) {
966 suword32(vectp++, (uint32_t)(intptr_t)destp);
967 while (*stringp++ != 0)
972 /* a null vector table pointer separates the argp's from the envp's */
973 suword32(vectp++, 0);
975 suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
976 suword32(&arginfo->ps_nenvstr, envc);
979 * Fill in environment portion of vector table.
981 for (; envc > 0; --envc) {
982 suword32(vectp++, (uint32_t)(intptr_t)destp);
983 while (*stringp++ != 0)
988 /* end of vector table is a null pointer */
991 return ((register_t *)stack_base);
/*
 * compat.linux32 sysctl node with three read-write tunables (maxdsiz,
 * maxssiz, maxvmem).  linux32_fixlimit() uses them as upper bounds and
 * skips a bound whose tunable is 0.
 */
994 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
995 "32-bit Linux emulation");
997 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
998 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
999 &linux32_maxdsiz, 0, "");
1000 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
1001 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
1002 &linux32_maxssiz, 0, "");
1003 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
1004 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1005 &linux32_maxvmem, 0, "");
/*
 * linux32_fixlimit: sv_fixlimit hook of elf_linux_sysvec.
 *
 * Clamps both rl->rlim_cur and rl->rlim_max to the matching linux32_max*
 * tunable, and only when that tunable is non-zero.
 * NOTE(review): the switch on which and its case labels are not visible in
 * this excerpt, so the resource-to-tunable pairing is not documented here.
 */
1008 linux32_fixlimit(struct rlimit *rl, int which)
1013 if (linux32_maxdsiz != 0) {
1014 if (rl->rlim_cur > linux32_maxdsiz)
1015 rl->rlim_cur = linux32_maxdsiz;
1016 if (rl->rlim_max > linux32_maxdsiz)
1017 rl->rlim_max = linux32_maxdsiz;
1021 if (linux32_maxssiz != 0) {
1022 if (rl->rlim_cur > linux32_maxssiz)
1023 rl->rlim_cur = linux32_maxssiz;
1024 if (rl->rlim_max > linux32_maxssiz)
1025 rl->rlim_max = linux32_maxssiz;
1029 if (linux32_maxvmem != 0) {
1030 if (rl->rlim_cur > linux32_maxvmem)
1031 rl->rlim_cur = linux32_maxvmem;
1032 if (rl->rlim_max > linux32_maxvmem)
1033 rl->rlim_max = linux32_maxvmem;
/*
 * System call vector for the Linux ELF32 ABI.  It ties together the Linux
 * syscall table, the signal and errno translation tables, and the hooks
 * defined in this file (fixup, sendsig, prepsyscall, imgact_try,
 * copyout_strings, setregs, fixlimit).  Both linux_brand and
 * linux_glibc2brand point at it.
 */
1039 struct sysentvec elf_linux_sysvec = {
1040 .sv_size = LINUX_SYS_MAXSYSCALL,
1041 .sv_table = linux_sysent,
1043 .sv_sigsize = LINUX_SIGTBLSZ,
1044 .sv_sigtbl = bsd_to_linux_signal,
1045 .sv_errsize = ELAST + 1,
1046 .sv_errtbl = bsd_to_linux_errno,
1047 .sv_transtrap = translate_traps,
1048 .sv_fixup = elf_linux_fixup,
1049 .sv_sendsig = linux_sendsig,
1050 .sv_sigcode = linux_sigcode,
1051 .sv_szsigcode = &linux_szsigcode,
1052 .sv_prepsyscall = linux_prepsyscall,
1053 .sv_name = "Linux ELF32",
1054 .sv_coredump = elf32_coredump,
1055 .sv_imgact_try = exec_linux_imgact_try,
1056 .sv_minsigstksz = LINUX_MINSIGSTKSZ,
1057 .sv_pagesize = PAGE_SIZE,
1058 .sv_minuser = VM_MIN_ADDRESS,
1059 .sv_maxuser = LINUX32_USRSTACK,
1060 .sv_usrstack = LINUX32_USRSTACK,
1061 .sv_psstrings = LINUX32_PS_STRINGS,
1062 .sv_stackprot = VM_PROT_ALL,
1063 .sv_copyout_strings = linux_copyout_strings,
1064 .sv_setregs = exec_linux_setregs,
1065 .sv_fixlimit = linux32_fixlimit,
1066 .sv_maxssiz = &linux32_maxssiz,
1067 .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32
/* Vendor name and first descriptor word used for the brand note. */
1070 static char GNU_ABI_VENDOR[] = "GNU";
1071 static int GNULINUX_ABI_DESC = 0;
/*
 * linux32_trans_osrel: trans_osrel callback of linux32_brandnote.
 *
 * Locates the note descriptor by skipping the note header and the name
 * (n_namesz rounded up to sizeof(Elf32_Addr)), compares descriptor word 0
 * with GNULINUX_ABI_DESC, and encodes words 1..3 into *osrel as
 * desc[1] * 1000000 + desc[2] * 1000 + desc[3].
 * NOTE(review): the return statements are not visible in this excerpt.
 */
1074 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1076 const Elf32_Word *desc;
1079 p = (uintptr_t)(note + 1);
1080 p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1082 desc = (const Elf32_Word *)p;
1083 if (desc[0] != GNULINUX_ABI_DESC)
1087 * For linux we encode osrel as follows (see linux_mib.c):
1088 * VVVMMMIII (version, major, minor), see linux_mib.c.
1090 *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
/*
 * Brand note description shared by both brand entries: vendor name GNU, a
 * 16-byte descriptor, and OS release translation through
 * linux32_trans_osrel() (BN_TRANSLATE_OSREL).
 */
1095 static Elf_Brandnote linux32_brandnote = {
1096 .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
1097 .hdr.n_descsz = 16, /* XXX at least 16 */
1099 .vendor = GNU_ABI_VENDOR,
1100 .flags = BN_TRANSLATE_OSREL,
1101 .trans_osrel = linux32_trans_osrel
/*
 * Brand entry for Linux binaries whose interpreter is /lib/ld-linux.so.1;
 * paths are resolved under /compat/linux and the image runs with
 * elf_linux_sysvec.
 */
1104 static Elf32_Brandinfo linux_brand = {
1105 .brand = ELFOSABI_LINUX,
1107 .compat_3_brand = "Linux",
1108 .emul_path = "/compat/linux",
1109 .interp_path = "/lib/ld-linux.so.1",
1110 .sysvec = &elf_linux_sysvec,
1111 .interp_newpath = NULL,
1112 .brand_note = &linux32_brandnote,
1113 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * Brand entry for Linux binaries whose interpreter is /lib/ld-linux.so.2;
 * otherwise identical to linux_brand.
 */
1116 static Elf32_Brandinfo linux_glibc2brand = {
1117 .brand = ELFOSABI_LINUX,
1119 .compat_3_brand = "Linux",
1120 .emul_path = "/compat/linux",
1121 .interp_path = "/lib/ld-linux.so.2",
1122 .sysvec = &elf_linux_sysvec,
1123 .interp_newpath = NULL,
1124 .brand_note = &linux32_brandnote,
1125 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * List of brand entries walked by linux_elf_modevent(); the loops there stop
 * at the first NULL entry.  The initializer rows are not visible in this
 * excerpt.
 */
1128 Elf32_Brandinfo *linux_brandlist[] = {
/*
 * linux_elf_modevent: module event handler for the Linux ELF32 emulator.
 *
 * First visible path (install): inserts every brand from linux_brandlist,
 * registers the ioctl and device handler sets, initialises the emulation and
 * futex locks and list, hooks the process_exit, schedtail and process_exec
 * events, computes linux_szplatform and stclohz, and registers the jail OSD
 * support.
 * Second visible path (removal): checks that no brand is in use, removes the
 * brands, unregisters the handler sets, destroys the locks and deregisters
 * the event handlers and jail OSD support.
 * NOTE(review): the switch on type, its case labels, the error bookkeeping
 * and the return statement are not visible in this excerpt.
 */
1135 linux_elf_modevent(module_t mod, int type, void *data)
1137 Elf32_Brandinfo **brandinfo;
1139 struct linux_ioctl_handler **lihp;
1140 struct linux_device_handler **ldhp;
1146 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1148 if (elf32_insert_brand_entry(*brandinfo) < 0)
1151 SET_FOREACH(lihp, linux_ioctl_handler_set)
1152 linux_ioctl_register_handler(*lihp);
1153 SET_FOREACH(ldhp, linux_device_handler_set)
1154 linux_device_register_handler(*ldhp);
1155 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1156 sx_init(&emul_shared_lock, "emuldata->shared lock");
1157 LIST_INIT(&futex_list);
1158 mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
/* Keep the tags so the handlers can be deregistered on removal. */
1159 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1160 linux_proc_exit, NULL, 1000);
1161 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1162 linux_schedtail, NULL, 1000);
1163 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1164 linux_proc_exec, NULL, 1000);
1165 linux_szplatform = roundup(strlen(linux_platform) + 1,
1167 linux_osd_jail_register();
/* Use stathz when it is non-zero, otherwise fall back to hz. */
1168 stclohz = (stathz ? stathz : hz);
1170 printf("Linux ELF exec handler installed\n");
1172 printf("cannot insert Linux ELF brand handler\n");
1175 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1177 if (elf32_brand_inuse(*brandinfo))
1180 for (brandinfo = &linux_brandlist[0];
1181 *brandinfo != NULL; ++brandinfo)
1182 if (elf32_remove_brand_entry(*brandinfo) < 0)
1186 SET_FOREACH(lihp, linux_ioctl_handler_set)
1187 linux_ioctl_unregister_handler(*lihp);
1188 SET_FOREACH(ldhp, linux_device_handler_set)
1189 linux_device_unregister_handler(*ldhp);
1190 mtx_destroy(&emul_lock);
1191 sx_destroy(&emul_shared_lock);
1192 mtx_destroy(&futex_mtx);
1193 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1194 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1195 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1196 linux_osd_jail_deregister();
1198 printf("Linux ELF exec handler removed\n");
1200 printf("Could not deinstall ELF interpreter entry\n");
/*
 * Module glue: declares the linuxelf module at SI_SUB_EXEC / SI_ORDER_ANY.
 * The moduledata_t initializer fields are not visible in this excerpt.
 */
1208 static moduledata_t linux_elf_mod = {
1214 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);