2 * Copyright (c) 2004 Tim J. Robbins
3 * Copyright (c) 2003 Peter Wemm
4 * Copyright (c) 2002 Doug Rabson
5 * Copyright (c) 1998-1999 Andrew Gallatin
6 * Copyright (c) 1994-1996 Søren Schmidt
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer
14 * in this position and unchanged.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. The name of the author may not be used to endorse or promote products
19 * derived from this software without specific prior written permission
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35 #include "opt_compat.h"
37 #ifndef COMPAT_FREEBSD32
38 #error "Unable to compile Linux-emulator due to missing COMPAT_FREEBSD32 option!"
41 #define __ELF_WORD_SIZE 32
43 #include <sys/param.h>
44 #include <sys/systm.h>
46 #include <sys/fcntl.h>
47 #include <sys/imgact.h>
48 #include <sys/imgact_elf.h>
49 #include <sys/kernel.h>
51 #include <sys/malloc.h>
52 #include <sys/module.h>
53 #include <sys/mutex.h>
55 #include <sys/resourcevar.h>
56 #include <sys/signalvar.h>
57 #include <sys/sysctl.h>
58 #include <sys/syscallsubr.h>
59 #include <sys/sysent.h>
60 #include <sys/sysproto.h>
61 #include <sys/vnode.h>
62 #include <sys/eventhandler.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_map.h>
68 #include <vm/vm_object.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_param.h>
72 #include <machine/cpu.h>
73 #include <machine/md_var.h>
74 #include <machine/pcb.h>
75 #include <machine/specialreg.h>
77 #include <amd64/linux32/linux.h>
78 #include <amd64/linux32/linux32_proto.h>
79 #include <compat/linux/linux_futex.h>
80 #include <compat/linux/linux_emul.h>
81 #include <compat/linux/linux_mib.h>
82 #include <compat/linux/linux_misc.h>
83 #include <compat/linux/linux_signal.h>
84 #include <compat/linux/linux_util.h>
/*
 * Declare version 1 of the "linux" module and define the M_LINUX malloc
 * type (short name "linux", description "Linux mode structures").
 */
86 MODULE_VERSION(linux, 1);
88 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
/*
 * AUXARGS_ENTRY_32(pos, id, val): emit one auxiliary-vector pair by calling
 * suword32() for id and then for val, post-incrementing pos after each
 * store.  pos is evaluated (and modified) twice, so it must be a plain
 * lvalue.
 * NOTE(review): the lines wrapping these two statements are missing from
 * this listing; presumably a do { ... } while (0) wrapper -- confirm.
 */
90 #define AUXARGS_ENTRY_32(pos, id, val) \
92 suword32(pos++, id); \
93 suword32(pos++, val); \
96 #if BYTE_ORDER == LITTLE_ENDIAN
/*
 * SHELLMAGIC is the two characters "#!" read as one short; the two values
 * below are the same byte pair in opposite byte orders.  It is compared
 * against the first short of the image header in exec_linux_imgact_try().
 * NOTE(review): the #else / #endif lines are missing from this listing.
 */
97 #define SHELLMAGIC 0x2123 /* #! */
99 #define SHELLMAGIC 0x2321
103 * Allow the sendsig functions to use the ldebug() facility
104 * even though they are not syscalls themselves. Map them
105 * to syscall 0. This is slightly less bogus than using
/*
 * NOTE(review): presumably ldebug(name) / ARGS(name, ...) form these
 * identifiers from the name passed in (e.g. ldebug(rt_sendsig)) -- confirm
 * against the macro definitions in linux.h.
 */
108 #define LINUX_SYS_linux_rt_sendsig 0
109 #define LINUX_SYS_linux_sendsig 0
/*
 * Platform name handed to Linux binaries: copied out to user space in
 * linux_copyout_strings() and referenced by the LINUX_AT_PLATFORM aux entry
 * written in elf_linux_fixup().
 */
111 const char *linux_platform = "i686";
/* Rounded-up size of linux_platform (string plus NUL); set in linux_elf_modevent(). */
112 static int linux_szplatform;
/* Signal trampoline and its size, defined elsewhere; installed as sv_sigcode / sv_szsigcode. */
113 extern char linux_sigcode[];
114 extern int linux_szsigcode;
/* Linux system call table, installed as sv_table in elf_linux_sysvec. */
116 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
/* Handler sets walked with SET_FOREACH() in linux_elf_modevent(). */
118 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
119 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
/* Forward declarations of the sysentvec hooks implemented in this file. */
121 static int elf_linux_fixup(register_t **stack_base,
122 struct image_params *iparams);
123 static register_t *linux_copyout_strings(struct image_params *imgp);
124 static void linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask);
125 static void exec_linux_setregs(struct thread *td, u_long entry,
126 u_long stack, u_long ps_strings);
127 static void linux32_fixlimit(struct rlimit *rl, int which);
128 static boolean_t linux32_trans_osrel(const Elf_Note *note, int32_t *osrel);
/* Tags returned by EVENTHANDLER_REGISTER(); kept so the handlers can be deregistered on unload. */
130 static eventhandler_tag linux_exit_tag;
131 static eventhandler_tag linux_schedtail_tag;
132 static eventhandler_tag linux_exec_tag;
135 * Linux syscalls return negative errno values; FreeBSD uses positive ones, so map them
137 * FreeBSD: src/sys/sys/errno.h
138 * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h
139 * linux-2.6.17.8/include/asm-generic/errno.h
/*
 * Errno translation table with ELAST + 1 slots, indexed by FreeBSD errno.
 * Every visible entry is a negated number, matching the note above that
 * Linux syscalls return negative errno values.  Installed as sv_errtbl in
 * elf_linux_sysvec.
 * NOTE(review): the final rows and the closing brace are missing from this
 * listing.
 */
141 static int bsd_to_linux_errno[ELAST + 1] = {
142 -0, -1, -2, -3, -4, -5, -6, -7, -8, -9,
143 -10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
144 -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
145 -30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
146 -90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
147 -100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
148 -110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
149 -116, -66, -6, -6, -6, -6, -6, -37, -38, -9,
150 -6, -6, -43, -42, -75,-125, -84, -95, -16, -74,
/*
 * FreeBSD-to-Linux signal number table (LINUX_SIGTBLSZ entries).  Installed
 * as sv_sigtbl in elf_linux_sysvec and looked up as
 * sv_sigtbl[_SIG_IDX(sig)] by the sendsig functions.
 * NOTE(review): the 0 entries presumably mark FreeBSD signals with no Linux
 * equivalent -- confirm.  The closing brace is missing from this listing.
 */
154 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
155 LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
156 LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
157 LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
158 LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
159 LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
160 LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
161 LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
162 0, LINUX_SIGUSR1, LINUX_SIGUSR2
/*
 * Linux-to-FreeBSD signal number table (LINUX_SIGTBLSZ entries); the values
 * are FreeBSD signal constants.  Not referenced in the visible part of this
 * file.
 * NOTE(review): presumably indexed by Linux signal number in the same way
 * bsd_to_linux_signal is indexed by FreeBSD signal -- confirm against its
 * users.  The closing brace is missing from this listing.
 */
165 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
166 SIGHUP, SIGINT, SIGQUIT, SIGILL,
167 SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
168 SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
169 SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
170 SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
171 SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
172 SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
173 SIGIO, SIGURG, SIGSYS
/*
 * Trap number translation.  _bsd_to_linux_trapcode[] is indexed by the
 * FreeBSD trap number named in each entry's comment and yields the value the
 * sendsig functions store in sc_trapno; LINUX_T_UNKNOWN (255) fills slots
 * with no mapping.  bsd_to_linux_trapcode(code) bounds-checks code against
 * the array length before indexing.
 * NOTE(review): per the per-entry index comments, rows 3, 18-22 and 28 are
 * missing from this listing, as are the closing brace and the macro's
 * out-of-range arm (presumably LINUX_T_UNKNOWN -- confirm).
 */
176 #define LINUX_T_UNKNOWN 255
177 static int _bsd_to_linux_trapcode[] = {
178 LINUX_T_UNKNOWN, /* 0 */
179 6, /* 1 T_PRIVINFLT */
180 LINUX_T_UNKNOWN, /* 2 */
182 LINUX_T_UNKNOWN, /* 4 */
183 LINUX_T_UNKNOWN, /* 5 */
184 16, /* 6 T_ARITHTRAP */
185 254, /* 7 T_ASTFLT */
186 LINUX_T_UNKNOWN, /* 8 */
187 13, /* 9 T_PROTFLT */
188 1, /* 10 T_TRCTRAP */
189 LINUX_T_UNKNOWN, /* 11 */
190 14, /* 12 T_PAGEFLT */
191 LINUX_T_UNKNOWN, /* 13 */
192 17, /* 14 T_ALIGNFLT */
193 LINUX_T_UNKNOWN, /* 15 */
194 LINUX_T_UNKNOWN, /* 16 */
195 LINUX_T_UNKNOWN, /* 17 */
201 8, /* 23 T_DOUBLEFLT */
202 9, /* 24 T_FPOPFLT */
203 10, /* 25 T_TSSFLT */
204 11, /* 26 T_SEGNPFLT */
205 12, /* 27 T_STKFLT */
207 19, /* 29 T_XMMFLT */
208 15 /* 30 T_RESERVED */
210 #define bsd_to_linux_trapcode(code) \
211 ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
212 _bsd_to_linux_trapcode[(code)]: \
/*
 * 32-bit layout of the ps_strings record.  linux_copyout_strings() fills it
 * in at LINUX32_PS_STRINGS with suword32(): the two *str members receive
 * user addresses truncated to 32 bits, the two n*str members the counts.
 * NOTE(review): the closing brace is missing from this listing.
 */
215 struct linux32_ps_strings {
216 u_int32_t ps_argvstr; /* first of 0 or more argument strings */
217 u_int ps_nargvstr; /* the number of argument strings */
218 u_int32_t ps_envstr; /* first of 0 or more environment strings */
219 u_int ps_nenvstr; /* the number of environment strings */
223 * If FreeBSD & Linux have a difference of opinion about what a trap
224 * means, deal with it here.
/*
 * translate_traps(signal, trap_code) is installed as sv_transtrap in
 * elf_linux_sysvec.
 * NOTE(review): only the signature and the initial "signal != SIGBUS" test
 * are present in this listing; the return type, the branch bodies and the
 * rest of the function are missing, so the actual translation is not
 * documented here.
 */
229 translate_traps(int signal, int trap_code)
231 if (signal != SIGBUS)
/*
 * elf_linux_fixup -- sv_fixup hook for Linux ELF32 images.
 *
 * Writes the ELF auxiliary vector with AUXARGS_ENTRY_32(), starting
 * argc + envc + 2 32-bit words above *stack_base (i.e. past the argv and
 * envp pointer slots and their two NULL terminators), then frees
 * imgp->auxargs, stores argc with suword32() and returns the stack base
 * through *stack_base.
 *
 * NOTE(review): this listing omits lines (return type, braces, the
 * declarations of base/args, and at least one statement between the free
 * and the argc store), so only the visible steps are described.
 */
245 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
249 Elf32_Addr *pos, *uplatform;
250 struct linux32_ps_strings *arginfo;
/*
 * The platform string lives below the signal trampoline, which itself sits
 * just below ps_strings (see the copyout calls in linux_copyout_strings()).
 * The tail of the uplatform expression is cut off in this listing.
 */
252 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
253 uplatform = (Elf32_Addr *)((caddr_t)arginfo - linux_szsigcode -
256 KASSERT(curthread->td_proc == imgp->proc,
257 ("unsafe elf_linux_fixup(), should be curproc"));
258 base = (Elf32_Addr *)*stack_base;
259 args = (Elf32_Auxargs *)imgp->auxargs;
260 pos = base + (imgp->args->argc + imgp->args->envc + 2);
262 AUXARGS_ENTRY_32(pos, LINUX_AT_HWCAP, cpu_feature);
265 * Do not export AT_CLKTCK when emulating Linux kernel prior to 2.4.0,
266 * as it has appeared in the 2.4.0-rc7 first time.
267 * Being exported, AT_CLKTCK is returned by sysconf(_SC_CLK_TCK),
268 * glibc falls back to the hard-coded CLK_TCK value when aux entry
270 * Also see linux_times() implementation.
272 if (linux_kernver(curthread) >= LINUX_KERNVER_2004000)
273 AUXARGS_ENTRY_32(pos, LINUX_AT_CLKTCK, stclohz);
274 AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
275 AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
276 AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
277 AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
278 AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
279 AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
280 AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
281 AUXARGS_ENTRY_32(pos, LINUX_AT_SECURE, 0);
/* The "effective" entries below are filled from the saved ids (cr_svuid / cr_svgid). */
282 AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
283 AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
284 AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
285 AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
286 AUXARGS_ENTRY_32(pos, LINUX_AT_PLATFORM, PTROUT(uplatform));
/* AT_EXECFD is emitted only when an exec descriptor was recorded. */
287 if (args->execfd != -1)
288 AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
/* AT_NULL terminates the vector. */
289 AUXARGS_ENTRY_32(pos, AT_NULL, 0);
/* The kernel copy of the aux args is no longer needed; clear the pointer after freeing it. */
291 free(imgp->auxargs, M_TEMP);
292 imgp->auxargs = NULL;
295 suword32(base, (uint32_t)imgp->args->argc);
296 *stack_base = (register_t *)base;
/* Offset added to the trampoline address when dispatching rt frames (see tf_rip below). */
300 extern unsigned long linux_sznonrtsigcode;
/*
 * linux_rt_sendsig -- deliver signal ksi->ksi_signo to handler `catcher`
 * using a Linux rt signal frame (struct l_rt_sigframe).
 *
 * linux_sendsig() calls this when the signal is a member of ps_siginfo.
 * The frame is assembled in the local `frame`, copied to user space at fp
 * with copyout(), and the trapframe is then rewritten so the thread resumes
 * in the signal trampoline with 32-bit user segment selectors.
 *
 * Locking visible here: the proc lock and ps_mtx are asserted held on entry;
 * ps_mtx is dropped before the frame is built and re-taken at the end.
 *
 * NOTE(review): this listing omits lines (return type, braces, several
 * declarations, the assignments of psp and regs, the else keyword of the
 * stack selection, and the body of the copyout failure path).
 */
303 linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
305 struct thread *td = curthread;
306 struct proc *p = td->td_proc;
308 struct trapframe *regs;
309 struct l_rt_sigframe *fp, frame;
314 sig = ksi->ksi_signo;
315 code = ksi->ksi_code;
316 PROC_LOCK_ASSERT(p, MA_OWNED);
318 mtx_assert(&psp->ps_mtx, MA_OWNED);
/* Remember whether the thread was already on the alternate stack at signal time. */
320 oonstack = sigonstack(regs->tf_rsp);
323 if (ldebug(rt_sendsig))
324 printf(ARGS(rt_sendsig, "%p, %d, %p, %u"),
325 catcher, sig, (void*)mask, code);
328 * Allocate space for the signal handler context.
/*
 * Use the top of the alternate stack when one is configured, not already in
 * use, and requested for this signal; the other visible assignment places
 * the frame one l_rt_sigframe below the current stack pointer.
 */
330 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
331 SIGISMEMBER(psp->ps_sigonstack, sig)) {
332 fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
333 td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
335 fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
336 mtx_unlock(&psp->ps_mtx);
339 * Build the argument list for the signal handler.
/* Translate the FreeBSD signal number through the ABI's signal table, if it has one. */
341 if (p->p_sysent->sv_sigtbl)
342 if (sig <= p->p_sysent->sv_sigsize)
343 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Zero the whole frame first so fields not set below are copied out as zero. */
345 bzero(&frame, sizeof(frame));
347 frame.sf_handler = PTROUT(catcher);
/* These two pointers refer to the user-space copy (fp), not the kernel-side frame. */
349 frame.sf_siginfo = PTROUT(&fp->sf_si);
350 frame.sf_ucontext = PTROUT(&fp->sf_sc);
352 /* Fill in POSIX parts */
353 ksiginfo_to_lsiginfo(ksi, &frame.sf_si, sig);
356 * Build the signal context to be used by sigreturn.
358 frame.sf_sc.uc_flags = 0; /* XXX ??? */
359 frame.sf_sc.uc_link = 0; /* XXX ??? */
361 frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
362 frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
363 frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
364 ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
367 bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
/* Snapshot the interrupted register state from the trapframe into the Linux context. */
369 frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0];
370 frame.sf_sc.uc_mcontext.sc_edi = regs->tf_rdi;
371 frame.sf_sc.uc_mcontext.sc_esi = regs->tf_rsi;
372 frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_rbp;
373 frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_rbx;
374 frame.sf_sc.uc_mcontext.sc_edx = regs->tf_rdx;
375 frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_rcx;
376 frame.sf_sc.uc_mcontext.sc_eax = regs->tf_rax;
377 frame.sf_sc.uc_mcontext.sc_eip = regs->tf_rip;
378 frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs;
379 frame.sf_sc.uc_mcontext.sc_gs = regs->tf_gs;
380 frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs;
381 frame.sf_sc.uc_mcontext.sc_es = regs->tf_es;
382 frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds;
383 frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
384 frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
385 frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss;
386 frame.sf_sc.uc_mcontext.sc_err = regs->tf_err;
387 frame.sf_sc.uc_mcontext.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
388 frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
391 if (ldebug(rt_sendsig))
392 printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
393 frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
394 td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
397 if (copyout(&frame, fp, sizeof(frame)) != 0) {
399 * Process has trashed its stack; give it an illegal
400 * instruction to halt it in its tracks.
403 if (ldebug(rt_sendsig))
404 printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
412 * Build context to run handler in.
414 regs->tf_rsp = PTROUT(fp);
/*
 * Resume at the trampoline: sv_szsigcode bytes below LINUX32_PS_STRINGS,
 * plus linux_sznonrtsigcode (linux_sendsig() uses the same base without
 * this offset).
 */
415 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
416 linux_sznonrtsigcode;
/* The handler starts with the trace and direction flags cleared. */
417 regs->tf_rflags &= ~(PSL_T | PSL_D);
418 regs->tf_cs = _ucode32sel;
419 regs->tf_ss = _udatasel;
420 regs->tf_ds = _udatasel;
421 regs->tf_es = _udatasel;
422 regs->tf_fs = _ufssel;
423 regs->tf_gs = _ugssel;
424 regs->tf_flags = TF_HASSEGS;
425 td->td_pcb->pcb_full_iret = 1;
/* Re-take ps_mtx, which was dropped above. */
427 mtx_lock(&psp->ps_mtx);
432 * Send an interrupt to process.
434 * Stack is set up to allow sigcode stored
435 * in u. to call routine, followed by kcall
436 * to sigreturn routine below. After sigreturn
437 * resets the signal mask, the stack, and the
438 * frame pointer, it returns to the user
/*
 * linux_sendsig -- sv_sendsig hook (see elf_linux_sysvec).
 *
 * If the signal is in ps_siginfo, delivery is handed to linux_rt_sendsig().
 * The rest of the function builds a non-rt struct l_sigframe in the local
 * `frame`, copies it out to fp, and rewrites the trapframe so the thread
 * resumes at the start of the signal trampoline with 32-bit user selectors.
 *
 * Locking visible here: the proc lock and ps_mtx are asserted held on entry;
 * ps_mtx is dropped before the frame is built and re-taken at the end.
 *
 * NOTE(review): this listing omits lines (return type, braces, several
 * declarations including lmask and i, the assignments of psp and regs, the
 * return after the rt hand-off, and the copyout failure body).
 */
442 linux_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
444 struct thread *td = curthread;
445 struct proc *p = td->td_proc;
447 struct trapframe *regs;
448 struct l_sigframe *fp, frame;
453 sig = ksi->ksi_signo;
454 code = ksi->ksi_code;
455 PROC_LOCK_ASSERT(p, MA_OWNED);
457 mtx_assert(&psp->ps_mtx, MA_OWNED);
458 if (SIGISMEMBER(psp->ps_siginfo, sig)) {
459 /* Signal handler installed with SA_SIGINFO. */
460 linux_rt_sendsig(catcher, ksi, mask);
/* Remember whether the thread was already on the alternate stack at signal time. */
465 oonstack = sigonstack(regs->tf_rsp);
469 printf(ARGS(sendsig, "%p, %d, %p, %u"),
470 catcher, sig, (void*)mask, code);
474 * Allocate space for the signal handler context.
/*
 * Use the top of the alternate stack when one is configured, not already in
 * use, and requested for this signal; the other visible assignment places
 * the frame one l_sigframe below the current stack pointer.
 */
476 if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
477 SIGISMEMBER(psp->ps_sigonstack, sig)) {
478 fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
479 td->td_sigstk.ss_size - sizeof(struct l_sigframe));
481 fp = (struct l_sigframe *)regs->tf_rsp - 1;
482 mtx_unlock(&psp->ps_mtx);
486 * Build the argument list for the signal handler.
/* Translate the FreeBSD signal number through the ABI's signal table, if it has one. */
488 if (p->p_sysent->sv_sigtbl)
489 if (sig <= p->p_sysent->sv_sigsize)
490 sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Zero the whole frame first so fields not set below are copied out as zero. */
492 bzero(&frame, sizeof(frame));
494 frame.sf_handler = PTROUT(catcher);
497 bsd_to_linux_sigset(mask, &lmask);
500 * Build the signal context to be used by sigreturn.
/* Word 0 of the Linux mask goes into the context; the remaining words go into sf_extramask below. */
502 frame.sf_sc.sc_mask = lmask.__bits[0];
503 frame.sf_sc.sc_gs = regs->tf_gs;
504 frame.sf_sc.sc_fs = regs->tf_fs;
505 frame.sf_sc.sc_es = regs->tf_es;
506 frame.sf_sc.sc_ds = regs->tf_ds;
507 frame.sf_sc.sc_edi = regs->tf_rdi;
508 frame.sf_sc.sc_esi = regs->tf_rsi;
509 frame.sf_sc.sc_ebp = regs->tf_rbp;
510 frame.sf_sc.sc_ebx = regs->tf_rbx;
511 frame.sf_sc.sc_edx = regs->tf_rdx;
512 frame.sf_sc.sc_ecx = regs->tf_rcx;
513 frame.sf_sc.sc_eax = regs->tf_rax;
514 frame.sf_sc.sc_eip = regs->tf_rip;
515 frame.sf_sc.sc_cs = regs->tf_cs;
516 frame.sf_sc.sc_eflags = regs->tf_rflags;
517 frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
518 frame.sf_sc.sc_ss = regs->tf_ss;
519 frame.sf_sc.sc_err = regs->tf_err;
520 frame.sf_sc.sc_cr2 = (u_int32_t)(uintptr_t)ksi->ksi_addr;
521 frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
523 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
524 frame.sf_extramask[i] = lmask.__bits[i+1];
526 if (copyout(&frame, fp, sizeof(frame)) != 0) {
528 * Process has trashed its stack; give it an illegal
529 * instruction to halt it in its tracks.
536 * Build context to run handler in.
538 regs->tf_rsp = PTROUT(fp);
/* Resume at the start of the trampoline, sv_szsigcode bytes below LINUX32_PS_STRINGS. */
539 regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
/* The handler starts with the trace and direction flags cleared. */
540 regs->tf_rflags &= ~(PSL_T | PSL_D);
541 regs->tf_cs = _ucode32sel;
542 regs->tf_ss = _udatasel;
543 regs->tf_ds = _udatasel;
544 regs->tf_es = _udatasel;
545 regs->tf_fs = _ufssel;
546 regs->tf_gs = _ugssel;
547 regs->tf_flags = TF_HASSEGS;
548 td->td_pcb->pcb_full_iret = 1;
/* Re-take ps_mtx, which was dropped above. */
550 mtx_lock(&psp->ps_mtx);
554 * System call to cleanup state after a signal
555 * has been taken. Reset signal mask and
556 * stack state from context left by sendsig (above).
557 * Return to previous pc and psl as specified by
558 * context left by sendsig. Check carefully to
559 * make sure that the user has not modified the
560 * psl to gain improper privileges or to cause
/*
 * linux_sigreturn -- undo linux_sendsig().
 *
 * Copies the l_sigframe in from args->sfp, validates the saved eflags and
 * %cs, installs the saved signal mask with kern_sigprocmask(), reloads the
 * trapframe from the saved context, and returns EJUSTRETURN.
 *
 * NOTE(review): this listing omits lines (return type, braces, several
 * declarations, the assignment of regs, and the return statements of the
 * three failure paths), so the error values returned are not documented.
 */
564 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
566 struct l_sigframe frame;
567 struct trapframe *regs;
576 if (ldebug(sigreturn))
577 printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
580 * The trampoline code hands us the sigframe.
581 * It is unsafe to keep track of it ourselves, in the event that a
582 * program jumps out of a signal handler.
584 if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
588 * Check for security violations.
/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
590 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
591 eflags = frame.sf_sc.sc_eflags;
593 * XXX do allow users to change the privileged flag PSL_RF. The
594 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
595 * sometimes set it there too. tf_eflags is kept in the signal
596 * context during signal handling and there is no other place
597 * to remember it, so the PSL_RF bit may be corrupted by the
598 * signal handler without us knowing. Corruption of the PSL_RF
599 * bit at worst causes one more or one less debugger trap, so
600 * allowing it is fairly harmless.
/* PSL_RF is masked out of both sides, so a change to it alone is not a violation. */
602 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
606 * Don't allow users to load a valid privileged %cs. Let the
607 * hardware check for invalid selectors, excess privilege in
608 * other selectors, invalid %eip's and invalid %esp's.
/* True when the selector's privilege level is the user level. */
610 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
/* A non-user %cs raises SIGBUS (BUS_OBJERR, T_PROTFLT) at the current rip. */
611 if (!CS_SECURE(frame.sf_sc.sc_cs)) {
612 ksiginfo_init_trap(&ksi);
613 ksi.ksi_signo = SIGBUS;
614 ksi.ksi_code = BUS_OBJERR;
615 ksi.ksi_trapno = T_PROTFLT;
616 ksi.ksi_addr = (void *)regs->tf_rip;
617 trapsignal(td, &ksi);
/* Reassemble the Linux mask from sc_mask plus sf_extramask (mirror of linux_sendsig()). */
621 lmask.__bits[0] = frame.sf_sc.sc_mask;
622 for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
623 lmask.__bits[i+1] = frame.sf_extramask[i];
624 linux_to_bsd_sigset(&lmask, &bmask);
625 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
628 * Restore signal context.
630 regs->tf_rdi = frame.sf_sc.sc_edi;
631 regs->tf_rsi = frame.sf_sc.sc_esi;
632 regs->tf_rbp = frame.sf_sc.sc_ebp;
633 regs->tf_rbx = frame.sf_sc.sc_ebx;
634 regs->tf_rdx = frame.sf_sc.sc_edx;
635 regs->tf_rcx = frame.sf_sc.sc_ecx;
636 regs->tf_rax = frame.sf_sc.sc_eax;
637 regs->tf_rip = frame.sf_sc.sc_eip;
638 regs->tf_cs = frame.sf_sc.sc_cs;
639 regs->tf_ds = frame.sf_sc.sc_ds;
640 regs->tf_es = frame.sf_sc.sc_es;
641 regs->tf_fs = frame.sf_sc.sc_fs;
642 regs->tf_gs = frame.sf_sc.sc_gs;
643 regs->tf_rflags = eflags;
644 regs->tf_rsp = frame.sf_sc.sc_esp_at_signal;
645 regs->tf_ss = frame.sf_sc.sc_ss;
646 td->td_pcb->pcb_full_iret = 1;
/* NOTE(review): EJUSTRETURN presumably keeps the syscall return path from overwriting the registers just restored -- confirm. */
648 return (EJUSTRETURN);
652 * System call to cleanup state after a signal
653 * has been taken. Reset signal mask and
654 * stack state from context left by rt_sendsig (above).
655 * Return to previous pc and psl as specified by
656 * context left by rt_sendsig. Check carefully to
657 * make sure that the user has not modified the
658 * psl to gain improper privileges or to cause
/*
 * linux_rt_sigreturn -- undo linux_rt_sendsig().
 *
 * Copies the l_ucontext in from args->ucp, validates the saved eflags and
 * %cs, installs uc_sigmask with kern_sigprocmask(), reloads the trapframe
 * from uc_mcontext, re-applies the saved alternate-stack settings with
 * kern_sigaltstack() (result ignored), and returns EJUSTRETURN.
 *
 * NOTE(review): this listing omits lines (return type, braces, several
 * declarations, the assignments of regs and lss, and the return statements
 * of the failure paths).  lss presumably points at uc.uc_stack -- confirm.
 */
662 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
664 struct l_ucontext uc;
665 struct l_sigcontext *context;
669 struct trapframe *regs;
676 if (ldebug(rt_sigreturn))
677 printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
680 * The trampoline code hands us the ucontext.
681 * It is unsafe to keep track of it ourselves, in the event that a
682 * program jumps out of a signal handler.
684 if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
687 context = &uc.uc_mcontext;
690 * Check for security violations.
/* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
692 #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
693 eflags = context->sc_eflags;
695 * XXX do allow users to change the privileged flag PSL_RF. The
696 * cpu sets PSL_RF in tf_eflags for faults. Debuggers should
697 * sometimes set it there too. tf_eflags is kept in the signal
698 * context during signal handling and there is no other place
699 * to remember it, so the PSL_RF bit may be corrupted by the
700 * signal handler without us knowing. Corruption of the PSL_RF
701 * bit at worst causes one more or one less debugger trap, so
702 * allowing it is fairly harmless.
/* PSL_RF is masked out of both sides, so a change to it alone is not a violation. */
704 if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
708 * Don't allow users to load a valid privileged %cs. Let the
709 * hardware check for invalid selectors, excess privilege in
710 * other selectors, invalid %eip's and invalid %esp's.
/* True when the selector's privilege level is the user level. */
712 #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
/* A non-user %cs raises SIGBUS (BUS_OBJERR, T_PROTFLT) at the current rip. */
713 if (!CS_SECURE(context->sc_cs)) {
714 ksiginfo_init_trap(&ksi);
715 ksi.ksi_signo = SIGBUS;
716 ksi.ksi_code = BUS_OBJERR;
717 ksi.ksi_trapno = T_PROTFLT;
718 ksi.ksi_addr = (void *)regs->tf_rip;
719 trapsignal(td, &ksi);
723 linux_to_bsd_sigset(&uc.uc_sigmask, &bmask);
724 kern_sigprocmask(td, SIG_SETMASK, &bmask, NULL, 0);
727 * Restore signal context
729 regs->tf_gs = context->sc_gs;
730 regs->tf_fs = context->sc_fs;
731 regs->tf_es = context->sc_es;
732 regs->tf_ds = context->sc_ds;
733 regs->tf_rdi = context->sc_edi;
734 regs->tf_rsi = context->sc_esi;
735 regs->tf_rbp = context->sc_ebp;
736 regs->tf_rbx = context->sc_ebx;
737 regs->tf_rdx = context->sc_edx;
738 regs->tf_rcx = context->sc_ecx;
739 regs->tf_rax = context->sc_eax;
740 regs->tf_rip = context->sc_eip;
741 regs->tf_cs = context->sc_cs;
742 regs->tf_rflags = eflags;
743 regs->tf_rsp = context->sc_esp_at_signal;
744 regs->tf_ss = context->sc_ss;
745 td->td_pcb->pcb_full_iret = 1;
748 * call sigaltstack & ignore results..
/* Convert the Linux stack descriptor to the native stack_t fields. */
751 ss.ss_sp = PTRIN(lss->ss_sp);
752 ss.ss_size = lss->ss_size;
753 ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
756 if (ldebug(rt_sigreturn))
757 printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
758 ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
760 (void)kern_sigaltstack(td, &ss, NULL);
762 return (EJUSTRETURN);
/*
 * linux32_fetch_syscall_args -- sv_fetch_syscall_args hook.
 *
 * Loads the six syscall arguments from the trapframe registers rbx, rcx,
 * rdx, rsi, rdi and rbp (in that order) and the syscall number from rax.
 * A number at or beyond sv_size is dispatched to table entry 0; otherwise
 * the matching sv_table entry is used.  td_retval[] is preset to 0 and the
 * current rdx.
 *
 * NOTE(review): this listing omits lines (return type, braces, the
 * declaration/assignment of p, the else keyword and the return statement).
 */
766 linux32_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
769 struct trapframe *frame;
772 frame = td->td_frame;
774 sa->args[0] = frame->tf_rbx;
775 sa->args[1] = frame->tf_rcx;
776 sa->args[2] = frame->tf_rdx;
777 sa->args[3] = frame->tf_rsi;
778 sa->args[4] = frame->tf_rdi;
779 sa->args[5] = frame->tf_rbp; /* Unconfirmed */
780 sa->code = frame->tf_rax;
/* Out-of-range syscall numbers fall back to slot 0 of the table. */
782 if (sa->code >= p->p_sysent->sv_size)
783 sa->callp = &p->p_sysent->sv_table[0];
785 sa->callp = &p->p_sysent->sv_table[sa->code];
786 sa->narg = sa->callp->sy_narg;
788 td->td_retval[0] = 0;
789 td->td_retval[1] = frame->tf_rdx;
795 * If a linux binary is exec'ing something, try this image activator
796 * first. We override standard shell script execution in order to
797 * be able to modify the interpreter path. We only do this if a linux
798 * binary is doing the exec, so we do not create an EXEC module for it.
800 static int exec_linux_imgact_try(struct image_params *iparams);
/*
 * exec_linux_imgact_try -- sv_imgact_try hook.
 *
 * When the image header starts with SHELLMAGIC ("#!"), runs
 * exec_shell_imgact() and, on success, asks linux_emul_convpath() for an
 * alternate path to the interpreter; a result that fits in MAXSHELLCMDLEN
 * bytes (including the NUL) is copied over imgp->interpreter_name.
 *
 * NOTE(review): this listing omits lines (return type, braces, the
 * declarations of error/rpath/len, the end of the convpath call, any check
 * that rpath is non-NULL before strlen(), the freeing of rpath, and the
 * return statement), so the return value is not documented here.
 */
803 exec_linux_imgact_try(struct image_params *imgp)
805 const char *head = (const char *)imgp->image_header;
810 * The interpreter for shell scripts run from a linux binary needs
811 * to be located in /compat/linux if possible in order to recursively
812 * maintain linux path emulation.
814 if (((const short *)head)[0] == SHELLMAGIC) {
816 * Run our normal shell image activator. If it succeeds attempt
817 * to use the alternate path for the interpreter. If an
818 * alternate * path is found, use our stringspace to store it.
820 if ((error = exec_shell_imgact(imgp)) == 0) {
821 linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
822 imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0,
/* len counts the terminating NUL, so the <= test covers the whole copied string. */
825 len = strlen(rpath) + 1;
827 if (len <= MAXSHELLCMDLEN) {
828 memcpy(imgp->interpreter_name, rpath,
839 * Clear registers on exec
840 * XXX copied from ia32_signal.c.
/*
 * exec_linux_setregs -- sv_setregs hook.
 *
 * Resets the thread's register state for a freshly exec'ed 32-bit image:
 * zeroes the FS/kernel-GS base MSRs, sets the initial FPU control word to
 * __LINUX_NPXCW__, zeroes the trapframe and then fills in the entry point,
 * stack pointer, flags, 32-bit user selectors and rbx = ps_strings, and
 * marks the pcb as 32-bit with a full iret on return.
 *
 * NOTE(review): this listing omits lines (return type, the K&R parameter
 * declarations, braces, and the statements around the md_ldt test and
 * dt_lock), so LDT handling is not documented here.
 */
843 exec_linux_setregs(td, entry, stack, ps_strings)
849 struct trapframe *regs = td->td_frame;
850 struct pcb *pcb = td->td_pcb;
853 if (td->td_proc->p_md.md_ldt != NULL)
856 mtx_unlock(&dt_lock);
859 wrmsr(MSR_FSBASE, 0);
860 wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */
864 pcb->pcb_initial_fpucw = __LINUX_NPXCW__;
866 bzero((char *)regs, sizeof(struct trapframe));
867 regs->tf_rip = entry;
868 regs->tf_rsp = stack;
/*
 * NOTE(review): regs was zeroed by the bzero() above, so
 * (regs->tf_rflags & PSL_T) is always 0 here and the result is just
 * PSL_USER; a pre-existing trace flag is not carried over.
 */
869 regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
870 regs->tf_gs = _ugssel;
871 regs->tf_fs = _ufssel;
872 regs->tf_es = _udatasel;
873 regs->tf_ds = _udatasel;
874 regs->tf_ss = _udatasel;
875 regs->tf_flags = TF_HASSEGS;
876 regs->tf_cs = _ucode32sel;
/* The ps_strings address is handed to the new image in rbx. */
877 regs->tf_rbx = ps_strings;
878 load_cr0(rcr0() | CR0_MP | CR0_TS);
881 /* Do full restore on return so that we can change to a different %cs */
882 pcb->pcb_flags |= PCB_32BIT;
883 pcb->pcb_flags &= ~PCB_GS32BIT;
884 pcb->pcb_full_iret = 1;
885 td->td_retval[1] = 0;
889 * XXX copied from ia32_sysvec.c.
/*
 * linux_copyout_strings -- sv_copyout_strings hook.
 *
 * Lays out the top of the new process's stack below LINUX32_PS_STRINGS:
 * the signal trampoline, then the platform string, then (after
 * SPARE_USRSPACE and rounding) the argument and environment strings, with
 * the 32-bit argv/envp pointer vector (and room for the aux vector when
 * applicable) below them.  It fills in the ps_strings record and returns
 * the vector start as the initial stack base.
 *
 * NOTE(review): the results of copyout() and suword32() are ignored in
 * every visible call, so a faulting user address goes unreported here.
 * This listing also omits lines (return type, braces, declarations of
 * vectp/argc/envc, the auxargs if/else, the assignment of stack_base, the
 * loop bodies that advance destp, and the final NULL store).
 */
892 linux_copyout_strings(struct image_params *imgp)
896 char *stringp, *destp;
897 u_int32_t *stack_base;
898 struct linux32_ps_strings *arginfo;
901 * Calculate string base and vector table pointers.
902 * Also deal with signal trampoline code for this exec type.
904 arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
905 destp = (caddr_t)arginfo - linux_szsigcode - SPARE_USRSPACE -
906 linux_szplatform - roundup((ARG_MAX - imgp->args->stringspace),
/* The trampoline goes immediately below ps_strings; the sendsig functions compute the same address. */
912 copyout(imgp->proc->p_sysent->sv_sigcode,
913 ((caddr_t)arginfo - linux_szsigcode), linux_szsigcode);
916 * Install LINUX_PLATFORM
918 copyout(linux_platform, ((caddr_t)arginfo - linux_szsigcode -
919 linux_szplatform), linux_szplatform);
922 * If we have a valid auxargs ptr, prepare some room
927 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
928 * lower compatibility.
/* Keep a caller-supplied auxarg_size; otherwise default to LINUX_AT_COUNT * 2 slots. */
930 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
931 (LINUX_AT_COUNT * 2);
933 * The '+ 2' is for the null pointers at the end of each of
934 * the arg and env vector sets,and imgp->auxarg_size is room
935 * for argument of Runtime loader.
937 vectp = (u_int32_t *) (destp - (imgp->args->argc +
938 imgp->args->envc + 2 + imgp->auxarg_size) *
943 * The '+ 2' is for the null pointers at the end of each of
944 * the arg and env vector sets
946 vectp = (u_int32_t *)(destp - (imgp->args->argc +
947 imgp->args->envc + 2) * sizeof(u_int32_t));
950 * vectp also becomes our initial stack base
954 stringp = imgp->args->begin_argv;
955 argc = imgp->args->argc;
956 envc = imgp->args->envc;
958 * Copy out strings - arguments and environment.
/* One bulk copy of the used part of the string area (ARG_MAX minus the unused stringspace). */
960 copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
963 * Fill in "ps_strings" struct for ps, w, etc.
965 suword32(&arginfo->ps_argvstr, (uint32_t)(intptr_t)vectp);
966 suword32(&arginfo->ps_nargvstr, argc);
969 * Fill in argument portion of vector table.
/* Each slot gets the user address of the next string; stringp walks the kernel copy to find string ends. */
971 for (; argc > 0; --argc) {
972 suword32(vectp++, (uint32_t)(intptr_t)destp);
973 while (*stringp++ != 0)
978 /* a null vector table pointer separates the argp's from the envp's */
979 suword32(vectp++, 0);
981 suword32(&arginfo->ps_envstr, (uint32_t)(intptr_t)vectp);
982 suword32(&arginfo->ps_nenvstr, envc);
985 * Fill in environment portion of vector table.
987 for (; envc > 0; --envc) {
988 suword32(vectp++, (uint32_t)(intptr_t)destp);
989 while (*stringp++ != 0)
994 /* end of vector table is a null pointer */
997 return ((register_t *)stack_base);
/*
 * compat.linux32 sysctl node and three read-write unsigned long tunables
 * (maxdsiz, maxssiz, maxvmem), initialised from the LINUX32_MAX* constants.
 * linux32_fixlimit() clamps resource limits to them when they are non-zero;
 * linux32_maxssiz is also exported through sv_maxssiz.  The description
 * strings are empty.
 */
1000 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
1001 "32-bit Linux emulation");
1003 static u_long linux32_maxdsiz = LINUX32_MAXDSIZ;
1004 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
1005 &linux32_maxdsiz, 0, "");
1006 static u_long linux32_maxssiz = LINUX32_MAXSSIZ;
1007 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
1008 &linux32_maxssiz, 0, "");
1009 static u_long linux32_maxvmem = LINUX32_MAXVMEM;
1010 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
1011 &linux32_maxvmem, 0, "");
/*
 * linux32_fixlimit -- sv_fixlimit hook.
 *
 * Clamps both rl->rlim_cur and rl->rlim_max to one of the linux32_max*
 * tunables.  A tunable set to 0 disables clamping for its group.
 *
 * NOTE(review): the switch on `which` and its case labels are missing from
 * this listing; going by the tunable names, the three groups presumably
 * correspond to the data-size, stack-size and virtual-memory limits --
 * confirm.  Return type and braces are missing as well.
 */
1014 linux32_fixlimit(struct rlimit *rl, int which)
1019 if (linux32_maxdsiz != 0) {
1020 if (rl->rlim_cur > linux32_maxdsiz)
1021 rl->rlim_cur = linux32_maxdsiz;
1022 if (rl->rlim_max > linux32_maxdsiz)
1023 rl->rlim_max = linux32_maxdsiz;
1027 if (linux32_maxssiz != 0) {
1028 if (rl->rlim_cur > linux32_maxssiz)
1029 rl->rlim_cur = linux32_maxssiz;
1030 if (rl->rlim_max > linux32_maxssiz)
1031 rl->rlim_max = linux32_maxssiz;
1035 if (linux32_maxvmem != 0) {
1036 if (rl->rlim_cur > linux32_maxvmem)
1037 rl->rlim_cur = linux32_maxvmem;
1038 if (rl->rlim_max > linux32_maxvmem)
1039 rl->rlim_max = linux32_maxvmem;
/*
 * System entry vector for "Linux ELF32" processes.  It ties together the
 * tables and hooks defined in this file: the syscall table, the signal and
 * errno translation tables, the trap translator, the exec-time stack fixup
 * and string copyout, signal delivery, register setup, limit clamping and
 * syscall argument fetching.  Both brand descriptors below point at it.
 * NOTE(review): at least one member line and the closing brace are missing
 * from this listing.
 */
1045 struct sysentvec elf_linux_sysvec = {
1046 .sv_size = LINUX_SYS_MAXSYSCALL,
1047 .sv_table = linux_sysent,
1049 .sv_sigsize = LINUX_SIGTBLSZ,
1050 .sv_sigtbl = bsd_to_linux_signal,
1051 .sv_errsize = ELAST + 1,
1052 .sv_errtbl = bsd_to_linux_errno,
1053 .sv_transtrap = translate_traps,
1054 .sv_fixup = elf_linux_fixup,
1055 .sv_sendsig = linux_sendsig,
1056 .sv_sigcode = linux_sigcode,
1057 .sv_szsigcode = &linux_szsigcode,
1058 .sv_prepsyscall = NULL,
1059 .sv_name = "Linux ELF32",
1060 .sv_coredump = elf32_coredump,
1061 .sv_imgact_try = exec_linux_imgact_try,
1062 .sv_minsigstksz = LINUX_MINSIGSTKSZ,
1063 .sv_pagesize = PAGE_SIZE,
1064 .sv_minuser = VM_MIN_ADDRESS,
/* The user address space ends at the Linux32 stack top. */
1065 .sv_maxuser = LINUX32_USRSTACK,
1066 .sv_usrstack = LINUX32_USRSTACK,
1067 .sv_psstrings = LINUX32_PS_STRINGS,
1068 .sv_stackprot = VM_PROT_ALL,
1069 .sv_copyout_strings = linux_copyout_strings,
1070 .sv_setregs = exec_linux_setregs,
1071 .sv_fixlimit = linux32_fixlimit,
1072 .sv_maxssiz = &linux32_maxssiz,
1073 .sv_flags = SV_ABI_LINUX | SV_ILP32 | SV_IA32,
1074 .sv_set_syscall_retval = cpu_set_syscall_retval,
1075 .sv_fetch_syscall_args = linux32_fetch_syscall_args,
1076 .sv_syscallnames = NULL,
/* Vendor name and descriptor tag matched against an ELF brand note. */
1079 static char GNU_ABI_VENDOR[] = "GNU";
1080 static int GNULINUX_ABI_DESC = 0;
/*
 * linux32_trans_osrel -- brand-note callback (see linux32_brandnote).
 *
 * Locates the note descriptor right after the note header and its name
 * (name size rounded up to sizeof(Elf32_Addr)), requires desc[0] to equal
 * GNULINUX_ABI_DESC, and encodes desc[1..3] into *osrel as
 * desc[1] * 1000000 + desc[2] * 1000 + desc[3].
 *
 * NOTE(review): this listing omits lines (return type, braces, the
 * declaration of p and both return statements), so the boolean results are
 * not documented here.
 */
1083 linux32_trans_osrel(const Elf_Note *note, int32_t *osrel)
1085 const Elf32_Word *desc;
1088 p = (uintptr_t)(note + 1);
1089 p += roundup2(note->n_namesz, sizeof(Elf32_Addr));
1091 desc = (const Elf32_Word *)p;
1092 if (desc[0] != GNULINUX_ABI_DESC)
1096 * For linux we encode osrel as follows (see linux_mib.c):
1097 * VVVMMMIII (version, major, minor), see linux_mib.c.
1099 *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3];
/*
 * Brand note descriptor shared by both Linux brands: a note from vendor
 * "GNU" (name size includes the NUL, since sizeof is taken of the array)
 * with a 16-byte descriptor, whose OS release is decoded by
 * linux32_trans_osrel() (BN_TRANSLATE_OSREL).
 * NOTE(review): one member line and the closing brace are missing from this
 * listing.
 */
1104 static Elf_Brandnote linux32_brandnote = {
1105 .hdr.n_namesz = sizeof(GNU_ABI_VENDOR),
1106 .hdr.n_descsz = 16, /* XXX at least 16 */
1108 .vendor = GNU_ABI_VENDOR,
1109 .flags = BN_TRANSLATE_OSREL,
1110 .trans_osrel = linux32_trans_osrel
/*
 * ELF brand for Linux binaries whose interpreter is /lib/ld-linux.so.1.
 * Uses the /compat/linux emulation path, elf_linux_sysvec and the shared
 * brand note.  Differs from linux_glibc2brand only in interp_path.
 * NOTE(review): one member line and the closing brace are missing from this
 * listing.
 */
1113 static Elf32_Brandinfo linux_brand = {
1114 .brand = ELFOSABI_LINUX,
1116 .compat_3_brand = "Linux",
1117 .emul_path = "/compat/linux",
1118 .interp_path = "/lib/ld-linux.so.1",
1119 .sysvec = &elf_linux_sysvec,
1120 .interp_newpath = NULL,
1121 .brand_note = &linux32_brandnote,
1122 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * ELF brand for Linux binaries whose interpreter is /lib/ld-linux.so.2.
 * Uses the /compat/linux emulation path, elf_linux_sysvec and the shared
 * brand note.  Differs from linux_brand only in interp_path.
 * NOTE(review): one member line and the closing brace are missing from this
 * listing.
 */
1125 static Elf32_Brandinfo linux_glibc2brand = {
1126 .brand = ELFOSABI_LINUX,
1128 .compat_3_brand = "Linux",
1129 .emul_path = "/compat/linux",
1130 .interp_path = "/lib/ld-linux.so.2",
1131 .sysvec = &elf_linux_sysvec,
1132 .interp_newpath = NULL,
1133 .brand_note = &linux32_brandnote,
1134 .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
/*
 * NULL-terminated list of brands walked by linux_elf_modevent() (its loops
 * stop at the first NULL entry).
 * NOTE(review): the initializer entries and closing brace are missing from
 * this listing; presumably &linux_brand and &linux_glibc2brand -- confirm.
 */
1137 Elf32_Brandinfo *linux_brandlist[] = {
/*
 * linux_elf_modevent -- module event handler for the Linux ELF emulator.
 *
 * Visible load path: insert every brand in linux_brandlist with
 * elf32_insert_brand_entry(); then register the ioctl and device handlers,
 * initialise the emuldata and futex locks and the futex list, register the
 * process_exit / schedtail / process_exec event handlers, compute
 * linux_szplatform, register the jail OSD hooks and set stclohz.
 *
 * Visible unload path: check every brand with elf32_brand_inuse(), remove
 * the brand entries, then unregister the handlers, destroy the locks,
 * deregister the event handlers and the jail OSD hooks.
 *
 * NOTE(review): this listing omits lines (return type, braces, the switch
 * on `type` with its case labels, the error bookkeeping between steps, the
 * conditions selecting between the success and failure printf() calls, and
 * the return statement), so error handling is not documented here.
 */
1144 linux_elf_modevent(module_t mod, int type, void *data)
1146 Elf32_Brandinfo **brandinfo;
1148 struct linux_ioctl_handler **lihp;
1149 struct linux_device_handler **ldhp;
1155 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1157 if (elf32_insert_brand_entry(*brandinfo) < 0)
1160 SET_FOREACH(lihp, linux_ioctl_handler_set)
1161 linux_ioctl_register_handler(*lihp);
1162 SET_FOREACH(ldhp, linux_device_handler_set)
1163 linux_device_register_handler(*ldhp);
1164 mtx_init(&emul_lock, "emuldata lock", NULL, MTX_DEF);
1165 sx_init(&emul_shared_lock, "emuldata->shared lock");
1166 LIST_INIT(&futex_list);
1167 mtx_init(&futex_mtx, "ftllk", NULL, MTX_DEF);
/* Keep the returned tags so the unload path can deregister exactly these handlers. */
1168 linux_exit_tag = EVENTHANDLER_REGISTER(process_exit,
1169 linux_proc_exit, NULL, 1000);
1170 linux_schedtail_tag = EVENTHANDLER_REGISTER(schedtail,
1171 linux_schedtail, NULL, 1000);
1172 linux_exec_tag = EVENTHANDLER_REGISTER(process_exec,
1173 linux_proc_exec, NULL, 1000);
/* Size of the platform string including its NUL, rounded up (alignment argument is cut off in this listing). */
1174 linux_szplatform = roundup(strlen(linux_platform) + 1,
1176 linux_osd_jail_register();
/* Clock tick rate exported as LINUX_AT_CLKTCK: stathz when non-zero, otherwise hz. */
1177 stclohz = (stathz ? stathz : hz);
1179 printf("Linux ELF exec handler installed\n");
1181 printf("cannot insert Linux ELF brand handler\n");
1184 for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
1186 if (elf32_brand_inuse(*brandinfo))
1189 for (brandinfo = &linux_brandlist[0];
1190 *brandinfo != NULL; ++brandinfo)
1191 if (elf32_remove_brand_entry(*brandinfo) < 0)
1195 SET_FOREACH(lihp, linux_ioctl_handler_set)
1196 linux_ioctl_unregister_handler(*lihp);
1197 SET_FOREACH(ldhp, linux_device_handler_set)
1198 linux_device_unregister_handler(*ldhp);
1199 mtx_destroy(&emul_lock);
1200 sx_destroy(&emul_shared_lock);
1201 mtx_destroy(&futex_mtx);
1202 EVENTHANDLER_DEREGISTER(process_exit, linux_exit_tag);
1203 EVENTHANDLER_DEREGISTER(schedtail, linux_schedtail_tag);
1204 EVENTHANDLER_DEREGISTER(process_exec, linux_exec_tag);
1205 linux_osd_jail_deregister();
1207 printf("Linux ELF exec handler removed\n");
1209 printf("Could not deinstall ELF interpreter entry\n");
/*
 * Module descriptor and its registration under the name linuxelf at
 * SI_SUB_EXEC / SI_ORDER_ANY.
 * NOTE(review): the initializer fields and closing brace are missing from
 * this listing; presumably the event handler is linux_elf_modevent --
 * confirm.
 */
1217 static moduledata_t linux_elf_mod = {
1223 DECLARE_MODULE_TIED(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);