2 * Copyright (c) 2012 Will Drewry <wad@dataspill.org>
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 * Uncomment the SANDBOX_SECCOMP_FILTER_DEBUG macro below to help diagnose
19 * filter breakage during development. *Do not* use this in production,
20 * as it relies on making library calls that are unsafe in signal context.
22 * Instead, on live systems, auditctl(8) may be used to monitor failures.
24 * auditctl -a task,always -F uid=<privsep uid>
/*
 * NOTE(review): this define is active in this file. With it set, the filter
 * failure action becomes SECCOMP_RET_TRAP and a SIGSYS handler that calls
 * snprintf() and mm_log_handler() is installed in the sandboxed child. The
 * header comment warns against using this in production -- confirm that
 * leaving it enabled is intentional.
 */
26 #define SANDBOX_SECCOMP_FILTER_DEBUG 1
30 * For older toolchains, it may be necessary to use the kernel headers directly.
33 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
34 # include <asm/siginfo.h>
35 # define __have_siginfo_t 1
36 # define __have_sigval_t 1
37 # define __have_sigevent_t 1
38 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
43 #ifdef SANDBOX_SECCOMP_FILTER
45 #include <sys/types.h>
46 #include <sys/resource.h>
47 #include <sys/prctl.h>
49 #include <sys/syscall.h>
51 #include <linux/net.h>
52 #include <linux/audit.h>
53 #include <linux/filter.h>
54 #include <linux/seccomp.h>
57 #include <asm/unistd.h>
59 #include <asm/zcrypt.h>
65 #include <stddef.h> /* for offsetof */
72 #include "ssh-sandbox.h"
75 /* Linux seccomp_filter sandbox */
/*
 * SECCOMP_FILTER_FAIL is the filter return value used for an unexpected
 * architecture and for any syscall that matches no rule in preauth_insns.
 */
76 #define SECCOMP_FILTER_FAIL SECCOMP_RET_KILL
78 /* Use a signal handler to emit violations when debugging */
/*
 * In debug builds the failure action is SECCOMP_RET_TRAP instead, so the
 * SIGSYS handler installed by ssh_sandbox_child_debugging() can report the
 * offending syscall.
 */
79 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
80 # undef SECCOMP_FILTER_FAIL
81 # define SECCOMP_FILTER_FAIL SECCOMP_RET_TRAP
82 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
/*
 * Byte offsets of the low and high 32-bit words inside one
 * seccomp_data.args[] entry. The SC_ALLOW_ARG* macros load and test each
 * word separately (BPF_W loads), so the offsets depend on host byte order.
 * An unrecognised byte order is a compile-time error.
 */
84 #if __BYTE_ORDER == __LITTLE_ENDIAN
85 # define ARG_LO_OFFSET 0
86 # define ARG_HI_OFFSET sizeof(uint32_t)
87 #elif __BYTE_ORDER == __BIG_ENDIAN
88 # define ARG_LO_OFFSET sizeof(uint32_t)
89 # define ARG_HI_OFFSET 0
91 #error "Unknown endianness"
94 /* Simple helpers to avoid manual errors (but larger BPF programs). */
/*
 * SC_DENY(_nr, _errno): expands to two BPF instructions. If the accumulator
 * (the syscall number) equals _nr, return SECCOMP_RET_ERRNO combined with
 * _errno; otherwise jump over that return and continue with the next rule.
 */
95 #define SC_DENY(_nr, _errno) \
96 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 1), \
97 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO|(_errno))
/*
 * SC_ALLOW(_nr): expands to two BPF instructions. If the accumulator (the
 * syscall number) equals _nr, return SECCOMP_RET_ALLOW; otherwise jump over
 * that return and continue with the next rule.
 */
98 #define SC_ALLOW(_nr) \
99 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 1), \
100 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/*
 * SC_ALLOW_ARG(_nr, _arg_nr, _arg_val): allow syscall _nr only when argument
 * number _arg_nr equals _arg_val. The argument is compared as two 32-bit
 * words, low word first, then high word.
 *
 * Jump layout (seven instructions in total):
 *  - syscall number mismatch: skip all six following instructions; the
 *    accumulator still holds the syscall number for the next rule.
 *  - low-word or high-word mismatch: land on the final load, which puts the
 *    syscall number back into the accumulator before the next rule runs.
 */
101 #define SC_ALLOW_ARG(_nr, _arg_nr, _arg_val) \
102 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 6), \
103 /* load and test syscall argument, low word */ \
104 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
105 offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_LO_OFFSET), \
106 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, \
107 ((_arg_val) & 0xFFFFFFFF), 0, 3), \
108 /* load and test syscall argument, high word */ \
109 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
110 offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_HI_OFFSET), \
111 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, \
112 (((uint32_t)((uint64_t)(_arg_val) >> 32)) & 0xFFFFFFFF), 0, 1), \
113 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), \
114 /* reload syscall number; all rules expect it in accumulator */ \
115 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
116 offsetof(struct seccomp_data, nr))
117 /* Allow if syscall argument contains only values in mask */
/*
 * SC_ALLOW_ARG_MASK(_nr, _arg_nr, _arg_mask): each 32-bit word of argument
 * _arg_nr is ANDed with the complement of the matching word of _arg_mask.
 * The call is allowed only when both results are zero, i.e. the argument has
 * no bit set outside the mask.
 *
 * Jump layout (nine instructions in total):
 *  - syscall number mismatch: skip all eight following instructions; the
 *    accumulator still holds the syscall number for the next rule.
 *  - a word with bits outside the mask: land on the final load, which puts
 *    the syscall number back into the accumulator before the next rule runs.
 */
118 #define SC_ALLOW_ARG_MASK(_nr, _arg_nr, _arg_mask) \
119 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (_nr), 0, 8), \
120 /* load, mask and test syscall argument, low word */ \
121 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
122 offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_LO_OFFSET), \
123 BPF_STMT(BPF_ALU+BPF_AND+BPF_K, ~((_arg_mask) & 0xFFFFFFFF)), \
124 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 4), \
125 /* load, mask and test syscall argument, high word */ \
126 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
127 offsetof(struct seccomp_data, args[(_arg_nr)]) + ARG_HI_OFFSET), \
128 BPF_STMT(BPF_ALU+BPF_AND+BPF_K, \
129 ~(((uint32_t)((uint64_t)(_arg_mask) >> 32)) & 0xFFFFFFFF)), \
130 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0, 0, 1), \
131 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), \
132 /* reload syscall number; all rules expect it in accumulator */ \
133 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
134 offsetof(struct seccomp_data, nr))
136 /* Syscall filtering set for preauth. */
/*
 * Layout of the BPF program: check the architecture, load the syscall
 * number, run the deny rules (each fails the call with EACCES), run the
 * allow rules, and finally return SECCOMP_FILTER_FAIL for anything that no
 * rule matched. Every rule expects the syscall number in the accumulator.
 */
137 static const struct sock_filter preauth_insns[] = {
138 /* Ensure the syscall arch convention is as expected. */
139 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
140 offsetof(struct seccomp_data, arch)),
/* On a match, skip the failure return that follows. */
141 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_AUDIT_ARCH, 1, 0),
142 BPF_STMT(BPF_RET+BPF_K, SECCOMP_FILTER_FAIL),
143 /* Load the syscall number for checking. */
144 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
145 offsetof(struct seccomp_data, nr)),
147 /* Syscalls to non-fatally deny */
149 SC_DENY(__NR_lstat, EACCES),
152 SC_DENY(__NR_lstat64, EACCES),
155 SC_DENY(__NR_fstat, EACCES),
158 SC_DENY(__NR_fstat64, EACCES),
160 #ifdef __NR_fstatat64
161 SC_DENY(__NR_fstatat64, EACCES),
164 SC_DENY(__NR_open, EACCES),
167 SC_DENY(__NR_openat, EACCES),
169 #ifdef __NR_newfstatat
170 SC_DENY(__NR_newfstatat, EACCES),
173 SC_DENY(__NR_stat, EACCES),
176 SC_DENY(__NR_stat64, EACCES),
179 SC_DENY(__NR_shmget, EACCES),
182 SC_DENY(__NR_shmat, EACCES),
185 SC_DENY(__NR_shmdt, EACCES),
188 SC_DENY(__NR_ipc, EACCES),
191 SC_DENY(__NR_statx, EACCES),
194 /* Syscalls to permit */
198 #ifdef __NR_clock_gettime
199 SC_ALLOW(__NR_clock_gettime),
201 #ifdef __NR_clock_gettime64
202 SC_ALLOW(__NR_clock_gettime64),
205 SC_ALLOW(__NR_close),
210 #ifdef __NR_exit_group
211 SC_ALLOW(__NR_exit_group),
214 SC_ALLOW(__NR_futex),
216 #ifdef __NR_futex_time64
217 SC_ALLOW(__NR_futex_time64),
220 SC_ALLOW(__NR_geteuid),
222 #ifdef __NR_geteuid32
223 SC_ALLOW(__NR_geteuid32),
226 SC_ALLOW(__NR_getpgid),
229 SC_ALLOW(__NR_getpid),
231 #ifdef __NR_getrandom
232 SC_ALLOW(__NR_getrandom),
235 SC_ALLOW(__NR_gettid),
237 #ifdef __NR_gettimeofday
238 SC_ALLOW(__NR_gettimeofday),
241 SC_ALLOW(__NR_getuid),
244 SC_ALLOW(__NR_getuid32),
247 SC_ALLOW(__NR_madvise),
/*
 * Memory mapping calls: argument 2 may contain only the listed PROT_* bits,
 * so a request that includes any other bit (e.g. PROT_EXEC) is not allowed
 * by these rules.
 */
250 SC_ALLOW_ARG_MASK(__NR_mmap, 2, PROT_READ|PROT_WRITE|PROT_NONE),
253 SC_ALLOW_ARG_MASK(__NR_mmap2, 2, PROT_READ|PROT_WRITE|PROT_NONE),
256 SC_ALLOW_ARG_MASK(__NR_mprotect, 2, PROT_READ|PROT_WRITE|PROT_NONE),
259 SC_ALLOW(__NR_mremap),
262 SC_ALLOW(__NR_munmap),
264 #ifdef __NR_nanosleep
265 SC_ALLOW(__NR_nanosleep),
267 #ifdef __NR_clock_nanosleep
268 SC_ALLOW(__NR_clock_nanosleep),
270 #ifdef __NR_clock_nanosleep_time64
271 SC_ALLOW(__NR_clock_nanosleep_time64),
/*
 * NOTE(review): __NR_clock_gettime64 is already allowed earlier in this
 * list; this second rule is redundant and can never match.
 */
273 #ifdef __NR_clock_gettime64
274 SC_ALLOW(__NR_clock_gettime64),
276 #ifdef __NR__newselect
277 SC_ALLOW(__NR__newselect),
280 SC_ALLOW(__NR_ppoll),
282 #ifdef __NR_ppoll_time64
283 SC_ALLOW(__NR_ppoll_time64),
289 SC_ALLOW(__NR_pselect6),
291 #ifdef __NR_pselect6_time64
292 SC_ALLOW(__NR_pselect6_time64),
297 #ifdef __NR_rt_sigprocmask
298 SC_ALLOW(__NR_rt_sigprocmask),
301 SC_ALLOW(__NR_select),
304 SC_ALLOW(__NR_shutdown),
306 #ifdef __NR_sigprocmask
307 SC_ALLOW(__NR_sigprocmask),
313 SC_ALLOW(__NR_write),
315 #ifdef __NR_socketcall
/*
 * socketcall: allow only when argument 0 is SYS_SHUTDOWN; every other
 * socketcall request fails with EACCES.
 */
316 SC_ALLOW_ARG(__NR_socketcall, 0, SYS_SHUTDOWN),
317 SC_DENY(__NR_socketcall, EACCES),
319 #if defined(__NR_ioctl) && defined(__s390__)
320 /* Allow ioctls for ICA crypto card on s390 */
321 SC_ALLOW_ARG(__NR_ioctl, 1, Z90STAT_STATUS_MASK),
322 SC_ALLOW_ARG(__NR_ioctl, 1, ICARSAMODEXPO),
323 SC_ALLOW_ARG(__NR_ioctl, 1, ICARSACRT),
324 SC_ALLOW_ARG(__NR_ioctl, 1, ZSECSENDCPRB),
325 /* Allow ioctls for EP11 crypto card on s390 */
326 SC_ALLOW_ARG(__NR_ioctl, 1, ZSENDEP11CPRB),
328 #if defined(__x86_64__) && defined(__ILP32__) && defined(__X32_SYSCALL_BIT)
330 * On Linux x32, the clock_gettime VDSO falls back to the
331 * x86-64 syscall under some circumstances, e.g.
332 * https://bugs.debian.org/849923
334 SC_ALLOW(__NR_clock_gettime & ~__X32_SYSCALL_BIT),
/* Anything not matched by a rule above gets the failure action. */
338 BPF_STMT(BPF_RET+BPF_K, SECCOMP_FILTER_FAIL),
/*
 * Program descriptor passed to prctl(PR_SET_SECCOMP, ...) in
 * ssh_sandbox_child(): the instruction count of preauth_insns and a pointer
 * to it. The cast drops the const qualifier of preauth_insns (presumably
 * because the .filter field is a non-const pointer -- confirm against
 * <linux/filter.h>).
 */
341 static const struct sock_fprog preauth_program = {
342 .len = (unsigned short)(sizeof(preauth_insns)/sizeof(preauth_insns[0])),
343 .filter = (struct sock_filter *)preauth_insns,
/*
 * Create the sandbox state object: logs at debug3 level and allocates one
 * zero-filled struct ssh_sandbox with xcalloc(). The monitor argument is not
 * referenced in the lines shown here.
 */
351 ssh_sandbox_init(struct monitor *monitor)
353 struct ssh_sandbox *box;
356 * Strictly, we don't need to maintain any state here but we need
357 * to return non-NULL to satisfy the API.
359 debug3("%s: preparing seccomp filter sandbox", __func__);
360 box = xcalloc(1, sizeof(*box));
366 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
/* Declared here for the debug-only violation handler below. */
367 extern struct monitor *pmonitor;
368 void mm_log_handler(LogLevel level, int forced, const char *msg, void *ctx);
/*
 * SIGSYS handler used only in debug builds (installed by
 * ssh_sandbox_child_debugging()). Formats the architecture, syscall number
 * and call address from the siginfo into msg and hands it to
 * mm_log_handler() at SYSLOG_LEVEL_FATAL with pmonitor as context.
 * As the file header notes, the library calls made here are unsafe in
 * signal context, which is why this path is for development only.
 */
371 ssh_sandbox_violation(int signum, siginfo_t *info, void *void_context)
375 snprintf(msg, sizeof(msg),
376 "%s: unexpected system call (arch:0x%x,syscall:%d @ %p)",
377 __func__, info->si_arch, info->si_syscall, info->si_call_addr);
378 mm_log_handler(SYSLOG_LEVEL_FATAL, 0, msg, pmonitor);
/*
 * Debug-only setup: installs ssh_sandbox_violation() as the SIGSYS handler
 * (SA_SIGINFO, so the handler receives the siginfo) and unblocks SIGSYS.
 * Either step failing is fatal.
 */
383 ssh_sandbox_child_debugging(void)
385 struct sigaction act;
388 debug3("%s: installing SIGSYS handler", __func__);
389 memset(&act, 0, sizeof(act));
391 sigaddset(&mask, SIGSYS);
393 act.sa_sigaction = &ssh_sandbox_violation;
394 act.sa_flags = SA_SIGINFO;
395 if (sigaction(SIGSYS, &act, NULL) == -1)
396 fatal("%s: sigaction(SIGSYS): %s", __func__, strerror(errno));
/* Make sure SIGSYS is deliverable even if it was blocked on entry. */
397 if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
398 fatal("%s: sigprocmask(SIGSYS): %s",
399 __func__, strerror(errno));
401 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
/*
 * Confine the calling (child) process. Applies the resource limits
 * RLIMIT_FSIZE = 0, RLIMIT_NOFILE = 1 and RLIMIT_NPROC = 0 (any failure is
 * fatal), installs the SIGSYS reporter in debug builds, then sets
 * PR_SET_NO_NEW_PRIVS and attaches preauth_program as the seccomp filter.
 * prctl() failures are reported with debug(); the final fatal() reports the
 * case where the filter was activated although PR_SET_NO_NEW_PRIVS had
 * failed.
 */
404 ssh_sandbox_child(struct ssh_sandbox *box)
406 struct rlimit rl_zero, rl_one = {.rlim_cur = 1, .rlim_max = 1};
409 /* Set rlimits for completeness if possible. */
410 rl_zero.rlim_cur = rl_zero.rlim_max = 0;
411 if (setrlimit(RLIMIT_FSIZE, &rl_zero) == -1)
412 fatal("%s: setrlimit(RLIMIT_FSIZE, { 0, 0 }): %s",
413 __func__, strerror(errno));
415 * Cannot use zero for nfds, because poll(2) will fail with
416 * errno=EINVAL if nfds>RLIMIT_NOFILE.
418 if (setrlimit(RLIMIT_NOFILE, &rl_one) == -1)
419 fatal("%s: setrlimit(RLIMIT_NOFILE, { 1, 1 }): %s",
420 __func__, strerror(errno));
421 if (setrlimit(RLIMIT_NPROC, &rl_zero) == -1)
422 fatal("%s: setrlimit(RLIMIT_NPROC, { 0, 0 }): %s",
423 __func__, strerror(errno));
425 #ifdef SANDBOX_SECCOMP_FILTER_DEBUG
426 ssh_sandbox_child_debugging();
427 #endif /* SANDBOX_SECCOMP_FILTER_DEBUG */
429 debug3("%s: setting PR_SET_NO_NEW_PRIVS", __func__);
430 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
431 debug("%s: prctl(PR_SET_NO_NEW_PRIVS): %s",
432 __func__, strerror(errno));
435 debug3("%s: attaching seccomp filter program", __func__);
436 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &preauth_program) == -1)
437 debug("%s: prctl(PR_SET_SECCOMP): %s",
438 __func__, strerror(errno));
440 fatal("%s: SECCOMP_MODE_FILTER activated but "
441 "PR_SET_NO_NEW_PRIVS failed", __func__);
/*
 * Parent-side completion hook. The only visible action is a debug3 log
 * line; box is not otherwise used in the lines shown here.
 */
445 ssh_sandbox_parent_finish(struct ssh_sandbox *box)
448 debug3("%s: finished", __func__);
/* Parent-side hook: records the sandboxed child's pid in box->child_pid. */
452 ssh_sandbox_parent_preauth(struct ssh_sandbox *box, pid_t child_pid)
454 box->child_pid = child_pid;
457 #endif /* SANDBOX_SECCOMP_FILTER */