2 * Copyright (c) 1999 Poul-Henning Kamp.
3 * Copyright (c) 2008 Bjoern A. Zeeb.
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
33 #include "opt_inet6.h"
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/kernel.h>
39 #include <sys/systm.h>
40 #include <sys/errno.h>
41 #include <sys/sysproto.h>
42 #include <sys/malloc.h>
45 #include <sys/taskqueue.h>
48 #include <sys/mutex.h>
50 #include <sys/namei.h>
51 #include <sys/mount.h>
52 #include <sys/queue.h>
53 #include <sys/socket.h>
54 #include <sys/syscallsubr.h>
55 #include <sys/sysctl.h>
56 #include <sys/vnode.h>
58 #include <netinet/in.h>
62 #include <netinet6/in6_var.h>
66 #include <security/mac/mac_framework.h>
68 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
70 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
73 int jail_set_hostname_allowed = 1;
74 SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
75 &jail_set_hostname_allowed, 0,
76 "Processes in jail can set their hostnames");
78 int jail_socket_unixiproute_only = 1;
79 SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
80 &jail_socket_unixiproute_only, 0,
81 "Processes in jail are limited to creating UNIX/IP/route sockets only");
83 int jail_sysvipc_allowed = 0;
84 SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
85 &jail_sysvipc_allowed, 0,
86 "Processes in jail can use System V IPC primitives");
88 static int jail_enforce_statfs = 2;
89 SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW,
90 &jail_enforce_statfs, 0,
91 "Processes in jail cannot see all mounted file systems");
93 int jail_allow_raw_sockets = 0;
94 SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
95 &jail_allow_raw_sockets, 0,
96 "Prison root can create raw sockets");
98 int jail_chflags_allowed = 0;
99 SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW,
100 &jail_chflags_allowed, 0,
101 "Processes in jail can alter system file flags");
103 int jail_mount_allowed = 0;
104 SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
105 &jail_mount_allowed, 0,
106 "Processes in jail can mount/unmount jail-friendly file systems");
108 int jail_max_af_ips = 255;
109 SYSCTL_INT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
111 "Number of IP addresses a jail may have at most per address family");
113 /* allprison, lastprid, and prisoncount are protected by allprison_lock. */
114 struct prisonlist allprison;
115 struct sx allprison_lock;
120 * List of jail services. Protected by allprison_lock.
122 TAILQ_HEAD(prison_services_head, prison_service);
123 static struct prison_services_head prison_services =
124 TAILQ_HEAD_INITIALIZER(prison_services);
125 static int prison_service_slots = 0;
127 struct prison_service {
128 prison_create_t ps_create;
129 prison_destroy_t ps_destroy;
131 TAILQ_ENTRY(prison_service) ps_next;
135 static void init_prison(void *);
136 static void prison_complete(void *context, int pending);
137 static int sysctl_jail_list(SYSCTL_HANDLER_ARGS);
139 static int _prison_check_ip4(struct prison *, struct in_addr *);
142 static int _prison_check_ip6(struct prison *, struct in6_addr *);
146 init_prison(void *data __unused)
149 sx_init(&allprison_lock, "allprison");
150 LIST_INIT(&allprison);
153 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
157 qcmp_v4(const void *ip1, const void *ip2)
162 * We need to compare in HBO here to get the list sorted as expected
163 * by the result of the code. Sorting NBO addresses gives you
164 * interesting results. If you do not understand, do not try.
166 iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
167 iab = ntohl(((const struct in_addr *)ip2)->s_addr);
170 * Do not simply return the difference of the two numbers, the int is
184 qcmp_v6(const void *ip1, const void *ip2)
186 const struct in6_addr *ia6a, *ia6b;
189 ia6a = (const struct in6_addr *)ip1;
190 ia6b = (const struct in6_addr *)ip2;
193 for (i=0; rc == 0 && i < sizeof(struct in6_addr); i++) {
194 if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
196 else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
203 #if defined(INET) || defined(INET6)
205 prison_check_conflicting_ips(struct prison *p)
210 sx_assert(&allprison_lock, SX_LOCKED);
212 if (p->pr_ip4s == 0 && p->pr_ip6s == 0)
215 LIST_FOREACH(pr, &allprison, pr_list) {
217 * Skip 'dying' prisons to avoid problems when
218 * restarting multi-IP jails.
220 if (pr->pr_state == PRISON_STATE_DYING)
224 * We permit conflicting IPs if there is no
225 * more than 1 IP on each jail.
226 * In case there is one duplicate on a jail with
227 * more than one IP stop checking and return error.
230 if ((p->pr_ip4s >= 1 && pr->pr_ip4s > 1) ||
231 (p->pr_ip4s > 1 && pr->pr_ip4s >= 1)) {
232 for (i = 0; i < p->pr_ip4s; i++) {
233 if (_prison_check_ip4(pr, &p->pr_ip4[i]) == 0)
239 if ((p->pr_ip6s >= 1 && pr->pr_ip6s > 1) ||
240 (p->pr_ip6s > 1 && pr->pr_ip6s >= 1)) {
241 for (i = 0; i < p->pr_ip6s; i++) {
242 if (_prison_check_ip6(pr, &p->pr_ip6[i]) == 0)
253 jail_copyin_ips(struct jail *j)
259 struct in6_addr *ip6;
264 * Copy in addresses, check for duplicate addresses and do some
265 * simple 0 and broadcast checks. If users give other bogus addresses
266 * it is their problem.
268 * IP addresses are all sorted but ip[0] to preserve the primary IP
269 * address as given from userland. This special IP is used for
270 * unbound outgoing connections as well as for "loopback" traffic.
280 ip4 = (struct in_addr *)malloc(j->ip4s * sizeof(struct in_addr),
281 M_PRISON, M_WAITOK | M_ZERO);
282 error = copyin(j->ip4, ip4, j->ip4s * sizeof(struct in_addr));
285 /* Sort all but the first IPv4 address. */
287 qsort((ip4 + 1), j->ip4s - 1,
288 sizeof(struct in_addr), qcmp_v4);
291 * We do not have to care about byte order for these checks
292 * so we will do them in NBO.
294 for (i=0; i<j->ip4s; i++) {
295 if (ip4[i].s_addr == htonl(INADDR_ANY) ||
296 ip4[i].s_addr == htonl(INADDR_BROADCAST)) {
300 if ((i+1) < j->ip4s &&
301 (ip4[0].s_addr == ip4[i+1].s_addr ||
302 ip4[i].s_addr == ip4[i+1].s_addr)) {
314 ip6 = (struct in6_addr *)malloc(j->ip6s * sizeof(struct in6_addr),
315 M_PRISON, M_WAITOK | M_ZERO);
316 error = copyin(j->ip6, ip6, j->ip6s * sizeof(struct in6_addr));
319 /* Sort all but the first IPv6 address. */
321 qsort((ip6 + 1), j->ip6s - 1,
322 sizeof(struct in6_addr), qcmp_v6);
323 for (i=0; i<j->ip6s; i++) {
324 if (IN6_IS_ADDR_UNSPECIFIED(&ip6[i])) {
328 if ((i+1) < j->ip6s &&
329 (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[i+1]) ||
330 IN6_ARE_ADDR_EQUAL(&ip6[i], &ip6[i+1]))) {
351 #endif /* INET || INET6 */
354 jail_handle_ips(struct jail *j)
356 #if defined(INET) || defined(INET6)
361 * Finish conversion for older versions, copyin and setup IPs.
363 switch (j->version) {
367 /* FreeBSD single IPv4 jails. */
370 if (j->ip4s == INADDR_ANY || j->ip4s == INADDR_BROADCAST)
372 ip4 = (struct in_addr *)malloc(sizeof(struct in_addr),
373 M_PRISON, M_WAITOK | M_ZERO);
376 * Jail version 0 still used HBO for the IPv4 address.
378 ip4->s_addr = htonl(j->ip4s);
389 * Version 1 was used by multi-IPv4 jail implementations
390 * that never made it into the official kernel.
391 * We should never hit this here; jail() should catch it.
395 case 2: /* JAIL_API_VERSION */
396 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
397 #if defined(INET) || defined(INET6)
399 if (j->ip4s > jail_max_af_ips)
406 if (j->ip6s > jail_max_af_ips)
412 error = jail_copyin_ips(j);
419 /* Sci-Fi jails are not supported, sorry. */
433 jail(struct thread *td, struct jail_args *uap)
439 error = copyin(uap->jail, &version, sizeof(uint32_t));
445 /* FreeBSD single IPv4 jails. */
449 bzero(&j, sizeof(struct jail));
450 error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
453 j.version = j0.version;
455 j.hostname = j0.hostname;
456 j.ip4s = j0.ip_number;
462 * Version 1 was used by multi-IPv4 jail implementations
463 * that never made it into the official kernel.
467 case 2: /* JAIL_API_VERSION */
468 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
469 error = copyin(uap->jail, &j, sizeof(struct jail));
475 /* Sci-Fi jails are not supported, sorry. */
478 return (kern_jail(td, &j));
482 kern_jail(struct thread *td, struct jail *j)
485 struct prison *pr, *tpr;
486 struct prison_service *psrv;
487 struct jail_attach_args jaa;
488 int vfslocked, error, tryprid;
490 KASSERT(j != NULL, ("%s: j is NULL", __func__));
492 /* Handle addresses - convert old structs, copyin, check IPs. */
493 error = jail_handle_ips(j);
497 /* Allocate struct prison and fill it with life. */
498 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
499 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF);
501 error = copyinstr(j->path, &pr->pr_path, sizeof(pr->pr_path), NULL);
504 NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE,
509 vfslocked = NDHASGIANT(&nd);
510 pr->pr_root = nd.ni_vp;
511 VOP_UNLOCK(nd.ni_vp, 0, td);
512 NDFREE(&nd, NDF_ONLY_PNBUF);
513 VFS_UNLOCK_GIANT(vfslocked);
514 error = copyinstr(j->hostname, &pr->pr_host, sizeof(pr->pr_host), NULL);
517 if (j->jailname != NULL) {
518 error = copyinstr(j->jailname, &pr->pr_name,
519 sizeof(pr->pr_name), NULL);
525 pr->pr_ip4s = j->ip4s;
530 pr->pr_ip6s = j->ip6s;
534 pr->pr_securelevel = securelevel;
535 if (prison_service_slots == 0)
538 pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
539 M_PRISON, M_ZERO | M_WAITOK);
543 * Pre-set prison state to ALIVE upon creation. This is needed so we
544 * can later attach the process to it, etc (avoiding another extra
545 * state for the process of creation, complicating things).
547 pr->pr_state = PRISON_STATE_ALIVE;
549 /* Allocate a dedicated cpuset for each jail. */
550 error = cpuset_create_root(td, &pr->pr_cpuset);
554 sx_xlock(&allprison_lock);
555 /* Make sure we cannot run into problems with ambiguous bind()ings. */
556 #if defined(INET) || defined(INET6)
557 error = prison_check_conflicting_ips(pr);
559 sx_xunlock(&allprison_lock);
564 /* Determine next pr_id and add prison to allprison list. */
565 tryprid = lastprid + 1;
566 if (tryprid == JAIL_MAX)
569 LIST_FOREACH(tpr, &allprison, pr_list) {
570 if (tpr->pr_id == tryprid) {
572 if (tryprid == JAIL_MAX) {
573 sx_xunlock(&allprison_lock);
580 pr->pr_id = jaa.jid = lastprid = tryprid;
581 LIST_INSERT_HEAD(&allprison, pr, pr_list);
583 sx_downgrade(&allprison_lock);
584 TAILQ_FOREACH(psrv, &prison_services, ps_next) {
585 psrv->ps_create(psrv, pr);
587 sx_sunlock(&allprison_lock);
589 error = jail_attach(td, &jaa);
592 mtx_lock(&pr->pr_mtx);
594 mtx_unlock(&pr->pr_mtx);
595 td->td_retval[0] = jaa.jid;
598 sx_xlock(&allprison_lock);
599 LIST_REMOVE(pr, pr_list);
601 sx_downgrade(&allprison_lock);
602 TAILQ_FOREACH(psrv, &prison_services, ps_next) {
603 psrv->ps_destroy(psrv, pr);
605 sx_sunlock(&allprison_lock);
607 cpuset_rel(pr->pr_cpuset);
609 if (pr->pr_slots != NULL)
610 free(pr->pr_slots, M_PRISON);
611 vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
613 VFS_UNLOCK_GIANT(vfslocked);
615 mtx_destroy(&pr->pr_mtx);
618 free(j->ip6, M_PRISON);
621 free(j->ip4, M_PRISON);
627 * struct jail_attach_args {
632 jail_attach(struct thread *td, struct jail_attach_args *uap)
635 struct ucred *newcred, *oldcred;
637 int vfslocked, error;
640 * XXX: Note that there is a slight race here if two threads
641 * in the same privileged process attempt to attach to two
642 * different jails at the same time. It is important for
643 * user processes not to do this, or they might end up with
644 * a process root from one prison, but attached to the jail
647 error = priv_check(td, PRIV_JAIL_ATTACH);
652 sx_slock(&allprison_lock);
653 pr = prison_find(uap->jid);
655 sx_sunlock(&allprison_lock);
660 * Do not allow a process to attach to a prison that is not
661 * considered to be "ALIVE".
663 if (pr->pr_state != PRISON_STATE_ALIVE) {
664 mtx_unlock(&pr->pr_mtx);
665 sx_sunlock(&allprison_lock);
669 mtx_unlock(&pr->pr_mtx);
670 sx_sunlock(&allprison_lock);
673 * Reparent the newly attached process to this jail.
675 error = cpuset_setproc_update_set(p, pr->pr_cpuset);
679 vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
680 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
681 if ((error = change_dir(pr->pr_root, td)) != 0)
684 if ((error = mac_check_vnode_chroot(td->td_ucred, pr->pr_root)))
687 VOP_UNLOCK(pr->pr_root, 0, td);
688 change_root(pr->pr_root, td);
689 VFS_UNLOCK_GIANT(vfslocked);
693 oldcred = p->p_ucred;
695 crcopy(newcred, oldcred);
696 newcred->cr_prison = pr;
697 p->p_ucred = newcred;
698 prison_proc_hold(pr);
703 VOP_UNLOCK(pr->pr_root, 0, td);
704 VFS_UNLOCK_GIANT(vfslocked);
706 mtx_lock(&pr->pr_mtx);
708 mtx_unlock(&pr->pr_mtx);
713 * Returns a locked prison instance, or NULL on failure.
716 prison_find(int prid)
720 sx_assert(&allprison_lock, SX_LOCKED);
721 LIST_FOREACH(pr, &allprison, pr_list) {
722 if (pr->pr_id == prid) {
723 mtx_lock(&pr->pr_mtx);
724 if (pr->pr_ref == 0) {
725 mtx_unlock(&pr->pr_mtx);
735 prison_free_locked(struct prison *pr)
738 mtx_assert(&pr->pr_mtx, MA_OWNED);
740 if (pr->pr_ref == 0) {
741 mtx_unlock(&pr->pr_mtx);
742 TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
743 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
746 mtx_unlock(&pr->pr_mtx);
750 prison_free(struct prison *pr)
753 mtx_lock(&pr->pr_mtx);
754 prison_free_locked(pr);
758 prison_complete(void *context, int pending)
760 struct prison_service *psrv;
764 pr = (struct prison *)context;
766 sx_xlock(&allprison_lock);
767 LIST_REMOVE(pr, pr_list);
769 sx_downgrade(&allprison_lock);
770 TAILQ_FOREACH(psrv, &prison_services, ps_next) {
771 psrv->ps_destroy(psrv, pr);
773 sx_sunlock(&allprison_lock);
775 cpuset_rel(pr->pr_cpuset);
777 if (pr->pr_slots != NULL)
778 free(pr->pr_slots, M_PRISON);
780 vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
782 VFS_UNLOCK_GIANT(vfslocked);
784 mtx_destroy(&pr->pr_mtx);
785 free(pr->pr_linux, M_PRISON);
787 free(pr->pr_ip6, M_PRISON);
790 free(pr->pr_ip4, M_PRISON);
796 prison_hold_locked(struct prison *pr)
799 mtx_assert(&pr->pr_mtx, MA_OWNED);
800 KASSERT(pr->pr_ref > 0,
801 ("Trying to hold dead prison (id=%d).", pr->pr_id));
806 prison_hold(struct prison *pr)
809 mtx_lock(&pr->pr_mtx);
810 prison_hold_locked(pr);
811 mtx_unlock(&pr->pr_mtx);
815 prison_proc_hold(struct prison *pr)
818 mtx_lock(&pr->pr_mtx);
819 KASSERT(pr->pr_state == PRISON_STATE_ALIVE,
820 ("Cannot add a process to a non-alive prison (id=%d).", pr->pr_id));
822 mtx_unlock(&pr->pr_mtx);
826 prison_proc_free(struct prison *pr)
829 mtx_lock(&pr->pr_mtx);
830 KASSERT(pr->pr_state == PRISON_STATE_ALIVE && pr->pr_nprocs > 0,
831 ("Trying to kill a process in a dead prison (id=%d).", pr->pr_id));
833 if (pr->pr_nprocs == 0)
834 pr->pr_state = PRISON_STATE_DYING;
835 mtx_unlock(&pr->pr_mtx);
841 * Pass back primary IPv4 address of this jail.
843 * If not jailed return success but do not alter the address. Caller has to
844 * make sure to initialize it correctly (e.g. INADDR_ANY).
846 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
847 * Address returned in NBO.
850 prison_get_ip4(struct ucred *cred, struct in_addr *ia)
853 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
854 KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
857 /* Do not change address passed in. */
860 if (cred->cr_prison->pr_ip4 == NULL)
861 return (EAFNOSUPPORT);
863 ia->s_addr = cred->cr_prison->pr_ip4[0].s_addr;
868 * Make sure our (source) address is set to something meaningful to this
871 * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
872 * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4.
873 * Address passed in in NBO and returned in NBO.
876 prison_local_ip4(struct ucred *cred, struct in_addr *ia)
880 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
881 KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
885 if (cred->cr_prison->pr_ip4 == NULL)
886 return (EAFNOSUPPORT);
888 ia0.s_addr = ntohl(ia->s_addr);
889 if (ia0.s_addr == INADDR_LOOPBACK) {
890 ia->s_addr = cred->cr_prison->pr_ip4[0].s_addr;
894 if (ia0.s_addr == INADDR_ANY) {
896 * In case there is only 1 IPv4 address, bind directly.
898 if (cred->cr_prison->pr_ip4s == 1)
899 ia->s_addr = cred->cr_prison->pr_ip4[0].s_addr;
903 return (_prison_check_ip4(cred->cr_prison, ia));
907 * Rewrite destination address in case we will connect to loopback address.
909 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
910 * Address passed in in NBO and returned in NBO.
913 prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
916 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
917 KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
921 if (cred->cr_prison->pr_ip4 == NULL)
922 return (EAFNOSUPPORT);
924 if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
925 ia->s_addr = cred->cr_prison->pr_ip4[0].s_addr;
930 * Return success because nothing had to be changed.
936 * Check if given address belongs to the jail referenced by cred/prison.
938 * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
939 * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4.
940 * Address passed in in NBO.
943 _prison_check_ip4(struct prison *pr, struct in_addr *ia)
948 * Check the primary IP.
950 if (pr->pr_ip4[0].s_addr == ia->s_addr)
954 * All the other IPs are sorted so we can do a binary search.
960 d = qcmp_v4(&pr->pr_ip4[i+1], ia);
969 return (EADDRNOTAVAIL);
973 prison_check_ip4(struct ucred *cred, struct in_addr *ia)
976 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
977 KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
981 if (cred->cr_prison->pr_ip4 == NULL)
982 return (EAFNOSUPPORT);
984 return (_prison_check_ip4(cred->cr_prison, ia));
990 * Pass back primary IPv6 address for this jail.
992 * If not jailed return success but do not alter the address. Caller has to
993 * make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
995 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
998 prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
1001 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1002 KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1006 if (cred->cr_prison->pr_ip6 == NULL)
1007 return (EAFNOSUPPORT);
1009 bcopy(&cred->cr_prison->pr_ip6[0], ia6, sizeof(struct in6_addr));
1014 * Make sure our (source) address is set to something meaningful to this jail.
1016 * v6only should be set based on ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
1017 * when needed while binding.
1019 * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1020 * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6.
1023 prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
1026 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1027 KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1031 if (cred->cr_prison->pr_ip6 == NULL)
1032 return (EAFNOSUPPORT);
1034 if (IN6_IS_ADDR_LOOPBACK(ia6)) {
1035 bcopy(&cred->cr_prison->pr_ip6[0], ia6,
1036 sizeof(struct in6_addr));
1040 if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
1042 * In case there is only 1 IPv6 address, and v6only is true,
1043 * then bind directly.
1045 if (v6only != 0 && cred->cr_prison->pr_ip6s == 1)
1046 bcopy(&cred->cr_prison->pr_ip6[0], ia6,
1047 sizeof(struct in6_addr));
1051 return (_prison_check_ip6(cred->cr_prison, ia6));
1055 * Rewrite destination address in case we will connect to loopback address.
1057 * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
1060 prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
1063 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1064 KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1068 if (cred->cr_prison->pr_ip6 == NULL)
1069 return (EAFNOSUPPORT);
1071 if (IN6_IS_ADDR_LOOPBACK(ia6)) {
1072 bcopy(&cred->cr_prison->pr_ip6[0], ia6,
1073 sizeof(struct in6_addr));
1078 * Return success because nothing had to be changed.
1084 * Check if given address belongs to the jail referenced by cred/prison.
1086 * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1087 * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6.
1090 _prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
1095 * Check the primary IP.
1097 if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
1101 * All the other IPs are sorted so we can do a binary search.
1104 z = pr->pr_ip6s - 2;
1107 d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
1116 return (EADDRNOTAVAIL);
1120 prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
1123 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1124 KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
1128 if (cred->cr_prison->pr_ip6 == NULL)
1129 return (EAFNOSUPPORT);
1131 return (_prison_check_ip6(cred->cr_prison, ia6));
1136 * Check if a jail supports the given address family.
1138 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
1142 prison_check_af(struct ucred *cred, int af)
1146 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1157 if (cred->cr_prison->pr_ip4 == NULL)
1158 error = EAFNOSUPPORT;
1163 if (cred->cr_prison->pr_ip6 == NULL)
1164 error = EAFNOSUPPORT;
1171 if (jail_socket_unixiproute_only)
1172 error = EAFNOSUPPORT;
1178 * Check if given address belongs to the jail referenced by cred (wrapper to
1179 * prison_check_ip[46]).
1181 * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if
1182 * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow
1183 * the address family. IPv4 Address passed in in NBO.
1186 prison_if(struct ucred *cred, struct sockaddr *sa)
1189 struct sockaddr_in *sai;
1192 struct sockaddr_in6 *sai6;
1196 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
1197 KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
1200 switch (sa->sa_family)
1204 sai = (struct sockaddr_in *)sa;
1205 error = prison_check_ip4(cred, &sai->sin_addr);
1210 sai6 = (struct sockaddr_in6 *)sa;
1211 error = prison_check_ip6(cred, &sai6->sin6_addr);
1215 if (jailed(cred) && jail_socket_unixiproute_only)
1216 error = EAFNOSUPPORT;
1222 * Return 0 if jails permit cred1 to frob cred2, otherwise ESRCH.
1225 prison_check(struct ucred *cred1, struct ucred *cred2)
1228 if (jailed(cred1)) {
1231 if (cred2->cr_prison != cred1->cr_prison)
1239 * Return 1 if the passed credential is in a jail, otherwise 0.
1242 jailed(struct ucred *cred)
1245 return (cred->cr_prison != NULL);
1249 * Return the correct hostname for the passed credential.
1252 getcredhostname(struct ucred *cred, char *buf, size_t size)
1256 mtx_lock(&cred->cr_prison->pr_mtx);
1257 strlcpy(buf, cred->cr_prison->pr_host, size);
1258 mtx_unlock(&cred->cr_prison->pr_mtx);
1260 strlcpy(buf, hostname, size);
1264 * Determine whether the subject represented by cred can "see"
1265 * status of a mount point.
1266 * Returns: 0 for permitted, ENOENT otherwise.
1267 * XXX: This function should be called cr_canseemount() and should be
1268 * placed in kern_prot.c.
1271 prison_canseemount(struct ucred *cred, struct mount *mp)
1277 if (!jailed(cred) || jail_enforce_statfs == 0)
1279 pr = cred->cr_prison;
1280 if (pr->pr_root->v_mount == mp)
1282 if (jail_enforce_statfs == 2)
1285 * If jail's chroot directory is set to "/" we should be able to see
1286 * all mount-points from inside a jail.
1287 * This is an ugly check, but this is the only situation when jail's
1288 * directory ends with '/'.
1290 if (strcmp(pr->pr_path, "/") == 0)
1292 len = strlen(pr->pr_path);
1294 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
1297 * Be sure that we don't have a situation where jail's root directory
1298 * is "/some/path" and mount point is "/some/pathpath".
1300 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
1306 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
1308 char jpath[MAXPATHLEN];
1312 if (!jailed(cred) || jail_enforce_statfs == 0)
1314 pr = cred->cr_prison;
1315 if (prison_canseemount(cred, mp) != 0) {
1316 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1317 strlcpy(sp->f_mntonname, "[restricted]",
1318 sizeof(sp->f_mntonname));
1321 if (pr->pr_root->v_mount == mp) {
1323 * Clear current buffer data, so we are sure nothing from
1324 * the valid path is left there.
1326 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1327 *sp->f_mntonname = '/';
1331 * If jail's chroot directory is set to "/" we should be able to see
1332 * all mount-points from inside a jail.
1334 if (strcmp(pr->pr_path, "/") == 0)
1336 len = strlen(pr->pr_path);
1337 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
1339 * Clear current buffer data, so we are sure nothing from
1340 * the valid path is left there.
1342 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
1343 if (*jpath == '\0') {
1344 /* Should never happen. */
1345 *sp->f_mntonname = '/';
1347 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
1352 * Check whether permission for a specific privilege is granted within jail. We
1353 * have a specific list of accepted privileges; the rest are denied.
1356 prison_priv_check(struct ucred *cred, int priv)
1365 * Allow ktrace privileges for root in jail.
1371 * Allow jailed processes to configure audit identity and
1372 * submit audit records (login, etc). In the future we may
1373 * want to further refine the relationship between audit and
1376 case PRIV_AUDIT_GETAUDIT:
1377 case PRIV_AUDIT_SETAUDIT:
1378 case PRIV_AUDIT_SUBMIT:
1382 * Allow jailed processes to manipulate process UNIX
1383 * credentials in any way they see fit.
1385 case PRIV_CRED_SETUID:
1386 case PRIV_CRED_SETEUID:
1387 case PRIV_CRED_SETGID:
1388 case PRIV_CRED_SETEGID:
1389 case PRIV_CRED_SETGROUPS:
1390 case PRIV_CRED_SETREUID:
1391 case PRIV_CRED_SETREGID:
1392 case PRIV_CRED_SETRESUID:
1393 case PRIV_CRED_SETRESGID:
1396 * Jail implements visibility constraints already, so allow
1397 * jailed root to override uid/gid-based constraints.
1399 case PRIV_SEEOTHERGIDS:
1400 case PRIV_SEEOTHERUIDS:
1403 * Jail implements inter-process debugging limits already, so
1404 * allow jailed root various debugging privileges.
1406 case PRIV_DEBUG_DIFFCRED:
1407 case PRIV_DEBUG_SUGID:
1408 case PRIV_DEBUG_UNPRIV:
1411 * Allow jail to set various resource limits and login
1412 * properties, and for now, exceed process resource limits.
1414 case PRIV_PROC_LIMIT:
1415 case PRIV_PROC_SETLOGIN:
1416 case PRIV_PROC_SETRLIMIT:
1419 * System V and POSIX IPC privileges are granted in jail.
1422 case PRIV_IPC_WRITE:
1423 case PRIV_IPC_ADMIN:
1424 case PRIV_IPC_MSGSIZE:
1428 * Jail implements its own inter-process limits, so allow
1429 * root processes in jail to change scheduling on other
1430 * processes in the same jail. Likewise for signalling.
1432 case PRIV_SCHED_DIFFCRED:
1433 case PRIV_SCHED_CPUSET:
1434 case PRIV_SIGNAL_DIFFCRED:
1435 case PRIV_SIGNAL_SUGID:
1438 * Allow jailed processes to write to sysctls marked as jail
1441 case PRIV_SYSCTL_WRITEJAIL:
1444 * Allow root in jail to manage a variety of quota
1445 * properties. These should likely be conditional on a
1446 * configuration option.
1448 case PRIV_VFS_GETQUOTA:
1449 case PRIV_VFS_SETQUOTA:
1452 * Since Jail relies on chroot() to implement file system
1453 * protections, grant many VFS privileges to root in jail.
1454 * Be careful to exclude mount-related and NFS-related
1458 case PRIV_VFS_WRITE:
1459 case PRIV_VFS_ADMIN:
1461 case PRIV_VFS_LOOKUP:
1462 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
1463 case PRIV_VFS_CHFLAGS_DEV:
1464 case PRIV_VFS_CHOWN:
1465 case PRIV_VFS_CHROOT:
1466 case PRIV_VFS_RETAINSUGID:
1467 case PRIV_VFS_FCHROOT:
1469 case PRIV_VFS_SETGID:
1470 case PRIV_VFS_STICKYFILE:
1474 * Depending on the global setting, allow privilege of
1475 * setting system flags.
1477 case PRIV_VFS_SYSFLAGS:
1478 if (jail_chflags_allowed)
1484 * Depending on the global setting, allow privilege of
1485 * mounting/unmounting file systems.
1487 case PRIV_VFS_MOUNT:
1488 case PRIV_VFS_UNMOUNT:
1489 case PRIV_VFS_MOUNT_NONUSER:
1490 case PRIV_VFS_MOUNT_OWNER:
1491 if (jail_mount_allowed)
1497 * Allow jailed root to bind reserved ports and reuse in-use
1500 case PRIV_NETINET_RESERVEDPORT:
1501 case PRIV_NETINET_REUSEPORT:
1505 * Allow jailed root to set certain IPv4/6 (option) headers.
1507 case PRIV_NETINET_SETHDROPTS:
1511 * Conditionally allow creating raw sockets in jail.
1513 case PRIV_NETINET_RAW:
1514 if (jail_allow_raw_sockets)
1520 * Since jail implements its own visibility limits on netstat
1521 * sysctls, allow getcred. This allows identd to work in
1524 case PRIV_NETINET_GETCRED:
1529 * In all remaining cases, deny the privilege request. This
1530 * includes almost all network privileges, many system
1531 * configuration privileges.
1538 * Register jail service. Provides 'create' and 'destroy' methods.
1539 * 'create' method will be called for every existing jail and all
1540 * jails in the future as they are being created.
1541 * 'destroy' method will be called for every jail going away and
1542 * for all existing jails at the time of service deregistration.
1544 struct prison_service *
1545 prison_service_register(const char *name, prison_create_t create,
1546 prison_destroy_t destroy)
1548 struct prison_service *psrv, *psrv2;
1550 int reallocate = 1, slotno = 0;
1551 void **slots, **oldslots;
1553 psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
1555 psrv->ps_create = create;
1556 psrv->ps_destroy = destroy;
1557 strcpy(psrv->ps_name, name);
1559 * Grab the allprison_lock here, so we won't miss any jail
1560 * creation/destruction.
1562 sx_xlock(&allprison_lock);
1565 * Verify if service is not already registered.
1567 TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
1568 KASSERT(strcmp(psrv2->ps_name, name) != 0,
1569 ("jail service %s already registered", name));
1573 * Find free slot. When there is no existing free slot available,
1574 * allocate one at the end.
1576 TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
1577 if (psrv2->ps_slotno != slotno) {
1578 KASSERT(slotno < psrv2->ps_slotno,
1579 ("Invalid slotno (slotno=%d >= ps_slotno=%d",
1580 slotno, psrv2->ps_slotno));
1581 /* We found free slot. */
1587 psrv->ps_slotno = slotno;
1589 * Keep the list sorted by slot number.
1591 if (psrv2 != NULL) {
1592 KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
1593 TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
1595 KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
1596 TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
1598 prison_service_slots++;
1599 sx_downgrade(&allprison_lock);
1601 * Allocate memory for new slot if we didn't find an empty one.
1602 * Do not use realloc(9), because pr_slots is protected with a mutex,
1603 * so we can't sleep.
1605 LIST_FOREACH(pr, &allprison, pr_list) {
1607 /* First allocate memory with M_WAITOK. */
1608 slots = malloc(sizeof(*slots) * prison_service_slots,
1609 M_PRISON, M_WAITOK);
1610 /* Now grab the mutex and replace pr_slots. */
1611 mtx_lock(&pr->pr_mtx);
1612 oldslots = pr->pr_slots;
1613 if (psrv->ps_slotno > 0) {
1614 bcopy(oldslots, slots,
1615 sizeof(*slots) * (prison_service_slots - 1));
1617 slots[psrv->ps_slotno] = NULL;
1618 pr->pr_slots = slots;
1619 mtx_unlock(&pr->pr_mtx);
1620 if (oldslots != NULL)
1621 free(oldslots, M_PRISON);
1624 * Call 'create' method for each existing jail.
1626 psrv->ps_create(psrv, pr);
1628 sx_sunlock(&allprison_lock);
/*
 * prison_service_deregister: unlink psrv from prison_services, call its
 * destroy callback for each prison on allprison, install a newly sized
 * pr_slots array in the prisons visited, and finally free psrv.
 * NOTE(review): the declaration of `pr` and any condition guarding the
 * pr_slots replacement are not visible here -- confirm.
 */
1634 prison_service_deregister(struct prison_service *psrv)
1637 void **slots, **oldslots;
1640 sx_xlock(&allprison_lock);
/*
 * Tests whether psrv is the tail entry of prison_services.
 * NOTE(review): the statement controlled by this test is not visible here;
 * presumably TAILQ_REMOVE below is unconditional -- confirm.
 */
1641 if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
1643 TAILQ_REMOVE(&prison_services, psrv, ps_next);
1644 prison_service_slots--;
1645 sx_downgrade(&allprison_lock);
1646 LIST_FOREACH(pr, &allprison, pr_list) {
1648 * Call 'destroy' method for every currently existing jail.
1650 psrv->ps_destroy(psrv, pr);
1652 * If this is the last slot, free the memory allocated for it.
1655 if (prison_service_slots == 0)
/* The replacement array holds the already decremented slot count. */
1658 slots = malloc(sizeof(*slots) * prison_service_slots,
1659 M_PRISON, M_WAITOK);
1661 mtx_lock(&pr->pr_mtx);
1662 oldslots = pr->pr_slots;
1664 * We require setting slot to NULL after freeing it,
1665 * this way we can check for memory leaks here.
1667 KASSERT(oldslots[psrv->ps_slotno] == NULL,
1668 ("Slot %d (service %s, jailid=%d) still contains data?",
1669 psrv->ps_slotno, psrv->ps_name, pr->pr_id));
/*
 * Copy prison_service_slots pointers from the old array when the departing
 * service did not occupy slot 0.
 */
1670 if (psrv->ps_slotno > 0) {
1671 bcopy(oldslots, slots,
1672 sizeof(*slots) * prison_service_slots);
1674 pr->pr_slots = slots;
1675 mtx_unlock(&pr->pr_mtx);
/* The old array is asserted non-NULL and freed outside pr_mtx. */
1676 KASSERT(oldslots != NULL, ("oldslots == NULL"));
1677 free(oldslots, M_PRISON);
/* Drop allprison_lock first, then release the descriptor itself. */
1680 sx_sunlock(&allprison_lock);
1681 free(psrv, M_PRISON);
1685 * Function sets data for the given jail in slot assigned for the given
/*
 * prison_service_data_set: store `data` in pr->pr_slots at the index given
 * by psrv->ps_slotno. pr->pr_mtx must be held (checked via mtx_assert).
 */
1689 prison_service_data_set(struct prison_service *psrv, struct prison *pr,
1693 mtx_assert(&pr->pr_mtx, MA_OWNED);
1694 pr->pr_slots[psrv->ps_slotno] = data;
1698 * Function clears slots assigned for the given jail service in the given
1699 * prison structure and returns current slot data.
/*
 * prison_service_data_del: read the pointer stored in pr->pr_slots at
 * psrv->ps_slotno into `data`, then reset that slot to NULL.
 * pr->pr_mtx must be held (checked via mtx_assert).
 * NOTE(review): the return statement is not visible here -- confirm that
 * `data` is what is returned.
 */
1702 prison_service_data_del(struct prison_service *psrv, struct prison *pr)
1706 mtx_assert(&pr->pr_mtx, MA_OWNED);
1707 data = pr->pr_slots[psrv->ps_slotno];
1708 pr->pr_slots[psrv->ps_slotno] = NULL;
1713 * Function returns current data from the slot assigned to the given jail
1714 * service for the given jail.
/*
 * prison_service_data_get: return the pointer stored in pr->pr_slots at
 * psrv->ps_slotno without modifying it. pr->pr_mtx must be held (checked
 * via mtx_assert).
 */
1717 prison_service_data_get(struct prison_service *psrv, struct prison *pr)
1720 mtx_assert(&pr->pr_mtx, MA_OWNED);
1721 return (pr->pr_slots[psrv->ps_slotno]);
/*
 * sysctl_jail_list: sysctl handler that builds one buffer containing a
 * struct xprison record for each prison on allprison, each record followed
 * by that prison's pr_ip4 and pr_ip6 address arrays, and hands the buffer
 * to SYSCTL_OUT.
 * NOTE(review): the declarations of pr, p, len, count and error, the
 * return statements and the release of the temporary buffer are not
 * visible here -- confirm.
 */
1725 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
1727 struct xprison *xp, *sxp;
/*
 * Tests whether the requesting thread's credential is jailed.
 * NOTE(review): the action taken for a jailed caller is not visible here.
 */
1733 if (jailed(req->td->td_ucred))
1736 sx_slock(&allprison_lock);
/* Snapshot prisoncount; when it is zero the lock is dropped right away. */
1737 if ((count = prisoncount) == 0) {
1738 sx_sunlock(&allprison_lock);
/*
 * Buffer size: `count` fixed-size records plus every prison's IPv4 and
 * IPv6 address arrays.
 */
1742 len = sizeof(*xp) * count;
1743 LIST_FOREACH(pr, &allprison, pr_list) {
1745 len += pr->pr_ip4s * sizeof(struct in_addr);
1748 len += pr->pr_ip6s * sizeof(struct in6_addr);
/* sxp keeps the start of the buffer; xp walks forward record by record. */
1752 sxp = xp = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
1754 LIST_FOREACH(pr, &allprison, pr_list) {
1755 xp->pr_version = XPRISON_VERSION;
1756 xp->pr_id = pr->pr_id;
1757 xp->pr_state = pr->pr_state;
1758 xp->pr_cpusetid = pr->pr_cpuset->cs_id;
1759 strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
/* pr_host and pr_name are copied while holding pr_mtx. */
1760 mtx_lock(&pr->pr_mtx);
1761 strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
1762 strlcpy(xp->pr_name, pr->pr_name, sizeof(xp->pr_name));
1763 mtx_unlock(&pr->pr_mtx);
1765 xp->pr_ip4s = pr->pr_ip4s;
1768 xp->pr_ip6s = pr->pr_ip6s;
/* Address data is appended directly after the fixed-size record. */
1770 p = (char *)(xp + 1);
1772 if (pr->pr_ip4s > 0) {
1773 bcopy(pr->pr_ip4, (struct in_addr *)p,
1774 pr->pr_ip4s * sizeof(struct in_addr));
1775 p += (pr->pr_ip4s * sizeof(struct in_addr));
1779 if (pr->pr_ip6s > 0) {
1780 bcopy(pr->pr_ip6, (struct in6_addr *)p,
1781 pr->pr_ip6s * sizeof(struct in6_addr));
1782 p += (pr->pr_ip6s * sizeof(struct in6_addr));
/* The next record begins where this prison's address data ends. */
1785 xp = (struct xprison *)p;
/* The copy-out happens after allprison_lock has been released. */
1787 sx_sunlock(&allprison_lock);
1789 error = SYSCTL_OUT(req, sxp, len);
/*
 * Registers sysctl_jail_list as the handler of the "list" node under
 * _security_jail: struct-typed, read-only (CTLFLAG_RD), CTLFLAG_MPSAFE.
 */
1794 SYSCTL_OID(_security_jail, OID_AUTO, list,
1795 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1796 sysctl_jail_list, "S", "List of active jails");
/*
 * sysctl_jail_jailed: sysctl handler that stores the result of jailed()
 * for the requesting thread's credential in `injail` and copies that value
 * out with SYSCTL_OUT.
 * NOTE(review): the declarations of injail and error and the return
 * statement are not visible here -- confirm.
 */
1799 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
1803 injail = jailed(req->td->td_ucred);
1804 error = SYSCTL_OUT(req, &injail, sizeof(injail));
/*
 * Registers sysctl_jail_jailed as the handler of the "jailed" node under
 * _security_jail: int-typed, read-only (CTLFLAG_RD), CTLFLAG_MPSAFE.
 */
1808 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
1809 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1810 sysctl_jail_jailed, "I", "Process in jail?");
1813 DB_SHOW_COMMAND(jails, db_show_jails)
1820 char ip6buf[INET6_ADDRSTRLEN];
1823 #if defined(INET) || defined(INET6)
1828 " JID pr_ref pr_nprocs pr_ip4s pr_ip6s\n");
1830 " Hostname Path\n");
1836 " IP Address(es)\n");
1837 LIST_FOREACH(pr, &allprison, pr_list) {
1838 db_printf("%6d %6d %9d %7d %7d\n",
1839 pr->pr_id, pr->pr_ref, pr->pr_nprocs,
1840 pr->pr_ip4s, pr->pr_ip6s);
1841 db_printf("%6s %-29.29s %.74s\n",
1842 "", pr->pr_host, pr->pr_path);
1843 if (pr->pr_state < 0 || pr->pr_state >= (int)((sizeof(
1844 prison_states) / sizeof(struct prison_state))))
1847 state = prison_states[pr->pr_state].state_name;
1848 db_printf("%6s %-29.29s %.74s\n",
1849 "", (pr->pr_name[0] != '\0') ? pr->pr_name : "", state);
1850 db_printf("%6s %-6d\n",
1851 "", pr->pr_cpuset->cs_id);
1853 for (i=0; i < pr->pr_ip4s; i++) {
1854 ia.s_addr = pr->pr_ip4[i].s_addr;
1855 db_printf("%6s %s\n", "", inet_ntoa(ia));
1859 for (i=0; i < pr->pr_ip6s; i++)
1860 db_printf("%6s %s\n",
1861 "", ip6_sprintf(ip6buf, &pr->pr_ip6[i]));