From 857a4b0ab0e30dad2b57bb5c0e3ac5c71c55abec Mon Sep 17 00:00:00 2001
From: kevans
Date: Wed, 25 Nov 2020 03:14:25 +0000
Subject: [PATCH] kern: cpuset: properly rebase when attaching to a jail

The current logic is a fine choice for a system administrator modifying
process cpusets or a process creating a new cpuset(2), but it is not
ideal for processes attaching to a jail.

Currently, when a process attaches to a jail, it does exactly what any
other process does and loses any mask it might have applied along the
way, because cpuset_setproc() is entirely based around the assumption
that non-anonymous cpusets in the process can be replaced with the new
parent set.

This change slightly improves the jail attach integration by modifying
cpuset_setproc() callers to indicate whether they should rebase their
cpuset onto the indicated set or not (i.e. cpuset_setproc_update_set
rebases, the other callers do not).

If we're rebasing and the process currently has a cpuset assigned that
is not the containing jail's root set, then we will now create a new
base set for it hanging off the jail's root, with the existing mask
applied, instead of using the jail's root set as the new base set.

Note that the common case will be that the process doesn't have a
cpuset within the jail root, but the system root can freely assign a
cpuset from a jail to a process outside of the jail with no
restriction.  We assume that this may have happened, or that it could
happen due to a race when we drop the proc lock, so we must recheck
both within the loop that gathers up sufficient free cpusets and again
after the loop.

To recap, here's how it worked before in all cases:

0     4 <-- jail              0     4 <-- jail / process
|                             |
1                 ->          1
|
3 <-- process

Here's how it works now:

0     4 <-- jail              0     4 <-- jail
|                             |     |
1                 ->          1     5 <-- process
|
3 <-- process

or

0     4 <-- jail              0     4 <-- jail / process
|                             |
1 <-- process     ->          1

More importantly, in both cases the attaching process still retains the
mask it had prior to attaching, or the attach fails with EDEADLK if it
would be left with no CPUs to run on or the domain policy is
incompatible.

The author of this patch considers this almost a security feature: a
MAC policy could grant PRIV_JAIL_ATTACH to an unprivileged user that's
restricted to some subset of the available CPUs, and without this
change attaching to a jail with a wider mask could lift that user's
restrictions.

In most cases, it's anticipated that admins will use this to be able
to, for example, `cpuset -c -l 1 jail -c path=/ command=/long/running/cmd`,
avoiding the contortions otherwise needed to spawn a command inside a
jail with a more limited cpuset than the jail's.

Reviewed by:	jamie
MFC after:	1 month (maybe)
Differential Revision:	https://reviews.freebsd.org/D27298
---
 sys/kern/kern_cpuset.c | 121 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 100 insertions(+), 21 deletions(-)

diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c
index d8382260a8c..c5f4583b0d2 100644
--- a/sys/kern/kern_cpuset.c
+++ b/sys/kern/kern_cpuset.c
@@ -1104,14 +1104,63 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
 	    domainlist);
 }
 
+static int
+cpuset_setproc_newbase(struct thread *td, struct cpuset *set,
+    struct cpuset *nroot, struct cpuset **nsetp,
+    struct setlist *cpusets, struct domainlist *domainlist)
+{
+	struct domainset ndomain;
+	cpuset_t nmask;
+	struct cpuset *pbase;
+	int error;
+
+	pbase = cpuset_getbase(td->td_cpuset);
+
+	/* Copy process mask, then further apply the new root mask. */
+	CPU_COPY(&pbase->cs_mask, &nmask);
+	CPU_AND(&nmask, &nroot->cs_mask);
+
+	domainset_copy(pbase->cs_domain, &ndomain);
+	DOMAINSET_AND(&ndomain.ds_mask, &set->cs_domain->ds_mask);
+
+	/* Policy is too restrictive, will not work. */
+	if (CPU_EMPTY(&nmask) || DOMAINSET_EMPTY(&ndomain.ds_mask))
+		return (EDEADLK);
+
+	/*
+	 * Remove pbase from the freelist in advance, it'll be pushed to
+	 * cpuset_ids on success.  We assume here that cpuset_create() will not
+	 * touch pbase on failure, and we just enqueue it back to the freelist
+	 * to remain in a consistent state.
+	 */
+	pbase = LIST_FIRST(cpusets);
+	LIST_REMOVE(pbase, cs_link);
+	error = cpuset_create(&pbase, set, &nmask);
+	if (error != 0) {
+		LIST_INSERT_HEAD(cpusets, pbase, cs_link);
+		return (error);
+	}
+
+	/* Duplicates some work from above... oh well. */
+	pbase->cs_domain = domainset_shadow(set->cs_domain, &ndomain,
+	    domainlist);
+	*nsetp = pbase;
+	return (0);
+}
+
 /*
- * Handle three cases for updating an entire process.
+ * Handle four cases for updating an entire process.
  *
- * 1) Set is non-null.  This reparents all anonymous sets to the provided
- *    set and replaces all non-anonymous td_cpusets with the provided set.
- * 2) Mask is non-null.  This replaces or creates anonymous sets for every
+ * 1) Set is non-null and the process is not rebasing onto a new root.  This
+ *    reparents all anonymous sets to the provided set and replaces all
+ *    non-anonymous td_cpusets with the provided set.
+ * 2) Set is non-null and the process is rebasing onto a new root.  This
+ *    creates a new base set if the process previously had its own base set,
+ *    then reparents all anonymous sets either to that set or the provided set
+ *    if one was not created.  Non-anonymous sets are similarly replaced.
+ * 3) Mask is non-null.  This replaces or creates anonymous sets for every
  *    thread with the existing base as a parent.
- * 3) domain is non-null.  This creates anonymous sets for every thread
+ * 4) domain is non-null.  This creates anonymous sets for every thread
  *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a
@@ -1120,15 +1169,15 @@ cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
-    struct domainset *domain)
+    struct domainset *domain, bool rebase)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct domainlist domainlist;
-	struct cpuset *nset;
+	struct cpuset *base, *nset, *nroot, *tdroot;
 	struct thread *td;
 	struct proc *p;
-	int threads;
+	int needed;
 	int nfree;
 	int error;
 
@@ -1144,21 +1193,49 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 	nfree = 1;
 	LIST_INIT(&droplist);
 	nfree = 0;
+	base = set;
+	nroot = NULL;
+	if (set != NULL)
+		nroot = cpuset_getroot(set);
 	for (;;) {
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
-		if (nfree >= p->p_numthreads)
+		tdroot = cpuset_getroot(td->td_cpuset);
+		needed = p->p_numthreads;
+		if (set != NULL && rebase && tdroot != nroot)
+			needed++;
+		if (nfree >= needed)
 			break;
-		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
-		if (nfree < threads) {
-			cpuset_freelist_add(&freelist, threads - nfree);
-			domainset_freelist_add(&domainlist, threads - nfree);
-			nfree = threads;
+		if (nfree < needed) {
+			cpuset_freelist_add(&freelist, needed - nfree);
+			domainset_freelist_add(&domainlist, needed - nfree);
+			nfree = needed;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	/*
+	 * If we're changing roots and the root set is what has been specified
+	 * as the parent, then we'll check if the process was previously using
+	 * the root set and, if it wasn't, create a new base with the process's
+	 * mask applied to it.
+	 */
+	if (set != NULL && rebase && nroot != tdroot) {
+		cpusetid_t base_id, root_id;
+
+		root_id = td->td_ucred->cr_prison->pr_cpuset->cs_id;
+		base_id = cpuset_getbase(td->td_cpuset)->cs_id;
+
+		if (base_id != root_id) {
+			error = cpuset_setproc_newbase(td, set, nroot, &base,
+			    &freelist, &domainlist);
+			if (error != 0)
+				goto unlock_out;
+		}
+	}
+
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes.  The
@@ -1169,7 +1246,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_test_setthread(td->td_cpuset,
-			    set);
+			    base);
 		else
 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
 			    mask, domain);
@@ -1185,7 +1262,7 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
-			error = cpuset_setproc_setthread(td->td_cpuset, set,
+			error = cpuset_setproc_setthread(td->td_cpuset, base,
 			    &nset, &freelist, &domainlist);
 		else
 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
@@ -1200,6 +1277,8 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
 unlock_out:
 	PROC_UNLOCK(p);
 out:
+	if (base != NULL && base != set)
+		cpuset_rel(base);
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	cpuset_freelist_free(&freelist);
@@ -1584,7 +1663,7 @@ cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
-	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
+	error = cpuset_setproc(p->p_pid, set, NULL, NULL, true);
 	if (error)
 		return (error);
 	cpuset_rel(set);
@@ -1634,7 +1713,7 @@ sys_cpuset(struct thread *td, struct cpuset_args *uap)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
-		error = cpuset_setproc(-1, set, NULL, NULL);
+		error = cpuset_setproc(-1, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1668,7 +1747,7 @@ kern_cpuset_setid(struct thread *td, cpuwhich_t which,
 	set = cpuset_lookup(setid, td);
 	if (set == NULL)
 		return (ESRCH);
-	error = cpuset_setproc(id, set, NULL, NULL);
+	error = cpuset_setproc(id, set, NULL, NULL, false);
 	cpuset_rel(set);
 	return (error);
 }
@@ -1942,7 +2021,7 @@ kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, mask, NULL);
+			error = cpuset_setproc(id, NULL, mask, NULL, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
@@ -2226,7 +2305,7 @@ kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
 			error = _cpuset_setthread(id, NULL, &domain);
 			break;
 		case CPU_WHICH_PID:
-			error = cpuset_setproc(id, NULL, NULL, &domain);
+			error = cpuset_setproc(id, NULL, NULL, &domain, false);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
-- 
2.45.0
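
As an illustration of the behavior described above (not part of the patch
itself), the following minimal userland sketch mimics the
`cpuset -c -l 1 jail ...` example from the commit message: it creates a new
base set with cpuset(2), narrows that set to CPU 1, attaches to an existing
jail by jid, and reads the effective mask back.  It assumes a running jail
whose jid is passed as the only argument, a caller privileged to attach
(root, or PRIV_JAIL_ATTACH granted by a MAC policy), and that CPU 1 exists
in both the host's and the jail's sets; jail_attach(2) is expected to fail
with EDEADLK when the masks do not intersect.  The program and its output
strings are purely illustrative.

#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/jail.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(int argc, char *argv[])
{
	cpusetid_t setid;
	cpuset_t mask;
	int jid;

	if (argc != 2)
		errx(1, "usage: %s <jid>", argv[0]);
	jid = atoi(argv[1]);

	/* Create a new base cpuset and assign ourselves to it, like cpuset -c. */
	if (cpuset(&setid) != 0)
		err(1, "cpuset");

	/* Restrict that new base set to CPU 1 only, like cpuset -l 1. */
	CPU_ZERO(&mask);
	CPU_SET(1, &mask);
	if (cpuset_setaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");

	/*
	 * Attach to the jail.  With this change, the restriction above is
	 * rebased under the jail's root set rather than discarded; an
	 * EDEADLK here would mean the mask and the jail's set don't overlap.
	 */
	if (jail_attach(jid) != 0)
		err(1, "jail_attach");

	/* Read back the effective mask; only CPU 1 should remain set. */
	if (cpuset_getaffinity(CPU_LEVEL_WHICH, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	printf("after attach: CPU 1 %s set, CPU 0 %s set\n",
	    CPU_ISSET(1, &mask) ? "is" : "is NOT",
	    CPU_ISSET(0, &mask) ? "is" : "is NOT");
	return (0);
}

A rough shell equivalent is the `cpuset -c -l 1 jail -c path=/
command=/long/running/cmd` invocation quoted in the commit message.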