]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
libarchive: merge from vendor branch
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  *
6  * This software was developed by Edward Tomasz Napierala under sponsorship
7  * from the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32
33 #include <sys/cdefs.h>
34 __FBSDID("$FreeBSD$");
35
36 #include <sys/param.h>
37 #include <sys/devctl.h>
38 #include <sys/malloc.h>
39 #include <sys/queue.h>
40 #include <sys/refcount.h>
41 #include <sys/jail.h>
42 #include <sys/kernel.h>
43 #include <sys/limits.h>
44 #include <sys/loginclass.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/racct.h>
48 #include <sys/rctl.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sx.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67
68 FEATURE(rctl, "Resource Limits");
69
/*
 * HRF_* flag values.
 * NOTE(review): their consumers are not visible in this part of the file;
 * presumably they describe how usage is inherited/accumulated - confirm.
 */
#define HRF_DEFAULT             0
#define HRF_DONT_INHERIT        1
#define HRF_DONT_ACCUMULATE     2

/*
 * Buffer size limits, in bytes.  The expansions are parenthesized so that
 * the macros behave as single values inside larger expressions (e.g. as the
 * right-hand operand of '/' or '%').
 */
#define RCTL_MAX_INBUFSIZE      (4 * 1024)
#define RCTL_MAX_OUTBUFSIZE     (16 * 1024 * 1024)
/* Size of the scratch buffer used to format log and devctl messages. */
#define RCTL_LOG_BUFSIZE        128

/*
 * Amount subtracted from the available %CPU by rctl_pcpu_available() when
 * the limit is larger than twice this value.
 */
#define RCTL_PCPU_SHIFT         (10 * 1000000)
79
/* Upper bound for output buffers; defaults to RCTL_MAX_OUTBUFSIZE. */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/* Rate limits passed to ppsratecheck() for "log" and "devctl" actions. */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 * Until then they hold the placeholder value -1.
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;
91
/* Handlers validating writes to the throttle_* sysctls declared below. */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/*
 * kern.racct.rctl.* knobs.
 * NOTE(review): log_rate_limit and devctl_rate_limit are declared with
 * SYSCTL_UINT although the backing variables are plain 'int' - confirm
 * this is intended.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * The throttle_* knobs go through handler procedures so that new values
 * can be range-checked before they are stored.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
125
126 /*
127  * 'rctl_rule_link' connects a rule with every racct it's related to.
128  * For example, rule 'user:X:openfiles:deny=N/process' is linked
129  * with uidinfo for user X, and to each process of that user.
130  */
struct rctl_rule_link {
        /* Linkage on a racct's r_rule_links list. */
        LIST_ENTRY(rctl_rule_link)      rrl_next;
        /* The rule this link attaches to the racct. */
        struct rctl_rule                *rrl_rule;
        /*
         * Set to 1 by rctl_enforce() once a log, devctl or signal action
         * has fired for this link; reset to 0 when the request fits
         * under the limit again.
         */
        int                             rrl_exceeded;
};
136
/*
 * One entry of a name <-> numeric value lookup table.  Tables built from
 * this type are terminated by an entry whose d_name is NULL.
 */
struct dict {
        const char      *d_name;        /* textual name */
        int             d_value;        /* corresponding numeric constant */
};
141
/* Names of rule subject types; NULL-terminated. */
static struct dict subjectnames[] = {
        { "process", RCTL_SUBJECT_TYPE_PROCESS },
        { "user", RCTL_SUBJECT_TYPE_USER },
        { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
        { "jail", RCTL_SUBJECT_TYPE_JAIL },
        { NULL, -1 }};
148
/* Names of the accounted resources (RACCT_* constants); NULL-terminated. */
static struct dict resourcenames[] = {
        { "cputime", RACCT_CPU },
        { "datasize", RACCT_DATA },
        { "stacksize", RACCT_STACK },
        { "coredumpsize", RACCT_CORE },
        { "memoryuse", RACCT_RSS },
        { "memorylocked", RACCT_MEMLOCK },
        { "maxproc", RACCT_NPROC },
        { "openfiles", RACCT_NOFILE },
        { "vmemoryuse", RACCT_VMEM },
        { "pseudoterminals", RACCT_NPTS },
        { "swapuse", RACCT_SWAP },
        { "nthr", RACCT_NTHR },
        { "msgqqueued", RACCT_MSGQQUEUED },
        { "msgqsize", RACCT_MSGQSIZE },
        { "nmsgq", RACCT_NMSGQ },
        { "nsem", RACCT_NSEM },
        { "nsemop", RACCT_NSEMOP },
        { "nshm", RACCT_NSHM },
        { "shmsize", RACCT_SHMSIZE },
        { "wallclock", RACCT_WALLCLOCK },
        { "pcpu", RACCT_PCTCPU },
        { "readbps", RACCT_READBPS },
        { "writebps", RACCT_WRITEBPS },
        { "readiops", RACCT_READIOPS },
        { "writeiops", RACCT_WRITEIOPS },
        { NULL, -1 }};
176
/*
 * Names of rule actions; NULL-terminated.  The "sig*" entries are the
 * signal-delivery actions; rctl_enforce() passes their values straight
 * to kern_psignal().
 */
static struct dict actionnames[] = {
        { "sighup", RCTL_ACTION_SIGHUP },
        { "sigint", RCTL_ACTION_SIGINT },
        { "sigquit", RCTL_ACTION_SIGQUIT },
        { "sigill", RCTL_ACTION_SIGILL },
        { "sigtrap", RCTL_ACTION_SIGTRAP },
        { "sigabrt", RCTL_ACTION_SIGABRT },
        { "sigemt", RCTL_ACTION_SIGEMT },
        { "sigfpe", RCTL_ACTION_SIGFPE },
        { "sigkill", RCTL_ACTION_SIGKILL },
        { "sigbus", RCTL_ACTION_SIGBUS },
        { "sigsegv", RCTL_ACTION_SIGSEGV },
        { "sigsys", RCTL_ACTION_SIGSYS },
        { "sigpipe", RCTL_ACTION_SIGPIPE },
        { "sigalrm", RCTL_ACTION_SIGALRM },
        { "sigterm", RCTL_ACTION_SIGTERM },
        { "sigurg", RCTL_ACTION_SIGURG },
        { "sigstop", RCTL_ACTION_SIGSTOP },
        { "sigtstp", RCTL_ACTION_SIGTSTP },
        { "sigchld", RCTL_ACTION_SIGCHLD },
        { "sigttin", RCTL_ACTION_SIGTTIN },
        { "sigttou", RCTL_ACTION_SIGTTOU },
        { "sigio", RCTL_ACTION_SIGIO },
        { "sigxcpu", RCTL_ACTION_SIGXCPU },
        { "sigxfsz", RCTL_ACTION_SIGXFSZ },
        { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
        { "sigprof", RCTL_ACTION_SIGPROF },
        { "sigwinch", RCTL_ACTION_SIGWINCH },
        { "siginfo", RCTL_ACTION_SIGINFO },
        { "sigusr1", RCTL_ACTION_SIGUSR1 },
        { "sigusr2", RCTL_ACTION_SIGUSR2 },
        { "sigthr", RCTL_ACTION_SIGTHR },
        { "deny", RCTL_ACTION_DENY },
        { "log", RCTL_ACTION_LOG },
        { "devctl", RCTL_ACTION_DEVCTL },
        { "throttle", RCTL_ACTION_THROTTLE },
        { NULL, -1 }};
214
/* Subsystem initialization routine, registered via SYSINIT below. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/*
 * UMA zones.
 * NOTE(review): presumably used to allocate struct rctl_rule and
 * struct rctl_rule_link respectively - their users are outside this view.
 */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers defined later in the file. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* malloc(9) type used for this file's temporary allocations. */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
225
226 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
227 {
228         int error, val = rctl_throttle_min;
229
230         error = sysctl_handle_int(oidp, &val, 0, req);
231         if (error || !req->newptr)
232                 return (error);
233         if (val < 1 || val > rctl_throttle_max)
234                 return (EINVAL);
235
236         RACCT_LOCK();
237         rctl_throttle_min = val;
238         RACCT_UNLOCK();
239
240         return (0);
241 }
242
243 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
244 {
245         int error, val = rctl_throttle_max;
246
247         error = sysctl_handle_int(oidp, &val, 0, req);
248         if (error || !req->newptr)
249                 return (error);
250         if (val < rctl_throttle_min)
251                 return (EINVAL);
252
253         RACCT_LOCK();
254         rctl_throttle_max = val;
255         RACCT_UNLOCK();
256
257         return (0);
258 }
259
260 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
261 {
262         int error, val = rctl_throttle_pct;
263
264         error = sysctl_handle_int(oidp, &val, 0, req);
265         if (error || !req->newptr)
266                 return (error);
267         if (val < 0)
268                 return (EINVAL);
269
270         RACCT_LOCK();
271         rctl_throttle_pct = val;
272         RACCT_UNLOCK();
273
274         return (0);
275 }
276
277 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
278 {
279         int error, val = rctl_throttle_pct2;
280
281         error = sysctl_handle_int(oidp, &val, 0, req);
282         if (error || !req->newptr)
283                 return (error);
284         if (val < 0)
285                 return (EINVAL);
286
287         RACCT_LOCK();
288         rctl_throttle_pct2 = val;
289         RACCT_UNLOCK();
290
291         return (0);
292 }
293
294 static const char *
295 rctl_subject_type_name(int subject)
296 {
297         int i;
298
299         for (i = 0; subjectnames[i].d_name != NULL; i++) {
300                 if (subjectnames[i].d_value == subject)
301                         return (subjectnames[i].d_name);
302         }
303
304         panic("rctl_subject_type_name: unknown subject type %d", subject);
305 }
306
307 static const char *
308 rctl_action_name(int action)
309 {
310         int i;
311
312         for (i = 0; actionnames[i].d_name != NULL; i++) {
313                 if (actionnames[i].d_value == action)
314                         return (actionnames[i].d_name);
315         }
316
317         panic("rctl_action_name: unknown action %d", action);
318 }
319
320 const char *
321 rctl_resource_name(int resource)
322 {
323         int i;
324
325         for (i = 0; resourcenames[i].d_name != NULL; i++) {
326                 if (resourcenames[i].d_value == resource)
327                         return (resourcenames[i].d_name);
328         }
329
330         panic("rctl_resource_name: unknown resource %d", resource);
331 }
332
333 static struct racct *
334 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
335 {
336         struct ucred *cred = p->p_ucred;
337
338         ASSERT_RACCT_ENABLED();
339         RACCT_LOCK_ASSERT();
340
341         switch (rule->rr_per) {
342         case RCTL_SUBJECT_TYPE_PROCESS:
343                 return (p->p_racct);
344         case RCTL_SUBJECT_TYPE_USER:
345                 return (cred->cr_ruidinfo->ui_racct);
346         case RCTL_SUBJECT_TYPE_LOGINCLASS:
347                 return (cred->cr_loginclass->lc_racct);
348         case RCTL_SUBJECT_TYPE_JAIL:
349                 return (cred->cr_prison->pr_prison_racct->prr_racct);
350         default:
351                 panic("%s: unknown per %d", __func__, rule->rr_per);
352         }
353 }
354
355 /*
356  * Return the amount of resource that can be allocated by 'p' before
357  * hitting 'rule'.
358  */
359 static int64_t
360 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
361 {
362         const struct racct *racct;
363         int64_t available;
364
365         ASSERT_RACCT_ENABLED();
366         RACCT_LOCK_ASSERT();
367
368         racct = rctl_proc_rule_to_racct(p, rule);
369         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
370
371         return (available);
372 }
373
374 /*
375  * Called every second for proc, uidinfo, loginclass, and jail containers.
376  * If the limit isn't exceeded, it decreases the usage amount to zero.
377  * Otherwise, it decreases it by the value of the limit.  This way
378  * resource consumption exceeding the limit "carries over" to the next
379  * period.
380  */
381 void
382 rctl_throttle_decay(struct racct *racct, int resource)
383 {
384         struct rctl_rule *rule;
385         struct rctl_rule_link *link;
386         int64_t minavailable;
387
388         ASSERT_RACCT_ENABLED();
389         RACCT_LOCK_ASSERT();
390
391         minavailable = INT64_MAX;
392
393         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
394                 rule = link->rrl_rule;
395
396                 if (rule->rr_resource != resource)
397                         continue;
398                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
399                         continue;
400
401                 if (rule->rr_amount < minavailable)
402                         minavailable = rule->rr_amount;
403         }
404
405         if (racct->r_resources[resource] < minavailable) {
406                 racct->r_resources[resource] = 0;
407         } else {
408                 /*
409                  * Cap utilization counter at ten times the limit.  Otherwise,
410                  * if we changed the rule lowering the allowed amount, it could
411                  * take unreasonably long time for the accumulated resource
412                  * usage to drop.
413                  */
414                 if (racct->r_resources[resource] > minavailable * 10)
415                         racct->r_resources[resource] = minavailable * 10;
416
417                 racct->r_resources[resource] -= minavailable;
418         }
419 }
420
421 /*
422  * Special version of rctl_get_available() for the %CPU resource.
423  * We slightly cheat here and return less than we normally would.
424  */
425 int64_t
426 rctl_pcpu_available(const struct proc *p) {
427         struct rctl_rule *rule;
428         struct rctl_rule_link *link;
429         int64_t available, minavailable, limit;
430
431         ASSERT_RACCT_ENABLED();
432         RACCT_LOCK_ASSERT();
433
434         minavailable = INT64_MAX;
435         limit = 0;
436
437         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
438                 rule = link->rrl_rule;
439                 if (rule->rr_resource != RACCT_PCTCPU)
440                         continue;
441                 if (rule->rr_action != RCTL_ACTION_DENY)
442                         continue;
443                 available = rctl_available_resource(p, rule);
444                 if (available < minavailable) {
445                         minavailable = available;
446                         limit = rule->rr_amount;
447                 }
448         }
449
450         /*
451          * Return slightly less than actual value of the available
452          * %cpu resource.  This makes %cpu throttling more aggressive
453          * and lets us act sooner than the limits are already exceeded.
454          */
455         if (limit != 0) {
456                 if (limit > 2 * RCTL_PCPU_SHIFT)
457                         minavailable -= RCTL_PCPU_SHIFT;
458                 else
459                         minavailable -= (limit / 2);
460         }
461
462         return (minavailable);
463 }
464
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

        /* The sum wraps exactly when b exceeds the room left above a. */
        if (b > UINT64_MAX - a)
                return (UINT64_MAX);

        return (a + b);
}
480
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if the
 * product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

        /* Multiplying by zero can never overflow (and avoids dividing by 0). */
        if (b == 0)
                return (0);

        return (a > UINT64_MAX / b ? UINT64_MAX : a * b);
}
490
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule for 'resource' linked to the process racct is examined.  For
 * each rule that the request would exceed, the rule's action is carried
 * out: "deny" only records that the request must be refused, "log" and
 * "devctl" emit a (rate-limited) message, "throttle" makes the process
 * sleep, and the remaining actions deliver the corresponding signal.
 *
 * The racct lock must be held.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
        /* Rate-limiting state for ppsratecheck(), shared by all callers. */
        static struct timeval log_lasttime, devctl_lasttime;
        static int log_curtime = 0, devctl_curtime = 0;
        struct rctl_rule *rule;
        struct rctl_rule_link *link;
        struct sbuf sb;
        char *buf;
        int64_t available;
        uint64_t sleep_ms, sleep_ratio;
        int should_deny = 0;

        ASSERT_RACCT_ENABLED();
        RACCT_LOCK_ASSERT();

        /*
         * There may be more than one matching rule; go through all of them.
         * Denial should be done last, after logging and sending signals.
         */
        LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
                rule = link->rrl_rule;
                if (rule->rr_resource != resource)
                        continue;

                /*
                 * The request fits under this rule: re-arm the one-shot
                 * actions (log, devctl, signals) for the next time the
                 * rule is exceeded.
                 */
                available = rctl_available_resource(p, rule);
                if (available >= (int64_t)amount) {
                        link->rrl_exceeded = 0;
                        continue;
                }

                switch (rule->rr_action) {
                case RCTL_ACTION_DENY:
                        should_deny = 1;
                        continue;
                case RCTL_ACTION_LOG:
                        /*
                         * If rrl_exceeded != 0, it means we've already
                         * logged a warning for this process.
                         */
                        if (link->rrl_exceeded != 0)
                                continue;

                        /*
                         * If the process state is not fully initialized yet,
                         * we can't access most of the required fields, e.g.
                         * p->p_comm.  This happens when called from fork1().
                         * Ignore this rule for now; it will be processed just
                         * after fork, when called from racct_proc_fork_done().
                         */
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        if (!ppsratecheck(&log_lasttime, &log_curtime,
                            rctl_log_rate_limit))
                                continue;

                        /* M_NOWAIT: the allocation may fail, see below. */
                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_finish(&sb);
                        printf("rctl: rule \"%s\" matched by pid %d "
                            "(%s), uid %d, jail %s\n", sbuf_data(&sb),
                            p->p_pid, p->p_comm, p->p_ucred->cr_uid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_DEVCTL:
                        /* Same one-shot and state checks as for "log". */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
                            rctl_devctl_rate_limit))
                                continue;

                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        sbuf_printf(&sb, "rule=");
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
                            p->p_pid, p->p_ucred->cr_ruid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_finish(&sb);
                        devctl_notify("RCTL", "rule", "matched",
                            sbuf_data(&sb));
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_THROTTLE:
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        /*
                         * A zero limit cannot be used as a divisor below;
                         * apply the longest throttling duration instead.
                         */
                        if (rule->rr_amount == 0) {
                                racct_proc_throttle(p, rctl_throttle_max);
                                continue;
                        }

                        /*
                         * Make the process sleep for a fraction of second
                         * proportional to the ratio of process' resource
                         * utilization compared to the limit.  The point is
                         * to penalize resource hogs: processes that consume
                         * more of the available resources sleep for longer.
                         *
                         * We're trying to defer division until the very end,
                         * to minimize the rounding effects.  The following
                         * calculation could have been written in a clearer
                         * way like this:
                         *
                         * sleep_ms = hz * p->p_racct->r_resources[resource] /
                         *     rule->rr_amount;
                         * sleep_ms *= rctl_throttle_pct / 100;
                         * if (sleep_ms < rctl_throttle_min)
                         *         sleep_ms = rctl_throttle_min;
                         *
                         * NOTE(review): despite the variable name, the value
                         * is scaled by hz and compared against the
                         * throttle_min/throttle_max knobs, whose sysctl
                         * descriptions say "in hz" - confirm the unit.
                         */
                        sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
                        sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
                        /*
                         * The minimum is pre-multiplied by the limit because
                         * sleep_ms has not been divided by it yet.
                         */
                        if (sleep_ms < rctl_throttle_min * rule->rr_amount)
                                sleep_ms = rctl_throttle_min * rule->rr_amount;

                        /*
                         * Multiply that by the ratio of the resource
                         * consumption for the container compared to the limit,
                         * squared.  In other words, a process in a container
                         * that is two times over the limit will be throttled
                         * four times as much for hitting the same rule.  The
                         * point is to penalize processes more if the container
                         * itself (eg certain UID or jail) is above the limit.
                         */
                        if (available < 0)
                                sleep_ratio = -available / rule->rr_amount;
                        else
                                sleep_ratio = 0;
                        sleep_ratio = xmul(sleep_ratio, sleep_ratio);
                        sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
                        sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

                        /*
                         * Finally the division.
                         */
                        sleep_ms /= rule->rr_amount;

                        if (sleep_ms > rctl_throttle_max)
                                sleep_ms = rctl_throttle_max;
#if 0
                        printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
                           __func__, p->p_pid, p->p_comm,
                           p->p_racct->r_resources[resource],
                           rule->rr_amount, (uintmax_t)sleep_ms,
                           (uintmax_t)sleep_ratio, (intmax_t)available);
#endif

                        KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
                            __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
                        racct_proc_throttle(p, sleep_ms);
                        continue;
                default:
                        /* Everything else is a signal-delivery action. */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        KASSERT(rule->rr_action > 0 &&
                            rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
                            ("rctl_enforce: unknown action %d",
                             rule->rr_action));

                        /*
                         * We're using the fact that RCTL_ACTION_SIG* values
                         * are equal to their counterparts from sys/signal.h.
                         */
                        kern_psignal(p, rule->rr_action);
                        link->rrl_exceeded = 1;
                        continue;
                }
        }

        if (should_deny) {
                /*
                 * Return fake error code; the caller should change it
                 * into one proper for the situation - EFSIZ, ENOMEM etc.
                 */
                return (EDOOFUS);
        }

        return (0);
}
699
700 uint64_t
701 rctl_get_limit(struct proc *p, int resource)
702 {
703         struct rctl_rule *rule;
704         struct rctl_rule_link *link;
705         uint64_t amount = UINT64_MAX;
706
707         ASSERT_RACCT_ENABLED();
708         RACCT_LOCK_ASSERT();
709
710         /*
711          * There may be more than one matching rule; go through all of them.
712          * Denial should be done last, after logging and sending signals.
713          */
714         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
715                 rule = link->rrl_rule;
716                 if (rule->rr_resource != resource)
717                         continue;
718                 if (rule->rr_action != RCTL_ACTION_DENY)
719                         continue;
720                 if (rule->rr_amount < amount)
721                         amount = rule->rr_amount;
722         }
723
724         return (amount);
725 }
726
727 uint64_t
728 rctl_get_available(struct proc *p, int resource)
729 {
730         struct rctl_rule *rule;
731         struct rctl_rule_link *link;
732         int64_t available, minavailable, allocated;
733
734         minavailable = INT64_MAX;
735
736         ASSERT_RACCT_ENABLED();
737         RACCT_LOCK_ASSERT();
738
739         /*
740          * There may be more than one matching rule; go through all of them.
741          * Denial should be done last, after logging and sending signals.
742          */
743         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
744                 rule = link->rrl_rule;
745                 if (rule->rr_resource != resource)
746                         continue;
747                 if (rule->rr_action != RCTL_ACTION_DENY)
748                         continue;
749                 available = rctl_available_resource(p, rule);
750                 if (available < minavailable)
751                         minavailable = available;
752         }
753
754         /*
755          * XXX: Think about this _hard_.
756          */
757         allocated = p->p_racct->r_resources[resource];
758         if (minavailable < INT64_MAX - allocated)
759                 minavailable += allocated;
760         if (minavailable < 0)
761                 minavailable = 0;
762
763         return (minavailable);
764 }
765
766 static int
767 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
768 {
769
770         ASSERT_RACCT_ENABLED();
771
772         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
773                 if (rule->rr_subject_type != filter->rr_subject_type)
774                         return (0);
775
776                 switch (filter->rr_subject_type) {
777                 case RCTL_SUBJECT_TYPE_PROCESS:
778                         if (filter->rr_subject.rs_proc != NULL &&
779                             rule->rr_subject.rs_proc !=
780                             filter->rr_subject.rs_proc)
781                                 return (0);
782                         break;
783                 case RCTL_SUBJECT_TYPE_USER:
784                         if (filter->rr_subject.rs_uip != NULL &&
785                             rule->rr_subject.rs_uip !=
786                             filter->rr_subject.rs_uip)
787                                 return (0);
788                         break;
789                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
790                         if (filter->rr_subject.rs_loginclass != NULL &&
791                             rule->rr_subject.rs_loginclass !=
792                             filter->rr_subject.rs_loginclass)
793                                 return (0);
794                         break;
795                 case RCTL_SUBJECT_TYPE_JAIL:
796                         if (filter->rr_subject.rs_prison_racct != NULL &&
797                             rule->rr_subject.rs_prison_racct !=
798                             filter->rr_subject.rs_prison_racct)
799                                 return (0);
800                         break;
801                 default:
802                         panic("rctl_rule_matches: unknown subject type %d",
803                             filter->rr_subject_type);
804                 }
805         }
806
807         if (filter->rr_resource != RACCT_UNDEFINED) {
808                 if (rule->rr_resource != filter->rr_resource)
809                         return (0);
810         }
811
812         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
813                 if (rule->rr_action != filter->rr_action)
814                         return (0);
815         }
816
817         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
818                 if (rule->rr_amount != filter->rr_amount)
819                         return (0);
820         }
821
822         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
823                 if (rule->rr_per != filter->rr_per)
824                         return (0);
825         }
826
827         return (1);
828 }
829
830 static int
831 str2value(const char *str, int *value, struct dict *table)
832 {
833         int i;
834
835         if (value == NULL)
836                 return (EINVAL);
837
838         for (i = 0; table[i].d_name != NULL; i++) {
839                 if (strcasecmp(table[i].d_name, str) == 0) {
840                         *value =  table[i].d_value;
841                         return (0);
842                 }
843         }
844
845         return (EINVAL);
846 }
847
848 static int
849 str2id(const char *str, id_t *value)
850 {
851         char *end;
852
853         if (str == NULL)
854                 return (EINVAL);
855
856         *value = strtoul(str, &end, 10);
857         if ((size_t)(end - str) != strlen(str))
858                 return (EINVAL);
859
860         return (0);
861 }
862
/*
 * Convert a decimal string to a non-negative int64_t.  The whole string
 * must be consumed by the conversion.  Returns 0 on success, EINVAL for
 * a NULL string or trailing garbage, and ERANGE when the converted
 * value is negative once stored as int64_t.
 */
static int
str2int64(const char *str, int64_t *value)
{
        char *endp;

        if (str == NULL)
                return (EINVAL);

        *value = strtoul(str, &endp, 10);

        /* endp stops at the first unparsed character; it must be the NUL. */
        if (*endp != '\0')
                return (EINVAL);

        return (*value < 0 ? ERANGE : 0);
}
880
881 /*
882  * Connect the rule to the racct, increasing refcount for the rule.
883  */
884 static void
885 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
886 {
887         struct rctl_rule_link *link;
888
889         ASSERT_RACCT_ENABLED();
890         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
891
892         rctl_rule_acquire(rule);
893         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
894         link->rrl_rule = rule;
895         link->rrl_exceeded = 0;
896
897         RACCT_LOCK();
898         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
899         RACCT_UNLOCK();
900 }
901
902 static int
903 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
904 {
905         struct rctl_rule_link *link;
906
907         ASSERT_RACCT_ENABLED();
908         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
909         RACCT_LOCK_ASSERT();
910
911         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
912         if (link == NULL)
913                 return (ENOMEM);
914         rctl_rule_acquire(rule);
915         link->rrl_rule = rule;
916         link->rrl_exceeded = 0;
917
918         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
919
920         return (0);
921 }
922
923 /*
924  * Remove limits for a rules matching the filter and release
925  * the refcounts for the rules, possibly freeing them.  Returns
926  * the number of limit structures removed.
927  */
928 static int
929 rctl_racct_remove_rules(struct racct *racct,
930     const struct rctl_rule *filter)
931 {
932         struct rctl_rule_link *link, *linktmp;
933         int removed = 0;
934
935         ASSERT_RACCT_ENABLED();
936         RACCT_LOCK_ASSERT();
937
938         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
939                 if (!rctl_rule_matches(link->rrl_rule, filter))
940                         continue;
941
942                 LIST_REMOVE(link, rrl_next);
943                 rctl_rule_release(link->rrl_rule);
944                 uma_zfree(rctl_rule_link_zone, link);
945                 removed++;
946         }
947         return (removed);
948 }
949
950 static void
951 rctl_rule_acquire_subject(struct rctl_rule *rule)
952 {
953
954         ASSERT_RACCT_ENABLED();
955
956         switch (rule->rr_subject_type) {
957         case RCTL_SUBJECT_TYPE_UNDEFINED:
958         case RCTL_SUBJECT_TYPE_PROCESS:
959                 break;
960         case RCTL_SUBJECT_TYPE_JAIL:
961                 if (rule->rr_subject.rs_prison_racct != NULL)
962                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
963                 break;
964         case RCTL_SUBJECT_TYPE_USER:
965                 if (rule->rr_subject.rs_uip != NULL)
966                         uihold(rule->rr_subject.rs_uip);
967                 break;
968         case RCTL_SUBJECT_TYPE_LOGINCLASS:
969                 if (rule->rr_subject.rs_loginclass != NULL)
970                         loginclass_hold(rule->rr_subject.rs_loginclass);
971                 break;
972         default:
973                 panic("rctl_rule_acquire_subject: unknown subject type %d",
974                     rule->rr_subject_type);
975         }
976 }
977
978 static void
979 rctl_rule_release_subject(struct rctl_rule *rule)
980 {
981
982         ASSERT_RACCT_ENABLED();
983
984         switch (rule->rr_subject_type) {
985         case RCTL_SUBJECT_TYPE_UNDEFINED:
986         case RCTL_SUBJECT_TYPE_PROCESS:
987                 break;
988         case RCTL_SUBJECT_TYPE_JAIL:
989                 if (rule->rr_subject.rs_prison_racct != NULL)
990                         prison_racct_free(rule->rr_subject.rs_prison_racct);
991                 break;
992         case RCTL_SUBJECT_TYPE_USER:
993                 if (rule->rr_subject.rs_uip != NULL)
994                         uifree(rule->rr_subject.rs_uip);
995                 break;
996         case RCTL_SUBJECT_TYPE_LOGINCLASS:
997                 if (rule->rr_subject.rs_loginclass != NULL)
998                         loginclass_free(rule->rr_subject.rs_loginclass);
999                 break;
1000         default:
1001                 panic("rctl_rule_release_subject: unknown subject type %d",
1002                     rule->rr_subject_type);
1003         }
1004 }
1005
1006 struct rctl_rule *
1007 rctl_rule_alloc(int flags)
1008 {
1009         struct rctl_rule *rule;
1010
1011         ASSERT_RACCT_ENABLED();
1012
1013         rule = uma_zalloc(rctl_rule_zone, flags);
1014         if (rule == NULL)
1015                 return (NULL);
1016         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1017         rule->rr_subject.rs_proc = NULL;
1018         rule->rr_subject.rs_uip = NULL;
1019         rule->rr_subject.rs_loginclass = NULL;
1020         rule->rr_subject.rs_prison_racct = NULL;
1021         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1022         rule->rr_resource = RACCT_UNDEFINED;
1023         rule->rr_action = RCTL_ACTION_UNDEFINED;
1024         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1025         refcount_init(&rule->rr_refcount, 1);
1026
1027         return (rule);
1028 }
1029
1030 struct rctl_rule *
1031 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1032 {
1033         struct rctl_rule *copy;
1034
1035         ASSERT_RACCT_ENABLED();
1036
1037         copy = uma_zalloc(rctl_rule_zone, flags);
1038         if (copy == NULL)
1039                 return (NULL);
1040         copy->rr_subject_type = rule->rr_subject_type;
1041         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1042         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1043         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1044         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1045         copy->rr_per = rule->rr_per;
1046         copy->rr_resource = rule->rr_resource;
1047         copy->rr_action = rule->rr_action;
1048         copy->rr_amount = rule->rr_amount;
1049         refcount_init(&copy->rr_refcount, 1);
1050         rctl_rule_acquire_subject(copy);
1051
1052         return (copy);
1053 }
1054
1055 void
1056 rctl_rule_acquire(struct rctl_rule *rule)
1057 {
1058
1059         ASSERT_RACCT_ENABLED();
1060         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1061
1062         refcount_acquire(&rule->rr_refcount);
1063 }
1064
1065 static void
1066 rctl_rule_free(void *context, int pending)
1067 {
1068         struct rctl_rule *rule;
1069
1070         rule = (struct rctl_rule *)context;
1071
1072         ASSERT_RACCT_ENABLED();
1073         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1074
1075         /*
1076          * We don't need locking here; rule is guaranteed to be inaccessible.
1077          */
1078
1079         rctl_rule_release_subject(rule);
1080         uma_zfree(rctl_rule_zone, rule);
1081 }
1082
1083 void
1084 rctl_rule_release(struct rctl_rule *rule)
1085 {
1086
1087         ASSERT_RACCT_ENABLED();
1088         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1089
1090         if (refcount_release(&rule->rr_refcount)) {
1091                 /*
1092                  * rctl_rule_release() is often called when iterating
1093                  * over all the uidinfo structures in the system,
1094                  * holding uihashtbl_lock.  Since rctl_rule_free()
1095                  * might end up calling uifree(), this would lead
1096                  * to lock recursion.  Use taskqueue to avoid this.
1097                  */
1098                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1099                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1100         }
1101 }
1102
1103 static int
1104 rctl_rule_fully_specified(const struct rctl_rule *rule)
1105 {
1106
1107         ASSERT_RACCT_ENABLED();
1108
1109         switch (rule->rr_subject_type) {
1110         case RCTL_SUBJECT_TYPE_UNDEFINED:
1111                 return (0);
1112         case RCTL_SUBJECT_TYPE_PROCESS:
1113                 if (rule->rr_subject.rs_proc == NULL)
1114                         return (0);
1115                 break;
1116         case RCTL_SUBJECT_TYPE_USER:
1117                 if (rule->rr_subject.rs_uip == NULL)
1118                         return (0);
1119                 break;
1120         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1121                 if (rule->rr_subject.rs_loginclass == NULL)
1122                         return (0);
1123                 break;
1124         case RCTL_SUBJECT_TYPE_JAIL:
1125                 if (rule->rr_subject.rs_prison_racct == NULL)
1126                         return (0);
1127                 break;
1128         default:
1129                 panic("rctl_rule_fully_specified: unknown subject type %d",
1130                     rule->rr_subject_type);
1131         }
1132         if (rule->rr_resource == RACCT_UNDEFINED)
1133                 return (0);
1134         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1135                 return (0);
1136         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1137                 return (0);
1138         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1139                 return (0);
1140
1141         return (1);
1142 }
1143
1144 static int
1145 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1146 {
1147         struct rctl_rule *rule;
1148         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1149              *amountstr, *perstr;
1150         id_t id;
1151         int error = 0;
1152
1153         ASSERT_RACCT_ENABLED();
1154
1155         rule = rctl_rule_alloc(M_WAITOK);
1156
1157         subjectstr = strsep(&rulestr, ":");
1158         subject_idstr = strsep(&rulestr, ":");
1159         resourcestr = strsep(&rulestr, ":");
1160         actionstr = strsep(&rulestr, "=/");
1161         amountstr = strsep(&rulestr, "/");
1162         perstr = rulestr;
1163
1164         if (subjectstr == NULL || subjectstr[0] == '\0')
1165                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1166         else {
1167                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1168                 if (error != 0)
1169                         goto out;
1170         }
1171
1172         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1173                 rule->rr_subject.rs_proc = NULL;
1174                 rule->rr_subject.rs_uip = NULL;
1175                 rule->rr_subject.rs_loginclass = NULL;
1176                 rule->rr_subject.rs_prison_racct = NULL;
1177         } else {
1178                 switch (rule->rr_subject_type) {
1179                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1180                         error = EINVAL;
1181                         goto out;
1182                 case RCTL_SUBJECT_TYPE_PROCESS:
1183                         error = str2id(subject_idstr, &id);
1184                         if (error != 0)
1185                                 goto out;
1186                         sx_assert(&allproc_lock, SA_LOCKED);
1187                         rule->rr_subject.rs_proc = pfind(id);
1188                         if (rule->rr_subject.rs_proc == NULL) {
1189                                 error = ESRCH;
1190                                 goto out;
1191                         }
1192                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1193                         break;
1194                 case RCTL_SUBJECT_TYPE_USER:
1195                         error = str2id(subject_idstr, &id);
1196                         if (error != 0)
1197                                 goto out;
1198                         rule->rr_subject.rs_uip = uifind(id);
1199                         break;
1200                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1201                         rule->rr_subject.rs_loginclass =
1202                             loginclass_find(subject_idstr);
1203                         if (rule->rr_subject.rs_loginclass == NULL) {
1204                                 error = ENAMETOOLONG;
1205                                 goto out;
1206                         }
1207                         break;
1208                 case RCTL_SUBJECT_TYPE_JAIL:
1209                         rule->rr_subject.rs_prison_racct =
1210                             prison_racct_find(subject_idstr);
1211                         if (rule->rr_subject.rs_prison_racct == NULL) {
1212                                 error = ENAMETOOLONG;
1213                                 goto out;
1214                         }
1215                         break;
1216                default:
1217                        panic("rctl_string_to_rule: unknown subject type %d",
1218                            rule->rr_subject_type);
1219                }
1220         }
1221
1222         if (resourcestr == NULL || resourcestr[0] == '\0')
1223                 rule->rr_resource = RACCT_UNDEFINED;
1224         else {
1225                 error = str2value(resourcestr, &rule->rr_resource,
1226                     resourcenames);
1227                 if (error != 0)
1228                         goto out;
1229         }
1230
1231         if (actionstr == NULL || actionstr[0] == '\0')
1232                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1233         else {
1234                 error = str2value(actionstr, &rule->rr_action, actionnames);
1235                 if (error != 0)
1236                         goto out;
1237         }
1238
1239         if (amountstr == NULL || amountstr[0] == '\0')
1240                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1241         else {
1242                 error = str2int64(amountstr, &rule->rr_amount);
1243                 if (error != 0)
1244                         goto out;
1245                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1246                         if (rule->rr_amount > INT64_MAX / 1000000) {
1247                                 error = ERANGE;
1248                                 goto out;
1249                         }
1250                         rule->rr_amount *= 1000000;
1251                 }
1252         }
1253
1254         if (perstr == NULL || perstr[0] == '\0')
1255                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1256         else {
1257                 error = str2value(perstr, &rule->rr_per, subjectnames);
1258                 if (error != 0)
1259                         goto out;
1260         }
1261
1262 out:
1263         if (error == 0)
1264                 *rulep = rule;
1265         else
1266                 rctl_rule_release(rule);
1267
1268         return (error);
1269 }
1270
1271 /*
1272  * Link a rule with all the subjects it applies to.
1273  */
1274 int
1275 rctl_rule_add(struct rctl_rule *rule)
1276 {
1277         struct proc *p;
1278         struct ucred *cred;
1279         struct uidinfo *uip;
1280         struct prison *pr;
1281         struct prison_racct *prr;
1282         struct loginclass *lc;
1283         struct rctl_rule *rule2;
1284         int match;
1285
1286         ASSERT_RACCT_ENABLED();
1287         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1288
1289         /*
1290          * Some rules just don't make sense, like "deny" rule for an undeniable
1291          * resource.  The exception are the RSS and %CPU resources - they are
1292          * not deniable in the racct sense, but the limit is enforced in
1293          * a different way.
1294          */
1295         if (rule->rr_action == RCTL_ACTION_DENY &&
1296             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1297             rule->rr_resource != RACCT_RSS &&
1298             rule->rr_resource != RACCT_PCTCPU) {
1299                 return (EOPNOTSUPP);
1300         }
1301
1302         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1303             !RACCT_IS_DECAYING(rule->rr_resource)) {
1304                 return (EOPNOTSUPP);
1305         }
1306
1307         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1308             rule->rr_resource == RACCT_PCTCPU) {
1309                 return (EOPNOTSUPP);
1310         }
1311
1312         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1313             RACCT_IS_SLOPPY(rule->rr_resource)) {
1314                 return (EOPNOTSUPP);
1315         }
1316
1317         /*
1318          * Make sure there are no duplicated rules.  Also, for the "deny"
1319          * rules, remove ones differing only by "amount".
1320          */
1321         if (rule->rr_action == RCTL_ACTION_DENY) {
1322                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1323                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1324                 rctl_rule_remove(rule2);
1325                 rctl_rule_release(rule2);
1326         } else
1327                 rctl_rule_remove(rule);
1328
1329         switch (rule->rr_subject_type) {
1330         case RCTL_SUBJECT_TYPE_PROCESS:
1331                 p = rule->rr_subject.rs_proc;
1332                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1333
1334                 rctl_racct_add_rule(p->p_racct, rule);
1335                 /*
1336                  * In case of per-process rule, we don't have anything more
1337                  * to do.
1338                  */
1339                 return (0);
1340
1341         case RCTL_SUBJECT_TYPE_USER:
1342                 uip = rule->rr_subject.rs_uip;
1343                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1344                 rctl_racct_add_rule(uip->ui_racct, rule);
1345                 break;
1346
1347         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1348                 lc = rule->rr_subject.rs_loginclass;
1349                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1350                 rctl_racct_add_rule(lc->lc_racct, rule);
1351                 break;
1352
1353         case RCTL_SUBJECT_TYPE_JAIL:
1354                 prr = rule->rr_subject.rs_prison_racct;
1355                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1356                 rctl_racct_add_rule(prr->prr_racct, rule);
1357                 break;
1358
1359         default:
1360                 panic("rctl_rule_add: unknown subject type %d",
1361                     rule->rr_subject_type);
1362         }
1363
1364         /*
1365          * Now go through all the processes and add the new rule to the ones
1366          * it applies to.
1367          */
1368         sx_assert(&allproc_lock, SA_LOCKED);
1369         FOREACH_PROC_IN_SYSTEM(p) {
1370                 cred = p->p_ucred;
1371                 switch (rule->rr_subject_type) {
1372                 case RCTL_SUBJECT_TYPE_USER:
1373                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1374                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1375                                 break;
1376                         continue;
1377                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1378                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1379                                 break;
1380                         continue;
1381                 case RCTL_SUBJECT_TYPE_JAIL:
1382                         match = 0;
1383                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1384                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1385                                         match = 1;
1386                                         break;
1387                                 }
1388                         }
1389                         if (match)
1390                                 break;
1391                         continue;
1392                 default:
1393                         panic("rctl_rule_add: unknown subject type %d",
1394                             rule->rr_subject_type);
1395                 }
1396
1397                 rctl_racct_add_rule(p->p_racct, rule);
1398         }
1399
1400         return (0);
1401 }
1402
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by
 * rctl_rule_remove(): takes RACCT_LOCK.
 */
static void
rctl_rule_pre_callback(void)
{

        RACCT_LOCK();
}
1409
/*
 * "Post" hook handed to the *_racct_foreach() iterators by
 * rctl_rule_remove(): drops RACCT_LOCK.
 */
static void
rctl_rule_post_callback(void)
{

        RACCT_UNLOCK();
}
1416
/*
 * Per-racct callback for rctl_rule_remove().  'arg2' is the filter rule
 * and 'arg3' points at an int counter that is increased by the number
 * of rules removed from 'racct'.  Runs with RACCT_LOCK held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
        const struct rctl_rule *filter = arg2;
        int *foundp = arg3;

        ASSERT_RACCT_ENABLED();
        RACCT_LOCK_ASSERT();

        *foundp += rctl_racct_remove_rules(racct, filter);
}
1430
1431 /*
1432  * Remove all rules that match the filter.
1433  */
1434 int
1435 rctl_rule_remove(struct rctl_rule *filter)
1436 {
1437         struct proc *p;
1438         int found = 0;
1439
1440         ASSERT_RACCT_ENABLED();
1441
1442         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1443             filter->rr_subject.rs_proc != NULL) {
1444                 p = filter->rr_subject.rs_proc;
1445                 RACCT_LOCK();
1446                 found = rctl_racct_remove_rules(p->p_racct, filter);
1447                 RACCT_UNLOCK();
1448                 if (found)
1449                         return (0);
1450                 return (ESRCH);
1451         }
1452
1453         loginclass_racct_foreach(rctl_rule_remove_callback,
1454             rctl_rule_pre_callback, rctl_rule_post_callback,
1455             filter, (void *)&found);
1456         ui_racct_foreach(rctl_rule_remove_callback,
1457             rctl_rule_pre_callback, rctl_rule_post_callback,
1458             filter, (void *)&found);
1459         prison_racct_foreach(rctl_rule_remove_callback,
1460             rctl_rule_pre_callback, rctl_rule_post_callback,
1461             filter, (void *)&found);
1462
1463         sx_assert(&allproc_lock, SA_LOCKED);
1464         RACCT_LOCK();
1465         FOREACH_PROC_IN_SYSTEM(p) {
1466                 found += rctl_racct_remove_rules(p->p_racct, filter);
1467         }
1468         RACCT_UNLOCK();
1469
1470         if (found)
1471                 return (0);
1472         return (ESRCH);
1473 }
1474
1475 /*
1476  * Appends a rule to the sbuf.
1477  */
1478 static void
1479 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1480 {
1481         int64_t amount;
1482
1483         ASSERT_RACCT_ENABLED();
1484
1485         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1486
1487         switch (rule->rr_subject_type) {
1488         case RCTL_SUBJECT_TYPE_PROCESS:
1489                 if (rule->rr_subject.rs_proc == NULL)
1490                         sbuf_printf(sb, ":");
1491                 else
1492                         sbuf_printf(sb, "%d:",
1493                             rule->rr_subject.rs_proc->p_pid);
1494                 break;
1495         case RCTL_SUBJECT_TYPE_USER:
1496                 if (rule->rr_subject.rs_uip == NULL)
1497                         sbuf_printf(sb, ":");
1498                 else
1499                         sbuf_printf(sb, "%d:",
1500                             rule->rr_subject.rs_uip->ui_uid);
1501                 break;
1502         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1503                 if (rule->rr_subject.rs_loginclass == NULL)
1504                         sbuf_printf(sb, ":");
1505                 else
1506                         sbuf_printf(sb, "%s:",
1507                             rule->rr_subject.rs_loginclass->lc_name);
1508                 break;
1509         case RCTL_SUBJECT_TYPE_JAIL:
1510                 if (rule->rr_subject.rs_prison_racct == NULL)
1511                         sbuf_printf(sb, ":");
1512                 else
1513                         sbuf_printf(sb, "%s:",
1514                             rule->rr_subject.rs_prison_racct->prr_name);
1515                 break;
1516         default:
1517                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1518                     rule->rr_subject_type);
1519         }
1520
1521         amount = rule->rr_amount;
1522         if (amount != RCTL_AMOUNT_UNDEFINED &&
1523             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1524                 amount /= 1000000;
1525
1526         sbuf_printf(sb, "%s:%s=%jd",
1527             rctl_resource_name(rule->rr_resource),
1528             rctl_action_name(rule->rr_action),
1529             amount);
1530
1531         if (rule->rr_per != rule->rr_subject_type)
1532                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1533 }
1534
1535 /*
1536  * Routine used by RCTL syscalls to read in input string.
1537  */
1538 static int
1539 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1540 {
1541         char *str;
1542         int error;
1543
1544         ASSERT_RACCT_ENABLED();
1545
1546         if (inbuflen <= 0)
1547                 return (EINVAL);
1548         if (inbuflen > RCTL_MAX_INBUFSIZE)
1549                 return (E2BIG);
1550
1551         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1552         error = copyinstr(inbufp, str, inbuflen, NULL);
1553         if (error != 0) {
1554                 free(str, M_RCTL);
1555                 return (error);
1556         }
1557
1558         *inputstr = str;
1559
1560         return (0);
1561 }
1562
1563 /*
1564  * Routine used by RCTL syscalls to write out output string.
1565  */
1566 static int
1567 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1568 {
1569         int error;
1570
1571         ASSERT_RACCT_ENABLED();
1572
1573         if (outputsbuf == NULL)
1574                 return (0);
1575
1576         sbuf_finish(outputsbuf);
1577         if (outbuflen < sbuf_len(outputsbuf) + 1) {
1578                 sbuf_delete(outputsbuf);
1579                 return (ERANGE);
1580         }
1581         error = copyout(sbuf_data(outputsbuf), outbufp,
1582             sbuf_len(outputsbuf) + 1);
1583         sbuf_delete(outputsbuf);
1584         return (error);
1585 }
1586
1587 static struct sbuf *
1588 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1589 {
1590         struct sbuf *sb;
1591         int64_t amount;
1592         int i;
1593
1594         ASSERT_RACCT_ENABLED();
1595
1596         sb = sbuf_new_auto();
1597         for (i = 0; i <= RACCT_MAX; i++) {
1598                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1599                         continue;
1600                 RACCT_LOCK();
1601                 amount = racct->r_resources[i];
1602                 RACCT_UNLOCK();
1603                 if (RACCT_IS_IN_MILLIONS(i))
1604                         amount /= 1000000;
1605                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1606         }
1607         sbuf_setpos(sb, sbuf_len(sb) - 1);
1608         return (sb);
1609 }
1610
1611 int
1612 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1613 {
1614         struct rctl_rule *filter;
1615         struct sbuf *outputsbuf = NULL;
1616         struct proc *p;
1617         struct uidinfo *uip;
1618         struct loginclass *lc;
1619         struct prison_racct *prr;
1620         char *inputstr;
1621         int error;
1622
1623         if (!racct_enable)
1624                 return (ENOSYS);
1625
1626         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1627         if (error != 0)
1628                 return (error);
1629
1630         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1631         if (error != 0)
1632                 return (error);
1633
1634         sx_slock(&allproc_lock);
1635         error = rctl_string_to_rule(inputstr, &filter);
1636         free(inputstr, M_RCTL);
1637         if (error != 0) {
1638                 sx_sunlock(&allproc_lock);
1639                 return (error);
1640         }
1641
1642         switch (filter->rr_subject_type) {
1643         case RCTL_SUBJECT_TYPE_PROCESS:
1644                 p = filter->rr_subject.rs_proc;
1645                 if (p == NULL) {
1646                         error = EINVAL;
1647                         goto out;
1648                 }
1649                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1650                 break;
1651         case RCTL_SUBJECT_TYPE_USER:
1652                 uip = filter->rr_subject.rs_uip;
1653                 if (uip == NULL) {
1654                         error = EINVAL;
1655                         goto out;
1656                 }
1657                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1658                 break;
1659         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1660                 lc = filter->rr_subject.rs_loginclass;
1661                 if (lc == NULL) {
1662                         error = EINVAL;
1663                         goto out;
1664                 }
1665                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1666                 break;
1667         case RCTL_SUBJECT_TYPE_JAIL:
1668                 prr = filter->rr_subject.rs_prison_racct;
1669                 if (prr == NULL) {
1670                         error = EINVAL;
1671                         goto out;
1672                 }
1673                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1674                 break;
1675         default:
1676                 error = EINVAL;
1677         }
1678 out:
1679         rctl_rule_release(filter);
1680         sx_sunlock(&allproc_lock);
1681         if (error != 0)
1682                 return (error);
1683
1684         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1685
1686         return (error);
1687 }
1688
1689 static void
1690 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1691 {
1692         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1693         struct rctl_rule_link *link;
1694         struct sbuf *sb = (struct sbuf *)arg3;
1695
1696         ASSERT_RACCT_ENABLED();
1697         RACCT_LOCK_ASSERT();
1698
1699         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1700                 if (!rctl_rule_matches(link->rrl_rule, filter))
1701                         continue;
1702                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1703                 sbuf_printf(sb, ",");
1704         }
1705 }
1706
1707 int
1708 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1709 {
1710         struct sbuf *sb;
1711         struct rctl_rule *filter;
1712         struct rctl_rule_link *link;
1713         struct proc *p;
1714         char *inputstr, *buf;
1715         size_t bufsize;
1716         int error;
1717
1718         if (!racct_enable)
1719                 return (ENOSYS);
1720
1721         error = priv_check(td, PRIV_RCTL_GET_RULES);
1722         if (error != 0)
1723                 return (error);
1724
1725         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1726         if (error != 0)
1727                 return (error);
1728
1729         sx_slock(&allproc_lock);
1730         error = rctl_string_to_rule(inputstr, &filter);
1731         free(inputstr, M_RCTL);
1732         if (error != 0) {
1733                 sx_sunlock(&allproc_lock);
1734                 return (error);
1735         }
1736
1737         bufsize = uap->outbuflen;
1738         if (bufsize > rctl_maxbufsize) {
1739                 sx_sunlock(&allproc_lock);
1740                 return (E2BIG);
1741         }
1742
1743         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1744         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1745         KASSERT(sb != NULL, ("sbuf_new failed"));
1746
1747         FOREACH_PROC_IN_SYSTEM(p) {
1748                 RACCT_LOCK();
1749                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1750                         /*
1751                          * Non-process rules will be added to the buffer later.
1752                          * Adding them here would result in duplicated output.
1753                          */
1754                         if (link->rrl_rule->rr_subject_type !=
1755                             RCTL_SUBJECT_TYPE_PROCESS)
1756                                 continue;
1757                         if (!rctl_rule_matches(link->rrl_rule, filter))
1758                                 continue;
1759                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1760                         sbuf_printf(sb, ",");
1761                 }
1762                 RACCT_UNLOCK();
1763         }
1764
1765         loginclass_racct_foreach(rctl_get_rules_callback,
1766             rctl_rule_pre_callback, rctl_rule_post_callback,
1767             filter, sb);
1768         ui_racct_foreach(rctl_get_rules_callback,
1769             rctl_rule_pre_callback, rctl_rule_post_callback,
1770             filter, sb);
1771         prison_racct_foreach(rctl_get_rules_callback,
1772             rctl_rule_pre_callback, rctl_rule_post_callback,
1773             filter, sb);
1774         if (sbuf_error(sb) == ENOMEM) {
1775                 error = ERANGE;
1776                 goto out;
1777         }
1778
1779         /*
1780          * Remove trailing ",".
1781          */
1782         if (sbuf_len(sb) > 0)
1783                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1784
1785         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1786 out:
1787         rctl_rule_release(filter);
1788         sx_sunlock(&allproc_lock);
1789         free(buf, M_RCTL);
1790         return (error);
1791 }
1792
1793 int
1794 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1795 {
1796         struct sbuf *sb;
1797         struct rctl_rule *filter;
1798         struct rctl_rule_link *link;
1799         char *inputstr, *buf;
1800         size_t bufsize;
1801         int error;
1802
1803         if (!racct_enable)
1804                 return (ENOSYS);
1805
1806         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1807         if (error != 0)
1808                 return (error);
1809
1810         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1811         if (error != 0)
1812                 return (error);
1813
1814         sx_slock(&allproc_lock);
1815         error = rctl_string_to_rule(inputstr, &filter);
1816         free(inputstr, M_RCTL);
1817         if (error != 0) {
1818                 sx_sunlock(&allproc_lock);
1819                 return (error);
1820         }
1821
1822         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1823                 rctl_rule_release(filter);
1824                 sx_sunlock(&allproc_lock);
1825                 return (EINVAL);
1826         }
1827         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1828                 rctl_rule_release(filter);
1829                 sx_sunlock(&allproc_lock);
1830                 return (EOPNOTSUPP);
1831         }
1832         if (filter->rr_subject.rs_proc == NULL) {
1833                 rctl_rule_release(filter);
1834                 sx_sunlock(&allproc_lock);
1835                 return (EINVAL);
1836         }
1837
1838         bufsize = uap->outbuflen;
1839         if (bufsize > rctl_maxbufsize) {
1840                 rctl_rule_release(filter);
1841                 sx_sunlock(&allproc_lock);
1842                 return (E2BIG);
1843         }
1844
1845         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1846         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1847         KASSERT(sb != NULL, ("sbuf_new failed"));
1848
1849         RACCT_LOCK();
1850         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1851             rrl_next) {
1852                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1853                 sbuf_printf(sb, ",");
1854         }
1855         RACCT_UNLOCK();
1856         if (sbuf_error(sb) == ENOMEM) {
1857                 error = ERANGE;
1858                 sbuf_delete(sb);
1859                 goto out;
1860         }
1861
1862         /*
1863          * Remove trailing ",".
1864          */
1865         if (sbuf_len(sb) > 0)
1866                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1867
1868         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1869 out:
1870         rctl_rule_release(filter);
1871         sx_sunlock(&allproc_lock);
1872         free(buf, M_RCTL);
1873         return (error);
1874 }
1875
1876 int
1877 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1878 {
1879         struct rctl_rule *rule;
1880         char *inputstr;
1881         int error;
1882
1883         if (!racct_enable)
1884                 return (ENOSYS);
1885
1886         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1887         if (error != 0)
1888                 return (error);
1889
1890         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1891         if (error != 0)
1892                 return (error);
1893
1894         sx_slock(&allproc_lock);
1895         error = rctl_string_to_rule(inputstr, &rule);
1896         free(inputstr, M_RCTL);
1897         if (error != 0) {
1898                 sx_sunlock(&allproc_lock);
1899                 return (error);
1900         }
1901         /*
1902          * The 'per' part of a rule is optional.
1903          */
1904         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1905             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1906                 rule->rr_per = rule->rr_subject_type;
1907
1908         if (!rctl_rule_fully_specified(rule)) {
1909                 error = EINVAL;
1910                 goto out;
1911         }
1912
1913         error = rctl_rule_add(rule);
1914
1915 out:
1916         rctl_rule_release(rule);
1917         sx_sunlock(&allproc_lock);
1918         return (error);
1919 }
1920
1921 int
1922 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1923 {
1924         struct rctl_rule *filter;
1925         char *inputstr;
1926         int error;
1927
1928         if (!racct_enable)
1929                 return (ENOSYS);
1930
1931         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1932         if (error != 0)
1933                 return (error);
1934
1935         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1936         if (error != 0)
1937                 return (error);
1938
1939         sx_slock(&allproc_lock);
1940         error = rctl_string_to_rule(inputstr, &filter);
1941         free(inputstr, M_RCTL);
1942         if (error != 0) {
1943                 sx_sunlock(&allproc_lock);
1944                 return (error);
1945         }
1946
1947         error = rctl_rule_remove(filter);
1948         rctl_rule_release(filter);
1949         sx_sunlock(&allproc_lock);
1950
1951         return (error);
1952 }
1953
1954 /*
1955  * Update RCTL rule list after credential change.
1956  */
1957 void
1958 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1959 {
1960         LIST_HEAD(, rctl_rule_link) newrules;
1961         struct rctl_rule_link *link, *newlink;
1962         struct uidinfo *newuip;
1963         struct loginclass *newlc;
1964         struct prison_racct *newprr;
1965         int rulecnt, i;
1966
1967         if (!racct_enable)
1968                 return;
1969
1970         PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1971
1972         newuip = newcred->cr_ruidinfo;
1973         newlc = newcred->cr_loginclass;
1974         newprr = newcred->cr_prison->pr_prison_racct;
1975
1976         LIST_INIT(&newrules);
1977
1978 again:
1979         /*
1980          * First, count the rules that apply to the process with new
1981          * credentials.
1982          */
1983         rulecnt = 0;
1984         RACCT_LOCK();
1985         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1986                 if (link->rrl_rule->rr_subject_type ==
1987                     RCTL_SUBJECT_TYPE_PROCESS)
1988                         rulecnt++;
1989         }
1990         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1991                 rulecnt++;
1992         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1993                 rulecnt++;
1994         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1995                 rulecnt++;
1996         RACCT_UNLOCK();
1997
1998         /*
1999          * Create temporary list.  We've dropped the rctl_lock in order
2000          * to use M_WAITOK.
2001          */
2002         for (i = 0; i < rulecnt; i++) {
2003                 newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2004                 newlink->rrl_rule = NULL;
2005                 newlink->rrl_exceeded = 0;
2006                 LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2007         }
2008
2009         newlink = LIST_FIRST(&newrules);
2010
2011         /*
2012          * Assign rules to the newly allocated list entries.
2013          */
2014         RACCT_LOCK();
2015         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2016                 if (link->rrl_rule->rr_subject_type ==
2017                     RCTL_SUBJECT_TYPE_PROCESS) {
2018                         if (newlink == NULL)
2019                                 goto goaround;
2020                         rctl_rule_acquire(link->rrl_rule);
2021                         newlink->rrl_rule = link->rrl_rule;
2022                         newlink->rrl_exceeded = link->rrl_exceeded;
2023                         newlink = LIST_NEXT(newlink, rrl_next);
2024                         rulecnt--;
2025                 }
2026         }
2027
2028         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2029                 if (newlink == NULL)
2030                         goto goaround;
2031                 rctl_rule_acquire(link->rrl_rule);
2032                 newlink->rrl_rule = link->rrl_rule;
2033                 newlink->rrl_exceeded = link->rrl_exceeded;
2034                 newlink = LIST_NEXT(newlink, rrl_next);
2035                 rulecnt--;
2036         }
2037
2038         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2039                 if (newlink == NULL)
2040                         goto goaround;
2041                 rctl_rule_acquire(link->rrl_rule);
2042                 newlink->rrl_rule = link->rrl_rule;
2043                 newlink->rrl_exceeded = link->rrl_exceeded;
2044                 newlink = LIST_NEXT(newlink, rrl_next);
2045                 rulecnt--;
2046         }
2047
2048         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2049                 if (newlink == NULL)
2050                         goto goaround;
2051                 rctl_rule_acquire(link->rrl_rule);
2052                 newlink->rrl_rule = link->rrl_rule;
2053                 newlink->rrl_exceeded = link->rrl_exceeded;
2054                 newlink = LIST_NEXT(newlink, rrl_next);
2055                 rulecnt--;
2056         }
2057
2058         if (rulecnt == 0) {
2059                 /*
2060                  * Free the old rule list.
2061                  */
2062                 while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2063                         link = LIST_FIRST(&p->p_racct->r_rule_links);
2064                         LIST_REMOVE(link, rrl_next);
2065                         rctl_rule_release(link->rrl_rule);
2066                         uma_zfree(rctl_rule_link_zone, link);
2067                 }
2068
2069                 /*
2070                  * Replace lists and we're done.
2071                  *
2072                  * XXX: Is there any way to switch list heads instead
2073                  *      of iterating here?
2074                  */
2075                 while (!LIST_EMPTY(&newrules)) {
2076                         newlink = LIST_FIRST(&newrules);
2077                         LIST_REMOVE(newlink, rrl_next);
2078                         LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2079                             newlink, rrl_next);
2080                 }
2081
2082                 RACCT_UNLOCK();
2083
2084                 return;
2085         }
2086
2087 goaround:
2088         RACCT_UNLOCK();
2089
2090         /*
2091          * Rule list changed while we were not holding the rctl_lock.
2092          * Free the new list and try again.
2093          */
2094         while (!LIST_EMPTY(&newrules)) {
2095                 newlink = LIST_FIRST(&newrules);
2096                 LIST_REMOVE(newlink, rrl_next);
2097                 if (newlink->rrl_rule != NULL)
2098                         rctl_rule_release(newlink->rrl_rule);
2099                 uma_zfree(rctl_rule_link_zone, newlink);
2100         }
2101
2102         goto again;
2103 }
2104
2105 /*
2106  * Assign RCTL rules to the newly created process.
2107  */
2108 int
2109 rctl_proc_fork(struct proc *parent, struct proc *child)
2110 {
2111         struct rctl_rule *rule;
2112         struct rctl_rule_link *link;
2113         int error;
2114
2115         ASSERT_RACCT_ENABLED();
2116         RACCT_LOCK_ASSERT();
2117         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2118
2119         LIST_INIT(&child->p_racct->r_rule_links);
2120
2121         /*
2122          * Go through limits applicable to the parent and assign them
2123          * to the child.  Rules with 'process' subject have to be duplicated
2124          * in order to make their rr_subject point to the new process.
2125          */
2126         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2127                 if (link->rrl_rule->rr_subject_type ==
2128                     RCTL_SUBJECT_TYPE_PROCESS) {
2129                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2130                         if (rule == NULL)
2131                                 goto fail;
2132                         KASSERT(rule->rr_subject.rs_proc == parent,
2133                             ("rule->rr_subject.rs_proc != parent"));
2134                         rule->rr_subject.rs_proc = child;
2135                         error = rctl_racct_add_rule_locked(child->p_racct,
2136                             rule);
2137                         rctl_rule_release(rule);
2138                         if (error != 0)
2139                                 goto fail;
2140                 } else {
2141                         error = rctl_racct_add_rule_locked(child->p_racct,
2142                             link->rrl_rule);
2143                         if (error != 0)
2144                                 goto fail;
2145                 }
2146         }
2147
2148         return (0);
2149
2150 fail:
2151         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2152                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2153                 LIST_REMOVE(link, rrl_next);
2154                 rctl_rule_release(link->rrl_rule);
2155                 uma_zfree(rctl_rule_link_zone, link);
2156         }
2157
2158         return (EAGAIN);
2159 }
2160
2161 /*
2162  * Release rules attached to the racct.
2163  */
2164 void
2165 rctl_racct_release(struct racct *racct)
2166 {
2167         struct rctl_rule_link *link;
2168
2169         ASSERT_RACCT_ENABLED();
2170         RACCT_LOCK_ASSERT();
2171
2172         while (!LIST_EMPTY(&racct->r_rule_links)) {
2173                 link = LIST_FIRST(&racct->r_rule_links);
2174                 LIST_REMOVE(link, rrl_next);
2175                 rctl_rule_release(link->rrl_rule);
2176                 uma_zfree(rctl_rule_link_zone, link);
2177         }
2178 }
2179
2180 static void
2181 rctl_init(void)
2182 {
2183
2184         if (!racct_enable)
2185                 return;
2186
2187         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2188             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2189         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2190             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2191             UMA_ALIGN_PTR, 0);
2192
2193         /*
2194          * Set default values, making sure not to overwrite the ones
2195          * fetched from tunables.  Most of those could be set at the
2196          * declaration, except for the rctl_throttle_max - we cannot
2197          * set it there due to hz not being compile time constant.
2198          */
2199         if (rctl_throttle_min < 1)
2200                 rctl_throttle_min = 1;
2201         if (rctl_throttle_max < rctl_throttle_min)
2202                 rctl_throttle_max = 2 * hz;
2203         if (rctl_throttle_pct < 0)
2204                 rctl_throttle_pct = 100;
2205         if (rctl_throttle_pct2 < 0)
2206                 rctl_throttle_pct2 = 100;
2207 }
2208
2209 #else /* !RCTL */
2210
2211 int
2212 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
2213 {
2214
2215         return (ENOSYS);
2216 }
2217
2218 int
2219 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
2220 {
2221
2222         return (ENOSYS);
2223 }
2224
2225 int
2226 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
2227 {
2228
2229         return (ENOSYS);
2230 }
2231
2232 int
2233 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
2234 {
2235
2236         return (ENOSYS);
2237 }
2238
2239 int
2240 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
2241 {
2242
2243         return (ENOSYS);
2244 }
2245
2246 #endif /* !RCTL */