]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/kern_rctl.c
Import to 0.6.1
[FreeBSD/FreeBSD.git] / sys / kern / kern_rctl.c
1 /*-
2  * Copyright (c) 2010 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by Edward Tomasz Napierala under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * $FreeBSD$
30  */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34
35 #include <sys/param.h>
36 #include <sys/bus.h>
37 #include <sys/malloc.h>
38 #include <sys/queue.h>
39 #include <sys/refcount.h>
40 #include <sys/jail.h>
41 #include <sys/kernel.h>
42 #include <sys/limits.h>
43 #include <sys/loginclass.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/racct.h>
47 #include <sys/rctl.h>
48 #include <sys/resourcevar.h>
49 #include <sys/sx.h>
50 #include <sys/sysent.h>
51 #include <sys/sysproto.h>
52 #include <sys/systm.h>
53 #include <sys/types.h>
54 #include <sys/eventhandler.h>
55 #include <sys/lock.h>
56 #include <sys/mutex.h>
57 #include <sys/rwlock.h>
58 #include <sys/sbuf.h>
59 #include <sys/taskqueue.h>
60 #include <sys/tree.h>
61 #include <vm/uma.h>
62
63 #ifdef RCTL
64 #ifndef RACCT
65 #error "The RCTL option requires the RACCT option"
66 #endif
67
68 FEATURE(rctl, "Resource Limits");
69
#define HRF_DEFAULT             0
#define HRF_DONT_INHERIT        1
#define HRF_DONT_ACCUMULATE     2

/*
 * Buffer size limits.  The arithmetic is parenthesized so that the macros
 * expand as a single value inside larger expressions such as
 * 'len % RCTL_MAX_INBUFSIZE' or 'len / RCTL_MAX_OUTBUFSIZE'.
 */
#define RCTL_MAX_INBUFSIZE      (4 * 1024)
#define RCTL_MAX_OUTBUFSIZE     (16 * 1024 * 1024)
#define RCTL_LOG_BUFSIZE        128

/*
 * Upper bound on the amount subtracted from the available %CPU in
 * rctl_pcpu_available().
 */
#define RCTL_PCPU_SHIFT         (10 * 1000000)
79
/*
 * Knobs exported under kern.racct.rctl.  The two rate limits are passed
 * to ppsratecheck() in rctl_enforce() to bound how often the "log" and
 * "devctl" actions fire.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 * They are updated through the rctl_throttle_*_sysctl() handlers, which
 * store the new value while holding the RCTL write lock.
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;
91
/*
 * Handlers for the throttle_* sysctls; they validate the new value
 * before storing it.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/*
 * The kern.racct.rctl sysctl node and its plain integer knobs.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");

/*
 * Throttling parameters.  Each one is a procedure-backed sysctl paired
 * with a TUNABLE_INT of the same name.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
120
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links hang off the racct's r_rule_links list.
 */
struct rctl_rule_link {
        /* Linkage on the owning racct's r_rule_links list. */
        LIST_ENTRY(rctl_rule_link)      rrl_next;
        /* The rule this link refers to. */
        struct rctl_rule                *rrl_rule;
        /*
         * Set to 1 by rctl_enforce() once a log, devctl or signal action
         * has fired for this link; reset to 0 when the request fits
         * within the limit again.
         */
        int                             rrl_exceeded;
};
131
/*
 * One entry of a name <-> integer value lookup table.  Tables built from
 * these entries are terminated by an entry whose d_name is NULL.
 */
struct dict {
        const char      *d_name;        /* textual name */
        int             d_value;        /* matching numeric constant */
};
136
/*
 * Names of rule subject types, mapped to RCTL_SUBJECT_TYPE_* values.
 * The { NULL, -1 } entry terminates the table.
 */
static struct dict subjectnames[] = {
        { "process", RCTL_SUBJECT_TYPE_PROCESS },
        { "user", RCTL_SUBJECT_TYPE_USER },
        { "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
        { "jail", RCTL_SUBJECT_TYPE_JAIL },
        { NULL, -1 }};
143
/*
 * Names of resources, mapped to RACCT_* resource identifiers.
 * The { NULL, -1 } entry terminates the table.
 */
static struct dict resourcenames[] = {
        { "cputime", RACCT_CPU },
        { "datasize", RACCT_DATA },
        { "stacksize", RACCT_STACK },
        { "coredumpsize", RACCT_CORE },
        { "memoryuse", RACCT_RSS },
        { "memorylocked", RACCT_MEMLOCK },
        { "maxproc", RACCT_NPROC },
        { "openfiles", RACCT_NOFILE },
        { "vmemoryuse", RACCT_VMEM },
        { "pseudoterminals", RACCT_NPTS },
        { "swapuse", RACCT_SWAP },
        { "nthr", RACCT_NTHR },
        { "msgqqueued", RACCT_MSGQQUEUED },
        { "msgqsize", RACCT_MSGQSIZE },
        { "nmsgq", RACCT_NMSGQ },
        { "nsem", RACCT_NSEM },
        { "nsemop", RACCT_NSEMOP },
        { "nshm", RACCT_NSHM },
        { "shmsize", RACCT_SHMSIZE },
        { "wallclock", RACCT_WALLCLOCK },
        { "pcpu", RACCT_PCTCPU },
        { "readbps", RACCT_READBPS },
        { "writebps", RACCT_WRITEBPS },
        { "readiops", RACCT_READIOPS },
        { "writeiops", RACCT_WRITEIOPS },
        { NULL, -1 }};
171
/*
 * Names of rule actions, mapped to RCTL_ACTION_* values: one entry per
 * signal action, followed by the non-signal actions (deny, log, devctl,
 * throttle).  The { NULL, -1 } entry terminates the table.
 */
static struct dict actionnames[] = {
        { "sighup", RCTL_ACTION_SIGHUP },
        { "sigint", RCTL_ACTION_SIGINT },
        { "sigquit", RCTL_ACTION_SIGQUIT },
        { "sigill", RCTL_ACTION_SIGILL },
        { "sigtrap", RCTL_ACTION_SIGTRAP },
        { "sigabrt", RCTL_ACTION_SIGABRT },
        { "sigemt", RCTL_ACTION_SIGEMT },
        { "sigfpe", RCTL_ACTION_SIGFPE },
        { "sigkill", RCTL_ACTION_SIGKILL },
        { "sigbus", RCTL_ACTION_SIGBUS },
        { "sigsegv", RCTL_ACTION_SIGSEGV },
        { "sigsys", RCTL_ACTION_SIGSYS },
        { "sigpipe", RCTL_ACTION_SIGPIPE },
        { "sigalrm", RCTL_ACTION_SIGALRM },
        { "sigterm", RCTL_ACTION_SIGTERM },
        { "sigurg", RCTL_ACTION_SIGURG },
        { "sigstop", RCTL_ACTION_SIGSTOP },
        { "sigtstp", RCTL_ACTION_SIGTSTP },
        { "sigchld", RCTL_ACTION_SIGCHLD },
        { "sigttin", RCTL_ACTION_SIGTTIN },
        { "sigttou", RCTL_ACTION_SIGTTOU },
        { "sigio", RCTL_ACTION_SIGIO },
        { "sigxcpu", RCTL_ACTION_SIGXCPU },
        { "sigxfsz", RCTL_ACTION_SIGXFSZ },
        { "sigvtalrm", RCTL_ACTION_SIGVTALRM },
        { "sigprof", RCTL_ACTION_SIGPROF },
        { "sigwinch", RCTL_ACTION_SIGWINCH },
        { "siginfo", RCTL_ACTION_SIGINFO },
        { "sigusr1", RCTL_ACTION_SIGUSR1 },
        { "sigusr2", RCTL_ACTION_SIGUSR2 },
        { "sigthr", RCTL_ACTION_SIGTHR },
        { "deny", RCTL_ACTION_DENY },
        { "log", RCTL_ACTION_LOG },
        { "devctl", RCTL_ACTION_DEVCTL },
        { "throttle", RCTL_ACTION_THROTTLE },
        { NULL, -1 }};
209
/* rctl_init() is registered to run at SI_SUB_RACCT, SI_ORDER_FIRST. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones, named for rules and rule links respectively. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/*
 * Reader/writer lock used throughout this file: the r_rule_links lists
 * are walked under the read lock, and the throttle tunables are written
 * under the write lock.
 */
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");

#define RCTL_RLOCK()            rw_rlock(&rctl_lock)
#define RCTL_RUNLOCK()          rw_runlock(&rctl_lock)
#define RCTL_WLOCK()            rw_wlock(&rctl_lock)
#define RCTL_WUNLOCK()          rw_wunlock(&rctl_lock)
#define RCTL_LOCK_ASSERT()      rw_assert(&rctl_lock, RA_LOCKED)
#define RCTL_WLOCK_ASSERT()     rw_assert(&rctl_lock, RA_WLOCKED)

static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type used for temporary buffers, e.g. in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
229
230 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
231 {
232         int error, val = rctl_throttle_min;
233
234         error = sysctl_handle_int(oidp, &val, 0, req);
235         if (error || !req->newptr)
236                 return (error);
237         if (val < 1 || val > rctl_throttle_max)
238                 return (EINVAL);
239
240         RCTL_WLOCK();
241         rctl_throttle_min = val;
242         RCTL_WUNLOCK();
243
244         return (0);
245 }
246
247 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
248 {
249         int error, val = rctl_throttle_max;
250
251         error = sysctl_handle_int(oidp, &val, 0, req);
252         if (error || !req->newptr)
253                 return (error);
254         if (val < rctl_throttle_min)
255                 return (EINVAL);
256
257         RCTL_WLOCK();
258         rctl_throttle_max = val;
259         RCTL_WUNLOCK();
260
261         return (0);
262 }
263
264 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
265 {
266         int error, val = rctl_throttle_pct;
267
268         error = sysctl_handle_int(oidp, &val, 0, req);
269         if (error || !req->newptr)
270                 return (error);
271         if (val < 0)
272                 return (EINVAL);
273
274         RCTL_WLOCK();
275         rctl_throttle_pct = val;
276         RCTL_WUNLOCK();
277
278         return (0);
279 }
280
281 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
282 {
283         int error, val = rctl_throttle_pct2;
284
285         error = sysctl_handle_int(oidp, &val, 0, req);
286         if (error || !req->newptr)
287                 return (error);
288         if (val < 0)
289                 return (EINVAL);
290
291         RCTL_WLOCK();
292         rctl_throttle_pct2 = val;
293         RCTL_WUNLOCK();
294
295         return (0);
296 }
297
298 static const char *
299 rctl_subject_type_name(int subject)
300 {
301         int i;
302
303         for (i = 0; subjectnames[i].d_name != NULL; i++) {
304                 if (subjectnames[i].d_value == subject)
305                         return (subjectnames[i].d_name);
306         }
307
308         panic("rctl_subject_type_name: unknown subject type %d", subject);
309 }
310
311 static const char *
312 rctl_action_name(int action)
313 {
314         int i;
315
316         for (i = 0; actionnames[i].d_name != NULL; i++) {
317                 if (actionnames[i].d_value == action)
318                         return (actionnames[i].d_name);
319         }
320
321         panic("rctl_action_name: unknown action %d", action);
322 }
323
324 const char *
325 rctl_resource_name(int resource)
326 {
327         int i;
328
329         for (i = 0; resourcenames[i].d_name != NULL; i++) {
330                 if (resourcenames[i].d_value == resource)
331                         return (resourcenames[i].d_name);
332         }
333
334         panic("rctl_resource_name: unknown resource %d", resource);
335 }
336
337 static struct racct *
338 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
339 {
340         struct ucred *cred = p->p_ucred;
341
342         ASSERT_RACCT_ENABLED();
343         RCTL_LOCK_ASSERT();
344
345         switch (rule->rr_per) {
346         case RCTL_SUBJECT_TYPE_PROCESS:
347                 return (p->p_racct);
348         case RCTL_SUBJECT_TYPE_USER:
349                 return (cred->cr_ruidinfo->ui_racct);
350         case RCTL_SUBJECT_TYPE_LOGINCLASS:
351                 return (cred->cr_loginclass->lc_racct);
352         case RCTL_SUBJECT_TYPE_JAIL:
353                 return (cred->cr_prison->pr_prison_racct->prr_racct);
354         default:
355                 panic("%s: unknown per %d", __func__, rule->rr_per);
356         }
357 }
358
359 /*
360  * Return the amount of resource that can be allocated by 'p' before
361  * hitting 'rule'.
362  */
363 static int64_t
364 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
365 {
366         const struct racct *racct;
367         int64_t available;
368
369         ASSERT_RACCT_ENABLED();
370         RCTL_LOCK_ASSERT();
371
372         racct = rctl_proc_rule_to_racct(p, rule);
373         available = rule->rr_amount - racct->r_resources[rule->rr_resource];
374
375         return (available);
376 }
377
378 /*
379  * Called every second for proc, uidinfo, loginclass, and jail containers.
380  * If the limit isn't exceeded, it decreases the usage amount to zero.
381  * Otherwise, it decreases it by the value of the limit.  This way
382  * resource consumption exceeding the limit "carries over" to the next
383  * period.
384  */
385 void
386 rctl_throttle_decay(struct racct *racct, int resource)
387 {
388         struct rctl_rule *rule;
389         struct rctl_rule_link *link;
390         int64_t minavailable;
391
392         ASSERT_RACCT_ENABLED();
393
394         minavailable = INT64_MAX;
395
396         RCTL_RLOCK();
397
398         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
399                 rule = link->rrl_rule;
400
401                 if (rule->rr_resource != resource)
402                         continue;
403                 if (rule->rr_action != RCTL_ACTION_THROTTLE)
404                         continue;
405
406                 if (rule->rr_amount < minavailable)
407                         minavailable = rule->rr_amount;
408         }
409
410         RCTL_RUNLOCK();
411
412         if (racct->r_resources[resource] < minavailable) {
413                 racct->r_resources[resource] = 0;
414         } else {
415                 /*
416                  * Cap utilization counter at ten times the limit.  Otherwise,
417                  * if we changed the rule lowering the allowed amount, it could
418                  * take unreasonably long time for the accumulated resource
419                  * usage to drop.
420                  */
421                 if (racct->r_resources[resource] > minavailable * 10)
422                         racct->r_resources[resource] = minavailable * 10;
423
424                 racct->r_resources[resource] -= minavailable;
425         }
426 }
427
428 /*
429  * Special version of rctl_get_available() for the %CPU resource.
430  * We slightly cheat here and return less than we normally would.
431  */
432 int64_t
433 rctl_pcpu_available(const struct proc *p) {
434         struct rctl_rule *rule;
435         struct rctl_rule_link *link;
436         int64_t available, minavailable, limit;
437
438         ASSERT_RACCT_ENABLED();
439
440         minavailable = INT64_MAX;
441         limit = 0;
442
443         RCTL_RLOCK();
444
445         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
446                 rule = link->rrl_rule;
447                 if (rule->rr_resource != RACCT_PCTCPU)
448                         continue;
449                 if (rule->rr_action != RCTL_ACTION_DENY)
450                         continue;
451                 available = rctl_available_resource(p, rule);
452                 if (available < minavailable) {
453                         minavailable = available;
454                         limit = rule->rr_amount;
455                 }
456         }
457
458         RCTL_RUNLOCK();
459
460         /*
461          * Return slightly less than actual value of the available
462          * %cpu resource.  This makes %cpu throttling more agressive
463          * and lets us act sooner than the limits are already exceeded.
464          */
465         if (limit != 0) {
466                 if (limit > 2 * RCTL_PCPU_SHIFT)
467                         minavailable -= RCTL_PCPU_SHIFT;
468                 else
469                         minavailable -= (limit / 2);
470         }
471
472         return (minavailable);
473 }
474
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX if the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

        /* The sum overflows exactly when b exceeds the room left above a. */
        if (b > UINT64_MAX - a)
                return (UINT64_MAX);

        return (a + b);
}
490
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX if
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

        /* Nothing can overflow, and the division below needs b != 0. */
        if (b == 0)
                return (0);

        if (a > UINT64_MAX / b)
                return (UINT64_MAX);

        return (a * b);
}
500
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to the process' racct for 'resource' is examined.
 * For each rule the request would exceed, the rule's action is carried
 * out: "deny" marks the request for denial, "log" prints a message,
 * "devctl" sends a devctl notification, "throttle" calls
 * racct_proc_throttle(), and any other action is delivered as a signal.
 * Log, devctl and signal actions fire once per link until the request
 * fits within the limit again (tracked in rrl_exceeded).
 *
 * The non-zero return value is EDOOFUS, a placeholder the caller is
 * expected to replace.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
        /* Rate-limiter state for ppsratecheck(), shared by all callers. */
        static struct timeval log_lasttime, devctl_lasttime;
        static int log_curtime = 0, devctl_curtime = 0;
        struct rctl_rule *rule;
        struct rctl_rule_link *link;
        struct sbuf sb;
        char *buf;
        int64_t available;
        uint64_t sleep_ms, sleep_ratio;
        int should_deny = 0;


        ASSERT_RACCT_ENABLED();

        RCTL_RLOCK();

        /*
         * There may be more than one matching rule; go through all of them.
         * Denial should be done last, after logging and sending signals.
         */
        LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
                rule = link->rrl_rule;
                if (rule->rr_resource != resource)
                        continue;

                /* The request fits: re-arm the one-shot actions. */
                available = rctl_available_resource(p, rule);
                if (available >= (int64_t)amount) {
                        link->rrl_exceeded = 0;
                        continue;
                }

                switch (rule->rr_action) {
                case RCTL_ACTION_DENY:
                        should_deny = 1;
                        continue;
                case RCTL_ACTION_LOG:
                        /*
                         * If rrl_exceeded != 0, it means we've already
                         * logged a warning for this process.
                         */
                        if (link->rrl_exceeded != 0)
                                continue;

                        /*
                         * If the process state is not fully initialized yet,
                         * we can't access most of the required fields, e.g.
                         * p->p_comm.  This happens when called from fork1().
                         * Ignore this rule for now; it will be processed just
                         * after fork, when called from racct_proc_fork_done().
                         */
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        if (!ppsratecheck(&log_lasttime, &log_curtime,
                            rctl_log_rate_limit))
                                continue;

                        /* M_NOWAIT: the RCTL lock is held here. */
                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_finish(&sb);
                        printf("rctl: rule \"%s\" matched by pid %d "
                            "(%s), uid %d, jail %s\n", sbuf_data(&sb),
                            p->p_pid, p->p_comm, p->p_ucred->cr_uid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_DEVCTL:
                        /* Same one-shot and rate-limit gating as "log". */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
                            rctl_devctl_rate_limit))
                                continue;

                        buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
                        if (buf == NULL) {
                                printf("rctl_enforce: out of memory\n");
                                continue;
                        }
                        sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
                        sbuf_printf(&sb, "rule=");
                        rctl_rule_to_sbuf(&sb, rule);
                        sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
                            p->p_pid, p->p_ucred->cr_ruid,
                            p->p_ucred->cr_prison->pr_prison_racct->prr_name);
                        sbuf_finish(&sb);
                        devctl_notify_f("RCTL", "rule", "matched",
                            sbuf_data(&sb), M_NOWAIT);
                        sbuf_delete(&sb);
                        free(buf, M_RCTL);
                        link->rrl_exceeded = 1;
                        continue;
                case RCTL_ACTION_THROTTLE:
                        if (p->p_state != PRS_NORMAL)
                                continue;

                        /*
                         * Make the process sleep for a fraction of second
                         * proportional to the ratio of process' resource
                         * utilization compared to the limit.  The point is
                         * to penalize resource hogs: processes that consume
                         * more of the available resources sleep for longer.
                         *
                         * We're trying to defer division until the very end,
                         * to minimize the rounding effects.  The following
                         * calculation could have been written in a clearer
                         * way like this:
                         *
                         * sleep_ms = hz * p->p_racct->r_resources[resource] /
                         *     rule->rr_amount;
                         * sleep_ms *= rctl_throttle_pct / 100;
                         * if (sleep_ms < rctl_throttle_min)
                         *         sleep_ms = rctl_throttle_min;
                         *
                         * NOTE(review): the divisions by rule->rr_amount
                         * below assume throttle rules never have a zero
                         * amount; presumably that is enforced when rules
                         * are added -- confirm.
                         */
                        sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
                        sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
                        if (sleep_ms < rctl_throttle_min * rule->rr_amount)
                                sleep_ms = rctl_throttle_min * rule->rr_amount;

                        /*
                         * Multiply that by the ratio of the resource
                         * consumption for the container compared to the limit,
                         * squared.  In other words, a process in a container
                         * that is two times over the limit will be throttled
                         * four times as much for hitting the same rule.  The
                         * point is to penalize processes more if the container
                         * itself (eg certain UID or jail) is above the limit.
                         */
                        if (available < 0)
                                sleep_ratio = -available / rule->rr_amount;
                        else
                                sleep_ratio = 0;
                        sleep_ratio = xmul(sleep_ratio, sleep_ratio);
                        sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
                        sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

                        /*
                         * Finally the division.
                         */
                        sleep_ms /= rule->rr_amount;

                        if (sleep_ms > rctl_throttle_max)
                                sleep_ms = rctl_throttle_max;
#if 0
                        printf("%s: pid %d (%s), %jd of %jd, will sleep for %ld ms (ratio %ld, available %ld)\n",
                           __func__, p->p_pid, p->p_comm,
                           p->p_racct->r_resources[resource],
                           rule->rr_amount, sleep_ms, sleep_ratio, available);
#endif

                        KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
                            __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
                        racct_proc_throttle(p, sleep_ms);
                        continue;
                default:
                        /* Everything else is treated as a signal action. */
                        if (link->rrl_exceeded != 0)
                                continue;

                        if (p->p_state != PRS_NORMAL)
                                continue;

                        KASSERT(rule->rr_action > 0 &&
                            rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
                            ("rctl_enforce: unknown action %d",
                             rule->rr_action));

                        /*
                         * We're using the fact that RCTL_ACTION_SIG* values
                         * are equal to their counterparts from sys/signal.h.
                         */
                        kern_psignal(p, rule->rr_action);
                        link->rrl_exceeded = 1;
                        continue;
                }
        }

        RCTL_RUNLOCK();

        if (should_deny) {
                /*
                 * Return fake error code; the caller should change it
                 * into one proper for the situation - EFSIZ, ENOMEM etc.
                 */
                return (EDOOFUS);
        }

        return (0);
}
707
708 uint64_t
709 rctl_get_limit(struct proc *p, int resource)
710 {
711         struct rctl_rule *rule;
712         struct rctl_rule_link *link;
713         uint64_t amount = UINT64_MAX;
714
715         ASSERT_RACCT_ENABLED();
716
717         RCTL_RLOCK();
718
719         /*
720          * There may be more than one matching rule; go through all of them.
721          * Denial should be done last, after logging and sending signals.
722          */
723         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
724                 rule = link->rrl_rule;
725                 if (rule->rr_resource != resource)
726                         continue;
727                 if (rule->rr_action != RCTL_ACTION_DENY)
728                         continue;
729                 if (rule->rr_amount < amount)
730                         amount = rule->rr_amount;
731         }
732
733         RCTL_RUNLOCK();
734
735         return (amount);
736 }
737
738 uint64_t
739 rctl_get_available(struct proc *p, int resource)
740 {
741         struct rctl_rule *rule;
742         struct rctl_rule_link *link;
743         int64_t available, minavailable, allocated;
744
745         minavailable = INT64_MAX;
746
747         ASSERT_RACCT_ENABLED();
748
749         RCTL_RLOCK();
750
751         /*
752          * There may be more than one matching rule; go through all of them.
753          * Denial should be done last, after logging and sending signals.
754          */
755         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
756                 rule = link->rrl_rule;
757                 if (rule->rr_resource != resource)
758                         continue;
759                 if (rule->rr_action != RCTL_ACTION_DENY)
760                         continue;
761                 available = rctl_available_resource(p, rule);
762                 if (available < minavailable)
763                         minavailable = available;
764         }
765
766         RCTL_RUNLOCK();
767
768         /*
769          * XXX: Think about this _hard_.
770          */
771         allocated = p->p_racct->r_resources[resource];
772         if (minavailable < INT64_MAX - allocated)
773                 minavailable += allocated;
774         if (minavailable < 0)
775                 minavailable = 0;
776         return (minavailable);
777 }
778
779 static int
780 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
781 {
782
783         ASSERT_RACCT_ENABLED();
784
785         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
786                 if (rule->rr_subject_type != filter->rr_subject_type)
787                         return (0);
788
789                 switch (filter->rr_subject_type) {
790                 case RCTL_SUBJECT_TYPE_PROCESS:
791                         if (filter->rr_subject.rs_proc != NULL &&
792                             rule->rr_subject.rs_proc !=
793                             filter->rr_subject.rs_proc)
794                                 return (0);
795                         break;
796                 case RCTL_SUBJECT_TYPE_USER:
797                         if (filter->rr_subject.rs_uip != NULL &&
798                             rule->rr_subject.rs_uip !=
799                             filter->rr_subject.rs_uip)
800                                 return (0);
801                         break;
802                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
803                         if (filter->rr_subject.rs_loginclass != NULL &&
804                             rule->rr_subject.rs_loginclass !=
805                             filter->rr_subject.rs_loginclass)
806                                 return (0);
807                         break;
808                 case RCTL_SUBJECT_TYPE_JAIL:
809                         if (filter->rr_subject.rs_prison_racct != NULL &&
810                             rule->rr_subject.rs_prison_racct !=
811                             filter->rr_subject.rs_prison_racct)
812                                 return (0);
813                         break;
814                 default:
815                         panic("rctl_rule_matches: unknown subject type %d",
816                             filter->rr_subject_type);
817                 }
818         }
819
820         if (filter->rr_resource != RACCT_UNDEFINED) {
821                 if (rule->rr_resource != filter->rr_resource)
822                         return (0);
823         }
824
825         if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
826                 if (rule->rr_action != filter->rr_action)
827                         return (0);
828         }
829
830         if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
831                 if (rule->rr_amount != filter->rr_amount)
832                         return (0);
833         }
834
835         if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
836                 if (rule->rr_per != filter->rr_per)
837                         return (0);
838         }
839
840         return (1);
841 }
842
843 static int
844 str2value(const char *str, int *value, struct dict *table)
845 {
846         int i;
847
848         if (value == NULL)
849                 return (EINVAL);
850
851         for (i = 0; table[i].d_name != NULL; i++) {
852                 if (strcasecmp(table[i].d_name, str) == 0) {
853                         *value =  table[i].d_value;
854                         return (0);
855                 }
856         }
857
858         return (EINVAL);
859 }
860
861 static int
862 str2id(const char *str, id_t *value)
863 {
864         char *end;
865
866         if (str == NULL)
867                 return (EINVAL);
868
869         *value = strtoul(str, &end, 10);
870         if ((size_t)(end - str) != strlen(str))
871                 return (EINVAL);
872
873         return (0);
874 }
875
/*
 * Convert the decimal string "str" to a non-negative int64_t.
 *
 * Returns 0 on success, EINVAL when "str" is NULL or has trailing
 * characters that are not part of the number, and ERANGE when the parsed
 * value ends up negative once stored in an int64_t.  *value is written
 * before the checks, so it is modified on the EINVAL/ERANGE paths too.
 *
 * NOTE(review): the conversion goes through strtoul(), so the usable range
 * is bounded by unsigned long; on platforms where that is narrower than
 * 64 bits, larger amounts cannot be represented - confirm whether that
 * matters for the supported architectures.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* Parsing must have stopped exactly at the terminating NUL. */
	if (*stop != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
893
894 /*
895  * Connect the rule to the racct, increasing refcount for the rule.
896  */
897 static void
898 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
899 {
900         struct rctl_rule_link *link;
901
902         ASSERT_RACCT_ENABLED();
903         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
904
905         rctl_rule_acquire(rule);
906         link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
907         link->rrl_rule = rule;
908         link->rrl_exceeded = 0;
909
910         RCTL_WLOCK();
911         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
912         RCTL_WUNLOCK();
913 }
914
915 static int
916 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
917 {
918         struct rctl_rule_link *link;
919
920         ASSERT_RACCT_ENABLED();
921         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
922         RCTL_WLOCK_ASSERT();
923
924         link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
925         if (link == NULL)
926                 return (ENOMEM);
927         rctl_rule_acquire(rule);
928         link->rrl_rule = rule;
929         link->rrl_exceeded = 0;
930
931         LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
932         return (0);
933 }
934
935 /*
936  * Remove limits for a rules matching the filter and release
937  * the refcounts for the rules, possibly freeing them.  Returns
938  * the number of limit structures removed.
939  */
940 static int
941 rctl_racct_remove_rules(struct racct *racct,
942     const struct rctl_rule *filter)
943 {
944         struct rctl_rule_link *link, *linktmp;
945         int removed = 0;
946
947         ASSERT_RACCT_ENABLED();
948         RCTL_WLOCK_ASSERT();
949
950         LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
951                 if (!rctl_rule_matches(link->rrl_rule, filter))
952                         continue;
953
954                 LIST_REMOVE(link, rrl_next);
955                 rctl_rule_release(link->rrl_rule);
956                 uma_zfree(rctl_rule_link_zone, link);
957                 removed++;
958         }
959         return (removed);
960 }
961
962 static void
963 rctl_rule_acquire_subject(struct rctl_rule *rule)
964 {
965
966         ASSERT_RACCT_ENABLED();
967
968         switch (rule->rr_subject_type) {
969         case RCTL_SUBJECT_TYPE_UNDEFINED:
970         case RCTL_SUBJECT_TYPE_PROCESS:
971                 break;
972         case RCTL_SUBJECT_TYPE_JAIL:
973                 if (rule->rr_subject.rs_prison_racct != NULL)
974                         prison_racct_hold(rule->rr_subject.rs_prison_racct);
975                 break;
976         case RCTL_SUBJECT_TYPE_USER:
977                 if (rule->rr_subject.rs_uip != NULL)
978                         uihold(rule->rr_subject.rs_uip);
979                 break;
980         case RCTL_SUBJECT_TYPE_LOGINCLASS:
981                 if (rule->rr_subject.rs_loginclass != NULL)
982                         loginclass_hold(rule->rr_subject.rs_loginclass);
983                 break;
984         default:
985                 panic("rctl_rule_acquire_subject: unknown subject type %d",
986                     rule->rr_subject_type);
987         }
988 }
989
990 static void
991 rctl_rule_release_subject(struct rctl_rule *rule)
992 {
993
994         ASSERT_RACCT_ENABLED();
995
996         switch (rule->rr_subject_type) {
997         case RCTL_SUBJECT_TYPE_UNDEFINED:
998         case RCTL_SUBJECT_TYPE_PROCESS:
999                 break;
1000         case RCTL_SUBJECT_TYPE_JAIL:
1001                 if (rule->rr_subject.rs_prison_racct != NULL)
1002                         prison_racct_free(rule->rr_subject.rs_prison_racct);
1003                 break;
1004         case RCTL_SUBJECT_TYPE_USER:
1005                 if (rule->rr_subject.rs_uip != NULL)
1006                         uifree(rule->rr_subject.rs_uip);
1007                 break;
1008         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1009                 if (rule->rr_subject.rs_loginclass != NULL)
1010                         loginclass_free(rule->rr_subject.rs_loginclass);
1011                 break;
1012         default:
1013                 panic("rctl_rule_release_subject: unknown subject type %d",
1014                     rule->rr_subject_type);
1015         }
1016 }
1017
1018 struct rctl_rule *
1019 rctl_rule_alloc(int flags)
1020 {
1021         struct rctl_rule *rule;
1022
1023         ASSERT_RACCT_ENABLED();
1024
1025         rule = uma_zalloc(rctl_rule_zone, flags);
1026         if (rule == NULL)
1027                 return (NULL);
1028         rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1029         rule->rr_subject.rs_proc = NULL;
1030         rule->rr_subject.rs_uip = NULL;
1031         rule->rr_subject.rs_loginclass = NULL;
1032         rule->rr_subject.rs_prison_racct = NULL;
1033         rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1034         rule->rr_resource = RACCT_UNDEFINED;
1035         rule->rr_action = RCTL_ACTION_UNDEFINED;
1036         rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1037         refcount_init(&rule->rr_refcount, 1);
1038
1039         return (rule);
1040 }
1041
1042 struct rctl_rule *
1043 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1044 {
1045         struct rctl_rule *copy;
1046
1047         ASSERT_RACCT_ENABLED();
1048
1049         copy = uma_zalloc(rctl_rule_zone, flags);
1050         if (copy == NULL)
1051                 return (NULL);
1052         copy->rr_subject_type = rule->rr_subject_type;
1053         copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1054         copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1055         copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1056         copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1057         copy->rr_per = rule->rr_per;
1058         copy->rr_resource = rule->rr_resource;
1059         copy->rr_action = rule->rr_action;
1060         copy->rr_amount = rule->rr_amount;
1061         refcount_init(&copy->rr_refcount, 1);
1062         rctl_rule_acquire_subject(copy);
1063
1064         return (copy);
1065 }
1066
1067 void
1068 rctl_rule_acquire(struct rctl_rule *rule)
1069 {
1070
1071         ASSERT_RACCT_ENABLED();
1072         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1073
1074         refcount_acquire(&rule->rr_refcount);
1075 }
1076
1077 static void
1078 rctl_rule_free(void *context, int pending)
1079 {
1080         struct rctl_rule *rule;
1081         
1082         rule = (struct rctl_rule *)context;
1083
1084         ASSERT_RACCT_ENABLED();
1085         KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1086         
1087         /*
1088          * We don't need locking here; rule is guaranteed to be inaccessible.
1089          */
1090         
1091         rctl_rule_release_subject(rule);
1092         uma_zfree(rctl_rule_zone, rule);
1093 }
1094
1095 void
1096 rctl_rule_release(struct rctl_rule *rule)
1097 {
1098
1099         ASSERT_RACCT_ENABLED();
1100         KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1101
1102         if (refcount_release(&rule->rr_refcount)) {
1103                 /*
1104                  * rctl_rule_release() is often called when iterating
1105                  * over all the uidinfo structures in the system,
1106                  * holding uihashtbl_lock.  Since rctl_rule_free()
1107                  * might end up calling uifree(), this would lead
1108                  * to lock recursion.  Use taskqueue to avoid this.
1109                  */
1110                 TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1111                 taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1112         }
1113 }
1114
1115 static int
1116 rctl_rule_fully_specified(const struct rctl_rule *rule)
1117 {
1118
1119         ASSERT_RACCT_ENABLED();
1120
1121         switch (rule->rr_subject_type) {
1122         case RCTL_SUBJECT_TYPE_UNDEFINED:
1123                 return (0);
1124         case RCTL_SUBJECT_TYPE_PROCESS:
1125                 if (rule->rr_subject.rs_proc == NULL)
1126                         return (0);
1127                 break;
1128         case RCTL_SUBJECT_TYPE_USER:
1129                 if (rule->rr_subject.rs_uip == NULL)
1130                         return (0);
1131                 break;
1132         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1133                 if (rule->rr_subject.rs_loginclass == NULL)
1134                         return (0);
1135                 break;
1136         case RCTL_SUBJECT_TYPE_JAIL:
1137                 if (rule->rr_subject.rs_prison_racct == NULL)
1138                         return (0);
1139                 break;
1140         default:
1141                 panic("rctl_rule_fully_specified: unknown subject type %d",
1142                     rule->rr_subject_type);
1143         }
1144         if (rule->rr_resource == RACCT_UNDEFINED)
1145                 return (0);
1146         if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1147                 return (0);
1148         if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1149                 return (0);
1150         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1151                 return (0);
1152
1153         return (1);
1154 }
1155
1156 static int
1157 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1158 {
1159         struct rctl_rule *rule;
1160         char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1161              *amountstr, *perstr;
1162         id_t id;
1163         int error = 0;
1164
1165         ASSERT_RACCT_ENABLED();
1166
1167         rule = rctl_rule_alloc(M_WAITOK);
1168
1169         subjectstr = strsep(&rulestr, ":");
1170         subject_idstr = strsep(&rulestr, ":");
1171         resourcestr = strsep(&rulestr, ":");
1172         actionstr = strsep(&rulestr, "=/");
1173         amountstr = strsep(&rulestr, "/");
1174         perstr = rulestr;
1175
1176         if (subjectstr == NULL || subjectstr[0] == '\0')
1177                 rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1178         else {
1179                 error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1180                 if (error != 0)
1181                         goto out;
1182         }
1183
1184         if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1185                 rule->rr_subject.rs_proc = NULL;
1186                 rule->rr_subject.rs_uip = NULL;
1187                 rule->rr_subject.rs_loginclass = NULL;
1188                 rule->rr_subject.rs_prison_racct = NULL;
1189         } else {
1190                 switch (rule->rr_subject_type) {
1191                 case RCTL_SUBJECT_TYPE_UNDEFINED:
1192                         error = EINVAL;
1193                         goto out;
1194                 case RCTL_SUBJECT_TYPE_PROCESS:
1195                         error = str2id(subject_idstr, &id);
1196                         if (error != 0)
1197                                 goto out;
1198                         sx_assert(&allproc_lock, SA_LOCKED);
1199                         rule->rr_subject.rs_proc = pfind(id);
1200                         if (rule->rr_subject.rs_proc == NULL) {
1201                                 error = ESRCH;
1202                                 goto out;
1203                         }
1204                         PROC_UNLOCK(rule->rr_subject.rs_proc);
1205                         break;
1206                 case RCTL_SUBJECT_TYPE_USER:
1207                         error = str2id(subject_idstr, &id);
1208                         if (error != 0)
1209                                 goto out;
1210                         rule->rr_subject.rs_uip = uifind(id);
1211                         break;
1212                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1213                         rule->rr_subject.rs_loginclass =
1214                             loginclass_find(subject_idstr);
1215                         if (rule->rr_subject.rs_loginclass == NULL) {
1216                                 error = ENAMETOOLONG;
1217                                 goto out;
1218                         }
1219                         break;
1220                 case RCTL_SUBJECT_TYPE_JAIL:
1221                         rule->rr_subject.rs_prison_racct =
1222                             prison_racct_find(subject_idstr);
1223                         if (rule->rr_subject.rs_prison_racct == NULL) {
1224                                 error = ENAMETOOLONG;
1225                                 goto out;
1226                         }
1227                         break;
1228                default:
1229                        panic("rctl_string_to_rule: unknown subject type %d",
1230                            rule->rr_subject_type);
1231                }
1232         }
1233
1234         if (resourcestr == NULL || resourcestr[0] == '\0')
1235                 rule->rr_resource = RACCT_UNDEFINED;
1236         else {
1237                 error = str2value(resourcestr, &rule->rr_resource,
1238                     resourcenames);
1239                 if (error != 0)
1240                         goto out;
1241         }
1242
1243         if (actionstr == NULL || actionstr[0] == '\0')
1244                 rule->rr_action = RCTL_ACTION_UNDEFINED;
1245         else {
1246                 error = str2value(actionstr, &rule->rr_action, actionnames);
1247                 if (error != 0)
1248                         goto out;
1249         }
1250
1251         if (amountstr == NULL || amountstr[0] == '\0')
1252                 rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1253         else {
1254                 error = str2int64(amountstr, &rule->rr_amount);
1255                 if (error != 0)
1256                         goto out;
1257                 if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1258                         if (rule->rr_amount > INT64_MAX / 1000000) {
1259                                 error = ERANGE;
1260                                 goto out;
1261                         }
1262                         rule->rr_amount *= 1000000;
1263                 }
1264         }
1265
1266         if (perstr == NULL || perstr[0] == '\0')
1267                 rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1268         else {
1269                 error = str2value(perstr, &rule->rr_per, subjectnames);
1270                 if (error != 0)
1271                         goto out;
1272         }
1273
1274 out:
1275         if (error == 0)
1276                 *rulep = rule;
1277         else
1278                 rctl_rule_release(rule);
1279
1280         return (error);
1281 }
1282
1283 /*
1284  * Link a rule with all the subjects it applies to.
1285  */
1286 int
1287 rctl_rule_add(struct rctl_rule *rule)
1288 {
1289         struct proc *p;
1290         struct ucred *cred;
1291         struct uidinfo *uip;
1292         struct prison *pr;
1293         struct prison_racct *prr;
1294         struct loginclass *lc;
1295         struct rctl_rule *rule2;
1296         int match;
1297
1298         ASSERT_RACCT_ENABLED();
1299         KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1300
1301         /*
1302          * Some rules just don't make sense, like "deny" rule for an undeniable
1303          * resource.  The exception are the RSS and %CPU resources - they are
1304          * not deniable in the racct sense, but the limit is enforced in
1305          * a different way.
1306          */
1307         if (rule->rr_action == RCTL_ACTION_DENY &&
1308             !RACCT_IS_DENIABLE(rule->rr_resource) &&
1309             rule->rr_resource != RACCT_RSS &&
1310             rule->rr_resource != RACCT_PCTCPU) {
1311                 return (EOPNOTSUPP);
1312         }
1313
1314         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1315             !RACCT_IS_DECAYING(rule->rr_resource)) {
1316                 return (EOPNOTSUPP);
1317         }
1318
1319         if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1320             rule->rr_resource == RACCT_PCTCPU) {
1321                 return (EOPNOTSUPP);
1322         }
1323
1324         if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1325             RACCT_IS_SLOPPY(rule->rr_resource)) {
1326                 return (EOPNOTSUPP);
1327         }
1328
1329         /*
1330          * Make sure there are no duplicated rules.  Also, for the "deny"
1331          * rules, remove ones differing only by "amount".
1332          */
1333         if (rule->rr_action == RCTL_ACTION_DENY) {
1334                 rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1335                 rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1336                 rctl_rule_remove(rule2);
1337                 rctl_rule_release(rule2);
1338         } else
1339                 rctl_rule_remove(rule);
1340
1341         switch (rule->rr_subject_type) {
1342         case RCTL_SUBJECT_TYPE_PROCESS:
1343                 p = rule->rr_subject.rs_proc;
1344                 KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1345
1346                 rctl_racct_add_rule(p->p_racct, rule);
1347                 /*
1348                  * In case of per-process rule, we don't have anything more
1349                  * to do.
1350                  */
1351                 return (0);
1352
1353         case RCTL_SUBJECT_TYPE_USER:
1354                 uip = rule->rr_subject.rs_uip;
1355                 KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1356                 rctl_racct_add_rule(uip->ui_racct, rule);
1357                 break;
1358
1359         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1360                 lc = rule->rr_subject.rs_loginclass;
1361                 KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1362                 rctl_racct_add_rule(lc->lc_racct, rule);
1363                 break;
1364
1365         case RCTL_SUBJECT_TYPE_JAIL:
1366                 prr = rule->rr_subject.rs_prison_racct;
1367                 KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1368                 rctl_racct_add_rule(prr->prr_racct, rule);
1369                 break;
1370
1371         default:
1372                 panic("rctl_rule_add: unknown subject type %d",
1373                     rule->rr_subject_type);
1374         }
1375
1376         /*
1377          * Now go through all the processes and add the new rule to the ones
1378          * it applies to.
1379          */
1380         sx_assert(&allproc_lock, SA_LOCKED);
1381         FOREACH_PROC_IN_SYSTEM(p) {
1382                 cred = p->p_ucred;
1383                 switch (rule->rr_subject_type) {
1384                 case RCTL_SUBJECT_TYPE_USER:
1385                         if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1386                             cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1387                                 break;
1388                         continue;
1389                 case RCTL_SUBJECT_TYPE_LOGINCLASS:
1390                         if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1391                                 break;
1392                         continue;
1393                 case RCTL_SUBJECT_TYPE_JAIL:
1394                         match = 0;
1395                         for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1396                                 if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1397                                         match = 1;
1398                                         break;
1399                                 }
1400                         }
1401                         if (match)
1402                                 break;
1403                         continue;
1404                 default:
1405                         panic("rctl_rule_add: unknown subject type %d",
1406                             rule->rr_subject_type);
1407                 }
1408
1409                 rctl_racct_add_rule(p->p_racct, rule);
1410         }
1411
1412         return (0);
1413 }
1414
/*
 * "Pre" hook handed to the *_racct_foreach() iterators by
 * rctl_rule_remove(): takes the RCTL write lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RCTL_WLOCK();
}
1421
/*
 * "Post" hook handed to the *_racct_foreach() iterators by
 * rctl_rule_remove(): drops the RCTL write lock.
 */
static void
rctl_rule_post_callback(void)
{

	RCTL_WUNLOCK();
}
1428
/*
 * Per-racct worker for rctl_rule_remove().  "arg2" is the filter rule and
 * "arg3" points at an int counter, which is incremented by the number of
 * rule links removed from "racct".  The RCTL write lock must be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = (struct rctl_rule *)arg2;
	int *totalp = (int *)arg3;

	ASSERT_RACCT_ENABLED();
	RCTL_WLOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1442
1443 /*
1444  * Remove all rules that match the filter.
1445  */
1446 int
1447 rctl_rule_remove(struct rctl_rule *filter)
1448 {
1449         struct proc *p;
1450         int found = 0;
1451
1452         ASSERT_RACCT_ENABLED();
1453
1454         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1455             filter->rr_subject.rs_proc != NULL) {
1456                 p = filter->rr_subject.rs_proc;
1457                 RCTL_WLOCK();
1458                 found = rctl_racct_remove_rules(p->p_racct, filter);
1459                 RCTL_WUNLOCK();
1460                 if (found)
1461                         return (0);
1462                 return (ESRCH);
1463         }
1464
1465         loginclass_racct_foreach(rctl_rule_remove_callback,
1466             rctl_rule_pre_callback, rctl_rule_post_callback,
1467             filter, (void *)&found);
1468         ui_racct_foreach(rctl_rule_remove_callback,
1469             rctl_rule_pre_callback, rctl_rule_post_callback,
1470             filter, (void *)&found);
1471         prison_racct_foreach(rctl_rule_remove_callback,
1472             rctl_rule_pre_callback, rctl_rule_post_callback,
1473             filter, (void *)&found);
1474
1475         sx_assert(&allproc_lock, SA_LOCKED);
1476         RCTL_WLOCK();
1477         FOREACH_PROC_IN_SYSTEM(p) {
1478                 found += rctl_racct_remove_rules(p->p_racct, filter);
1479         }
1480         RCTL_WUNLOCK();
1481
1482         if (found)
1483                 return (0);
1484         return (ESRCH);
1485 }
1486
1487 /*
1488  * Appends a rule to the sbuf.
1489  */
1490 static void
1491 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1492 {
1493         int64_t amount;
1494
1495         ASSERT_RACCT_ENABLED();
1496
1497         sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1498
1499         switch (rule->rr_subject_type) {
1500         case RCTL_SUBJECT_TYPE_PROCESS:
1501                 if (rule->rr_subject.rs_proc == NULL)
1502                         sbuf_printf(sb, ":");
1503                 else
1504                         sbuf_printf(sb, "%d:",
1505                             rule->rr_subject.rs_proc->p_pid);
1506                 break;
1507         case RCTL_SUBJECT_TYPE_USER:
1508                 if (rule->rr_subject.rs_uip == NULL)
1509                         sbuf_printf(sb, ":");
1510                 else
1511                         sbuf_printf(sb, "%d:",
1512                             rule->rr_subject.rs_uip->ui_uid);
1513                 break;
1514         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1515                 if (rule->rr_subject.rs_loginclass == NULL)
1516                         sbuf_printf(sb, ":");
1517                 else
1518                         sbuf_printf(sb, "%s:",
1519                             rule->rr_subject.rs_loginclass->lc_name);
1520                 break;
1521         case RCTL_SUBJECT_TYPE_JAIL:
1522                 if (rule->rr_subject.rs_prison_racct == NULL)
1523                         sbuf_printf(sb, ":");
1524                 else
1525                         sbuf_printf(sb, "%s:",
1526                             rule->rr_subject.rs_prison_racct->prr_name);
1527                 break;
1528         default:
1529                 panic("rctl_rule_to_sbuf: unknown subject type %d",
1530                     rule->rr_subject_type);
1531         }
1532
1533         amount = rule->rr_amount;
1534         if (amount != RCTL_AMOUNT_UNDEFINED &&
1535             RACCT_IS_IN_MILLIONS(rule->rr_resource))
1536                 amount /= 1000000;
1537
1538         sbuf_printf(sb, "%s:%s=%jd",
1539             rctl_resource_name(rule->rr_resource),
1540             rctl_action_name(rule->rr_action),
1541             amount);
1542
1543         if (rule->rr_per != rule->rr_subject_type)
1544                 sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1545 }
1546
1547 /*
1548  * Routine used by RCTL syscalls to read in input string.
1549  */
1550 static int
1551 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1552 {
1553         char *str;
1554         int error;
1555
1556         ASSERT_RACCT_ENABLED();
1557
1558         if (inbuflen <= 0)
1559                 return (EINVAL);
1560         if (inbuflen > RCTL_MAX_INBUFSIZE)
1561                 return (E2BIG);
1562
1563         str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1564         error = copyinstr(inbufp, str, inbuflen, NULL);
1565         if (error != 0) {
1566                 free(str, M_RCTL);
1567                 return (error);
1568         }
1569
1570         *inputstr = str;
1571
1572         return (0);
1573 }
1574
/*
 * Deliver the output of an RCTL system call to user space.
 *
 * "outputsbuf" is finished, copied out to "outbufp" including its
 * terminating NUL, and deleted.  A NULL sbuf means there is nothing to
 * write and yields 0.
 *
 * Returns ERANGE when "outbuflen" is too small for the string plus its
 * NUL, otherwise the result of copyout().  A non-NULL sbuf is always
 * deleted before returning.
 */
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
	size_t needed;
	int error;

	ASSERT_RACCT_ENABLED();

	if (outputsbuf == NULL)
		return (0);

	sbuf_finish(outputsbuf);

	/* Room for the string and its terminating NUL. */
	needed = sbuf_len(outputsbuf) + 1;
	if (outbuflen < needed)
		error = ERANGE;
	else
		error = copyout(sbuf_data(outputsbuf), outbufp, needed);

	sbuf_delete(outputsbuf);
	return (error);
}
1598
1599 static struct sbuf *
1600 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1601 {
1602         struct sbuf *sb;
1603         int64_t amount;
1604         int i;
1605
1606         ASSERT_RACCT_ENABLED();
1607
1608         sb = sbuf_new_auto();
1609         for (i = 0; i <= RACCT_MAX; i++) {
1610                 if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1611                         continue;
1612                 amount = racct->r_resources[i];
1613                 if (RACCT_IS_IN_MILLIONS(i))
1614                         amount /= 1000000;
1615                 sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1616         }
1617         sbuf_setpos(sb, sbuf_len(sb) - 1);
1618         return (sb);
1619 }
1620
1621 int
1622 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1623 {
1624         struct rctl_rule *filter;
1625         struct sbuf *outputsbuf = NULL;
1626         struct proc *p;
1627         struct uidinfo *uip;
1628         struct loginclass *lc;
1629         struct prison_racct *prr;
1630         char *inputstr;
1631         int error;
1632
1633         if (!racct_enable)
1634                 return (ENOSYS);
1635
1636         error = priv_check(td, PRIV_RCTL_GET_RACCT);
1637         if (error != 0)
1638                 return (error);
1639
1640         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1641         if (error != 0)
1642                 return (error);
1643
1644         sx_slock(&allproc_lock);
1645         error = rctl_string_to_rule(inputstr, &filter);
1646         free(inputstr, M_RCTL);
1647         if (error != 0) {
1648                 sx_sunlock(&allproc_lock);
1649                 return (error);
1650         }
1651
1652         switch (filter->rr_subject_type) {
1653         case RCTL_SUBJECT_TYPE_PROCESS:
1654                 p = filter->rr_subject.rs_proc;
1655                 if (p == NULL) {
1656                         error = EINVAL;
1657                         goto out;
1658                 }
1659                 outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1660                 break;
1661         case RCTL_SUBJECT_TYPE_USER:
1662                 uip = filter->rr_subject.rs_uip;
1663                 if (uip == NULL) {
1664                         error = EINVAL;
1665                         goto out;
1666                 }
1667                 outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1668                 break;
1669         case RCTL_SUBJECT_TYPE_LOGINCLASS:
1670                 lc = filter->rr_subject.rs_loginclass;
1671                 if (lc == NULL) {
1672                         error = EINVAL;
1673                         goto out;
1674                 }
1675                 outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1676                 break;
1677         case RCTL_SUBJECT_TYPE_JAIL:
1678                 prr = filter->rr_subject.rs_prison_racct;
1679                 if (prr == NULL) {
1680                         error = EINVAL;
1681                         goto out;
1682                 }
1683                 outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1684                 break;
1685         default:
1686                 error = EINVAL;
1687         }
1688 out:
1689         rctl_rule_release(filter);
1690         sx_sunlock(&allproc_lock);
1691         if (error != 0)
1692                 return (error);
1693
1694         error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1695
1696         return (error);
1697 }
1698
1699 static void
1700 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1701 {
1702         struct rctl_rule *filter = (struct rctl_rule *)arg2;
1703         struct rctl_rule_link *link;
1704         struct sbuf *sb = (struct sbuf *)arg3;
1705
1706         ASSERT_RACCT_ENABLED();
1707         RCTL_LOCK_ASSERT();
1708
1709         LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1710                 if (!rctl_rule_matches(link->rrl_rule, filter))
1711                         continue;
1712                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1713                 sbuf_printf(sb, ",");
1714         }
1715 }
1716
1717 int
1718 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1719 {
1720         struct sbuf *sb;
1721         struct rctl_rule *filter;
1722         struct rctl_rule_link *link;
1723         struct proc *p;
1724         char *inputstr, *buf;
1725         size_t bufsize;
1726         int error;
1727
1728         if (!racct_enable)
1729                 return (ENOSYS);
1730
1731         error = priv_check(td, PRIV_RCTL_GET_RULES);
1732         if (error != 0)
1733                 return (error);
1734
1735         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1736         if (error != 0)
1737                 return (error);
1738
1739         sx_slock(&allproc_lock);
1740         error = rctl_string_to_rule(inputstr, &filter);
1741         free(inputstr, M_RCTL);
1742         if (error != 0) {
1743                 sx_sunlock(&allproc_lock);
1744                 return (error);
1745         }
1746
1747         bufsize = uap->outbuflen;
1748         if (bufsize > rctl_maxbufsize) {
1749                 sx_sunlock(&allproc_lock);
1750                 return (E2BIG);
1751         }
1752
1753         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1754         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1755         KASSERT(sb != NULL, ("sbuf_new failed"));
1756
1757         FOREACH_PROC_IN_SYSTEM(p) {
1758                 RCTL_RLOCK();
1759                 LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1760                         /*
1761                          * Non-process rules will be added to the buffer later.
1762                          * Adding them here would result in duplicated output.
1763                          */
1764                         if (link->rrl_rule->rr_subject_type !=
1765                             RCTL_SUBJECT_TYPE_PROCESS)
1766                                 continue;
1767                         if (!rctl_rule_matches(link->rrl_rule, filter))
1768                                 continue;
1769                         rctl_rule_to_sbuf(sb, link->rrl_rule);
1770                         sbuf_printf(sb, ",");
1771                 }
1772                 RCTL_RUNLOCK();
1773         }
1774
1775         loginclass_racct_foreach(rctl_get_rules_callback,
1776             rctl_rule_pre_callback, rctl_rule_post_callback,
1777             filter, sb);
1778         ui_racct_foreach(rctl_get_rules_callback,
1779             rctl_rule_pre_callback, rctl_rule_post_callback,
1780             filter, sb);
1781         prison_racct_foreach(rctl_get_rules_callback,
1782             rctl_rule_pre_callback, rctl_rule_post_callback,
1783             filter, sb);
1784         if (sbuf_error(sb) == ENOMEM) {
1785                 error = ERANGE;
1786                 goto out;
1787         }
1788
1789         /*
1790          * Remove trailing ",".
1791          */
1792         if (sbuf_len(sb) > 0)
1793                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1794
1795         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1796 out:
1797         rctl_rule_release(filter);
1798         sx_sunlock(&allproc_lock);
1799         free(buf, M_RCTL);
1800         return (error);
1801 }
1802
1803 int
1804 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1805 {
1806         struct sbuf *sb;
1807         struct rctl_rule *filter;
1808         struct rctl_rule_link *link;
1809         char *inputstr, *buf;
1810         size_t bufsize;
1811         int error;
1812
1813         if (!racct_enable)
1814                 return (ENOSYS);
1815
1816         error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1817         if (error != 0)
1818                 return (error);
1819
1820         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1821         if (error != 0)
1822                 return (error);
1823
1824         sx_slock(&allproc_lock);
1825         error = rctl_string_to_rule(inputstr, &filter);
1826         free(inputstr, M_RCTL);
1827         if (error != 0) {
1828                 sx_sunlock(&allproc_lock);
1829                 return (error);
1830         }
1831
1832         if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1833                 rctl_rule_release(filter);
1834                 sx_sunlock(&allproc_lock);
1835                 return (EINVAL);
1836         }
1837         if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1838                 rctl_rule_release(filter);
1839                 sx_sunlock(&allproc_lock);
1840                 return (EOPNOTSUPP);
1841         }
1842         if (filter->rr_subject.rs_proc == NULL) {
1843                 rctl_rule_release(filter);
1844                 sx_sunlock(&allproc_lock);
1845                 return (EINVAL);
1846         }
1847
1848         bufsize = uap->outbuflen;
1849         if (bufsize > rctl_maxbufsize) {
1850                 rctl_rule_release(filter);
1851                 sx_sunlock(&allproc_lock);
1852                 return (E2BIG);
1853         }
1854
1855         buf = malloc(bufsize, M_RCTL, M_WAITOK);
1856         sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1857         KASSERT(sb != NULL, ("sbuf_new failed"));
1858
1859         RCTL_RLOCK();
1860         LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1861             rrl_next) {
1862                 rctl_rule_to_sbuf(sb, link->rrl_rule);
1863                 sbuf_printf(sb, ",");
1864         }
1865         RCTL_RUNLOCK();
1866         if (sbuf_error(sb) == ENOMEM) {
1867                 error = ERANGE;
1868                 goto out;
1869         }
1870
1871         /*
1872          * Remove trailing ",".
1873          */
1874         if (sbuf_len(sb) > 0)
1875                 sbuf_setpos(sb, sbuf_len(sb) - 1);
1876
1877         error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1878 out:
1879         rctl_rule_release(filter);
1880         sx_sunlock(&allproc_lock);
1881         free(buf, M_RCTL);
1882         return (error);
1883 }
1884
1885 int
1886 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1887 {
1888         struct rctl_rule *rule;
1889         char *inputstr;
1890         int error;
1891
1892         if (!racct_enable)
1893                 return (ENOSYS);
1894
1895         error = priv_check(td, PRIV_RCTL_ADD_RULE);
1896         if (error != 0)
1897                 return (error);
1898
1899         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1900         if (error != 0)
1901                 return (error);
1902
1903         sx_slock(&allproc_lock);
1904         error = rctl_string_to_rule(inputstr, &rule);
1905         free(inputstr, M_RCTL);
1906         if (error != 0) {
1907                 sx_sunlock(&allproc_lock);
1908                 return (error);
1909         }
1910         /*
1911          * The 'per' part of a rule is optional.
1912          */
1913         if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1914             rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1915                 rule->rr_per = rule->rr_subject_type;
1916
1917         if (!rctl_rule_fully_specified(rule)) {
1918                 error = EINVAL;
1919                 goto out;
1920         }
1921
1922         error = rctl_rule_add(rule);
1923
1924 out:
1925         rctl_rule_release(rule);
1926         sx_sunlock(&allproc_lock);
1927         return (error);
1928 }
1929
1930 int
1931 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1932 {
1933         struct rctl_rule *filter;
1934         char *inputstr;
1935         int error;
1936
1937         if (!racct_enable)
1938                 return (ENOSYS);
1939
1940         error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1941         if (error != 0)
1942                 return (error);
1943
1944         error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1945         if (error != 0)
1946                 return (error);
1947
1948         sx_slock(&allproc_lock);
1949         error = rctl_string_to_rule(inputstr, &filter);
1950         free(inputstr, M_RCTL);
1951         if (error != 0) {
1952                 sx_sunlock(&allproc_lock);
1953                 return (error);
1954         }
1955
1956         error = rctl_rule_remove(filter);
1957         rctl_rule_release(filter);
1958         sx_sunlock(&allproc_lock);
1959
1960         return (error);
1961 }
1962
1963 /*
1964  * Update RCTL rule list after credential change.
1965  */
1966 void
1967 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1968 {
1969         LIST_HEAD(, rctl_rule_link) newrules;
1970         struct rctl_rule_link *link, *newlink;
1971         struct uidinfo *newuip;
1972         struct loginclass *newlc;
1973         struct prison_racct *newprr;
1974         int rulecnt, i;
1975
1976         ASSERT_RACCT_ENABLED();
1977
1978         newuip = newcred->cr_ruidinfo;
1979         newlc = newcred->cr_loginclass;
1980         newprr = newcred->cr_prison->pr_prison_racct;
1981         
1982         LIST_INIT(&newrules);
1983
1984 again:
1985         /*
1986          * First, count the rules that apply to the process with new
1987          * credentials.
1988          */
1989         rulecnt = 0;
1990         RCTL_RLOCK();
1991         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1992                 if (link->rrl_rule->rr_subject_type ==
1993                     RCTL_SUBJECT_TYPE_PROCESS)
1994                         rulecnt++;
1995         }
1996         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1997                 rulecnt++;
1998         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1999                 rulecnt++;
2000         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
2001                 rulecnt++;
2002         RCTL_RUNLOCK();
2003
2004         /*
2005          * Create temporary list.  We've dropped the rctl_lock in order
2006          * to use M_WAITOK.
2007          */
2008         for (i = 0; i < rulecnt; i++) {
2009                 newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2010                 newlink->rrl_rule = NULL;
2011                 newlink->rrl_exceeded = 0;
2012                 LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2013         }
2014
2015         newlink = LIST_FIRST(&newrules);
2016
2017         /*
2018          * Assign rules to the newly allocated list entries.
2019          */
2020         RCTL_WLOCK();
2021         LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2022                 if (link->rrl_rule->rr_subject_type ==
2023                     RCTL_SUBJECT_TYPE_PROCESS) {
2024                         if (newlink == NULL)
2025                                 goto goaround;
2026                         rctl_rule_acquire(link->rrl_rule);
2027                         newlink->rrl_rule = link->rrl_rule;
2028                         newlink->rrl_exceeded = link->rrl_exceeded;
2029                         newlink = LIST_NEXT(newlink, rrl_next);
2030                         rulecnt--;
2031                 }
2032         }
2033         
2034         LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2035                 if (newlink == NULL)
2036                         goto goaround;
2037                 rctl_rule_acquire(link->rrl_rule);
2038                 newlink->rrl_rule = link->rrl_rule;
2039                 newlink->rrl_exceeded = link->rrl_exceeded;
2040                 newlink = LIST_NEXT(newlink, rrl_next);
2041                 rulecnt--;
2042         }
2043
2044         LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2045                 if (newlink == NULL)
2046                         goto goaround;
2047                 rctl_rule_acquire(link->rrl_rule);
2048                 newlink->rrl_rule = link->rrl_rule;
2049                 newlink->rrl_exceeded = link->rrl_exceeded;
2050                 newlink = LIST_NEXT(newlink, rrl_next);
2051                 rulecnt--;
2052         }
2053
2054         LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2055                 if (newlink == NULL)
2056                         goto goaround;
2057                 rctl_rule_acquire(link->rrl_rule);
2058                 newlink->rrl_rule = link->rrl_rule;
2059                 newlink->rrl_exceeded = link->rrl_exceeded;
2060                 newlink = LIST_NEXT(newlink, rrl_next);
2061                 rulecnt--;
2062         }
2063
2064         if (rulecnt == 0) {
2065                 /*
2066                  * Free the old rule list.
2067                  */
2068                 while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2069                         link = LIST_FIRST(&p->p_racct->r_rule_links);
2070                         LIST_REMOVE(link, rrl_next);
2071                         rctl_rule_release(link->rrl_rule);
2072                         uma_zfree(rctl_rule_link_zone, link);
2073                 }
2074
2075                 /*
2076                  * Replace lists and we're done.
2077                  *
2078                  * XXX: Is there any way to switch list heads instead
2079                  *      of iterating here?
2080                  */
2081                 while (!LIST_EMPTY(&newrules)) {
2082                         newlink = LIST_FIRST(&newrules);
2083                         LIST_REMOVE(newlink, rrl_next);
2084                         LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2085                             newlink, rrl_next);
2086                 }
2087
2088                 RCTL_WUNLOCK();
2089
2090                 return;
2091         }
2092
2093 goaround:
2094         RCTL_WUNLOCK();
2095
2096         /*
2097          * Rule list changed while we were not holding the rctl_lock.
2098          * Free the new list and try again.
2099          */
2100         while (!LIST_EMPTY(&newrules)) {
2101                 newlink = LIST_FIRST(&newrules);
2102                 LIST_REMOVE(newlink, rrl_next);
2103                 if (newlink->rrl_rule != NULL)
2104                         rctl_rule_release(newlink->rrl_rule);
2105                 uma_zfree(rctl_rule_link_zone, newlink);
2106         }
2107
2108         goto again;
2109 }
2110
2111 /*
2112  * Assign RCTL rules to the newly created process.
2113  */
2114 int
2115 rctl_proc_fork(struct proc *parent, struct proc *child)
2116 {
2117         struct rctl_rule *rule;
2118         struct rctl_rule_link *link;
2119         int error;
2120
2121         LIST_INIT(&child->p_racct->r_rule_links);
2122
2123         ASSERT_RACCT_ENABLED();
2124         KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2125
2126         RCTL_WLOCK();
2127
2128         /*
2129          * Go through limits applicable to the parent and assign them
2130          * to the child.  Rules with 'process' subject have to be duplicated
2131          * in order to make their rr_subject point to the new process.
2132          */
2133         LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2134                 if (link->rrl_rule->rr_subject_type ==
2135                     RCTL_SUBJECT_TYPE_PROCESS) {
2136                         rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2137                         if (rule == NULL)
2138                                 goto fail;
2139                         KASSERT(rule->rr_subject.rs_proc == parent,
2140                             ("rule->rr_subject.rs_proc != parent"));
2141                         rule->rr_subject.rs_proc = child;
2142                         error = rctl_racct_add_rule_locked(child->p_racct,
2143                             rule);
2144                         rctl_rule_release(rule);
2145                         if (error != 0)
2146                                 goto fail;
2147                 } else {
2148                         error = rctl_racct_add_rule_locked(child->p_racct,
2149                             link->rrl_rule);
2150                         if (error != 0)
2151                                 goto fail;
2152                 }
2153         }
2154
2155         RCTL_WUNLOCK();
2156         return (0);
2157
2158 fail:
2159         while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2160                 link = LIST_FIRST(&child->p_racct->r_rule_links);
2161                 LIST_REMOVE(link, rrl_next);
2162                 rctl_rule_release(link->rrl_rule);
2163                 uma_zfree(rctl_rule_link_zone, link);
2164         }
2165         RCTL_WUNLOCK();
2166         return (EAGAIN);
2167 }
2168
2169 /*
2170  * Release rules attached to the racct.
2171  */
2172 void
2173 rctl_racct_release(struct racct *racct)
2174 {
2175         struct rctl_rule_link *link;
2176
2177         ASSERT_RACCT_ENABLED();
2178
2179         RCTL_WLOCK();
2180         while (!LIST_EMPTY(&racct->r_rule_links)) {
2181                 link = LIST_FIRST(&racct->r_rule_links);
2182                 LIST_REMOVE(link, rrl_next);
2183                 rctl_rule_release(link->rrl_rule);
2184                 uma_zfree(rctl_rule_link_zone, link);
2185         }
2186         RCTL_WUNLOCK();
2187 }
2188
2189 static void
2190 rctl_init(void)
2191 {
2192
2193         if (!racct_enable)
2194                 return;
2195
2196         rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2197             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2198         rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2199             sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2200             UMA_ALIGN_PTR, 0);
2201
2202         /*
2203          * Set default values, making sure not to overwrite the ones
2204          * fetched from tunables.  Most of those could be set at the
2205          * declaration, except for the rctl_throttle_max - we cannot
2206          * set it there due to hz not being compile time constant.
2207          */
2208         if (rctl_throttle_min < 1)
2209                 rctl_throttle_min = 1;
2210         if (rctl_throttle_max < rctl_throttle_min)
2211                 rctl_throttle_max = 2 * hz;
2212         if (rctl_throttle_pct < 0)
2213                 rctl_throttle_pct = 100;
2214         if (rctl_throttle_pct2 < 0)
2215                 rctl_throttle_pct2 = 100;
2216 }
2217
2218 #else /* !RCTL */
2219
/*
 * Stub used when the kernel is built without RCTL: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2226
/*
 * Stub used when the kernel is built without RCTL: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2233
/*
 * Stub used when the kernel is built without RCTL: the system call is
 * reported as not implemented.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2240
/*
 * Stub used when the kernel is built without RCTL: the system call is
 * reported as not implemented.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2247
/*
 * Stub used when the kernel is built without RCTL: the system call is
 * reported as not implemented.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2254
2255 #endif /* !RCTL */