sys/security/audit/audit.c

   1 /*
   2  * Copyright (c) 1999-2005 Apple Computer, Inc.
   3  * Copyright (c) 2006 Robert N. M. Watson
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1.  Redistributions of source code must retain the above copyright
  10  *     notice, this list of conditions and the following disclaimer.
  11  * 2.  Redistributions in binary form must reproduce the above copyright
  12  *     notice, this list of conditions and the following disclaimer in the
  13  *     documentation and/or other materials provided with the distribution.
  14  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  15  *     its contributors may be used to endorse or promote products derived
  16  *     from this software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * $FreeBSD$
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/condvar.h>
  35 #include <sys/conf.h>
  36 #include <sys/file.h>
  37 #include <sys/filedesc.h>
  38 #include <sys/fcntl.h>
  39 #include <sys/ipc.h>
  40 #include <sys/kernel.h>
  41 #include <sys/kthread.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mount.h>
  44 #include <sys/namei.h>
  45 #include <sys/proc.h>
  46 #include <sys/queue.h>
  47 #include <sys/socket.h>
  48 #include <sys/socketvar.h>
  49 #include <sys/protosw.h>
  50 #include <sys/domain.h>
  51 #include <sys/sysproto.h>
  52 #include <sys/sysent.h>
  53 #include <sys/systm.h>
  54 #include <sys/ucred.h>
  55 #include <sys/uio.h>
  56 #include <sys/un.h>
  57 #include <sys/unistd.h>
  58 #include <sys/vnode.h>
  59
  60 #include <bsm/audit.h>
  61 #include <bsm/audit_internal.h>
  62 #include <bsm/audit_kevents.h>
  63
  64 #include <netinet/in.h>
  65 #include <netinet/in_pcb.h>
  66
  67 #include <security/audit/audit.h>
  68 #include <security/audit/audit_private.h>
  69
  70 #include <vm/uma.h>
  71
  72 /*
  73  * The AUDIT_EXCESSIVELY_VERBOSE define enables a number of
  74  * gratuitously noisy printf's to the console.  Due to the
  75  * volume, it should be left off unless you want your system
  76  * to churn a lot whenever the audit record flow gets high.
  77  */
  78 //#define       AUDIT_EXCESSIVELY_VERBOSE
  79 #ifdef AUDIT_EXCESSIVELY_VERBOSE
  80 #define AUDIT_PRINTF(x) printf x
  81 #else
  82 #define AUDIT_PRINTF(X)
  83 #endif
  84
  85 static uma_zone_t audit_record_zone;
  86 static MALLOC_DEFINE(M_AUDITPROC, "audit_proc", "Audit process storage");
  87 MALLOC_DEFINE(M_AUDITDATA, "audit_data", "Audit data storage");
  88 MALLOC_DEFINE(M_AUDITPATH, "audit_path", "Audit path storage");
  89 MALLOC_DEFINE(M_AUDITTEXT, "audit_text", "Audit text storage");
  90
  91 /*
  92  * Audit control settings that are set/read by system calls and are
  93  * hence non-static.
  94  */
  95 /*
  96  * Define the audit control flags.
  97  */
  98 int                                     audit_enabled;
  99 int                                     audit_suspended;
 100
 101 /*
 102  * Flags controlling behavior in low storage situations.
 103  * Should we panic if a write fails?  Should we fail stop
 104  * if we're out of disk space?
 105  */
 106 int                                     audit_panic_on_write_fail;
 107 int                                     audit_fail_stop;
 108
 109 /*
 110  * Are we currently "failing stop" due to out of disk space?
 111  */
 112 static int                               audit_in_failure;
 113
 114 /*
 115  * Global audit statistiscs.
 116  */
 117 struct audit_fstat                      audit_fstat;
 118
 119 /*
 120  * Preselection mask for non-attributable events.
 121  */
 122 struct au_mask                          audit_nae_mask;
 123
 124 /*
 125  * Mutex to protect global variables shared between various threads and
 126  * processes.
 127  */
 128 static struct mtx                       audit_mtx;
 129
 130 /*
 131  * Queue of audit records ready for delivery to disk.  We insert new
 132  * records at the tail, and remove records from the head.  Also,
 133  * a count of the number of records used for checking queue depth.
 134  * In addition, a counter of records that we have allocated but are
 135  * not yet in the queue, which is needed to estimate the total
 136  * size of the combined set of records outstanding in the system.
 137  */
 138 static TAILQ_HEAD(, kaudit_record)      audit_q;
 139 static int                              audit_q_len;
 140 static int                              audit_pre_q_len;
 141
 142 /*
 143  * Audit queue control settings (minimum free, low/high water marks, etc.)
 144  */
 145 struct au_qctrl                         audit_qctrl;
 146
 147 /*
 148  * Condition variable to signal to the worker that it has work to do:
 149  * either new records are in the queue, or a log replacement is taking
 150  * place.
 151  */
 152 static struct cv                        audit_cv;
 153
 154 /*
 155  * Worker thread that will schedule disk I/O, etc.
 156  */
 157 static struct proc                      *audit_thread;
 158
 159 /*
 160  * When an audit log is rotated, the actual rotation must be performed
 161  * by the audit worker thread, as it may have outstanding writes on the
 162  * current audit log.  audit_replacement_vp holds the vnode replacing
 163  * the current vnode.  We can't let more than one replacement occur
 164  * at a time, so if more than one thread requests a replacement, only
 165  * one can have the replacement "in progress" at any given moment.  If
 166  * a thread tries to replace the audit vnode and discovers a replacement
 167  * is already in progress (i.e., audit_replacement_flag != 0), then it
 168  * will sleep on audit_replacement_cv waiting its turn to perform a
 169  * replacement.  When a replacement is completed, this cv is signalled
 170  * by the worker thread so a waiting thread can start another replacement.
 171  * We also store a credential to perform audit log write operations with.
 172  *
 173  * The current credential and vnode are thread-local to audit_worker.
 174  */
 175 static struct cv                        audit_replacement_cv;
 176
 177 static int                              audit_replacement_flag;
 178 static struct vnode                     *audit_replacement_vp;
 179 static struct ucred                     *audit_replacement_cred;
 180
 181 /*
 182  * Condition variable to signal to the worker that it has work to do:
 183  * either new records are in the queue, or a log replacement is taking
 184  * place.
 185  */
 186 static struct cv                        audit_commit_cv;
 187
 188 /*
 189  * Condition variable for  auditing threads wait on when in fail-stop mode.
 190  * Threads wait on this CV forever (and ever), never seeing the light of
 191  * day again.
 192  */
 193 static struct cv                        audit_fail_cv;
 194
 195 /*
 196  * Flags related to Kernel->user-space communication.
 197  */
 198 static int                      audit_file_rotate_wait;
 199
 200 /*
 201  * Construct an audit record for the passed thread.
 202  */
 203 static int
 204 audit_record_ctor(void *mem, int size, void *arg, int flags)
 205 {
 206         struct kaudit_record *ar;
 207         struct thread *td;
 208
 209         KASSERT(sizeof(*ar) == size, ("audit_record_ctor: wrong size"));
 210
 211         td = arg;
 212         ar = mem;
 213         bzero(ar, sizeof(*ar));
 214         ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC;
 215         nanotime(&ar->k_ar.ar_starttime);
 216
 217         /*
 218          * Export the subject credential.
 219          *
 220          * XXXAUDIT: td_ucred access is OK without proc lock, but some other
 221          * fields here may require the proc lock.
 222          */
 223         cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred);
 224         ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid;
 225         ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid;
 226         ar->k_ar.ar_subj_egid = td->td_ucred->cr_groups[0];
 227         ar->k_ar.ar_subj_auid = td->td_proc->p_au->ai_auid;
 228         ar->k_ar.ar_subj_asid = td->td_proc->p_au->ai_asid;
 229         ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
 230         ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask;
 231         ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid;
 232         bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
 233
 234         return (0);
 235 }
 236
 237 static void
 238 audit_record_dtor(void *mem, int size, void *arg)
 239 {
 240         struct kaudit_record *ar;
 241
 242         KASSERT(sizeof(*ar) == size, ("audit_record_dtor: wrong size"));
 243
 244         ar = mem;
 245         if (ar->k_ar.ar_arg_upath1 != NULL)
 246                 free(ar->k_ar.ar_arg_upath1, M_AUDITPATH);
 247         if (ar->k_ar.ar_arg_upath2 != NULL)
 248                 free(ar->k_ar.ar_arg_upath2, M_AUDITPATH);
 249         if (ar->k_ar.ar_arg_text != NULL)
 250                 free(ar->k_ar.ar_arg_text, M_AUDITTEXT);
 251         if (ar->k_udata != NULL)
 252                 free(ar->k_udata, M_AUDITDATA);
 253 }
 254
 255 /*
 256  * XXXAUDIT: Should adjust comments below to make it clear that we get to
 257  * this point only if we believe we have storage, so not having space here
 258  * is a violation of invariants derived from administrative procedures.
 259  * I.e., someone else has written to the audit partition, leaving less space
 260  * than we accounted for.
 261  */
 262 static int
 263 audit_record_write(struct vnode *vp, struct kaudit_record *ar,
 264     struct ucred *cred, struct thread *td)
 265 {
 266         int ret;
 267         long temp;
 268         struct au_record *bsm;
 269         struct vattr vattr;
 270         struct statfs *mnt_stat = &vp->v_mount->mnt_stat;
 271         int vfslocked;
 272
 273         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 274
 275         /*
 276          * First, gather statistics on the audit log file and file system
 277          * so that we know how we're doing on space.  In both cases,
 278          * if we're unable to perform the operation, we drop the record
 279          * and return.  However, this is arguably an assertion failure.
 280          * XXX Need a FreeBSD equivalent.
 281          */
 282         ret = VFS_STATFS(vp->v_mount, mnt_stat, td);
 283         if (ret)
 284                 goto out;
 285
 286         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 287         ret = VOP_GETATTR(vp, &vattr, cred, td);
 288         VOP_UNLOCK(vp, 0, td);
 289         if (ret)
 290                 goto out;
 291
 292         /* update the global stats struct */
 293         audit_fstat.af_currsz = vattr.va_size;
 294
 295         /*
 296          * XXX Need to decide what to do if the trigger to the audit daemon
 297          * fails.
 298          */
 299
 300         /*
 301          * If we fall below minimum free blocks (hard limit), tell the audit
 302          * daemon to force a rotation off of the file system. We also stop
 303          * writing, which means this audit record is probably lost.
 304          * If we fall below the minimum percent free blocks (soft limit),
 305          * then kindly suggest to the audit daemon to do something.
 306          */
 307         if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) {
 308                 (void)send_trigger(AUDIT_TRIGGER_NO_SPACE);
 309                 /* Hopefully userspace did something about all the previous
 310                  * triggers that were sent prior to this critical condition.
 311                  * If fail-stop is set, then we're done; goodnight Gracie.
 312                  */
 313                 if (audit_fail_stop)
 314                         panic("Audit log space exhausted and fail-stop set.");
 315                 else {
 316                         audit_suspended = 1;
 317                         ret = ENOSPC;
 318                         goto out;
 319                 }
 320         } else
 321                 /*
 322                  * Send a message to the audit daemon that disk space
 323                  * is getting low.
 324                  *
 325                  * XXXAUDIT: Check math and block size calculation here.
 326                  */
 327                 if (audit_qctrl.aq_minfree != 0) {
 328                         temp = mnt_stat->f_blocks / (100 /
 329                             audit_qctrl.aq_minfree);
 330                         if (mnt_stat->f_bfree < temp)
 331                                 (void)send_trigger(AUDIT_TRIGGER_LOW_SPACE);
 332                 }
 333
 334         /* Check if the current log file is full; if so, call for
 335          * a log rotate. This is not an exact comparison; we may
 336          * write some records over the limit. If that's not
 337          * acceptable, then add a fudge factor here.
 338          */
 339         if ((audit_fstat.af_filesz != 0) &&
 340             (audit_file_rotate_wait == 0) &&
 341             (vattr.va_size >= audit_fstat.af_filesz)) {
 342                 audit_file_rotate_wait = 1;
 343                 (void)send_trigger(AUDIT_TRIGGER_OPEN_NEW);
 344         }
 345
 346         /*
 347          * If the estimated amount of audit data in the audit event queue
 348          * (plus records allocated but not yet queued) has reached the
 349          * amount of free space on the disk, then we need to go into an
 350          * audit fail stop state, in which we do not permit the
 351          * allocation/committing of any new audit records.  We continue to
 352          * process packets but don't allow any activities that might
 353          * generate new records.  In the future, we might want to detect
 354          * when space is available again and allow operation to continue,
 355          * but this behavior is sufficient to meet fail stop requirements
 356          * in CAPP.
 357          */
 358         if (audit_fail_stop &&
 359             (unsigned long)
 360             ((audit_q_len + audit_pre_q_len + 1) * MAX_AUDIT_RECORD_SIZE) /
 361             mnt_stat->f_bsize >= (unsigned long)(mnt_stat->f_bfree)) {
 362                 printf("audit_record_write: free space below size of audit "
 363                     "queue, failing stop\n");
 364                 audit_in_failure = 1;
 365         }
 366
 367         /*
 368          * If there is a user audit record attached to the kernel record,
 369          * then write the user record.
 370          */
 371         /* XXX Need to decide a few things here: IF the user audit
 372          * record is written, but the write of the kernel record fails,
 373          * what to do? Should the kernel record come before or after the
 374          * user record? For now, we write the user record first, and
 375          * we ignore errors.
 376          */
 377         if (ar->k_ar_commit & AR_COMMIT_USER) {
 378                 /*
 379                  * Try submitting the record to any active audit pipes.
 380                  */
 381                 audit_pipe_submit((void *)ar->k_udata, ar->k_ulen);
 382
 383                 /*
 384                  * And to disk.
 385                  */
 386                 ret = vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen,
 387                           (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL,
 388                           NULL, td);
 389                 if (ret)
 390                         goto out;
 391         }
 392
 393         /*
 394          * Convert the internal kernel record to BSM format and write it
 395          * out if everything's OK.
 396          */
 397         if (!(ar->k_ar_commit & AR_COMMIT_KERNEL)) {
 398                 ret = 0;
 399                 goto out;
 400         }
 401
 402         /*
 403          * XXXAUDIT: Should we actually allow this conversion to fail?  With
 404          * sleeping memory allocation and invariants checks, perhaps not.
 405          */
 406         ret = kaudit_to_bsm(ar, &bsm);
 407         if (ret == BSM_NOAUDIT) {
 408                 ret = 0;
 409                 goto out;
 410         }
 411
 412         /*
 413          * XXX: We drop the record on BSM conversion failure, but really
 414          * this is an assertion failure.
 415          */
 416         if (ret == BSM_FAILURE) {
 417                 AUDIT_PRINTF(("BSM conversion failure\n"));
 418                 ret = EINVAL;
 419                 goto out;
 420         }
 421
 422         /*
 423          * Try submitting the record to any active audit pipes.
 424          */
 425         audit_pipe_submit((void *)bsm->data, bsm->len);
 426
 427         /*
 428          * XXX
 429          * We should break the write functionality away from the BSM record
 430          * generation and have the BSM generation done before this function
 431          * is called. This function will then take the BSM record as a
 432          * parameter.
 433          */
 434         ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
 435             (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td));
 436
 437         kau_free(bsm);
 438
 439 out:
 440         /*
 441          * When we're done processing the current record, we have to
 442          * check to see if we're in a failure mode, and if so, whether
 443          * this was the last record left to be drained.  If we're done
 444          * draining, then we fsync the vnode and panic.
 445          */
 446         if (audit_in_failure &&
 447             audit_q_len == 0 && audit_pre_q_len == 0) {
 448                 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 449                 (void)VOP_FSYNC(vp, MNT_WAIT, td);
 450                 VOP_UNLOCK(vp, 0, td);
 451                 panic("Audit store overflow; record queue drained.");
 452         }
 453
 454         VFS_UNLOCK_GIANT(vfslocked);
 455
 456         return (ret);
 457 }
 458
 459 /*
 460  * The audit_worker thread is responsible for watching the event queue,
 461  * dequeueing records, converting them to BSM format, and committing them to
 462  * disk.  In order to minimize lock thrashing, records are dequeued in sets
 463  * to a thread-local work queue.  In addition, the audit_work performs the
 464  * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
 465  * variable.
 466  */
 467 static void
 468 audit_worker(void *arg)
 469 {
 470         int do_replacement_signal, error;
 471         TAILQ_HEAD(, kaudit_record) ar_worklist;
 472         struct kaudit_record *ar;
 473         struct vnode *audit_vp, *old_vp;
 474         int vfslocked;
 475
 476         struct ucred *audit_cred, *old_cred;
 477         struct thread *audit_td;
 478
 479         AUDIT_PRINTF(("audit_worker starting\n"));
 480
 481         /*
 482          * These are thread-local variables requiring no synchronization.
 483          */
 484         TAILQ_INIT(&ar_worklist);
 485         audit_cred = NULL;
 486         audit_td = curthread;
 487         audit_vp = NULL;
 488
 489         mtx_lock(&audit_mtx);
 490         while (1) {
 491                 /*
 492                  * First priority: replace the audit log target if requested.
 493                  * Accessing the vnode here requires dropping the audit_mtx;
 494                  * in case another replacement was scheduled while the mutex
 495                  * was released, we loop.
 496                  *
 497                  * XXX It could well be we should drain existing records
 498                  * first to ensure that the timestamps and ordering
 499                  * are right.
 500                  */
 501                 do_replacement_signal = 0;
 502                 while (audit_replacement_flag != 0) {
 503                         old_cred = audit_cred;
 504                         old_vp = audit_vp;
 505                         audit_cred = audit_replacement_cred;
 506                         audit_vp = audit_replacement_vp;
 507                         audit_replacement_cred = NULL;
 508                         audit_replacement_vp = NULL;
 509                         audit_replacement_flag = 0;
 510
 511                         audit_enabled = (audit_vp != NULL);
 512
 513                         /*
 514                          * XXX: What to do about write failures here?
 515                          */
 516                         if (old_vp != NULL) {
 517                                 AUDIT_PRINTF(("Closing old audit file\n"));
 518                                 mtx_unlock(&audit_mtx);
 519                                 vfslocked = VFS_LOCK_GIANT(old_vp->v_mount);
 520                                 vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred,
 521                                     audit_td);
 522                                 VFS_UNLOCK_GIANT(vfslocked);
 523                                 crfree(old_cred);
 524                                 mtx_lock(&audit_mtx);
 525                                 old_cred = NULL;
 526                                 old_vp = NULL;
 527                                 AUDIT_PRINTF(("Audit file closed\n"));
 528                         }
 529                         if (audit_vp != NULL) {
 530                                 AUDIT_PRINTF(("Opening new audit file\n"));
 531                         }
 532                         do_replacement_signal = 1;
 533                 }
 534                 /*
 535                  * Signal that replacement have occurred to wake up and
 536                  * start any other replacements started in parallel.  We can
 537                  * continue about our business in the mean time.  We
 538                  * broadcast so that both new replacements can be inserted,
 539                  * but also so that the source(s) of replacement can return
 540                  * successfully.
 541                  */
 542                 if (do_replacement_signal)
 543                         cv_broadcast(&audit_replacement_cv);
 544
 545                 /*
 546                  * Next, check to see if we have any records to drain into
 547                  * the vnode.  If not, go back to waiting for an event.
 548                  */
 549                 if (TAILQ_EMPTY(&audit_q)) {
 550                         AUDIT_PRINTF(("audit_worker waiting\n"));
 551                         cv_wait(&audit_cv, &audit_mtx);
 552                         AUDIT_PRINTF(("audit_worker woken up\n"));
 553         AUDIT_PRINTF(("audit_worker: new vp = %p; value of flag %d\n",
 554             audit_replacement_vp, audit_replacement_flag));
 555                         continue;
 556                 }
 557
 558                 /*
 559                  * If we have records, but there's no active vnode to write
 560                  * to, drain the record queue.  Generally, we prevent the
 561                  * unnecessary allocation of records elsewhere, but we need
 562                  * to allow for races between conditional allocation and
 563                  * queueing.  Go back to waiting when we're done.
 564                  */
 565                 if (audit_vp == NULL) {
 566                         while ((ar = TAILQ_FIRST(&audit_q))) {
 567                                 TAILQ_REMOVE(&audit_q, ar, k_q);
 568                                 uma_zfree(audit_record_zone, ar);
 569                                 audit_q_len--;
 570                                 /*
 571                                  * XXXRW: Why broadcast if we hold the
 572                                  * mutex and know that audit_vp is NULL?
 573                                  */
 574                                 if (audit_q_len <= audit_qctrl.aq_lowater)
 575                                         cv_broadcast(&audit_commit_cv);
 576                         }
 577                         continue;
 578                 }
 579
 580                 /*
 581                  * We have both records to write and an active vnode to write
 582                  * to.  Dequeue a record, and start the write.  Eventually,
 583                  * it might make sense to dequeue several records and perform
 584                  * our own clustering, if the lower layers aren't doing it
 585                  * automatically enough.
 586                  */
 587                 while ((ar = TAILQ_FIRST(&audit_q))) {
 588                         TAILQ_REMOVE(&audit_q, ar, k_q);
 589                         audit_q_len--;
 590                         if (audit_q_len <= audit_qctrl.aq_lowater)
 591                                 cv_broadcast(&audit_commit_cv);
 592                         TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
 593                 }
 594
 595                 mtx_unlock(&audit_mtx);
 596                 while ((ar = TAILQ_FIRST(&ar_worklist))) {
 597                         TAILQ_REMOVE(&ar_worklist, ar, k_q);
 598                         if (audit_vp != NULL) {
 599                                 error = audit_record_write(audit_vp, ar,
 600                                     audit_cred, audit_td);
 601                                 if (error && audit_panic_on_write_fail)
 602                                         panic("audit_worker: write error %d\n",
 603                                             error);
 604                                 else if (error)
 605                                         printf("audit_worker: write error %d\n",
 606                                             error);
 607                         }
 608                         uma_zfree(audit_record_zone, ar);
 609                 }
 610                 mtx_lock(&audit_mtx);
 611         }
 612 }
 613
 614 /*
 615  * Initialize the Audit subsystem: configuration state, work queue,
 616  * synchronization primitives, worker thread, and trigger device node.  Also
 617  * call into the BSM assembly code to initialize it.
 618  */
 619 static void
 620 audit_init(void)
 621 {
 622         int error;
 623
 624         printf("Security auditing service present\n");
 625         audit_enabled = 0;
 626         audit_suspended = 0;
 627         audit_panic_on_write_fail = 0;
 628         audit_fail_stop = 0;
 629         audit_in_failure = 0;
 630
 631         audit_replacement_vp = NULL;
 632         audit_replacement_cred = NULL;
 633         audit_replacement_flag = 0;
 634
 635         audit_fstat.af_filesz = 0;      /* '0' means unset, unbounded */
 636         audit_fstat.af_currsz = 0;
 637         audit_nae_mask.am_success = AU_NULL;
 638         audit_nae_mask.am_failure = AU_NULL;
 639
 640         TAILQ_INIT(&audit_q);
 641         audit_q_len = 0;
 642         audit_pre_q_len = 0;
 643         audit_qctrl.aq_hiwater = AQ_HIWATER;
 644         audit_qctrl.aq_lowater = AQ_LOWATER;
 645         audit_qctrl.aq_bufsz = AQ_BUFSZ;
 646         audit_qctrl.aq_minfree = AU_FS_MINFREE;
 647
 648         mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF);
 649         cv_init(&audit_cv, "audit_cv");
 650         cv_init(&audit_replacement_cv, "audit_replacement_cv");
 651         cv_init(&audit_commit_cv, "audit_commit_cv");
 652         cv_init(&audit_fail_cv, "audit_fail_cv");
 653
 654         audit_record_zone = uma_zcreate("audit_record_zone",
 655             sizeof(struct kaudit_record), audit_record_ctor,
 656             audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
 657
 658         /* Initialize the BSM audit subsystem. */
 659         kau_init();
 660
 661         audit_file_rotate_wait = 0;
 662         audit_trigger_init();
 663
 664         /* Register shutdown handler. */
 665         EVENTHANDLER_REGISTER(shutdown_pre_sync, audit_shutdown, NULL,
 666             SHUTDOWN_PRI_FIRST);
 667
 668         error = kthread_create(audit_worker, NULL, &audit_thread, RFHIGHPID,
 669             0, "audit_worker");
 670         if (error != 0)
 671                 panic("audit_init: kthread_create returned %d", error);
 672 }
 673
 674 SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL)
 675
 676 /*
 677  * audit_rotate_vnode() is called by a user or kernel thread to configure or
 678  * de-configure auditing on a vnode.  The arguments are the replacement
 679  * credential and vnode to substitute for the current credential and vnode,
 680  * if any.  If either is set to NULL, both should be NULL, and this is used
 681  * to indicate that audit is being disabled.  The real work is done in the
 682  * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
 683  * to complete.
 684  *
 685  * The vnode should be referenced and opened by the caller.  The credential
 686  * should be referenced.  audit_rotate_vnode() will own both references as of
 687  * this call, so the caller should not release either.
 688  *
 689  * XXXAUDIT: Review synchronize communication logic.  Really, this is a
 690  * message queue of depth 1.
 691  *
 692  * XXXAUDIT: Enhance the comments below to indicate that we are basically
 693  * acquiring ownership of the communications queue, inserting our message,
 694  * and waiting for an acknowledgement.
 695  */
 696 void
 697 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
 698 {
 699
 700         /*
 701          * If other parallel log replacements have been requested, we wait
 702          * until they've finished before continuing.
 703          */
 704         mtx_lock(&audit_mtx);
 705         while (audit_replacement_flag != 0) {
 706                 AUDIT_PRINTF(("audit_rotate_vnode: sleeping to wait for "
 707                     "flag\n"));
 708                 cv_wait(&audit_replacement_cv, &audit_mtx);
 709                 AUDIT_PRINTF(("audit_rotate_vnode: woken up (flag %d)\n",
 710                     audit_replacement_flag));
 711         }
 712         audit_replacement_cred = cred;
 713         audit_replacement_flag = 1;
 714         audit_replacement_vp = vp;
 715
 716         /*
 717          * Wake up the audit worker to perform the exchange once we
 718          * release the mutex.
 719          */
 720         cv_signal(&audit_cv);
 721
 722         /*
 723          * Wait for the audit_worker to broadcast that a replacement has
 724          * taken place; we know that once this has happened, our vnode
 725          * has been replaced in, so we can return successfully.
 726          */
 727         AUDIT_PRINTF(("audit_rotate_vnode: waiting for news of "
 728             "replacement\n"));
 729         cv_wait(&audit_replacement_cv, &audit_mtx);
 730         AUDIT_PRINTF(("audit_rotate_vnode: change acknowledged by "
 731             "audit_worker (flag " "now %d)\n", audit_replacement_flag));
 732         mtx_unlock(&audit_mtx);
 733
 734         audit_file_rotate_wait = 0; /* We can now request another rotation */
 735 }
 736
 737 /*
 738  * Drain the audit queue and close the log at shutdown.  Note that this can
 739  * be called both from the system shutdown path and also from audit
 740  * configuration syscalls, so 'arg' and 'howto' are ignored.
 741  */
 742 void
 743 audit_shutdown(void *arg, int howto)
 744 {
 745
 746         audit_rotate_vnode(NULL, NULL);
 747 }
 748
 749 /*
 750  * Return the current thread's audit record, if any.
 751  */
 752 __inline__ struct kaudit_record *
 753 currecord(void)
 754 {
 755
 756         return (curthread->td_ar);
 757 }
 758
 759 /*
 760  * MPSAFE
 761  *
 762  * XXXAUDIT: There are a number of races present in the code below due to
 763  * release and re-grab of the mutex.  The code should be revised to become
 764  * slightly less racy.
 765  *
 766  * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available
 767  * pre_q space, suspending the system call until there is room?
 768  */
 769 struct kaudit_record *
 770 audit_new(int event, struct thread *td)
 771 {
 772         struct kaudit_record *ar;
 773         int no_record;
 774
 775         mtx_lock(&audit_mtx);
 776         no_record = (audit_suspended || !audit_enabled);
 777         mtx_unlock(&audit_mtx);
 778         if (no_record)
 779                 return (NULL);
 780
 781         /*
 782          * XXX: The number of outstanding uncommitted audit records is
 783          * limited to the number of concurrent threads servicing system
 784          * calls in the kernel.
 785          */
 786         ar = uma_zalloc_arg(audit_record_zone, td, M_WAITOK);
 787         ar->k_ar.ar_event = event;
 788
 789         mtx_lock(&audit_mtx);
 790         audit_pre_q_len++;
 791         mtx_unlock(&audit_mtx);
 792
 793         return (ar);
 794 }
 795
 796 /*
 797  * MPSAFE
 798  */
 799 void
 800 audit_commit(struct kaudit_record *ar, int error, int retval)
 801 {
 802         int sorf;
 803         struct au_mask *aumask;
 804
 805         if (ar == NULL)
 806                 return;
 807
 808         /*
 809          * Decide whether to commit the audit record by checking the
 810          * error value from the system call and using the appropriate
 811          * audit mask.
 812          *
 813          * XXXAUDIT: Synchronize access to audit_nae_mask?
 814          */
 815         if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID)
 816                 aumask = &audit_nae_mask;
 817         else
 818                 aumask = &ar->k_ar.ar_subj_amask;
 819
 820         if (error)
 821                 sorf = AU_PRS_FAILURE;
 822         else
 823                 sorf = AU_PRS_SUCCESS;
 824
 825         switch(ar->k_ar.ar_event) {
 826
 827         case AUE_OPEN_RWTC:
 828                 /* The open syscall always writes a AUE_OPEN_RWTC event; change
 829                  * it to the proper type of event based on the flags and the
 830                  * error value.
 831                  */
 832                 ar->k_ar.ar_event = flags_and_error_to_openevent(
 833                     ar->k_ar.ar_arg_fflags, error);
 834                 break;
 835
 836         case AUE_SYSCTL:
 837                 ar->k_ar.ar_event = ctlname_to_sysctlevent(
 838                     ar->k_ar.ar_arg_ctlname, ar->k_ar.ar_valid_arg);
 839                 break;
 840
 841         case AUE_AUDITON:
 842                 /* Convert the auditon() command to an event */
 843                 ar->k_ar.ar_event = auditon_command_event(ar->k_ar.ar_arg_cmd);
 844                 break;
 845         }
 846
 847         if (au_preselect(ar->k_ar.ar_event, aumask, sorf) != 0)
 848                 ar->k_ar_commit |= AR_COMMIT_KERNEL;
 849
 850         /*
 851          * XXXRW: Why is this necessary?  Should we ever accept a record that
 852          * we're not willing to commit?
 853          */
 854         if ((ar->k_ar_commit & (AR_COMMIT_USER | AR_COMMIT_KERNEL)) == 0) {
 855                 mtx_lock(&audit_mtx);
 856                 audit_pre_q_len--;
 857                 mtx_unlock(&audit_mtx);
 858                 uma_zfree(audit_record_zone, ar);
 859                 return;
 860         }
 861
 862         ar->k_ar.ar_errno = error;
 863         ar->k_ar.ar_retval = retval;
 864
 865         /*
 866          * We might want to do some system-wide post-filtering
 867          * here at some point.
 868          */
 869
 870         /*
 871          * Timestamp system call end.
 872          */
 873         nanotime(&ar->k_ar.ar_endtime);
 874
 875         mtx_lock(&audit_mtx);
 876
 877         /*
 878          * Note: it could be that some records initiated while audit was
 879          * enabled should still be committed?
 880          */
 881         if (audit_suspended || !audit_enabled) {
 882                 audit_pre_q_len--;
 883                 mtx_unlock(&audit_mtx);
 884                 uma_zfree(audit_record_zone, ar);
 885                 return;
 886         }
 887
 888         /*
 889          * Constrain the number of committed audit records based on
 890          * the configurable parameter.
 891          */
 892         while (audit_q_len >= audit_qctrl.aq_hiwater) {
 893                 AUDIT_PRINTF(("audit_commit: sleeping to wait for "
 894                    "audit queue to drain below high water mark\n"));
 895                 cv_wait(&audit_commit_cv, &audit_mtx);
 896                 AUDIT_PRINTF(("audit_commit: woke up waiting for "
 897                    "audit queue draining\n"));
 898         }
 899
 900         TAILQ_INSERT_TAIL(&audit_q, ar, k_q);
 901         audit_q_len++;
 902         audit_pre_q_len--;
 903         cv_signal(&audit_cv);
 904         mtx_unlock(&audit_mtx);
 905 }
 906
 907 /*
 908  * audit_syscall_enter() is called on entry to each system call.  It is
 909  * responsible for deciding whether or not to audit the call (preselection),
 910  * and if so, allocating a per-thread audit record.  audit_new() will fill in
 911  * basic thread/credential properties.
 912  */
 913 void
 914 audit_syscall_enter(unsigned short code, struct thread *td)
 915 {
 916         int audit_event;
 917         struct au_mask *aumask;
 918
 919         KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
 920
 921         /*
 922          * In FreeBSD, each ABI has its own system call table, and hence
 923          * mapping of system call codes to audit events.  Convert the code to
 924          * an audit event identifier using the process system call table
 925          * reference.  In Darwin, there's only one, so we use the global
 926          * symbol for the system call table.
 927          *
 928          * XXXAUDIT: Should we audit that a bad system call was made, and if
 929          * so, how?
 930          */
 931         if (code >= td->td_proc->p_sysent->sv_size)
 932                 return;
 933
 934         audit_event = td->td_proc->p_sysent->sv_table[code].sy_auevent;
 935         if (audit_event == AUE_NULL)
 936                 return;
 937
 938         /*
 939          * Check which audit mask to use; either the kernel non-attributable
 940          * event mask or the process audit mask.
 941          */
 942         if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID)
 943                 aumask = &audit_nae_mask;
 944         else
 945                 aumask = &td->td_proc->p_au->ai_mask;
 946
 947         /*
 948          * Allocate an audit record, if preselection allows it, and store
 949          * in the thread for later use.
 950          */
 951         if (au_preselect(audit_event, aumask,
 952                         AU_PRS_FAILURE | AU_PRS_SUCCESS)) {
 953                 /*
 954                  * If we're out of space and need to suspend unprivileged
 955                  * processes, do that here rather than trying to allocate
 956                  * another audit record.
 957                  *
 958                  * XXXRW: We might wish to be able to continue here in the
 959                  * future, if the system recovers.  That should be possible
 960                  * by means of checking the condition in a loop around
 961                  * cv_wait().  It might be desirable to reevaluate whether an
 962                  * audit record is still required for this event by
 963                  * re-calling au_preselect().
 964                  */
 965                 if (audit_in_failure && suser(td) != 0) {
 966                         cv_wait(&audit_fail_cv, &audit_mtx);
 967                         panic("audit_failing_stop: thread continued");
 968                 }
 969                 td->td_ar = audit_new(audit_event, td);
 970         } else
 971                 td->td_ar = NULL;
 972 }
 973
 974 /*
 975  * audit_syscall_exit() is called from the return of every system call, or in
 976  * the event of exit1(), during the execution of exit1().  It is responsible
 977  * for committing the audit record, if any, along with return condition.
 978  */
 979 void
 980 audit_syscall_exit(int error, struct thread *td)
 981 {
 982         int retval;
 983
 984         /*
 985          * Commit the audit record as desired; once we pass the record
 986          * into audit_commit(), the memory is owned by the audit
 987          * subsystem.
 988          * The return value from the system call is stored on the user
 989          * thread. If there was an error, the return value is set to -1,
 990          * imitating the behavior of the cerror routine.
 991          */
 992         if (error)
 993                 retval = -1;
 994         else
 995                 retval = td->td_retval[0];
 996
 997         audit_commit(td->td_ar, error, retval);
 998         if (td->td_ar != NULL)
 999                 AUDIT_PRINTF(("audit record committed by pid %d\n",
1000                         td->td_proc->p_pid));
1001         td->td_ar = NULL;
1002
1003 }
1004
1005 /*
1006  * Allocate storage for a new process (init, or otherwise).
1007  */
1008 void
1009 audit_proc_alloc(struct proc *p)
1010 {
1011
1012         KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)",
1013             p->p_pid));
1014         p->p_au = malloc(sizeof(*(p->p_au)), M_AUDITPROC, M_WAITOK);
1015         /* XXXAUDIT: Zero?  Slab allocate? */
1016         //printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au);
1017 }
1018
1019 /*
1020  * Allocate storage for a new thread.
1021  */
1022 void
1023 audit_thread_alloc(struct thread *td)
1024 {
1025
1026         td->td_ar = NULL;
1027 }
1028
1029 /*
1030  * Thread destruction.
1031  */
1032 void
1033 audit_thread_free(struct thread *td)
1034 {
1035
1036         KASSERT(td->td_ar == NULL, ("audit_thread_free: td_ar != NULL"));
1037 }
1038
1039 /*
1040  * Initialize the audit information for the a process, presumably the first
1041  * process in the system.
1042  * XXX It is not clear what the initial values should be for audit ID,
1043  * session ID, etc.
1044  */
1045 void
1046 audit_proc_kproc0(struct proc *p)
1047 {
1048
1049         KASSERT(p->p_au != NULL, ("audit_proc_kproc0: p->p_au == NULL (%d)",
1050             p->p_pid));
1051         //printf("audit_proc_kproc0: pid %d p_au %p\n", p->p_pid, p->p_au);
1052         bzero(p->p_au, sizeof(*(p)->p_au));
1053 }
1054
1055 void
1056 audit_proc_init(struct proc *p)
1057 {
1058
1059         KASSERT(p->p_au != NULL, ("audit_proc_init: p->p_au == NULL (%d)",
1060             p->p_pid));
1061         //printf("audit_proc_init: pid %d p_au %p\n", p->p_pid, p->p_au);
1062         bzero(p->p_au, sizeof(*(p)->p_au));
1063         p->p_au->ai_auid = AU_DEFAUDITID;
1064 }
1065
1066 /*
1067  * Copy the audit info from the parent process to the child process when
1068  * a fork takes place.
1069  */
1070 void
1071 audit_proc_fork(struct proc *parent, struct proc *child)
1072 {
1073
1074         PROC_LOCK_ASSERT(parent, MA_OWNED);
1075         PROC_LOCK_ASSERT(child, MA_OWNED);
1076         KASSERT(parent->p_au != NULL,
1077             ("audit_proc_fork: parent->p_au == NULL (%d)", parent->p_pid));
1078         KASSERT(child->p_au != NULL,
1079             ("audit_proc_fork: child->p_au == NULL (%d)", child->p_pid));
1080         //printf("audit_proc_fork: parent pid %d p_au %p\n", parent->p_pid,
1081         //    parent->p_au);
1082         //printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid,
1083         //    child->p_au);
1084         bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
1085         /*
1086          * XXXAUDIT: Zero pointers to external memory, or assert they are
1087          * zero?
1088          */
1089 }
1090
1091 /*
1092  * Free the auditing structure for the process.
1093  */
1094 void
1095 audit_proc_free(struct proc *p)
1096 {
1097
1098         KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid));
1099         //printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au);
1100         /*
1101          * XXXAUDIT: Assert that external memory pointers are NULL?
1102          */
1103         free(p->p_au, M_AUDITPROC);
1104         p->p_au = NULL;
1105 }