sys/security/audit/audit.c

   1 /*
   2  * Copyright (c) 1999-2005 Apple Computer, Inc.
   3  * Copyright (c) 2006 Robert N. M. Watson
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1.  Redistributions of source code must retain the above copyright
  10  *     notice, this list of conditions and the following disclaimer.
  11  * 2.  Redistributions in binary form must reproduce the above copyright
  12  *     notice, this list of conditions and the following disclaimer in the
  13  *     documentation and/or other materials provided with the distribution.
  14  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
  15  *     its contributors may be used to endorse or promote products derived
  16  *     from this software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
  22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * $FreeBSD$
  31  */
  32
  33 #include <sys/param.h>
  34 #include <sys/condvar.h>
  35 #include <sys/conf.h>
  36 #include <sys/file.h>
  37 #include <sys/filedesc.h>
  38 #include <sys/fcntl.h>
  39 #include <sys/ipc.h>
  40 #include <sys/kernel.h>
  41 #include <sys/kthread.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mount.h>
  44 #include <sys/namei.h>
  45 #include <sys/proc.h>
  46 #include <sys/queue.h>
  47 #include <sys/socket.h>
  48 #include <sys/socketvar.h>
  49 #include <sys/protosw.h>
  50 #include <sys/domain.h>
  51 #include <sys/sysproto.h>
  52 #include <sys/sysent.h>
  53 #include <sys/systm.h>
  54 #include <sys/ucred.h>
  55 #include <sys/uio.h>
  56 #include <sys/un.h>
  57 #include <sys/unistd.h>
  58 #include <sys/vnode.h>
  59
  60 #include <bsm/audit.h>
  61 #include <bsm/audit_kevents.h>
  62
  63 #include <netinet/in.h>
  64 #include <netinet/in_pcb.h>
  65
  66 #include <security/audit/audit.h>
  67 #include <security/audit/audit_private.h>
  68
  69 #include <vm/uma.h>
  70
   71 /*
   72  * Defining AUDIT_EXCESSIVELY_VERBOSE makes AUDIT_PRINTF((fmt, ...)) expand
   73  * to a console printf; otherwise it expands to nothing.  Note the doubled
   74  * parentheses: the macro argument is the whole printf argument list.  Leave
   75  * it off unless you want heavy console churn when the record flow is high.
   76  */
   77 //#define       AUDIT_EXCESSIVELY_VERBOSE
   78 #ifdef AUDIT_EXCESSIVELY_VERBOSE
   79 #define AUDIT_PRINTF(x) printf x
   80 #else
   81 #define AUDIT_PRINTF(X)
   82 #endif
  83
   84 static uma_zone_t audit_record_zone;
   85 static MALLOC_DEFINE(M_AUDITPROC, "audit_proc", "Audit process storage");
   86 MALLOC_DEFINE(M_AUDITDATA, "audit_data", "Audit data storage");
   87 MALLOC_DEFINE(M_AUDITPATH, "audit_path", "Audit path storage");
   88 MALLOC_DEFINE(M_AUDITTEXT, "audit_text", "Audit text storage");
   89
   90 /*
   91  * Audit control settings that are set/read by system calls and are
   92  * hence non-static.
   93  */
   94 /*
   95  * Audit control flags; audit_enabled tracks whether a log vnode is set.
   96  */
   97 int                                     audit_enabled;
   98 int                                     audit_suspended;
   99
  100 /*
  101  * Flags controlling behavior in low storage situations.
  102  * Should we panic if a write fails?  Should we fail stop
  103  * if we're out of disk space?
  104  */
  105 int                                     audit_panic_on_write_fail;
  106 int                                     audit_fail_stop;
  107
  108 /*
  109  * Are we currently "failing stop" due to out of disk space?
  110  */
  111 static int                               audit_in_failure;
  112
  113 /*
  114  * Global audit statistics.
  115  */
  116 struct audit_fstat                      audit_fstat;
  117
  118 /*
  119  * Preselection mask for non-attributable events.
  120  */
  121 struct au_mask                          audit_nae_mask;
  122
  123 /*
  124  * Mutex to protect global variables shared between various threads and
  125  * processes.
  126  */
  127 static struct mtx                       audit_mtx;
  128
  129 /*
  130  * Queue of audit records ready for delivery to disk.  We insert new
  131  * records at the tail, and remove records from the head.  Also,
  132  * a count of the number of records used for checking queue depth.
  133  * In addition, a counter of records that we have allocated but are
  134  * not yet in the queue, which is needed to estimate the total
  135  * size of the combined set of records outstanding in the system.
  136  */
  137 static TAILQ_HEAD(, kaudit_record)      audit_q;
  138 static int                              audit_q_len;
  139 static int                              audit_pre_q_len;
  140
  141 /*
  142  * Audit queue control settings (minimum free, low/high water marks, etc.)
  143  */
  144 struct au_qctrl                         audit_qctrl;
  145
  146 /*
  147  * Condition variable to signal to the worker that it has work to do:
  148  * either new records are in the queue, or a log replacement is taking
  149  * place.
  150  */
  151 static struct cv                        audit_cv;
  152
  153 /*
  154  * Worker thread that will schedule disk I/O, etc.
  155  */
  156 static struct proc                      *audit_thread;
  157
  158 /*
  159  * When an audit log is rotated, the actual rotation must be performed
  160  * by the audit worker thread, as it may have outstanding writes on the
  161  * current audit log.  audit_replacement_vp holds the vnode replacing
  162  * the current vnode.  We can't let more than one replacement occur
  163  * at a time, so if more than one thread requests a replacement, only
  164  * one can have the replacement "in progress" at any given moment.  If
  165  * a thread tries to replace the audit vnode and discovers a replacement
  166  * is already in progress (i.e., audit_replacement_flag != 0), then it
  167  * will sleep on audit_replacement_cv waiting its turn to perform a
  168  * replacement.  When a replacement is completed, this cv is signalled
  169  * by the worker thread so a waiting thread can start another replacement.
  170  * We also store a credential to perform audit log write operations with.
  171  *
  172  * The current credential and vnode are thread-local to audit_worker.
  173  */
  174 static struct cv                        audit_replacement_cv;
  175
  176 static int                              audit_replacement_flag;
  177 static struct vnode                     *audit_replacement_vp;
  178 static struct ucred                     *audit_replacement_cred;
  179
  180 /*
  181  * Condition variable broadcast by the worker when the queue length drops
  182  * to the low water mark or below; presumably waited on by threads that
  183  * are committing records -- the waiters are not visible in this section.
  184  */
  185 static struct cv                        audit_commit_cv;
  186
  187 /*
  188  * Condition variable that auditing threads wait on when in fail-stop mode.
  189  * Threads wait on this CV forever (and ever), never seeing the light of
  190  * day again.
  191  */
  192 static struct cv                        audit_fail_cv;
  193
  194 /*
  195  * Kernel->user-space flag: nonzero while a requested log rotation is pending.
  196  */
  197 static int                      audit_file_rotate_wait;
 198
 199 /*
 200  * Construct an audit record for the passed thread.
 201  */
 202 static int
 203 audit_record_ctor(void *mem, int size, void *arg, int flags)
 204 {
 205         struct kaudit_record *ar;
 206         struct thread *td;
 207
 208         KASSERT(sizeof(*ar) == size, ("audit_record_ctor: wrong size"));
 209
 210         td = arg;
 211         ar = mem;
 212         bzero(ar, sizeof(*ar));
 213         ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC;
 214         nanotime(&ar->k_ar.ar_starttime);
 215
 216         /*
 217          * Export the subject credential.
 218          *
 219          * XXXAUDIT: td_ucred access is OK without proc lock, but some other
 220          * fields here may require the proc lock.
 221          */
 222         cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred);
 223         ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid;
 224         ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid;
 225         ar->k_ar.ar_subj_egid = td->td_ucred->cr_groups[0];
 226         ar->k_ar.ar_subj_auid = td->td_proc->p_au->ai_auid;
 227         ar->k_ar.ar_subj_asid = td->td_proc->p_au->ai_asid;
 228         ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
 229         ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask;
 230         ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid;
 231         bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
 232
 233         return (0);
 234 }
 235
 236 static void
 237 audit_record_dtor(void *mem, int size, void *arg)
 238 {
 239         struct kaudit_record *ar;
 240
 241         KASSERT(sizeof(*ar) == size, ("audit_record_dtor: wrong size"));
 242
 243         ar = mem;
 244         if (ar->k_ar.ar_arg_upath1 != NULL)
 245                 free(ar->k_ar.ar_arg_upath1, M_AUDITPATH);
 246         if (ar->k_ar.ar_arg_upath2 != NULL)
 247                 free(ar->k_ar.ar_arg_upath2, M_AUDITPATH);
 248         if (ar->k_ar.ar_arg_text != NULL)
 249                 free(ar->k_ar.ar_arg_text, M_AUDITTEXT);
 250         if (ar->k_udata != NULL)
 251                 free(ar->k_udata, M_AUDITDATA);
 252 }
 253
 254 /*
 255  * XXXAUDIT: Should adjust comments below to make it clear that we get to
 256  * this point only if we believe we have storage, so not having space here
 257  * is a violation of invariants derived from administrative procedures.
 258  * I.e., someone else has written to the audit partition, leaving less space
 259  * than we accounted for.
 260  */
 261 static int
 262 audit_record_write(struct vnode *vp, struct kaudit_record *ar,
 263     struct ucred *cred, struct thread *td)
 264 {
 265         int ret;
 266         long temp;
 267         struct au_record *bsm;
 268         struct vattr vattr;
 269         struct statfs *mnt_stat = &vp->v_mount->mnt_stat;
 270         int vfslocked;
 271
 272         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 273
 274         /*
 275          * First, gather statistics on the audit log file and file system
 276          * so that we know how we're doing on space.  In both cases,
 277          * if we're unable to perform the operation, we drop the record
 278          * and return.  However, this is arguably an assertion failure.
 279          * XXX Need a FreeBSD equivalent.
 280          */
 281         ret = VFS_STATFS(vp->v_mount, mnt_stat, td);
 282         if (ret)
 283                 goto out;
 284
 285         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 286         ret = VOP_GETATTR(vp, &vattr, cred, td);
 287         VOP_UNLOCK(vp, 0, td);
 288         if (ret)
 289                 goto out;
 290
 291         /* update the global stats struct */
 292         audit_fstat.af_currsz = vattr.va_size;
 293
 294         /*
 295          * XXX Need to decide what to do if the trigger to the audit daemon
 296          * fails.
 297          */
 298
 299         /*
 300          * If we fall below minimum free blocks (hard limit), tell the audit
 301          * daemon to force a rotation off of the file system. We also stop
 302          * writing, which means this audit record is probably lost.
 303          * If we fall below the minimum percent free blocks (soft limit),
 304          * then kindly suggest to the audit daemon to do something.
 305          */
 306         if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) {
 307                 send_trigger(AUDIT_TRIGGER_NO_SPACE);
 308                 /* Hopefully userspace did something about all the previous
 309                  * triggers that were sent prior to this critical condition.
 310                  * If fail-stop is set, then we're done; goodnight Gracie.
 311                  */
 312                 if (audit_fail_stop)
 313                         panic("Audit log space exhausted and fail-stop set.");
 314                 else {
 315                         audit_suspended = 1;
 316                         ret = ENOSPC;
 317                         goto out;
 318                 }
 319         } else
 320                 /*
 321                  * Send a message to the audit daemon that disk space
 322                  * is getting low.
 323                  *
 324                  * XXXAUDIT: Check math and block size calculation here.
 325                  */
 326                 if (audit_qctrl.aq_minfree != 0) {
 327                         temp = mnt_stat->f_blocks / (100 /
 328                             audit_qctrl.aq_minfree);
 329                         if (mnt_stat->f_bfree < temp)
 330                                 send_trigger(AUDIT_TRIGGER_LOW_SPACE);
 331                 }
 332
 333         /* Check if the current log file is full; if so, call for
 334          * a log rotate. This is not an exact comparison; we may
 335          * write some records over the limit. If that's not
 336          * acceptable, then add a fudge factor here.
 337          */
 338         if ((audit_fstat.af_filesz != 0) &&
 339             (audit_file_rotate_wait == 0) &&
 340             (vattr.va_size >= audit_fstat.af_filesz)) {
 341                 audit_file_rotate_wait = 1;
 342                 send_trigger(AUDIT_TRIGGER_OPEN_NEW);
 343         }
 344
 345         /*
 346          * If the estimated amount of audit data in the audit event queue
 347          * (plus records allocated but not yet queued) has reached the
 348          * amount of free space on the disk, then we need to go into an
 349          * audit fail stop state, in which we do not permit the
 350          * allocation/committing of any new audit records.  We continue to
 351          * process packets but don't allow any activities that might
 352          * generate new records.  In the future, we might want to detect
 353          * when space is available again and allow operation to continue,
 354          * but this behavior is sufficient to meet fail stop requirements
 355          * in CAPP.
 356          */
 357         if (audit_fail_stop &&
 358             (unsigned long)
 359             ((audit_q_len + audit_pre_q_len + 1) * MAX_AUDIT_RECORD_SIZE) /
 360             mnt_stat->f_bsize >= (unsigned long)(mnt_stat->f_bfree)) {
 361                 printf(
 362     "audit_worker: free space below size of audit queue, failing stop\n");
 363                 audit_in_failure = 1;
 364         }
 365
 366         /*
 367          * If there is a user audit record attached to the kernel record,
 368          * then write the user record.
 369          */
 370         /* XXX Need to decide a few things here: IF the user audit
 371          * record is written, but the write of the kernel record fails,
 372          * what to do? Should the kernel record come before or after the
 373          * user record? For now, we write the user record first, and
 374          * we ignore errors.
 375          */
 376         if (ar->k_ar_commit & AR_COMMIT_USER) {
 377                 /*
 378                  * Try submitting the record to any active audit pipes.
 379                  */
 380                 audit_pipe_submit((void *)ar->k_udata, ar->k_ulen);
 381
 382                 /*
 383                  * And to disk.
 384                  */
 385                 ret = vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen,
 386                           (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL,
 387                           NULL, td);
 388                 if (ret)
 389                         goto out;
 390         }
 391
 392         /*
 393          * Convert the internal kernel record to BSM format and write it
 394          * out if everything's OK.
 395          */
 396         if (!(ar->k_ar_commit & AR_COMMIT_KERNEL)) {
 397                 ret = 0;
 398                 goto out;
 399         }
 400
 401         /*
 402          * XXXAUDIT: Should we actually allow this conversion to fail?  With
 403          * sleeping memory allocation and invariants checks, perhaps not.
 404          */
 405         ret = kaudit_to_bsm(ar, &bsm);
 406         if (ret == BSM_NOAUDIT) {
 407                 ret = 0;
 408                 goto out;
 409         }
 410
 411         /*
 412          * XXX: We drop the record on BSM conversion failure, but really
 413          * this is an assertion failure.
 414          */
 415         if (ret == BSM_FAILURE) {
 416                 AUDIT_PRINTF(("BSM conversion failure\n"));
 417                 ret = EINVAL;
 418                 goto out;
 419         }
 420
 421         /*
 422          * Try submitting the record to any active audit pipes.
 423          */
 424         audit_pipe_submit((void *)bsm->data, bsm->len);
 425
 426         /*
 427          * XXX
 428          * We should break the write functionality away from the BSM record
 429          * generation and have the BSM generation done before this function
 430          * is called. This function will then take the BSM record as a
 431          * parameter.
 432          */
 433         ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
 434             (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td));
 435
 436         kau_free(bsm);
 437
 438 out:
 439         /*
 440          * When we're done processing the current record, we have to
 441          * check to see if we're in a failure mode, and if so, whether
 442          * this was the last record left to be drained.  If we're done
 443          * draining, then we fsync the vnode and panic.
 444          */
 445         if (audit_in_failure &&
 446             audit_q_len == 0 && audit_pre_q_len == 0) {
 447                 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
 448                 (void)VOP_FSYNC(vp, MNT_WAIT, td);
 449                 VOP_UNLOCK(vp, 0, td);
 450                 panic("Audit store overflow; record queue drained.");
 451         }
 452
 453         VFS_UNLOCK_GIANT(vfslocked);
 454
 455         return (ret);
 456 }
 457
 458 /*
 459  * The audit_worker thread is responsible for watching the event queue,
 460  * dequeueing records, converting them to BSM format, and committing them to
 461  * disk.  In order to minimize lock thrashing, records are dequeued in sets
 462  * to a thread-local work queue.  In addition, the audit_work performs the
 463  * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
 464  * variable.
 465  */
 466 static void
 467 audit_worker(void *arg)
 468 {
 469         int do_replacement_signal, error;
 470         TAILQ_HEAD(, kaudit_record) ar_worklist;
 471         struct kaudit_record *ar;
 472         struct vnode *audit_vp, *old_vp;
 473         int vfslocked;
 474
 475         struct ucred *audit_cred, *old_cred;
 476         struct thread *audit_td;
 477
 478         AUDIT_PRINTF(("audit_worker starting\n"));
 479
 480         /*
 481          * These are thread-local variables requiring no synchronization.
 482          */
 483         TAILQ_INIT(&ar_worklist);
 484         audit_cred = NULL;
 485         audit_td = curthread;
 486         audit_vp = NULL;
 487
 488         mtx_lock(&audit_mtx);
 489         while (1) {
 490                 /*
 491                  * First priority: replace the audit log target if requested.
 492                  * Accessing the vnode here requires dropping the audit_mtx;
 493                  * in case another replacement was scheduled while the mutex
 494                  * was released, we loop.
 495                  *
 496                  * XXX It could well be we should drain existing records
 497                  * first to ensure that the timestamps and ordering
 498                  * are right.
 499                  */
 500                 do_replacement_signal = 0;
 501                 while (audit_replacement_flag != 0) {
 502                         old_cred = audit_cred;
 503                         old_vp = audit_vp;
 504                         audit_cred = audit_replacement_cred;
 505                         audit_vp = audit_replacement_vp;
 506                         audit_replacement_cred = NULL;
 507                         audit_replacement_vp = NULL;
 508                         audit_replacement_flag = 0;
 509
 510                         audit_enabled = (audit_vp != NULL);
 511
 512                         /*
 513                          * XXX: What to do about write failures here?
 514                          */
 515                         if (old_vp != NULL) {
 516                                 AUDIT_PRINTF(("Closing old audit file\n"));
 517                                 mtx_unlock(&audit_mtx);
 518                                 vfslocked = VFS_LOCK_GIANT(old_vp->v_mount);
 519                                 vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred,
 520                                     audit_td);
 521                                 VFS_UNLOCK_GIANT(vfslocked);
 522                                 crfree(old_cred);
 523                                 mtx_lock(&audit_mtx);
 524                                 old_cred = NULL;
 525                                 old_vp = NULL;
 526                                 AUDIT_PRINTF(("Audit file closed\n"));
 527                         }
 528                         if (audit_vp != NULL) {
 529                                 AUDIT_PRINTF(("Opening new audit file\n"));
 530                         }
 531                         do_replacement_signal = 1;
 532                 }
 533                 /*
 534                  * Signal that replacement have occurred to wake up and
 535                  * start any other replacements started in parallel.  We can
 536                  * continue about our business in the mean time.  We
 537                  * broadcast so that both new replacements can be inserted,
 538                  * but also so that the source(s) of replacement can return
 539                  * successfully.
 540                  */
 541                 if (do_replacement_signal)
 542                         cv_broadcast(&audit_replacement_cv);
 543
 544                 /*
 545                  * Next, check to see if we have any records to drain into
 546                  * the vnode.  If not, go back to waiting for an event.
 547                  */
 548                 if (TAILQ_EMPTY(&audit_q)) {
 549                         AUDIT_PRINTF(("audit_worker waiting\n"));
 550                         cv_wait(&audit_cv, &audit_mtx);
 551                         AUDIT_PRINTF(("audit_worker woken up\n"));
 552         AUDIT_PRINTF(("audit_worker: new vp = %p; value of flag %d\n",
 553             audit_replacement_vp, audit_replacement_flag));
 554                         continue;
 555                 }
 556
 557                 /*
 558                  * If we have records, but there's no active vnode to write
 559                  * to, drain the record queue.  Generally, we prevent the
 560                  * unnecessary allocation of records elsewhere, but we need
 561                  * to allow for races between conditional allocation and
 562                  * queueing.  Go back to waiting when we're done.
 563                  */
 564                 if (audit_vp == NULL) {
 565                         while ((ar = TAILQ_FIRST(&audit_q))) {
 566                                 TAILQ_REMOVE(&audit_q, ar, k_q);
 567                                 uma_zfree(audit_record_zone, ar);
 568                                 audit_q_len--;
 569                                 /*
 570                                  * XXXRW: Why broadcast if we hold the
 571                                  * mutex and know that audit_vp is NULL?
 572                                  */
 573                                 if (audit_q_len <= audit_qctrl.aq_lowater)
 574                                         cv_broadcast(&audit_commit_cv);
 575                         }
 576                         continue;
 577                 }
 578
 579                 /*
 580                  * We have both records to write and an active vnode to write
 581                  * to.  Dequeue a record, and start the write.  Eventually,
 582                  * it might make sense to dequeue several records and perform
 583                  * our own clustering, if the lower layers aren't doing it
 584                  * automatically enough.
 585                  */
 586                 while ((ar = TAILQ_FIRST(&audit_q))) {
 587                         TAILQ_REMOVE(&audit_q, ar, k_q);
 588                         audit_q_len--;
 589                         if (audit_q_len <= audit_qctrl.aq_lowater)
 590                                 cv_broadcast(&audit_commit_cv);
 591                         TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
 592                 }
 593
 594                 mtx_unlock(&audit_mtx);
 595                 while ((ar = TAILQ_FIRST(&ar_worklist))) {
 596                         TAILQ_REMOVE(&ar_worklist, ar, k_q);
 597                         if (audit_vp != NULL) {
 598                                 error = audit_record_write(audit_vp, ar,
 599                                     audit_cred, audit_td);
 600                                 if (error && audit_panic_on_write_fail)
 601                                         panic("audit_worker: write error %d\n",
 602                                             error);
 603                                 else if (error)
 604                                         printf("audit_worker: write error %d\n",
 605                                             error);
 606                         }
 607                         uma_zfree(audit_record_zone, ar);
 608                 }
 609                 mtx_lock(&audit_mtx);
 610         }
 611 }
 612
 613 /*
 614  * Initialize the Audit subsystem: configuration state, work queue,
 615  * synchronization primitives, worker thread, and trigger device node.  Also
 616  * call into the BSM assembly code to initialize it.
 617  */
 618 static void
 619 audit_init(void)
 620 {
 621         int error;
 622
 623         printf("Security auditing service present\n");
 624         audit_enabled = 0;
 625         audit_suspended = 0;
 626         audit_panic_on_write_fail = 0;
 627         audit_fail_stop = 0;
 628         audit_in_failure = 0;
 629
 630         audit_replacement_vp = NULL;
 631         audit_replacement_cred = NULL;
 632         audit_replacement_flag = 0;
 633
 634         audit_fstat.af_filesz = 0;      /* '0' means unset, unbounded */
 635         audit_fstat.af_currsz = 0;
 636         audit_nae_mask.am_success = AU_NULL;
 637         audit_nae_mask.am_failure = AU_NULL;
 638
 639         TAILQ_INIT(&audit_q);
 640         audit_q_len = 0;
 641         audit_pre_q_len = 0;
 642         audit_qctrl.aq_hiwater = AQ_HIWATER;
 643         audit_qctrl.aq_lowater = AQ_LOWATER;
 644         audit_qctrl.aq_bufsz = AQ_BUFSZ;
 645         audit_qctrl.aq_minfree = AU_FS_MINFREE;
 646
 647         mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF);
 648         cv_init(&audit_cv, "audit_cv");
 649         cv_init(&audit_replacement_cv, "audit_replacement_cv");
 650         cv_init(&audit_commit_cv, "audit_commit_cv");
 651         cv_init(&audit_fail_cv, "audit_fail_cv");
 652
 653         audit_record_zone = uma_zcreate("audit_record_zone",
 654             sizeof(struct kaudit_record), audit_record_ctor,
 655             audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
 656
 657         /* Initialize the BSM audit subsystem. */
 658         kau_init();
 659
 660         audit_file_rotate_wait = 0;
 661         audit_trigger_init();
 662
 663         /* Register shutdown handler. */
 664         EVENTHANDLER_REGISTER(shutdown_pre_sync, audit_shutdown, NULL,
 665             SHUTDOWN_PRI_FIRST);
 666
 667         error = kthread_create(audit_worker, NULL, &audit_thread, RFHIGHPID,
 668             0, "audit_worker");
 669         if (error != 0)
 670                 panic("audit_init: kthread_create returned %d", error);
 671 }
 672
 673 SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL)
 674
 675 /*
 676  * audit_rotate_vnode() is called by a user or kernel thread to configure or
 677  * de-configure auditing on a vnode.  The arguments are the replacement
 678  * credential and vnode to substitute for the current credential and vnode,
 679  * if any.  If either is set to NULL, both should be NULL, and this is used
 680  * to indicate that audit is being disabled.  The real work is done in the
 681  * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
 682  * to complete.
 683  *
 684  * The vnode should be referenced and opened by the caller.  The credential
 685  * should be referenced.  audit_rotate_vnode() will own both references as of
 686  * this call, so the caller should not release either.
 687  *
 688  * XXXAUDIT: Review synchronize communication logic.  Really, this is a
 689  * message queue of depth 1.
 690  *
 691  * XXXAUDIT: Enhance the comments below to indicate that we are basically
 692  * acquiring ownership of the communications queue, inserting our message,
 693  * and waiting for an acknowledgement.
 694  */
 695 void
 696 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
 697 {
 698
 699         /*
 700          * If other parallel log replacements have been requested, we wait
 701          * until they've finished before continuing.
 702          */
 703         mtx_lock(&audit_mtx);
 704         while (audit_replacement_flag != 0) {
 705                 AUDIT_PRINTF(("audit_rotate_vnode: sleeping to wait for "
 706                     "flag\n"));
 707                 cv_wait(&audit_replacement_cv, &audit_mtx);
 708                 AUDIT_PRINTF(("audit_rotate_vnode: woken up (flag %d)\n",
 709                     audit_replacement_flag));
 710         }
 711         audit_replacement_cred = cred;
 712         audit_replacement_flag = 1;
 713         audit_replacement_vp = vp;
 714
 715         /*
 716          * Wake up the audit worker to perform the exchange once we
 717          * release the mutex.
 718          */
 719         cv_signal(&audit_cv);
 720
 721         /*
 722          * Wait for the audit_worker to broadcast that a replacement has
 723          * taken place; we know that once this has happened, our vnode
 724          * has been replaced in, so we can return successfully.
 725          */
 726         AUDIT_PRINTF(("audit_rotate_vnode: waiting for news of "
 727             "replacement\n"));
 728         cv_wait(&audit_replacement_cv, &audit_mtx);
 729         AUDIT_PRINTF(("audit_rotate_vnode: change acknowledged by "
 730             "audit_worker (flag " "now %d)\n", audit_replacement_flag));
 731         mtx_unlock(&audit_mtx);
 732
 733         audit_file_rotate_wait = 0; /* We can now request another rotation */
 734 }
 735
 736 /*
 737  * Drain the audit queue and close the log at shutdown.  Note that this can
 738  * be called both from the system shutdown path and also from audit
 739  * configuration syscalls, so 'arg' and 'howto' are ignored.
 740  */
 741 void
 742 audit_shutdown(void *arg, int howto)
 743 {
 744
 745         audit_rotate_vnode(NULL, NULL);
 746 }
 747
 748 /*
 749  * Return the current thread's audit record, if any.
 750  */
 751 __inline__ struct kaudit_record *
 752 currecord(void)
 753 {
 754
 755         return (curthread->td_ar);
 756 }
 757
 758 /*
 759  * MPSAFE
 760  *
 761  * XXXAUDIT: There are a number of races present in the code below due to
 762  * release and re-grab of the mutex.  The code should be revised to become
 763  * slightly less racy.
 764  *
 765  * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available
 766  * pre_q space, suspending the system call until there is room?
 767  */
 768 struct kaudit_record *
 769 audit_new(int event, struct thread *td)
 770 {
 771         struct kaudit_record *ar;
 772         int no_record;
 773
 774         mtx_lock(&audit_mtx);
 775         no_record = (audit_suspended || !audit_enabled);
 776         mtx_unlock(&audit_mtx);
 777         if (no_record)
 778                 return (NULL);
 779
 780         /*
 781          * XXX: The number of outstanding uncommitted audit records is
 782          * limited to the number of concurrent threads servicing system
 783          * calls in the kernel.
 784          */
 785         ar = uma_zalloc_arg(audit_record_zone, td, M_WAITOK);
 786         ar->k_ar.ar_event = event;
 787
 788         mtx_lock(&audit_mtx);
 789         audit_pre_q_len++;
 790         mtx_unlock(&audit_mtx);
 791
 792         return (ar);
 793 }
 794
 795 /*
 796  * MPSAFE
 797  */
 798 void
 799 audit_commit(struct kaudit_record *ar, int error, int retval)
 800 {
 801         int sorf;
 802         struct au_mask *aumask;
 803
 804         if (ar == NULL)
 805                 return;
 806
 807         /*
 808          * Decide whether to commit the audit record by checking the
 809          * error value from the system call and using the appropriate
 810          * audit mask.
 811          *
 812          * XXXAUDIT: Synchronize access to audit_nae_mask?
 813          */
 814         if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID)
 815                 aumask = &audit_nae_mask;
 816         else
 817                 aumask = &ar->k_ar.ar_subj_amask;
 818
 819         if (error)
 820                 sorf = AU_PRS_FAILURE;
 821         else
 822                 sorf = AU_PRS_SUCCESS;
 823
 824         switch(ar->k_ar.ar_event) {
 825
 826         case AUE_OPEN_RWTC:
 827                 /* The open syscall always writes a AUE_OPEN_RWTC event; change
 828                  * it to the proper type of event based on the flags and the
 829                  * error value.
 830                  */
 831                 ar->k_ar.ar_event = flags_and_error_to_openevent(
 832                     ar->k_ar.ar_arg_fflags, error);
 833                 break;
 834
 835         case AUE_SYSCTL:
 836                 ar->k_ar.ar_event = ctlname_to_sysctlevent(
 837                     ar->k_ar.ar_arg_ctlname, ar->k_ar.ar_valid_arg);
 838                 break;
 839
 840         case AUE_AUDITON:
 841                 /* Convert the auditon() command to an event */
 842                 ar->k_ar.ar_event = auditon_command_event(ar->k_ar.ar_arg_cmd);
 843                 break;
 844         }
 845
 846         if (au_preselect(ar->k_ar.ar_event, aumask, sorf) != 0)
 847                 ar->k_ar_commit |= AR_COMMIT_KERNEL;
 848
 849         /*
 850          * XXXRW: Why is this necessary?  Should we ever accept a record that
 851          * we're not willing to commit?
 852          */
 853         if ((ar->k_ar_commit & (AR_COMMIT_USER | AR_COMMIT_KERNEL)) == 0) {
 854                 mtx_lock(&audit_mtx);
 855                 audit_pre_q_len--;
 856                 mtx_unlock(&audit_mtx);
 857                 uma_zfree(audit_record_zone, ar);
 858                 return;
 859         }
 860
 861         ar->k_ar.ar_errno = error;
 862         ar->k_ar.ar_retval = retval;
 863
 864         /*
 865          * We might want to do some system-wide post-filtering
 866          * here at some point.
 867          */
 868
 869         /*
 870          * Timestamp system call end.
 871          */
 872         nanotime(&ar->k_ar.ar_endtime);
 873
 874         mtx_lock(&audit_mtx);
 875
 876         /*
 877          * Note: it could be that some records initiated while audit was
 878          * enabled should still be committed?
 879          */
 880         if (audit_suspended || !audit_enabled) {
 881                 audit_pre_q_len--;
 882                 mtx_unlock(&audit_mtx);
 883                 uma_zfree(audit_record_zone, ar);
 884                 return;
 885         }
 886
 887         /*
 888          * Constrain the number of committed audit records based on
 889          * the configurable parameter.
 890          */
 891         while (audit_q_len >= audit_qctrl.aq_hiwater) {
 892                 AUDIT_PRINTF(("audit_commit: sleeping to wait for "
 893                    "audit queue to drain below high water mark\n"));
 894                 cv_wait(&audit_commit_cv, &audit_mtx);
 895                 AUDIT_PRINTF(("audit_commit: woke up waiting for "
 896                    "audit queue draining\n"));
 897         }
 898
 899         TAILQ_INSERT_TAIL(&audit_q, ar, k_q);
 900         audit_q_len++;
 901         audit_pre_q_len--;
 902         cv_signal(&audit_cv);
 903         mtx_unlock(&audit_mtx);
 904 }
 905
 906 /*
 907  * audit_syscall_enter() is called on entry to each system call.  It is
 908  * responsible for deciding whether or not to audit the call (preselection),
 909  * and if so, allocating a per-thread audit record.  audit_new() will fill in
 910  * basic thread/credential properties.
 911  */
 912 void
 913 audit_syscall_enter(unsigned short code, struct thread *td)
 914 {
 915         int audit_event;
 916         struct au_mask *aumask;
 917
 918         KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
 919
 920         /*
 921          * In FreeBSD, each ABI has its own system call table, and hence
 922          * mapping of system call codes to audit events.  Convert the code to
 923          * an audit event identifier using the process system call table
 924          * reference.  In Darwin, there's only one, so we use the global
 925          * symbol for the system call table.
 926          *
 927          * XXXAUDIT: Should we audit that a bad system call was made, and if
 928          * so, how?
 929          */
 930         if (code >= td->td_proc->p_sysent->sv_size)
 931                 return;
 932
 933         audit_event = td->td_proc->p_sysent->sv_table[code].sy_auevent;
 934         if (audit_event == AUE_NULL)
 935                 return;
 936
 937         /*
 938          * Check which audit mask to use; either the kernel non-attributable
 939          * event mask or the process audit mask.
 940          */
 941         if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID)
 942                 aumask = &audit_nae_mask;
 943         else
 944                 aumask = &td->td_proc->p_au->ai_mask;
 945
 946         /*
 947          * Allocate an audit record, if preselection allows it, and store
 948          * in the thread for later use.
 949          */
 950         if (au_preselect(audit_event, aumask,
 951                         AU_PRS_FAILURE | AU_PRS_SUCCESS)) {
 952                 /*
 953                  * If we're out of space and need to suspend unprivileged
 954                  * processes, do that here rather than trying to allocate
 955                  * another audit record.
 956                  *
 957                  * XXXRW: We might wish to be able to continue here in the
 958                  * future, if the system recovers.  That should be possible
 959                  * by means of checking the condition in a loop around
 960                  * cv_wait().  It might be desirable to reevaluate whether an
 961                  * audit record is still required for this event by
 962                  * re-calling au_preselect().
 963                  */
 964                 if (audit_in_failure && suser(td) != 0) {
 965                         cv_wait(&audit_fail_cv, &audit_mtx);
 966                         panic("audit_failing_stop: thread continued");
 967                 }
 968                 td->td_ar = audit_new(audit_event, td);
 969         } else
 970                 td->td_ar = NULL;
 971 }
 972
 973 /*
 974  * audit_syscall_exit() is called from the return of every system call, or in
 975  * the event of exit1(), during the execution of exit1().  It is responsible
 976  * for committing the audit record, if any, along with return condition.
 977  */
 978 void
 979 audit_syscall_exit(int error, struct thread *td)
 980 {
 981         int retval;
 982
 983         /*
 984          * Commit the audit record as desired; once we pass the record
 985          * into audit_commit(), the memory is owned by the audit
 986          * subsystem.
 987          * The return value from the system call is stored on the user
 988          * thread. If there was an error, the return value is set to -1,
 989          * imitating the behavior of the cerror routine.
 990          */
 991         if (error)
 992                 retval = -1;
 993         else
 994                 retval = td->td_retval[0];
 995
 996         audit_commit(td->td_ar, error, retval);
 997         if (td->td_ar != NULL)
 998                 AUDIT_PRINTF(("audit record committed by pid %d\n",
 999                         td->td_proc->p_pid));
1000         td->td_ar = NULL;
1001
1002 }
1003
1004 /*
1005  * Allocate storage for a new process (init, or otherwise).
1006  */
1007 void
1008 audit_proc_alloc(struct proc *p)
1009 {
1010
1011         KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)",
1012             p->p_pid));
1013         p->p_au = malloc(sizeof(*(p->p_au)), M_AUDITPROC, M_WAITOK);
1014         /* XXXAUDIT: Zero?  Slab allocate? */
1015         //printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au);
1016 }
1017
1018 /*
1019  * Allocate storage for a new thread.
1020  */
1021 void
1022 audit_thread_alloc(struct thread *td)
1023 {
1024
1025         td->td_ar = NULL;
1026 }
1027
1028 /*
1029  * Thread destruction.
1030  */
1031 void
1032 audit_thread_free(struct thread *td)
1033 {
1034
1035         KASSERT(td->td_ar == NULL, ("audit_thread_free: td_ar != NULL"));
1036 }
1037
1038 /*
1039  * Initialize the audit information for the a process, presumably the first
1040  * process in the system.
1041  * XXX It is not clear what the initial values should be for audit ID,
1042  * session ID, etc.
1043  */
1044 void
1045 audit_proc_kproc0(struct proc *p)
1046 {
1047
1048         KASSERT(p->p_au != NULL, ("audit_proc_kproc0: p->p_au == NULL (%d)",
1049             p->p_pid));
1050         //printf("audit_proc_kproc0: pid %d p_au %p\n", p->p_pid, p->p_au);
1051         bzero(p->p_au, sizeof(*(p)->p_au));
1052 }
1053
1054 void
1055 audit_proc_init(struct proc *p)
1056 {
1057
1058         KASSERT(p->p_au != NULL, ("audit_proc_init: p->p_au == NULL (%d)",
1059             p->p_pid));
1060         //printf("audit_proc_init: pid %d p_au %p\n", p->p_pid, p->p_au);
1061         bzero(p->p_au, sizeof(*(p)->p_au));
1062         p->p_au->ai_auid = AU_DEFAUDITID;
1063 }
1064
1065 /*
1066  * Copy the audit info from the parent process to the child process when
1067  * a fork takes place.
1068  */
1069 void
1070 audit_proc_fork(struct proc *parent, struct proc *child)
1071 {
1072
1073         PROC_LOCK_ASSERT(parent, MA_OWNED);
1074         PROC_LOCK_ASSERT(child, MA_OWNED);
1075         KASSERT(parent->p_au != NULL,
1076             ("audit_proc_fork: parent->p_au == NULL (%d)", parent->p_pid));
1077         KASSERT(child->p_au != NULL,
1078             ("audit_proc_fork: child->p_au == NULL (%d)", child->p_pid));
1079         //printf("audit_proc_fork: parent pid %d p_au %p\n", parent->p_pid,
1080         //    parent->p_au);
1081         //printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid,
1082         //    child->p_au);
1083         bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
1084         /*
1085          * XXXAUDIT: Zero pointers to external memory, or assert they are
1086          * zero?
1087          */
1088 }
1089
1090 /*
1091  * Free the auditing structure for the process.
1092  */
1093 void
1094 audit_proc_free(struct proc *p)
1095 {
1096
1097         KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid));
1098         //printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au);
1099         /*
1100          * XXXAUDIT: Assert that external memory pointers are NULL?
1101          */
1102         free(p->p_au, M_AUDITPROC);
1103         p->p_au = NULL;
1104 }