/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/qmath.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <sys/counter.h>
#include <dev/tcp_log/tcp_log_dev.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_hpts.h>

/* Default expiry time */
#define TCP_LOG_EXPIRE_TIME     ((sbintime_t)60 * SBT_1S)

/* Max interval at which to run the expiry timer */
#define TCP_LOG_EXPIRE_INTVL    ((sbintime_t)5 * SBT_1S)

bool    tcp_log_verbose;
static uma_zone_t tcp_log_id_bucket_zone, tcp_log_id_node_zone, tcp_log_zone;
static int      tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
static uint32_t tcp_log_version = TCP_LOG_BUF_VER;
RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
static struct tcp_log_id_tree tcp_log_id_head;
static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
    STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
static struct mtx tcp_log_expireq_mtx;
static struct callout tcp_log_expireq_callout;
static u_long tcp_log_auto_ratio = 0;
static volatile u_long tcp_log_auto_ratio_cur = 0;
static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL;
static bool tcp_log_auto_all = false;
static uint32_t tcp_disable_all_bb_logs = 0;

RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Black Box controls");

SYSCTL_NODE(_net_inet_tcp_bb, OID_AUTO, tp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Black Box Trace Point controls");

SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
    0, "Force verbose logging for TCP traces");

SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
    CTLFLAG_RW, &tcp_log_session_limit, 0,
    "Maximum number of events maintained for each TCP session");

uint32_t tcp_trace_point_config = 0;
SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, number, CTLFLAG_RW,
    &tcp_trace_point_config, TCP_LOG_STATE_HEAD_AUTO,
    "What is the trace point number to activate (0=none, 0xffffffff = all)?");

uint32_t tcp_trace_point_bb_mode = TCP_LOG_STATE_CONTINUAL;
SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, bbmode, CTLFLAG_RW,
    &tcp_trace_point_bb_mode, TCP_LOG_STATE_HEAD_AUTO,
    "Which BB logging mode is activated when the trace point fires?");

int32_t tcp_trace_point_count = 0;
SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, count, CTLFLAG_RW,
    &tcp_trace_point_count, TCP_LOG_STATE_HEAD_AUTO,
    "How many connections that hit the trace point will have BB logging turned on?");

SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
    &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");

SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
    &tcp_log_zone, "Current number of events maintained for all TCP sessions");

SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
    &tcp_log_id_bucket_zone, "Maximum number of log IDs");

SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
    &tcp_log_id_bucket_zone, "Current number of log IDs");

SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
    &tcp_log_id_node_zone, "Maximum number of tcpcbs with log IDs");

SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
    &tcp_log_id_node_zone, "Current number of tcpcbs with log IDs");

SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
    0, "Version of log formats exported");

SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW,
    &tcp_disable_all_bb_logs, 0,
    "Disable all BB logging for all connections");

SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
    &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");

SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
    &tcp_log_auto_mode, 0,
    "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)");

SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
    &tcp_log_auto_all, 0,
    "Auto-select from all sessions (rather than just those with IDs)");

#ifdef TCPLOG_DEBUG_COUNTERS
counter_u64_t tcp_log_queued;
counter_u64_t tcp_log_que_fail1;
counter_u64_t tcp_log_que_fail2;
counter_u64_t tcp_log_que_fail3;
counter_u64_t tcp_log_que_fail4;
counter_u64_t tcp_log_que_fail5;
counter_u64_t tcp_log_que_copyout;
counter_u64_t tcp_log_que_read;
counter_u64_t tcp_log_que_freed;

SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
    &tcp_log_queued, "Number of entries queued");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
    &tcp_log_que_fail1, "Number of entries queued but fail 1");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
    &tcp_log_que_fail2, "Number of entries queued but fail 2");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
    &tcp_log_que_fail3, "Number of entries queued but fail 3");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
    &tcp_log_que_fail4, "Number of entries queued but fail 4");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
    &tcp_log_que_fail5, "Number of entries queued but fail 5");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
    &tcp_log_que_copyout, "Number of entries copied out");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
    &tcp_log_que_read, "Number of entries read from the queue");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
    &tcp_log_que_freed, "Number of entries freed after reading");
#endif

#ifdef INVARIANTS
#define TCPLOG_DEBUG_RINGBUF
#endif
/* Number of requests to consider a PCB ID "active". */
#define ACTIVE_REQUEST_COUNT    10

/* Statistic tracking for "active" PCB IDs. */
static counter_u64_t tcp_log_pcb_ids_cur;
static counter_u64_t tcp_log_pcb_ids_tot;

SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD,
    &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD,
    &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated");

struct tcp_log_mem
{
        STAILQ_ENTRY(tcp_log_mem) tlm_queue;
        struct tcp_log_buffer   tlm_buf;
        struct tcp_log_verbose  tlm_v;
#ifdef TCPLOG_DEBUG_RINGBUF
        volatile int            tlm_refcnt;
#endif
};

/* 60 bytes for the header, + 16 bytes for padding */
static uint8_t  zerobuf[76];

/*
 * Lock order:
 * 1. TCPID_TREE
 * 2. TCPID_BUCKET
 * 3. INP
 *
 * Rules:
 * A. You need a lock on the Tree to add/remove buckets.
 * B. You need a lock on the bucket to add/remove nodes from the bucket.
 * C. To change information in a node, you need the INP lock if the tln_closed
 *    field is false. Otherwise, you need the bucket lock. (Note that the
 *    tln_closed field can change at any point, so you need to recheck the
 *    entry after acquiring the INP lock.)
 * D. To remove a node from the bucket, you must have that entry locked,
 *    according to the criteria of Rule C. Also, the node must not be on
 *    the expiry queue.
 * E. The exception to C is the expiry queue fields, which are locked by
 *    the TCPLOG_EXPIREQ lock.
 *
 * Buckets have a reference count. Each node is a reference. Further,
 * other callers may add reference counts to keep a bucket from disappearing.
 * You can add a reference as long as you own a lock sufficient to keep the
 * bucket from disappearing. For example, a common use is:
 *   a. Have a locked INP, but need to lock the TCPID_BUCKET.
 *   b. Add a refcount on the bucket. (Safe because the INP lock prevents
 *      the TCPID_BUCKET from going away.)
 *   c. Drop the INP lock.
 *   d. Acquire a lock on the TCPID_BUCKET.
 *   e. Acquire a lock on the INP.
 *   f. Drop the refcount on the bucket.
 *      (At this point, the bucket may disappear.)
 *
 * Expire queue lock:
 * You can acquire this with either the bucket or INP lock. Don't reverse it.
 * When the expire code has committed to freeing a node, it resets the expiry
 * time to SBT_MAX. That is the signal to everyone else that they should
 * leave that node alone.
 */
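
/*
 * Illustrative sketch (not compiled in): the reference dance from steps
 * a-f above, as a caller might write it. The macros are the real ones
 * from this file; the surrounding context is hypothetical.
 *
 *	INP_WLOCK_ASSERT(inp);		-- step a: INP held, bucket wanted
 *	tlb = tp->t_lib;
 *	TCPID_BUCKET_REF(tlb);		-- step b: INP lock pins the bucket
 *	INP_WUNLOCK(inp);		-- step c
 *	TCPID_BUCKET_LOCK(tlb);		-- step d: safe; we hold a reference
 *	INP_WLOCK(inp);			-- step e: correct lock order
 *	TCPID_BUCKET_UNREF(tlb);	-- step f: bucket may now disappear
 *
 * In practice the code below uses tcp_log_unref_bucket() for step f so
 * that dropping the final reference also frees the bucket.
 */
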
static struct rwlock tcp_id_tree_lock;
#define TCPID_TREE_WLOCK()              rw_wlock(&tcp_id_tree_lock)
#define TCPID_TREE_RLOCK()              rw_rlock(&tcp_id_tree_lock)
#define TCPID_TREE_UPGRADE()            rw_try_upgrade(&tcp_id_tree_lock)
#define TCPID_TREE_WUNLOCK()            rw_wunlock(&tcp_id_tree_lock)
#define TCPID_TREE_RUNLOCK()            rw_runlock(&tcp_id_tree_lock)
#define TCPID_TREE_WLOCK_ASSERT()       rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
#define TCPID_TREE_RLOCK_ASSERT()       rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
#define TCPID_TREE_UNLOCK_ASSERT()      rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)

#define TCPID_BUCKET_LOCK_INIT(tlb)     mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
#define TCPID_BUCKET_LOCK_DESTROY(tlb)  mtx_destroy(&((tlb)->tlb_mtx))
#define TCPID_BUCKET_LOCK(tlb)          mtx_lock(&((tlb)->tlb_mtx))
#define TCPID_BUCKET_UNLOCK(tlb)        mtx_unlock(&((tlb)->tlb_mtx))
#define TCPID_BUCKET_LOCK_ASSERT(tlb)   mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
#define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)

#define TCPID_BUCKET_REF(tlb)           refcount_acquire(&((tlb)->tlb_refcnt))
#define TCPID_BUCKET_UNREF(tlb)         refcount_release(&((tlb)->tlb_refcnt))

#define TCPLOG_EXPIREQ_LOCK()           mtx_lock(&tcp_log_expireq_mtx)
#define TCPLOG_EXPIREQ_UNLOCK()         mtx_unlock(&tcp_log_expireq_mtx)

SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);

struct tcp_log_id_bucket
{
        /*
         * tlb_id must be first. This lets us use strcmp on
         * (struct tcp_log_id_bucket *) and (char *) interchangeably.
         */
        char                            tlb_id[TCP_LOG_ID_LEN];
        char                            tlb_tag[TCP_LOG_TAG_LEN];
        RB_ENTRY(tcp_log_id_bucket)     tlb_rb;
        struct tcp_log_id_head          tlb_head;
        struct mtx                      tlb_mtx;
        volatile u_int                  tlb_refcnt;
        volatile u_int                  tlb_reqcnt;
        uint32_t                        tlb_loglimit;
        int8_t                          tlb_logstate;
};
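
/*
 * Because tlb_id is the first member, a plain C string can stand in for
 * a bucket during lookups. A minimal sketch of the trick (the same cast
 * tcp_log_set_id() uses with RB_FIND below; the id string here is just
 * an example):
 *
 *	char *id = "my-log-id";
 *	struct tcp_log_id_bucket *tlb;
 *
 *	tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
 *	    (struct tcp_log_id_bucket *)id);
 *
 * This is only valid because tcp_log_id_cmp() touches nothing beyond
 * the tlb_id bytes.
 */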

struct tcp_log_id_node
{
        SLIST_ENTRY(tcp_log_id_node) tln_list;
        STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
        sbintime_t              tln_expiretime; /* Locked by the expireq lock */

        /*
         * If INP is NULL, that means the connection has closed. We've
         * saved the connection endpoint information and the log entries
         * in the tln_ie and tln_entries members. We've also saved a pointer
         * to the enclosing bucket here. If INP is not NULL, the information is
         * in the PCB and not here.
         */
        struct inpcb            *tln_inp;
        struct tcpcb            *tln_tp;
        struct tcp_log_id_bucket *tln_bucket;
        struct in_endpoints     tln_ie;
        struct tcp_log_stailq   tln_entries;
        int                     tln_count;
        volatile int            tln_closed;
        uint8_t                 tln_af;
};

enum tree_lock_state {
        TREE_UNLOCKED = 0,
        TREE_RLOCKED,
        TREE_WLOCKED,
};

/* Do we want to select this session for auto-logging? */
static __inline bool
tcp_log_selectauto(void)
{

        /*
         * If we are doing auto-capturing, figure out whether we will capture
         * this session.
         */
        if (tcp_log_auto_ratio &&
            (tcp_disable_all_bb_logs == 0) &&
            (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) %
            tcp_log_auto_ratio) == 0)
                return (true);
        return (false);
}
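
/*
 * Worked example of the ratio arithmetic above: atomic_fetchadd_long()
 * returns the counter's previous value, so with
 * net.inet.tcp.bb.log_auto_ratio=100 the sessions that observe counter
 * values 0, 100, 200, ... (roughly 1 in 100) return true. A ratio of 1
 * selects every session; a ratio of 0 makes the first condition false
 * and disables auto-selection entirely.
 */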

static __inline int
tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
{
        KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
        KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
        return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
}

RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)

static __inline void
tcp_log_id_validate_tree_lock(int tree_locked)
{

#ifdef INVARIANTS
        switch (tree_locked) {
        case TREE_WLOCKED:
                TCPID_TREE_WLOCK_ASSERT();
                break;
        case TREE_RLOCKED:
                TCPID_TREE_RLOCK_ASSERT();
                break;
        case TREE_UNLOCKED:
                TCPID_TREE_UNLOCK_ASSERT();
                break;
        default:
                kassert_panic("%s:%d: unknown tree lock state", __func__,
                    __LINE__);
        }
#endif
}

static __inline void
tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
{

        TCPID_TREE_WLOCK_ASSERT();
        KASSERT(SLIST_EMPTY(&tlb->tlb_head),
            ("%s: Attempt to remove non-empty bucket", __func__));
        if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) {
#ifdef INVARIANTS
                kassert_panic("%s:%d: error removing element from tree",
                            __func__, __LINE__);
#endif
        }
        TCPID_BUCKET_LOCK_DESTROY(tlb);
        counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);
        uma_zfree(tcp_log_id_bucket_zone, tlb);
}

/*
 * Call with a referenced and locked bucket.
 * Will return true if the bucket was freed; otherwise, false.
 * tlb: The bucket to unreference.
 * tree_locked: A pointer to the state of the tree lock. If the tree lock
 *    state changes, the function will update it.
 * inp: If not NULL and the function needs to drop the inp lock to relock the
 *    tree, it will do so. (The caller must ensure inp will not become invalid,
 *    probably by holding a reference to it.)
 */
static bool
tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
    struct inpcb *inp)
{

        KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
        KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
            __func__));

        tcp_log_id_validate_tree_lock(*tree_locked);

        /*
         * Did we hold the last reference on the tlb? If so, we may need
         * to free it. (Note that we can realistically only execute the
         * loop twice: once without a write lock and once with a write
         * lock.)
         */
        while (TCPID_BUCKET_UNREF(tlb)) {
                /*
                 * We need a write lock on the tree to free this.
                 * If we can upgrade the tree lock, this is "easy". If we
                 * can't upgrade the tree lock, we need to do this the
                 * "hard" way: unwind all our locks and relock everything.
                 * In the meantime, anything could have changed. We even
                 * need to validate that we still need to free the bucket.
                 */
                if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
                        *tree_locked = TREE_WLOCKED;
                else if (*tree_locked != TREE_WLOCKED) {
                        TCPID_BUCKET_REF(tlb);
                        if (inp != NULL)
                                INP_WUNLOCK(inp);
                        TCPID_BUCKET_UNLOCK(tlb);
                        if (*tree_locked == TREE_RLOCKED)
                                TCPID_TREE_RUNLOCK();
                        TCPID_TREE_WLOCK();
                        *tree_locked = TREE_WLOCKED;
                        TCPID_BUCKET_LOCK(tlb);
                        if (inp != NULL)
                                INP_WLOCK(inp);
                        continue;
                }

                /*
                 * We have an empty bucket and a write lock on the tree.
                 * Remove the empty bucket.
                 */
                tcp_log_remove_bucket(tlb);
                return (true);
        }
        return (false);
}
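
/*
 * The canonical caller pattern for tcp_log_unref_bucket(), repeated
 * throughout this file (a sketch; error handling omitted):
 *
 *	tree_locked = TREE_UNLOCKED;
 *	if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
 *		TCPID_BUCKET_UNLOCK(tlb);
 *	if (tree_locked == TREE_WLOCKED)
 *		TCPID_TREE_WUNLOCK();
 *	else if (tree_locked == TREE_RLOCKED)
 *		TCPID_TREE_RUNLOCK();
 *
 * If the function returned true, the bucket (and its lock) is gone, so
 * the caller must not unlock it; it only cleans up whatever tree lock
 * the function left behind.
 */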

/*
 * Call with a locked bucket. This function will release the lock on the
 * bucket before returning.
 *
 * The caller is responsible for freeing the tp->t_lin/tln node!
 *
 * Note: one of tp or both tlb and tln must be supplied.
 *
 * inp: A pointer to the inp. If the function needs to drop the inp lock to
 *    acquire the tree write lock, it will do so. (The caller must ensure inp
 *    will not become invalid, probably by holding a reference to it.)
 * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
 * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
 * tln: A pointer to the node. (optional; ignored if tp is specified)
 * tree_locked: A pointer to the state of the tree lock. If the tree lock
 *    state changes, the function will update it.
 *
 * Will return true if the INP lock was reacquired; otherwise, false.
 */
static bool
tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
    struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
    int *tree_locked)
{
        int orig_tree_locked;

        KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
            ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
            tp, tlb, tln));
        KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
            __func__));

        if (tp != NULL) {
                tlb = tp->t_lib;
                tln = tp->t_lin;
                KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
                KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
        }

        tcp_log_id_validate_tree_lock(*tree_locked);
        TCPID_BUCKET_LOCK_ASSERT(tlb);

        /*
         * Remove the node, clear the log bucket and node from the TCPCB, and
         * decrement the bucket refcount. In the process, if this is the
         * last reference, the bucket will be freed.
         */
        SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
        if (tp != NULL) {
                tp->t_lib = NULL;
                tp->t_lin = NULL;
        }
        orig_tree_locked = *tree_locked;
        if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
                TCPID_BUCKET_UNLOCK(tlb);
        return (*tree_locked != orig_tree_locked);
}

#define RECHECK_INP_CLEAN(cleanup)      do {                    \
        if (inp->inp_flags & INP_DROPPED) {                     \
                rv = ECONNRESET;                                \
                cleanup;                                        \
                goto done;                                      \
        }                                                       \
        tp = intotcpcb(inp);                                    \
} while (0)

#define RECHECK_INP()   RECHECK_INP_CLEAN(/* noop */)
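
/*
 * Typical RECHECK_INP() use (a sketch): whenever the INP lock has been
 * dropped and reacquired, the connection may have been dropped in the
 * interim, so the caller revalidates before touching tp again.
 *
 *	INP_WUNLOCK(inp);
 *	TCPID_BUCKET_LOCK(tlb);
 *	INP_WLOCK(inp);
 *	RECHECK_INP();	-- sets rv = ECONNRESET and jumps to done: if the
 *			-- inp was dropped; otherwise refreshes tp
 *
 * Callers must therefore provide "int rv;", a "done:" label, and a
 * "struct tcpcb *tp;" for the macro to expand against.
 */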

static void
tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));

#ifdef STATS
        if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
                (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
#endif
}

static void
tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb)
{

        atomic_fetchadd_int(&tlb->tlb_reqcnt, 1);
}

int
tcp_log_apply_ratio(struct tcpcb *tp, int ratio)
{
        struct tcp_log_id_bucket *tlb;
        struct inpcb *inp = tptoinpcb(tp);
        uint32_t hash, ratio_hash_thresh;
        int rv, tree_locked;

        rv = 0;
        tree_locked = TREE_UNLOCKED;
        tlb = tp->t_lib;

        INP_WLOCK_ASSERT(inp);
        if (tlb == NULL) {
                INP_WUNLOCK(inp);
                return (EOPNOTSUPP);
        }
        if (ratio)
                ratio_hash_thresh = max(1, UINT32_MAX / ratio);
        else
                ratio_hash_thresh = 0;
        TCPID_BUCKET_REF(tlb);
        INP_WUNLOCK(inp);
        TCPID_BUCKET_LOCK(tlb);

        hash = hash32_buf(tlb->tlb_id, strlen(tlb->tlb_id), 0);
        if (hash > ratio_hash_thresh && tp->_t_logstate == TCP_LOG_STATE_OFF &&
            tlb->tlb_logstate == TCP_LOG_STATE_OFF) {
                /*
                 * Ratio decision not to log this log ID (and this connection
                 * by way of association). We only apply a log-ratio disable
                 * decision if it would not interfere with a log enable
                 * decision made elsewhere, e.g. tcp_log_selectauto() or
                 * setsockopt().
                 */
                tlb->tlb_logstate = TCP_LOG_STATE_RATIO_OFF;
                INP_WLOCK(inp);
                RECHECK_INP();
                (void)tcp_log_state_change(tp, TCP_LOG_STATE_OFF);
done:
                INP_WUNLOCK(inp);
        }

        INP_UNLOCK_ASSERT(inp);
        if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
                TCPID_BUCKET_UNLOCK(tlb);

        if (tree_locked == TREE_WLOCKED) {
                TCPID_TREE_WLOCK_ASSERT();
                TCPID_TREE_WUNLOCK();
        } else if (tree_locked == TREE_RLOCKED) {
                TCPID_TREE_RLOCK_ASSERT();
                TCPID_TREE_RUNLOCK();
        } else
                TCPID_TREE_UNLOCK_ASSERT();

        return (rv);
}
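
/*
 * Worked example for ratio_hash_thresh above: hash32_buf() spreads log
 * IDs (approximately) uniformly over [0, UINT32_MAX], so with ratio = 4,
 * ratio_hash_thresh = UINT32_MAX / 4 and roughly 1 in 4 log IDs hash at
 * or below the threshold and keep logging; the rest are switched to
 * TCP_LOG_STATE_RATIO_OFF. Hashing the ID (rather than flipping a coin
 * per connection) keeps the decision stable across every connection
 * that shares the same log ID.
 */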

/*
 * Associate the specified tag with a particular TCP log ID.
 * Called with INPCB locked. Returns with it unlocked.
 * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID.
 */
int
tcp_log_set_tag(struct tcpcb *tp, char *tag)
{
        struct inpcb *inp = tptoinpcb(tp);
        struct tcp_log_id_bucket *tlb;
        int tree_locked;

        INP_WLOCK_ASSERT(inp);

        tree_locked = TREE_UNLOCKED;
        tlb = tp->t_lib;
        if (tlb == NULL) {
                INP_WUNLOCK(inp);
                return (EOPNOTSUPP);
        }

        TCPID_BUCKET_REF(tlb);
        INP_WUNLOCK(inp);
        TCPID_BUCKET_LOCK(tlb);
        strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN);
        if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
                TCPID_BUCKET_UNLOCK(tlb);

        if (tree_locked == TREE_WLOCKED) {
                TCPID_TREE_WLOCK_ASSERT();
                TCPID_TREE_WUNLOCK();
        } else if (tree_locked == TREE_RLOCKED) {
                TCPID_TREE_RLOCK_ASSERT();
                TCPID_TREE_RUNLOCK();
        } else
                TCPID_TREE_UNLOCK_ASSERT();

        return (0);
}

/*
 * Set the TCP log ID for a TCPCB.
 * Called with INPCB locked. Returns with it unlocked.
 */
int
tcp_log_set_id(struct tcpcb *tp, char *id)
{
        struct tcp_log_id_bucket *tlb, *tmp_tlb;
        struct tcp_log_id_node *tln;
        struct inpcb *inp = tptoinpcb(tp);
        int tree_locked, rv;
        bool bucket_locked, same;

        tlb = NULL;
        tln = NULL;
        tree_locked = TREE_UNLOCKED;
        bucket_locked = false;

restart:
        INP_WLOCK_ASSERT(inp);
        /* See if the ID is unchanged. */
        same = ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
                (tp->t_lib == NULL && *id == 0));
        if (tp->_t_logstate && STAILQ_FIRST(&tp->t_logs) && !same) {
                /*
                 * There are residual logs left; we may be changing
                 * IDs, so dump what we can.
                 */
                switch (tp->_t_logstate) {
                case TCP_LOG_STATE_HEAD_AUTO:
                        (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at id switch",
                                                     M_NOWAIT, false);
                        break;
                case TCP_LOG_STATE_TAIL_AUTO:
                        (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at id switch",
                                                     M_NOWAIT, false);
                        break;
                case TCP_LOG_STATE_CONTINUAL:
                        (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual at id switch",
                                                     M_NOWAIT, false);
                        break;
                case TCP_LOG_VIA_BBPOINTS:
                        (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints at id switch",
                                                     M_NOWAIT, false);
                        break;
                }
        }
        if (same) {
                if (tp->t_lib != NULL) {
                        tcp_log_increment_reqcnt(tp->t_lib);
                        if ((tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) &&
                            (tp->t_log_state_set == 0)) {
                                /* Clone in any logging */

                                tp->_t_logstate = tp->t_lib->tlb_logstate;
                        }
                        if ((tp->t_lib->tlb_loglimit) &&
                            (tp->t_log_state_set == 0)) {
                                /* We also have a limit set */

                                tp->t_loglimit = tp->t_lib->tlb_loglimit;
                        }
                }
                rv = 0;
                goto done;
        }

        /*
         * If the TCPCB had a previous ID, we need to extricate it from
         * the previous list.
         *
         * Drop the TCPCB lock and lock the tree and the bucket.
         * Because this is called in the socket context, we (theoretically)
         * don't need to worry about the INPCB completely going away
         * while we are gone.
         */
        if (tp->t_lib != NULL) {
                tlb = tp->t_lib;
                TCPID_BUCKET_REF(tlb);
                INP_WUNLOCK(inp);

                if (tree_locked == TREE_UNLOCKED) {
                        TCPID_TREE_RLOCK();
                        tree_locked = TREE_RLOCKED;
                }
                TCPID_BUCKET_LOCK(tlb);
                bucket_locked = true;
                INP_WLOCK(inp);

                /*
                 * Unreference the bucket. If our bucket went away, it is no
                 * longer locked or valid.
                 */
                if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
                        bucket_locked = false;
                        tlb = NULL;
                }

                /* Validate the INP. */
                RECHECK_INP();

                /*
                 * Evaluate whether the bucket changed while we were unlocked.
                 *
                 * Possible scenarios here:
                 * 1. Bucket is unchanged and the same one we started with.
                 * 2. The TCPCB no longer has a bucket and our bucket was
                 *    freed.
                 * 3. The TCPCB has a new bucket, whether or not ours was
                 *    freed.
                 * 4. The TCPCB no longer has a bucket and our bucket was
                 *    not freed.
                 *
                 * In cases 2-4, we will start over. In case 1, we will
                 * proceed here to remove the bucket.
                 */
                if (tlb == NULL || tp->t_lib != tlb) {
                        KASSERT(bucket_locked || tlb == NULL,
                            ("%s: bucket_locked (%d) and tlb (%p) are "
                            "inconsistent", __func__, bucket_locked, tlb));

                        if (bucket_locked) {
                                TCPID_BUCKET_UNLOCK(tlb);
                                bucket_locked = false;
                                tlb = NULL;
                        }
                        goto restart;
                }

                /*
                 * Store the (struct tcp_log_id_node) for reuse. Then, remove
                 * it from the bucket. In the process, we may end up relocking.
                 * If so, we need to validate that the INP is still valid, and
                 * the TCPCB entries match what we expect.
                 *
                 * We will clear tlb and change the bucket_locked state just
                 * before calling tcp_log_remove_id_node(), since that function
                 * will unlock the bucket.
                 */
                if (tln != NULL)
                        uma_zfree(tcp_log_id_node_zone, tln);
                tln = tp->t_lin;
                tlb = NULL;
                bucket_locked = false;
                if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
                        RECHECK_INP();

                        /*
                         * If the TCPCB moved to a new bucket while we had
                         * dropped the lock, restart.
                         */
                        if (tp->t_lib != NULL || tp->t_lin != NULL)
                                goto restart;
                }

                /*
                 * Yay! We successfully removed the TCPCB from its old
                 * bucket. Phew!
                 *
                 * On to bigger and better things...
                 */
        }

        /* At this point, the TCPCB should not be in any bucket. */
        KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));

        /*
         * If the new ID is not empty, we need to now assign this TCPCB to a
         * new bucket.
         */
        if (*id) {
                /* Get a new tln, if we don't already have one to reuse. */
                if (tln == NULL) {
                        tln = uma_zalloc(tcp_log_id_node_zone,
                                M_NOWAIT | M_ZERO);
                        if (tln == NULL) {
                                rv = ENOBUFS;
                                goto done;
                        }
                        tln->tln_inp = inp;
                        tln->tln_tp = tp;
                }

                /*
                 * Drop the INP lock for a bit. We don't need it, and dropping
                 * it prevents lock order reversals.
                 */
                INP_WUNLOCK(inp);

                /* Make sure we have at least a read lock on the tree. */
                tcp_log_id_validate_tree_lock(tree_locked);
                if (tree_locked == TREE_UNLOCKED) {
                        TCPID_TREE_RLOCK();
                        tree_locked = TREE_RLOCKED;
                }

refind:
                /*
                 * Remember how we constructed (struct tcp_log_id_bucket),
                 * with tlb_id first, so we can safely cast the id to it
                 * for the purposes of finding.
                 */
                KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
                    __func__, __LINE__));
                tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
                    (struct tcp_log_id_bucket *) id);

                /*
                 * If we didn't find a matching bucket, we need to add a new
                 * one. This requires a write lock. But, of course, we will
                 * need to recheck some things when we re-acquire the lock.
                 */
                if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
                        tree_locked = TREE_WLOCKED;
                        if (!TCPID_TREE_UPGRADE()) {
                                TCPID_TREE_RUNLOCK();
                                TCPID_TREE_WLOCK();

                                /*
                                 * The tree may have changed while we were
                                 * unlocked.
                                 */
                                goto refind;
                        }
                }

                /* If we need to add a new bucket, do it now. */
                if (tmp_tlb == NULL) {
                        /* Allocate new bucket. */
                        tlb = uma_zalloc(tcp_log_id_bucket_zone, M_NOWAIT);
                        if (tlb == NULL) {
                                rv = ENOBUFS;
                                goto done_noinp;
                        }
                        counter_u64_add(tcp_log_pcb_ids_cur, 1);
                        counter_u64_add(tcp_log_pcb_ids_tot, 1);

                        if ((tcp_log_auto_all == false) &&
                            tcp_log_auto_mode &&
                            tcp_log_selectauto()) {
                                /* Save off the log state */
                                tlb->tlb_logstate = tcp_log_auto_mode;
                        } else
                                tlb->tlb_logstate = TCP_LOG_STATE_OFF;
                        tlb->tlb_loglimit = 0;
                        tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */

                        /*
                         * Copy the ID to the bucket.
                         * NB: Don't use strlcpy() unless you are sure
                         * we've always validated NULL termination.
                         *
                         * TODO: When I'm done writing this, see if we
                         * have correctly validated NULL termination and
                         * can use strlcpy(). :-)
                         */
                        strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
                        tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';

                        /*
                         * Take the refcount for the first node and go ahead
                         * and lock this. Note that we zero the tlb_mtx
                         * structure, since 0xdeadc0de flips the right bits
                         * for the code to think that this mutex has already
                         * been initialized. :-(
                         */
                        SLIST_INIT(&tlb->tlb_head);
                        refcount_init(&tlb->tlb_refcnt, 1);
                        tlb->tlb_reqcnt = 1;
                        memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
                        TCPID_BUCKET_LOCK_INIT(tlb);
                        TCPID_BUCKET_LOCK(tlb);
                        bucket_locked = true;

#define FREE_NEW_TLB()  do {                            \
        TCPID_BUCKET_LOCK_DESTROY(tlb);                 \
        uma_zfree(tcp_log_id_bucket_zone, tlb);         \
        counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1);      \
        counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1);      \
        bucket_locked = false;                          \
        tlb = NULL;                                     \
} while (0)
                        /*
                         * Relock the INP and make sure we are still
                         * unassigned.
                         */
                        INP_WLOCK(inp);
                        RECHECK_INP_CLEAN(FREE_NEW_TLB());
                        if (tp->t_lib != NULL) {
                                FREE_NEW_TLB();
                                goto restart;
                        }

                        /* Add the new bucket to the tree. */
                        tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
                            tlb);
                        KASSERT(tmp_tlb == NULL,
                            ("%s: Unexpected conflicting bucket (%p) while "
                            "adding new bucket (%p)", __func__, tmp_tlb, tlb));

                        /*
                         * If we found a conflicting bucket, free the new
                         * one we made and fall through to use the existing
                         * bucket.
                         */
                        if (tmp_tlb != NULL) {
                                FREE_NEW_TLB();
                                INP_WUNLOCK(inp);
                        }
#undef  FREE_NEW_TLB
                }

                /* If we found an existing bucket, use it. */
                if (tmp_tlb != NULL) {
                        tlb = tmp_tlb;
                        TCPID_BUCKET_LOCK(tlb);
                        bucket_locked = true;

                        /*
                         * Relock the INP and make sure we are still
                         * unassigned.
                         */
                        INP_UNLOCK_ASSERT(inp);
                        INP_WLOCK(inp);
                        RECHECK_INP();
                        if (tp->t_lib != NULL) {
                                TCPID_BUCKET_UNLOCK(tlb);
                                bucket_locked = false;
                                tlb = NULL;
                                goto restart;
                        }

                        /* Take a reference on the bucket. */
                        TCPID_BUCKET_REF(tlb);

                        /* Record the request. */
                        tcp_log_increment_reqcnt(tlb);
                }

                tcp_log_grow_tlb(tlb->tlb_id, tp);

                /* Add the new node to the list. */
                SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
                tp->t_lib = tlb;
                tp->t_lin = tln;
                if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) {
                        /* Clone in any logging */

                        tp->_t_logstate = tp->t_lib->tlb_logstate;
                }
                if (tp->t_lib->tlb_loglimit) {
                        /* The loglimit too */

                        tp->t_loglimit = tp->t_lib->tlb_loglimit;
                }
                tln = NULL;
        }

        rv = 0;

done:
        /* Unlock things, as needed, and return. */
        INP_WUNLOCK(inp);
done_noinp:
        INP_UNLOCK_ASSERT(inp);
        if (bucket_locked) {
                TCPID_BUCKET_LOCK_ASSERT(tlb);
                TCPID_BUCKET_UNLOCK(tlb);
        } else if (tlb != NULL)
                TCPID_BUCKET_UNLOCK_ASSERT(tlb);
        if (tree_locked == TREE_WLOCKED) {
                TCPID_TREE_WLOCK_ASSERT();
                TCPID_TREE_WUNLOCK();
        } else if (tree_locked == TREE_RLOCKED) {
                TCPID_TREE_RLOCK_ASSERT();
                TCPID_TREE_RUNLOCK();
        } else
                TCPID_TREE_UNLOCK_ASSERT();
        if (tln != NULL)
                uma_zfree(tcp_log_id_node_zone, tln);
        return (rv);
}

/*
 * Get the TCP log ID for a TCPCB.
 * Called with INPCB locked.
 * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long.
 * Returns number of bytes copied.
 */
size_t
tcp_log_get_id(struct tcpcb *tp, char *buf)
{
        size_t len;

        INP_LOCK_ASSERT(tptoinpcb(tp));
        if (tp->t_lib != NULL) {
                len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
                KASSERT(len < TCP_LOG_ID_LEN,
                    ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
                    __func__, __LINE__, len));
        } else {
                *buf = '\0';
                len = 0;
        }
        return (len);
}
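
/*
 * Example use (a sketch; the caller and its locking context are
 * hypothetical):
 *
 *	char id[TCP_LOG_ID_LEN];
 *	size_t len;
 *
 *	INP_WLOCK(inp);
 *	len = tcp_log_get_id(tp, id);
 *	INP_WUNLOCK(inp);
 *
 * Unlike tcp_log_get_tag() below, this function leaves the INP lock
 * held, so the caller unlocks it.
 */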

/*
 * Get the tag associated with the TCPCB's log ID.
 * Called with INPCB locked. Returns with it unlocked.
 * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long.
 * Returns number of bytes copied.
 */
size_t
tcp_log_get_tag(struct tcpcb *tp, char *buf)
{
        struct inpcb *inp = tptoinpcb(tp);
        struct tcp_log_id_bucket *tlb;
        size_t len;
        int tree_locked;

        INP_WLOCK_ASSERT(inp);

        tree_locked = TREE_UNLOCKED;
        tlb = tp->t_lib;

        if (tlb != NULL) {
                TCPID_BUCKET_REF(tlb);
                INP_WUNLOCK(inp);
                TCPID_BUCKET_LOCK(tlb);
                len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN);
                KASSERT(len < TCP_LOG_TAG_LEN,
                    ("%s:%d: tp->t_lib->tlb_tag too long (%zu)",
                    __func__, __LINE__, len));
                if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
                        TCPID_BUCKET_UNLOCK(tlb);

                if (tree_locked == TREE_WLOCKED) {
                        TCPID_TREE_WLOCK_ASSERT();
                        TCPID_TREE_WUNLOCK();
                } else if (tree_locked == TREE_RLOCKED) {
                        TCPID_TREE_RLOCK_ASSERT();
                        TCPID_TREE_RUNLOCK();
                } else
                        TCPID_TREE_UNLOCK_ASSERT();
        } else {
                INP_WUNLOCK(inp);
                *buf = '\0';
                len = 0;
        }

        return (len);
}

/*
 * Get number of connections with the same log ID.
 * Log ID is taken from given TCPCB.
 * Called with INPCB locked.
 */
u_int
tcp_log_get_id_cnt(struct tcpcb *tp)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));
        return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt);
}

#ifdef TCPLOG_DEBUG_RINGBUF
/*
 * Functions/macros to increment/decrement reference count for a log
 * entry. This should catch when we do a double-free/double-remove or
 * a double-add.
 */
static inline void
_tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
    int line)
{
        int refcnt;

        refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
        if (refcnt != 0)
                panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
                    func, line, log_entry, refcnt);
}
#define tcp_log_entry_refcnt_add(l)     \
    _tcp_log_entry_refcnt_add((l), __func__, __LINE__)

static inline void
_tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
    int line)
{
        int refcnt;

        refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
        if (refcnt != 1)
                panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
                    func, line, log_entry, refcnt);
}
#define tcp_log_entry_refcnt_rem(l)     \
    _tcp_log_entry_refcnt_rem((l), __func__, __LINE__)

#else /* !TCPLOG_DEBUG_RINGBUF */

#define tcp_log_entry_refcnt_add(l)
#define tcp_log_entry_refcnt_rem(l)

#endif
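
/*
 * How the ring-buffer debug refcount catches errors (a sketch): each
 * entry's tlm_refcnt toggles between 0 (free) and 1 (on a queue), so a
 * double add or a double remove trips the panic above.
 *
 *	tcp_log_entry_refcnt_add(e);	-- 0 -> 1, OK
 *	tcp_log_entry_refcnt_add(e);	-- 1 -> 2, panics: expected 0
 */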
1144
1145 /*
1146  * Cleanup after removing a log entry, but only decrement the count if we
1147  * are running INVARIANTS.
1148  */
1149 static inline void
1150 tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
1151 {
1152
1153         uma_zfree(tcp_log_zone, log_entry);
1154 #ifdef INVARIANTS
1155         (*count)--;
1156         KASSERT(*count >= 0,
1157             ("%s: count unexpectedly negative", __func__));
1158 #endif
1159 }
1160
1161 static void
1162 tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
1163 {
1164         struct tcp_log_mem *log_entry;
1165
1166         /* Free the entries. */
1167         while ((log_entry = STAILQ_FIRST(head)) != NULL) {
1168                 STAILQ_REMOVE_HEAD(head, tlm_queue);
1169                 tcp_log_entry_refcnt_rem(log_entry);
1170                 tcp_log_free_log_common(log_entry, count);
1171         }
1172 }
1173
1174 /* Cleanup after removing a log entry. */
1175 static inline void
1176 tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1177 {
1178         uma_zfree(tcp_log_zone, log_entry);
1179         tp->t_lognum--;
1180         KASSERT(tp->t_lognum >= 0,
1181             ("%s: tp->t_lognum unexpectedly negative", __func__));
1182 }
1183
1184 /* Remove a log entry from the head of a list. */
1185 static inline void
1186 tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
1187 {
1188
1189         KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs),
1190             ("%s: attempt to remove non-HEAD log entry", __func__));
1191         STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1192         tcp_log_entry_refcnt_rem(log_entry);
1193         tcp_log_remove_log_cleanup(tp, log_entry);
1194 }
1195
1196 #ifdef TCPLOG_DEBUG_RINGBUF
1197 /*
1198  * Initialize the log entry's reference count, which we want to
1199  * survive allocations.
1200  */
1201 static int
1202 tcp_log_zone_init(void *mem, int size, int flags __unused)
1203 {
1204         struct tcp_log_mem *tlm;
1205
1206         KASSERT(size >= sizeof(struct tcp_log_mem),
1207             ("%s: unexpectedly short (%d) allocation", __func__, size));
1208         tlm = (struct tcp_log_mem *)mem;
1209         tlm->tlm_refcnt = 0;
1210         return (0);
1211 }
1212
1213 /*
1214  * Double check that the refcnt is zero on allocation and return.
1215  */
1216 static int
1217 tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
1218 {
1219         struct tcp_log_mem *tlm;
1220
1221         KASSERT(size >= sizeof(struct tcp_log_mem),
1222             ("%s: unexpectedly short (%d) allocation", __func__, size));
1223         tlm = (struct tcp_log_mem *)mem;
1224         if (tlm->tlm_refcnt != 0)
1225                 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1226                     __func__, __LINE__, tlm, tlm->tlm_refcnt);
1227         return (0);
1228 }
1229
1230 static void
1231 tcp_log_zone_dtor(void *mem, int size, void *args __unused)
1232 {
1233         struct tcp_log_mem *tlm;
1234
1235         KASSERT(size >= sizeof(struct tcp_log_mem),
1236             ("%s: unexpectedly short (%d) allocation", __func__, size));
1237         tlm = (struct tcp_log_mem *)mem;
1238         if (tlm->tlm_refcnt != 0)
1239                 panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
1240                     __func__, __LINE__, tlm, tlm->tlm_refcnt);
1241 }
1242 #endif /* TCPLOG_DEBUG_RINGBUF */
1243
1244 /* Do global initialization. */
1245 void
1246 tcp_log_init(void)
1247 {
1248
1249         tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
1250 #ifdef TCPLOG_DEBUG_RINGBUF
1251             tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
1252 #else
1253             NULL, NULL, NULL,
1254 #endif
1255             NULL, UMA_ALIGN_PTR, 0);
1256         (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
1257         tcp_log_id_bucket_zone = uma_zcreate("tcp_log_id_bucket",
1258             sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
1259             UMA_ALIGN_PTR, 0);
1260         tcp_log_id_node_zone = uma_zcreate("tcp_log_id_node",
1261             sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
1262             UMA_ALIGN_PTR, 0);
1263 #ifdef TCPLOG_DEBUG_COUNTERS
1264         tcp_log_queued = counter_u64_alloc(M_WAITOK);
1265         tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
1266         tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
1267         tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
1268         tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
1269         tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
1270         tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
1271         tcp_log_que_read = counter_u64_alloc(M_WAITOK);
1272         tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
1273 #endif
1274         tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK);
1275         tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK);
1276
1277         rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
1278         mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
1279         callout_init(&tcp_log_expireq_callout, 1);
1280 }
1281
1282 /* Do per-TCPCB initialization. */
1283 void
1284 tcp_log_tcpcbinit(struct tcpcb *tp)
1285 {
1286
1287         /* A new TCPCB should start out zero-initialized. */
1288         STAILQ_INIT(&tp->t_logs);
1289
1290         /*
1291          * If we are doing auto-capturing, figure out whether we will capture
1292          * this session.
1293          */
1294         tp->t_loglimit = tcp_log_session_limit;
1295         if ((tcp_log_auto_all == true) &&
1296             tcp_log_auto_mode &&
1297             tcp_log_selectauto()) {
1298                 tp->_t_logstate = tcp_log_auto_mode;
1299                 tp->t_flags2 |= TF2_LOG_AUTO;
1300         }
1301 }
1302
1303 /* Remove entries */
1304 static void
1305 tcp_log_expire(void *unused __unused)
1306 {
1307         struct tcp_log_id_bucket *tlb;
1308         struct tcp_log_id_node *tln;
1309         sbintime_t expiry_limit;
1310         int tree_locked;
1311
1312         TCPLOG_EXPIREQ_LOCK();
1313         if (callout_pending(&tcp_log_expireq_callout)) {
1314                 /* Callout was reset. */
1315                 TCPLOG_EXPIREQ_UNLOCK();
1316                 return;
1317         }
1318
1319         /*
1320          * Process entries until we reach one that expires too far in the
1321          * future. Look one second in the future.
1322          */
1323         expiry_limit = getsbinuptime() + SBT_1S;
1324         tree_locked = TREE_UNLOCKED;
1325
1326         while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
1327             tln->tln_expiretime <= expiry_limit) {
1328                 if (!callout_active(&tcp_log_expireq_callout)) {
1329                         /*
1330                          * Callout was stopped. I guess we should
1331                          * just quit at this point.
1332                          */
1333                         TCPLOG_EXPIREQ_UNLOCK();
1334                         return;
1335                 }
1336
1337                 /*
1338                  * Remove the node from the head of the list and unlock
1339                  * the list. Change the expiry time to SBT_MAX as a signal
1340                  * to other threads that we now own this.
1341                  */
1342                 STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
1343                 tln->tln_expiretime = SBT_MAX;
1344                 TCPLOG_EXPIREQ_UNLOCK();
1345
1346                 /*
1347                  * Remove the node from the bucket.
1348                  */
1349                 tlb = tln->tln_bucket;
1350                 TCPID_BUCKET_LOCK(tlb);
1351                 if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
1352                         tcp_log_id_validate_tree_lock(tree_locked);
1353                         if (tree_locked == TREE_WLOCKED)
1354                                 TCPID_TREE_WUNLOCK();
1355                         else
1356                                 TCPID_TREE_RUNLOCK();
1357                         tree_locked = TREE_UNLOCKED;
1358                 }
1359
1360                 /* Drop the INP reference. */
1361                 INP_WLOCK(tln->tln_inp);
1362                 if (!in_pcbrele_wlocked(tln->tln_inp))
1363                         INP_WUNLOCK(tln->tln_inp);
1364
1365                 /* Free the log records. */
1366                 tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
1367
1368                 /* Free the node. */
1369                 uma_zfree(tcp_log_id_node_zone, tln);
1370
1371                 /* Relock the expiry queue. */
1372                 TCPLOG_EXPIREQ_LOCK();
1373         }
1374
1375         /*
1376          * We've expired all the entries we can. Do we need to reschedule
1377          * ourselves?
1378          */
1379         callout_deactivate(&tcp_log_expireq_callout);
1380         if (tln != NULL) {
1381                 /*
1382                  * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
1383                  * set the next callout to that. (This helps ensure we generally
1384                  * run the callout no more often than desired.)
1385                  */
1386                 expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
1387                 if (expiry_limit < tln->tln_expiretime)
1388                         expiry_limit = tln->tln_expiretime;
1389                 callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
1390                     SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
1391         }
1392
1393         /* We're done. */
1394         TCPLOG_EXPIREQ_UNLOCK();
1395         return;
1396 }
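
/*
 * (Illustrative sketch of the SBT_MAX hand-off used above: any other
 * thread walking the expiry queue must check tln_expiretime under the
 * expiry-queue lock and skip nodes the timer already owns; compare the
 * check in tcp_log_dumpbucketlogs() below.)
 */
#if 0
	TCPLOG_EXPIREQ_LOCK();
	if (tln->tln_expiretime == SBT_MAX) {
		/* The expire timer owns this node now; leave it alone. */
		TCPLOG_EXPIREQ_UNLOCK();
		return;
	}
	TCPLOG_EXPIREQ_UNLOCK();
#endif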
1397
1398 /*
1399  * Move log data from the TCPCB to a new node. This will reset the TCPCB log
1400  * entries and log count; however, it will not touch other things from the
1401  * TCPCB (e.g. t_lin, t_lib).
1402  *
1403  * NOTE: Must hold a write lock on the INP.
1404  */
1405 static void
1406 tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
1407 {
1408         struct inpcb *inp = tptoinpcb(tp);
1409
1410         INP_WLOCK_ASSERT(inp);
1411
1412         tln->tln_ie = inp->inp_inc.inc_ie;
1413         if (inp->inp_inc.inc_flags & INC_ISIPV6)
1414                 tln->tln_af = AF_INET6;
1415         else
1416                 tln->tln_af = AF_INET;
1417         tln->tln_entries = tp->t_logs;
1418         tln->tln_count = tp->t_lognum;
1419         tln->tln_bucket = tp->t_lib;
1420
1421         /* Clear information from the PCB. */
1422         STAILQ_INIT(&tp->t_logs);
1423         tp->t_lognum = 0;
1424 }
1425
1426 /* Do per-TCPCB cleanup */
1427 void
1428 tcp_log_tcpcbfini(struct tcpcb *tp)
1429 {
1430         struct tcp_log_id_node *tln, *tln_first;
1431         struct tcp_log_mem *log_entry;
1432         sbintime_t callouttime;
1433
1434
1435         INP_WLOCK_ASSERT(tptoinpcb(tp));
1436         if (tp->_t_logstate) {
1437                 union tcp_log_stackspecific log;
1438                 struct timeval tv;
1439 #ifdef TCP_ACCOUNTING
1440                 struct tcp_log_buffer *lgb;
1441                 int i;
1442
1443                 memset(&log, 0, sizeof(log));
1444                 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
1445                         for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
1446                                 log.u_raw.u64_flex[i] = tp->tcp_cnt_counters[i];
1447                         }
1448                         lgb = tcp_log_event(tp, NULL,
1449                                   NULL,
1450                                   NULL,
1451                                   TCP_LOG_ACCOUNTING, 0,
1452                                   0, &log, false, NULL, NULL, 0, &tv);
1453                         if (lgb != NULL) {
1454                                 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1455                                 lgb->tlb_flex2 = 1;
1456                         } else
1457                                 goto skip_out;
1458                         for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
1459                                 log.u_raw.u64_flex[i] = tp->tcp_proc_time[i];
1460                         }
1461                         lgb = tcp_log_event(tp, NULL,
1462                                  NULL,
1463                                  NULL,
1464                                  TCP_LOG_ACCOUNTING, 0,
1465                                  0, &log, false, NULL, NULL, 0, &tv);
1466                         if (lgb != NULL) {
1467                                 lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
1468                                 lgb->tlb_flex2 = 2;
1469                         }
1470                 }
1471 skip_out:
1472 #endif
1473                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1474                 log.u_bbr.cur_del_rate = tp->t_end_info;
1475                 (void)tcp_log_event(tp, NULL,
1476                          NULL,
1477                          NULL,
1478                          TCP_LOG_CONNEND, 0,
1479                          0, &log, false, NULL, NULL, 0, &tv);
1480         }
1481         /*
1482          * If we were gathering packets to be automatically dumped, try to do
1483          * it now. If this succeeds, the log information in the TCPCB will be
1484          * cleared. Otherwise, we'll handle the log information as we do
1485          * for other states.
1486          */
1487         switch(tp->_t_logstate) {
1488         case TCP_LOG_STATE_HEAD_AUTO:
1489                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1490                     M_NOWAIT, false);
1491                 break;
1492         case TCP_LOG_STATE_TAIL_AUTO:
1493                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
1494                     M_NOWAIT, false);
1495                 break;
1496         case TCP_LOG_VIA_BBPOINTS:
1497                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1498                     M_NOWAIT, false);
1499                 break;
1500         case TCP_LOG_STATE_CONTINUAL:
1501                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1502                     M_NOWAIT, false);
1503                 break;
1504         }
1505
1506         /*
1507          * There are two ways we could keep logs: per-socket or per-ID. If
1508          * we are tracking logs with an ID, then the logs survive the
1509          * destruction of the TCPCB.
1510          *
1511          * If the TCPCB is associated with an ID node, move the logs from the
1512          * TCPCB to the ID node. In theory, this is safe, for reasons which I
1513          * will now explain for my own benefit when I next need to figure out
1514          * this code. :-)
1515          *
1516          * We own the INP lock. Therefore, no one else can change the contents
1517          * of this node (Rule C). Further, no one can remove this node from
1518          * the bucket while we hold the lock (Rule D). Basically, no one can
1519          * mess with this node. That leaves two states in which we could be:
1520          *
1521          * 1. Another thread is currently waiting to acquire the INP lock, with
1522          *    plans to do something with this node. When we drop the INP lock,
1523          *    they will have a chance to do that. They will recheck the
1524          *    tln_closed field (see note to Rule C) and then acquire the
1525          *    bucket lock before proceeding further.
1526          *
1527          * 2. Another thread will try to acquire a lock at some point in the
1528          *    future. If they try to acquire a lock before we set the
1529          *    tln_closed field, they will follow state #1. If they try to
1530          *    acquire a lock after we set the tln_closed field, they will be
1531          *    able to make changes to the node, at will, following Rule C.
1532          *
1533          * Therefore, we currently own this node and can make any changes
1534          * we want. But, as soon as we set the tln_closed field to true, we
1535          * have effectively dropped our lock on the node. (For this reason, we
1536          * also need to make sure our writes are ordered correctly. An atomic
1537          * operation with "release" semantics should be sufficient.)
1538          */
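        /*
         * (Illustrative note, an assumption rather than code from this
         * file: a thread in state #2 would pair the release below with
         * acquire semantics, e.g. atomic_load_acq_int(&tln->tln_closed),
         * before trusting the contents of the node.)
         */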
1539
1540         if (tp->t_lin != NULL) {
1541                 struct inpcb *inp = tptoinpcb(tp);
1542
1543                 /* Copy the relevant information to the log entry. */
1544                 tln = tp->t_lin;
1545                 KASSERT(tln->tln_inp == inp,
1546                     ("%s: Mismatched inp (tln->tln_inp=%p, tp inpcb=%p)",
1547                     __func__, tln->tln_inp, inp));
1548                 tcp_log_move_tp_to_node(tp, tln);
1549
1550                 /* Clear information from the PCB. */
1551                 tp->t_lin = NULL;
1552                 tp->t_lib = NULL;
1553
1554                 /*
1555                  * Take a reference on the INP. This ensures that the INP
1556                  * remains valid while the node is on the expiry queue, and
1557                  * that it stays valid for other threads that may be
1558                  * racing to lock this node when we move it to the expire
1559                  * queue.
1560                  */
1561                 in_pcbref(inp);
1562
1563                 /*
1564                  * Store the entry on the expiry list. The exact behavior
1565                  * depends on whether we have entries to keep. If so, we
1566                  * put the entry at the tail of the list and expire in
1567                  * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
1568                  * the entry at the head of the list. (Handling the cleanup
1569                  * via the expiry timer lets us avoid locking messiness here.)
1570                  */
1571                 tln->tln_expiretime = getsbinuptime();
1572                 TCPLOG_EXPIREQ_LOCK();
1573                 if (tln->tln_count) {
1574                         tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
1575                         if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
1576                             !callout_active(&tcp_log_expireq_callout)) {
1577                                 /*
1578                                  * We are adding the first entry and a callout
1579                                  * is not currently scheduled; therefore, we
1580                                  * need to schedule one.
1581                                  */
1582                                 callout_reset_sbt(&tcp_log_expireq_callout,
1583                                     tln->tln_expiretime, SBT_1S, tcp_log_expire,
1584                                     NULL, C_ABSOLUTE);
1585                         }
1586                         STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
1587                             tln_expireq);
1588                 } else {
1589                         callouttime = tln->tln_expiretime +
1590                             TCP_LOG_EXPIRE_INTVL;
1591                         tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
1592
1593                         if ((tln_first == NULL ||
1594                             callouttime < tln_first->tln_expiretime) &&
1595                             (callout_pending(&tcp_log_expireq_callout) ||
1596                             !callout_active(&tcp_log_expireq_callout))) {
1597                                 /*
1598                                  * The list is empty, or we want to run the
1599                                  * expire code before the first entry's timer
1600                                  * fires. Also, we are in a case where a callout
1601                                  * is not actively running. We want to reset
1602                                  * the callout to occur sooner.
1603                                  */
1604                                 callout_reset_sbt(&tcp_log_expireq_callout,
1605                                     callouttime, SBT_1S, tcp_log_expire, NULL,
1606                                     C_ABSOLUTE);
1607                         }
1608
1609                         /*
1610                          * Insert to the head, or just after the head, as
1611                          * appropriate. (This might result in small
1612                          * mis-orderings as a bunch of "expire now" entries
1613                          * gather at the start of the list, but that should
1614                          * not produce big problems, since the expire timer
1615                          * will walk through all of them.)
1616                          */
1617                         if (tln_first == NULL ||
1618                             tln->tln_expiretime < tln_first->tln_expiretime)
1619                                 STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
1620                                     tln_expireq);
1621                         else
1622                                 STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
1623                                     tln_first, tln, tln_expireq);
1624                 }
1625                 TCPLOG_EXPIREQ_UNLOCK();
1626
1627                 /*
1628                  * We are done messing with the tln. After this point, we
1629                  * can't touch it. (Note that the "release" semantics should
1630                  * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
1631                  * Therefore, they should be unnecessary here. However, it
1632                  * seems like a good idea to include them anyway, since we
1633                  * really are releasing a lock here.)
1634                  */
1635                 atomic_store_rel_int(&tln->tln_closed, 1);
1636         } else {
1637                 /* Remove log entries. */
1638                 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1639                         tcp_log_remove_log_head(tp, log_entry);
1640                 KASSERT(tp->t_lognum == 0,
1641                     ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1642                         __func__, tp->t_lognum));
1643         }
1644
1645         /*
1646          * Change the log state to off (just in case anything tries to sneak
1647          * in a last-minute log).
1648          */
1649         tp->_t_logstate = TCP_LOG_STATE_OFF;
1650 }
1651
1652 static void
1653 tcp_log_purge_tp_logbuf(struct tcpcb *tp)
1654 {
1655         struct tcp_log_mem *log_entry;
1656
1657         INP_WLOCK_ASSERT(tptoinpcb(tp));
1658         if (tp->t_lognum == 0)
1659                 return;
1660
1661         while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1662                 tcp_log_remove_log_head(tp, log_entry);
1663         KASSERT(tp->t_lognum == 0,
1664                 ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
1665                  __func__, tp->t_lognum));
1666         tp->_t_logstate = TCP_LOG_STATE_OFF;
1667 }
1668
1669 /*
1670  * This logs an event for a TCP socket. Normally, this is called via
1671  * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
1672  * TCP_LOG_EVENT().
1673  */
1674
1675 struct tcp_log_buffer *
1676 tcp_log_event(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
1677     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
1678     union tcp_log_stackspecific *stackinfo, int th_hostorder,
1679     const char *output_caller, const char *func, int line, const struct timeval *itv)
1680 {
1681         struct tcp_log_mem *log_entry;
1682         struct tcp_log_buffer *log_buf;
1683         int attempt_count = 0;
1684         struct tcp_log_verbose *log_verbose;
1685         uint32_t logsn;
1686
1687         KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
1688             ("%s called with inconsistent func (%p) and line (%d) arguments",
1689                 __func__, func, line));
1690
1691         INP_WLOCK_ASSERT(tptoinpcb(tp));
1692         if (tcp_disable_all_bb_logs) {
1693                 /*
1694                  * The global shutdown logging
1695                  * switch has been thrown. Call
1696                  * the purge function, which
1697                  * frees the logs and
1698                  * turns off logging.
1699                  */
1700                 tcp_log_purge_tp_logbuf(tp);
1701                 return (NULL);
1702         }
1703         KASSERT(tp->_t_logstate == TCP_LOG_STATE_HEAD ||
1704             tp->_t_logstate == TCP_LOG_STATE_TAIL ||
1705             tp->_t_logstate == TCP_LOG_STATE_CONTINUAL ||
1706             tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
1707             tp->_t_logstate == TCP_LOG_VIA_BBPOINTS ||
1708             tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO,
1709             ("%s called with unexpected tp->_t_logstate (%d)", __func__,
1710                 tp->_t_logstate));
1711
1712         /*
1713          * Get the serial number. We do this early so it will
1714          * increment even if we end up skipping the log entry for some
1715          * reason.
1716          */
1717         logsn = tp->t_logsn++;
1718
1719         /*
1720          * Can we get a new log entry? If so, increment the lognum counter
1721          * here.
1722          */
1723 retry:
1724         if (tp->t_lognum < tp->t_loglimit) {
1725                 if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
1726                         tp->t_lognum++;
1727         } else
1728                 log_entry = NULL;
1729
1730         /* Do we need to try to reuse? */
1731         if (log_entry == NULL) {
1732                 /*
1733                  * Sacrifice auto-logged sessions without a log ID if
1734                  * tcp_log_auto_all is false. (If they don't have a log
1735                  * ID by now, it is probable that either they won't get one
1736                  * or we are resource-constrained.)
1737                  */
1738                 if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
1739                     !tcp_log_auto_all) {
1740                         if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
1741 #ifdef INVARIANTS
1742                                 panic("%s:%d: tcp_log_state_change() failed "
1743                                     "to set tp %p to TCP_LOG_STATE_CLEAR",
1744                                     __func__, __LINE__, tp);
1745 #endif
1746                                 tp->_t_logstate = TCP_LOG_STATE_OFF;
1747                         }
1748                         return (NULL);
1749                 }
1750                 /*
1751                  * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
1752                  * the buffers. If successful, deactivate tracing. Otherwise,
1753                  * leave it active so we will retry.
1754                  */
1755                 if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
1756                     !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
1757                     M_NOWAIT, false)) {
1758                         tp->_t_logstate = TCP_LOG_STATE_OFF;
1759                         return (NULL);
1760                 } else if ((tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) &&
1761                     !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
1762                     M_NOWAIT, false)) {
1763                         if (attempt_count == 0) {
1764                                 attempt_count++;
1765                                 goto retry;
1766                         }
1767 #ifdef TCPLOG_DEBUG_COUNTERS
1768                         counter_u64_add(tcp_log_que_fail4, 1);
1769 #endif
1770                         return (NULL);
1771
1772                 } else if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) &&
1773                     !tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
1774                     M_NOWAIT, false)) {
1775                         if (attempt_count == 0) {
1776                                 attempt_count++;
1777                                 goto retry;
1778                         }
1779 #ifdef TCPLOG_DEBUG_COUNTERS
1780                         counter_u64_add(tcp_log_que_fail4, 1);
1781 #endif
1782                         return (NULL);
1783                 } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO)
1784                         return (NULL);
1785
1786                 /* If in HEAD state, just deactivate the tracing and return. */
1787                 if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
1788                         tp->_t_logstate = TCP_LOG_STATE_OFF;
1789                         return (NULL);
1790                 }
1791                 /*
1792                  * Get a buffer to reuse. If that fails, just give up.
1793                  * (We can't log anything without a buffer in which to
1794                  * put it.)
1795                  *
1796                  * Note that we don't change the t_lognum counter
1797                  * here. Because we are re-using the buffer, the total
1798                  * number won't change.
1799                  */
1800                 if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
1801                         return (NULL);
1802                 STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
1803                 tcp_log_entry_refcnt_rem(log_entry);
1804         }
1805
1806         KASSERT(log_entry != NULL,
1807             ("%s: log_entry unexpectedly NULL", __func__));
1808
1809         /* Extract the log buffer and verbose buffer pointers. */
1810         log_buf = &log_entry->tlm_buf;
1811         log_verbose = &log_entry->tlm_v;
1812
1813         /* Basic entries. */
1814         if (itv == NULL)
1815                 microuptime(&log_buf->tlb_tv);
1816         else
1817                 memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
1818         log_buf->tlb_ticks = ticks;
1819         log_buf->tlb_sn = logsn;
1820         log_buf->tlb_stackid = tp->t_fb->tfb_id;
1821         log_buf->tlb_eventid = eventid;
1822         log_buf->tlb_eventflags = 0;
1823         log_buf->tlb_errno = errornum;
1824
1825         /* Socket buffers */
1826         if (rxbuf != NULL) {
1827                 log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
1828                 log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
1829                 log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
1830                 log_buf->tlb_rxbuf.tls_sb_spare = 0;
1831         } else {
1832                 log_buf->tlb_rxbuf.tls_sb_acc = 0;
1833                 log_buf->tlb_rxbuf.tls_sb_ccc = 0;
1834         }
1835         if (txbuf != NULL) {
1836                 log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
1837                 log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
1838                 log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
1839                 log_buf->tlb_txbuf.tls_sb_spare = 0;
1840         } else {
1841                 log_buf->tlb_txbuf.tls_sb_acc = 0;
1842                 log_buf->tlb_txbuf.tls_sb_ccc = 0;
1843         }
1844         /* Copy values from tp to the log entry. */
1845 #define COPY_STAT(f)    log_buf->tlb_ ## f = tp->f
1846 #define COPY_STAT_T(f)  log_buf->tlb_ ## f = tp->t_ ## f
1847         COPY_STAT_T(state);
1848         COPY_STAT_T(starttime);
1849         COPY_STAT(iss);
1850         COPY_STAT_T(flags);
1851         COPY_STAT(snd_una);
1852         COPY_STAT(snd_max);
1853         COPY_STAT(snd_cwnd);
1854         COPY_STAT(snd_nxt);
1855         COPY_STAT(snd_recover);
1856         COPY_STAT(snd_wnd);
1857         COPY_STAT(snd_ssthresh);
1858         COPY_STAT_T(srtt);
1859         COPY_STAT_T(rttvar);
1860         COPY_STAT(rcv_up);
1861         COPY_STAT(rcv_adv);
1862         COPY_STAT(rcv_nxt);
1863         COPY_STAT(rcv_wnd);
1864         COPY_STAT_T(dupacks);
1865         COPY_STAT_T(segqlen);
1866         COPY_STAT(snd_numholes);
1867         COPY_STAT(snd_scale);
1868         COPY_STAT(rcv_scale);
1869         COPY_STAT_T(flags2);
1870         COPY_STAT_T(fbyte_in);
1871         COPY_STAT_T(fbyte_out);
1872 #undef COPY_STAT
1873 #undef COPY_STAT_T
1874         /* Copy stack-specific info. */
1875         if (stackinfo != NULL) {
1876                 memcpy(&log_buf->tlb_stackinfo, stackinfo,
1877                     sizeof(log_buf->tlb_stackinfo));
1878                 log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
1879         }
1880
1881         /* The packet */
1882         log_buf->tlb_len = len;
1883         if (th) {
1884                 int optlen;
1885
1886                 log_buf->tlb_eventflags |= TLB_FLAG_HDR;
1887                 log_buf->tlb_th = *th;
1888                 if (th_hostorder)
1889                         tcp_fields_to_net(&log_buf->tlb_th);
1890                 optlen = (th->th_off << 2) - sizeof (struct tcphdr);
1891                 if (optlen > 0)
1892                         memcpy(log_buf->tlb_opts, th + 1, optlen);
1893         } else {
1894                 memset(&log_buf->tlb_th, 0, sizeof(*th));
1895         }
1896
1897         /* Verbose information */
1898         if (func != NULL) {
1899                 log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
1900                 if (output_caller != NULL)
1901                         strlcpy(log_verbose->tlv_snd_frm, output_caller,
1902                             TCP_FUNC_LEN);
1903                 else
1904                         *log_verbose->tlv_snd_frm = 0;
1905                 strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
1906                 log_verbose->tlv_trace_line = line;
1907         }
1908
1909         /* Insert the new log at the tail. */
1910         STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
1911         tcp_log_entry_refcnt_add(log_entry);
1912         return (log_buf);
1913 }
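
/*
 * (Illustrative caller sketch, not from this file: a TCP stack typically
 * fills in the stack-specific union and invokes the macro wrapper from
 * tcp_log_buf.h, as the RACK/BBR stacks do. The "so", "th", and "tlen"
 * variables below are assumed caller context.)
 */
#if 0
	if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd,
		    TCP_LOG_IN, 0, tlen, &log, true, &tv);
	}
#endif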
1914
1915 /*
1916  * Change the logging state for a TCPCB. Returns 0 on success or an
1917  * error code on failure.
1918  */
1919 int
1920 tcp_log_state_change(struct tcpcb *tp, int state)
1921 {
1922         struct tcp_log_mem *log_entry;
1923         int rv;
1924
1925         INP_WLOCK_ASSERT(tptoinpcb(tp));
1926         rv = 0;
1927         switch(state) {
1928         case TCP_LOG_STATE_CLEAR:
1929                 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
1930                         tcp_log_remove_log_head(tp, log_entry);
1931                 /* Fall through */
1932
1933         case TCP_LOG_STATE_OFF:
1934                 tp->_t_logstate = TCP_LOG_STATE_OFF;
1935                 break;
1936
1937         case TCP_LOG_STATE_TAIL:
1938         case TCP_LOG_STATE_HEAD:
1939         case TCP_LOG_STATE_CONTINUAL:
1940         case TCP_LOG_VIA_BBPOINTS:
1941         case TCP_LOG_STATE_HEAD_AUTO:
1942         case TCP_LOG_STATE_TAIL_AUTO:
1943                 /*
1944                  * When the RATIO_OFF state is set for the bucket, the log ID
1945                  * this tp is associated with has been probabilistically opted
1946                  * out of logging per tcp_log_apply_ratio().
1947                  */
1948                 if (tp->t_lib == NULL ||
1949                     tp->t_lib->tlb_logstate != TCP_LOG_STATE_RATIO_OFF) {
1950                         tp->_t_logstate = state;
1951                 } else {
1952                         rv = ECANCELED;
1953                         tp->_t_logstate = TCP_LOG_STATE_OFF;
1954                 }
1955                 break;
1956
1957         default:
1958                 return (EINVAL);
1959         }
1960         if (tcp_disable_all_bb_logs) {
1961                 /* We are prohibited from doing any logs */
1962                 tp->_t_logstate = TCP_LOG_STATE_OFF;
1963                 rv = EBUSY;
1964         }
1965         tp->t_flags2 &= ~(TF2_LOG_AUTO);
1966
1967         return (rv);
1968 }
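
/*
 * (Userland sketch, an assumption about typical use rather than code from
 * this file: the state above is normally driven through the TCP_LOG socket
 * option; s is an assumed connected socket descriptor.)
 */
#if 0
	/* Assumes <sys/socket.h>, <netinet/in.h>, <netinet/tcp.h>,
	 * <netinet/tcp_log_buf.h>, and <err.h>. */
	int opt = TCP_LOG_STATE_CONTINUAL;

	if (setsockopt(s, IPPROTO_TCP, TCP_LOG, &opt, sizeof(opt)) == -1)
		err(1, "setsockopt(TCP_LOG)");
#endif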
1969
1970 /* If tcp_drain() is called, flush half the log entries. */
1971 void
1972 tcp_log_drain(struct tcpcb *tp)
1973 {
1974         struct tcp_log_mem *log_entry, *next;
1975         int target, skip;
1976
1977         INP_WLOCK_ASSERT(tptoinpcb(tp));
1978         if ((target = tp->t_lognum / 2) == 0)
1979                 return;
1980
1981         /*
1982          * XXXRRS: At this point, I don't think it is wise
1983          * that we do this. All that a drain call means is that
1984          * we are hitting one of the system mbuf limits. BB
1985          * logging, or freeing of them, will not create any
1986          * more mbufs and really has nothing to do with
1987          * the system running out of mbufs. For now I
1988          * am changing this to free any "AUTO" by dumping
1989          * them out. But this should either be changed
1990          * so that it gets called when we hit the BB limit
1991          * or it should just not get called (one of the two)
1992          * since I don't think the mbuf <-> BB log cleanup
1993          * is the right thing to do here.
1994          */
1995         /*
1996          * If we are logging the "head" packets, we want to discard
1997          * from the tail of the queue. Otherwise, we want to discard
1998          * from the head.
1999          */
2000         if (tp->_t_logstate == TCP_LOG_STATE_HEAD) {
2001                 skip = tp->t_lognum - target;
2002                 STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
2003                         if (!--skip)
2004                                 break;
2005                 KASSERT(log_entry != NULL,
2006                     ("%s: skipped through all entries!", __func__));
2007                 if (log_entry == NULL)
2008                         return;
2009                 while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
2010                         STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
2011                         tcp_log_entry_refcnt_rem(next);
2012                         tcp_log_remove_log_cleanup(tp, next);
2013 #ifdef INVARIANTS
2014                         target--;
2015 #endif
2016                 }
2017                 KASSERT(target == 0,
2018                     ("%s: After removing from tail, target was %d", __func__,
2019                         target));
2020         } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
2021                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at drain",
2022                     M_NOWAIT, false);
2023         } else if (tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO) {
2024                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at drain",
2025                     M_NOWAIT, false);
2026         } else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) {
2027                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
2028                     M_NOWAIT, false);
2029         } else if (tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) {
2030                 (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
2031                     M_NOWAIT, false);
2032         } else {
2033                 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
2034                     target--)
2035                         tcp_log_remove_log_head(tp, log_entry);
2036                 KASSERT(target <= 0,
2037                     ("%s: After removing from head, target was %d", __func__,
2038                         target));
2039                 KASSERT(tp->t_lognum > 0,
2040                     ("%s: After removing from head, tp->t_lognum was %d",
2041                         __func__, tp->t_lognum));
2042                 KASSERT(log_entry != NULL,
2043                     ("%s: After removing from head, the tailq was empty",
2044                         __func__));
2045         }
2046 }
2047
2048 static inline int
2049 tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
2050 {
2051
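        /*
         * A non-NULL sopt_td means the request came from userspace and the
         * destination needs copyout(); a NULL thread pointer marks an
         * in-kernel caller (see tcp_log_expandlogbuf()), where a plain
         * bcopy() suffices.
         */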
2052         if (sopt->sopt_td != NULL)
2053                 return (copyout(src, dst, len));
2054         bcopy(src, dst, len);
2055         return (0);
2056 }
2057
2058 static int
2059 tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
2060     struct tcp_log_buffer **end, int count)
2061 {
2062         struct tcp_log_buffer *out_entry;
2063         struct tcp_log_mem *log_entry;
2064         size_t entrysize;
2065         int error;
2066 #ifdef INVARIANTS
2067         int orig_count = count;
2068 #endif
2069
2070         /* Copy the data out. */
2071         error = 0;
2072         out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
2073         STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
2074                 count--;
2075                 KASSERT(count >= 0,
2076                     ("%s:%d: Exceeded expected count (%d) processing list %p",
2077                     __func__, __LINE__, orig_count, log_tailqp));
2078
2079 #ifdef TCPLOG_DEBUG_COUNTERS
2080                 counter_u64_add(tcp_log_que_copyout, 1);
2081 #endif
2082
2083                 /*
2084                  * Skip copying out the header if it isn't present.
2085                  * Instead, copy out zeros (to ensure we don't leak info).
2086                  * TODO: Make sure we truly do zero everything we don't
2087                  * explicitly set.
2088                  */
2089                 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
2090                         entrysize = sizeof(struct tcp_log_buffer);
2091                 else
2092                         entrysize = offsetof(struct tcp_log_buffer, tlb_th);
2093                 error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
2094                     entrysize);
2095                 if (error)
2096                         break;
2097                 if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
2098                         error = tcp_log_copyout(sopt, zerobuf,
2099                             ((uint8_t *)out_entry) + entrysize,
2100                             sizeof(struct tcp_log_buffer) - entrysize);
2101                 }
2102
2103                 /*
2104                  * Copy out the verbose bit, if needed. Either way,
2105                  * increment the output pointer the correct amount.
2106                  */
2107                 if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
2108                         error = tcp_log_copyout(sopt, &log_entry->tlm_v,
2109                             out_entry->tlb_verbose,
2110                             sizeof(struct tcp_log_verbose));
2111                         if (error)
2112                                 break;
2113                         out_entry = (struct tcp_log_buffer *)
2114                             (((uint8_t *) (out_entry + 1)) +
2115                             sizeof(struct tcp_log_verbose));
2116                 } else
2117                         out_entry++;
2118         }
2119         *end = out_entry;
2120         KASSERT(error || count == 0,
2121             ("%s:%d: Less than expected count (%d) processing list %p"
2122             " (%d remain)", __func__, __LINE__, orig_count,
2123             log_tailqp, count));
2124
2125         return (error);
2126 }
2127
2128 /*
2129  * Copy out the buffer. Note that we do incremental copying, so
2130  * sooptcopyout() won't work. However, the goal is to produce the same
2131  * end result as if we copied in the entire user buffer, updated it,
2132  * and then used sooptcopyout() to copy it out.
2133  *
2134  * NOTE: This should be called with a write lock on the PCB; however,
2135  * the function will drop it after it extracts the data from the TCPCB.
2136  */
2137 int
2138 tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
2139 {
2140         struct tcp_log_stailq log_tailq;
2141         struct tcp_log_mem *log_entry, *log_next;
2142         struct tcp_log_buffer *out_entry;
2143         struct inpcb *inp = tptoinpcb(tp);
2144         size_t outsize, entrysize;
2145         int error, outnum;
2146
2147         INP_WLOCK_ASSERT(inp);
2148
2149         /*
2150          * Determine which log entries will fit in the buffer. As an
2151          * optimization, skip this if all the entries will clearly fit
2152          * in the buffer. (However, get an exact size if we are using
2153          * INVARIANTS.)
2154          */
2155 #ifndef INVARIANTS
2156         if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
2157             sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
2158                 log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
2159                 log_next = NULL;
2160                 outsize = 0;
2161                 outnum = tp->t_lognum;
2162         } else {
2163 #endif
2164                 outsize = outnum = 0;
2165                 log_entry = NULL;
2166                 STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
2167                         entrysize = sizeof(struct tcp_log_buffer);
2168                         if (log_next->tlm_buf.tlb_eventflags &
2169                             TLB_FLAG_VERBOSE)
2170                                 entrysize += sizeof(struct tcp_log_verbose);
2171                         if ((sopt->sopt_valsize - outsize) < entrysize)
2172                                 break;
2173                         outsize += entrysize;
2174                         outnum++;
2175                         log_entry = log_next;
2176                 }
2177                 KASSERT(outsize <= sopt->sopt_valsize,
2178                     ("%s: calculated output size (%zu) greater than available "
2179                         "space (%zu)", __func__, outsize, sopt->sopt_valsize));
2180 #ifndef INVARIANTS
2181         }
2182 #endif
2183
2184         /*
2185          * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
2186          * is NULL, silently skip the copy. However, in this case, we
2187          * will leave the list alone and return. Functionally, this
2188          * gives userspace a way to poll for an approximate buffer
2189          * size they will need to get the log entries.
2190          */
2191         if (sopt->sopt_val == NULL) {
2192                 INP_WUNLOCK(inp);
2193                 if (outsize == 0) {
2194                         outsize = outnum * (sizeof(struct tcp_log_buffer) +
2195                             sizeof(struct tcp_log_verbose));
2196                 }
2197                 if (sopt->sopt_valsize > outsize)
2198                         sopt->sopt_valsize = outsize;
2199                 return (0);
2200         }
2201
2202         /*
2203          * Break apart the list. We'll save the ones we want to copy
2204          * out locally and remove them from the TCPCB list. We can
2205          * then drop the INPCB lock while we do the copyout.
2206          *
2207          * There are roughly three cases:
2208          * 1. There was nothing to copy out. That's easy: drop the
2209          * lock and return.
2210          * 2. We are copying out the entire list. Again, that's easy:
2211          * move the whole list.
2212          * 3. We are copying out a partial list. That's harder. We
2213          * need to update the list book-keeping entries.
2214          */
2215         if (log_entry != NULL && log_next == NULL) {
2216                 /* Move entire list. */
2217                 KASSERT(outnum == tp->t_lognum,
2218                     ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
2219                         __func__, __LINE__, outnum, tp->t_lognum));
2220                 log_tailq = tp->t_logs;
2221                 tp->t_lognum = 0;
2222                 STAILQ_INIT(&tp->t_logs);
2223         } else if (log_entry != NULL) {
2224                 /* Move partial list. */
2225                 KASSERT(outnum < tp->t_lognum,
2226                     ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
2227                         __func__, __LINE__, outnum, tp->t_lognum));
2228                 STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
2229                 STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
2230                 KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
2231                     ("%s:%d: tp->t_logs is shorter than expected "
2232                     "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
2233                     __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
2234                 STAILQ_NEXT(log_entry, tlm_queue) = NULL;
2235                 log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
2236                 tp->t_lognum -= outnum;
2237         } else
2238                 STAILQ_INIT(&log_tailq);
2239
2240         /* Drop the PCB lock. */
2241         INP_WUNLOCK(inp);
2242
2243         /* Copy the data out. */
2244         error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
2245
2246         if (error) {
2247                 /* Restore list */
2248                 INP_WLOCK(inp);
2249                 if ((inp->inp_flags & INP_DROPPED) == 0) {
2250                         tp = intotcpcb(inp);
2251
2252                         /* Merge the two lists. */
2253                         STAILQ_CONCAT(&log_tailq, &tp->t_logs);
2254                         tp->t_logs = log_tailq;
2255                         tp->t_lognum += outnum;
2256                 }
2257                 INP_WUNLOCK(inp);
2258         } else {
2259                 /* Sanity check entries */
2260                 KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) ==
2261                     outsize, ("%s: Actual output size (%zu) != "
2262                         "calculated output size (%zu)", __func__,
2263                         (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
2264                         outsize));
2265
2266                 /* Free the entries we just copied out. */
2267                 STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
2268                         tcp_log_entry_refcnt_rem(log_entry);
2269                         uma_zfree(tcp_log_zone, log_entry);
2270                 }
2271         }
2272
2273         sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
2274             (caddr_t)sopt->sopt_val);
2275         return (error);
2276 }
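
/*
 * (Userland sketch of fetching the queued entries; illustrative, with
 * error handling omitted. parse_entries() is a hypothetical consumer and
 * s an assumed connected socket. The kernel trims optlen down to the
 * bytes actually copied out.)
 */
#if 0
	socklen_t optlen = 1024 * 1024;	/* worst-case guess */
	void *buf = malloc(optlen);

	if (getsockopt(s, IPPROTO_TCP, TCP_LOGBUF, buf, &optlen) == 0)
		parse_entries(buf, optlen);	/* hypothetical consumer */
#endif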
2277
2278 static void
2279 tcp_log_free_queue(struct tcp_log_dev_queue *param)
2280 {
2281         struct tcp_log_dev_log_queue *entry;
2282
2283         KASSERT(param != NULL, ("%s: called with NULL param", __func__));
2284         if (param == NULL)
2285                 return;
2286
2287         entry = (struct tcp_log_dev_log_queue *)param;
2288
2289         /* Free the entries. */
2290         tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2291
2292         /* Free the buffer, if it is allocated. */
2293         if (entry->tldl_common.tldq_buf != NULL)
2294                 free(entry->tldl_common.tldq_buf, M_TCPLOGDEV);
2295
2296         /* Free the queue entry. */
2297         free(entry, M_TCPLOGDEV);
2298 }
2299
2300 static struct tcp_log_common_header *
2301 tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
2302 {
2303         struct tcp_log_dev_log_queue *entry;
2304         struct tcp_log_header *hdr;
2305         uint8_t *end;
2306         struct sockopt sopt;
2307         int error;
2308
2309         entry = (struct tcp_log_dev_log_queue *)param;
2310
2311         /* Take a worst-case guess at space needs. */
2312         sopt.sopt_valsize = sizeof(struct tcp_log_header) +
2313             entry->tldl_count * (sizeof(struct tcp_log_buffer) +
2314             sizeof(struct tcp_log_verbose));
2315         hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT);
2316         if (hdr == NULL) {
2317 #ifdef TCPLOG_DEBUG_COUNTERS
2318                 counter_u64_add(tcp_log_que_fail5, entry->tldl_count);
2319 #endif
2320                 return (NULL);
2321         }
2322         sopt.sopt_val = hdr + 1;
2323         sopt.sopt_valsize -= sizeof(struct tcp_log_header);
2324         sopt.sopt_td = NULL;
2325
2326         error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries,
2327             (struct tcp_log_buffer **)&end, entry->tldl_count);
2328         if (error) {
2329                 free(hdr, M_TCPLOGDEV);
2330                 return (NULL);
2331         }
2332
2333         /* Free the entries. */
2334         tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count);
2335         entry->tldl_count = 0;
2336
2337         memset(hdr, 0, sizeof(struct tcp_log_header));
2338         hdr->tlh_version = TCP_LOG_BUF_VER;
2339         hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR;
2340         hdr->tlh_length = end - (uint8_t *)hdr;
2341         hdr->tlh_ie = entry->tldl_ie;
2342         hdr->tlh_af = entry->tldl_af;
2343         getboottime(&hdr->tlh_offset);
2344         strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN);
2345         strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN);
2346         strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN);
2347         return ((struct tcp_log_common_header *)hdr);
2348 }
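
/*
 * (For reference: the buffer built above is one struct tcp_log_header
 * followed immediately by the packed records. Each record is a struct
 * tcp_log_buffer, followed by a struct tcp_log_verbose only when
 * TLB_FLAG_VERBOSE is set in tlb_eventflags; tlh_length covers the header
 * plus all records.)
 */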
2349
2350 /*
2351  * Queue the tcpcb's log buffer for transmission via the log buffer facility.
2352  *
2353  * NOTE: This should be called with a write lock on the PCB.
2354  *
2355  * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2356  * and reacquire the INP lock if it needs to do so.
2357  *
2358  * If force is false, this will only dump auto-logged sessions if
2359  * tcp_log_auto_all is true or if there is a log ID defined for the session.
2360  */
2361 int
2362 tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
2363 {
2364         struct tcp_log_dev_log_queue *entry;
2365         struct inpcb *inp = tptoinpcb(tp);
2366 #ifdef TCPLOG_DEBUG_COUNTERS
2367         int num_entries;
2368 #endif
2369
2370         INP_WLOCK_ASSERT(inp);
2371
2372         /* If there are no log entries, there is nothing to do. */
2373         if (tp->t_lognum == 0)
2374                 return (0);
2375
2376         /* Check for a log ID. */
2377         if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
2378             !tcp_log_auto_all && !force) {
2379                 struct tcp_log_mem *log_entry;
2380
2381                 /*
2382                  * We needed a log ID and none was found. Free the log entries
2383                  * and return success. Also, cancel further logging. If the
2384                  * session doesn't have a log ID by now, we'll assume it isn't
2385                  * going to get one.
2386                  */
2387                 while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
2388                         tcp_log_remove_log_head(tp, log_entry);
2389                 KASSERT(tp->t_lognum == 0,
2390                     ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
2391                         __func__, tp->t_lognum));
2392                 tp->_t_logstate = TCP_LOG_STATE_OFF;
2393                 return (0);
2394         }
2395
2396         /*
2397          * Allocate memory. If we must wait, we'll need to drop the locks
2398          * and reacquire them (and do all the related business that goes
2399          * along with that).
2400          */
2401         entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2402             M_NOWAIT);
2403         if (entry == NULL && (how & M_NOWAIT)) {
2404 #ifdef TCPLOG_DEBUG_COUNTERS
2405                 counter_u64_add(tcp_log_que_fail3, 1);
2406 #endif
2407                 return (ENOBUFS);
2408         }
2409         if (entry == NULL) {
2410                 INP_WUNLOCK(inp);
2411                 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2412                     M_TCPLOGDEV, M_WAITOK);
2413                 INP_WLOCK(inp);
2414                 /*
2415                  * Note that this check is slightly overly-restrictive in
2416                  * that the TCB can survive this event.
2417                  * However, there is currently not a good way to ensure
2418                  * that is the case. So, if we hit this M_WAIT path, we
2419                  * may end up dropping some entries. That seems like a
2420                  * small price to pay for safety.
2421                  */
2422                 if (inp->inp_flags & INP_DROPPED) {
2423                         free(entry, M_TCPLOGDEV);
2424 #ifdef TCPLOG_DEBUG_COUNTERS
2425                         counter_u64_add(tcp_log_que_fail2, 1);
2426 #endif
2427                         return (ECONNRESET);
2428                 }
2429                 tp = intotcpcb(inp);
2430                 if (tp->t_lognum == 0) {
2431                         free(entry, M_TCPLOGDEV);
2432                         return (0);
2433                 }
2434         }
2435
2436         /* Fill in the unique parts of the queue entry. */
2437         if (tp->t_lib != NULL) {
2438                 strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
2439                 strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN);
2440         } else {
2441                 strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
2442                 strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN);
2443         }
2444         if (reason != NULL)
2445                 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2446         else
2447                 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2448         entry->tldl_ie = inp->inp_inc.inc_ie;
2449         if (inp->inp_inc.inc_flags & INC_ISIPV6)
2450                 entry->tldl_af = AF_INET6;
2451         else
2452                 entry->tldl_af = AF_INET;
2453         entry->tldl_entries = tp->t_logs;
2454         entry->tldl_count = tp->t_lognum;
2455
2456         /* Fill in the common parts of the queue entry. */
2457         entry->tldl_common.tldq_buf = NULL;
2458         entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2459         entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2460
2461         /* Clear the log data from the TCPCB. */
2462 #ifdef TCPLOG_DEBUG_COUNTERS
2463         num_entries = tp->t_lognum;
2464 #endif
2465         tp->t_lognum = 0;
2466         STAILQ_INIT(&tp->t_logs);
2467
2468         /* Add the entry. If no one is listening, free the entry. */
2469         if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
2470                 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2471 #ifdef TCPLOG_DEBUG_COUNTERS
2472                 counter_u64_add(tcp_log_que_fail1, num_entries);
2473         } else {
2474                 counter_u64_add(tcp_log_queued, num_entries);
2475 #endif
2476         }
2477         return (0);
2478 }
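
/*
 * (Userland sketch, an assumption about typical use: this dump path can be
 * driven directly with the TCP_LOGDUMP socket option, passing a
 * human-readable reason that ends up in tldl_reason above; s is an assumed
 * connected socket.)
 */
#if 0
	const char *reason = "operator-requested dump";

	if (setsockopt(s, IPPROTO_TCP, TCP_LOGDUMP, reason,
	    strlen(reason) + 1) == -1)
		err(1, "setsockopt(TCP_LOGDUMP)");
#endif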
2479
2480 /*
2481  * Queue the log_id_node's log buffers for transmission via the log buffer
2482  * facility.
2483  *
2484  * NOTE: This should be called with the bucket locked and referenced.
2485  *
2486  * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
2487  * and reacquire the bucket lock if it needs to do so. (The caller must
2488  * ensure that the tln is no longer on any lists so no one else will mess
2489  * with this while the lock is dropped!)
2490  */
2491 static int
2492 tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
2493 {
2494         struct tcp_log_dev_log_queue *entry;
2495         struct tcp_log_id_bucket *tlb;
2496
2497         tlb = tln->tln_bucket;
2498         TCPID_BUCKET_LOCK_ASSERT(tlb);
2499         KASSERT(tlb->tlb_refcnt > 0,
2500             ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
2501             __func__, __LINE__, tln, tlb));
2502         KASSERT(tln->tln_closed,
2503             ("%s:%d: Called for node with tln_closed==false (tln=%p)",
2504             __func__, __LINE__, tln));
2505
2506         /* If there are no log entries, there is nothing to do. */
2507         if (tln->tln_count == 0)
2508                 return (0);
2509
2510         /*
2511          * Allocate memory. If we must wait, we'll need to drop the locks
2512          * and reacquire them (and do all the related business that goes
2513          * along with that).
2514          */
2515         entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
2516             M_NOWAIT);
2517         if (entry == NULL && (how & M_NOWAIT))
2518                 return (ENOBUFS);
2519         if (entry == NULL) {
2520                 TCPID_BUCKET_UNLOCK(tlb);
2521                 entry = malloc(sizeof(struct tcp_log_dev_log_queue),
2522                     M_TCPLOGDEV, M_WAITOK);
2523                 TCPID_BUCKET_LOCK(tlb);
2524         }
2525
2526         /* Fill in the common parts of the queue entry. */
2527         entry->tldl_common.tldq_buf = NULL;
2528         entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
2529         entry->tldl_common.tldq_dtor = tcp_log_free_queue;
2530
2531         /* Fill in the unique parts of the queue entry. */
2532         strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
2533         strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN);
2534         if (reason != NULL)
2535                 strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
2536         else
2537                 strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
2538         entry->tldl_ie = tln->tln_ie;
2539         entry->tldl_entries = tln->tln_entries;
2540         entry->tldl_count = tln->tln_count;
2541         entry->tldl_af = tln->tln_af;
2542
2543         /* Add the entry. If no one is listening, free the entry. */
2544         if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
2545                 tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
2546
2547         return (0);
2548 }
2549
2550 /*
2551  * Queue the log buffers for all sessions in a bucket for transmission via
2552  * the log buffer facility.
2553  *
2554  * NOTE: This should be called with a locked bucket; however, the function
2555  * will drop the lock.
2556  */
2557 #define LOCAL_SAVE      10
2558 static void
2559 tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
2560 {
2561         struct tcp_log_id_node local_entries[LOCAL_SAVE];
2562         struct inpcb *inp;
2563         struct tcpcb *tp;
2564         struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
2565         int i, num_local_entries, tree_locked;
2566         bool expireq_locked;
2567
2568         TCPID_BUCKET_LOCK_ASSERT(tlb);
2569
2570         /*
2571          * Take a reference on the bucket to keep it from disappearing until
2572          * we are done.
2573          */
2574         TCPID_BUCKET_REF(tlb);
2575
2576         /*
2577          * We'll try to create the queue entries without dropping locks.
2578          * However, we might very well need to drop locks to get memory.
2579          * If that's the case, we'll save up to 10 entries on the stack
2580          * and sacrifice the rest. (Otherwise, we would need to worry
2581          * about finding our place again in a potentially changed list.
2582          * It just doesn't seem worth the trouble to do that.)
2583          */
2584         expireq_locked = false;
2585         num_local_entries = 0;
2586         prev_tln = NULL;
2587         tree_locked = TREE_UNLOCKED;
2588         SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
2589                 /*
2590                  * If this isn't associated with a TCPCB, we can pull it off
2591                  * the list now. We need to be careful that the expire timer
2592                  * hasn't already taken ownership (tln_expiretime == SBT_MAX).
2593                  * If so, we let the expire timer code free the data.
2594                  */
2595                 if (cur_tln->tln_closed) {
2596 no_inp:
2597                         /*
2598                          * Get the expireq lock so we can get a consistent
2599                          * read of tln_expiretime and so we can remove this
2600                          * from the expireq.
2601                          */
2602                         if (!expireq_locked) {
2603                                 TCPLOG_EXPIREQ_LOCK();
2604                                 expireq_locked = true;
2605                         }
2606
2607                         /*
2608                          * We ignore entries with tln_expiretime == SBT_MAX.
2609                          * The expire timer code already owns those.
2610                          */
2611                         KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
2612                             ("%s:%d: node on the expire queue without positive "
2613                             "expire time", __func__, __LINE__));
2614                         if (cur_tln->tln_expiretime == SBT_MAX) {
2615                                 prev_tln = cur_tln;
2616                                 continue;
2617                         }
2618
2619                         /* Remove the entry from the expireq. */
2620                         STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
2621                             tcp_log_id_node, tln_expireq);
2622
2623                         /* Remove the entry from the bucket. */
2624                         if (prev_tln != NULL)
2625                                 SLIST_REMOVE_AFTER(prev_tln, tln_list);
2626                         else
2627                                 SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
2628
2629                         /*
2630                          * Drop the INP and bucket reference counts. Due to
2631                          * lock-ordering rules, we need to drop the expire
2632                          * queue lock.
2633                          */
2634                         TCPLOG_EXPIREQ_UNLOCK();
2635                         expireq_locked = false;
2636
2637                         /* Drop the INP reference. */
2638                         INP_WLOCK(cur_tln->tln_inp);
2639                         if (!in_pcbrele_wlocked(cur_tln->tln_inp))
2640                                 INP_WUNLOCK(cur_tln->tln_inp);
2641
2642                         if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2643 #ifdef INVARIANTS
2644                                 panic("%s: Bucket refcount unexpectedly 0.",
2645                                     __func__);
2646 #endif
2647                                 /*
2648                                  * Recover as best we can: free the entry we
2649                                  * own.
2650                                  */
2651                                 tcp_log_free_entries(&cur_tln->tln_entries,
2652                                     &cur_tln->tln_count);
2653                                 uma_zfree(tcp_log_id_node_zone, cur_tln);
2654                                 goto done;
2655                         }
2656
2657                         if (tcp_log_dump_node_logbuf(cur_tln, reason,
2658                             M_NOWAIT)) {
2659                                 /*
2660                          * If we have space, save the entries locally.
2661                                  * Otherwise, free them.
2662                                  */
2663                                 if (num_local_entries < LOCAL_SAVE) {
2664                                         local_entries[num_local_entries] =
2665                                             *cur_tln;
2666                                         num_local_entries++;
2667                                 } else {
2668                                         tcp_log_free_entries(
2669                                             &cur_tln->tln_entries,
2670                                             &cur_tln->tln_count);
2671                                 }
2672                         }
2673
2674                         /* No matter what, we are done with the node now. */
2675                         uma_zfree(tcp_log_id_node_zone, cur_tln);
2676
2677                         /*
2678                          * Because we removed this entry from the list, prev_tln
2679                          * (which tracks the previous entry still on the tlb
2680                          * list) remains unchanged.
2681                          */
2682                         continue;
2683                 }
2684
2685                 /*
2686                  * If we get to this point, the session data is still held in
2687                  * the TCPCB. So, we need to pull the data out of that.
2688                  *
2689                  * We will need to drop the expireq lock so we can lock the INP.
2690                  * We can then try to extract the data the "easy" way. If that
2691                  * fails, we'll save the log entries for later.
2692                  */
2693                 if (expireq_locked) {
2694                         TCPLOG_EXPIREQ_UNLOCK();
2695                         expireq_locked = false;
2696                 }
2697
2698                 /* Lock the INP and then re-check the state. */
2699                 inp = cur_tln->tln_inp;
2700                 INP_WLOCK(inp);
2701                 /*
2702                  * If we caught this while it was transitioning, the data
2703                  * might have moved from the TCPCB to the tln (signified by
2704          * setting tln_closed to true). If so, treat this like an
2705                  * inactive connection.
2706                  */
2707                 if (cur_tln->tln_closed) {
2708                         /*
2709                          * It looks like we may have caught this connection
2710                          * while it was transitioning from active to inactive.
2711                          * Treat this like an inactive connection.
2712                          */
2713                         INP_WUNLOCK(inp);
2714                         goto no_inp;
2715                 }
2716
2717                 /*
2718                  * Try to dump the data from the tp without dropping the lock.
2719                  * If this fails, try to save off the data locally.
2720                  */
2721                 tp = cur_tln->tln_tp;
2722                 if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
2723                     num_local_entries < LOCAL_SAVE) {
2724                         tcp_log_move_tp_to_node(tp,
2725                             &local_entries[num_local_entries]);
2726                         local_entries[num_local_entries].tln_closed = 1;
2727                         KASSERT(local_entries[num_local_entries].tln_bucket ==
2728                             tlb, ("%s: %d: bucket mismatch for node %p",
2729                             __func__, __LINE__, cur_tln));
2730                         num_local_entries++;
2731                 }
2732
2733                 INP_WUNLOCK(inp);
2734
2735                 /*
2736          * We are going to leave the current tln on the list. It will
2737                  * become the previous tln.
2738                  */
2739                 prev_tln = cur_tln;
2740         }
2741
2742         /* Drop our locks, if any. */
2743         KASSERT(tree_locked == TREE_UNLOCKED,
2744             ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
2745         switch (tree_locked) {
2746         case TREE_WLOCKED:
2747                 TCPID_TREE_WUNLOCK();
2748                 tree_locked = TREE_UNLOCKED;
2749                 break;
2750         case TREE_RLOCKED:
2751                 TCPID_TREE_RUNLOCK();
2752                 tree_locked = TREE_UNLOCKED;
2753                 break;
2754         }
2755         if (expireq_locked) {
2756                 TCPLOG_EXPIREQ_UNLOCK();
2757                 expireq_locked = false;
2758         }
2759
2760         /*
2761          * Try again for any saved entries. tcp_log_dump_node_logbuf() is
2762          * guaranteed to free the log entries within the node. And, since
2763          * the node itself is on our stack, we don't need to free it.
2764          */
2765         for (i = 0; i < num_local_entries; i++)
2766                 tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
2767
2768         /* Drop our reference. */
2769         if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
2770                 TCPID_BUCKET_UNLOCK(tlb);
2771
2772 done:
2773         /* Drop our locks, if any. */
2774         switch (tree_locked) {
2775         case TREE_WLOCKED:
2776                 TCPID_TREE_WUNLOCK();
2777                 break;
2778         case TREE_RLOCKED:
2779                 TCPID_TREE_RUNLOCK();
2780                 break;
2781         }
2782         if (expireq_locked)
2783                 TCPLOG_EXPIREQ_UNLOCK();
2784 }
2785 #undef  LOCAL_SAVE
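
/*
 * A standalone sketch (not part of this file) of the bounded local-save
 * pattern tcp_log_dumpbucketlogs() uses above: try each node with a
 * non-blocking dump while locks are held, park up to a fixed number of
 * failures on the stack, and retry those with a blocking dump once every
 * lock has been dropped.  try_dump_nowait() (which consumes the node on
 * success), dump_waitok(), discard(), and the item types are hypothetical
 * stand-ins for tcp_log_dump_node_logbuf(..., M_NOWAIT/M_WAITOK) and
 * friends.
 */
#define SAVE_MAX	10

static void
drain_sketch(struct item_head *head)
{
	struct item saved[SAVE_MAX];
	struct item *it, *tmp;
	int i, nsaved = 0;

	lock_head(head);
	SLIST_FOREACH_SAFE(it, head, link, tmp) {
		if (try_dump_nowait(it) != 0) {
			/* Non-blocking attempt failed; save or sacrifice. */
			if (nsaved < SAVE_MAX)
				saved[nsaved++] = *it;
			else
				discard(it);
		}
	}
	unlock_head(head);

	/* No locks held: the blocking retry is now safe. */
	for (i = 0; i < nsaved; i++)
		dump_waitok(&saved[i]);
}
#undef	SAVE_MAX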
2786
2787 /*
2788  * Queue the log buffers for all sessions in a bucket for transmission via
2789  * the log buffer facility.
2790  *
2791  * NOTE: This should be called with a locked INP; however, the function
2792  * will drop the lock.
2793  */
2794 void
2795 tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
2796 {
2797         struct inpcb *inp = tptoinpcb(tp);
2798         struct tcp_log_id_bucket *tlb;
2799         int tree_locked;
2800
2801         /* Figure out our bucket and lock it. */
2802         INP_WLOCK_ASSERT(inp);
2803         tlb = tp->t_lib;
2804         if (tlb == NULL) {
2805                 /*
2806                  * No bucket; treat this like a request to dump a single
2807                  * session's traces.
2808                  */
2809                 (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
2810                 INP_WUNLOCK(inp);
2811                 return;
2812         }
2813         TCPID_BUCKET_REF(tlb);
2814         INP_WUNLOCK(inp);
2815         TCPID_BUCKET_LOCK(tlb);
2816
2817         /* If we are the last reference, we have nothing more to do here. */
2818         tree_locked = TREE_UNLOCKED;
2819         if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
2820                 switch (tree_locked) {
2821                 case TREE_WLOCKED:
2822                         TCPID_TREE_WUNLOCK();
2823                         break;
2824                 case TREE_RLOCKED:
2825                         TCPID_TREE_RUNLOCK();
2826                         break;
2827                 }
2828                 return;
2829         }
2830
2831         /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */
2832         tcp_log_dumpbucketlogs(tlb, reason);
2833 }
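
/*
 * A minimal sketch (hypothetical names) of the pin-then-relock dance
 * above.  The bucket lock is ordered before the INP lock, so with the
 * INP lock held the bucket cannot be locked directly; instead, a
 * reference keeps the bucket alive across the window in which no lock
 * is held, and the locks are then taken in the proper order.  Only the
 * INP macros below are real; bucket_ref()/bucket_unref()/BUCKET_*() are
 * stand-ins for the TCPID_BUCKET_* machinery.
 */
static void
pin_then_lock_sketch(struct inpcb *inp, struct bucket *b)
{
	INP_WLOCK_ASSERT(inp);

	bucket_ref(b);		/* pin: b cannot be freed after this */
	INP_WUNLOCK(inp);	/* drop the out-of-order lock */
	BUCKET_LOCK(b);		/* safe: no INP lock held */

	if (bucket_unref(b)) {
		/* We held the last reference; bucket_unref() cleaned up. */
		return;
	}
	/* ... work on b with its lock held ... */
	BUCKET_UNLOCK(b);
}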
2834
2835 /*
2836  * Mark the end of a flow with the current stack. A stack can add
2837  * stack-specific info to this trace event by overriding this
2838  * function (see bbr_log_flowend() for example).
2839  */
2840 void
2841 tcp_log_flowend(struct tcpcb *tp)
2842 {
2843         if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
2844                 struct socket *so = tptosocket(tp);
2845                 TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
2846                                 TCP_LOG_FLOWEND, 0, 0, NULL, false);
2847         }
2848 }
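
/*
 * A sketch of how a stack might override tcp_log_flowend() to attach
 * stack-specific state to the flow-end event, mirroring the pattern
 * tcp_log_sendfile() uses below.  The function name and the flex-field
 * choice are illustrative only; see bbr_log_flowend() for the in-tree
 * example the comment above refers to.
 */
static void
example_log_flowend(struct tcpcb *tp)
{
	if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
		struct socket *so = tptosocket(tp);
		struct timeval tv;
		tcp_log_eventspecific_t log;

		microuptime(&tv);
		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = tp->t_srtt;	/* hypothetical extra state */
		TCP_LOG_EVENTP(tp, NULL, &so->so_rcv, &so->so_snd,
		    TCP_LOG_FLOWEND, 0, 0, &log, false, &tv);
	}
}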
2849
2850 void
2851 tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags)
2852 {
2853         struct inpcb *inp;
2854         struct tcpcb *tp;
2855 #ifdef TCP_REQUEST_TRK
2856         struct tcp_sendfile_track *ent;
2857         int i, fnd;
2858 #endif
2859
2860         inp = sotoinpcb(so);
2861         KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
2862
2863         /* quick check to see if logging is enabled for this connection */
2864         tp = intotcpcb(inp);
2865         if ((inp->inp_flags & INP_DROPPED) ||
2866             (tp->_t_logstate == TCP_LOG_STATE_OFF)) {
2867                 return;
2868         }
2869
2870         INP_WLOCK(inp);
2871         /* double check log state now that we have the lock */
2872         if (inp->inp_flags & INP_DROPPED)
2873                 goto done;
2874         if (tp->_t_logstate != TCP_LOG_STATE_OFF) {
2875                 struct timeval tv;
2876                 tcp_log_eventspecific_t log;
2877
2878                 microuptime(&tv);
2879                 log.u_sf.offset = offset;
2880                 log.u_sf.length = nbytes;
2881                 log.u_sf.flags = flags;
2882
2883                 TCP_LOG_EVENTP(tp, NULL,
2884                     &tptosocket(tp)->so_rcv,
2885                     &tptosocket(tp)->so_snd,
2886                     TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
2887         }
2888 #ifdef TCP_REQUEST_TRK
2889         if (tp->t_tcpreq_req == 0) {
2890                 /* No HTTP requests to track */
2891                 goto done;
2892         }
2893         fnd = 0;
2894         if (tp->t_tcpreq_closed == 0) {
2895                 /* No closed end req to track */
2896                 goto skip_closed_req;
2897         }
2898         for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2899                 /* Let's see if this one can be found */
2900                 ent = &tp->t_tcpreq_info[i];
2901                 if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2902                         /* Not used */
2903                         continue;
2904                 }
2905                 if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) {
2906                         /* This pass does not consider open requests */
2907                         continue;
2908                 }
2909                 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) {
2910                         /* Don't look at what we have completed */
2911                         continue;
2912                 }
2913                 /* If we reach here it's an allocated closed-end request */
2914                 if ((ent->start == offset) ||
2915                     ((offset > ent->start) && (offset < ent->end))) {
2916                         /* It's within this request. */
2917                         fnd = 1;
2918                 }
2919                 if (fnd) {
2920                         /*
2921                          * This call falls within the request, so update its state.
2922                          */
2923                         ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2924                         /*
2925                          * When an entry completes we can take (snd_una + sb_ccc) and know where
2926                          * the end of the range really is. Note that this works since two
2927                          * requests must be sequential and sendfile is now complete for *this* request.
2928                          * We must use sb_ccc since the data may still be in-flight in TLS.
2929                          *
2930                          * We always cautiously move the end_seq only if our calculations
2931                          * show it happened (just in case sendfile calls here from the wrong
2932                          * place). When we go COMP we will stop coming here and hopefully be
2933                          * left with the correct end_seq.
2934                          */
2935                         if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
2936                                 ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
2937                         if ((offset + nbytes) >= ent->end) {
2938                                 ent->flags |= TCP_TRK_TRACK_FLG_COMP;
2939                                 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_COMPLETE, offset, nbytes);
2940                         } else {
2941                                 tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_MOREYET, offset, nbytes);
2942                         }
2943                         /* We assume that sendfile never sends overlapping requests */
2944                         goto done;
2945                 }
2946         }
2947 skip_closed_req:
2948         if (!fnd) {
2949                 /* OK, now let's look for open requests */
2950                 for (i = 0; i < MAX_TCP_TRK_REQ; i++) {
2951                         ent = &tp->t_tcpreq_info[i];
2952                         if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) {
2953                                 /* Not used */
2954                                 continue;
2955                         }
2956                         if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0)
2957                                 continue;
2958                         /* If we reach here it's an allocated open request */
2959                         if (ent->start == offset) {
2960                                 /* It begins this request */
2961                                 ent->start_seq = tp->snd_una +
2962                                     tptosocket(tp)->so_snd.sb_ccc;
2963                                 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2964                                 break;
2965                         } else if (offset > ent->start) {
2966                                 ent->flags |= TCP_TRK_TRACK_FLG_SEQV;
2967                                 break;
2968                         }
2969                 }
2970         }
2971 #endif
2972 done:
2973         INP_WUNLOCK(inp);
2974 }
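
/*
 * A compact sketch of the closed-request arithmetic above (the struct
 * and function are illustrative, not part of this file).  A sendfile
 * call covering [offset, offset + nbytes) matches a tracked request
 * [start, end) when offset falls inside it; the sequence-space end of
 * the request is then re-derived as snd_una plus the bytes still held
 * in the send buffer (sb_ccc, which also counts data queued for TLS).
 */
struct trk_sketch {
	off_t	start;		/* first byte offset of the request */
	off_t	end;		/* one past the last byte offset */
	tcp_seq	end_seq;	/* sequence number of the request's end */
	bool	complete;
};

static void
trk_update_sketch(struct trk_sketch *ent, struct tcpcb *tp,
    struct sockbuf *sb, off_t offset, size_t nbytes)
{
	if (offset < ent->start || offset >= ent->end)
		return;		/* this send belongs to another request */

	/* Only ever move end_seq forward, as the code above does. */
	if (SEQ_GT(tp->snd_una + sb->sb_ccc, ent->end_seq))
		ent->end_seq = tp->snd_una + sb->sb_ccc;

	/* The send reached the request's end: it is complete. */
	if ((off_t)(offset + nbytes) >= ent->end)
		ent->complete = true;
}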