sys/netinet/tcp_pcap.c

   1 /*-
   2  * Copyright (c) 2015
   3  *      Jonathan Looney. All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  *
  26  * $FreeBSD$
  27  */
  28
  29 #include <sys/queue.h>
  30 #include <sys/param.h>
  31 #include <sys/types.h>
  32 #include <sys/socket.h>
  33 #include <sys/socketvar.h>
  34 #include <sys/sysctl.h>
  35 #include <sys/systm.h>
  36 #include <sys/mbuf.h>
  37 #include <sys/eventhandler.h>
  38 #include <machine/atomic.h>
  39 #include <netinet/in.h>
  40 #include <netinet/in_pcb.h>
  41 #include <netinet/tcp_var.h>
  42 #include <netinet/tcp_pcap.h>
  43
  44 #define M_LEADINGSPACE_NOWRITE(m)                                       \
  45         ((m)->m_data - M_START(m))
  46
  47 int tcp_pcap_aggressive_free = 1;
  48 static int tcp_pcap_clusters_referenced_cur = 0;
  49 static int tcp_pcap_clusters_referenced_max = 0;
  50
  51 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
  52         CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
  53         "Free saved packets when the memory system comes under pressure");
  54 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
  55         CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
  56         "Number of clusters currently referenced on TCP PCAP queues");
  57 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
  58         CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
  59         "Maximum number of clusters allowed to be referenced on TCP PCAP "
  60         "queues");
  61
  62 static int tcp_pcap_alloc_reuse_ext = 0;
  63 static int tcp_pcap_alloc_reuse_mbuf = 0;
  64 static int tcp_pcap_alloc_new_mbuf = 0;
  65 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
  66         CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
  67         "Number of mbufs with external storage reused for the TCP PCAP "
  68         "functionality");
  69 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
  70         CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
  71         "Number of mbufs with internal storage reused for the TCP PCAP "
  72         "functionality");
  73 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
  74         CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
  75         "Number of new mbufs allocated for the TCP PCAP functionality");
  76
  77 VNET_DEFINE(int, tcp_pcap_packets) = 0;
  78 #define V_tcp_pcap_packets      VNET(tcp_pcap_packets)
  79 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
  80         CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
  81         "Default number of packets saved per direction per TCPCB");
  82
  83 /* Initialize the values. */
  84 static void
  85 tcp_pcap_max_set(void)
  86 {
  87
  88         tcp_pcap_clusters_referenced_max = nmbclusters / 4;
  89 }
  90
  91 void
  92 tcp_pcap_init(void)
  93 {
  94
  95         tcp_pcap_max_set();
  96         EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
  97                 NULL, EVENTHANDLER_PRI_ANY);
  98 }
  99
 100 /*
 101  * If we are below the maximum allowed cluster references,
 102  * increment the reference count and return TRUE. Otherwise,
 103  * leave the reference count alone and return FALSE.
 104  */
 105 static __inline bool
 106 tcp_pcap_take_cluster_reference(void)
 107 {
 108         if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
 109                 tcp_pcap_clusters_referenced_max) {
 110                 atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
 111                 return FALSE;
 112         }
 113         return TRUE;
 114 }
 115
 116 /*
 117  * For all the external entries in m, apply the given adjustment.
 118  * This can be used to adjust the counter when an mbuf chain is
 119  * copied or freed.
 120  */
 121 static __inline void
 122 tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
 123 {
 124         while (m) {
 125                 if (m->m_flags & M_EXT)
 126                         atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);
 127
 128                 m = m->m_next;
 129         }
 130 }
 131
 132 /*
 133  * Free all mbufs in a chain, decrementing the reference count as
 134  * necessary.
 135  *
 136  * Functions in this file should use this instead of m_freem() when
 137  * they are freeing mbuf chains that may contain clusters that were
 138  * already included in tcp_pcap_clusters_referenced_cur.
 139  */
 140 static void
 141 tcp_pcap_m_freem(struct mbuf *mb)
 142 {
 143         while (mb != NULL) {
 144                 if (mb->m_flags & M_EXT)
 145                         atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
 146                             1);
 147                 mb = m_free(mb);
 148         }
 149 }
 150
 151 /*
 152  * Copy data from m to n, where n cannot fit all the data we might
 153  * want from m.
 154  *
 155  * Prioritize data like this:
 156  * 1. TCP header
 157  * 2. IP header
 158  * 3. Data
 159  */
 160 static void
 161 tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
 162 {
 163         struct mbuf *m_cur = m;
 164         int bytes_to_copy=0, trailing_data, skip=0, tcp_off;
 165
 166         /* Below, we assume these will be non-NULL. */
 167         KASSERT(th, ("%s: called with th == NULL", __func__));
 168         KASSERT(m, ("%s: called with m == NULL", __func__));
 169         KASSERT(n, ("%s: called with n == NULL", __func__));
 170
 171         /* We assume this initialization occurred elsewhere. */
 172         KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
 173                 __func__, n->m_len));
 174         KASSERT(n->m_data == M_START(n),
 175                 ("%s: called with n->m_data != M_START(n)", __func__));
 176
 177         /*
 178          * Calculate the size of the TCP header. We use this often
 179          * enough that it is worth just calculating at the start.
 180          */
 181         tcp_off = th->th_off << 2;
 182
 183         /* Trim off leading empty mbufs. */
 184         while (m && m->m_len == 0)
 185                 m = m->m_next;
 186
 187         if (m) {
 188                 m_cur = m;
 189         }
 190         else {
 191                 /*
 192                  * No data? Highly unusual. We would expect to at
 193                  * least see a TCP header in the mbuf.
 194                  * As we have a pointer to the TCP header, I guess
 195                  * we should just copy that. (???)
 196                  */
 197 fallback:
 198                 bytes_to_copy = tcp_off;
 199                 if (bytes_to_copy > M_SIZE(n))
 200                         bytes_to_copy = M_SIZE(n);
 201                 bcopy(th, n->m_data, bytes_to_copy);
 202                 n->m_len = bytes_to_copy;
 203                 return;
 204         }
 205
 206         /*
 207          * Find TCP header. Record the total number of bytes up to,
 208          * and including, the TCP header.
 209          */
 210         while (m_cur) {
 211                 if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
 212                         (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
 213                         break;
 214                 bytes_to_copy += m_cur->m_len;
 215                 m_cur = m_cur->m_next;
 216         }
 217         if (m_cur)
 218                 bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
 219         else
 220                 goto fallback;
 221         bytes_to_copy += tcp_off;
 222
 223         /*
 224          * If we already want to copy more bytes than we can hold
 225          * in the destination mbuf, skip leading bytes and copy
 226          * what we can.
 227          *
 228          * Otherwise, consider trailing data.
 229          */
 230         if (bytes_to_copy > M_SIZE(n)) {
 231                 skip  = bytes_to_copy - M_SIZE(n);
 232                 bytes_to_copy = M_SIZE(n);
 233         }
 234         else {
 235                 /*
 236                  * Determine how much trailing data is in the chain.
 237                  * We start with the length of this mbuf (the one
 238                  * containing th) and subtract the size of the TCP
 239                  * header (tcp_off) and the size of the data prior
 240                  * to th (th - m_cur->m_data).
 241                  *
 242                  * This *should not* be negative, as the TCP code
 243                  * should put the whole TCP header in a single
 244                  * mbuf. But, it isn't a problem if it is. We will
 245                  * simple work off our negative balance as we look
 246                  * at subsequent mbufs.
 247                  */
 248                 trailing_data = m_cur->m_len - tcp_off;
 249                 trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
 250                 m_cur = m_cur->m_next;
 251                 while (m_cur) {
 252                         trailing_data += m_cur->m_len;
 253                         m_cur = m_cur->m_next;
 254                 }
 255                 if ((bytes_to_copy + trailing_data) > M_SIZE(n))
 256                         bytes_to_copy = M_SIZE(n);
 257                 else
 258                         bytes_to_copy += trailing_data;
 259         }
 260
 261         m_copydata(m, skip, bytes_to_copy, n->m_data);
 262         n->m_len = bytes_to_copy;
 263 }
 264
 265 void
 266 tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
 267 {
 268         struct mbuf *n = NULL, *mhead;
 269
 270         KASSERT(th, ("%s: called with th == NULL", __func__));
 271         KASSERT(m, ("%s: called with m == NULL", __func__));
 272         KASSERT(queue, ("%s: called with queue == NULL", __func__));
 273
 274         /* We only care about data packets. */
 275         while (m && m->m_type != MT_DATA)
 276                 m = m->m_next;
 277
 278         /* We only need to do something if we still have an mbuf. */
 279         if (!m)
 280                 return;
 281
 282         /* If we are not saving mbufs, return now. */
 283         if (queue->mq_maxlen == 0)
 284                 return;
 285
 286         /*
 287          * Check to see if we will need to recycle mbufs.
 288          *
 289          * If we need to get rid of mbufs to stay below
 290          * our packet count, try to reuse the mbuf. Once
 291          * we already have a new mbuf (n), then we can
 292          * simply free subsequent mbufs.
 293          *
 294          * Note that most of the logic in here is to deal
 295          * with the reuse. If we are fine with constant
 296          * mbuf allocs/deallocs, we could ditch this logic.
 297          * But, it only seems to make sense to reuse
 298          * mbufs we already have.
 299          */
 300         while (mbufq_full(queue)) {
 301                 mhead = mbufq_dequeue(queue);
 302
 303                 if (n) {
 304                         tcp_pcap_m_freem(mhead);
 305                 }
 306                 else {
 307                         /*
 308                          * If this held an external cluster, try to
 309                          * detach the cluster. But, if we held the
 310                          * last reference, go through the normal
 311                          * free-ing process.
 312                          */
 313                         if (mhead->m_flags & M_EXTPG) {
 314                                 /* Don't mess around with these. */
 315                                 tcp_pcap_m_freem(mhead);
 316                                 continue;
 317                         } else if (mhead->m_flags & M_EXT) {
 318                                 switch (mhead->m_ext.ext_type) {
 319                                 case EXT_SFBUF:
 320                                         /* Don't mess around with these. */
 321                                         tcp_pcap_m_freem(mhead);
 322                                         continue;
 323                                 default:
 324                                         if (atomic_fetchadd_int(
 325                                                 mhead->m_ext.ext_cnt, -1) == 1)
 326                                         {
 327                                                 /*
 328                                                  * We held the last reference
 329                                                  * on this cluster. Restore
 330                                                  * the reference count and put
 331                                                  * it back in the pool.
 332                                                  */
 333                                                 *(mhead->m_ext.ext_cnt) = 1;
 334                                                 tcp_pcap_m_freem(mhead);
 335                                                 continue;
 336                                         }
 337                                         /*
 338                                          * We were able to cleanly free the
 339                                          * reference.
 340                                          */
 341                                         atomic_subtract_int(
 342                                             &tcp_pcap_clusters_referenced_cur,
 343                                             1);
 344                                         tcp_pcap_alloc_reuse_ext++;
 345                                         break;
 346                                 }
 347                         } else {
 348                                 tcp_pcap_alloc_reuse_mbuf++;
 349                         }
 350
 351                         n = mhead;
 352                         tcp_pcap_m_freem(n->m_next);
 353                         m_init(n, M_NOWAIT, MT_DATA, 0);
 354                 }
 355         }
 356
 357         /* Check to see if we need to get a new mbuf. */
 358         if (!n) {
 359                 if (!(n = m_get(M_NOWAIT, MT_DATA)))
 360                         return;
 361                 tcp_pcap_alloc_new_mbuf++;
 362         }
 363
 364         /*
 365          * What are we dealing with? If a cluster, attach it. Otherwise,
 366          * try to copy the data from the beginning of the mbuf to the
 367          * end of data. (There may be data between the start of the data
 368          * area and the current data pointer. We want to get this, because
 369          * it may contain header information that is useful.)
 370          * In cases where that isn't possible, settle for what we can
 371          * get.
 372          */
 373         if ((m->m_flags & (M_EXT|M_EXTPG)) &&
 374             tcp_pcap_take_cluster_reference()) {
 375                 n->m_data = m->m_data;
 376                 n->m_len = m->m_len;
 377                 mb_dupcl(n, m);
 378         }
 379         else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
 380                 /*
 381                  * At this point, n is guaranteed to be a normal mbuf
 382                  * with no cluster and no packet header. Because the
 383                  * logic in this code block requires this, the assert
 384                  * is here to catch any instances where someone
 385                  * changes the logic to invalidate that assumption.
 386                  */
 387                 KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
 388                         ("%s: Unexpected flags (%#x) for mbuf",
 389                         __func__, n->m_flags));
 390                 n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
 391                 n->m_len = m->m_len;
 392                 if (m->m_flags & M_EXTPG)
 393                         m_copydata(m, 0, m->m_len, n->m_data);
 394                 else
 395                         bcopy(M_START(m), n->m_dat,
 396                             m->m_len + M_LEADINGSPACE_NOWRITE(m));
 397         }
 398         else {
 399                 /*
 400                  * This is the case where we need to "settle for what
 401                  * we can get". The most probable way to this code
 402                  * path is that we've already taken references to the
 403                  * maximum number of mbuf clusters we can, and the data
 404                  * is too long to fit in an mbuf's internal storage.
 405                  * Try for a "best fit".
 406                  */
 407                 tcp_pcap_copy_bestfit(th, m, n);
 408
 409                 /* Don't try to get additional data. */
 410                 goto add_to_queue;
 411         }
 412
 413         if (m->m_next) {
 414                 n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
 415                 tcp_pcap_adj_cluster_reference(n->m_next, 1);
 416         }
 417
 418 add_to_queue:
 419         /* Add the new mbuf to the list. */
 420         if (mbufq_enqueue(queue, n)) {
 421                 /* This shouldn't happen. If INVARIANTS is defined, panic. */
 422                 KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
 423                 tcp_pcap_m_freem(n);
 424         }
 425 }
 426
 427 void
 428 tcp_pcap_drain(struct mbufq *queue)
 429 {
 430         struct mbuf *m;
 431         while ((m = mbufq_dequeue(queue)))
 432                 tcp_pcap_m_freem(m);
 433 }
 434
 435 void
 436 tcp_pcap_tcpcb_init(struct tcpcb *tp)
 437 {
 438         mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
 439         mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
 440 }
 441
 442 void
 443 tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
 444 {
 445         queue->mq_maxlen = newval;
 446         while (queue->mq_len > queue->mq_maxlen)
 447                 tcp_pcap_m_freem(mbufq_dequeue(queue));
 448 }
 449
 450 int
 451 tcp_pcap_get_sock_max(struct mbufq *queue)
 452 {
 453         return queue->mq_maxlen;
 454 }