sys/netinet/ip_reass.c

   1 /*-
   2  * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
   3  * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
   4  * Copyright (c) 1982, 1986, 1988, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  *      @(#)ip_input.c  8.2 (Berkeley) 1/4/94
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include "opt_rss.h"
  38
  39 #include <sys/param.h>
  40 #include <sys/systm.h>
  41 #include <sys/eventhandler.h>
  42 #include <sys/hash.h>
  43 #include <sys/mbuf.h>
  44 #include <sys/malloc.h>
  45 #include <sys/limits.h>
  46 #include <sys/lock.h>
  47 #include <sys/mutex.h>
  48 #include <sys/sysctl.h>
  49
  50 #include <net/rss_config.h>
  51 #include <net/netisr.h>
  52 #include <net/vnet.h>
  53
  54 #include <netinet/in.h>
  55 #include <netinet/ip.h>
  56 #include <netinet/ip_var.h>
  57 #include <netinet/in_rss.h>
  58 #ifdef MAC
  59 #include <security/mac/mac_framework.h>
  60 #endif
  61
  62 SYSCTL_DECL(_net_inet_ip);
  63
   64 /*
   65  * Reassembly headers (struct ipq) are stored in hash buckets, each with its own mutex.
   66  */
   67 #define IPREASS_NHASH_LOG2      10
   68 #define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)       /* number of buckets */
   69 #define IPREASS_HMASK           (IPREASS_NHASH - 1)             /* reduces a hash to a bucket index */
   70
   71 struct ipqbucket {
   72         TAILQ_HEAD(ipqhead, ipq) head;   /* reassembly queues hashed to this bucket */
   73         struct mtx               lock;   /* taken through the IPQ_*LOCK() macros below */
   74         int                      count;  /* number of queues currently linked on head */
   75 };
   76
   77 VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]);
   78 #define V_ipq           VNET(ipq)
   79 VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);     /* seed passed to jenkins_hash32() in ip_reass() */
   80 #define V_ipq_hashseed   VNET(ipq_hashseed)
   81
   82 #define IPQ_LOCK(i)     mtx_lock(&V_ipq[i].lock)        /* i is a bucket index into V_ipq[] */
   83 #define IPQ_TRYLOCK(i)  mtx_trylock(&V_ipq[i].lock)
   84 #define IPQ_UNLOCK(i)   mtx_unlock(&V_ipq[i].lock)
   85 #define IPQ_LOCK_ASSERT(i)      mtx_assert(&V_ipq[i].lock, MA_OWNED)
   86
   87 VNET_DEFINE_STATIC(int, ipreass_maxbucketsize); /* per-bucket cap on the number of queues */
   88 #define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
   89
   90 void            ipreass_init(void);
   91 void            ipreass_drain(void);
   92 void            ipreass_slowtimo(void);
   93 #ifdef VIMAGE
   94 void            ipreass_destroy(void);
   95 #endif
   96 static int      sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
   97 static int      sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
   98 static void     ipreass_zone_change(void *);
   99 static void     ipreass_drain_tomax(void);
  100 static void     ipq_free(struct ipqbucket *, struct ipq *);
  101 static struct ipq * ipq_reuse(int);
 102
 103 static inline void
 104 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
 105 {
 106
 107         IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 108         ipq_free(bucket, fp);
 109 }
 110
 111 static inline void
 112 ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
 113 {
 114
 115         IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 116         ipq_free(bucket, fp);
 117 }
 118
  119 /*
  120  * By default, limit the number of IP fragments across all reassembly
  121  * queues to 1/32 of the total number of mbuf clusters.
  122  *
  123  * Limit the total number of reassembly queues per VNET to the
  124  * IP fragment limit, but ensure the limit will not allow any bucket
  125  * to grow above 100 items. (The bucket limit is
  126  * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so 50 is the correct
  127  * multiplier to reach a 100-item limit.)
  128  * The 100-item limit was chosen as brief testing seems to show that
  129  * this produces "reasonable" performance on some subset of systems
  130  * under DoS attack.
  131  */
  132 #define IP_MAXFRAGS             (nmbclusters / 32)
  133 #define IP_MAXFRAGPACKETS       (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
  134
  135 static int              maxfrags;       /* cap on nfrags; a negative value disables the check in ip_reass() */
  136 static volatile u_int   nfrags;         /* fragments currently queued; not per-VNET, updated with atomic ops */
  137 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
  138     &maxfrags, 0,
  139     "Maximum number of IPv4 fragments allowed across all reassembly queues");
  140 SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
  141     __DEVOLATILE(u_int *, &nfrags), 0,
  142     "Current number of IPv4 fragments across all reassembly queues");
  143
  144 VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);       /* zone the struct ipq headers are allocated from */
  145 #define V_ipq_zone      VNET(ipq_zone)
  146 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
  147     CTLTYPE_INT | CTLFLAG_RW, NULL, 0, sysctl_maxfragpackets, "I",
  148     "Maximum number of IPv4 fragment reassembly queue entries");
  149 SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
  150     &VNET_NAME(ipq_zone),
  151     "Current number of IPv4 fragment reassembly queue entries");
  152
  153 VNET_DEFINE_STATIC(int, noreass);       /* 1 = reassembly disabled (maxfragpackets written as 0) */
  154 #define V_noreass       VNET(noreass)
  155
  156 VNET_DEFINE_STATIC(int, maxfragsperpacket);     /* 0 makes ip_reass() reject every fragment */
  157 #define V_maxfragsperpacket     VNET(maxfragsperpacket)
  158 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
  159     &VNET_NAME(maxfragsperpacket), 0,
  160     "Maximum number of IPv4 fragments allowed per packet");
  161 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
  162     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
  163     sysctl_maxfragbucketsize, "I",
  164     "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 165
  166 /*
  167  * Take an incoming datagram fragment and try to reassemble it into a
  168  * whole datagram.  If the fragment does not complete the datagram, the
  169  * mbuf is stored in the fragment chain and NULL is returned.  If it does,
  170  * the fragments are concatenated and the resulting mbuf is returned for
  171  * further processing (with RSS it is handed to netisr and NULL is returned).
  172  * A fragment that is rejected is freed and NULL is returned.  Only m_tags
  173  * attached to the first packet/fragment are preserved.
  174  * The IP header is *NOT* adjusted out of iplen.
  175  */
  176 #define M_IP_FRAG       M_PROTO9        /* set on a queued fragment that arrived with IP_MF */
  177 struct mbuf *
  178 ip_reass(struct mbuf *m)
  179 {
  180         struct ip *ip;
  181         struct mbuf *p, *q, *nq, *t;
  182         struct ipq *fp;
  183         struct ipqhead *head;
  184         int i, hlen, next, tmpmax;
  185         u_int8_t ecn, ecn0;
  186         uint32_t hash, hashkey[3];
  187 #ifdef  RSS
  188         uint32_t rss_hash, rss_type;
  189 #endif
  190
  191         /*
  192          * If reassembly is disabled or maxfragsperpacket is 0,
  193          * never accept fragments.
  194          * Also, drop the fragment if the global fragment count has
  195          * already reached maxfrags (a negative maxfrags disables this).
  196          */
  197         tmpmax = maxfrags;      /* read once so both tests below see the same value */
  198         if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
  199             (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
  200                 IPSTAT_INC(ips_fragments);
  201                 IPSTAT_INC(ips_fragdropped);
  202                 m_freem(m);
  203                 return (NULL);
  204         }
  205
  206         ip = mtod(m, struct ip *);
  207         hlen = ip->ip_hl << 2;  /* header length in bytes */
  208
  209         /*
  210          * Adjust ip_len to not reflect header,
  211          * convert offset of this to bytes.
  212          */
  213         ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
  214         if (ip->ip_off & htons(IP_MF)) {
  215                 /*
  216                  * Make sure that fragments have a data length
  217                  * that's a non-zero multiple of 8 bytes.
  218                  */
  219                 if (ip->ip_len == htons(0) || (ntohs(ip->ip_len) & 0x7) != 0) {
  220                         IPSTAT_INC(ips_toosmall); /* XXX */
  221                         IPSTAT_INC(ips_fragdropped);
  222                         m_freem(m);
  223                         return (NULL);
  224                 }
  225                 m->m_flags |= M_IP_FRAG;
  226         } else
  227                 m->m_flags &= ~M_IP_FRAG;
  228         ip->ip_off = htons(ntohs(ip->ip_off) << 3);     /* offset field is in 8-byte units */
  229
  230         /*
  231          * Make sure the fragment lies within a packet of valid size.
  232          */
  233         if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
  234                 IPSTAT_INC(ips_toolong);
  235                 IPSTAT_INC(ips_fragdropped);
  236                 m_freem(m);
  237                 return (NULL);
  238         }
  239
  240         /*
  241          * Attempt reassembly; if it succeeds, proceed.
  242          * ip_reass() will return a different mbuf.
  243          */
  244         IPSTAT_INC(ips_fragments);
  245         m->m_pkthdr.PH_loc.ptr = ip;    /* remember the header; GETIP() below reads it back */
  246
  247         /*
  248          * Presence of header sizes in mbufs
  249          * would confuse code below.
  250          */
  251         m->m_data += hlen;
  252         m->m_len -= hlen;
  253
  254         hashkey[0] = ip->ip_src.s_addr;
  255         hashkey[1] = ip->ip_dst.s_addr;
  256         hashkey[2] = (uint32_t)ip->ip_p << 16;
  257         hashkey[2] += ip->ip_id;
  258         hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
  259         hash &= IPREASS_HMASK;
  260         head = &V_ipq[hash].head;
  261         IPQ_LOCK(hash);         /* released on the success path and at done: */
  262
  263         /*
  264          * Look for queue of fragments
  265          * of this datagram.
  266          */
  267         TAILQ_FOREACH(fp, head, ipq_list)
  268                 if (ip->ip_id == fp->ipq_id &&
  269                     ip->ip_src.s_addr == fp->ipq_src.s_addr &&
  270                     ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
  271 #ifdef MAC
  272                     mac_ipq_match(m, fp) &&
  273 #endif
  274                     ip->ip_p == fp->ipq_p)
  275                         break;
  276         /*
  277          * If first fragment to arrive, create a reassembly queue.
  278          */
  279         if (fp == NULL) {
  280                 if (V_ipq[hash].count < V_ipreass_maxbucketsize)       /* allocate only while the bucket is under its cap */
  281                         fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
  282                 if (fp == NULL)         /* bucket full or allocation failed: recycle an old queue */
  283                         fp = ipq_reuse(hash);
  284                 if (fp == NULL)
  285                         goto dropfrag;
  286 #ifdef MAC
  287                 if (mac_ipq_init(fp, M_NOWAIT) != 0) {
  288                         uma_zfree(V_ipq_zone, fp);
  289                         fp = NULL;      /* keep dropfrag: from touching the freed header */
  290                         goto dropfrag;
  291                 }
  292                 mac_ipq_create(m, fp);
  293 #endif
  294                 TAILQ_INSERT_HEAD(head, fp, ipq_list);
  295                 V_ipq[hash].count++;
  296                 fp->ipq_nfrags = 1;
  297                 atomic_add_int(&nfrags, 1);
  298                 fp->ipq_ttl = IPFRAGTTL;
  299                 fp->ipq_p = ip->ip_p;
  300                 fp->ipq_id = ip->ip_id;
  301                 fp->ipq_src = ip->ip_src;
  302                 fp->ipq_dst = ip->ip_dst;
  303                 fp->ipq_frags = m;
  304                 m->m_nextpkt = NULL;
  305                 goto done;
  306         } else {
  307                 fp->ipq_nfrags++;       /* undone at dropfrag: if this fragment is rejected */
  308                 atomic_add_int(&nfrags, 1);
  309 #ifdef MAC
  310                 mac_ipq_update(m, fp);
  311 #endif
  312         }
  313
  314 #define GETIP(m)        ((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
  315
  316         /*
  317          * Handle ECN by comparing this segment with the first one;
  318          * if CE is set, do not lose CE.
  319          * Drop if CE and not-ECT are mixed for the same packet.
  320          */
  321         ecn = ip->ip_tos & IPTOS_ECN_MASK;
  322         ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
  323         if (ecn == IPTOS_ECN_CE) {
  324                 if (ecn0 == IPTOS_ECN_NOTECT)
  325                         goto dropfrag;
  326                 if (ecn0 != IPTOS_ECN_CE)
  327                         GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
  328         }
  329         if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
  330                 goto dropfrag;
  331
  332         /*
  333          * Find a segment which begins after this one does.
  334          */
  335         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)  /* p trails q: p precedes us, q follows */
  336                 if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
  337                         break;
  338
  339         /*
  340          * If there is a preceding segment, it may provide some of
  341          * our data already.  If so, drop the data from the incoming
  342          * segment.  If it provides all of our data, drop us, otherwise
  343          * stick new segment in the proper place.
  344          *
  345          * If some of the data is dropped from the incoming
  346          * segment, then its checksum is invalidated.
  347          */
  348         if (p) {
  349                 i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
  350                     ntohs(ip->ip_off);  /* bytes of ours that p already covers */
  351                 if (i > 0) {
  352                         if (i >= ntohs(ip->ip_len))
  353                                 goto dropfrag;
  354                         m_adj(m, i);
  355                         m->m_pkthdr.csum_flags = 0;
  356                         ip->ip_off = htons(ntohs(ip->ip_off) + i);
  357                         ip->ip_len = htons(ntohs(ip->ip_len) - i);
  358                 }
  359                 m->m_nextpkt = p->m_nextpkt;
  360                 p->m_nextpkt = m;
  361         } else {
  362                 m->m_nextpkt = fp->ipq_frags;
  363                 fp->ipq_frags = m;
  364         }
  365
  366         /*
  367          * While we overlap succeeding segments trim them or,
  368          * if they are completely covered, dequeue them.
  369          */
  370         for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
  371             ntohs(GETIP(q)->ip_off); q = nq) {
  372                 i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
  373                     ntohs(GETIP(q)->ip_off);    /* bytes of q that we overlap */
  374                 if (i < ntohs(GETIP(q)->ip_len)) {
  375                         GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
  376                         GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
  377                         m_adj(q, i);
  378                         q->m_pkthdr.csum_flags = 0;
  379                         break;
  380                 }
  381                 nq = q->m_nextpkt;
  382                 m->m_nextpkt = nq;      /* unlink the fully covered segment */
  383                 IPSTAT_INC(ips_fragdropped);
  384                 fp->ipq_nfrags--;
  385                 atomic_subtract_int(&nfrags, 1);
  386                 m_freem(q);
  387         }
  388
  389         /*
  390          * Check for complete reassembly and perform frag per packet
  391          * limiting.
  392          *
  393          * Frag limiting is performed here so that the nth frag has
  394          * a chance to complete the packet before we drop the packet.
  395          * As a result, n+1 frags are actually allowed per packet, but
  396          * only n will ever be stored. (n = maxfragsperpacket.)
  397          *
  398          */
  399         next = 0;       /* byte offset the next in-order segment must start at */
  400         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
  401                 if (ntohs(GETIP(q)->ip_off) != next) {  /* hole: datagram not complete yet */
  402                         if (fp->ipq_nfrags > V_maxfragsperpacket)
  403                                 ipq_drop(&V_ipq[hash], fp);
  404                         goto done;
  405                 }
  406                 next += ntohs(GETIP(q)->ip_len);
  407         }
  408         /* Make sure the last packet didn't have the IP_MF flag */
  409         if (p->m_flags & M_IP_FRAG) {   /* p is the last segment on the chain here */
  410                 if (fp->ipq_nfrags > V_maxfragsperpacket)
  411                         ipq_drop(&V_ipq[hash], fp);
  412                 goto done;
  413         }
  414
  415         /*
  416          * Reassembly is complete.  Make sure the packet is a sane size.
  417          */
  418         q = fp->ipq_frags;
  419         ip = GETIP(q);          /* from here on ip is the first fragment's header */
  420         if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
  421                 IPSTAT_INC(ips_toolong);
  422                 ipq_drop(&V_ipq[hash], fp);
  423                 goto done;
  424         }
  425
  426         /*
  427          * Concatenate fragments.
  428          */
  429         m = q;
  430         t = m->m_next;
  431         m->m_next = NULL;
  432         m_cat(m, t);            /* re-attach the rest of the first fragment's own chain */
  433         nq = q->m_nextpkt;
  434         q->m_nextpkt = NULL;
  435         for (q = nq; q != NULL; q = nq) {
  436                 nq = q->m_nextpkt;
  437                 q->m_nextpkt = NULL;
  438                 m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
  439                 m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
  440                 m_demote_pkthdr(q);
  441                 m_cat(m, q);
  442         }
  443         /*
  444          * In order to do checksumming faster we do 'end-around carry' here
  445          * (and not in for{} loop), though it implies we are not going to
  446          * reassemble more than 64k fragments.
  447          */
  448         while (m->m_pkthdr.csum_data & 0xffff0000)
  449                 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
  450                     (m->m_pkthdr.csum_data >> 16);
  451         atomic_subtract_int(&nfrags, fp->ipq_nfrags);   /* these fragments are no longer queued */
  452 #ifdef MAC
  453         mac_ipq_reassemble(fp, m);
  454         mac_ipq_destroy(fp);
  455 #endif
  456
  457         /*
  458          * Create header for new ip packet by modifying header of first
  459          * packet;  dequeue and discard fragment reassembly header.
  460          * Make header visible.
  461          */
  462         ip->ip_len = htons((ip->ip_hl << 2) + next);
  463         ip->ip_src = fp->ipq_src;
  464         ip->ip_dst = fp->ipq_dst;
  465         TAILQ_REMOVE(head, fp, ipq_list);
  466         V_ipq[hash].count--;
  467         uma_zfree(V_ipq_zone, fp);
  468         m->m_len += (ip->ip_hl << 2);   /* undo the header hiding done when the fragment was queued */
  469         m->m_data -= (ip->ip_hl << 2);
  470         /* some debugging cruft by sklower, below, will go away soon */
  471         if (m->m_flags & M_PKTHDR)      /* XXX this should be done elsewhere */
  472                 m_fixhdr(m);
  473         IPSTAT_INC(ips_reassembled);
  474         IPQ_UNLOCK(hash);
  475
  476 #ifdef  RSS
  477         /*
  478          * Query the RSS layer for the flowid / flowtype for the
  479          * mbuf payload.
  480          *
  481          * For now, just assume we have to calculate a new one.
  482          * Later on we should check to see if the assigned flowid matches
  483          * what RSS wants for the given IP protocol and if so, just keep it.
  484          *
  485          * We then queue into the relevant netisr so it can be dispatched
  486          * to the correct CPU.
  487          *
  488          * Note - this may return 1, which means the flowid in the mbuf
  489          * is correct for the configured RSS hash types and can be used.
  490          */
  491         if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
  492                 m->m_pkthdr.flowid = rss_hash;
  493                 M_HASHTYPE_SET(m, rss_type);
  494         }
  495
  496         /*
  497          * Queue/dispatch for reprocessing.
  498          *
  499          * Note: this is much slower than just handling the frame in the
  500          * current receive context.  It's likely worth investigating
  501          * why this is.
  502          */
  503         netisr_dispatch(NETISR_IP_DIRECT, m);
  504         return (NULL);
  505 #endif
  506
  507         /* Handle in-line */
  508         return (m);
  509
  510 dropfrag:
  511         IPSTAT_INC(ips_fragdropped);
  512         if (fp != NULL) {       /* the fragment was already counted against an existing queue */
  513                 fp->ipq_nfrags--;
  514                 atomic_subtract_int(&nfrags, 1);
  515         }
  516         m_freem(m);
  517 done:
  518         IPQ_UNLOCK(hash);
  519         return (NULL);
  520
  521 #undef GETIP
  522 }
 523
 524 /*
 525  * Initialize IP reassembly structures.
 526  */
 527 void
 528 ipreass_init(void)
 529 {
 530         int max;
 531
 532         for (int i = 0; i < IPREASS_NHASH; i++) {
 533                 TAILQ_INIT(&V_ipq[i].head);
 534                 mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
 535                     MTX_DEF | MTX_DUPOK);
 536                 V_ipq[i].count = 0;
 537         }
 538         V_ipq_hashseed = arc4random();
 539         V_maxfragsperpacket = 16;
 540         V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 541             NULL, UMA_ALIGN_PTR, 0);
 542         max = IP_MAXFRAGPACKETS;
 543         max = uma_zone_set_max(V_ipq_zone, max);
 544         V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 545
 546         if (IS_DEFAULT_VNET(curvnet)) {
 547                 maxfrags = IP_MAXFRAGS;
 548                 EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
 549                     NULL, EVENTHANDLER_PRI_ANY);
 550         }
 551 }
 552
 553 /*
 554  * If a timer expires on a reassembly queue, discard it.
 555  */
 556 void
 557 ipreass_slowtimo(void)
 558 {
 559         struct ipq *fp, *tmp;
 560
 561         for (int i = 0; i < IPREASS_NHASH; i++) {
 562                 IPQ_LOCK(i);
 563                 TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
 564                 if (--fp->ipq_ttl == 0)
 565                                 ipq_timeout(&V_ipq[i], fp);
 566                 IPQ_UNLOCK(i);
 567         }
 568 }
 569
 570 /*
 571  * Drain off all datagram fragments.
 572  */
 573 void
 574 ipreass_drain(void)
 575 {
 576
 577         for (int i = 0; i < IPREASS_NHASH; i++) {
 578                 IPQ_LOCK(i);
 579                 while(!TAILQ_EMPTY(&V_ipq[i].head))
 580                         ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
 581                 KASSERT(V_ipq[i].count == 0,
 582                     ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
 583                     V_ipq[i].count, V_ipq));
 584                 IPQ_UNLOCK(i);
 585         }
 586 }
 587
 588 #ifdef VIMAGE
 589 /*
 590  * Destroy IP reassembly structures.
 591  */
 592 void
 593 ipreass_destroy(void)
 594 {
 595
 596         ipreass_drain();
 597         uma_zdestroy(V_ipq_zone);
 598         for (int i = 0; i < IPREASS_NHASH; i++)
 599                 mtx_destroy(&V_ipq[i].lock);
 600 }
 601 #endif
 602
 603 /*
 604  * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 605  * max has slightly different semantics than the sysctl, for historical
 606  * reasons.
 607  */
 608 static void
 609 ipreass_drain_tomax(void)
 610 {
 611         struct ipq *fp;
 612         int target;
 613
 614         /*
 615          * Make sure each bucket is under the new limit. If
 616          * necessary, drop enough of the oldest elements from
 617          * each bucket to get under the new limit.
 618          */
 619         for (int i = 0; i < IPREASS_NHASH; i++) {
 620                 IPQ_LOCK(i);
 621                 while (V_ipq[i].count > V_ipreass_maxbucketsize &&
 622                     (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
 623                         ipq_timeout(&V_ipq[i], fp);
 624                 IPQ_UNLOCK(i);
 625         }
 626
 627         /*
 628          * If we are over the maximum number of fragments,
 629          * drain off enough to get down to the new limit,
 630          * stripping off last elements on queues.  Every
 631          * run we strip the oldest element from each bucket.
 632          */
 633         target = uma_zone_get_max(V_ipq_zone);
 634         while (uma_zone_get_cur(V_ipq_zone) > target) {
 635                 for (int i = 0; i < IPREASS_NHASH; i++) {
 636                         IPQ_LOCK(i);
 637                         fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
 638                         if (fp != NULL)
 639                                 ipq_timeout(&V_ipq[i], fp);
 640                         IPQ_UNLOCK(i);
 641                 }
 642         }
 643 }
 644
 645 static void
 646 ipreass_zone_change(void *tag)
 647 {
 648         VNET_ITERATOR_DECL(vnet_iter);
 649         int max;
 650
 651         maxfrags = IP_MAXFRAGS;
 652         max = IP_MAXFRAGPACKETS;
 653         VNET_LIST_RLOCK_NOSLEEP();
 654         VNET_FOREACH(vnet_iter) {
 655                 CURVNET_SET(vnet_iter);
 656                 max = uma_zone_set_max(V_ipq_zone, max);
 657                 V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 658                 ipreass_drain_tomax();
 659                 CURVNET_RESTORE();
 660         }
 661         VNET_LIST_RUNLOCK_NOSLEEP();
 662 }
 663
 664 /*
 665  * Change the limit on the UMA zone, or disable the fragment allocation
 666  * at all.  Since 0 and -1 is a special values here, we need our own handler,
 667  * instead of sysctl_handle_uma_zone_max().
 668  */
 669 static int
 670 sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
 671 {
 672         int error, max;
 673
 674         if (V_noreass == 0) {
 675                 max = uma_zone_get_max(V_ipq_zone);
 676                 if (max == 0)
 677                         max = -1;
 678         } else
 679                 max = 0;
 680         error = sysctl_handle_int(oidp, &max, 0, req);
 681         if (error || !req->newptr)
 682                 return (error);
 683         if (max > 0) {
 684                 /*
 685                  * XXXRW: Might be a good idea to sanity check the argument
 686                  * and place an extreme upper bound.
 687                  */
 688                 max = uma_zone_set_max(V_ipq_zone, max);
 689                 V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 690                 ipreass_drain_tomax();
 691                 V_noreass = 0;
 692         } else if (max == 0) {
 693                 V_noreass = 1;
 694                 ipreass_drain();
 695         } else if (max == -1) {
 696                 V_noreass = 0;
 697                 uma_zone_set_max(V_ipq_zone, 0);
 698                 V_ipreass_maxbucketsize = INT_MAX;
 699         } else
 700                 return (EINVAL);
 701         return (0);
 702 }
 703
 704 /*
 705  * Seek for old fragment queue header that can be reused.  Try to
 706  * reuse a header from currently locked hash bucket.
 707  */
 708 static struct ipq *
 709 ipq_reuse(int start)
 710 {
 711         struct ipq *fp;
 712         int bucket, i;
 713
 714         IPQ_LOCK_ASSERT(start);
 715
 716         for (i = 0; i < IPREASS_NHASH; i++) {
 717                 bucket = (start + i) % IPREASS_NHASH;
 718                 if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
 719                         continue;
 720                 fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
 721                 if (fp) {
 722                         struct mbuf *m;
 723
 724                         IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 725                         atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 726                         while (fp->ipq_frags) {
 727                                 m = fp->ipq_frags;
 728                                 fp->ipq_frags = m->m_nextpkt;
 729                                 m_freem(m);
 730                         }
 731                         TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
 732                         V_ipq[bucket].count--;
 733                         if (bucket != start)
 734                                 IPQ_UNLOCK(bucket);
 735                         break;
 736                 }
 737                 if (bucket != start)
 738                         IPQ_UNLOCK(bucket);
 739         }
 740         IPQ_LOCK_ASSERT(start);
 741         return (fp);
 742 }
 743
 744 /*
 745  * Free a fragment reassembly header and all associated datagrams.
 746  */
 747 static void
 748 ipq_free(struct ipqbucket *bucket, struct ipq *fp)
 749 {
 750         struct mbuf *q;
 751
 752         atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 753         while (fp->ipq_frags) {
 754                 q = fp->ipq_frags;
 755                 fp->ipq_frags = q->m_nextpkt;
 756                 m_freem(q);
 757         }
 758         TAILQ_REMOVE(&bucket->head, fp, ipq_list);
 759         bucket->count--;
 760         uma_zfree(V_ipq_zone, fp);
 761 }
 762
 763 /*
 764  * Get or set the maximum number of reassembly queues per bucket.
 765  */
 766 static int
 767 sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
 768 {
 769         int error, max;
 770
 771         max = V_ipreass_maxbucketsize;
 772         error = sysctl_handle_int(oidp, &max, 0, req);
 773         if (error || !req->newptr)
 774                 return (error);
 775         if (max <= 0)
 776                 return (EINVAL);
 777         V_ipreass_maxbucketsize = max;
 778         ipreass_drain_tomax();
 779         return (0);
 780 }