sys/netinet/ip_reass.c

   1 /*-
   2  * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
   3  * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
   4  * Copyright (c) 1982, 1986, 1988, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the University nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  *      @(#)ip_input.c  8.2 (Berkeley) 1/4/94
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include "opt_rss.h"
  38
  39 #include <sys/param.h>
  40 #include <sys/systm.h>
  41 #include <sys/eventhandler.h>
  42 #include <sys/kernel.h>
  43 #include <sys/hash.h>
  44 #include <sys/mbuf.h>
  45 #include <sys/malloc.h>
  46 #include <sys/limits.h>
  47 #include <sys/lock.h>
  48 #include <sys/mutex.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/socket.h>
  51
  52 #include <net/if.h>
  53 #include <net/if_var.h>
  54 #include <net/rss_config.h>
  55 #include <net/netisr.h>
  56 #include <net/vnet.h>
  57
  58 #include <netinet/in.h>
  59 #include <netinet/ip.h>
  60 #include <netinet/ip_var.h>
  61 #include <netinet/in_rss.h>
  62 #ifdef MAC
  63 #include <security/mac/mac_framework.h>
  64 #endif
  65
  66 SYSCTL_DECL(_net_inet_ip);
  67
  68 /*
  69  * Reassembly headers are stored in hash buckets.
  70  */
  71 #define IPREASS_NHASH_LOG2      10
  72 #define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
  73 #define IPREASS_HMASK           (IPREASS_NHASH - 1)
  74
  75 struct ipqbucket {
  76         TAILQ_HEAD(ipqhead, ipq) head;
  77         struct mtx               lock;
  78         int                      count;
  79 };
  80
  81 VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]);
  82 #define V_ipq           VNET(ipq)
  83 VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
  84 #define V_ipq_hashseed   VNET(ipq_hashseed)
  85
  86 #define IPQ_LOCK(i)     mtx_lock(&V_ipq[i].lock)
  87 #define IPQ_TRYLOCK(i)  mtx_trylock(&V_ipq[i].lock)
  88 #define IPQ_UNLOCK(i)   mtx_unlock(&V_ipq[i].lock)
  89 #define IPQ_LOCK_ASSERT(i)      mtx_assert(&V_ipq[i].lock, MA_OWNED)
  90
  91 VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
  92 #define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
  93
  94 void            ipreass_init(void);
  95 void            ipreass_drain(void);
  96 void            ipreass_slowtimo(void);
  97 #ifdef VIMAGE
  98 void            ipreass_destroy(void);
  99 #endif
 100 static int      sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
 101 static int      sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
 102 static void     ipreass_zone_change(void *);
 103 static void     ipreass_drain_tomax(void);
 104 static void     ipq_free(struct ipqbucket *, struct ipq *);
 105 static struct ipq * ipq_reuse(int);
 106
 107 static inline void
 108 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
 109 {
 110
 111         IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 112         ipq_free(bucket, fp);
 113 }
 114
 115 static inline void
 116 ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
 117 {
 118
 119         IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 120         ipq_free(bucket, fp);
 121 }
 122
 123 /*
 124  * By default, limit the number of IP fragments across all reassembly
 125  * queues to  1/32 of the total number of mbuf clusters.
 126  *
 127  * Limit the total number of reassembly queues per VNET to the
 128  * IP fragment limit, but ensure the limit will not allow any bucket
 129  * to grow above 100 items. (The bucket limit is
 130  * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
 131  * multiplier to reach a 100-item limit.)
 132  * The 100-item limit was chosen as brief testing seems to show that
 133  * this produces "reasonable" performance on some subset of systems
 134  * under DoS attack.
 135  */
 136 #define IP_MAXFRAGS             (nmbclusters / 32)
 137 #define IP_MAXFRAGPACKETS       (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
 138
 139 static int              maxfrags;
 140 static volatile u_int   nfrags;
 141 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
 142     &maxfrags, 0,
 143     "Maximum number of IPv4 fragments allowed across all reassembly queues");
 144 SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
 145     __DEVOLATILE(u_int *, &nfrags), 0,
 146     "Current number of IPv4 fragments across all reassembly queues");
 147
 148 VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
 149 #define V_ipq_zone      VNET(ipq_zone)
 150 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
 151     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 152     NULL, 0, sysctl_maxfragpackets, "I",
 153     "Maximum number of IPv4 fragment reassembly queue entries");
 154 SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
 155     &VNET_NAME(ipq_zone),
 156     "Current number of IPv4 fragment reassembly queue entries");
 157
 158 VNET_DEFINE_STATIC(int, noreass);
 159 #define V_noreass       VNET(noreass)
 160
 161 VNET_DEFINE_STATIC(int, maxfragsperpacket);
 162 #define V_maxfragsperpacket     VNET(maxfragsperpacket)
 163 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
 164     &VNET_NAME(maxfragsperpacket), 0,
 165     "Maximum number of IPv4 fragments allowed per packet");
 166 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
 167     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
 168     sysctl_maxfragbucketsize, "I",
 169     "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 170
 171 /*
 172  * Take incoming datagram fragment and try to reassemble it into
 173  * whole datagram.  If the argument is the first fragment or one
 174  * in between the function will return NULL and store the mbuf
 175  * in the fragment chain.  If the argument is the last fragment
 176  * the packet will be reassembled and the pointer to the new
 177  * mbuf returned for further processing.  Only m_tags attached
 178  * to the first packet/fragment are preserved.
 179  * The IP header is *NOT* adjusted out of iplen.
 180  */
 181 #define M_IP_FRAG       M_PROTO9
 182 struct mbuf *
 183 ip_reass(struct mbuf *m)
 184 {
 185         struct ip *ip;
 186         struct mbuf *p, *q, *nq, *t;
 187         struct ipq *fp;
 188         struct ifnet *srcifp;
 189         struct ipqhead *head;
 190         int i, hlen, next, tmpmax;
 191         u_int8_t ecn, ecn0;
 192         uint32_t hash, hashkey[3];
 193 #ifdef  RSS
 194         uint32_t rss_hash, rss_type;
 195 #endif
 196
 197         /*
 198          * If no reassembling or maxfragsperpacket are 0,
 199          * never accept fragments.
 200          * Also, drop packet if it would exceed the maximum
 201          * number of fragments.
 202          */
 203         tmpmax = maxfrags;
 204         if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
 205             (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
 206                 IPSTAT_INC(ips_fragments);
 207                 IPSTAT_INC(ips_fragdropped);
 208                 m_freem(m);
 209                 return (NULL);
 210         }
 211
 212         ip = mtod(m, struct ip *);
 213         hlen = ip->ip_hl << 2;
 214
 215         /*
 216          * Adjust ip_len to not reflect header,
 217          * convert offset of this to bytes.
 218          */
 219         ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
 220         /*
 221          * Make sure that fragments have a data length
 222          * that's a non-zero multiple of 8 bytes, unless
 223          * this is the last fragment.
 224          */
 225         if (ip->ip_len == htons(0) ||
 226             ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
 227                 IPSTAT_INC(ips_toosmall); /* XXX */
 228                 IPSTAT_INC(ips_fragdropped);
 229                 m_freem(m);
 230                 return (NULL);
 231         }
 232         if (ip->ip_off & htons(IP_MF))
 233                 m->m_flags |= M_IP_FRAG;
 234         else
 235                 m->m_flags &= ~M_IP_FRAG;
 236         ip->ip_off = htons(ntohs(ip->ip_off) << 3);
 237
 238         /*
 239          * Make sure the fragment lies within a packet of valid size.
 240          */
 241         if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
 242                 IPSTAT_INC(ips_toolong);
 243                 IPSTAT_INC(ips_fragdropped);
 244                 m_freem(m);
 245                 return (NULL);
 246         }
 247
 248         /*
 249          * Store receive network interface pointer for later.
 250          */
 251         srcifp = m->m_pkthdr.rcvif;
 252
 253         /*
 254          * Attempt reassembly; if it succeeds, proceed.
 255          * ip_reass() will return a different mbuf.
 256          */
 257         IPSTAT_INC(ips_fragments);
 258         m->m_pkthdr.PH_loc.ptr = ip;
 259
 260         /*
 261          * Presence of header sizes in mbufs
 262          * would confuse code below.
 263          */
 264         m->m_data += hlen;
 265         m->m_len -= hlen;
 266
 267         hashkey[0] = ip->ip_src.s_addr;
 268         hashkey[1] = ip->ip_dst.s_addr;
 269         hashkey[2] = (uint32_t)ip->ip_p << 16;
 270         hashkey[2] += ip->ip_id;
 271         hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
 272         hash &= IPREASS_HMASK;
 273         head = &V_ipq[hash].head;
 274         IPQ_LOCK(hash);
 275
 276         /*
 277          * Look for queue of fragments
 278          * of this datagram.
 279          */
 280         TAILQ_FOREACH(fp, head, ipq_list)
 281                 if (ip->ip_id == fp->ipq_id &&
 282                     ip->ip_src.s_addr == fp->ipq_src.s_addr &&
 283                     ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
 284 #ifdef MAC
 285                     mac_ipq_match(m, fp) &&
 286 #endif
 287                     ip->ip_p == fp->ipq_p)
 288                         break;
 289         /*
 290          * If first fragment to arrive, create a reassembly queue.
 291          */
 292         if (fp == NULL) {
 293                 if (V_ipq[hash].count < V_ipreass_maxbucketsize)
 294                         fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
 295                 if (fp == NULL)
 296                         fp = ipq_reuse(hash);
 297                 if (fp == NULL)
 298                         goto dropfrag;
 299 #ifdef MAC
 300                 if (mac_ipq_init(fp, M_NOWAIT) != 0) {
 301                         uma_zfree(V_ipq_zone, fp);
 302                         fp = NULL;
 303                         goto dropfrag;
 304                 }
 305                 mac_ipq_create(m, fp);
 306 #endif
 307                 TAILQ_INSERT_HEAD(head, fp, ipq_list);
 308                 V_ipq[hash].count++;
 309                 fp->ipq_nfrags = 1;
 310                 atomic_add_int(&nfrags, 1);
 311                 fp->ipq_ttl = IPFRAGTTL;
 312                 fp->ipq_p = ip->ip_p;
 313                 fp->ipq_id = ip->ip_id;
 314                 fp->ipq_src = ip->ip_src;
 315                 fp->ipq_dst = ip->ip_dst;
 316                 fp->ipq_frags = m;
 317                 if (m->m_flags & M_IP_FRAG)
 318                         fp->ipq_maxoff = -1;
 319                 else
 320                         fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 321                 m->m_nextpkt = NULL;
 322                 goto done;
 323         } else {
 324                 /*
 325                  * If we already saw the last fragment, make sure
 326                  * this fragment's offset looks sane. Otherwise, if
 327                  * this is the last fragment, record its endpoint.
 328                  */
 329                 if (fp->ipq_maxoff > 0) {
 330                         i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 331                         if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
 332                             ((m->m_flags & M_IP_FRAG) == 0 &&
 333                             i != fp->ipq_maxoff)) {
 334                                 fp = NULL;
 335                                 goto dropfrag;
 336                         }
 337                 } else if ((m->m_flags & M_IP_FRAG) == 0)
 338                         fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 339                 fp->ipq_nfrags++;
 340                 atomic_add_int(&nfrags, 1);
 341 #ifdef MAC
 342                 mac_ipq_update(m, fp);
 343 #endif
 344         }
 345
 346 #define GETIP(m)        ((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
 347
 348         /*
 349          * Handle ECN by comparing this segment with the first one;
 350          * if CE is set, do not lose CE.
 351          * drop if CE and not-ECT are mixed for the same packet.
 352          */
 353         ecn = ip->ip_tos & IPTOS_ECN_MASK;
 354         ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
 355         if (ecn == IPTOS_ECN_CE) {
 356                 if (ecn0 == IPTOS_ECN_NOTECT)
 357                         goto dropfrag;
 358                 if (ecn0 != IPTOS_ECN_CE)
 359                         GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
 360         }
 361         if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
 362                 goto dropfrag;
 363
 364         /*
 365          * Find a segment which begins after this one does.
 366          */
 367         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
 368                 if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
 369                         break;
 370
 371         /*
 372          * If there is a preceding segment, it may provide some of
 373          * our data already.  If so, drop the data from the incoming
 374          * segment.  If it provides all of our data, drop us, otherwise
 375          * stick new segment in the proper place.
 376          *
 377          * If some of the data is dropped from the preceding
 378          * segment, then it's checksum is invalidated.
 379          */
 380         if (p) {
 381                 i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
 382                     ntohs(ip->ip_off);
 383                 if (i > 0) {
 384                         if (i >= ntohs(ip->ip_len))
 385                                 goto dropfrag;
 386                         m_adj(m, i);
 387                         m->m_pkthdr.csum_flags = 0;
 388                         ip->ip_off = htons(ntohs(ip->ip_off) + i);
 389                         ip->ip_len = htons(ntohs(ip->ip_len) - i);
 390                 }
 391                 m->m_nextpkt = p->m_nextpkt;
 392                 p->m_nextpkt = m;
 393         } else {
 394                 m->m_nextpkt = fp->ipq_frags;
 395                 fp->ipq_frags = m;
 396         }
 397
 398         /*
 399          * While we overlap succeeding segments trim them or,
 400          * if they are completely covered, dequeue them.
 401          */
 402         for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
 403             ntohs(GETIP(q)->ip_off); q = nq) {
 404                 i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
 405                     ntohs(GETIP(q)->ip_off);
 406                 if (i < ntohs(GETIP(q)->ip_len)) {
 407                         GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
 408                         GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
 409                         m_adj(q, i);
 410                         q->m_pkthdr.csum_flags = 0;
 411                         break;
 412                 }
 413                 nq = q->m_nextpkt;
 414                 m->m_nextpkt = nq;
 415                 IPSTAT_INC(ips_fragdropped);
 416                 fp->ipq_nfrags--;
 417                 atomic_subtract_int(&nfrags, 1);
 418                 m_freem(q);
 419         }
 420
 421         /*
 422          * Check for complete reassembly and perform frag per packet
 423          * limiting.
 424          *
 425          * Frag limiting is performed here so that the nth frag has
 426          * a chance to complete the packet before we drop the packet.
 427          * As a result, n+1 frags are actually allowed per packet, but
 428          * only n will ever be stored. (n = maxfragsperpacket.)
 429          *
 430          */
 431         next = 0;
 432         for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
 433                 if (ntohs(GETIP(q)->ip_off) != next) {
 434                         if (fp->ipq_nfrags > V_maxfragsperpacket)
 435                                 ipq_drop(&V_ipq[hash], fp);
 436                         goto done;
 437                 }
 438                 next += ntohs(GETIP(q)->ip_len);
 439         }
 440         /* Make sure the last packet didn't have the IP_MF flag */
 441         if (p->m_flags & M_IP_FRAG) {
 442                 if (fp->ipq_nfrags > V_maxfragsperpacket)
 443                         ipq_drop(&V_ipq[hash], fp);
 444                 goto done;
 445         }
 446
 447         /*
 448          * Reassembly is complete.  Make sure the packet is a sane size.
 449          */
 450         q = fp->ipq_frags;
 451         ip = GETIP(q);
 452         if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 453                 IPSTAT_INC(ips_toolong);
 454                 ipq_drop(&V_ipq[hash], fp);
 455                 goto done;
 456         }
 457
 458         /*
 459          * Concatenate fragments.
 460          */
 461         m = q;
 462         t = m->m_next;
 463         m->m_next = NULL;
 464         m_cat(m, t);
 465         nq = q->m_nextpkt;
 466         q->m_nextpkt = NULL;
 467         for (q = nq; q != NULL; q = nq) {
 468                 nq = q->m_nextpkt;
 469                 q->m_nextpkt = NULL;
 470                 m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
 471                 m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
 472                 m_demote_pkthdr(q);
 473                 m_cat(m, q);
 474         }
 475         /*
 476          * In order to do checksumming faster we do 'end-around carry' here
 477          * (and not in for{} loop), though it implies we are not going to
 478          * reassemble more than 64k fragments.
 479          */
 480         while (m->m_pkthdr.csum_data & 0xffff0000)
 481                 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
 482                     (m->m_pkthdr.csum_data >> 16);
 483         atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 484 #ifdef MAC
 485         mac_ipq_reassemble(fp, m);
 486         mac_ipq_destroy(fp);
 487 #endif
 488
 489         /*
 490          * Create header for new ip packet by modifying header of first
 491          * packet;  dequeue and discard fragment reassembly header.
 492          * Make header visible.
 493          */
 494         ip->ip_len = htons((ip->ip_hl << 2) + next);
 495         ip->ip_src = fp->ipq_src;
 496         ip->ip_dst = fp->ipq_dst;
 497         TAILQ_REMOVE(head, fp, ipq_list);
 498         V_ipq[hash].count--;
 499         uma_zfree(V_ipq_zone, fp);
 500         m->m_len += (ip->ip_hl << 2);
 501         m->m_data -= (ip->ip_hl << 2);
 502         /* some debugging cruft by sklower, below, will go away soon */
 503         if (m->m_flags & M_PKTHDR) {    /* XXX this should be done elsewhere */
 504                 m_fixhdr(m);
 505                 /* set valid receive interface pointer */
 506                 m->m_pkthdr.rcvif = srcifp;
 507         }
 508         IPSTAT_INC(ips_reassembled);
 509         IPQ_UNLOCK(hash);
 510
 511 #ifdef  RSS
 512         /*
 513          * Query the RSS layer for the flowid / flowtype for the
 514          * mbuf payload.
 515          *
 516          * For now, just assume we have to calculate a new one.
 517          * Later on we should check to see if the assigned flowid matches
 518          * what RSS wants for the given IP protocol and if so, just keep it.
 519          *
 520          * We then queue into the relevant netisr so it can be dispatched
 521          * to the correct CPU.
 522          *
 523          * Note - this may return 1, which means the flowid in the mbuf
 524          * is correct for the configured RSS hash types and can be used.
 525          */
 526         if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
 527                 m->m_pkthdr.flowid = rss_hash;
 528                 M_HASHTYPE_SET(m, rss_type);
 529         }
 530
 531         /*
 532          * Queue/dispatch for reprocessing.
 533          *
 534          * Note: this is much slower than just handling the frame in the
 535          * current receive context.  It's likely worth investigating
 536          * why this is.
 537          */
 538         netisr_dispatch(NETISR_IP_DIRECT, m);
 539         return (NULL);
 540 #endif
 541
 542         /* Handle in-line */
 543         return (m);
 544
 545 dropfrag:
 546         IPSTAT_INC(ips_fragdropped);
 547         if (fp != NULL) {
 548                 fp->ipq_nfrags--;
 549                 atomic_subtract_int(&nfrags, 1);
 550         }
 551         m_freem(m);
 552 done:
 553         IPQ_UNLOCK(hash);
 554         return (NULL);
 555
 556 #undef GETIP
 557 }
 558
 559 /*
 560  * Initialize IP reassembly structures.
 561  */
 562 void
 563 ipreass_init(void)
 564 {
 565         int max;
 566
 567         for (int i = 0; i < IPREASS_NHASH; i++) {
 568                 TAILQ_INIT(&V_ipq[i].head);
 569                 mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
 570                     MTX_DEF | MTX_DUPOK);
 571                 V_ipq[i].count = 0;
 572         }
 573         V_ipq_hashseed = arc4random();
 574         V_maxfragsperpacket = 16;
 575         V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 576             NULL, UMA_ALIGN_PTR, 0);
 577         max = IP_MAXFRAGPACKETS;
 578         max = uma_zone_set_max(V_ipq_zone, max);
 579         V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 580
 581         if (IS_DEFAULT_VNET(curvnet)) {
 582                 maxfrags = IP_MAXFRAGS;
 583                 EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
 584                     NULL, EVENTHANDLER_PRI_ANY);
 585         }
 586 }
 587
 588 /*
 589  * If a timer expires on a reassembly queue, discard it.
 590  */
 591 void
 592 ipreass_slowtimo(void)
 593 {
 594         struct ipq *fp, *tmp;
 595
 596         for (int i = 0; i < IPREASS_NHASH; i++) {
 597                 IPQ_LOCK(i);
 598                 TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
 599                 if (--fp->ipq_ttl == 0)
 600                                 ipq_timeout(&V_ipq[i], fp);
 601                 IPQ_UNLOCK(i);
 602         }
 603 }
 604
 605 /*
 606  * Drain off all datagram fragments.
 607  */
 608 void
 609 ipreass_drain(void)
 610 {
 611
 612         for (int i = 0; i < IPREASS_NHASH; i++) {
 613                 IPQ_LOCK(i);
 614                 while(!TAILQ_EMPTY(&V_ipq[i].head))
 615                         ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
 616                 KASSERT(V_ipq[i].count == 0,
 617                     ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
 618                     V_ipq[i].count, V_ipq));
 619                 IPQ_UNLOCK(i);
 620         }
 621 }
 622
 623 /*
 624  * Drain off all datagram fragments belonging to
 625  * the given network interface.
 626  */
 627 static void
 628 ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
 629 {
 630         struct ipq *fp, *temp;
 631         struct mbuf *m;
 632         int i;
 633
 634         KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
 635
 636         CURVNET_SET_QUIET(ifp->if_vnet);
 637
 638         /*
 639          * Skip processing if IPv4 reassembly is not initialised or
 640          * torn down by ipreass_destroy().
 641          */
 642         if (V_ipq_zone == NULL) {
 643                 CURVNET_RESTORE();
 644                 return;
 645         }
 646
 647         for (i = 0; i < IPREASS_NHASH; i++) {
 648                 IPQ_LOCK(i);
 649                 /* Scan fragment list. */
 650                 TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, temp) {
 651                         for (m = fp->ipq_frags; m != NULL; m = m->m_nextpkt) {
 652                                 /* clear no longer valid rcvif pointer */
 653                                 if (m->m_pkthdr.rcvif == ifp)
 654                                         m->m_pkthdr.rcvif = NULL;
 655                         }
 656                 }
 657                 IPQ_UNLOCK(i);
 658         }
 659         CURVNET_RESTORE();
 660 }
 661 EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);
 662
 663 #ifdef VIMAGE
 664 /*
 665  * Destroy IP reassembly structures.
 666  */
 667 void
 668 ipreass_destroy(void)
 669 {
 670
 671         ipreass_drain();
 672         uma_zdestroy(V_ipq_zone);
 673         V_ipq_zone = NULL;
 674         for (int i = 0; i < IPREASS_NHASH; i++)
 675                 mtx_destroy(&V_ipq[i].lock);
 676 }
 677 #endif
 678
 679 /*
 680  * After maxnipq has been updated, propagate the change to UMA.  The UMA zone
 681  * max has slightly different semantics than the sysctl, for historical
 682  * reasons.
 683  */
 684 static void
 685 ipreass_drain_tomax(void)
 686 {
 687         struct ipq *fp;
 688         int target;
 689
 690         /*
 691          * Make sure each bucket is under the new limit. If
 692          * necessary, drop enough of the oldest elements from
 693          * each bucket to get under the new limit.
 694          */
 695         for (int i = 0; i < IPREASS_NHASH; i++) {
 696                 IPQ_LOCK(i);
 697                 while (V_ipq[i].count > V_ipreass_maxbucketsize &&
 698                     (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
 699                         ipq_timeout(&V_ipq[i], fp);
 700                 IPQ_UNLOCK(i);
 701         }
 702
 703         /*
 704          * If we are over the maximum number of fragments,
 705          * drain off enough to get down to the new limit,
 706          * stripping off last elements on queues.  Every
 707          * run we strip the oldest element from each bucket.
 708          */
 709         target = uma_zone_get_max(V_ipq_zone);
 710         while (uma_zone_get_cur(V_ipq_zone) > target) {
 711                 for (int i = 0; i < IPREASS_NHASH; i++) {
 712                         IPQ_LOCK(i);
 713                         fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
 714                         if (fp != NULL)
 715                                 ipq_timeout(&V_ipq[i], fp);
 716                         IPQ_UNLOCK(i);
 717                 }
 718         }
 719 }
 720
 721 static void
 722 ipreass_zone_change(void *tag)
 723 {
 724         VNET_ITERATOR_DECL(vnet_iter);
 725         int max;
 726
 727         maxfrags = IP_MAXFRAGS;
 728         max = IP_MAXFRAGPACKETS;
 729         VNET_LIST_RLOCK_NOSLEEP();
 730         VNET_FOREACH(vnet_iter) {
 731                 CURVNET_SET(vnet_iter);
 732                 max = uma_zone_set_max(V_ipq_zone, max);
 733                 V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 734                 ipreass_drain_tomax();
 735                 CURVNET_RESTORE();
 736         }
 737         VNET_LIST_RUNLOCK_NOSLEEP();
 738 }
 739
 740 /*
 741  * Change the limit on the UMA zone, or disable the fragment allocation
 742  * at all.  Since 0 and -1 is a special values here, we need our own handler,
 743  * instead of sysctl_handle_uma_zone_max().
 744  */
 745 static int
 746 sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
 747 {
 748         int error, max;
 749
 750         if (V_noreass == 0) {
 751                 max = uma_zone_get_max(V_ipq_zone);
 752                 if (max == 0)
 753                         max = -1;
 754         } else
 755                 max = 0;
 756         error = sysctl_handle_int(oidp, &max, 0, req);
 757         if (error || !req->newptr)
 758                 return (error);
 759         if (max > 0) {
 760                 /*
 761                  * XXXRW: Might be a good idea to sanity check the argument
 762                  * and place an extreme upper bound.
 763                  */
 764                 max = uma_zone_set_max(V_ipq_zone, max);
 765                 V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 766                 ipreass_drain_tomax();
 767                 V_noreass = 0;
 768         } else if (max == 0) {
 769                 V_noreass = 1;
 770                 ipreass_drain();
 771         } else if (max == -1) {
 772                 V_noreass = 0;
 773                 uma_zone_set_max(V_ipq_zone, 0);
 774                 V_ipreass_maxbucketsize = INT_MAX;
 775         } else
 776                 return (EINVAL);
 777         return (0);
 778 }
 779
 780 /*
 781  * Seek for old fragment queue header that can be reused.  Try to
 782  * reuse a header from currently locked hash bucket.
 783  */
 784 static struct ipq *
 785 ipq_reuse(int start)
 786 {
 787         struct ipq *fp;
 788         int bucket, i;
 789
 790         IPQ_LOCK_ASSERT(start);
 791
 792         for (i = 0; i < IPREASS_NHASH; i++) {
 793                 bucket = (start + i) % IPREASS_NHASH;
 794                 if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
 795                         continue;
 796                 fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
 797                 if (fp) {
 798                         struct mbuf *m;
 799
 800                         IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 801                         atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 802                         while (fp->ipq_frags) {
 803                                 m = fp->ipq_frags;
 804                                 fp->ipq_frags = m->m_nextpkt;
 805                                 m_freem(m);
 806                         }
 807                         TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
 808                         V_ipq[bucket].count--;
 809                         if (bucket != start)
 810                                 IPQ_UNLOCK(bucket);
 811                         break;
 812                 }
 813                 if (bucket != start)
 814                         IPQ_UNLOCK(bucket);
 815         }
 816         IPQ_LOCK_ASSERT(start);
 817         return (fp);
 818 }
 819
 820 /*
 821  * Free a fragment reassembly header and all associated datagrams.
 822  */
 823 static void
 824 ipq_free(struct ipqbucket *bucket, struct ipq *fp)
 825 {
 826         struct mbuf *q;
 827
 828         atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 829         while (fp->ipq_frags) {
 830                 q = fp->ipq_frags;
 831                 fp->ipq_frags = q->m_nextpkt;
 832                 m_freem(q);
 833         }
 834         TAILQ_REMOVE(&bucket->head, fp, ipq_list);
 835         bucket->count--;
 836         uma_zfree(V_ipq_zone, fp);
 837 }
 838
 839 /*
 840  * Get or set the maximum number of reassembly queues per bucket.
 841  */
 842 static int
 843 sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
 844 {
 845         int error, max;
 846
 847         max = V_ipreass_maxbucketsize;
 848         error = sysctl_handle_int(oidp, &max, 0, req);
 849         if (error || !req->newptr)
 850                 return (error);
 851         if (max <= 0)
 852                 return (EINVAL);
 853         V_ipreass_maxbucketsize = max;
 854         ipreass_drain_tomax();
 855         return (0);
 856 }