sys/netinet6/frag6.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. Neither the name of the project nor the names of its contributors
  16  *    may be used to endorse or promote products derived from this software
  17  *    without specific prior written permission.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29  * SUCH DAMAGE.
  30  *
  31  *      $KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include "opt_rss.h"
  38
  39 #include <sys/param.h>
  40 #include <sys/domain.h>
  41 #include <sys/errno.h>
  42 #include <sys/eventhandler.h>
  43 #include <sys/hash.h>
  44 #include <sys/kernel.h>
  45 #include <sys/malloc.h>
  46 #include <sys/mbuf.h>
  47 #include <sys/protosw.h>
  48 #include <sys/socket.h>
  49 #include <sys/time.h>
  50 #include <sys/sysctl.h>
  51 #include <sys/syslog.h>
  52
  53 #include <machine/atomic.h>
  54
  55 #include <net/if.h>
  56 #include <net/if_var.h>
  57 #include <net/netisr.h>
  58 #include <net/route.h>
  59 #include <net/vnet.h>
  60
  61 #include <netinet/in.h>
  62 #include <netinet/in_var.h>
  63 #include <netinet/ip6.h>
  64 #include <netinet6/ip6_var.h>
  65 #include <netinet/icmp6.h>
  66 #include <netinet/in_systm.h>   /* for ECN definitions */
  67 #include <netinet/ip.h>         /* for ECN definitions */
  68
  69 #ifdef MAC
  70 #include <security/mac/mac_framework.h>
  71 #endif
  72
  73 /*
  74  * Reassembly headers are stored in hash buckets.
  75  */
  76 #define IP6REASS_NHASH_LOG2     10      /* log2 of the number of hash buckets */
  77 #define IP6REASS_NHASH          (1 << IP6REASS_NHASH_LOG2)      /* number of buckets */
  78 #define IP6REASS_HMASK          (IP6REASS_NHASH - 1)    /* masks a hash value to a bucket index */
  79
  80 static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
  81     uint32_t bucket __unused);     /* link a fragment into a packet's fragment list */
  82 static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);    /* unlink a fragment */
  83 static void frag6_insque_head(struct ip6q *, struct ip6q *,
  84     uint32_t bucket);      /* presumably inserts a queue at the bucket head; defined elsewhere */
  85 static void frag6_remque(struct ip6q *, uint32_t bucket);       /* remove a queue from its bucket */
  86 static void frag6_freef(struct ip6q *, uint32_t bucket);        /* free a queue and all its fragments */
  87
  88 struct ip6qbucket {        /* one hash bucket of the per-VNET reassembly table */
  89         struct ip6q     ip6q;   /* list head; frag6_init() links it to itself */
  90         struct mtx      lock;   /* taken/released through the IP6Q_*LOCK() macros */
  91         int             count;  /* compared against V_ip6_maxfragbucketsize; presumably queues in bucket */
  92 };
  93
  94 /* System wide (global) maximum and count of fragments in reassembly queues. */
  95 static int ip6_maxfrags;        /* 0: accept no fragments, negative: no limit */
  96 static volatile u_int frag6_nfrags = 0; /* updated with atomic_*_int() only */
  97
  98 /* Maximum and current number of packets under reassembly, per VNET. */
  99 VNET_DEFINE_STATIC(int,                 ip6_maxfragpackets);
 100 VNET_DEFINE_STATIC(volatile u_int,      frag6_nfragpackets);
 101 #define V_ip6_maxfragpackets            VNET(ip6_maxfragpackets)
 102 #define V_frag6_nfragpackets            VNET(frag6_nfragpackets)
 103
 104 /* Maximum per-VNET reassembly queues per bucket and fragments per packet. */
 105 VNET_DEFINE_STATIC(int,                 ip6_maxfragbucketsize);
 106 VNET_DEFINE_STATIC(int,                 ip6_maxfragsperpacket);
 107 #define V_ip6_maxfragbucketsize         VNET(ip6_maxfragbucketsize)
 108 #define V_ip6_maxfragsperpacket         VNET(ip6_maxfragsperpacket)
 109
 110 /* Per-VNET reassembly queue buckets and the seed used to hash into them. */
 111 VNET_DEFINE_STATIC(struct ip6qbucket,   ip6q[IP6REASS_NHASH]);
 112 VNET_DEFINE_STATIC(uint32_t,            ip6q_hashseed);
 113 #define V_ip6q                          VNET(ip6q)
 114 #define V_ip6q_hashseed                 VNET(ip6q_hashseed)
 115
 116 #define IP6Q_LOCK(i)            mtx_lock(&V_ip6q[(i)].lock)     /* (i) is a bucket index */
 117 #define IP6Q_TRYLOCK(i)         mtx_trylock(&V_ip6q[(i)].lock)
 118 #define IP6Q_LOCK_ASSERT(i)     mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
 119 #define IP6Q_UNLOCK(i)          mtx_unlock(&V_ip6q[(i)].lock)
 120 #define IP6Q_HEAD(i)            (&V_ip6q[(i)].ip6q)     /* list head of bucket (i) */
 121
 122 static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");       /* type for ip6q and ip6asfrag allocations */
 123
 124 /*
 125  * By default, limit the number of IP6 fragments across all reassembly
 126  * queues to 1/32 of the total number of mbuf clusters.
 127  *
 128  * Limit the total number of reassembly queues per VNET to the
 129  * IP6 fragment limit, but ensure the limit will not allow any bucket
 130  * to grow above 100 items. (The bucket limit is
 131  * IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
 132  * multiplier to reach a 100-item limit.)
 133  * The 100-item limit was chosen as brief testing seems to show that
 134  * this produces "reasonable" performance on some subset of systems
 135  * under DoS attack.
 136  */
 137 #define IP6_MAXFRAGS            (nmbclusters / 32)      /* default global fragment limit */
 138 #define IP6_MAXFRAGPACKETS      (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))       /* default per-VNET packet limit */
 139
 140
 141 /*
 142  * Sysctls and helper function.
 143  */
 144 SYSCTL_DECL(_net_inet6_ip6);    /* parent node of the sysctls declared in this file */
 145
 146 static void
 147 frag6_set_bucketsize()
 148 {
 149         int i;
 150
 151         if ((i = V_ip6_maxfragpackets) > 0)
 152                 V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
 153 }
 154
 155 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
 156         CTLFLAG_RW, &ip6_maxfrags, 0,
 157         "Maximum allowed number of outstanding IPv6 packet fragments. "
 158         "A value of 0 means no fragmented packets will be accepted, while a "
 159         "a value of -1 means no limit");
 160
 161 static int
 162 sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
 163 {
 164         int error, val;
 165
 166         val = V_ip6_maxfragpackets;
 167         error = sysctl_handle_int(oidp, &val, 0, req);
 168         if (error != 0 || !req->newptr)
 169                 return (error);
 170         V_ip6_maxfragpackets = val;
 171         frag6_set_bucketsize();
 172         return (0);
 173 }
 174 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
 175         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
 176         sysctl_ip6_maxfragpackets, "I",
 177         "Default maximum number of outstanding fragmented IPv6 packets. "
 178         "A value of 0 means no fragmented packets will be accepted, while a "
 179         "a value of -1 means no limit");
 180 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
 181         CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
 182         "Maximum allowed number of fragments per packet");
 183 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
 184         CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
 185         "Maximum number of reassembly queues per hash bucket");
 186
 187
 188 /*
 189  * Initialise reassembly queue and fragment identifier.
 190  */
 191 static void
 192 frag6_change(void *tag)
 193 {
 194         VNET_ITERATOR_DECL(vnet_iter);
 195
 196         ip6_maxfrags = IP6_MAXFRAGS;
 197         VNET_LIST_RLOCK_NOSLEEP();
 198         VNET_FOREACH(vnet_iter) {
 199                 CURVNET_SET(vnet_iter);
 200                 V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
 201                 frag6_set_bucketsize();
 202                 CURVNET_RESTORE();
 203         }
 204         VNET_LIST_RUNLOCK_NOSLEEP();
 205 }
 206
 207 void
 208 frag6_init(void)
 209 {
 210         struct ip6q *q6;
 211         int i;
 212
 213         V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
 214         frag6_set_bucketsize();
 215         for (i = 0; i < IP6REASS_NHASH; i++) {
 216                 q6 = IP6Q_HEAD(i);
 217                 q6->ip6q_next = q6->ip6q_prev = q6;
 218                 mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
 219                 V_ip6q[i].count = 0;
 220         }
 221         V_ip6q_hashseed = arc4random();
 222         V_ip6_maxfragsperpacket = 64;
 223         if (!IS_DEFAULT_VNET(curvnet))
 224                 return;
 225
 226         ip6_maxfrags = IP6_MAXFRAGS;
 227         EVENTHANDLER_REGISTER(nmbclusters_change,
 228             frag6_change, NULL, EVENTHANDLER_PRI_ANY);
 229 }
 230
 231 /*
 232  * In RFC2460, the fragmentation and reassembly rules do not agree with each
 233  * other in terms of Next Header field handling in the Fragment header.
 234  * While the sender will use the same value for all of the fragments, the
 235  * receiver is advised not to check the consistency.
 236  *
 237  * fragment rule (p20):
 238  *      (2) A Fragment header containing:
 239  *      The Next Header value that identifies the first header of
 240  *      the Fragmentable Part of the original packet.
 241  *              -> next header field is same for all fragments
 242  *
 243  * reassembly rule (p21):
 244  *      The Next Header field of the last header of the Unfragmentable
 245  *      Part is obtained from the Next Header field of the first
 246  *      fragment's Fragment header.
 247  *              -> should grab it from the first fragment only
 248  *
 249  * The following note also contradicts with fragment rule - no one is going to
 250  * send different fragment with different next header field.
 251  *
 252  * additional note (p22):
 253  *      The Next Header values in the Fragment headers of different
 254  *      fragments of the same original packet may differ.  Only the value
 255  *      from the Offset zero fragment packet is used for reassembly.
 256  *              -> should grab it from the first fragment only
 257  *
 258  * There is no explicit reason given in the RFC.  Historical reason maybe?
 259  */
 260 /*
 261  * Fragment input
 262  */
 263 int
 264 frag6_input(struct mbuf **mp, int *offp, int proto)
 265 {
 266         struct mbuf *m = *mp, *t;
 267         struct ip6_hdr *ip6;
 268         struct ip6_frag *ip6f;
 269         struct ip6q *head, *q6;
 270         struct ip6asfrag *af6, *ip6af, *af6dwn;
 271         struct in6_ifaddr *ia;
 272         int offset = *offp, nxt, i, next;
 273         int first_frag = 0;
 274         int fragoff, frgpartlen;        /* must be larger than u_int16_t */
 275         uint32_t hashkey[(sizeof(struct in6_addr) * 2 +
 276                     sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)];
 277         uint32_t hash, *hashkeyp;
 278         struct ifnet *dstifp;
 279         u_int8_t ecn, ecn0;
 280 #ifdef RSS
 281         struct m_tag *mtag;
 282         struct ip6_direct_ctx *ip6dc;
 283 #endif
 284
 285 #if 0
 286         char ip6buf[INET6_ADDRSTRLEN];
 287 #endif
 288
 289         ip6 = mtod(m, struct ip6_hdr *);
 290 #ifndef PULLDOWN_TEST
 291         IP6_EXTHDR_CHECK(m, offset, sizeof(struct ip6_frag), IPPROTO_DONE);
 292         ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset);
 293 #else
 294         IP6_EXTHDR_GET(ip6f, struct ip6_frag *, m, offset, sizeof(*ip6f));
 295         if (ip6f == NULL)
 296                 return (IPPROTO_DONE);
 297 #endif
 298
 299         dstifp = NULL;
 300         /* find the destination interface of the packet. */
 301         ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */);
 302         if (ia != NULL) {
 303                 dstifp = ia->ia_ifp;
 304                 ifa_free(&ia->ia_ifa);
 305         }
 306         /* jumbo payload can't contain a fragment header */
 307         if (ip6->ip6_plen == 0) {
 308                 icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset);
 309                 in6_ifstat_inc(dstifp, ifs6_reass_fail);
 310                 return IPPROTO_DONE;
 311         }
 312
 313         /*
 314          * check whether fragment packet's fragment length is
 315          * multiple of 8 octets.
 316          * sizeof(struct ip6_frag) == 8
 317          * sizeof(struct ip6_hdr) = 40
 318          */
 319         if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) &&
 320             (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) {
 321                 icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 322                     offsetof(struct ip6_hdr, ip6_plen));
 323                 in6_ifstat_inc(dstifp, ifs6_reass_fail);
 324                 return IPPROTO_DONE;
 325         }
 326
 327         IP6STAT_INC(ip6s_fragments);
 328         in6_ifstat_inc(dstifp, ifs6_reass_reqd);
 329
 330         /* offset now points to data portion */
 331         offset += sizeof(struct ip6_frag);
 332
 333         /*
 334          * Handle "atomic" fragments (offset and m bit set to 0) upfront,
 335          * unrelated to any reassembly (see RFC 6946 and section 4.5 of RFC
 336          * 8200).  Just skip the fragment header.
 337          */
 338         if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
 339                 IP6STAT_INC(ip6s_atomicfrags);
 340                 in6_ifstat_inc(dstifp, ifs6_reass_ok);
 341                 *offp = offset;
 342                 m->m_flags |= M_FRAGMENTED;
 343                 return (ip6f->ip6f_nxt);
 344         }
 345
 346         /* Get fragment length and discard 0-byte fragments. */
 347         frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
 348         if (frgpartlen == 0) {
 349                 icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 350                     offsetof(struct ip6_hdr, ip6_plen));
 351                 in6_ifstat_inc(dstifp, ifs6_reass_fail);
 352                 IP6STAT_INC(ip6s_fragdropped);
 353                 return IPPROTO_DONE;
 354         }
 355
 356         hashkeyp = hashkey;
 357         memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
 358         hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
 359         memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
 360         hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
 361         *hashkeyp = ip6f->ip6f_ident;
 362         hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
 363         hash &= IP6REASS_HMASK;
 364         head = IP6Q_HEAD(hash);
 365         IP6Q_LOCK(hash);
 366
 367         /*
 368          * Enforce upper bound on number of fragments.
 369          * If maxfrag is 0, never accept fragments.
 370          * If maxfrag is -1, accept all fragments without limitation.
 371          */
 372         if (ip6_maxfrags < 0)
 373                 ;
 374         else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags)
 375                 goto dropfrag;
 376
 377         for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
 378                 if (ip6f->ip6f_ident == q6->ip6q_ident &&
 379                     IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
 380                     IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
 381 #ifdef MAC
 382                     && mac_ip6q_match(m, q6)
 383 #endif
 384                     )
 385                         break;
 386
 387         if (q6 == head) {
 388                 /*
 389                  * the first fragment to arrive, create a reassembly queue.
 390                  */
 391                 first_frag = 1;
 392
 393                 /*
 394                  * Enforce upper bound on number of fragmented packets
 395                  * for which we attempt reassembly;
 396                  * If maxfragpackets is 0, never accept fragments.
 397                  * If maxfragpackets is -1, accept all fragments without
 398                  * limitation.
 399                  */
 400                 if (V_ip6_maxfragpackets < 0)
 401                         ;
 402                 else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize ||
 403                     atomic_load_int(&V_frag6_nfragpackets) >=
 404                     (u_int)V_ip6_maxfragpackets)
 405                         goto dropfrag;
 406                 atomic_add_int(&V_frag6_nfragpackets, 1);
 407                 q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
 408                     M_NOWAIT);
 409                 if (q6 == NULL)
 410                         goto dropfrag;
 411                 bzero(q6, sizeof(*q6));
 412 #ifdef MAC
 413                 if (mac_ip6q_init(q6, M_NOWAIT) != 0) {
 414                         free(q6, M_FTABLE);
 415                         goto dropfrag;
 416                 }
 417                 mac_ip6q_create(m, q6);
 418 #endif
 419                 frag6_insque_head(q6, head, hash);
 420
 421                 /* ip6q_nxt will be filled afterwards, from 1st fragment */
 422                 q6->ip6q_down   = q6->ip6q_up = (struct ip6asfrag *)q6;
 423 #ifdef notyet
 424                 q6->ip6q_nxtp   = (u_char *)nxtp;
 425 #endif
 426                 q6->ip6q_ident  = ip6f->ip6f_ident;
 427                 q6->ip6q_ttl    = IPV6_FRAGTTL;
 428                 q6->ip6q_src    = ip6->ip6_src;
 429                 q6->ip6q_dst    = ip6->ip6_dst;
 430                 q6->ip6q_ecn    =
 431                     (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
 432                 q6->ip6q_unfrglen = -1; /* The 1st fragment has not arrived. */
 433
 434                 q6->ip6q_nfrag = 0;
 435         }
 436
 437         /*
 438          * If it's the 1st fragment, record the length of the
 439          * unfragmentable part and the next header of the fragment header.
 440          */
 441         fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
 442         if (fragoff == 0) {
 443                 q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) -
 444                     sizeof(struct ip6_frag);
 445                 q6->ip6q_nxt = ip6f->ip6f_nxt;
 446         }
 447
 448         /*
 449          * Check that the reassembled packet would not exceed 65535 bytes
 450          * in size.
 451          * If it would exceed, discard the fragment and return an ICMP error.
 452          */
 453         if (q6->ip6q_unfrglen >= 0) {
 454                 /* The 1st fragment has already arrived. */
 455                 if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
 456                         icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 457                             offset - sizeof(struct ip6_frag) +
 458                             offsetof(struct ip6_frag, ip6f_offlg));
 459                         IP6Q_UNLOCK(hash);
 460                         return (IPPROTO_DONE);
 461                 }
 462         } else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
 463                 icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 464                     offset - sizeof(struct ip6_frag) +
 465                     offsetof(struct ip6_frag, ip6f_offlg));
 466                 IP6Q_UNLOCK(hash);
 467                 return (IPPROTO_DONE);
 468         }
 469         /*
 470          * If it's the first fragment, do the above check for each
 471          * fragment already stored in the reassembly queue.
 472          */
 473         if (fragoff == 0) {
 474                 for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 475                      af6 = af6dwn) {
 476                         af6dwn = af6->ip6af_down;
 477
 478                         if (q6->ip6q_unfrglen + af6->ip6af_off + af6->ip6af_frglen >
 479                             IPV6_MAXPACKET) {
 480                                 struct mbuf *merr = IP6_REASS_MBUF(af6);
 481                                 struct ip6_hdr *ip6err;
 482                                 int erroff = af6->ip6af_offset;
 483
 484                                 /* dequeue the fragment. */
 485                                 frag6_deq(af6, hash);
 486                                 free(af6, M_FTABLE);
 487
 488                                 /* adjust pointer. */
 489                                 ip6err = mtod(merr, struct ip6_hdr *);
 490
 491                                 /*
 492                                  * Restore source and destination addresses
 493                                  * in the erroneous IPv6 header.
 494                                  */
 495                                 ip6err->ip6_src = q6->ip6q_src;
 496                                 ip6err->ip6_dst = q6->ip6q_dst;
 497
 498                                 icmp6_error(merr, ICMP6_PARAM_PROB,
 499                                     ICMP6_PARAMPROB_HEADER,
 500                                     erroff - sizeof(struct ip6_frag) +
 501                                     offsetof(struct ip6_frag, ip6f_offlg));
 502                         }
 503                 }
 504         }
 505
 506         ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FTABLE,
 507             M_NOWAIT);
 508         if (ip6af == NULL)
 509                 goto dropfrag;
 510         bzero(ip6af, sizeof(*ip6af));
 511         ip6af->ip6af_mff = ip6f->ip6f_offlg & IP6F_MORE_FRAG;
 512         ip6af->ip6af_off = fragoff;
 513         ip6af->ip6af_frglen = frgpartlen;
 514         ip6af->ip6af_offset = offset;
 515         IP6_REASS_MBUF(ip6af) = m;
 516
 517         if (first_frag) {
 518                 af6 = (struct ip6asfrag *)q6;
 519                 goto insert;
 520         }
 521
 522         /*
 523          * Handle ECN by comparing this segment with the first one;
 524          * if CE is set, do not lose CE.
 525          * drop if CE and not-ECT are mixed for the same packet.
 526          */
 527         ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
 528         ecn0 = q6->ip6q_ecn;
 529         if (ecn == IPTOS_ECN_CE) {
 530                 if (ecn0 == IPTOS_ECN_NOTECT) {
 531                         free(ip6af, M_FTABLE);
 532                         goto dropfrag;
 533                 }
 534                 if (ecn0 != IPTOS_ECN_CE)
 535                         q6->ip6q_ecn = IPTOS_ECN_CE;
 536         }
 537         if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
 538                 free(ip6af, M_FTABLE);
 539                 goto dropfrag;
 540         }
 541
 542         /*
 543          * Find a segment which begins after this one does.
 544          */
 545         for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 546              af6 = af6->ip6af_down)
 547                 if (af6->ip6af_off > ip6af->ip6af_off)
 548                         break;
 549
 550 #if 0
 551         /*
 552          * If there is a preceding segment, it may provide some of
 553          * our data already.  If so, drop the data from the incoming
 554          * segment.  If it provides all of our data, drop us.
 555          */
 556         if (af6->ip6af_up != (struct ip6asfrag *)q6) {
 557                 i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen
 558                         - ip6af->ip6af_off;
 559                 if (i > 0) {
 560                         if (i >= ip6af->ip6af_frglen)
 561                                 goto dropfrag;
 562                         m_adj(IP6_REASS_MBUF(ip6af), i);
 563                         ip6af->ip6af_off += i;
 564                         ip6af->ip6af_frglen -= i;
 565                 }
 566         }
 567
 568         /*
 569          * While we overlap succeeding segments trim them or,
 570          * if they are completely covered, dequeue them.
 571          */
 572         while (af6 != (struct ip6asfrag *)q6 &&
 573                ip6af->ip6af_off + ip6af->ip6af_frglen > af6->ip6af_off) {
 574                 i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off;
 575                 if (i < af6->ip6af_frglen) {
 576                         af6->ip6af_frglen -= i;
 577                         af6->ip6af_off += i;
 578                         m_adj(IP6_REASS_MBUF(af6), i);
 579                         break;
 580                 }
 581                 af6 = af6->ip6af_down;
 582                 m_freem(IP6_REASS_MBUF(af6->ip6af_up));
 583                 frag6_deq(af6->ip6af_up, hash);
 584         }
 585 #else
 586         /*
 587          * If the incoming framgent overlaps some existing fragments in
 588          * the reassembly queue, drop it, since it is dangerous to override
 589          * existing fragments from a security point of view.
 590          * We don't know which fragment is the bad guy - here we trust
 591          * fragment that came in earlier, with no real reason.
 592          *
 593          * Note: due to changes after disabling this part, mbuf passed to
 594          * m_adj() below now does not meet the requirement.
 595          */
 596         if (af6->ip6af_up != (struct ip6asfrag *)q6) {
 597                 i = af6->ip6af_up->ip6af_off + af6->ip6af_up->ip6af_frglen
 598                         - ip6af->ip6af_off;
 599                 if (i > 0) {
 600 #if 0                           /* suppress the noisy log */
 601                         log(LOG_ERR, "%d bytes of a fragment from %s "
 602                             "overlaps the previous fragment\n",
 603                             i, ip6_sprintf(ip6buf, &q6->ip6q_src));
 604 #endif
 605                         free(ip6af, M_FTABLE);
 606                         goto dropfrag;
 607                 }
 608         }
 609         if (af6 != (struct ip6asfrag *)q6) {
 610                 i = (ip6af->ip6af_off + ip6af->ip6af_frglen) - af6->ip6af_off;
 611                 if (i > 0) {
 612 #if 0                           /* suppress the noisy log */
 613                         log(LOG_ERR, "%d bytes of a fragment from %s "
 614                             "overlaps the succeeding fragment",
 615                             i, ip6_sprintf(ip6buf, &q6->ip6q_src));
 616 #endif
 617                         free(ip6af, M_FTABLE);
 618                         goto dropfrag;
 619                 }
 620         }
 621 #endif
 622
 623 insert:
 624 #ifdef MAC
 625         if (!first_frag)
 626                 mac_ip6q_update(m, q6);
 627 #endif
 628
 629         /*
 630          * Stick new segment in its place;
 631          * check for complete reassembly.
 632          * If not complete, check fragment limit.
 633          * Move to front of packet queue, as we are
 634          * the most recently active fragmented packet.
 635          */
 636         frag6_enq(ip6af, af6->ip6af_up, hash);
 637         atomic_add_int(&frag6_nfrags, 1);
 638         q6->ip6q_nfrag++;
 639 #if 0 /* xxx */
 640         if (q6 != head->ip6q_next) {
 641                 frag6_remque(q6, hash);
 642                 frag6_insque_head(q6, head, hash);
 643         }
 644 #endif
 645         next = 0;
 646         for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 647              af6 = af6->ip6af_down) {
 648                 if (af6->ip6af_off != next) {
 649                         if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
 650                                 IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
 651                                 frag6_freef(q6, hash);
 652                         }
 653                         IP6Q_UNLOCK(hash);
 654                         return IPPROTO_DONE;
 655                 }
 656                 next += af6->ip6af_frglen;
 657         }
 658         if (af6->ip6af_up->ip6af_mff) {
 659                 if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
 660                         IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
 661                         frag6_freef(q6, hash);
 662                 }
 663                 IP6Q_UNLOCK(hash);
 664                 return IPPROTO_DONE;
 665         }
 666
 667         /*
 668          * Reassembly is complete; concatenate fragments.
 669          */
 670         ip6af = q6->ip6q_down;
 671         t = m = IP6_REASS_MBUF(ip6af);
 672         af6 = ip6af->ip6af_down;
 673         frag6_deq(ip6af, hash);
 674         while (af6 != (struct ip6asfrag *)q6) {
 675                 m->m_pkthdr.csum_flags &=
 676                     IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
 677                 m->m_pkthdr.csum_data +=
 678                     IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;
 679
 680                 af6dwn = af6->ip6af_down;
 681                 frag6_deq(af6, hash);
 682                 while (t->m_next)
 683                         t = t->m_next;
 684                 m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
 685                 m_demote_pkthdr(IP6_REASS_MBUF(af6));
 686                 m_cat(t, IP6_REASS_MBUF(af6));
 687                 free(af6, M_FTABLE);
 688                 af6 = af6dwn;
 689         }
 690
 691         while (m->m_pkthdr.csum_data & 0xffff0000)
 692                 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
 693                     (m->m_pkthdr.csum_data >> 16);
 694
 695         /* adjust offset to point where the original next header starts */
 696         offset = ip6af->ip6af_offset - sizeof(struct ip6_frag);
 697         free(ip6af, M_FTABLE);
 698         ip6 = mtod(m, struct ip6_hdr *);
 699         ip6->ip6_plen = htons((u_short)next + offset - sizeof(struct ip6_hdr));
 700         if (q6->ip6q_ecn == IPTOS_ECN_CE)
 701                 ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20);
 702         nxt = q6->ip6q_nxt;
 703 #ifdef notyet
 704         *q6->ip6q_nxtp = (u_char)(nxt & 0xff);
 705 #endif
 706
 707         if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
 708                 frag6_remque(q6, hash);
 709                 atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 710 #ifdef MAC
 711                 mac_ip6q_destroy(q6);
 712 #endif
 713                 free(q6, M_FTABLE);
 714                 atomic_subtract_int(&V_frag6_nfragpackets, 1);
 715
 716                 goto dropfrag;
 717         }
 718
 719         /*
 720          * Store NXT to the original.
 721          */
 722         m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
 723             (caddr_t)&nxt);
 724
 725         frag6_remque(q6, hash);
 726         atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 727 #ifdef MAC
 728         mac_ip6q_reassemble(q6, m);
 729         mac_ip6q_destroy(q6);
 730 #endif
 731         free(q6, M_FTABLE);
 732         atomic_subtract_int(&V_frag6_nfragpackets, 1);
 733
 734         if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
 735                 int plen = 0;
 736                 for (t = m; t; t = t->m_next)
 737                         plen += t->m_len;
 738                 m->m_pkthdr.len = plen;
 739         }
 740
 741 #ifdef RSS
 742         mtag = m_tag_alloc(MTAG_ABI_IPV6, IPV6_TAG_DIRECT, sizeof(*ip6dc),
 743             M_NOWAIT);
 744         if (mtag == NULL)
 745                 goto dropfrag;
 746
 747         ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
 748         ip6dc->ip6dc_nxt = nxt;
 749         ip6dc->ip6dc_off = offset;
 750
 751         m_tag_prepend(m, mtag);
 752 #endif
 753
 754         IP6Q_UNLOCK(hash);
 755         IP6STAT_INC(ip6s_reassembled);
 756         in6_ifstat_inc(dstifp, ifs6_reass_ok);
 757
 758 #ifdef RSS
 759         /*
 760          * Queue/dispatch for reprocessing.
 761          */
 762         netisr_dispatch(NETISR_IPV6_DIRECT, m);
 763         return IPPROTO_DONE;
 764 #endif
 765
 766         /*
 767          * Tell launch routine the next header
 768          */
 769
 770         *mp = m;
 771         *offp = offset;
 772
 773         return nxt;
 774
 775  dropfrag:
 776         IP6Q_UNLOCK(hash);
 777         in6_ifstat_inc(dstifp, ifs6_reass_fail);
 778         IP6STAT_INC(ip6s_fragdropped);
 779         m_freem(m);
 780         return IPPROTO_DONE;
 781 }
 782
 783 /*
 784  * Free a fragment reassembly header and all
 785  * associated datagrams.
 786  */
 787 static void
 788 frag6_freef(struct ip6q *q6, uint32_t bucket)
 789 {
 790         struct ip6asfrag *af6, *down6;
 791
 792         IP6Q_LOCK_ASSERT(bucket);
 793
 794         for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 795              af6 = down6) {
 796                 struct mbuf *m = IP6_REASS_MBUF(af6);
 797
 798                 down6 = af6->ip6af_down;
 799                 frag6_deq(af6, bucket);
 800
 801                 /*
 802                  * Return ICMP time exceeded error for the 1st fragment.
 803                  * Just free other fragments.
 804                  */
 805                 if (af6->ip6af_off == 0) {
 806                         struct ip6_hdr *ip6;
 807
 808                         /* adjust pointer */
 809                         ip6 = mtod(m, struct ip6_hdr *);
 810
 811                         /* restore source and destination addresses */
 812                         ip6->ip6_src = q6->ip6q_src;
 813                         ip6->ip6_dst = q6->ip6q_dst;
 814
 815                         icmp6_error(m, ICMP6_TIME_EXCEEDED,
 816                                     ICMP6_TIME_EXCEED_REASSEMBLY, 0);
 817                 } else
 818                         m_freem(m);
 819                 free(af6, M_FTABLE);
 820         }
 821         frag6_remque(q6, bucket);
 822         atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 823 #ifdef MAC
 824         mac_ip6q_destroy(q6);
 825 #endif
 826         free(q6, M_FTABLE);
 827         atomic_subtract_int(&V_frag6_nfragpackets, 1);
 828 }
 829
 830 /*
 831  * Put an ip fragment on a reassembly chain.
 832  * Like insque, but pointers in middle of structure.
 833  */
 834 static void
 835 frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
 836     uint32_t bucket __unused)
 837 {
 838
 839         IP6Q_LOCK_ASSERT(bucket);
 840
 841         af6->ip6af_up = up6;
 842         af6->ip6af_down = up6->ip6af_down;
 843         up6->ip6af_down->ip6af_up = af6;
 844         up6->ip6af_down = af6;
 845 }
 846
 847 /*
 848  * To frag6_enq as remque is to insque.
 849  */
 850 static void
 851 frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
 852 {
 853
 854         IP6Q_LOCK_ASSERT(bucket);
 855
 856         af6->ip6af_up->ip6af_down = af6->ip6af_down;
 857         af6->ip6af_down->ip6af_up = af6->ip6af_up;
 858 }
 859
 860 static void
 861 frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket)
 862 {
 863
 864         IP6Q_LOCK_ASSERT(bucket);
 865         KASSERT(IP6Q_HEAD(bucket) == old,
 866             ("%s: attempt to insert at head of wrong bucket"
 867             " (bucket=%u, old=%p)", __func__, bucket, old));
 868
 869         new->ip6q_prev = old;
 870         new->ip6q_next = old->ip6q_next;
 871         old->ip6q_next->ip6q_prev= new;
 872         old->ip6q_next = new;
 873         V_ip6q[bucket].count++;
 874 }
 875
 876 static void
 877 frag6_remque(struct ip6q *p6, uint32_t bucket)
 878 {
 879
 880         IP6Q_LOCK_ASSERT(bucket);
 881
 882         p6->ip6q_prev->ip6q_next = p6->ip6q_next;
 883         p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
 884         V_ip6q[bucket].count--;
 885 }
 886
 887 /*
 888  * IPv6 reassembling timer processing;
 889  * if a timer expires on a reassembly
 890  * queue, discard it.
 891  */
 892 void
 893 frag6_slowtimo(void)
 894 {
 895         VNET_ITERATOR_DECL(vnet_iter);
 896         struct ip6q *head, *q6;
 897         int i;
 898
 899         VNET_LIST_RLOCK_NOSLEEP();
 900         VNET_FOREACH(vnet_iter) {
 901                 CURVNET_SET(vnet_iter);
 902                 for (i = 0; i < IP6REASS_NHASH; i++) {
 903                         IP6Q_LOCK(i);
 904                         head = IP6Q_HEAD(i);
 905                         q6 = head->ip6q_next;
 906                         if (q6 == NULL) {
 907                                 /*
 908                                  * XXXJTL: This should never happen. This
 909                                  * should turn into an assertion.
 910                                  */
 911                                 IP6Q_UNLOCK(i);
 912                                 continue;
 913                         }
 914                         while (q6 != head) {
 915                                 --q6->ip6q_ttl;
 916                                 q6 = q6->ip6q_next;
 917                                 if (q6->ip6q_prev->ip6q_ttl == 0) {
 918                                         IP6STAT_ADD(ip6s_fragtimeout,
 919                                                 q6->ip6q_prev->ip6q_nfrag);
 920                                         /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 921                                         frag6_freef(q6->ip6q_prev, i);
 922                                 }
 923                         }
 924                         /*
 925                          * If we are over the maximum number of fragments
 926                          * (due to the limit being lowered), drain off
 927                          * enough to get down to the new limit.
 928                          * Note that we drain all reassembly queues if
 929                          * maxfragpackets is 0 (fragmentation is disabled),
 930                          * and don't enforce a limit when maxfragpackets
 931                          * is negative.
 932                          */
 933                         while ((V_ip6_maxfragpackets == 0 ||
 934                             (V_ip6_maxfragpackets > 0 &&
 935                             V_ip6q[i].count > V_ip6_maxfragbucketsize)) &&
 936                             head->ip6q_prev != head) {
 937                                 IP6STAT_ADD(ip6s_fragoverflow,
 938                                         q6->ip6q_prev->ip6q_nfrag);
 939                                 /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 940                                 frag6_freef(head->ip6q_prev, i);
 941                         }
 942                         IP6Q_UNLOCK(i);
 943                 }
 944                 /*
 945                  * If we are still over the maximum number of fragmented
 946                  * packets, drain off enough to get down to the new limit.
 947                  */
 948                 i = 0;
 949                 while (V_ip6_maxfragpackets >= 0 &&
 950                     atomic_load_int(&V_frag6_nfragpackets) >
 951                     (u_int)V_ip6_maxfragpackets) {
 952                         IP6Q_LOCK(i);
 953                         head = IP6Q_HEAD(i);
 954                         if (head->ip6q_prev != head) {
 955                                 IP6STAT_ADD(ip6s_fragoverflow,
 956                                         q6->ip6q_prev->ip6q_nfrag);
 957                                 /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 958                                 frag6_freef(head->ip6q_prev, i);
 959                         }
 960                         IP6Q_UNLOCK(i);
 961                         i = (i + 1) % IP6REASS_NHASH;
 962                 }
 963                 CURVNET_RESTORE();
 964         }
 965         VNET_LIST_RUNLOCK_NOSLEEP();
 966 }
 967
 968 /*
 969  * Drain off all datagram fragments.
 970  */
 971 void
 972 frag6_drain(void)
 973 {
 974         VNET_ITERATOR_DECL(vnet_iter);
 975         struct ip6q *head;
 976         int i;
 977
 978         VNET_LIST_RLOCK_NOSLEEP();
 979         VNET_FOREACH(vnet_iter) {
 980                 CURVNET_SET(vnet_iter);
 981                 for (i = 0; i < IP6REASS_NHASH; i++) {
 982                         if (IP6Q_TRYLOCK(i) == 0)
 983                                 continue;
 984                         head = IP6Q_HEAD(i);
 985                         while (head->ip6q_next != head) {
 986                                 IP6STAT_INC(ip6s_fragdropped);
 987                                 /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 988                                 frag6_freef(head->ip6q_next, i);
 989                         }
 990                         IP6Q_UNLOCK(i);
 991                 }
 992                 CURVNET_RESTORE();
 993         }
 994         VNET_LIST_RUNLOCK_NOSLEEP();
 995 }
 996
 997 int
 998 ip6_deletefraghdr(struct mbuf *m, int offset, int wait)
 999 {
1000         struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
1001         struct mbuf *t;
1002
1003         /* Delete frag6 header. */
1004         if (m->m_len >= offset + sizeof(struct ip6_frag)) {
1005                 /* This is the only possible case with !PULLDOWN_TEST. */
1006                 bcopy(ip6, (char *)ip6 + sizeof(struct ip6_frag),
1007                     offset);
1008                 m->m_data += sizeof(struct ip6_frag);
1009                 m->m_len -= sizeof(struct ip6_frag);
1010         } else {
1011                 /* This comes with no copy if the boundary is on cluster. */
1012                 if ((t = m_split(m, offset, wait)) == NULL)
1013                         return (ENOMEM);
1014                 m_adj(t, sizeof(struct ip6_frag));
1015                 m_cat(m, t);
1016         }
1017
1018         m->m_flags |= M_FRAGMENTED;
1019         return (0);
1020 }