sys/netpfil/pf/pf.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause
   3  *
   4  * Copyright (c) 2001 Daniel Hartmeier
   5  * Copyright (c) 2002 - 2008 Henning Brauer
   6  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
   7  * All rights reserved.
   8  *
   9  * Redistribution and use in source and binary forms, with or without
  10  * modification, are permitted provided that the following conditions
  11  * are met:
  12  *
  13  *    - Redistributions of source code must retain the above copyright
  14  *      notice, this list of conditions and the following disclaimer.
  15  *    - Redistributions in binary form must reproduce the above
  16  *      copyright notice, this list of conditions and the following
  17  *      disclaimer in the documentation and/or other materials provided
  18  *      with the distribution.
  19  *
  20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  23  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  24  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  26  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  28  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  30  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31  * POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * Effort sponsored in part by the Defense Advanced Research Projects
  34  * Agency (DARPA) and Air Force Research Laboratory, Air Force
  35  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
  36  *
  37  *      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
  38  */
  39
  40 #include <sys/cdefs.h>
  41 __FBSDID("$FreeBSD$");
  42
  43 #include "opt_inet.h"
  44 #include "opt_inet6.h"
  45 #include "opt_bpf.h"
  46 #include "opt_pf.h"
  47 #include "opt_sctp.h"
  48
  49 #include <sys/param.h>
  50 #include <sys/bus.h>
  51 #include <sys/endian.h>
  52 #include <sys/gsb_crc32.h>
  53 #include <sys/hash.h>
  54 #include <sys/interrupt.h>
  55 #include <sys/kernel.h>
  56 #include <sys/kthread.h>
  57 #include <sys/limits.h>
  58 #include <sys/mbuf.h>
  59 #include <sys/md5.h>
  60 #include <sys/random.h>
  61 #include <sys/refcount.h>
  62 #include <sys/socket.h>
  63 #include <sys/sysctl.h>
  64 #include <sys/taskqueue.h>
  65 #include <sys/ucred.h>
  66
  67 #include <net/if.h>
  68 #include <net/if_var.h>
  69 #include <net/if_types.h>
  70 #include <net/if_vlan_var.h>
  71 #include <net/route.h>
  72 #include <net/radix_mpath.h>
  73 #include <net/vnet.h>
  74
  75 #include <net/pfil.h>
  76 #include <net/pfvar.h>
  77 #include <net/if_pflog.h>
  78 #include <net/if_pfsync.h>
  79
  80 #include <netinet/in_pcb.h>
  81 #include <netinet/in_var.h>
  82 #include <netinet/in_fib.h>
  83 #include <netinet/ip.h>
  84 #include <netinet/ip_fw.h>
  85 #include <netinet/ip_icmp.h>
  86 #include <netinet/icmp_var.h>
  87 #include <netinet/ip_var.h>
  88 #include <netinet/tcp.h>
  89 #include <netinet/tcp_fsm.h>
  90 #include <netinet/tcp_seq.h>
  91 #include <netinet/tcp_timer.h>
  92 #include <netinet/tcp_var.h>
  93 #include <netinet/udp.h>
  94 #include <netinet/udp_var.h>
  95
  96 #ifdef INET6
  97 #include <netinet/ip6.h>
  98 #include <netinet/icmp6.h>
  99 #include <netinet6/nd6.h>
 100 #include <netinet6/ip6_var.h>
 101 #include <netinet6/in6_pcb.h>
 102 #include <netinet6/in6_fib.h>
 103 #include <netinet6/scope6_var.h>
 104 #endif /* INET6 */
 105
 106 #ifdef SCTP
 107 #include <netinet/sctp_crc32.h>
 108 #endif
 109
 110 #include <machine/in_cksum.h>
 111 #include <security/mac/mac_framework.h>
 112
 113 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
 114
 115 /*
 116  * Global variables
 117  */
 118
 119 /* state tables */
 120 VNET_DEFINE(struct pf_altqqueue,         pf_altqs[4]);
 121 VNET_DEFINE(struct pf_palist,            pf_pabuf);
 122 VNET_DEFINE(struct pf_altqqueue *,       pf_altqs_active);
 123 VNET_DEFINE(struct pf_altqqueue *,       pf_altq_ifs_active);
 124 VNET_DEFINE(struct pf_altqqueue *,       pf_altqs_inactive);
 125 VNET_DEFINE(struct pf_altqqueue *,       pf_altq_ifs_inactive);
 126 VNET_DEFINE(struct pf_kstatus,           pf_status);
 127
 128 VNET_DEFINE(u_int32_t,                   ticket_altqs_active);
 129 VNET_DEFINE(u_int32_t,                   ticket_altqs_inactive);
 130 VNET_DEFINE(int,                         altqs_inactive_open);
 131 VNET_DEFINE(u_int32_t,                   ticket_pabuf);
 132
 133 VNET_DEFINE(MD5_CTX,                     pf_tcp_secret_ctx);
 134 #define V_pf_tcp_secret_ctx              VNET(pf_tcp_secret_ctx)
 135 VNET_DEFINE(u_char,                      pf_tcp_secret[16]);
 136 #define V_pf_tcp_secret                  VNET(pf_tcp_secret)
 137 VNET_DEFINE(int,                         pf_tcp_secret_init);
 138 #define V_pf_tcp_secret_init             VNET(pf_tcp_secret_init)
 139 VNET_DEFINE(int,                         pf_tcp_iss_off);
 140 #define V_pf_tcp_iss_off                 VNET(pf_tcp_iss_off)
 141 VNET_DECLARE(int,                        pf_vnet_active);
 142 #define V_pf_vnet_active                 VNET(pf_vnet_active)
 143
 144 VNET_DEFINE_STATIC(uint32_t, pf_purge_idx);
 145 #define V_pf_purge_idx  VNET(pf_purge_idx)
 146
 147 /*
 148  * Queue for pf_intr() sends.
 149  */
 150 static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
 151 struct pf_send_entry {
 152         STAILQ_ENTRY(pf_send_entry)     pfse_next;
 153         struct mbuf                     *pfse_m;
 154         enum {
 155                 PFSE_IP,
 156                 PFSE_IP6,
 157                 PFSE_ICMP,
 158                 PFSE_ICMP6,
 159         }                               pfse_type;
 160         struct {
 161                 int             type;
 162                 int             code;
 163                 int             mtu;
 164         } icmpopts;
 165 };
 166
 167 STAILQ_HEAD(pf_send_head, pf_send_entry);
 168 VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue);
 169 #define V_pf_sendqueue  VNET(pf_sendqueue)
 170
 171 static struct mtx pf_sendqueue_mtx;
 172 MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
 173 #define PF_SENDQ_LOCK()         mtx_lock(&pf_sendqueue_mtx)
 174 #define PF_SENDQ_UNLOCK()       mtx_unlock(&pf_sendqueue_mtx)
 175
 176 /*
 177  * Queue for pf_overload_task() tasks.
 178  */
 179 struct pf_overload_entry {
 180         SLIST_ENTRY(pf_overload_entry)  next;
 181         struct pf_addr                  addr;
 182         sa_family_t                     af;
 183         uint8_t                         dir;
 184         struct pf_rule                  *rule;
 185 };
 186
 187 SLIST_HEAD(pf_overload_head, pf_overload_entry);
 188 VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue);
 189 #define V_pf_overloadqueue      VNET(pf_overloadqueue)
 190 VNET_DEFINE_STATIC(struct task, pf_overloadtask);
 191 #define V_pf_overloadtask       VNET(pf_overloadtask)
 192
 193 static struct mtx pf_overloadqueue_mtx;
 194 MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
 195     "pf overload/flush queue", MTX_DEF);
 196 #define PF_OVERLOADQ_LOCK()     mtx_lock(&pf_overloadqueue_mtx)
 197 #define PF_OVERLOADQ_UNLOCK()   mtx_unlock(&pf_overloadqueue_mtx)
 198
 199 VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
 200 struct mtx pf_unlnkdrules_mtx;
 201 MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
 202     MTX_DEF);
 203
 204 VNET_DEFINE_STATIC(uma_zone_t,  pf_sources_z);
 205 #define V_pf_sources_z  VNET(pf_sources_z)
 206 uma_zone_t              pf_mtag_z;
 207 VNET_DEFINE(uma_zone_t,  pf_state_z);
 208 VNET_DEFINE(uma_zone_t,  pf_state_key_z);
 209
 210 VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
 211 #define PFID_CPUBITS    8
 212 #define PFID_CPUSHIFT   (sizeof(uint64_t) * NBBY - PFID_CPUBITS)
 213 #define PFID_CPUMASK    ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
 214 #define PFID_MAXID      (~PFID_CPUMASK)
 215 CTASSERT((1 << PFID_CPUBITS) >= MAXCPU);
 216
 217 static void              pf_src_tree_remove_state(struct pf_state *);
 218 static void              pf_init_threshold(struct pf_threshold *, u_int32_t,
 219                             u_int32_t);
 220 static void              pf_add_threshold(struct pf_threshold *);
 221 static int               pf_check_threshold(struct pf_threshold *);
 222
 223 static void              pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
 224                             u_int16_t *, u_int16_t *, struct pf_addr *,
 225                             u_int16_t, u_int8_t, sa_family_t);
 226 static int               pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
 227                             struct tcphdr *, struct pf_state_peer *);
 228 static void              pf_change_icmp(struct pf_addr *, u_int16_t *,
 229                             struct pf_addr *, struct pf_addr *, u_int16_t,
 230                             u_int16_t *, u_int16_t *, u_int16_t *,
 231                             u_int16_t *, u_int8_t, sa_family_t);
 232 static void              pf_send_tcp(struct mbuf *,
 233                             const struct pf_rule *, sa_family_t,
 234                             const struct pf_addr *, const struct pf_addr *,
 235                             u_int16_t, u_int16_t, u_int32_t, u_int32_t,
 236                             u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
 237                             u_int16_t, struct ifnet *);
 238 static void              pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
 239                             sa_family_t, struct pf_rule *);
 240 static void              pf_detach_state(struct pf_state *);
 241 static int               pf_state_key_attach(struct pf_state_key *,
 242                             struct pf_state_key *, struct pf_state *);
 243 static void              pf_state_key_detach(struct pf_state *, int);
 244 static int               pf_state_key_ctor(void *, int, void *, int);
 245 static u_int32_t         pf_tcp_iss(struct pf_pdesc *);
 246 static int               pf_test_rule(struct pf_rule **, struct pf_state **,
 247                             int, struct pfi_kif *, struct mbuf *, int,
 248                             struct pf_pdesc *, struct pf_rule **,
 249                             struct pf_ruleset **, struct inpcb *);
 250 static int               pf_create_state(struct pf_rule *, struct pf_rule *,
 251                             struct pf_rule *, struct pf_pdesc *,
 252                             struct pf_src_node *, struct pf_state_key *,
 253                             struct pf_state_key *, struct mbuf *, int,
 254                             u_int16_t, u_int16_t, int *, struct pfi_kif *,
 255                             struct pf_state **, int, u_int16_t, u_int16_t,
 256                             int);
 257 static int               pf_test_fragment(struct pf_rule **, int,
 258                             struct pfi_kif *, struct mbuf *, void *,
 259                             struct pf_pdesc *, struct pf_rule **,
 260                             struct pf_ruleset **);
 261 static int               pf_tcp_track_full(struct pf_state_peer *,
 262                             struct pf_state_peer *, struct pf_state **,
 263                             struct pfi_kif *, struct mbuf *, int,
 264                             struct pf_pdesc *, u_short *, int *);
 265 static int               pf_tcp_track_sloppy(struct pf_state_peer *,
 266                             struct pf_state_peer *, struct pf_state **,
 267                             struct pf_pdesc *, u_short *);
 268 static int               pf_test_state_tcp(struct pf_state **, int,
 269                             struct pfi_kif *, struct mbuf *, int,
 270                             void *, struct pf_pdesc *, u_short *);
 271 static int               pf_test_state_udp(struct pf_state **, int,
 272                             struct pfi_kif *, struct mbuf *, int,
 273                             void *, struct pf_pdesc *);
 274 static int               pf_test_state_icmp(struct pf_state **, int,
 275                             struct pfi_kif *, struct mbuf *, int,
 276                             void *, struct pf_pdesc *, u_short *);
 277 static int               pf_test_state_other(struct pf_state **, int,
 278                             struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
 279 static u_int8_t          pf_get_wscale(struct mbuf *, int, u_int16_t,
 280                             sa_family_t);
 281 static u_int16_t         pf_get_mss(struct mbuf *, int, u_int16_t,
 282                             sa_family_t);
 283 static u_int16_t         pf_calc_mss(struct pf_addr *, sa_family_t,
 284                                 int, u_int16_t);
 285 static int               pf_check_proto_cksum(struct mbuf *, int, int,
 286                             u_int8_t, sa_family_t);
 287 static void              pf_print_state_parts(struct pf_state *,
 288                             struct pf_state_key *, struct pf_state_key *);
 289 static int               pf_addr_wrap_neq(struct pf_addr_wrap *,
 290                             struct pf_addr_wrap *);
 291 static struct pf_state  *pf_find_state(struct pfi_kif *,
 292                             struct pf_state_key_cmp *, u_int);
 293 static int               pf_src_connlimit(struct pf_state **);
 294 static void              pf_overload_task(void *v, int pending);
 295 static int               pf_insert_src_node(struct pf_src_node **,
 296                             struct pf_rule *, struct pf_addr *, sa_family_t);
 297 static u_int             pf_purge_expired_states(u_int, int);
 298 static void              pf_purge_unlinked_rules(void);
 299 static int               pf_mtag_uminit(void *, int, int);
 300 static void              pf_mtag_free(struct m_tag *);
 301 #ifdef INET
 302 static void              pf_route(struct mbuf **, struct pf_rule *, int,
 303                             struct ifnet *, struct pf_state *,
 304                             struct pf_pdesc *, struct inpcb *);
 305 #endif /* INET */
 306 #ifdef INET6
 307 static void              pf_change_a6(struct pf_addr *, u_int16_t *,
 308                             struct pf_addr *, u_int8_t);
 309 static void              pf_route6(struct mbuf **, struct pf_rule *, int,
 310                             struct ifnet *, struct pf_state *,
 311                             struct pf_pdesc *, struct inpcb *);
 312 #endif /* INET6 */
 313
 314 int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
 315
 316 extern int pf_end_threads;
 317 extern struct proc *pf_purge_proc;
 318
 319 VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
 320
 321 #define PACKET_LOOPED(pd)       ((pd)->pf_mtag &&                       \
 322                                  (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
 323
 324 #define STATE_LOOKUP(i, k, d, s, pd)                                    \
 325         do {                                                            \
 326                 (s) = pf_find_state((i), (k), (d));                     \
 327                 if ((s) == NULL)                                        \
 328                         return (PF_DROP);                               \
 329                 if (PACKET_LOOPED(pd))                                  \
 330                         return (PF_PASS);                               \
 331                 if ((d) == PF_OUT &&                                    \
 332                     (((s)->rule.ptr->rt == PF_ROUTETO &&                \
 333                     (s)->rule.ptr->direction == PF_OUT) ||              \
 334                     ((s)->rule.ptr->rt == PF_REPLYTO &&                 \
 335                     (s)->rule.ptr->direction == PF_IN)) &&              \
 336                     (s)->rt_kif != NULL &&                              \
 337                     (s)->rt_kif != (i))                                 \
 338                         return (PF_PASS);                               \
 339         } while (0)
 340
 341 #define BOUND_IFACE(r, k) \
 342         ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all
 343
 344 #define STATE_INC_COUNTERS(s)                                           \
 345         do {                                                            \
 346                 counter_u64_add(s->rule.ptr->states_cur, 1);            \
 347                 counter_u64_add(s->rule.ptr->states_tot, 1);            \
 348                 if (s->anchor.ptr != NULL) {                            \
 349                         counter_u64_add(s->anchor.ptr->states_cur, 1);  \
 350                         counter_u64_add(s->anchor.ptr->states_tot, 1);  \
 351                 }                                                       \
 352                 if (s->nat_rule.ptr != NULL) {                          \
 353                         counter_u64_add(s->nat_rule.ptr->states_cur, 1);\
 354                         counter_u64_add(s->nat_rule.ptr->states_tot, 1);\
 355                 }                                                       \
 356         } while (0)
 357
 358 #define STATE_DEC_COUNTERS(s)                                           \
 359         do {                                                            \
 360                 if (s->nat_rule.ptr != NULL)                            \
 361                         counter_u64_add(s->nat_rule.ptr->states_cur, -1);\
 362                 if (s->anchor.ptr != NULL)                              \
 363                         counter_u64_add(s->anchor.ptr->states_cur, -1); \
 364                 counter_u64_add(s->rule.ptr->states_cur, -1);           \
 365         } while (0)
 366
 367 MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
 368 VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
 369 VNET_DEFINE(struct pf_idhash *, pf_idhash);
 370 VNET_DEFINE(struct pf_srchash *, pf_srchash);
 371
 372 SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");
 373
 374 u_long  pf_hashmask;
 375 u_long  pf_srchashmask;
 376 static u_long   pf_hashsize;
 377 static u_long   pf_srchashsize;
 378 u_long  pf_ioctl_maxcount = 65535;
 379
 380 SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
 381     &pf_hashsize, 0, "Size of pf(4) states hashtable");
 382 SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
 383     &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");
 384 SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RW,
 385     &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call");
 386
 387 VNET_DEFINE(void *, pf_swi_cookie);
 388
 389 VNET_DEFINE(uint32_t, pf_hashseed);
 390 #define V_pf_hashseed   VNET(pf_hashseed)
 391
 392 int
 393 pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 394 {
 395
 396         switch (af) {
 397 #ifdef INET
 398         case AF_INET:
 399                 if (a->addr32[0] > b->addr32[0])
 400                         return (1);
 401                 if (a->addr32[0] < b->addr32[0])
 402                         return (-1);
 403                 break;
 404 #endif /* INET */
 405 #ifdef INET6
 406         case AF_INET6:
 407                 if (a->addr32[3] > b->addr32[3])
 408                         return (1);
 409                 if (a->addr32[3] < b->addr32[3])
 410                         return (-1);
 411                 if (a->addr32[2] > b->addr32[2])
 412                         return (1);
 413                 if (a->addr32[2] < b->addr32[2])
 414                         return (-1);
 415                 if (a->addr32[1] > b->addr32[1])
 416                         return (1);
 417                 if (a->addr32[1] < b->addr32[1])
 418                         return (-1);
 419                 if (a->addr32[0] > b->addr32[0])
 420                         return (1);
 421                 if (a->addr32[0] < b->addr32[0])
 422                         return (-1);
 423                 break;
 424 #endif /* INET6 */
 425         default:
 426                 panic("%s: unknown address family %u", __func__, af);
 427         }
 428         return (0);
 429 }
 430
 431 static __inline uint32_t
 432 pf_hashkey(struct pf_state_key *sk)
 433 {
 434         uint32_t h;
 435
 436         h = murmur3_32_hash32((uint32_t *)sk,
 437             sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
 438             V_pf_hashseed);
 439
 440         return (h & pf_hashmask);
 441 }
 442
 443 static __inline uint32_t
 444 pf_hashsrc(struct pf_addr *addr, sa_family_t af)
 445 {
 446         uint32_t h;
 447
 448         switch (af) {
 449         case AF_INET:
 450                 h = murmur3_32_hash32((uint32_t *)&addr->v4,
 451                     sizeof(addr->v4)/sizeof(uint32_t), V_pf_hashseed);
 452                 break;
 453         case AF_INET6:
 454                 h = murmur3_32_hash32((uint32_t *)&addr->v6,
 455                     sizeof(addr->v6)/sizeof(uint32_t), V_pf_hashseed);
 456                 break;
 457         default:
 458                 panic("%s: unknown address family %u", __func__, af);
 459         }
 460
 461         return (h & pf_srchashmask);
 462 }
 463
 464 #ifdef ALTQ
 465 static int
 466 pf_state_hash(struct pf_state *s)
 467 {
 468         u_int32_t hv = (intptr_t)s / sizeof(*s);
 469
 470         hv ^= crc32(&s->src, sizeof(s->src));
 471         hv ^= crc32(&s->dst, sizeof(s->dst));
 472         if (hv == 0)
 473                 hv = 1;
 474         return (hv);
 475 }
 476 #endif
 477
 478 #ifdef INET6
 479 void
 480 pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
 481 {
 482         switch (af) {
 483 #ifdef INET
 484         case AF_INET:
 485                 dst->addr32[0] = src->addr32[0];
 486                 break;
 487 #endif /* INET */
 488         case AF_INET6:
 489                 dst->addr32[0] = src->addr32[0];
 490                 dst->addr32[1] = src->addr32[1];
 491                 dst->addr32[2] = src->addr32[2];
 492                 dst->addr32[3] = src->addr32[3];
 493                 break;
 494         }
 495 }
 496 #endif /* INET6 */
 497
 498 static void
 499 pf_init_threshold(struct pf_threshold *threshold,
 500     u_int32_t limit, u_int32_t seconds)
 501 {
 502         threshold->limit = limit * PF_THRESHOLD_MULT;
 503         threshold->seconds = seconds;
 504         threshold->count = 0;
 505         threshold->last = time_uptime;
 506 }
 507
 508 static void
 509 pf_add_threshold(struct pf_threshold *threshold)
 510 {
 511         u_int32_t t = time_uptime, diff = t - threshold->last;
 512
 513         if (diff >= threshold->seconds)
 514                 threshold->count = 0;
 515         else
 516                 threshold->count -= threshold->count * diff /
 517                     threshold->seconds;
 518         threshold->count += PF_THRESHOLD_MULT;
 519         threshold->last = t;
 520 }
 521
 522 static int
 523 pf_check_threshold(struct pf_threshold *threshold)
 524 {
 525         return (threshold->count > threshold->limit);
 526 }
 527
 528 static int
 529 pf_src_connlimit(struct pf_state **state)
 530 {
 531         struct pf_overload_entry *pfoe;
 532         int bad = 0;
 533
 534         PF_STATE_LOCK_ASSERT(*state);
 535
 536         (*state)->src_node->conn++;
 537         (*state)->src.tcp_est = 1;
 538         pf_add_threshold(&(*state)->src_node->conn_rate);
 539
 540         if ((*state)->rule.ptr->max_src_conn &&
 541             (*state)->rule.ptr->max_src_conn <
 542             (*state)->src_node->conn) {
 543                 counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
 544                 bad++;
 545         }
 546
 547         if ((*state)->rule.ptr->max_src_conn_rate.limit &&
 548             pf_check_threshold(&(*state)->src_node->conn_rate)) {
 549                 counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
 550                 bad++;
 551         }
 552
 553         if (!bad)
 554                 return (0);
 555
 556         /* Kill this state. */
 557         (*state)->timeout = PFTM_PURGE;
 558         (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
 559
 560         if ((*state)->rule.ptr->overload_tbl == NULL)
 561                 return (1);
 562
 563         /* Schedule overloading and flushing task. */
 564         pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
 565         if (pfoe == NULL)
 566                 return (1);     /* too bad :( */
 567
 568         bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
 569         pfoe->af = (*state)->key[PF_SK_WIRE]->af;
 570         pfoe->rule = (*state)->rule.ptr;
 571         pfoe->dir = (*state)->direction;
 572         PF_OVERLOADQ_LOCK();
 573         SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
 574         PF_OVERLOADQ_UNLOCK();
 575         taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
 576
 577         return (1);
 578 }
 579
 580 static void
 581 pf_overload_task(void *v, int pending)
 582 {
 583         struct pf_overload_head queue;
 584         struct pfr_addr p;
 585         struct pf_overload_entry *pfoe, *pfoe1;
 586         uint32_t killed = 0;
 587
 588         CURVNET_SET((struct vnet *)v);
 589
 590         PF_OVERLOADQ_LOCK();
 591         queue = V_pf_overloadqueue;
 592         SLIST_INIT(&V_pf_overloadqueue);
 593         PF_OVERLOADQ_UNLOCK();
 594
 595         bzero(&p, sizeof(p));
 596         SLIST_FOREACH(pfoe, &queue, next) {
 597                 counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
 598                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
 599                         printf("%s: blocking address ", __func__);
 600                         pf_print_host(&pfoe->addr, 0, pfoe->af);
 601                         printf("\n");
 602                 }
 603
 604                 p.pfra_af = pfoe->af;
 605                 switch (pfoe->af) {
 606 #ifdef INET
 607                 case AF_INET:
 608                         p.pfra_net = 32;
 609                         p.pfra_ip4addr = pfoe->addr.v4;
 610                         break;
 611 #endif
 612 #ifdef INET6
 613                 case AF_INET6:
 614                         p.pfra_net = 128;
 615                         p.pfra_ip6addr = pfoe->addr.v6;
 616                         break;
 617 #endif
 618                 }
 619
 620                 PF_RULES_WLOCK();
 621                 pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
 622                 PF_RULES_WUNLOCK();
 623         }
 624
 625         /*
 626          * Remove those entries, that don't need flushing.
 627          */
 628         SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 629                 if (pfoe->rule->flush == 0) {
 630                         SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
 631                         free(pfoe, M_PFTEMP);
 632                 } else
 633                         counter_u64_add(
 634                             V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
 635
 636         /* If nothing to flush, return. */
 637         if (SLIST_EMPTY(&queue)) {
 638                 CURVNET_RESTORE();
 639                 return;
 640         }
 641
 642         for (int i = 0; i <= pf_hashmask; i++) {
 643                 struct pf_idhash *ih = &V_pf_idhash[i];
 644                 struct pf_state_key *sk;
 645                 struct pf_state *s;
 646
 647                 PF_HASHROW_LOCK(ih);
 648                 LIST_FOREACH(s, &ih->states, entry) {
 649                     sk = s->key[PF_SK_WIRE];
 650                     SLIST_FOREACH(pfoe, &queue, next)
 651                         if (sk->af == pfoe->af &&
 652                             ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
 653                             pfoe->rule == s->rule.ptr) &&
 654                             ((pfoe->dir == PF_OUT &&
 655                             PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
 656                             (pfoe->dir == PF_IN &&
 657                             PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
 658                                 s->timeout = PFTM_PURGE;
 659                                 s->src.state = s->dst.state = TCPS_CLOSED;
 660                                 killed++;
 661                         }
 662                 }
 663                 PF_HASHROW_UNLOCK(ih);
 664         }
 665         SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 666                 free(pfoe, M_PFTEMP);
 667         if (V_pf_status.debug >= PF_DEBUG_MISC)
 668                 printf("%s: %u states killed", __func__, killed);
 669
 670         CURVNET_RESTORE();
 671 }
 672
 673 /*
 674  * Can return locked on failure, so that we can consistently
 675  * allocate and insert a new one.
 676  */
 677 struct pf_src_node *
 678 pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
 679         int returnlocked)
 680 {
 681         struct pf_srchash *sh;
 682         struct pf_src_node *n;
 683
 684         counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
 685
 686         sh = &V_pf_srchash[pf_hashsrc(src, af)];
 687         PF_HASHROW_LOCK(sh);
 688         LIST_FOREACH(n, &sh->nodes, entry)
 689                 if (n->rule.ptr == rule && n->af == af &&
 690                     ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
 691                     (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
 692                         break;
 693         if (n != NULL) {
 694                 n->states++;
 695                 PF_HASHROW_UNLOCK(sh);
 696         } else if (returnlocked == 0)
 697                 PF_HASHROW_UNLOCK(sh);
 698
 699         return (n);
 700 }
 701
 702 static int
 703 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
 704     struct pf_addr *src, sa_family_t af)
 705 {
 706
 707         KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
 708             rule->rpool.opts & PF_POOL_STICKYADDR),
 709             ("%s for non-tracking rule %p", __func__, rule));
 710
 711         if (*sn == NULL)
 712                 *sn = pf_find_src_node(src, rule, af, 1);
 713
 714         if (*sn == NULL) {
 715                 struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
 716
 717                 PF_HASHROW_ASSERT(sh);
 718
 719                 if (!rule->max_src_nodes ||
 720                     counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
 721                         (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
 722                 else
 723                         counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
 724                             1);
 725                 if ((*sn) == NULL) {
 726                         PF_HASHROW_UNLOCK(sh);
 727                         return (-1);
 728                 }
 729
 730                 pf_init_threshold(&(*sn)->conn_rate,
 731                     rule->max_src_conn_rate.limit,
 732                     rule->max_src_conn_rate.seconds);
 733
 734                 (*sn)->af = af;
 735                 (*sn)->rule.ptr = rule;
 736                 PF_ACPY(&(*sn)->addr, src, af);
 737                 LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
 738                 (*sn)->creation = time_uptime;
 739                 (*sn)->ruletype = rule->action;
 740                 (*sn)->states = 1;
 741                 if ((*sn)->rule.ptr != NULL)
 742                         counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
 743                 PF_HASHROW_UNLOCK(sh);
 744                 counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
 745         } else {
 746                 if (rule->max_src_states &&
 747                     (*sn)->states >= rule->max_src_states) {
 748                         counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
 749                             1);
 750                         return (-1);
 751                 }
 752         }
 753         return (0);
 754 }
 755
 756 void
 757 pf_unlink_src_node(struct pf_src_node *src)
 758 {
 759
 760         PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]);
 761         LIST_REMOVE(src, entry);
 762         if (src->rule.ptr)
 763                 counter_u64_add(src->rule.ptr->src_nodes, -1);
 764 }
 765
 766 u_int
 767 pf_free_src_nodes(struct pf_src_node_list *head)
 768 {
 769         struct pf_src_node *sn, *tmp;
 770         u_int count = 0;
 771
 772         LIST_FOREACH_SAFE(sn, head, entry, tmp) {
 773                 uma_zfree(V_pf_sources_z, sn);
 774                 count++;
 775         }
 776
 777         counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);
 778
 779         return (count);
 780 }
 781
 782 void
 783 pf_mtag_initialize()
 784 {
 785
 786         pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
 787             sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
 788             UMA_ALIGN_PTR, 0);
 789 }
 790
 791 /* Per-vnet data storage structures initialization. */
 792 void
 793 pf_initialize()
 794 {
 795         struct pf_keyhash       *kh;
 796         struct pf_idhash        *ih;
 797         struct pf_srchash       *sh;
 798         u_int i;
 799
 800         if (pf_hashsize == 0 || !powerof2(pf_hashsize))
 801                 pf_hashsize = PF_HASHSIZ;
 802         if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
 803                 pf_srchashsize = PF_SRCHASHSIZ;
 804
 805         V_pf_hashseed = arc4random();
 806
 807         /* States and state keys storage. */
 808         V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
 809             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 810         V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
 811         uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
 812         uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
 813
 814         V_pf_state_key_z = uma_zcreate("pf state keys",
 815             sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
 816             UMA_ALIGN_PTR, 0);
 817
 818         V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
 819             M_PFHASH, M_NOWAIT | M_ZERO);
 820         V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
 821             M_PFHASH, M_NOWAIT | M_ZERO);
 822         if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
 823                 printf("pf: Unable to allocate memory for "
 824                     "state_hashsize %lu.\n", pf_hashsize);
 825
 826                 free(V_pf_keyhash, M_PFHASH);
 827                 free(V_pf_idhash, M_PFHASH);
 828
 829                 pf_hashsize = PF_HASHSIZ;
 830                 V_pf_keyhash = mallocarray(pf_hashsize,
 831                     sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
 832                 V_pf_idhash = mallocarray(pf_hashsize,
 833                     sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
 834         }
 835
 836         pf_hashmask = pf_hashsize - 1;
 837         for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 838             i++, kh++, ih++) {
 839                 mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
 840                 mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
 841         }
 842
 843         /* Source nodes. */
 844         V_pf_sources_z = uma_zcreate("pf source nodes",
 845             sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 846             0);
 847         V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
 848         uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
 849         uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
 850
 851         V_pf_srchash = mallocarray(pf_srchashsize,
 852             sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
 853         if (V_pf_srchash == NULL) {
 854                 printf("pf: Unable to allocate memory for "
 855                     "source_hashsize %lu.\n", pf_srchashsize);
 856
 857                 pf_srchashsize = PF_SRCHASHSIZ;
 858                 V_pf_srchash = mallocarray(pf_srchashsize,
 859                     sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
 860         }
 861
 862         pf_srchashmask = pf_srchashsize - 1;
 863         for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
 864                 mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
 865
 866         /* ALTQ */
 867         TAILQ_INIT(&V_pf_altqs[0]);
 868         TAILQ_INIT(&V_pf_altqs[1]);
 869         TAILQ_INIT(&V_pf_altqs[2]);
 870         TAILQ_INIT(&V_pf_altqs[3]);
 871         TAILQ_INIT(&V_pf_pabuf);
 872         V_pf_altqs_active = &V_pf_altqs[0];
 873         V_pf_altq_ifs_active = &V_pf_altqs[1];
 874         V_pf_altqs_inactive = &V_pf_altqs[2];
 875         V_pf_altq_ifs_inactive = &V_pf_altqs[3];
 876
 877         /* Send & overload+flush queues. */
 878         STAILQ_INIT(&V_pf_sendqueue);
 879         SLIST_INIT(&V_pf_overloadqueue);
 880         TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
 881
 882         /* Unlinked, but may be referenced rules. */
 883         TAILQ_INIT(&V_pf_unlinked_rules);
 884 }
 885
 886 void
 887 pf_mtag_cleanup()
 888 {
 889
 890         uma_zdestroy(pf_mtag_z);
 891 }
 892
 893 void
 894 pf_cleanup()
 895 {
 896         struct pf_keyhash       *kh;
 897         struct pf_idhash        *ih;
 898         struct pf_srchash       *sh;
 899         struct pf_send_entry    *pfse, *next;
 900         u_int i;
 901
 902         for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 903             i++, kh++, ih++) {
 904                 KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
 905                     __func__));
 906                 KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
 907                     __func__));
 908                 mtx_destroy(&kh->lock);
 909                 mtx_destroy(&ih->lock);
 910         }
 911         free(V_pf_keyhash, M_PFHASH);
 912         free(V_pf_idhash, M_PFHASH);
 913
 914         for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
 915                 KASSERT(LIST_EMPTY(&sh->nodes),
 916                     ("%s: source node hash not empty", __func__));
 917                 mtx_destroy(&sh->lock);
 918         }
 919         free(V_pf_srchash, M_PFHASH);
 920
 921         STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
 922                 m_freem(pfse->pfse_m);
 923                 free(pfse, M_PFTEMP);
 924         }
 925
 926         uma_zdestroy(V_pf_sources_z);
 927         uma_zdestroy(V_pf_state_z);
 928         uma_zdestroy(V_pf_state_key_z);
 929 }
 930
 931 static int
 932 pf_mtag_uminit(void *mem, int size, int how)
 933 {
 934         struct m_tag *t;
 935
 936         t = (struct m_tag *)mem;
 937         t->m_tag_cookie = MTAG_ABI_COMPAT;
 938         t->m_tag_id = PACKET_TAG_PF;
 939         t->m_tag_len = sizeof(struct pf_mtag);
 940         t->m_tag_free = pf_mtag_free;
 941
 942         return (0);
 943 }
 944
 945 static void
 946 pf_mtag_free(struct m_tag *t)
 947 {
 948
 949         uma_zfree(pf_mtag_z, t);
 950 }
 951
 952 struct pf_mtag *
 953 pf_get_mtag(struct mbuf *m)
 954 {
 955         struct m_tag *mtag;
 956
 957         if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
 958                 return ((struct pf_mtag *)(mtag + 1));
 959
 960         mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
 961         if (mtag == NULL)
 962                 return (NULL);
 963         bzero(mtag + 1, sizeof(struct pf_mtag));
 964         m_tag_prepend(m, mtag);
 965
 966         return ((struct pf_mtag *)(mtag + 1));
 967 }
 968
 969 static int
 970 pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
 971     struct pf_state *s)
 972 {
 973         struct pf_keyhash       *khs, *khw, *kh;
 974         struct pf_state_key     *sk, *cur;
 975         struct pf_state         *si, *olds = NULL;
 976         int idx;
 977
 978         KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
 979         KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
 980         KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
 981
 982         /*
 983          * We need to lock hash slots of both keys. To avoid deadlock
 984          * we always lock the slot with lower address first. Unlock order
 985          * isn't important.
 986          *
 987          * We also need to lock ID hash slot before dropping key
 988          * locks. On success we return with ID hash slot locked.
 989          */
 990
 991         if (skw == sks) {
 992                 khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
 993                 PF_HASHROW_LOCK(khs);
 994         } else {
 995                 khs = &V_pf_keyhash[pf_hashkey(sks)];
 996                 khw = &V_pf_keyhash[pf_hashkey(skw)];
 997                 if (khs == khw) {
 998                         PF_HASHROW_LOCK(khs);
 999                 } else if (khs < khw) {
1000                         PF_HASHROW_LOCK(khs);
1001                         PF_HASHROW_LOCK(khw);
1002                 } else {
1003                         PF_HASHROW_LOCK(khw);
1004                         PF_HASHROW_LOCK(khs);
1005                 }
1006         }
1007
1008 #define KEYS_UNLOCK()   do {                    \
1009         if (khs != khw) {                       \
1010                 PF_HASHROW_UNLOCK(khs);         \
1011                 PF_HASHROW_UNLOCK(khw);         \
1012         } else                                  \
1013                 PF_HASHROW_UNLOCK(khs);         \
1014 } while (0)
1015
1016         /*
1017          * First run: start with wire key.
1018          */
1019         sk = skw;
1020         kh = khw;
1021         idx = PF_SK_WIRE;
1022
1023 keyattach:
1024         LIST_FOREACH(cur, &kh->keys, entry)
1025                 if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
1026                         break;
1027
1028         if (cur != NULL) {
1029                 /* Key exists. Check for same kif, if none, add to key. */
1030                 TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
1031                         struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
1032
1033                         PF_HASHROW_LOCK(ih);
1034                         if (si->kif == s->kif &&
1035                             si->direction == s->direction) {
1036                                 if (sk->proto == IPPROTO_TCP &&
1037                                     si->src.state >= TCPS_FIN_WAIT_2 &&
1038                                     si->dst.state >= TCPS_FIN_WAIT_2) {
1039                                         /*
1040                                          * New state matches an old >FIN_WAIT_2
1041                                          * state. We can't drop key hash locks,
1042                                          * thus we can't unlink it properly.
1043                                          *
1044                                          * As a workaround we drop it into
1045                                          * TCPS_CLOSED state, schedule purge
1046                                          * ASAP and push it into the very end
1047                                          * of the slot TAILQ, so that it won't
1048                                          * conflict with our new state.
1049                                          */
1050                                         si->src.state = si->dst.state =
1051                                             TCPS_CLOSED;
1052                                         si->timeout = PFTM_PURGE;
1053                                         olds = si;
1054                                 } else {
1055                                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
1056                                                 printf("pf: %s key attach "
1057                                                     "failed on %s: ",
1058                                                     (idx == PF_SK_WIRE) ?
1059                                                     "wire" : "stack",
1060                                                     s->kif->pfik_name);
1061                                                 pf_print_state_parts(s,
1062                                                     (idx == PF_SK_WIRE) ?
1063                                                     sk : NULL,
1064                                                     (idx == PF_SK_STACK) ?
1065                                                     sk : NULL);
1066                                                 printf(", existing: ");
1067                                                 pf_print_state_parts(si,
1068                                                     (idx == PF_SK_WIRE) ?
1069                                                     sk : NULL,
1070                                                     (idx == PF_SK_STACK) ?
1071                                                     sk : NULL);
1072                                                 printf("\n");
1073                                         }
1074                                         PF_HASHROW_UNLOCK(ih);
1075                                         KEYS_UNLOCK();
1076                                         uma_zfree(V_pf_state_key_z, sk);
1077                                         if (idx == PF_SK_STACK)
1078                                                 pf_detach_state(s);
1079                                         return (EEXIST); /* collision! */
1080                                 }
1081                         }
1082                         PF_HASHROW_UNLOCK(ih);
1083                 }
1084                 uma_zfree(V_pf_state_key_z, sk);
1085                 s->key[idx] = cur;
1086         } else {
1087                 LIST_INSERT_HEAD(&kh->keys, sk, entry);
1088                 s->key[idx] = sk;
1089         }
1090
1091 stateattach:
1092         /* List is sorted, if-bound states before floating. */
1093         if (s->kif == V_pfi_all)
1094                 TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
1095         else
1096                 TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
1097
1098         if (olds) {
1099                 TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
1100                 TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
1101                     key_list[idx]);
1102                 olds = NULL;
1103         }
1104
1105         /*
1106          * Attach done. See how should we (or should not?)
1107          * attach a second key.
1108          */
1109         if (sks == skw) {
1110                 s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
1111                 idx = PF_SK_STACK;
1112                 sks = NULL;
1113                 goto stateattach;
1114         } else if (sks != NULL) {
1115                 /*
1116                  * Continue attaching with stack key.
1117                  */
1118                 sk = sks;
1119                 kh = khs;
1120                 idx = PF_SK_STACK;
1121                 sks = NULL;
1122                 goto keyattach;
1123         }
1124
1125         PF_STATE_LOCK(s);
1126         KEYS_UNLOCK();
1127
1128         KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
1129             ("%s failure", __func__));
1130
1131         return (0);
1132 #undef  KEYS_UNLOCK
1133 }
1134
1135 static void
1136 pf_detach_state(struct pf_state *s)
1137 {
1138         struct pf_state_key *sks = s->key[PF_SK_STACK];
1139         struct pf_keyhash *kh;
1140
1141         if (sks != NULL) {
1142                 kh = &V_pf_keyhash[pf_hashkey(sks)];
1143                 PF_HASHROW_LOCK(kh);
1144                 if (s->key[PF_SK_STACK] != NULL)
1145                         pf_state_key_detach(s, PF_SK_STACK);
1146                 /*
1147                  * If both point to same key, then we are done.
1148                  */
1149                 if (sks == s->key[PF_SK_WIRE]) {
1150                         pf_state_key_detach(s, PF_SK_WIRE);
1151                         PF_HASHROW_UNLOCK(kh);
1152                         return;
1153                 }
1154                 PF_HASHROW_UNLOCK(kh);
1155         }
1156
1157         if (s->key[PF_SK_WIRE] != NULL) {
1158                 kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
1159                 PF_HASHROW_LOCK(kh);
1160                 if (s->key[PF_SK_WIRE] != NULL)
1161                         pf_state_key_detach(s, PF_SK_WIRE);
1162                 PF_HASHROW_UNLOCK(kh);
1163         }
1164 }
1165
1166 static void
1167 pf_state_key_detach(struct pf_state *s, int idx)
1168 {
1169         struct pf_state_key *sk = s->key[idx];
1170 #ifdef INVARIANTS
1171         struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
1172
1173         PF_HASHROW_ASSERT(kh);
1174 #endif
1175         TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
1176         s->key[idx] = NULL;
1177
1178         if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
1179                 LIST_REMOVE(sk, entry);
1180                 uma_zfree(V_pf_state_key_z, sk);
1181         }
1182 }
1183
1184 static int
1185 pf_state_key_ctor(void *mem, int size, void *arg, int flags)
1186 {
1187         struct pf_state_key *sk = mem;
1188
1189         bzero(sk, sizeof(struct pf_state_key_cmp));
1190         TAILQ_INIT(&sk->states[PF_SK_WIRE]);
1191         TAILQ_INIT(&sk->states[PF_SK_STACK]);
1192
1193         return (0);
1194 }
1195
1196 struct pf_state_key *
1197 pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
1198         struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
1199 {
1200         struct pf_state_key *sk;
1201
1202         sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1203         if (sk == NULL)
1204                 return (NULL);
1205
1206         PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
1207         PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
1208         sk->port[pd->sidx] = sport;
1209         sk->port[pd->didx] = dport;
1210         sk->proto = pd->proto;
1211         sk->af = pd->af;
1212
1213         return (sk);
1214 }
1215
1216 struct pf_state_key *
1217 pf_state_key_clone(struct pf_state_key *orig)
1218 {
1219         struct pf_state_key *sk;
1220
1221         sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1222         if (sk == NULL)
1223                 return (NULL);
1224
1225         bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
1226
1227         return (sk);
1228 }
1229
1230 int
1231 pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
1232     struct pf_state_key *sks, struct pf_state *s)
1233 {
1234         struct pf_idhash *ih;
1235         struct pf_state *cur;
1236         int error;
1237
1238         KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
1239             ("%s: sks not pristine", __func__));
1240         KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
1241             ("%s: skw not pristine", __func__));
1242         KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
1243
1244         s->kif = kif;
1245
1246         if (s->id == 0 && s->creatorid == 0) {
1247                 /* XXX: should be atomic, but probability of collision low */
1248                 if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
1249                         V_pf_stateid[curcpu] = 1;
1250                 s->id |= (uint64_t )curcpu << PFID_CPUSHIFT;
1251                 s->id = htobe64(s->id);
1252                 s->creatorid = V_pf_status.hostid;
1253         }
1254
1255         /* Returns with ID locked on success. */
1256         if ((error = pf_state_key_attach(skw, sks, s)) != 0)
1257                 return (error);
1258
1259         ih = &V_pf_idhash[PF_IDHASH(s)];
1260         PF_HASHROW_ASSERT(ih);
1261         LIST_FOREACH(cur, &ih->states, entry)
1262                 if (cur->id == s->id && cur->creatorid == s->creatorid)
1263                         break;
1264
1265         if (cur != NULL) {
1266                 PF_HASHROW_UNLOCK(ih);
1267                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
1268                         printf("pf: state ID collision: "
1269                             "id: %016llx creatorid: %08x\n",
1270                             (unsigned long long)be64toh(s->id),
1271                             ntohl(s->creatorid));
1272                 }
1273                 pf_detach_state(s);
1274                 return (EEXIST);
1275         }
1276         LIST_INSERT_HEAD(&ih->states, s, entry);
1277         /* One for keys, one for ID hash. */
1278         refcount_init(&s->refs, 2);
1279
1280         counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
1281         if (V_pfsync_insert_state_ptr != NULL)
1282                 V_pfsync_insert_state_ptr(s);
1283
1284         /* Returns locked. */
1285         return (0);
1286 }
1287
1288 /*
1289  * Find state by ID: returns with locked row on success.
1290  */
1291 struct pf_state *
1292 pf_find_state_byid(uint64_t id, uint32_t creatorid)
1293 {
1294         struct pf_idhash *ih;
1295         struct pf_state *s;
1296
1297         counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1298
1299         ih = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];
1300
1301         PF_HASHROW_LOCK(ih);
1302         LIST_FOREACH(s, &ih->states, entry)
1303                 if (s->id == id && s->creatorid == creatorid)
1304                         break;
1305
1306         if (s == NULL)
1307                 PF_HASHROW_UNLOCK(ih);
1308
1309         return (s);
1310 }
1311
1312 /*
1313  * Find state by key.
1314  * Returns with ID hash slot locked on success.
1315  */
1316 static struct pf_state *
1317 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
1318 {
1319         struct pf_keyhash       *kh;
1320         struct pf_state_key     *sk;
1321         struct pf_state         *s;
1322         int idx;
1323
1324         counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1325
1326         kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1327
1328         PF_HASHROW_LOCK(kh);
1329         LIST_FOREACH(sk, &kh->keys, entry)
1330                 if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1331                         break;
1332         if (sk == NULL) {
1333                 PF_HASHROW_UNLOCK(kh);
1334                 return (NULL);
1335         }
1336
1337         idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
1338
1339         /* List is sorted, if-bound states before floating ones. */
1340         TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
1341                 if (s->kif == V_pfi_all || s->kif == kif) {
1342                         PF_STATE_LOCK(s);
1343                         PF_HASHROW_UNLOCK(kh);
1344                         if (s->timeout >= PFTM_MAX) {
1345                                 /*
1346                                  * State is either being processed by
1347                                  * pf_unlink_state() in an other thread, or
1348                                  * is scheduled for immediate expiry.
1349                                  */
1350                                 PF_STATE_UNLOCK(s);
1351                                 return (NULL);
1352                         }
1353                         return (s);
1354                 }
1355         PF_HASHROW_UNLOCK(kh);
1356
1357         return (NULL);
1358 }
1359
1360 struct pf_state *
1361 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
1362 {
1363         struct pf_keyhash       *kh;
1364         struct pf_state_key     *sk;
1365         struct pf_state         *s, *ret = NULL;
1366         int                      idx, inout = 0;
1367
1368         counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
1369
1370         kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1371
1372         PF_HASHROW_LOCK(kh);
1373         LIST_FOREACH(sk, &kh->keys, entry)
1374                 if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1375                         break;
1376         if (sk == NULL) {
1377                 PF_HASHROW_UNLOCK(kh);
1378                 return (NULL);
1379         }
1380         switch (dir) {
1381         case PF_IN:
1382                 idx = PF_SK_WIRE;
1383                 break;
1384         case PF_OUT:
1385                 idx = PF_SK_STACK;
1386                 break;
1387         case PF_INOUT:
1388                 idx = PF_SK_WIRE;
1389                 inout = 1;
1390                 break;
1391         default:
1392                 panic("%s: dir %u", __func__, dir);
1393         }
1394 second_run:
1395         TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
1396                 if (more == NULL) {
1397                         PF_HASHROW_UNLOCK(kh);
1398                         return (s);
1399                 }
1400
1401                 if (ret)
1402                         (*more)++;
1403                 else
1404                         ret = s;
1405         }
1406         if (inout == 1) {
1407                 inout = 0;
1408                 idx = PF_SK_STACK;
1409                 goto second_run;
1410         }
1411         PF_HASHROW_UNLOCK(kh);
1412
1413         return (ret);
1414 }
1415
1416 /* END state table stuff */
1417
1418 static void
1419 pf_send(struct pf_send_entry *pfse)
1420 {
1421
1422         PF_SENDQ_LOCK();
1423         STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
1424         PF_SENDQ_UNLOCK();
1425         swi_sched(V_pf_swi_cookie, 0);
1426 }
1427
1428 void
1429 pf_intr(void *v)
1430 {
1431         struct epoch_tracker et;
1432         struct pf_send_head queue;
1433         struct pf_send_entry *pfse, *next;
1434
1435         CURVNET_SET((struct vnet *)v);
1436
1437         PF_SENDQ_LOCK();
1438         queue = V_pf_sendqueue;
1439         STAILQ_INIT(&V_pf_sendqueue);
1440         PF_SENDQ_UNLOCK();
1441
1442         NET_EPOCH_ENTER(et);
1443
1444         STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
1445                 switch (pfse->pfse_type) {
1446 #ifdef INET
1447                 case PFSE_IP:
1448                         ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
1449                         break;
1450                 case PFSE_ICMP:
1451                         icmp_error(pfse->pfse_m, pfse->icmpopts.type,
1452                             pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
1453                         break;
1454 #endif /* INET */
1455 #ifdef INET6
1456                 case PFSE_IP6:
1457                         ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
1458                             NULL);
1459                         break;
1460                 case PFSE_ICMP6:
1461                         icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
1462                             pfse->icmpopts.code, pfse->icmpopts.mtu);
1463                         break;
1464 #endif /* INET6 */
1465                 default:
1466                         panic("%s: unknown type", __func__);
1467                 }
1468                 free(pfse, M_PFTEMP);
1469         }
1470         NET_EPOCH_EXIT(et);
1471         CURVNET_RESTORE();
1472 }
1473
1474 void
1475 pf_purge_thread(void *unused __unused)
1476 {
1477         VNET_ITERATOR_DECL(vnet_iter);
1478
1479         sx_xlock(&pf_end_lock);
1480         while (pf_end_threads == 0) {
1481                 sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", hz / 10);
1482
1483                 VNET_LIST_RLOCK();
1484                 VNET_FOREACH(vnet_iter) {
1485                         CURVNET_SET(vnet_iter);
1486
1487
1488                         /* Wait until V_pf_default_rule is initialized. */
1489                         if (V_pf_vnet_active == 0) {
1490                                 CURVNET_RESTORE();
1491                                 continue;
1492                         }
1493
1494                         /*
1495                          *  Process 1/interval fraction of the state
1496                          * table every run.
1497                          */
1498                         V_pf_purge_idx =
1499                             pf_purge_expired_states(V_pf_purge_idx, pf_hashmask /
1500                             (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
1501
1502                         /*
1503                          * Purge other expired types every
1504                          * PFTM_INTERVAL seconds.
1505                          */
1506                         if (V_pf_purge_idx == 0) {
1507                                 /*
1508                                  * Order is important:
1509                                  * - states and src nodes reference rules
1510                                  * - states and rules reference kifs
1511                                  */
1512                                 pf_purge_expired_fragments();
1513                                 pf_purge_expired_src_nodes();
1514                                 pf_purge_unlinked_rules();
1515                                 pfi_kif_purge();
1516                         }
1517                         CURVNET_RESTORE();
1518                 }
1519                 VNET_LIST_RUNLOCK();
1520         }
1521
1522         pf_end_threads++;
1523         sx_xunlock(&pf_end_lock);
1524         kproc_exit(0);
1525 }
1526
1527 void
1528 pf_unload_vnet_purge(void)
1529 {
1530
1531         /*
1532          * To cleanse up all kifs and rules we need
1533          * two runs: first one clears reference flags,
1534          * then pf_purge_expired_states() doesn't
1535          * raise them, and then second run frees.
1536          */
1537         pf_purge_unlinked_rules();
1538         pfi_kif_purge();
1539
1540         /*
1541          * Now purge everything.
1542          */
1543         pf_purge_expired_states(0, pf_hashmask);
1544         pf_purge_fragments(UINT_MAX);
1545         pf_purge_expired_src_nodes();
1546
1547         /*
1548          * Now all kifs & rules should be unreferenced,
1549          * thus should be successfully freed.
1550          */
1551         pf_purge_unlinked_rules();
1552         pfi_kif_purge();
1553 }
1554
1555
1556 u_int32_t
1557 pf_state_expires(const struct pf_state *state)
1558 {
1559         u_int32_t       timeout;
1560         u_int32_t       start;
1561         u_int32_t       end;
1562         u_int32_t       states;
1563
1564         /* handle all PFTM_* > PFTM_MAX here */
1565         if (state->timeout == PFTM_PURGE)
1566                 return (time_uptime);
1567         KASSERT(state->timeout != PFTM_UNLINKED,
1568             ("pf_state_expires: timeout == PFTM_UNLINKED"));
1569         KASSERT((state->timeout < PFTM_MAX),
1570             ("pf_state_expires: timeout > PFTM_MAX"));
1571         timeout = state->rule.ptr->timeout[state->timeout];
1572         if (!timeout)
1573                 timeout = V_pf_default_rule.timeout[state->timeout];
1574         start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
1575         if (start && state->rule.ptr != &V_pf_default_rule) {
1576                 end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
1577                 states = counter_u64_fetch(state->rule.ptr->states_cur);
1578         } else {
1579                 start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
1580                 end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
1581                 states = V_pf_status.states;
1582         }
1583         if (end && states > start && start < end) {
1584                 if (states < end) {
1585                         timeout = (u_int64_t)timeout * (end - states) /
1586                             (end - start);
1587                         return (state->expire + timeout);
1588                 }
1589                 else
1590                         return (time_uptime);
1591         }
1592         return (state->expire + timeout);
1593 }
1594
1595 void
1596 pf_purge_expired_src_nodes()
1597 {
1598         struct pf_src_node_list  freelist;
1599         struct pf_srchash       *sh;
1600         struct pf_src_node      *cur, *next;
1601         int i;
1602
1603         LIST_INIT(&freelist);
1604         for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
1605             PF_HASHROW_LOCK(sh);
1606             LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
1607                 if (cur->states == 0 && cur->expire <= time_uptime) {
1608                         pf_unlink_src_node(cur);
1609                         LIST_INSERT_HEAD(&freelist, cur, entry);
1610                 } else if (cur->rule.ptr != NULL)
1611                         cur->rule.ptr->rule_flag |= PFRULE_REFS;
1612             PF_HASHROW_UNLOCK(sh);
1613         }
1614
1615         pf_free_src_nodes(&freelist);
1616
1617         V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
1618 }
1619
1620 static void
1621 pf_src_tree_remove_state(struct pf_state *s)
1622 {
1623         struct pf_src_node *sn;
1624         struct pf_srchash *sh;
1625         uint32_t timeout;
1626
1627         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE] ?
1628             s->rule.ptr->timeout[PFTM_SRC_NODE] :
1629             V_pf_default_rule.timeout[PFTM_SRC_NODE];
1630
1631         if (s->src_node != NULL) {
1632                 sn = s->src_node;
1633                 sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
1634                 PF_HASHROW_LOCK(sh);
1635                 if (s->src.tcp_est)
1636                         --sn->conn;
1637                 if (--sn->states == 0)
1638                         sn->expire = time_uptime + timeout;
1639                 PF_HASHROW_UNLOCK(sh);
1640         }
1641         if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
1642                 sn = s->nat_src_node;
1643                 sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
1644                 PF_HASHROW_LOCK(sh);
1645                 if (--sn->states == 0)
1646                         sn->expire = time_uptime + timeout;
1647                 PF_HASHROW_UNLOCK(sh);
1648         }
1649         s->src_node = s->nat_src_node = NULL;
1650 }
1651
1652 /*
1653  * Unlink and potentilly free a state. Function may be
1654  * called with ID hash row locked, but always returns
1655  * unlocked, since it needs to go through key hash locking.
1656  */
1657 int
1658 pf_unlink_state(struct pf_state *s, u_int flags)
1659 {
1660         struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
1661
1662         if ((flags & PF_ENTER_LOCKED) == 0)
1663                 PF_HASHROW_LOCK(ih);
1664         else
1665                 PF_HASHROW_ASSERT(ih);
1666
1667         if (s->timeout == PFTM_UNLINKED) {
1668                 /*
1669                  * State is being processed
1670                  * by pf_unlink_state() in
1671                  * an other thread.
1672                  */
1673                 PF_HASHROW_UNLOCK(ih);
1674                 return (0);     /* XXXGL: undefined actually */
1675         }
1676
1677         if (s->src.state == PF_TCPS_PROXY_DST) {
1678                 /* XXX wire key the right one? */
1679                 pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
1680                     &s->key[PF_SK_WIRE]->addr[1],
1681                     &s->key[PF_SK_WIRE]->addr[0],
1682                     s->key[PF_SK_WIRE]->port[1],
1683                     s->key[PF_SK_WIRE]->port[0],
1684                     s->src.seqhi, s->src.seqlo + 1,
1685                     TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
1686         }
1687
1688         LIST_REMOVE(s, entry);
1689         pf_src_tree_remove_state(s);
1690
1691         if (V_pfsync_delete_state_ptr != NULL)
1692                 V_pfsync_delete_state_ptr(s);
1693
1694         STATE_DEC_COUNTERS(s);
1695
1696         s->timeout = PFTM_UNLINKED;
1697
1698         PF_HASHROW_UNLOCK(ih);
1699
1700         pf_detach_state(s);
1701         /* pf_state_insert() initialises refs to 2, so we can never release the
1702          * last reference here, only in pf_release_state(). */
1703         (void)refcount_release(&s->refs);
1704
1705         return (pf_release_state(s));
1706 }
1707
1708 void
1709 pf_free_state(struct pf_state *cur)
1710 {
1711
1712         KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
1713         KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
1714             cur->timeout));
1715
1716         pf_normalize_tcp_cleanup(cur);
1717         uma_zfree(V_pf_state_z, cur);
1718         counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
1719 }
1720
1721 /*
1722  * Called only from pf_purge_thread(), thus serialized.
1723  */
1724 static u_int
1725 pf_purge_expired_states(u_int i, int maxcheck)
1726 {
1727         struct pf_idhash *ih;
1728         struct pf_state *s;
1729
1730         V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
1731
1732         /*
1733          * Go through hash and unlink states that expire now.
1734          */
1735         while (maxcheck > 0) {
1736
1737                 ih = &V_pf_idhash[i];
1738
1739                 /* only take the lock if we expect to do work */
1740                 if (!LIST_EMPTY(&ih->states)) {
1741 relock:
1742                         PF_HASHROW_LOCK(ih);
1743                         LIST_FOREACH(s, &ih->states, entry) {
1744                                 if (pf_state_expires(s) <= time_uptime) {
1745                                         V_pf_status.states -=
1746                                             pf_unlink_state(s, PF_ENTER_LOCKED);
1747                                         goto relock;
1748                                 }
1749                                 s->rule.ptr->rule_flag |= PFRULE_REFS;
1750                                 if (s->nat_rule.ptr != NULL)
1751                                         s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
1752                                 if (s->anchor.ptr != NULL)
1753                                         s->anchor.ptr->rule_flag |= PFRULE_REFS;
1754                                 s->kif->pfik_flags |= PFI_IFLAG_REFS;
1755                                 if (s->rt_kif)
1756                                         s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
1757                         }
1758                         PF_HASHROW_UNLOCK(ih);
1759                 }
1760
1761                 /* Return when we hit end of hash. */
1762                 if (++i > pf_hashmask) {
1763                         V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
1764                         return (0);
1765                 }
1766
1767                 maxcheck--;
1768         }
1769
1770         V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
1771
1772         return (i);
1773 }
1774
1775 static void
1776 pf_purge_unlinked_rules()
1777 {
1778         struct pf_rulequeue tmpq;
1779         struct pf_rule *r, *r1;
1780
1781         /*
1782          * If we have overloading task pending, then we'd
1783          * better skip purging this time. There is a tiny
1784          * probability that overloading task references
1785          * an already unlinked rule.
1786          */
1787         PF_OVERLOADQ_LOCK();
1788         if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
1789                 PF_OVERLOADQ_UNLOCK();
1790                 return;
1791         }
1792         PF_OVERLOADQ_UNLOCK();
1793
1794         /*
1795          * Do naive mark-and-sweep garbage collecting of old rules.
1796          * Reference flag is raised by pf_purge_expired_states()
1797          * and pf_purge_expired_src_nodes().
1798          *
1799          * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
1800          * use a temporary queue.
1801          */
1802         TAILQ_INIT(&tmpq);
1803         PF_UNLNKDRULES_LOCK();
1804         TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
1805                 if (!(r->rule_flag & PFRULE_REFS)) {
1806                         TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
1807                         TAILQ_INSERT_TAIL(&tmpq, r, entries);
1808                 } else
1809                         r->rule_flag &= ~PFRULE_REFS;
1810         }
1811         PF_UNLNKDRULES_UNLOCK();
1812
1813         if (!TAILQ_EMPTY(&tmpq)) {
1814                 PF_RULES_WLOCK();
1815                 TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
1816                         TAILQ_REMOVE(&tmpq, r, entries);
1817                         pf_free_rule(r);
1818                 }
1819                 PF_RULES_WUNLOCK();
1820         }
1821 }
1822
1823 void
1824 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
1825 {
1826         switch (af) {
1827 #ifdef INET
1828         case AF_INET: {
1829                 u_int32_t a = ntohl(addr->addr32[0]);
1830                 printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
1831                     (a>>8)&255, a&255);
1832                 if (p) {
1833                         p = ntohs(p);
1834                         printf(":%u", p);
1835                 }
1836                 break;
1837         }
1838 #endif /* INET */
1839 #ifdef INET6
1840         case AF_INET6: {
1841                 u_int16_t b;
1842                 u_int8_t i, curstart, curend, maxstart, maxend;
1843                 curstart = curend = maxstart = maxend = 255;
1844                 for (i = 0; i < 8; i++) {
1845                         if (!addr->addr16[i]) {
1846                                 if (curstart == 255)
1847                                         curstart = i;
1848                                 curend = i;
1849                         } else {
1850                                 if ((curend - curstart) >
1851                                     (maxend - maxstart)) {
1852                                         maxstart = curstart;
1853                                         maxend = curend;
1854                                 }
1855                                 curstart = curend = 255;
1856                         }
1857                 }
1858                 if ((curend - curstart) >
1859                     (maxend - maxstart)) {
1860                         maxstart = curstart;
1861                         maxend = curend;
1862                 }
1863                 for (i = 0; i < 8; i++) {
1864                         if (i >= maxstart && i <= maxend) {
1865                                 if (i == 0)
1866                                         printf(":");
1867                                 if (i == maxend)
1868                                         printf(":");
1869                         } else {
1870                                 b = ntohs(addr->addr16[i]);
1871                                 printf("%x", b);
1872                                 if (i < 7)
1873                                         printf(":");
1874                         }
1875                 }
1876                 if (p) {
1877                         p = ntohs(p);
1878                         printf("[%u]", p);
1879                 }
1880                 break;
1881         }
1882 #endif /* INET6 */
1883         }
1884 }
1885
1886 void
1887 pf_print_state(struct pf_state *s)
1888 {
1889         pf_print_state_parts(s, NULL, NULL);
1890 }
1891
1892 static void
1893 pf_print_state_parts(struct pf_state *s,
1894     struct pf_state_key *skwp, struct pf_state_key *sksp)
1895 {
1896         struct pf_state_key *skw, *sks;
1897         u_int8_t proto, dir;
1898
1899         /* Do our best to fill these, but they're skipped if NULL */
1900         skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
1901         sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
1902         proto = skw ? skw->proto : (sks ? sks->proto : 0);
1903         dir = s ? s->direction : 0;
1904
1905         switch (proto) {
1906         case IPPROTO_IPV4:
1907                 printf("IPv4");
1908                 break;
1909         case IPPROTO_IPV6:
1910                 printf("IPv6");
1911                 break;
1912         case IPPROTO_TCP:
1913                 printf("TCP");
1914                 break;
1915         case IPPROTO_UDP:
1916                 printf("UDP");
1917                 break;
1918         case IPPROTO_ICMP:
1919                 printf("ICMP");
1920                 break;
1921         case IPPROTO_ICMPV6:
1922                 printf("ICMPv6");
1923                 break;
1924         default:
1925                 printf("%u", proto);
1926                 break;
1927         }
1928         switch (dir) {
1929         case PF_IN:
1930                 printf(" in");
1931                 break;
1932         case PF_OUT:
1933                 printf(" out");
1934                 break;
1935         }
1936         if (skw) {
1937                 printf(" wire: ");
1938                 pf_print_host(&skw->addr[0], skw->port[0], skw->af);
1939                 printf(" ");
1940                 pf_print_host(&skw->addr[1], skw->port[1], skw->af);
1941         }
1942         if (sks) {
1943                 printf(" stack: ");
1944                 if (sks != skw) {
1945                         pf_print_host(&sks->addr[0], sks->port[0], sks->af);
1946                         printf(" ");
1947                         pf_print_host(&sks->addr[1], sks->port[1], sks->af);
1948                 } else
1949                         printf("-");
1950         }
1951         if (s) {
1952                 if (proto == IPPROTO_TCP) {
1953                         printf(" [lo=%u high=%u win=%u modulator=%u",
1954                             s->src.seqlo, s->src.seqhi,
1955                             s->src.max_win, s->src.seqdiff);
1956                         if (s->src.wscale && s->dst.wscale)
1957                                 printf(" wscale=%u",
1958                                     s->src.wscale & PF_WSCALE_MASK);
1959                         printf("]");
1960                         printf(" [lo=%u high=%u win=%u modulator=%u",
1961                             s->dst.seqlo, s->dst.seqhi,
1962                             s->dst.max_win, s->dst.seqdiff);
1963                         if (s->src.wscale && s->dst.wscale)
1964                                 printf(" wscale=%u",
1965                                 s->dst.wscale & PF_WSCALE_MASK);
1966                         printf("]");
1967                 }
1968                 printf(" %u:%u", s->src.state, s->dst.state);
1969         }
1970 }
1971
1972 void
1973 pf_print_flags(u_int8_t f)
1974 {
1975         if (f)
1976                 printf(" ");
1977         if (f & TH_FIN)
1978                 printf("F");
1979         if (f & TH_SYN)
1980                 printf("S");
1981         if (f & TH_RST)
1982                 printf("R");
1983         if (f & TH_PUSH)
1984                 printf("P");
1985         if (f & TH_ACK)
1986                 printf("A");
1987         if (f & TH_URG)
1988                 printf("U");
1989         if (f & TH_ECE)
1990                 printf("E");
1991         if (f & TH_CWR)
1992                 printf("W");
1993 }
1994
1995 #define PF_SET_SKIP_STEPS(i)                                    \
1996         do {                                                    \
1997                 while (head[i] != cur) {                        \
1998                         head[i]->skip[i].ptr = cur;             \
1999                         head[i] = TAILQ_NEXT(head[i], entries); \
2000                 }                                               \
2001         } while (0)
2002
2003 void
2004 pf_calc_skip_steps(struct pf_rulequeue *rules)
2005 {
2006         struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
2007         int i;
2008
2009         cur = TAILQ_FIRST(rules);
2010         prev = cur;
2011         for (i = 0; i < PF_SKIP_COUNT; ++i)
2012                 head[i] = cur;
2013         while (cur != NULL) {
2014
2015                 if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
2016                         PF_SET_SKIP_STEPS(PF_SKIP_IFP);
2017                 if (cur->direction != prev->direction)
2018                         PF_SET_SKIP_STEPS(PF_SKIP_DIR);
2019                 if (cur->af != prev->af)
2020                         PF_SET_SKIP_STEPS(PF_SKIP_AF);
2021                 if (cur->proto != prev->proto)
2022                         PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
2023                 if (cur->src.neg != prev->src.neg ||
2024                     pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
2025                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
2026                 if (cur->src.port[0] != prev->src.port[0] ||
2027                     cur->src.port[1] != prev->src.port[1] ||
2028                     cur->src.port_op != prev->src.port_op)
2029                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
2030                 if (cur->dst.neg != prev->dst.neg ||
2031                     pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
2032                         PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
2033                 if (cur->dst.port[0] != prev->dst.port[0] ||
2034                     cur->dst.port[1] != prev->dst.port[1] ||
2035                     cur->dst.port_op != prev->dst.port_op)
2036                         PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
2037
2038                 prev = cur;
2039                 cur = TAILQ_NEXT(cur, entries);
2040         }
2041         for (i = 0; i < PF_SKIP_COUNT; ++i)
2042                 PF_SET_SKIP_STEPS(i);
2043 }
2044
2045 static int
2046 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
2047 {
2048         if (aw1->type != aw2->type)
2049                 return (1);
2050         switch (aw1->type) {
2051         case PF_ADDR_ADDRMASK:
2052         case PF_ADDR_RANGE:
2053                 if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
2054                         return (1);
2055                 if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
2056                         return (1);
2057                 return (0);
2058         case PF_ADDR_DYNIFTL:
2059                 return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
2060         case PF_ADDR_NOROUTE:
2061         case PF_ADDR_URPFFAILED:
2062                 return (0);
2063         case PF_ADDR_TABLE:
2064                 return (aw1->p.tbl != aw2->p.tbl);
2065         default:
2066                 printf("invalid address type: %d\n", aw1->type);
2067                 return (1);
2068         }
2069 }
2070
2071 /**
2072  * Checksum updates are a little complicated because the checksum in the TCP/UDP
2073  * header isn't always a full checksum. In some cases (i.e. output) it's a
2074  * pseudo-header checksum, which is a partial checksum over src/dst IP
2075  * addresses, protocol number and length.
2076  *
2077  * That means we have the following cases:
2078  *  * Input or forwarding: we don't have TSO, the checksum fields are full
2079  *      checksums, we need to update the checksum whenever we change anything.
2080  *  * Output (i.e. the checksum is a pseudo-header checksum):
2081  *      x The field being updated is src/dst address or affects the length of
2082  *      the packet. We need to update the pseudo-header checksum (note that this
2083  *      checksum is not ones' complement).
2084  *      x Some other field is being modified (e.g. src/dst port numbers): We
2085  *      don't have to update anything.
2086  **/
2087 u_int16_t
2088 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
2089 {
2090         u_int32_t       l;
2091
2092         if (udp && !cksum)
2093                 return (0x0000);
2094         l = cksum + old - new;
2095         l = (l >> 16) + (l & 65535);
2096         l = l & 65535;
2097         if (udp && !l)
2098                 return (0xFFFF);
2099         return (l);
2100 }
2101
2102 u_int16_t
2103 pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
2104         u_int16_t new, u_int8_t udp)
2105 {
2106         if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2107                 return (cksum);
2108
2109         return (pf_cksum_fixup(cksum, old, new, udp));
2110 }
2111
2112 static void
2113 pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
2114         u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
2115         sa_family_t af)
2116 {
2117         struct pf_addr  ao;
2118         u_int16_t       po = *p;
2119
2120         PF_ACPY(&ao, a, af);
2121         PF_ACPY(a, an, af);
2122
2123         if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
2124                 *pc = ~*pc;
2125
2126         *p = pn;
2127
2128         switch (af) {
2129 #ifdef INET
2130         case AF_INET:
2131                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2132                     ao.addr16[0], an->addr16[0], 0),
2133                     ao.addr16[1], an->addr16[1], 0);
2134                 *p = pn;
2135
2136                 *pc = pf_cksum_fixup(pf_cksum_fixup(*pc,
2137                     ao.addr16[0], an->addr16[0], u),
2138                     ao.addr16[1], an->addr16[1], u);
2139
2140                 *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2141                 break;
2142 #endif /* INET */
2143 #ifdef INET6
2144         case AF_INET6:
2145                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2146                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2147                     pf_cksum_fixup(pf_cksum_fixup(*pc,
2148                     ao.addr16[0], an->addr16[0], u),
2149                     ao.addr16[1], an->addr16[1], u),
2150                     ao.addr16[2], an->addr16[2], u),
2151                     ao.addr16[3], an->addr16[3], u),
2152                     ao.addr16[4], an->addr16[4], u),
2153                     ao.addr16[5], an->addr16[5], u),
2154                     ao.addr16[6], an->addr16[6], u),
2155                     ao.addr16[7], an->addr16[7], u);
2156
2157                 *pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
2158                 break;
2159 #endif /* INET6 */
2160         }
2161
2162         if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
2163             CSUM_DELAY_DATA_IPV6)) {
2164                 *pc = ~*pc;
2165                 if (! *pc)
2166                         *pc = 0xffff;
2167         }
2168 }
2169
2170 /* Changes a u_int32_t.  Uses a void * so there are no align restrictions */
2171 void
2172 pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
2173 {
2174         u_int32_t       ao;
2175
2176         memcpy(&ao, a, sizeof(ao));
2177         memcpy(a, &an, sizeof(u_int32_t));
2178         *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
2179             ao % 65536, an % 65536, u);
2180 }
2181
2182 void
2183 pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
2184 {
2185         u_int32_t       ao;
2186
2187         memcpy(&ao, a, sizeof(ao));
2188         memcpy(a, &an, sizeof(u_int32_t));
2189
2190         *c = pf_proto_cksum_fixup(m,
2191             pf_proto_cksum_fixup(m, *c, ao / 65536, an / 65536, udp),
2192             ao % 65536, an % 65536, udp);
2193 }
2194
2195 #ifdef INET6
2196 static void
2197 pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
2198 {
2199         struct pf_addr  ao;
2200
2201         PF_ACPY(&ao, a, AF_INET6);
2202         PF_ACPY(a, an, AF_INET6);
2203
2204         *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2205             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2206             pf_cksum_fixup(pf_cksum_fixup(*c,
2207             ao.addr16[0], an->addr16[0], u),
2208             ao.addr16[1], an->addr16[1], u),
2209             ao.addr16[2], an->addr16[2], u),
2210             ao.addr16[3], an->addr16[3], u),
2211             ao.addr16[4], an->addr16[4], u),
2212             ao.addr16[5], an->addr16[5], u),
2213             ao.addr16[6], an->addr16[6], u),
2214             ao.addr16[7], an->addr16[7], u);
2215 }
2216 #endif /* INET6 */
2217
2218 static void
2219 pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
2220     struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
2221     u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
2222 {
2223         struct pf_addr  oia, ooa;
2224
2225         PF_ACPY(&oia, ia, af);
2226         if (oa)
2227                 PF_ACPY(&ooa, oa, af);
2228
2229         /* Change inner protocol port, fix inner protocol checksum. */
2230         if (ip != NULL) {
2231                 u_int16_t       oip = *ip;
2232                 u_int32_t       opc;
2233
2234                 if (pc != NULL)
2235                         opc = *pc;
2236                 *ip = np;
2237                 if (pc != NULL)
2238                         *pc = pf_cksum_fixup(*pc, oip, *ip, u);
2239                 *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
2240                 if (pc != NULL)
2241                         *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
2242         }
2243         /* Change inner ip address, fix inner ip and icmp checksums. */
2244         PF_ACPY(ia, na, af);
2245         switch (af) {
2246 #ifdef INET
2247         case AF_INET: {
2248                 u_int32_t        oh2c = *h2c;
2249
2250                 *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
2251                     oia.addr16[0], ia->addr16[0], 0),
2252                     oia.addr16[1], ia->addr16[1], 0);
2253                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
2254                     oia.addr16[0], ia->addr16[0], 0),
2255                     oia.addr16[1], ia->addr16[1], 0);
2256                 *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
2257                 break;
2258         }
2259 #endif /* INET */
2260 #ifdef INET6
2261         case AF_INET6:
2262                 *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2263                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2264                     pf_cksum_fixup(pf_cksum_fixup(*ic,
2265                     oia.addr16[0], ia->addr16[0], u),
2266                     oia.addr16[1], ia->addr16[1], u),
2267                     oia.addr16[2], ia->addr16[2], u),
2268                     oia.addr16[3], ia->addr16[3], u),
2269                     oia.addr16[4], ia->addr16[4], u),
2270                     oia.addr16[5], ia->addr16[5], u),
2271                     oia.addr16[6], ia->addr16[6], u),
2272                     oia.addr16[7], ia->addr16[7], u);
2273                 break;
2274 #endif /* INET6 */
2275         }
2276         /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
2277         if (oa) {
2278                 PF_ACPY(oa, na, af);
2279                 switch (af) {
2280 #ifdef INET
2281                 case AF_INET:
2282                         *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2283                             ooa.addr16[0], oa->addr16[0], 0),
2284                             ooa.addr16[1], oa->addr16[1], 0);
2285                         break;
2286 #endif /* INET */
2287 #ifdef INET6
2288                 case AF_INET6:
2289                         *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2290                             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2291                             pf_cksum_fixup(pf_cksum_fixup(*ic,
2292                             ooa.addr16[0], oa->addr16[0], u),
2293                             ooa.addr16[1], oa->addr16[1], u),
2294                             ooa.addr16[2], oa->addr16[2], u),
2295                             ooa.addr16[3], oa->addr16[3], u),
2296                             ooa.addr16[4], oa->addr16[4], u),
2297                             ooa.addr16[5], oa->addr16[5], u),
2298                             ooa.addr16[6], oa->addr16[6], u),
2299                             ooa.addr16[7], oa->addr16[7], u);
2300                         break;
2301 #endif /* INET6 */
2302                 }
2303         }
2304 }
2305
2306
2307 /*
2308  * Need to modulate the sequence numbers in the TCP SACK option
2309  * (credits to Krzysztof Pfaff for report and patch)
2310  */
2311 static int
2312 pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2313     struct tcphdr *th, struct pf_state_peer *dst)
2314 {
2315         int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2316         u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2317         int copyback = 0, i, olen;
2318         struct sackblk sack;
2319
2320 #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
2321         if (hlen < TCPOLEN_SACKLEN ||
2322             !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2323                 return 0;
2324
2325         while (hlen >= TCPOLEN_SACKLEN) {
2326                 olen = opt[1];
2327                 switch (*opt) {
2328                 case TCPOPT_EOL:        /* FALLTHROUGH */
2329                 case TCPOPT_NOP:
2330                         opt++;
2331                         hlen--;
2332                         break;
2333                 case TCPOPT_SACK:
2334                         if (olen > hlen)
2335                                 olen = hlen;
2336                         if (olen >= TCPOLEN_SACKLEN) {
2337                                 for (i = 2; i + TCPOLEN_SACK <= olen;
2338                                     i += TCPOLEN_SACK) {
2339                                         memcpy(&sack, &opt[i], sizeof(sack));
2340                                         pf_change_proto_a(m, &sack.start, &th->th_sum,
2341                                             htonl(ntohl(sack.start) - dst->seqdiff), 0);
2342                                         pf_change_proto_a(m, &sack.end, &th->th_sum,
2343                                             htonl(ntohl(sack.end) - dst->seqdiff), 0);
2344                                         memcpy(&opt[i], &sack, sizeof(sack));
2345                                 }
2346                                 copyback = 1;
2347                         }
2348                         /* FALLTHROUGH */
2349                 default:
2350                         if (olen < 2)
2351                                 olen = 2;
2352                         hlen -= olen;
2353                         opt += olen;
2354                 }
2355         }
2356
2357         if (copyback)
2358                 m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2359         return (copyback);
2360 }
2361
2362 static void
2363 pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2364     const struct pf_addr *saddr, const struct pf_addr *daddr,
2365     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2366     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2367     u_int16_t rtag, struct ifnet *ifp)
2368 {
2369         struct pf_send_entry *pfse;
2370         struct mbuf     *m;
2371         int              len, tlen;
2372 #ifdef INET
2373         struct ip       *h = NULL;
2374 #endif /* INET */
2375 #ifdef INET6
2376         struct ip6_hdr  *h6 = NULL;
2377 #endif /* INET6 */
2378         struct tcphdr   *th;
2379         char            *opt;
2380         struct pf_mtag  *pf_mtag;
2381
2382         len = 0;
2383         th = NULL;
2384
2385         /* maximum segment size tcp option */
2386         tlen = sizeof(struct tcphdr);
2387         if (mss)
2388                 tlen += 4;
2389
2390         switch (af) {
2391 #ifdef INET
2392         case AF_INET:
2393                 len = sizeof(struct ip) + tlen;
2394                 break;
2395 #endif /* INET */
2396 #ifdef INET6
2397         case AF_INET6:
2398                 len = sizeof(struct ip6_hdr) + tlen;
2399                 break;
2400 #endif /* INET6 */
2401         default:
2402                 panic("%s: unsupported af %d", __func__, af);
2403         }
2404
2405         /* Allocate outgoing queue entry, mbuf and mbuf tag. */
2406         pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2407         if (pfse == NULL)
2408                 return;
2409         m = m_gethdr(M_NOWAIT, MT_DATA);
2410         if (m == NULL) {
2411                 free(pfse, M_PFTEMP);
2412                 return;
2413         }
2414 #ifdef MAC
2415         mac_netinet_firewall_send(m);
2416 #endif
2417         if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2418                 free(pfse, M_PFTEMP);
2419                 m_freem(m);
2420                 return;
2421         }
2422         if (tag)
2423                 m->m_flags |= M_SKIP_FIREWALL;
2424         pf_mtag->tag = rtag;
2425
2426         if (r != NULL && r->rtableid >= 0)
2427                 M_SETFIB(m, r->rtableid);
2428
2429 #ifdef ALTQ
2430         if (r != NULL && r->qid) {
2431                 pf_mtag->qid = r->qid;
2432
2433                 /* add hints for ecn */
2434                 pf_mtag->hdr = mtod(m, struct ip *);
2435         }
2436 #endif /* ALTQ */
2437         m->m_data += max_linkhdr;
2438         m->m_pkthdr.len = m->m_len = len;
2439         m->m_pkthdr.rcvif = NULL;
2440         bzero(m->m_data, len);
2441         switch (af) {
2442 #ifdef INET
2443         case AF_INET:
2444                 h = mtod(m, struct ip *);
2445
2446                 /* IP header fields included in the TCP checksum */
2447                 h->ip_p = IPPROTO_TCP;
2448                 h->ip_len = htons(tlen);
2449                 h->ip_src.s_addr = saddr->v4.s_addr;
2450                 h->ip_dst.s_addr = daddr->v4.s_addr;
2451
2452                 th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2453                 break;
2454 #endif /* INET */
2455 #ifdef INET6
2456         case AF_INET6:
2457                 h6 = mtod(m, struct ip6_hdr *);
2458
2459                 /* IP header fields included in the TCP checksum */
2460                 h6->ip6_nxt = IPPROTO_TCP;
2461                 h6->ip6_plen = htons(tlen);
2462                 memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2463                 memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2464
2465                 th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2466                 break;
2467 #endif /* INET6 */
2468         }
2469
2470         /* TCP header */
2471         th->th_sport = sport;
2472         th->th_dport = dport;
2473         th->th_seq = htonl(seq);
2474         th->th_ack = htonl(ack);
2475         th->th_off = tlen >> 2;
2476         th->th_flags = flags;
2477         th->th_win = htons(win);
2478
2479         if (mss) {
2480                 opt = (char *)(th + 1);
2481                 opt[0] = TCPOPT_MAXSEG;
2482                 opt[1] = 4;
2483                 HTONS(mss);
2484                 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2485         }
2486
2487         switch (af) {
2488 #ifdef INET
2489         case AF_INET:
2490                 /* TCP checksum */
2491                 th->th_sum = in_cksum(m, len);
2492
2493                 /* Finish the IP header */
2494                 h->ip_v = 4;
2495                 h->ip_hl = sizeof(*h) >> 2;
2496                 h->ip_tos = IPTOS_LOWDELAY;
2497                 h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
2498                 h->ip_len = htons(len);
2499                 h->ip_ttl = ttl ? ttl : V_ip_defttl;
2500                 h->ip_sum = 0;
2501
2502                 pfse->pfse_type = PFSE_IP;
2503                 break;
2504 #endif /* INET */
2505 #ifdef INET6
2506         case AF_INET6:
2507                 /* TCP checksum */
2508                 th->th_sum = in6_cksum(m, IPPROTO_TCP,
2509                     sizeof(struct ip6_hdr), tlen);
2510
2511                 h6->ip6_vfc |= IPV6_VERSION;
2512                 h6->ip6_hlim = IPV6_DEFHLIM;
2513
2514                 pfse->pfse_type = PFSE_IP6;
2515                 break;
2516 #endif /* INET6 */
2517         }
2518         pfse->pfse_m = m;
2519         pf_send(pfse);
2520 }
2521
2522 static void
2523 pf_return(struct pf_rule *r, struct pf_rule *nr, struct pf_pdesc *pd,
2524     struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th,
2525     struct pfi_kif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
2526     u_short *reason)
2527 {
2528         struct pf_addr  * const saddr = pd->src;
2529         struct pf_addr  * const daddr = pd->dst;
2530         sa_family_t      af = pd->af;
2531
2532         /* undo NAT changes, if they have taken place */
2533         if (nr != NULL) {
2534                 PF_ACPY(saddr, &sk->addr[pd->sidx], af);
2535                 PF_ACPY(daddr, &sk->addr[pd->didx], af);
2536                 if (pd->sport)
2537                         *pd->sport = sk->port[pd->sidx];
2538                 if (pd->dport)
2539                         *pd->dport = sk->port[pd->didx];
2540                 if (pd->proto_sum)
2541                         *pd->proto_sum = bproto_sum;
2542                 if (pd->ip_sum)
2543                         *pd->ip_sum = bip_sum;
2544                 m_copyback(m, off, hdrlen, pd->hdr.any);
2545         }
2546         if (pd->proto == IPPROTO_TCP &&
2547             ((r->rule_flag & PFRULE_RETURNRST) ||
2548             (r->rule_flag & PFRULE_RETURN)) &&
2549             !(th->th_flags & TH_RST)) {
2550                 u_int32_t        ack = ntohl(th->th_seq) + pd->p_len;
2551                 int              len = 0;
2552 #ifdef INET
2553                 struct ip       *h4;
2554 #endif
2555 #ifdef INET6
2556                 struct ip6_hdr  *h6;
2557 #endif
2558
2559                 switch (af) {
2560 #ifdef INET
2561                 case AF_INET:
2562                         h4 = mtod(m, struct ip *);
2563                         len = ntohs(h4->ip_len) - off;
2564                         break;
2565 #endif
2566 #ifdef INET6
2567                 case AF_INET6:
2568                         h6 = mtod(m, struct ip6_hdr *);
2569                         len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
2570                         break;
2571 #endif
2572                 }
2573
2574                 if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
2575                         REASON_SET(reason, PFRES_PROTCKSUM);
2576                 else {
2577                         if (th->th_flags & TH_SYN)
2578                                 ack++;
2579                         if (th->th_flags & TH_FIN)
2580                                 ack++;
2581                         pf_send_tcp(m, r, af, pd->dst,
2582                                 pd->src, th->th_dport, th->th_sport,
2583                                 ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
2584                                 r->return_ttl, 1, 0, kif->pfik_ifp);
2585                 }
2586         } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
2587                 r->return_icmp)
2588                 pf_send_icmp(m, r->return_icmp >> 8,
2589                         r->return_icmp & 255, af, r);
2590         else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
2591                 r->return_icmp6)
2592                 pf_send_icmp(m, r->return_icmp6 >> 8,
2593                         r->return_icmp6 & 255, af, r);
2594 }
2595
2596
2597 static int
2598 pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio)
2599 {
2600         struct m_tag *mtag;
2601
2602         KASSERT(prio <= PF_PRIO_MAX,
2603             ("%s with invalid pcp", __func__));
2604
2605         mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL);
2606         if (mtag == NULL) {
2607                 mtag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT,
2608                     sizeof(uint8_t), M_NOWAIT);
2609                 if (mtag == NULL)
2610                         return (ENOMEM);
2611                 m_tag_prepend(m, mtag);
2612         }
2613
2614         *(uint8_t *)(mtag + 1) = prio;
2615         return (0);
2616 }
2617
2618 static int
2619 pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
2620 {
2621         struct m_tag *mtag;
2622         u_int8_t mpcp;
2623
2624         mtag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
2625         if (mtag == NULL)
2626                 return (0);
2627
2628         if (prio == PF_PRIO_ZERO)
2629                 prio = 0;
2630
2631         mpcp = *(uint8_t *)(mtag + 1);
2632
2633         return (mpcp == prio);
2634 }
2635
2636 static void
2637 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2638     struct pf_rule *r)
2639 {
2640         struct pf_send_entry *pfse;
2641         struct mbuf *m0;
2642         struct pf_mtag *pf_mtag;
2643
2644         /* Allocate outgoing queue entry, mbuf and mbuf tag. */
2645         pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2646         if (pfse == NULL)
2647                 return;
2648
2649         if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2650                 free(pfse, M_PFTEMP);
2651                 return;
2652         }
2653
2654         if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2655                 free(pfse, M_PFTEMP);
2656                 return;
2657         }
2658         /* XXX: revisit */
2659         m0->m_flags |= M_SKIP_FIREWALL;
2660
2661         if (r->rtableid >= 0)
2662                 M_SETFIB(m0, r->rtableid);
2663
2664 #ifdef ALTQ
2665         if (r->qid) {
2666                 pf_mtag->qid = r->qid;
2667                 /* add hints for ecn */
2668                 pf_mtag->hdr = mtod(m0, struct ip *);
2669         }
2670 #endif /* ALTQ */
2671
2672         switch (af) {
2673 #ifdef INET
2674         case AF_INET:
2675                 pfse->pfse_type = PFSE_ICMP;
2676                 break;
2677 #endif /* INET */
2678 #ifdef INET6
2679         case AF_INET6:
2680                 pfse->pfse_type = PFSE_ICMP6;
2681                 break;
2682 #endif /* INET6 */
2683         }
2684         pfse->pfse_m = m0;
2685         pfse->icmpopts.type = type;
2686         pfse->icmpopts.code = code;
2687         pf_send(pfse);
2688 }
2689
2690 /*
2691  * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2692  * If n is 0, they match if they are equal. If n is != 0, they match if they
2693  * are different.
2694  */
2695 int
2696 pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2697     struct pf_addr *b, sa_family_t af)
2698 {
2699         int     match = 0;
2700
2701         switch (af) {
2702 #ifdef INET
2703         case AF_INET:
2704                 if ((a->addr32[0] & m->addr32[0]) ==
2705                     (b->addr32[0] & m->addr32[0]))
2706                         match++;
2707                 break;
2708 #endif /* INET */
2709 #ifdef INET6
2710         case AF_INET6:
2711                 if (((a->addr32[0] & m->addr32[0]) ==
2712                      (b->addr32[0] & m->addr32[0])) &&
2713                     ((a->addr32[1] & m->addr32[1]) ==
2714                      (b->addr32[1] & m->addr32[1])) &&
2715                     ((a->addr32[2] & m->addr32[2]) ==
2716                      (b->addr32[2] & m->addr32[2])) &&
2717                     ((a->addr32[3] & m->addr32[3]) ==
2718                      (b->addr32[3] & m->addr32[3])))
2719                         match++;
2720                 break;
2721 #endif /* INET6 */
2722         }
2723         if (match) {
2724                 if (n)
2725                         return (0);
2726                 else
2727                         return (1);
2728         } else {
2729                 if (n)
2730                         return (1);
2731                 else
2732                         return (0);
2733         }
2734 }
2735
2736 /*
2737  * Return 1 if b <= a <= e, otherwise return 0.
2738  */
2739 int
2740 pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2741     struct pf_addr *a, sa_family_t af)
2742 {
2743         switch (af) {
2744 #ifdef INET
2745         case AF_INET:
2746                 if ((ntohl(a->addr32[0]) < ntohl(b->addr32[0])) ||
2747                     (ntohl(a->addr32[0]) > ntohl(e->addr32[0])))
2748                         return (0);
2749                 break;
2750 #endif /* INET */
2751 #ifdef INET6
2752         case AF_INET6: {
2753                 int     i;
2754
2755                 /* check a >= b */
2756                 for (i = 0; i < 4; ++i)
2757                         if (ntohl(a->addr32[i]) > ntohl(b->addr32[i]))
2758                                 break;
2759                         else if (ntohl(a->addr32[i]) < ntohl(b->addr32[i]))
2760                                 return (0);
2761                 /* check a <= e */
2762                 for (i = 0; i < 4; ++i)
2763                         if (ntohl(a->addr32[i]) < ntohl(e->addr32[i]))
2764                                 break;
2765                         else if (ntohl(a->addr32[i]) > ntohl(e->addr32[i]))
2766                                 return (0);
2767                 break;
2768         }
2769 #endif /* INET6 */
2770         }
2771         return (1);
2772 }
2773
2774 static int
2775 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2776 {
2777         switch (op) {
2778         case PF_OP_IRG:
2779                 return ((p > a1) && (p < a2));
2780         case PF_OP_XRG:
2781                 return ((p < a1) || (p > a2));
2782         case PF_OP_RRG:
2783                 return ((p >= a1) && (p <= a2));
2784         case PF_OP_EQ:
2785                 return (p == a1);
2786         case PF_OP_NE:
2787                 return (p != a1);
2788         case PF_OP_LT:
2789                 return (p < a1);
2790         case PF_OP_LE:
2791                 return (p <= a1);
2792         case PF_OP_GT:
2793                 return (p > a1);
2794         case PF_OP_GE:
2795                 return (p >= a1);
2796         }
2797         return (0); /* never reached */
2798 }
2799
2800 int
2801 pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2802 {
2803         NTOHS(a1);
2804         NTOHS(a2);
2805         NTOHS(p);
2806         return (pf_match(op, a1, a2, p));
2807 }
2808
2809 static int
2810 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2811 {
2812         if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2813                 return (0);
2814         return (pf_match(op, a1, a2, u));
2815 }
2816
2817 static int
2818 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2819 {
2820         if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2821                 return (0);
2822         return (pf_match(op, a1, a2, g));
2823 }
2824
2825 int
2826 pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2827 {
2828         if (*tag == -1)
2829                 *tag = mtag;
2830
2831         return ((!r->match_tag_not && r->match_tag == *tag) ||
2832             (r->match_tag_not && r->match_tag != *tag));
2833 }
2834
2835 int
2836 pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2837 {
2838
2839         KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2840
2841         if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2842                 return (ENOMEM);
2843
2844         pd->pf_mtag->tag = tag;
2845
2846         return (0);
2847 }
2848
2849 #define PF_ANCHOR_STACKSIZE     32
2850 struct pf_anchor_stackframe {
2851         struct pf_ruleset       *rs;
2852         struct pf_rule          *r;     /* XXX: + match bit */
2853         struct pf_anchor        *child;
2854 };
2855
2856 /*
2857  * XXX: We rely on malloc(9) returning pointer aligned addresses.
2858  */
2859 #define PF_ANCHORSTACK_MATCH    0x00000001
2860 #define PF_ANCHORSTACK_MASK     (PF_ANCHORSTACK_MATCH)
2861
2862 #define PF_ANCHOR_MATCH(f)      ((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
2863 #define PF_ANCHOR_RULE(f)       (struct pf_rule *)                      \
2864                                 ((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
2865 #define PF_ANCHOR_SET_MATCH(f)  do { (f)->r = (void *)                  \
2866                                 ((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
2867 } while (0)
2868
2869 void
2870 pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
2871     struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2872     int *match)
2873 {
2874         struct pf_anchor_stackframe     *f;
2875
2876         PF_RULES_RASSERT();
2877
2878         if (match)
2879                 *match = 0;
2880         if (*depth >= PF_ANCHOR_STACKSIZE) {
2881                 printf("%s: anchor stack overflow on %s\n",
2882                     __func__, (*r)->anchor->name);
2883                 *r = TAILQ_NEXT(*r, entries);
2884                 return;
2885         } else if (*depth == 0 && a != NULL)
2886                 *a = *r;
2887         f = stack + (*depth)++;
2888         f->rs = *rs;
2889         f->r = *r;
2890         if ((*r)->anchor_wildcard) {
2891                 struct pf_anchor_node *parent = &(*r)->anchor->children;
2892
2893                 if ((f->child = RB_MIN(pf_anchor_node, parent)) == NULL) {
2894                         *r = NULL;
2895                         return;
2896                 }
2897                 *rs = &f->child->ruleset;
2898         } else {
2899                 f->child = NULL;
2900                 *rs = &(*r)->anchor->ruleset;
2901         }
2902         *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2903 }
2904
2905 int
2906 pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
2907     struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
2908     int *match)
2909 {
2910         struct pf_anchor_stackframe     *f;
2911         struct pf_rule *fr;
2912         int quick = 0;
2913
2914         PF_RULES_RASSERT();
2915
2916         do {
2917                 if (*depth <= 0)
2918                         break;
2919                 f = stack + *depth - 1;
2920                 fr = PF_ANCHOR_RULE(f);
2921                 if (f->child != NULL) {
2922                         struct pf_anchor_node *parent;
2923
2924                         /*
2925                          * This block traverses through
2926                          * a wildcard anchor.
2927                          */
2928                         parent = &fr->anchor->children;
2929                         if (match != NULL && *match) {
2930                                 /*
2931                                  * If any of "*" matched, then
2932                                  * "foo/ *" matched, mark frame
2933                                  * appropriately.
2934                                  */
2935                                 PF_ANCHOR_SET_MATCH(f);
2936                                 *match = 0;
2937                         }
2938                         f->child = RB_NEXT(pf_anchor_node, parent, f->child);
2939                         if (f->child != NULL) {
2940                                 *rs = &f->child->ruleset;
2941                                 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2942                                 if (*r == NULL)
2943                                         continue;
2944                                 else
2945                                         break;
2946                         }
2947                 }
2948                 (*depth)--;
2949                 if (*depth == 0 && a != NULL)
2950                         *a = NULL;
2951                 *rs = f->rs;
2952                 if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
2953                         quick = fr->quick;
2954                 *r = TAILQ_NEXT(fr, entries);
2955         } while (*r == NULL);
2956
2957         return (quick);
2958 }
2959
2960 #ifdef INET6
2961 void
2962 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2963     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2964 {
2965         switch (af) {
2966 #ifdef INET
2967         case AF_INET:
2968                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2969                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2970                 break;
2971 #endif /* INET */
2972         case AF_INET6:
2973                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2974                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2975                 naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2976                 ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2977                 naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2978                 ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2979                 naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2980                 ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2981                 break;
2982         }
2983 }
2984
2985 void
2986 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2987 {
2988         switch (af) {
2989 #ifdef INET
2990         case AF_INET:
2991                 addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2992                 break;
2993 #endif /* INET */
2994         case AF_INET6:
2995                 if (addr->addr32[3] == 0xffffffff) {
2996                         addr->addr32[3] = 0;
2997                         if (addr->addr32[2] == 0xffffffff) {
2998                                 addr->addr32[2] = 0;
2999                                 if (addr->addr32[1] == 0xffffffff) {
3000                                         addr->addr32[1] = 0;
3001                                         addr->addr32[0] =
3002                                             htonl(ntohl(addr->addr32[0]) + 1);
3003                                 } else
3004                                         addr->addr32[1] =
3005                                             htonl(ntohl(addr->addr32[1]) + 1);
3006                         } else
3007                                 addr->addr32[2] =
3008                                     htonl(ntohl(addr->addr32[2]) + 1);
3009                 } else
3010                         addr->addr32[3] =
3011                             htonl(ntohl(addr->addr32[3]) + 1);
3012                 break;
3013         }
3014 }
3015 #endif /* INET6 */
3016
3017 int
3018 pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
3019 {
3020         struct pf_addr          *saddr, *daddr;
3021         u_int16_t                sport, dport;
3022         struct inpcbinfo        *pi;
3023         struct inpcb            *inp;
3024
3025         pd->lookup.uid = UID_MAX;
3026         pd->lookup.gid = GID_MAX;
3027
3028         switch (pd->proto) {
3029         case IPPROTO_TCP:
3030                 if (pd->hdr.tcp == NULL)
3031                         return (-1);
3032                 sport = pd->hdr.tcp->th_sport;
3033                 dport = pd->hdr.tcp->th_dport;
3034                 pi = &V_tcbinfo;
3035                 break;
3036         case IPPROTO_UDP:
3037                 if (pd->hdr.udp == NULL)
3038                         return (-1);
3039                 sport = pd->hdr.udp->uh_sport;
3040                 dport = pd->hdr.udp->uh_dport;
3041                 pi = &V_udbinfo;
3042                 break;
3043         default:
3044                 return (-1);
3045         }
3046         if (direction == PF_IN) {
3047                 saddr = pd->src;
3048                 daddr = pd->dst;
3049         } else {
3050                 u_int16_t       p;
3051
3052                 p = sport;
3053                 sport = dport;
3054                 dport = p;
3055                 saddr = pd->dst;
3056                 daddr = pd->src;
3057         }
3058         switch (pd->af) {
3059 #ifdef INET
3060         case AF_INET:
3061                 inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
3062                     dport, INPLOOKUP_RLOCKPCB, NULL, m);
3063                 if (inp == NULL) {
3064                         inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
3065                            daddr->v4, dport, INPLOOKUP_WILDCARD |
3066                            INPLOOKUP_RLOCKPCB, NULL, m);
3067                         if (inp == NULL)
3068                                 return (-1);
3069                 }
3070                 break;
3071 #endif /* INET */
3072 #ifdef INET6
3073         case AF_INET6:
3074                 inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
3075                     dport, INPLOOKUP_RLOCKPCB, NULL, m);
3076                 if (inp == NULL) {
3077                         inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
3078                             &daddr->v6, dport, INPLOOKUP_WILDCARD |
3079                             INPLOOKUP_RLOCKPCB, NULL, m);
3080                         if (inp == NULL)
3081                                 return (-1);
3082                 }
3083                 break;
3084 #endif /* INET6 */
3085
3086         default:
3087                 return (-1);
3088         }
3089         INP_RLOCK_ASSERT(inp);
3090         pd->lookup.uid = inp->inp_cred->cr_uid;
3091         pd->lookup.gid = inp->inp_cred->cr_groups[0];
3092         INP_RUNLOCK(inp);
3093
3094         return (1);
3095 }
3096
3097 static u_int8_t
3098 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3099 {
3100         int              hlen;
3101         u_int8_t         hdr[60];
3102         u_int8_t        *opt, optlen;
3103         u_int8_t         wscale = 0;
3104
3105         hlen = th_off << 2;             /* hlen <= sizeof(hdr) */
3106         if (hlen <= sizeof(struct tcphdr))
3107                 return (0);
3108         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3109                 return (0);
3110         opt = hdr + sizeof(struct tcphdr);
3111         hlen -= sizeof(struct tcphdr);
3112         while (hlen >= 3) {
3113                 switch (*opt) {
3114                 case TCPOPT_EOL:
3115                 case TCPOPT_NOP:
3116                         ++opt;
3117                         --hlen;
3118                         break;
3119                 case TCPOPT_WINDOW:
3120                         wscale = opt[2];
3121                         if (wscale > TCP_MAX_WINSHIFT)
3122                                 wscale = TCP_MAX_WINSHIFT;
3123                         wscale |= PF_WSCALE_FLAG;
3124                         /* FALLTHROUGH */
3125                 default:
3126                         optlen = opt[1];
3127                         if (optlen < 2)
3128                                 optlen = 2;
3129                         hlen -= optlen;
3130                         opt += optlen;
3131                         break;
3132                 }
3133         }
3134         return (wscale);
3135 }
3136
3137 static u_int16_t
3138 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
3139 {
3140         int              hlen;
3141         u_int8_t         hdr[60];
3142         u_int8_t        *opt, optlen;
3143         u_int16_t        mss = V_tcp_mssdflt;
3144
3145         hlen = th_off << 2;     /* hlen <= sizeof(hdr) */
3146         if (hlen <= sizeof(struct tcphdr))
3147                 return (0);
3148         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
3149                 return (0);
3150         opt = hdr + sizeof(struct tcphdr);
3151         hlen -= sizeof(struct tcphdr);
3152         while (hlen >= TCPOLEN_MAXSEG) {
3153                 switch (*opt) {
3154                 case TCPOPT_EOL:
3155                 case TCPOPT_NOP:
3156                         ++opt;
3157                         --hlen;
3158                         break;
3159                 case TCPOPT_MAXSEG:
3160                         bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
3161                         NTOHS(mss);
3162                         /* FALLTHROUGH */
3163                 default:
3164                         optlen = opt[1];
3165                         if (optlen < 2)
3166                                 optlen = 2;
3167                         hlen -= optlen;
3168                         opt += optlen;
3169                         break;
3170                 }
3171         }
3172         return (mss);
3173 }
3174
3175 static u_int16_t
3176 pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
3177 {
3178 #ifdef INET
3179         struct nhop4_basic      nh4;
3180 #endif /* INET */
3181 #ifdef INET6
3182         struct nhop6_basic      nh6;
3183         struct in6_addr         dst6;
3184         uint32_t                scopeid;
3185 #endif /* INET6 */
3186         int                      hlen = 0;
3187         uint16_t                 mss = 0;
3188
3189         switch (af) {
3190 #ifdef INET
3191         case AF_INET:
3192                 hlen = sizeof(struct ip);
3193                 if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) == 0)
3194                         mss = nh4.nh_mtu - hlen - sizeof(struct tcphdr);
3195                 break;
3196 #endif /* INET */
3197 #ifdef INET6
3198         case AF_INET6:
3199                 hlen = sizeof(struct ip6_hdr);
3200                 in6_splitscope(&addr->v6, &dst6, &scopeid);
3201                 if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0,0,&nh6)==0)
3202                         mss = nh6.nh_mtu - hlen - sizeof(struct tcphdr);
3203                 break;
3204 #endif /* INET6 */
3205         }
3206
3207         mss = max(V_tcp_mssdflt, mss);
3208         mss = min(mss, offer);
3209         mss = max(mss, 64);             /* sanity - at least max opt space */
3210         return (mss);
3211 }
3212
3213 static u_int32_t
3214 pf_tcp_iss(struct pf_pdesc *pd)
3215 {
3216         MD5_CTX ctx;
3217         u_int32_t digest[4];
3218
3219         if (V_pf_tcp_secret_init == 0) {
3220                 arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
3221                 MD5Init(&V_pf_tcp_secret_ctx);
3222                 MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
3223                     sizeof(V_pf_tcp_secret));
3224                 V_pf_tcp_secret_init = 1;
3225         }
3226
3227         ctx = V_pf_tcp_secret_ctx;
3228
3229         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
3230         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
3231         if (pd->af == AF_INET6) {
3232                 MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
3233                 MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
3234         } else {
3235                 MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
3236                 MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
3237         }
3238         MD5Final((u_char *)digest, &ctx);
3239         V_pf_tcp_iss_off += 4096;
3240 #define ISN_RANDOM_INCREMENT (4096 - 1)
3241         return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
3242             V_pf_tcp_iss_off);
3243 #undef  ISN_RANDOM_INCREMENT
3244 }
3245
3246 static int
3247 pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
3248     struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
3249     struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
3250 {
3251         struct pf_rule          *nr = NULL;
3252         struct pf_addr          * const saddr = pd->src;
3253         struct pf_addr          * const daddr = pd->dst;
3254         sa_family_t              af = pd->af;
3255         struct pf_rule          *r, *a = NULL;
3256         struct pf_ruleset       *ruleset = NULL;
3257         struct pf_src_node      *nsn = NULL;
3258         struct tcphdr           *th = pd->hdr.tcp;
3259         struct pf_state_key     *sk = NULL, *nk = NULL;
3260         u_short                  reason;
3261         int                      rewrite = 0, hdrlen = 0;
3262         int                      tag = -1, rtableid = -1;
3263         int                      asd = 0;
3264         int                      match = 0;
3265         int                      state_icmp = 0;
3266         u_int16_t                sport = 0, dport = 0;
3267         u_int16_t                bproto_sum = 0, bip_sum = 0;
3268         u_int8_t                 icmptype = 0, icmpcode = 0;
3269         struct pf_anchor_stackframe     anchor_stack[PF_ANCHOR_STACKSIZE];
3270
3271         PF_RULES_RASSERT();
3272
3273         if (inp != NULL) {
3274                 INP_LOCK_ASSERT(inp);
3275                 pd->lookup.uid = inp->inp_cred->cr_uid;
3276                 pd->lookup.gid = inp->inp_cred->cr_groups[0];
3277                 pd->lookup.done = 1;
3278         }
3279
3280         switch (pd->proto) {
3281         case IPPROTO_TCP:
3282                 sport = th->th_sport;
3283                 dport = th->th_dport;
3284                 hdrlen = sizeof(*th);
3285                 break;
3286         case IPPROTO_UDP:
3287                 sport = pd->hdr.udp->uh_sport;
3288                 dport = pd->hdr.udp->uh_dport;
3289                 hdrlen = sizeof(*pd->hdr.udp);
3290                 break;
3291 #ifdef INET
3292         case IPPROTO_ICMP:
3293                 if (pd->af != AF_INET)
3294                         break;
3295                 sport = dport = pd->hdr.icmp->icmp_id;
3296                 hdrlen = sizeof(*pd->hdr.icmp);
3297                 icmptype = pd->hdr.icmp->icmp_type;
3298                 icmpcode = pd->hdr.icmp->icmp_code;
3299
3300                 if (icmptype == ICMP_UNREACH ||
3301                     icmptype == ICMP_SOURCEQUENCH ||
3302                     icmptype == ICMP_REDIRECT ||
3303                     icmptype == ICMP_TIMXCEED ||
3304                     icmptype == ICMP_PARAMPROB)
3305                         state_icmp++;
3306                 break;
3307 #endif /* INET */
3308 #ifdef INET6
3309         case IPPROTO_ICMPV6:
3310                 if (af != AF_INET6)
3311                         break;
3312                 sport = dport = pd->hdr.icmp6->icmp6_id;
3313                 hdrlen = sizeof(*pd->hdr.icmp6);
3314                 icmptype = pd->hdr.icmp6->icmp6_type;
3315                 icmpcode = pd->hdr.icmp6->icmp6_code;
3316
3317                 if (icmptype == ICMP6_DST_UNREACH ||
3318                     icmptype == ICMP6_PACKET_TOO_BIG ||
3319                     icmptype == ICMP6_TIME_EXCEEDED ||
3320                     icmptype == ICMP6_PARAM_PROB)
3321                         state_icmp++;
3322                 break;
3323 #endif /* INET6 */
3324         default:
3325                 sport = dport = hdrlen = 0;
3326                 break;
3327         }
3328
3329         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3330
3331         /* check packet for BINAT/NAT/RDR */
3332         if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
3333             &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
3334                 KASSERT(sk != NULL, ("%s: null sk", __func__));
3335                 KASSERT(nk != NULL, ("%s: null nk", __func__));
3336
3337                 if (pd->ip_sum)
3338                         bip_sum = *pd->ip_sum;
3339
3340                 switch (pd->proto) {
3341                 case IPPROTO_TCP:
3342                         bproto_sum = th->th_sum;
3343                         pd->proto_sum = &th->th_sum;
3344
3345                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3346                             nk->port[pd->sidx] != sport) {
3347                                 pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
3348                                     &th->th_sum, &nk->addr[pd->sidx],
3349                                     nk->port[pd->sidx], 0, af);
3350                                 pd->sport = &th->th_sport;
3351                                 sport = th->th_sport;
3352                         }
3353
3354                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3355                             nk->port[pd->didx] != dport) {
3356                                 pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
3357                                     &th->th_sum, &nk->addr[pd->didx],
3358                                     nk->port[pd->didx], 0, af);
3359                                 dport = th->th_dport;
3360                                 pd->dport = &th->th_dport;
3361                         }
3362                         rewrite++;
3363                         break;
3364                 case IPPROTO_UDP:
3365                         bproto_sum = pd->hdr.udp->uh_sum;
3366                         pd->proto_sum = &pd->hdr.udp->uh_sum;
3367
3368                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
3369                             nk->port[pd->sidx] != sport) {
3370                                 pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport,
3371                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3372                                     &nk->addr[pd->sidx],
3373                                     nk->port[pd->sidx], 1, af);
3374                                 sport = pd->hdr.udp->uh_sport;
3375                                 pd->sport = &pd->hdr.udp->uh_sport;
3376                         }
3377
3378                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
3379                             nk->port[pd->didx] != dport) {
3380                                 pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport,
3381                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3382                                     &nk->addr[pd->didx],
3383                                     nk->port[pd->didx], 1, af);
3384                                 dport = pd->hdr.udp->uh_dport;
3385                                 pd->dport = &pd->hdr.udp->uh_dport;
3386                         }
3387                         rewrite++;
3388                         break;
3389 #ifdef INET
3390                 case IPPROTO_ICMP:
3391                         nk->port[0] = nk->port[1];
3392                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3393                                 pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3394                                     nk->addr[pd->sidx].v4.s_addr, 0);
3395
3396                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3397                                 pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3398                                     nk->addr[pd->didx].v4.s_addr, 0);
3399
3400                         if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3401                                 pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3402                                     pd->hdr.icmp->icmp_cksum, sport,
3403                                     nk->port[1], 0);
3404                                 pd->hdr.icmp->icmp_id = nk->port[1];
3405                                 pd->sport = &pd->hdr.icmp->icmp_id;
3406                         }
3407                         m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3408                         break;
3409 #endif /* INET */
3410 #ifdef INET6
3411                 case IPPROTO_ICMPV6:
3412                         nk->port[0] = nk->port[1];
3413                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3414                                 pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3415                                     &nk->addr[pd->sidx], 0);
3416
3417                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3418                                 pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3419                                     &nk->addr[pd->didx], 0);
3420                         rewrite++;
3421                         break;
3422 #endif /* INET */
3423                 default:
3424                         switch (af) {
3425 #ifdef INET
3426                         case AF_INET:
3427                                 if (PF_ANEQ(saddr,
3428                                     &nk->addr[pd->sidx], AF_INET))
3429                                         pf_change_a(&saddr->v4.s_addr,
3430                                             pd->ip_sum,
3431                                             nk->addr[pd->sidx].v4.s_addr, 0);
3432
3433                                 if (PF_ANEQ(daddr,
3434                                     &nk->addr[pd->didx], AF_INET))
3435                                         pf_change_a(&daddr->v4.s_addr,
3436                                             pd->ip_sum,
3437                                             nk->addr[pd->didx].v4.s_addr, 0);
3438                                 break;
3439 #endif /* INET */
3440 #ifdef INET6
3441                         case AF_INET6:
3442                                 if (PF_ANEQ(saddr,
3443                                     &nk->addr[pd->sidx], AF_INET6))
3444                                         PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3445
3446                                 if (PF_ANEQ(daddr,
3447                                     &nk->addr[pd->didx], AF_INET6))
3448                                         PF_ACPY(daddr, &nk->addr[pd->didx], af);
3449                                 break;
3450 #endif /* INET */
3451                         }
3452                         break;
3453                 }
3454                 if (nr->natpass)
3455                         r = NULL;
3456                 pd->nat_rule = nr;
3457         }
3458
3459         while (r != NULL) {
3460                 r->evaluations++;
3461                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3462                         r = r->skip[PF_SKIP_IFP].ptr;
3463                 else if (r->direction && r->direction != direction)
3464                         r = r->skip[PF_SKIP_DIR].ptr;
3465                 else if (r->af && r->af != af)
3466                         r = r->skip[PF_SKIP_AF].ptr;
3467                 else if (r->proto && r->proto != pd->proto)
3468                         r = r->skip[PF_SKIP_PROTO].ptr;
3469                 else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3470                     r->src.neg, kif, M_GETFIB(m)))
3471                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3472                 /* tcp/udp only. port_op always 0 in other cases */
3473                 else if (r->src.port_op && !pf_match_port(r->src.port_op,
3474                     r->src.port[0], r->src.port[1], sport))
3475                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
3476                 else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3477                     r->dst.neg, NULL, M_GETFIB(m)))
3478                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3479                 /* tcp/udp only. port_op always 0 in other cases */
3480                 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3481                     r->dst.port[0], r->dst.port[1], dport))
3482                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3483                 /* icmp only. type always 0 in other cases */
3484                 else if (r->type && r->type != icmptype + 1)
3485                         r = TAILQ_NEXT(r, entries);
3486                 /* icmp only. type always 0 in other cases */
3487                 else if (r->code && r->code != icmpcode + 1)
3488                         r = TAILQ_NEXT(r, entries);
3489                 else if (r->tos && !(r->tos == pd->tos))
3490                         r = TAILQ_NEXT(r, entries);
3491                 else if (r->rule_flag & PFRULE_FRAGMENT)
3492                         r = TAILQ_NEXT(r, entries);
3493                 else if (pd->proto == IPPROTO_TCP &&
3494                     (r->flagset & th->th_flags) != r->flags)
3495                         r = TAILQ_NEXT(r, entries);
3496                 /* tcp/udp only. uid.op always 0 in other cases */
3497                 else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3498                     pf_socket_lookup(direction, pd, m), 1)) &&
3499                     !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3500                     pd->lookup.uid))
3501                         r = TAILQ_NEXT(r, entries);
3502                 /* tcp/udp only. gid.op always 0 in other cases */
3503                 else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3504                     pf_socket_lookup(direction, pd, m), 1)) &&
3505                     !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3506                     pd->lookup.gid))
3507                         r = TAILQ_NEXT(r, entries);
3508                 else if (r->prio &&
3509                     !pf_match_ieee8021q_pcp(r->prio, m))
3510                         r = TAILQ_NEXT(r, entries);
3511                 else if (r->prob &&
3512                     r->prob <= arc4random())
3513                         r = TAILQ_NEXT(r, entries);
3514                 else if (r->match_tag && !pf_match_tag(m, r, &tag,
3515                     pd->pf_mtag ? pd->pf_mtag->tag : 0))
3516                         r = TAILQ_NEXT(r, entries);
3517                 else if (r->os_fingerprint != PF_OSFP_ANY &&
3518                     (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3519                     pf_osfp_fingerprint(pd, m, off, th),
3520                     r->os_fingerprint)))
3521                         r = TAILQ_NEXT(r, entries);
3522                 else {
3523                         if (r->tag)
3524                                 tag = r->tag;
3525                         if (r->rtableid >= 0)
3526                                 rtableid = r->rtableid;
3527                         if (r->anchor == NULL) {
3528                                 match = 1;
3529                                 *rm = r;
3530                                 *am = a;
3531                                 *rsm = ruleset;
3532                                 if ((*rm)->quick)
3533                                         break;
3534                                 r = TAILQ_NEXT(r, entries);
3535                         } else
3536                                 pf_step_into_anchor(anchor_stack, &asd,
3537                                     &ruleset, PF_RULESET_FILTER, &r, &a,
3538                                     &match);
3539                 }
3540                 if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3541                     &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3542                         break;
3543         }
3544         r = *rm;
3545         a = *am;
3546         ruleset = *rsm;
3547
3548         REASON_SET(&reason, PFRES_MATCH);
3549
3550         if (r->log || (nr != NULL && nr->log)) {
3551                 if (rewrite)
3552                         m_copyback(m, off, hdrlen, pd->hdr.any);
3553                 PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3554                     ruleset, pd, 1);
3555         }
3556
3557         if ((r->action == PF_DROP) &&
3558             ((r->rule_flag & PFRULE_RETURNRST) ||
3559             (r->rule_flag & PFRULE_RETURNICMP) ||
3560             (r->rule_flag & PFRULE_RETURN))) {
3561                 pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum,
3562                     bip_sum, hdrlen, &reason);
3563         }
3564
3565         if (r->action == PF_DROP)
3566                 goto cleanup;
3567
3568         if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3569                 REASON_SET(&reason, PFRES_MEMORY);
3570                 goto cleanup;
3571         }
3572         if (rtableid >= 0)
3573                 M_SETFIB(m, rtableid);
3574
3575         if (!state_icmp && (r->keep_state || nr != NULL ||
3576             (pd->flags & PFDESC_TCP_NORM))) {
3577                 int action;
3578                 action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3579                     sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3580                     hdrlen);
3581                 if (action != PF_PASS) {
3582                         if (action == PF_DROP &&
3583                             (r->rule_flag & PFRULE_RETURN))
3584                                 pf_return(r, nr, pd, sk, off, m, th, kif,
3585                                     bproto_sum, bip_sum, hdrlen, &reason);
3586                         return (action);
3587                 }
3588         } else {
3589                 if (sk != NULL)
3590                         uma_zfree(V_pf_state_key_z, sk);
3591                 if (nk != NULL)
3592                         uma_zfree(V_pf_state_key_z, nk);
3593         }
3594
3595         /* copy back packet headers if we performed NAT operations */
3596         if (rewrite)
3597                 m_copyback(m, off, hdrlen, pd->hdr.any);
3598
3599         if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3600             direction == PF_OUT &&
3601             V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m))
3602                 /*
3603                  * We want the state created, but we dont
3604                  * want to send this in case a partner
3605                  * firewall has to know about it to allow
3606                  * replies through it.
3607                  */
3608                 return (PF_DEFER);
3609
3610         return (PF_PASS);
3611
3612 cleanup:
3613         if (sk != NULL)
3614                 uma_zfree(V_pf_state_key_z, sk);
3615         if (nk != NULL)
3616                 uma_zfree(V_pf_state_key_z, nk);
3617         return (PF_DROP);
3618 }
3619
3620 static int
3621 pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3622     struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3623     struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3624     u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3625     int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3626 {
3627         struct pf_state         *s = NULL;
3628         struct pf_src_node      *sn = NULL;
3629         struct tcphdr           *th = pd->hdr.tcp;
3630         u_int16_t                mss = V_tcp_mssdflt;
3631         u_short                  reason;
3632
3633         /* check maximums */
3634         if (r->max_states &&
3635             (counter_u64_fetch(r->states_cur) >= r->max_states)) {
3636                 counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
3637                 REASON_SET(&reason, PFRES_MAXSTATES);
3638                 goto csfailed;
3639         }
3640         /* src node for filter rule */
3641         if ((r->rule_flag & PFRULE_SRCTRACK ||
3642             r->rpool.opts & PF_POOL_STICKYADDR) &&
3643             pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3644                 REASON_SET(&reason, PFRES_SRCLIMIT);
3645                 goto csfailed;
3646         }
3647         /* src node for translation rule */
3648         if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3649             pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3650                 REASON_SET(&reason, PFRES_SRCLIMIT);
3651                 goto csfailed;
3652         }
3653         s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3654         if (s == NULL) {
3655                 REASON_SET(&reason, PFRES_MEMORY);
3656                 goto csfailed;
3657         }
3658         s->rule.ptr = r;
3659         s->nat_rule.ptr = nr;
3660         s->anchor.ptr = a;
3661         STATE_INC_COUNTERS(s);
3662         if (r->allow_opts)
3663                 s->state_flags |= PFSTATE_ALLOWOPTS;
3664         if (r->rule_flag & PFRULE_STATESLOPPY)
3665                 s->state_flags |= PFSTATE_SLOPPY;
3666         s->log = r->log & PF_LOG_ALL;
3667         s->sync_state = PFSYNC_S_NONE;
3668         if (nr != NULL)
3669                 s->log |= nr->log & PF_LOG_ALL;
3670         switch (pd->proto) {
3671         case IPPROTO_TCP:
3672                 s->src.seqlo = ntohl(th->th_seq);
3673                 s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3674                 if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3675                     r->keep_state == PF_STATE_MODULATE) {
3676                         /* Generate sequence number modulator */
3677                         if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3678                             0)
3679                                 s->src.seqdiff = 1;
3680                         pf_change_proto_a(m, &th->th_seq, &th->th_sum,
3681                             htonl(s->src.seqlo + s->src.seqdiff), 0);
3682                         *rewrite = 1;
3683                 } else
3684                         s->src.seqdiff = 0;
3685                 if (th->th_flags & TH_SYN) {
3686                         s->src.seqhi++;
3687                         s->src.wscale = pf_get_wscale(m, off,
3688                             th->th_off, pd->af);
3689                 }
3690                 s->src.max_win = MAX(ntohs(th->th_win), 1);
3691                 if (s->src.wscale & PF_WSCALE_MASK) {
3692                         /* Remove scale factor from initial window */
3693                         int win = s->src.max_win;
3694                         win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3695                         s->src.max_win = (win - 1) >>
3696                             (s->src.wscale & PF_WSCALE_MASK);
3697                 }
3698                 if (th->th_flags & TH_FIN)
3699                         s->src.seqhi++;
3700                 s->dst.seqhi = 1;
3701                 s->dst.max_win = 1;
3702                 s->src.state = TCPS_SYN_SENT;
3703                 s->dst.state = TCPS_CLOSED;
3704                 s->timeout = PFTM_TCP_FIRST_PACKET;
3705                 break;
3706         case IPPROTO_UDP:
3707                 s->src.state = PFUDPS_SINGLE;
3708                 s->dst.state = PFUDPS_NO_TRAFFIC;
3709                 s->timeout = PFTM_UDP_FIRST_PACKET;
3710                 break;
3711         case IPPROTO_ICMP:
3712 #ifdef INET6
3713         case IPPROTO_ICMPV6:
3714 #endif
3715                 s->timeout = PFTM_ICMP_FIRST_PACKET;
3716                 break;
3717         default:
3718                 s->src.state = PFOTHERS_SINGLE;
3719                 s->dst.state = PFOTHERS_NO_TRAFFIC;
3720                 s->timeout = PFTM_OTHER_FIRST_PACKET;
3721         }
3722
3723         if (r->rt) {
3724                 if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) {
3725                         REASON_SET(&reason, PFRES_MAPFAILED);
3726                         pf_src_tree_remove_state(s);
3727                         STATE_DEC_COUNTERS(s);
3728                         uma_zfree(V_pf_state_z, s);
3729                         goto csfailed;
3730                 }
3731                 s->rt_kif = r->rpool.cur->kif;
3732         }
3733
3734         s->creation = time_uptime;
3735         s->expire = time_uptime;
3736
3737         if (sn != NULL)
3738                 s->src_node = sn;
3739         if (nsn != NULL) {
3740                 /* XXX We only modify one side for now. */
3741                 PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3742                 s->nat_src_node = nsn;
3743         }
3744         if (pd->proto == IPPROTO_TCP) {
3745                 if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3746                     off, pd, th, &s->src, &s->dst)) {
3747                         REASON_SET(&reason, PFRES_MEMORY);
3748                         pf_src_tree_remove_state(s);
3749                         STATE_DEC_COUNTERS(s);
3750                         uma_zfree(V_pf_state_z, s);
3751                         return (PF_DROP);
3752                 }
3753                 if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3754                     pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3755                     &s->src, &s->dst, rewrite)) {
3756                         /* This really shouldn't happen!!! */
3757                         DPFPRINTF(PF_DEBUG_URGENT,
3758                             ("pf_normalize_tcp_stateful failed on first "
3759                              "pkt\n"));
3760                         pf_normalize_tcp_cleanup(s);
3761                         pf_src_tree_remove_state(s);
3762                         STATE_DEC_COUNTERS(s);
3763                         uma_zfree(V_pf_state_z, s);
3764                         return (PF_DROP);
3765                 }
3766         }
3767         s->direction = pd->dir;
3768
3769         /*
3770          * sk/nk could already been setup by pf_get_translation().
3771          */
3772         if (nr == NULL) {
3773                 KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3774                     __func__, nr, sk, nk));
3775                 sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3776                 if (sk == NULL)
3777                         goto csfailed;
3778                 nk = sk;
3779         } else
3780                 KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3781                     __func__, nr, sk, nk));
3782
3783         /* Swap sk/nk for PF_OUT. */
3784         if (pf_state_insert(BOUND_IFACE(r, kif),
3785             (pd->dir == PF_IN) ? sk : nk,
3786             (pd->dir == PF_IN) ? nk : sk, s)) {
3787                 if (pd->proto == IPPROTO_TCP)
3788                         pf_normalize_tcp_cleanup(s);
3789                 REASON_SET(&reason, PFRES_STATEINS);
3790                 pf_src_tree_remove_state(s);
3791                 STATE_DEC_COUNTERS(s);
3792                 uma_zfree(V_pf_state_z, s);
3793                 return (PF_DROP);
3794         } else
3795                 *sm = s;
3796
3797         if (tag > 0)
3798                 s->tag = tag;
3799         if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3800             TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3801                 s->src.state = PF_TCPS_PROXY_SRC;
3802                 /* undo NAT changes, if they have taken place */
3803                 if (nr != NULL) {
3804                         struct pf_state_key *skt = s->key[PF_SK_WIRE];
3805                         if (pd->dir == PF_OUT)
3806                                 skt = s->key[PF_SK_STACK];
3807                         PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3808                         PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3809                         if (pd->sport)
3810                                 *pd->sport = skt->port[pd->sidx];
3811                         if (pd->dport)
3812                                 *pd->dport = skt->port[pd->didx];
3813                         if (pd->proto_sum)
3814                                 *pd->proto_sum = bproto_sum;
3815                         if (pd->ip_sum)
3816                                 *pd->ip_sum = bip_sum;
3817                         m_copyback(m, off, hdrlen, pd->hdr.any);
3818                 }
3819                 s->src.seqhi = htonl(arc4random());
3820                 /* Find mss option */
3821                 int rtid = M_GETFIB(m);
3822                 mss = pf_get_mss(m, off, th->th_off, pd->af);
3823                 mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3824                 mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3825                 s->src.mss = mss;
3826                 pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3827                     th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3828                     TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3829                 REASON_SET(&reason, PFRES_SYNPROXY);
3830                 return (PF_SYNPROXY_DROP);
3831         }
3832
3833         return (PF_PASS);
3834
3835 csfailed:
3836         if (sk != NULL)
3837                 uma_zfree(V_pf_state_key_z, sk);
3838         if (nk != NULL)
3839                 uma_zfree(V_pf_state_key_z, nk);
3840
3841         if (sn != NULL) {
3842                 struct pf_srchash *sh;
3843
3844                 sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
3845                 PF_HASHROW_LOCK(sh);
3846                 if (--sn->states == 0 && sn->expire == 0) {
3847                         pf_unlink_src_node(sn);
3848                         uma_zfree(V_pf_sources_z, sn);
3849                         counter_u64_add(
3850                             V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
3851                 }
3852                 PF_HASHROW_UNLOCK(sh);
3853         }
3854
3855         if (nsn != sn && nsn != NULL) {
3856                 struct pf_srchash *sh;
3857
3858                 sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)];
3859                 PF_HASHROW_LOCK(sh);
3860                 if (--nsn->states == 0 && nsn->expire == 0) {
3861                         pf_unlink_src_node(nsn);
3862                         uma_zfree(V_pf_sources_z, nsn);
3863                         counter_u64_add(
3864                             V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
3865                 }
3866                 PF_HASHROW_UNLOCK(sh);
3867         }
3868
3869         return (PF_DROP);
3870 }
3871
3872 static int
3873 pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3874     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3875     struct pf_ruleset **rsm)
3876 {
3877         struct pf_rule          *r, *a = NULL;
3878         struct pf_ruleset       *ruleset = NULL;
3879         sa_family_t              af = pd->af;
3880         u_short                  reason;
3881         int                      tag = -1;
3882         int                      asd = 0;
3883         int                      match = 0;
3884         struct pf_anchor_stackframe     anchor_stack[PF_ANCHOR_STACKSIZE];
3885
3886         PF_RULES_RASSERT();
3887
3888         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3889         while (r != NULL) {
3890                 r->evaluations++;
3891                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3892                         r = r->skip[PF_SKIP_IFP].ptr;
3893                 else if (r->direction && r->direction != direction)
3894                         r = r->skip[PF_SKIP_DIR].ptr;
3895                 else if (r->af && r->af != af)
3896                         r = r->skip[PF_SKIP_AF].ptr;
3897                 else if (r->proto && r->proto != pd->proto)
3898                         r = r->skip[PF_SKIP_PROTO].ptr;
3899                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3900                     r->src.neg, kif, M_GETFIB(m)))
3901                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3902                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3903                     r->dst.neg, NULL, M_GETFIB(m)))
3904                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3905                 else if (r->tos && !(r->tos == pd->tos))
3906                         r = TAILQ_NEXT(r, entries);
3907                 else if (r->os_fingerprint != PF_OSFP_ANY)
3908                         r = TAILQ_NEXT(r, entries);
3909                 else if (pd->proto == IPPROTO_UDP &&
3910                     (r->src.port_op || r->dst.port_op))
3911                         r = TAILQ_NEXT(r, entries);
3912                 else if (pd->proto == IPPROTO_TCP &&
3913                     (r->src.port_op || r->dst.port_op || r->flagset))
3914                         r = TAILQ_NEXT(r, entries);
3915                 else if ((pd->proto == IPPROTO_ICMP ||
3916                     pd->proto == IPPROTO_ICMPV6) &&
3917                     (r->type || r->code))
3918                         r = TAILQ_NEXT(r, entries);
3919                 else if (r->prio &&
3920                     !pf_match_ieee8021q_pcp(r->prio, m))
3921                         r = TAILQ_NEXT(r, entries);
3922                 else if (r->prob && r->prob <=
3923                     (arc4random() % (UINT_MAX - 1) + 1))
3924                         r = TAILQ_NEXT(r, entries);
3925                 else if (r->match_tag && !pf_match_tag(m, r, &tag,
3926                     pd->pf_mtag ? pd->pf_mtag->tag : 0))
3927                         r = TAILQ_NEXT(r, entries);
3928                 else {
3929                         if (r->anchor == NULL) {
3930                                 match = 1;
3931                                 *rm = r;
3932                                 *am = a;
3933                                 *rsm = ruleset;
3934                                 if ((*rm)->quick)
3935                                         break;
3936                                 r = TAILQ_NEXT(r, entries);
3937                         } else
3938                                 pf_step_into_anchor(anchor_stack, &asd,
3939                                     &ruleset, PF_RULESET_FILTER, &r, &a,
3940                                     &match);
3941                 }
3942                 if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
3943                     &ruleset, PF_RULESET_FILTER, &r, &a, &match))
3944                         break;
3945         }
3946         r = *rm;
3947         a = *am;
3948         ruleset = *rsm;
3949
3950         REASON_SET(&reason, PFRES_MATCH);
3951
3952         if (r->log)
3953                 PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3954                     1);
3955
3956         if (r->action != PF_PASS)
3957                 return (PF_DROP);
3958
3959         if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3960                 REASON_SET(&reason, PFRES_MEMORY);
3961                 return (PF_DROP);
3962         }
3963
3964         return (PF_PASS);
3965 }
3966
3967 static int
3968 pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3969         struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3970         struct pf_pdesc *pd, u_short *reason, int *copyback)
3971 {
3972         struct tcphdr           *th = pd->hdr.tcp;
3973         u_int16_t                win = ntohs(th->th_win);
3974         u_int32_t                ack, end, seq, orig_seq;
3975         u_int8_t                 sws, dws;
3976         int                      ackskew;
3977
3978         if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3979                 sws = src->wscale & PF_WSCALE_MASK;
3980                 dws = dst->wscale & PF_WSCALE_MASK;
3981         } else
3982                 sws = dws = 0;
3983
3984         /*
3985          * Sequence tracking algorithm from Guido van Rooij's paper:
3986          *   http://www.madison-gurkha.com/publications/tcp_filtering/
3987          *      tcp_filtering.ps
3988          */
3989
3990         orig_seq = seq = ntohl(th->th_seq);
3991         if (src->seqlo == 0) {
3992                 /* First packet from this end. Set its state */
3993
3994                 if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3995                     src->scrub == NULL) {
3996                         if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3997                                 REASON_SET(reason, PFRES_MEMORY);
3998                                 return (PF_DROP);
3999                         }
4000                 }
4001
4002                 /* Deferred generation of sequence number modulator */
4003                 if (dst->seqdiff && !src->seqdiff) {
4004                         /* use random iss for the TCP server */
4005                         while ((src->seqdiff = arc4random() - seq) == 0)
4006                                 ;
4007                         ack = ntohl(th->th_ack) - dst->seqdiff;
4008                         pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
4009                             src->seqdiff), 0);
4010                         pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
4011                         *copyback = 1;
4012                 } else {
4013                         ack = ntohl(th->th_ack);
4014                 }
4015
4016                 end = seq + pd->p_len;
4017                 if (th->th_flags & TH_SYN) {
4018                         end++;
4019                         if (dst->wscale & PF_WSCALE_FLAG) {
4020                                 src->wscale = pf_get_wscale(m, off, th->th_off,
4021                                     pd->af);
4022                                 if (src->wscale & PF_WSCALE_FLAG) {
4023                                         /* Remove scale factor from initial
4024                                          * window */
4025                                         sws = src->wscale & PF_WSCALE_MASK;
4026                                         win = ((u_int32_t)win + (1 << sws) - 1)
4027                                             >> sws;
4028                                         dws = dst->wscale & PF_WSCALE_MASK;
4029                                 } else {
4030                                         /* fixup other window */
4031                                         dst->max_win <<= dst->wscale &
4032                                             PF_WSCALE_MASK;
4033                                         /* in case of a retrans SYN|ACK */
4034                                         dst->wscale = 0;
4035                                 }
4036                         }
4037                 }
4038                 if (th->th_flags & TH_FIN)
4039                         end++;
4040
4041                 src->seqlo = seq;
4042                 if (src->state < TCPS_SYN_SENT)
4043                         src->state = TCPS_SYN_SENT;
4044
4045                 /*
4046                  * May need to slide the window (seqhi may have been set by
4047                  * the crappy stack check or if we picked up the connection
4048                  * after establishment)
4049                  */
4050                 if (src->seqhi == 1 ||
4051                     SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
4052                         src->seqhi = end + MAX(1, dst->max_win << dws);
4053                 if (win > src->max_win)
4054                         src->max_win = win;
4055
4056         } else {
4057                 ack = ntohl(th->th_ack) - dst->seqdiff;
4058                 if (src->seqdiff) {
4059                         /* Modulate sequence numbers */
4060                         pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
4061                             src->seqdiff), 0);
4062                         pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
4063                         *copyback = 1;
4064                 }
4065                 end = seq + pd->p_len;
4066                 if (th->th_flags & TH_SYN)
4067                         end++;
4068                 if (th->th_flags & TH_FIN)
4069                         end++;
4070         }
4071
4072         if ((th->th_flags & TH_ACK) == 0) {
4073                 /* Let it pass through the ack skew check */
4074                 ack = dst->seqlo;
4075         } else if ((ack == 0 &&
4076             (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
4077             /* broken tcp stacks do not set ack */
4078             (dst->state < TCPS_SYN_SENT)) {
4079                 /*
4080                  * Many stacks (ours included) will set the ACK number in an
4081                  * FIN|ACK if the SYN times out -- no sequence to ACK.
4082                  */
4083                 ack = dst->seqlo;
4084         }
4085
4086         if (seq == end) {
4087                 /* Ease sequencing restrictions on no data packets */
4088                 seq = src->seqlo;
4089                 end = seq;
4090         }
4091
4092         ackskew = dst->seqlo - ack;
4093
4094
4095         /*
4096          * Need to demodulate the sequence numbers in any TCP SACK options
4097          * (Selective ACK). We could optionally validate the SACK values
4098          * against the current ACK window, either forwards or backwards, but
4099          * I'm not confident that SACK has been implemented properly
4100          * everywhere. It wouldn't surprise me if several stacks accidentally
4101          * SACK too far backwards of previously ACKed data. There really aren't
4102          * any security implications of bad SACKing unless the target stack
4103          * doesn't validate the option length correctly. Someone trying to
4104          * spoof into a TCP connection won't bother blindly sending SACK
4105          * options anyway.
4106          */
4107         if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
4108                 if (pf_modulate_sack(m, off, pd, th, dst))
4109                         *copyback = 1;
4110         }
4111
4112
4113 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
4114         if (SEQ_GEQ(src->seqhi, end) &&
4115             /* Last octet inside other's window space */
4116             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
4117             /* Retrans: not more than one window back */
4118             (ackskew >= -MAXACKWINDOW) &&
4119             /* Acking not more than one reassembled fragment backwards */
4120             (ackskew <= (MAXACKWINDOW << sws)) &&
4121             /* Acking not more than one window forward */
4122             ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
4123             (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
4124             (pd->flags & PFDESC_IP_REAS) == 0)) {
4125             /* Require an exact/+1 sequence match on resets when possible */
4126
4127                 if (dst->scrub || src->scrub) {
4128                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4129                             *state, src, dst, copyback))
4130                                 return (PF_DROP);
4131                 }
4132
4133                 /* update max window */
4134                 if (src->max_win < win)
4135                         src->max_win = win;
4136                 /* synchronize sequencing */
4137                 if (SEQ_GT(end, src->seqlo))
4138                         src->seqlo = end;
4139                 /* slide the window of what the other end can send */
4140                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4141                         dst->seqhi = ack + MAX((win << sws), 1);
4142
4143
4144                 /* update states */
4145                 if (th->th_flags & TH_SYN)
4146                         if (src->state < TCPS_SYN_SENT)
4147                                 src->state = TCPS_SYN_SENT;
4148                 if (th->th_flags & TH_FIN)
4149                         if (src->state < TCPS_CLOSING)
4150                                 src->state = TCPS_CLOSING;
4151                 if (th->th_flags & TH_ACK) {
4152                         if (dst->state == TCPS_SYN_SENT) {
4153                                 dst->state = TCPS_ESTABLISHED;
4154                                 if (src->state == TCPS_ESTABLISHED &&
4155                                     (*state)->src_node != NULL &&
4156                                     pf_src_connlimit(state)) {
4157                                         REASON_SET(reason, PFRES_SRCLIMIT);
4158                                         return (PF_DROP);
4159                                 }
4160                         } else if (dst->state == TCPS_CLOSING)
4161                                 dst->state = TCPS_FIN_WAIT_2;
4162                 }
4163                 if (th->th_flags & TH_RST)
4164                         src->state = dst->state = TCPS_TIME_WAIT;
4165
4166                 /* update expire time */
4167                 (*state)->expire = time_uptime;
4168                 if (src->state >= TCPS_FIN_WAIT_2 &&
4169                     dst->state >= TCPS_FIN_WAIT_2)
4170                         (*state)->timeout = PFTM_TCP_CLOSED;
4171                 else if (src->state >= TCPS_CLOSING &&
4172                     dst->state >= TCPS_CLOSING)
4173                         (*state)->timeout = PFTM_TCP_FIN_WAIT;
4174                 else if (src->state < TCPS_ESTABLISHED ||
4175                     dst->state < TCPS_ESTABLISHED)
4176                         (*state)->timeout = PFTM_TCP_OPENING;
4177                 else if (src->state >= TCPS_CLOSING ||
4178                     dst->state >= TCPS_CLOSING)
4179                         (*state)->timeout = PFTM_TCP_CLOSING;
4180                 else
4181                         (*state)->timeout = PFTM_TCP_ESTABLISHED;
4182
4183                 /* Fall through to PASS packet */
4184
4185         } else if ((dst->state < TCPS_SYN_SENT ||
4186                 dst->state >= TCPS_FIN_WAIT_2 ||
4187                 src->state >= TCPS_FIN_WAIT_2) &&
4188             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
4189             /* Within a window forward of the originating packet */
4190             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
4191             /* Within a window backward of the originating packet */
4192
4193                 /*
4194                  * This currently handles three situations:
4195                  *  1) Stupid stacks will shotgun SYNs before their peer
4196                  *     replies.
4197                  *  2) When PF catches an already established stream (the
4198                  *     firewall rebooted, the state table was flushed, routes
4199                  *     changed...)
4200                  *  3) Packets get funky immediately after the connection
4201                  *     closes (this should catch Solaris spurious ACK|FINs
4202                  *     that web servers like to spew after a close)
4203                  *
4204                  * This must be a little more careful than the above code
4205                  * since packet floods will also be caught here. We don't
4206                  * update the TTL here to mitigate the damage of a packet
4207                  * flood and so the same code can handle awkward establishment
4208                  * and a loosened connection close.
4209                  * In the establishment case, a correct peer response will
4210                  * validate the connection, go through the normal state code
4211                  * and keep updating the state TTL.
4212                  */
4213
4214                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4215                         printf("pf: loose state match: ");
4216                         pf_print_state(*state);
4217                         pf_print_flags(th->th_flags);
4218                         printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4219                             "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
4220                             pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
4221                             (unsigned long long)(*state)->packets[1],
4222                             pd->dir == PF_IN ? "in" : "out",
4223                             pd->dir == (*state)->direction ? "fwd" : "rev");
4224                 }
4225
4226                 if (dst->scrub || src->scrub) {
4227                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
4228                             *state, src, dst, copyback))
4229                                 return (PF_DROP);
4230                 }
4231
4232                 /* update max window */
4233                 if (src->max_win < win)
4234                         src->max_win = win;
4235                 /* synchronize sequencing */
4236                 if (SEQ_GT(end, src->seqlo))
4237                         src->seqlo = end;
4238                 /* slide the window of what the other end can send */
4239                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
4240                         dst->seqhi = ack + MAX((win << sws), 1);
4241
4242                 /*
4243                  * Cannot set dst->seqhi here since this could be a shotgunned
4244                  * SYN and not an already established connection.
4245                  */
4246
4247                 if (th->th_flags & TH_FIN)
4248                         if (src->state < TCPS_CLOSING)
4249                                 src->state = TCPS_CLOSING;
4250                 if (th->th_flags & TH_RST)
4251                         src->state = dst->state = TCPS_TIME_WAIT;
4252
4253                 /* Fall through to PASS packet */
4254
4255         } else {
4256                 if ((*state)->dst.state == TCPS_SYN_SENT &&
4257                     (*state)->src.state == TCPS_SYN_SENT) {
4258                         /* Send RST for state mismatches during handshake */
4259                         if (!(th->th_flags & TH_RST))
4260                                 pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4261                                     pd->dst, pd->src, th->th_dport,
4262                                     th->th_sport, ntohl(th->th_ack), 0,
4263                                     TH_RST, 0, 0,
4264                                     (*state)->rule.ptr->return_ttl, 1, 0,
4265                                     kif->pfik_ifp);
4266                         src->seqlo = 0;
4267                         src->seqhi = 1;
4268                         src->max_win = 1;
4269                 } else if (V_pf_status.debug >= PF_DEBUG_MISC) {
4270                         printf("pf: BAD state: ");
4271                         pf_print_state(*state);
4272                         pf_print_flags(th->th_flags);
4273                         printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
4274                             "pkts=%llu:%llu dir=%s,%s\n",
4275                             seq, orig_seq, ack, pd->p_len, ackskew,
4276                             (unsigned long long)(*state)->packets[0],
4277                             (unsigned long long)(*state)->packets[1],
4278                             pd->dir == PF_IN ? "in" : "out",
4279                             pd->dir == (*state)->direction ? "fwd" : "rev");
4280                         printf("pf: State failure on: %c %c %c %c | %c %c\n",
4281                             SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
4282                             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
4283                             ' ': '2',
4284                             (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
4285                             (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
4286                             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
4287                             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
4288                 }
4289                 REASON_SET(reason, PFRES_BADSTATE);
4290                 return (PF_DROP);
4291         }
4292
4293         return (PF_PASS);
4294 }
4295
4296 static int
4297 pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
4298         struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
4299 {
4300         struct tcphdr           *th = pd->hdr.tcp;
4301
4302         if (th->th_flags & TH_SYN)
4303                 if (src->state < TCPS_SYN_SENT)
4304                         src->state = TCPS_SYN_SENT;
4305         if (th->th_flags & TH_FIN)
4306                 if (src->state < TCPS_CLOSING)
4307                         src->state = TCPS_CLOSING;
4308         if (th->th_flags & TH_ACK) {
4309                 if (dst->state == TCPS_SYN_SENT) {
4310                         dst->state = TCPS_ESTABLISHED;
4311                         if (src->state == TCPS_ESTABLISHED &&
4312                             (*state)->src_node != NULL &&
4313                             pf_src_connlimit(state)) {
4314                                 REASON_SET(reason, PFRES_SRCLIMIT);
4315                                 return (PF_DROP);
4316                         }
4317                 } else if (dst->state == TCPS_CLOSING) {
4318                         dst->state = TCPS_FIN_WAIT_2;
4319                 } else if (src->state == TCPS_SYN_SENT &&
4320                     dst->state < TCPS_SYN_SENT) {
4321                         /*
4322                          * Handle a special sloppy case where we only see one
4323                          * half of the connection. If there is a ACK after
4324                          * the initial SYN without ever seeing a packet from
4325                          * the destination, set the connection to established.
4326                          */
4327                         dst->state = src->state = TCPS_ESTABLISHED;
4328                         if ((*state)->src_node != NULL &&
4329                             pf_src_connlimit(state)) {
4330                                 REASON_SET(reason, PFRES_SRCLIMIT);
4331                                 return (PF_DROP);
4332                         }
4333                 } else if (src->state == TCPS_CLOSING &&
4334                     dst->state == TCPS_ESTABLISHED &&
4335                     dst->seqlo == 0) {
4336                         /*
4337                          * Handle the closing of half connections where we
4338                          * don't see the full bidirectional FIN/ACK+ACK
4339                          * handshake.
4340                          */
4341                         dst->state = TCPS_CLOSING;
4342                 }
4343         }
4344         if (th->th_flags & TH_RST)
4345                 src->state = dst->state = TCPS_TIME_WAIT;
4346
4347         /* update expire time */
4348         (*state)->expire = time_uptime;
4349         if (src->state >= TCPS_FIN_WAIT_2 &&
4350             dst->state >= TCPS_FIN_WAIT_2)
4351                 (*state)->timeout = PFTM_TCP_CLOSED;
4352         else if (src->state >= TCPS_CLOSING &&
4353             dst->state >= TCPS_CLOSING)
4354                 (*state)->timeout = PFTM_TCP_FIN_WAIT;
4355         else if (src->state < TCPS_ESTABLISHED ||
4356             dst->state < TCPS_ESTABLISHED)
4357                 (*state)->timeout = PFTM_TCP_OPENING;
4358         else if (src->state >= TCPS_CLOSING ||
4359             dst->state >= TCPS_CLOSING)
4360                 (*state)->timeout = PFTM_TCP_CLOSING;
4361         else
4362                 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4363
4364         return (PF_PASS);
4365 }
4366
4367 static int
4368 pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4369     struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4370     u_short *reason)
4371 {
4372         struct pf_state_key_cmp  key;
4373         struct tcphdr           *th = pd->hdr.tcp;
4374         int                      copyback = 0;
4375         struct pf_state_peer    *src, *dst;
4376         struct pf_state_key     *sk;
4377
4378         bzero(&key, sizeof(key));
4379         key.af = pd->af;
4380         key.proto = IPPROTO_TCP;
4381         if (direction == PF_IN) {       /* wire side, straight */
4382                 PF_ACPY(&key.addr[0], pd->src, key.af);
4383                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4384                 key.port[0] = th->th_sport;
4385                 key.port[1] = th->th_dport;
4386         } else {                        /* stack side, reverse */
4387                 PF_ACPY(&key.addr[1], pd->src, key.af);
4388                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4389                 key.port[1] = th->th_sport;
4390                 key.port[0] = th->th_dport;
4391         }
4392
4393         STATE_LOOKUP(kif, &key, direction, *state, pd);
4394
4395         if (direction == (*state)->direction) {
4396                 src = &(*state)->src;
4397                 dst = &(*state)->dst;
4398         } else {
4399                 src = &(*state)->dst;
4400                 dst = &(*state)->src;
4401         }
4402
4403         sk = (*state)->key[pd->didx];
4404
4405         if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4406                 if (direction != (*state)->direction) {
4407                         REASON_SET(reason, PFRES_SYNPROXY);
4408                         return (PF_SYNPROXY_DROP);
4409                 }
4410                 if (th->th_flags & TH_SYN) {
4411                         if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4412                                 REASON_SET(reason, PFRES_SYNPROXY);
4413                                 return (PF_DROP);
4414                         }
4415                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4416                             pd->src, th->th_dport, th->th_sport,
4417                             (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4418                             TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4419                         REASON_SET(reason, PFRES_SYNPROXY);
4420                         return (PF_SYNPROXY_DROP);
4421                 } else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK ||
4422                     (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4423                     (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4424                         REASON_SET(reason, PFRES_SYNPROXY);
4425                         return (PF_DROP);
4426                 } else if ((*state)->src_node != NULL &&
4427                     pf_src_connlimit(state)) {
4428                         REASON_SET(reason, PFRES_SRCLIMIT);
4429                         return (PF_DROP);
4430                 } else
4431                         (*state)->src.state = PF_TCPS_PROXY_DST;
4432         }
4433         if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4434                 if (direction == (*state)->direction) {
4435                         if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4436                             (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4437                             (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4438                                 REASON_SET(reason, PFRES_SYNPROXY);
4439                                 return (PF_DROP);
4440                         }
4441                         (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4442                         if ((*state)->dst.seqhi == 1)
4443                                 (*state)->dst.seqhi = htonl(arc4random());
4444                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4445                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4446                             sk->port[pd->sidx], sk->port[pd->didx],
4447                             (*state)->dst.seqhi, 0, TH_SYN, 0,
4448                             (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4449                         REASON_SET(reason, PFRES_SYNPROXY);
4450                         return (PF_SYNPROXY_DROP);
4451                 } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4452                     (TH_SYN|TH_ACK)) ||
4453                     (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4454                         REASON_SET(reason, PFRES_SYNPROXY);
4455                         return (PF_DROP);
4456                 } else {
4457                         (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4458                         (*state)->dst.seqlo = ntohl(th->th_seq);
4459                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4460                             pd->src, th->th_dport, th->th_sport,
4461                             ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4462                             TH_ACK, (*state)->src.max_win, 0, 0, 0,
4463                             (*state)->tag, NULL);
4464                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4465                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4466                             sk->port[pd->sidx], sk->port[pd->didx],
4467                             (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4468                             TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4469                         (*state)->src.seqdiff = (*state)->dst.seqhi -
4470                             (*state)->src.seqlo;
4471                         (*state)->dst.seqdiff = (*state)->src.seqhi -
4472                             (*state)->dst.seqlo;
4473                         (*state)->src.seqhi = (*state)->src.seqlo +
4474                             (*state)->dst.max_win;
4475                         (*state)->dst.seqhi = (*state)->dst.seqlo +
4476                             (*state)->src.max_win;
4477                         (*state)->src.wscale = (*state)->dst.wscale = 0;
4478                         (*state)->src.state = (*state)->dst.state =
4479                             TCPS_ESTABLISHED;
4480                         REASON_SET(reason, PFRES_SYNPROXY);
4481                         return (PF_SYNPROXY_DROP);
4482                 }
4483         }
4484
4485         if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4486             dst->state >= TCPS_FIN_WAIT_2 &&
4487             src->state >= TCPS_FIN_WAIT_2) {
4488                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4489                         printf("pf: state reuse ");
4490                         pf_print_state(*state);
4491                         pf_print_flags(th->th_flags);
4492                         printf("\n");
4493                 }
4494                 /* XXX make sure it's the same direction ?? */
4495                 (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4496                 pf_unlink_state(*state, PF_ENTER_LOCKED);
4497                 *state = NULL;
4498                 return (PF_DROP);
4499         }
4500
4501         if ((*state)->state_flags & PFSTATE_SLOPPY) {
4502                 if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4503                         return (PF_DROP);
4504         } else {
4505                 if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4506                     &copyback) == PF_DROP)
4507                         return (PF_DROP);
4508         }
4509
4510         /* translate source/destination address, if necessary */
4511         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4512                 struct pf_state_key *nk = (*state)->key[pd->didx];
4513
4514                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4515                     nk->port[pd->sidx] != th->th_sport)
4516                         pf_change_ap(m, pd->src, &th->th_sport,
4517                             pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
4518                             nk->port[pd->sidx], 0, pd->af);
4519
4520                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4521                     nk->port[pd->didx] != th->th_dport)
4522                         pf_change_ap(m, pd->dst, &th->th_dport,
4523                             pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
4524                             nk->port[pd->didx], 0, pd->af);
4525                 copyback = 1;
4526         }
4527
4528         /* Copyback sequence modulation or stateful scrub changes if needed */
4529         if (copyback)
4530                 m_copyback(m, off, sizeof(*th), (caddr_t)th);
4531
4532         return (PF_PASS);
4533 }
4534
4535 static int
4536 pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4537     struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4538 {
4539         struct pf_state_peer    *src, *dst;
4540         struct pf_state_key_cmp  key;
4541         struct udphdr           *uh = pd->hdr.udp;
4542
4543         bzero(&key, sizeof(key));
4544         key.af = pd->af;
4545         key.proto = IPPROTO_UDP;
4546         if (direction == PF_IN) {       /* wire side, straight */
4547                 PF_ACPY(&key.addr[0], pd->src, key.af);
4548                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4549                 key.port[0] = uh->uh_sport;
4550                 key.port[1] = uh->uh_dport;
4551         } else {                        /* stack side, reverse */
4552                 PF_ACPY(&key.addr[1], pd->src, key.af);
4553                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4554                 key.port[1] = uh->uh_sport;
4555                 key.port[0] = uh->uh_dport;
4556         }
4557
4558         STATE_LOOKUP(kif, &key, direction, *state, pd);
4559
4560         if (direction == (*state)->direction) {
4561                 src = &(*state)->src;
4562                 dst = &(*state)->dst;
4563         } else {
4564                 src = &(*state)->dst;
4565                 dst = &(*state)->src;
4566         }
4567
4568         /* update states */
4569         if (src->state < PFUDPS_SINGLE)
4570                 src->state = PFUDPS_SINGLE;
4571         if (dst->state == PFUDPS_SINGLE)
4572                 dst->state = PFUDPS_MULTIPLE;
4573
4574         /* update expire time */
4575         (*state)->expire = time_uptime;
4576         if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4577                 (*state)->timeout = PFTM_UDP_MULTIPLE;
4578         else
4579                 (*state)->timeout = PFTM_UDP_SINGLE;
4580
4581         /* translate source/destination address, if necessary */
4582         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4583                 struct pf_state_key *nk = (*state)->key[pd->didx];
4584
4585                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4586                     nk->port[pd->sidx] != uh->uh_sport)
4587                         pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
4588                             &uh->uh_sum, &nk->addr[pd->sidx],
4589                             nk->port[pd->sidx], 1, pd->af);
4590
4591                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4592                     nk->port[pd->didx] != uh->uh_dport)
4593                         pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
4594                             &uh->uh_sum, &nk->addr[pd->didx],
4595                             nk->port[pd->didx], 1, pd->af);
4596                 m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4597         }
4598
4599         return (PF_PASS);
4600 }
4601
4602 static int
4603 pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4604     struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4605 {
4606         struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4607         u_int16_t        icmpid = 0, *icmpsum;
4608         u_int8_t         icmptype, icmpcode;
4609         int              state_icmp = 0;
4610         struct pf_state_key_cmp key;
4611
4612         bzero(&key, sizeof(key));
4613         switch (pd->proto) {
4614 #ifdef INET
4615         case IPPROTO_ICMP:
4616                 icmptype = pd->hdr.icmp->icmp_type;
4617                 icmpcode = pd->hdr.icmp->icmp_code;
4618                 icmpid = pd->hdr.icmp->icmp_id;
4619                 icmpsum = &pd->hdr.icmp->icmp_cksum;
4620
4621                 if (icmptype == ICMP_UNREACH ||
4622                     icmptype == ICMP_SOURCEQUENCH ||
4623                     icmptype == ICMP_REDIRECT ||
4624                     icmptype == ICMP_TIMXCEED ||
4625                     icmptype == ICMP_PARAMPROB)
4626                         state_icmp++;
4627                 break;
4628 #endif /* INET */
4629 #ifdef INET6
4630         case IPPROTO_ICMPV6:
4631                 icmptype = pd->hdr.icmp6->icmp6_type;
4632                 icmpcode = pd->hdr.icmp6->icmp6_code;
4633                 icmpid = pd->hdr.icmp6->icmp6_id;
4634                 icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4635
4636                 if (icmptype == ICMP6_DST_UNREACH ||
4637                     icmptype == ICMP6_PACKET_TOO_BIG ||
4638                     icmptype == ICMP6_TIME_EXCEEDED ||
4639                     icmptype == ICMP6_PARAM_PROB)
4640                         state_icmp++;
4641                 break;
4642 #endif /* INET6 */
4643         }
4644
4645         if (!state_icmp) {
4646
4647                 /*
4648                  * ICMP query/reply message not related to a TCP/UDP packet.
4649                  * Search for an ICMP state.
4650                  */
4651                 key.af = pd->af;
4652                 key.proto = pd->proto;
4653                 key.port[0] = key.port[1] = icmpid;
4654                 if (direction == PF_IN) {       /* wire side, straight */
4655                         PF_ACPY(&key.addr[0], pd->src, key.af);
4656                         PF_ACPY(&key.addr[1], pd->dst, key.af);
4657                 } else {                        /* stack side, reverse */
4658                         PF_ACPY(&key.addr[1], pd->src, key.af);
4659                         PF_ACPY(&key.addr[0], pd->dst, key.af);
4660                 }
4661
4662                 STATE_LOOKUP(kif, &key, direction, *state, pd);
4663
4664                 (*state)->expire = time_uptime;
4665                 (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4666
4667                 /* translate source/destination address, if necessary */
4668                 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4669                         struct pf_state_key *nk = (*state)->key[pd->didx];
4670
4671                         switch (pd->af) {
4672 #ifdef INET
4673                         case AF_INET:
4674                                 if (PF_ANEQ(pd->src,
4675                                     &nk->addr[pd->sidx], AF_INET))
4676                                         pf_change_a(&saddr->v4.s_addr,
4677                                             pd->ip_sum,
4678                                             nk->addr[pd->sidx].v4.s_addr, 0);
4679
4680                                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4681                                     AF_INET))
4682                                         pf_change_a(&daddr->v4.s_addr,
4683                                             pd->ip_sum,
4684                                             nk->addr[pd->didx].v4.s_addr, 0);
4685
4686                                 if (nk->port[0] !=
4687                                     pd->hdr.icmp->icmp_id) {
4688                                         pd->hdr.icmp->icmp_cksum =
4689                                             pf_cksum_fixup(
4690                                             pd->hdr.icmp->icmp_cksum, icmpid,
4691                                             nk->port[pd->sidx], 0);
4692                                         pd->hdr.icmp->icmp_id =
4693                                             nk->port[pd->sidx];
4694                                 }
4695
4696                                 m_copyback(m, off, ICMP_MINLEN,
4697                                     (caddr_t )pd->hdr.icmp);
4698                                 break;
4699 #endif /* INET */
4700 #ifdef INET6
4701                         case AF_INET6:
4702                                 if (PF_ANEQ(pd->src,
4703                                     &nk->addr[pd->sidx], AF_INET6))
4704                                         pf_change_a6(saddr,
4705                                             &pd->hdr.icmp6->icmp6_cksum,
4706                                             &nk->addr[pd->sidx], 0);
4707
4708                                 if (PF_ANEQ(pd->dst,
4709                                     &nk->addr[pd->didx], AF_INET6))
4710                                         pf_change_a6(daddr,
4711                                             &pd->hdr.icmp6->icmp6_cksum,
4712                                             &nk->addr[pd->didx], 0);
4713
4714                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
4715                                     (caddr_t )pd->hdr.icmp6);
4716                                 break;
4717 #endif /* INET6 */
4718                         }
4719                 }
4720                 return (PF_PASS);
4721
4722         } else {
4723                 /*
4724                  * ICMP error message in response to a TCP/UDP packet.
4725                  * Extract the inner TCP/UDP header and search for that state.
4726                  */
4727
4728                 struct pf_pdesc pd2;
4729                 bzero(&pd2, sizeof pd2);
4730 #ifdef INET
4731                 struct ip       h2;
4732 #endif /* INET */
4733 #ifdef INET6
4734                 struct ip6_hdr  h2_6;
4735                 int             terminal = 0;
4736 #endif /* INET6 */
4737                 int             ipoff2 = 0;
4738                 int             off2 = 0;
4739
4740                 pd2.af = pd->af;
4741                 /* Payload packet is from the opposite direction. */
4742                 pd2.sidx = (direction == PF_IN) ? 1 : 0;
4743                 pd2.didx = (direction == PF_IN) ? 0 : 1;
4744                 switch (pd->af) {
4745 #ifdef INET
4746                 case AF_INET:
4747                         /* offset of h2 in mbuf chain */
4748                         ipoff2 = off + ICMP_MINLEN;
4749
4750                         if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4751                             NULL, reason, pd2.af)) {
4752                                 DPFPRINTF(PF_DEBUG_MISC,
4753                                     ("pf: ICMP error message too short "
4754                                     "(ip)\n"));
4755                                 return (PF_DROP);
4756                         }
4757                         /*
4758                          * ICMP error messages don't refer to non-first
4759                          * fragments
4760                          */
4761                         if (h2.ip_off & htons(IP_OFFMASK)) {
4762                                 REASON_SET(reason, PFRES_FRAG);
4763                                 return (PF_DROP);
4764                         }
4765
4766                         /* offset of protocol header that follows h2 */
4767                         off2 = ipoff2 + (h2.ip_hl << 2);
4768
4769                         pd2.proto = h2.ip_p;
4770                         pd2.src = (struct pf_addr *)&h2.ip_src;
4771                         pd2.dst = (struct pf_addr *)&h2.ip_dst;
4772                         pd2.ip_sum = &h2.ip_sum;
4773                         break;
4774 #endif /* INET */
4775 #ifdef INET6
4776                 case AF_INET6:
4777                         ipoff2 = off + sizeof(struct icmp6_hdr);
4778
4779                         if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4780                             NULL, reason, pd2.af)) {
4781                                 DPFPRINTF(PF_DEBUG_MISC,
4782                                     ("pf: ICMP error message too short "
4783                                     "(ip6)\n"));
4784                                 return (PF_DROP);
4785                         }
4786                         pd2.proto = h2_6.ip6_nxt;
4787                         pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4788                         pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4789                         pd2.ip_sum = NULL;
4790                         off2 = ipoff2 + sizeof(h2_6);
4791                         do {
4792                                 switch (pd2.proto) {
4793                                 case IPPROTO_FRAGMENT:
4794                                         /*
4795                                          * ICMPv6 error messages for
4796                                          * non-first fragments
4797                                          */
4798                                         REASON_SET(reason, PFRES_FRAG);
4799                                         return (PF_DROP);
4800                                 case IPPROTO_AH:
4801                                 case IPPROTO_HOPOPTS:
4802                                 case IPPROTO_ROUTING:
4803                                 case IPPROTO_DSTOPTS: {
4804                                         /* get next header and header length */
4805                                         struct ip6_ext opt6;
4806
4807                                         if (!pf_pull_hdr(m, off2, &opt6,
4808                                             sizeof(opt6), NULL, reason,
4809                                             pd2.af)) {
4810                                                 DPFPRINTF(PF_DEBUG_MISC,
4811                                                     ("pf: ICMPv6 short opt\n"));
4812                                                 return (PF_DROP);
4813                                         }
4814                                         if (pd2.proto == IPPROTO_AH)
4815                                                 off2 += (opt6.ip6e_len + 2) * 4;
4816                                         else
4817                                                 off2 += (opt6.ip6e_len + 1) * 8;
4818                                         pd2.proto = opt6.ip6e_nxt;
4819                                         /* goto the next header */
4820                                         break;
4821                                 }
4822                                 default:
4823                                         terminal++;
4824                                         break;
4825                                 }
4826                         } while (!terminal);
4827                         break;
4828 #endif /* INET6 */
4829                 }
4830
4831                 if (PF_ANEQ(pd->dst, pd2.src, pd->af)) {
4832                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
4833                                 printf("pf: BAD ICMP %d:%d outer dst: ",
4834                                     icmptype, icmpcode);
4835                                 pf_print_host(pd->src, 0, pd->af);
4836                                 printf(" -> ");
4837                                 pf_print_host(pd->dst, 0, pd->af);
4838                                 printf(" inner src: ");
4839                                 pf_print_host(pd2.src, 0, pd2.af);
4840                                 printf(" -> ");
4841                                 pf_print_host(pd2.dst, 0, pd2.af);
4842                                 printf("\n");
4843                         }
4844                         REASON_SET(reason, PFRES_BADSTATE);
4845                         return (PF_DROP);
4846                 }
4847
4848                 switch (pd2.proto) {
4849                 case IPPROTO_TCP: {
4850                         struct tcphdr            th;
4851                         u_int32_t                seq;
4852                         struct pf_state_peer    *src, *dst;
4853                         u_int8_t                 dws;
4854                         int                      copyback = 0;
4855
4856                         /*
4857                          * Only the first 8 bytes of the TCP header can be
4858                          * expected. Don't access any TCP header fields after
4859                          * th_seq, an ackskew test is not possible.
4860                          */
4861                         if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4862                             pd2.af)) {
4863                                 DPFPRINTF(PF_DEBUG_MISC,
4864                                     ("pf: ICMP error message too short "
4865                                     "(tcp)\n"));
4866                                 return (PF_DROP);
4867                         }
4868
4869                         key.af = pd2.af;
4870                         key.proto = IPPROTO_TCP;
4871                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4872                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4873                         key.port[pd2.sidx] = th.th_sport;
4874                         key.port[pd2.didx] = th.th_dport;
4875
4876                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4877
4878                         if (direction == (*state)->direction) {
4879                                 src = &(*state)->dst;
4880                                 dst = &(*state)->src;
4881                         } else {
4882                                 src = &(*state)->src;
4883                                 dst = &(*state)->dst;
4884                         }
4885
4886                         if (src->wscale && dst->wscale)
4887                                 dws = dst->wscale & PF_WSCALE_MASK;
4888                         else
4889                                 dws = 0;
4890
4891                         /* Demodulate sequence number */
4892                         seq = ntohl(th.th_seq) - src->seqdiff;
4893                         if (src->seqdiff) {
4894                                 pf_change_a(&th.th_seq, icmpsum,
4895                                     htonl(seq), 0);
4896                                 copyback = 1;
4897                         }
4898
4899                         if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4900                             (!SEQ_GEQ(src->seqhi, seq) ||
4901                             !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4902                                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4903                                         printf("pf: BAD ICMP %d:%d ",
4904                                             icmptype, icmpcode);
4905                                         pf_print_host(pd->src, 0, pd->af);
4906                                         printf(" -> ");
4907                                         pf_print_host(pd->dst, 0, pd->af);
4908                                         printf(" state: ");
4909                                         pf_print_state(*state);
4910                                         printf(" seq=%u\n", seq);
4911                                 }
4912                                 REASON_SET(reason, PFRES_BADSTATE);
4913                                 return (PF_DROP);
4914                         } else {
4915                                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4916                                         printf("pf: OK ICMP %d:%d ",
4917                                             icmptype, icmpcode);
4918                                         pf_print_host(pd->src, 0, pd->af);
4919                                         printf(" -> ");
4920                                         pf_print_host(pd->dst, 0, pd->af);
4921                                         printf(" state: ");
4922                                         pf_print_state(*state);
4923                                         printf(" seq=%u\n", seq);
4924                                 }
4925                         }
4926
4927                         /* translate source/destination address, if necessary */
4928                         if ((*state)->key[PF_SK_WIRE] !=
4929                             (*state)->key[PF_SK_STACK]) {
4930                                 struct pf_state_key *nk =
4931                                     (*state)->key[pd->didx];
4932
4933                                 if (PF_ANEQ(pd2.src,
4934                                     &nk->addr[pd2.sidx], pd2.af) ||
4935                                     nk->port[pd2.sidx] != th.th_sport)
4936                                         pf_change_icmp(pd2.src, &th.th_sport,
4937                                             daddr, &nk->addr[pd2.sidx],
4938                                             nk->port[pd2.sidx], NULL,
4939                                             pd2.ip_sum, icmpsum,
4940                                             pd->ip_sum, 0, pd2.af);
4941
4942                                 if (PF_ANEQ(pd2.dst,
4943                                     &nk->addr[pd2.didx], pd2.af) ||
4944                                     nk->port[pd2.didx] != th.th_dport)
4945                                         pf_change_icmp(pd2.dst, &th.th_dport,
4946                                             saddr, &nk->addr[pd2.didx],
4947                                             nk->port[pd2.didx], NULL,
4948                                             pd2.ip_sum, icmpsum,
4949                                             pd->ip_sum, 0, pd2.af);
4950                                 copyback = 1;
4951                         }
4952
4953                         if (copyback) {
4954                                 switch (pd2.af) {
4955 #ifdef INET
4956                                 case AF_INET:
4957                                         m_copyback(m, off, ICMP_MINLEN,
4958                                             (caddr_t )pd->hdr.icmp);
4959                                         m_copyback(m, ipoff2, sizeof(h2),
4960                                             (caddr_t )&h2);
4961                                         break;
4962 #endif /* INET */
4963 #ifdef INET6
4964                                 case AF_INET6:
4965                                         m_copyback(m, off,
4966                                             sizeof(struct icmp6_hdr),
4967                                             (caddr_t )pd->hdr.icmp6);
4968                                         m_copyback(m, ipoff2, sizeof(h2_6),
4969                                             (caddr_t )&h2_6);
4970                                         break;
4971 #endif /* INET6 */
4972                                 }
4973                                 m_copyback(m, off2, 8, (caddr_t)&th);
4974                         }
4975
4976                         return (PF_PASS);
4977                         break;
4978                 }
4979                 case IPPROTO_UDP: {
4980                         struct udphdr           uh;
4981
4982                         if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4983                             NULL, reason, pd2.af)) {
4984                                 DPFPRINTF(PF_DEBUG_MISC,
4985                                     ("pf: ICMP error message too short "
4986                                     "(udp)\n"));
4987                                 return (PF_DROP);
4988                         }
4989
4990                         key.af = pd2.af;
4991                         key.proto = IPPROTO_UDP;
4992                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4993                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4994                         key.port[pd2.sidx] = uh.uh_sport;
4995                         key.port[pd2.didx] = uh.uh_dport;
4996
4997                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4998
4999                         /* translate source/destination address, if necessary */
5000                         if ((*state)->key[PF_SK_WIRE] !=
5001                             (*state)->key[PF_SK_STACK]) {
5002                                 struct pf_state_key *nk =
5003                                     (*state)->key[pd->didx];
5004
5005                                 if (PF_ANEQ(pd2.src,
5006                                     &nk->addr[pd2.sidx], pd2.af) ||
5007                                     nk->port[pd2.sidx] != uh.uh_sport)
5008                                         pf_change_icmp(pd2.src, &uh.uh_sport,
5009                                             daddr, &nk->addr[pd2.sidx],
5010                                             nk->port[pd2.sidx], &uh.uh_sum,
5011                                             pd2.ip_sum, icmpsum,
5012                                             pd->ip_sum, 1, pd2.af);
5013
5014                                 if (PF_ANEQ(pd2.dst,
5015                                     &nk->addr[pd2.didx], pd2.af) ||
5016                                     nk->port[pd2.didx] != uh.uh_dport)
5017                                         pf_change_icmp(pd2.dst, &uh.uh_dport,
5018                                             saddr, &nk->addr[pd2.didx],
5019                                             nk->port[pd2.didx], &uh.uh_sum,
5020                                             pd2.ip_sum, icmpsum,
5021                                             pd->ip_sum, 1, pd2.af);
5022
5023                                 switch (pd2.af) {
5024 #ifdef INET
5025                                 case AF_INET:
5026                                         m_copyback(m, off, ICMP_MINLEN,
5027                                             (caddr_t )pd->hdr.icmp);
5028                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5029                                         break;
5030 #endif /* INET */
5031 #ifdef INET6
5032                                 case AF_INET6:
5033                                         m_copyback(m, off,
5034                                             sizeof(struct icmp6_hdr),
5035                                             (caddr_t )pd->hdr.icmp6);
5036                                         m_copyback(m, ipoff2, sizeof(h2_6),
5037                                             (caddr_t )&h2_6);
5038                                         break;
5039 #endif /* INET6 */
5040                                 }
5041                                 m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
5042                         }
5043                         return (PF_PASS);
5044                         break;
5045                 }
5046 #ifdef INET
5047                 case IPPROTO_ICMP: {
5048                         struct icmp             iih;
5049
5050                         if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
5051                             NULL, reason, pd2.af)) {
5052                                 DPFPRINTF(PF_DEBUG_MISC,
5053                                     ("pf: ICMP error message too short i"
5054                                     "(icmp)\n"));
5055                                 return (PF_DROP);
5056                         }
5057
5058                         key.af = pd2.af;
5059                         key.proto = IPPROTO_ICMP;
5060                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5061                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5062                         key.port[0] = key.port[1] = iih.icmp_id;
5063
5064                         STATE_LOOKUP(kif, &key, direction, *state, pd);
5065
5066                         /* translate source/destination address, if necessary */
5067                         if ((*state)->key[PF_SK_WIRE] !=
5068                             (*state)->key[PF_SK_STACK]) {
5069                                 struct pf_state_key *nk =
5070                                     (*state)->key[pd->didx];
5071
5072                                 if (PF_ANEQ(pd2.src,
5073                                     &nk->addr[pd2.sidx], pd2.af) ||
5074                                     nk->port[pd2.sidx] != iih.icmp_id)
5075                                         pf_change_icmp(pd2.src, &iih.icmp_id,
5076                                             daddr, &nk->addr[pd2.sidx],
5077                                             nk->port[pd2.sidx], NULL,
5078                                             pd2.ip_sum, icmpsum,
5079                                             pd->ip_sum, 0, AF_INET);
5080
5081                                 if (PF_ANEQ(pd2.dst,
5082                                     &nk->addr[pd2.didx], pd2.af) ||
5083                                     nk->port[pd2.didx] != iih.icmp_id)
5084                                         pf_change_icmp(pd2.dst, &iih.icmp_id,
5085                                             saddr, &nk->addr[pd2.didx],
5086                                             nk->port[pd2.didx], NULL,
5087                                             pd2.ip_sum, icmpsum,
5088                                             pd->ip_sum, 0, AF_INET);
5089
5090                                 m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
5091                                 m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5092                                 m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
5093                         }
5094                         return (PF_PASS);
5095                         break;
5096                 }
5097 #endif /* INET */
5098 #ifdef INET6
5099                 case IPPROTO_ICMPV6: {
5100                         struct icmp6_hdr        iih;
5101
5102                         if (!pf_pull_hdr(m, off2, &iih,
5103                             sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
5104                                 DPFPRINTF(PF_DEBUG_MISC,
5105                                     ("pf: ICMP error message too short "
5106                                     "(icmp6)\n"));
5107                                 return (PF_DROP);
5108                         }
5109
5110                         key.af = pd2.af;
5111                         key.proto = IPPROTO_ICMPV6;
5112                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5113                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5114                         key.port[0] = key.port[1] = iih.icmp6_id;
5115
5116                         STATE_LOOKUP(kif, &key, direction, *state, pd);
5117
5118                         /* translate source/destination address, if necessary */
5119                         if ((*state)->key[PF_SK_WIRE] !=
5120                             (*state)->key[PF_SK_STACK]) {
5121                                 struct pf_state_key *nk =
5122                                     (*state)->key[pd->didx];
5123
5124                                 if (PF_ANEQ(pd2.src,
5125                                     &nk->addr[pd2.sidx], pd2.af) ||
5126                                     nk->port[pd2.sidx] != iih.icmp6_id)
5127                                         pf_change_icmp(pd2.src, &iih.icmp6_id,
5128                                             daddr, &nk->addr[pd2.sidx],
5129                                             nk->port[pd2.sidx], NULL,
5130                                             pd2.ip_sum, icmpsum,
5131                                             pd->ip_sum, 0, AF_INET6);
5132
5133                                 if (PF_ANEQ(pd2.dst,
5134                                     &nk->addr[pd2.didx], pd2.af) ||
5135                                     nk->port[pd2.didx] != iih.icmp6_id)
5136                                         pf_change_icmp(pd2.dst, &iih.icmp6_id,
5137                                             saddr, &nk->addr[pd2.didx],
5138                                             nk->port[pd2.didx], NULL,
5139                                             pd2.ip_sum, icmpsum,
5140                                             pd->ip_sum, 0, AF_INET6);
5141
5142                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
5143                                     (caddr_t)pd->hdr.icmp6);
5144                                 m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
5145                                 m_copyback(m, off2, sizeof(struct icmp6_hdr),
5146                                     (caddr_t)&iih);
5147                         }
5148                         return (PF_PASS);
5149                         break;
5150                 }
5151 #endif /* INET6 */
5152                 default: {
5153                         key.af = pd2.af;
5154                         key.proto = pd2.proto;
5155                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
5156                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
5157                         key.port[0] = key.port[1] = 0;
5158
5159                         STATE_LOOKUP(kif, &key, direction, *state, pd);
5160
5161                         /* translate source/destination address, if necessary */
5162                         if ((*state)->key[PF_SK_WIRE] !=
5163                             (*state)->key[PF_SK_STACK]) {
5164                                 struct pf_state_key *nk =
5165                                     (*state)->key[pd->didx];
5166
5167                                 if (PF_ANEQ(pd2.src,
5168                                     &nk->addr[pd2.sidx], pd2.af))
5169                                         pf_change_icmp(pd2.src, NULL, daddr,
5170                                             &nk->addr[pd2.sidx], 0, NULL,
5171                                             pd2.ip_sum, icmpsum,
5172                                             pd->ip_sum, 0, pd2.af);
5173
5174                                 if (PF_ANEQ(pd2.dst,
5175                                     &nk->addr[pd2.didx], pd2.af))
5176                                         pf_change_icmp(pd2.dst, NULL, saddr,
5177                                             &nk->addr[pd2.didx], 0, NULL,
5178                                             pd2.ip_sum, icmpsum,
5179                                             pd->ip_sum, 0, pd2.af);
5180
5181                                 switch (pd2.af) {
5182 #ifdef INET
5183                                 case AF_INET:
5184                                         m_copyback(m, off, ICMP_MINLEN,
5185                                             (caddr_t)pd->hdr.icmp);
5186                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
5187                                         break;
5188 #endif /* INET */
5189 #ifdef INET6
5190                                 case AF_INET6:
5191                                         m_copyback(m, off,
5192                                             sizeof(struct icmp6_hdr),
5193                                             (caddr_t )pd->hdr.icmp6);
5194                                         m_copyback(m, ipoff2, sizeof(h2_6),
5195                                             (caddr_t )&h2_6);
5196                                         break;
5197 #endif /* INET6 */
5198                                 }
5199                         }
5200                         return (PF_PASS);
5201                         break;
5202                 }
5203                 }
5204         }
5205 }
5206
5207 static int
5208 pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
5209     struct mbuf *m, struct pf_pdesc *pd)
5210 {
5211         struct pf_state_peer    *src, *dst;
5212         struct pf_state_key_cmp  key;
5213
5214         bzero(&key, sizeof(key));
5215         key.af = pd->af;
5216         key.proto = pd->proto;
5217         if (direction == PF_IN) {
5218                 PF_ACPY(&key.addr[0], pd->src, key.af);
5219                 PF_ACPY(&key.addr[1], pd->dst, key.af);
5220                 key.port[0] = key.port[1] = 0;
5221         } else {
5222                 PF_ACPY(&key.addr[1], pd->src, key.af);
5223                 PF_ACPY(&key.addr[0], pd->dst, key.af);
5224                 key.port[1] = key.port[0] = 0;
5225         }
5226
5227         STATE_LOOKUP(kif, &key, direction, *state, pd);
5228
5229         if (direction == (*state)->direction) {
5230                 src = &(*state)->src;
5231                 dst = &(*state)->dst;
5232         } else {
5233                 src = &(*state)->dst;
5234                 dst = &(*state)->src;
5235         }
5236
5237         /* update states */
5238         if (src->state < PFOTHERS_SINGLE)
5239                 src->state = PFOTHERS_SINGLE;
5240         if (dst->state == PFOTHERS_SINGLE)
5241                 dst->state = PFOTHERS_MULTIPLE;
5242
5243         /* update expire time */
5244         (*state)->expire = time_uptime;
5245         if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
5246                 (*state)->timeout = PFTM_OTHER_MULTIPLE;
5247         else
5248                 (*state)->timeout = PFTM_OTHER_SINGLE;
5249
5250         /* translate source/destination address, if necessary */
5251         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
5252                 struct pf_state_key *nk = (*state)->key[pd->didx];
5253
5254                 KASSERT(nk, ("%s: nk is null", __func__));
5255                 KASSERT(pd, ("%s: pd is null", __func__));
5256                 KASSERT(pd->src, ("%s: pd->src is null", __func__));
5257                 KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
5258                 switch (pd->af) {
5259 #ifdef INET
5260                 case AF_INET:
5261                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5262                                 pf_change_a(&pd->src->v4.s_addr,
5263                                     pd->ip_sum,
5264                                     nk->addr[pd->sidx].v4.s_addr,
5265                                     0);
5266
5267
5268                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5269                                 pf_change_a(&pd->dst->v4.s_addr,
5270                                     pd->ip_sum,
5271                                     nk->addr[pd->didx].v4.s_addr,
5272                                     0);
5273
5274                         break;
5275 #endif /* INET */
5276 #ifdef INET6
5277                 case AF_INET6:
5278                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
5279                                 PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
5280
5281                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
5282                                 PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
5283 #endif /* INET6 */
5284                 }
5285         }
5286         return (PF_PASS);
5287 }
5288
5289 /*
5290  * ipoff and off are measured from the start of the mbuf chain.
5291  * h must be at "ipoff" on the mbuf chain.
5292  */
5293 void *
5294 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
5295     u_short *actionp, u_short *reasonp, sa_family_t af)
5296 {
5297         switch (af) {
5298 #ifdef INET
5299         case AF_INET: {
5300                 struct ip       *h = mtod(m, struct ip *);
5301                 u_int16_t        fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
5302
5303                 if (fragoff) {
5304                         if (fragoff >= len)
5305                                 ACTION_SET(actionp, PF_PASS);
5306                         else {
5307                                 ACTION_SET(actionp, PF_DROP);
5308                                 REASON_SET(reasonp, PFRES_FRAG);
5309                         }
5310                         return (NULL);
5311                 }
5312                 if (m->m_pkthdr.len < off + len ||
5313                     ntohs(h->ip_len) < off + len) {
5314                         ACTION_SET(actionp, PF_DROP);
5315                         REASON_SET(reasonp, PFRES_SHORT);
5316                         return (NULL);
5317                 }
5318                 break;
5319         }
5320 #endif /* INET */
5321 #ifdef INET6
5322         case AF_INET6: {
5323                 struct ip6_hdr  *h = mtod(m, struct ip6_hdr *);
5324
5325                 if (m->m_pkthdr.len < off + len ||
5326                     (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
5327                     (unsigned)(off + len)) {
5328                         ACTION_SET(actionp, PF_DROP);
5329                         REASON_SET(reasonp, PFRES_SHORT);
5330                         return (NULL);
5331                 }
5332                 break;
5333         }
5334 #endif /* INET6 */
5335         }
5336         m_copydata(m, off, len, p);
5337         return (p);
5338 }
5339
5340 #ifdef RADIX_MPATH
5341 static int
5342 pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5343     int rtableid)
5344 {
5345         struct radix_node_head  *rnh;
5346         struct sockaddr_in      *dst;
5347         int                      ret = 1;
5348         int                      check_mpath;
5349 #ifdef INET6
5350         struct sockaddr_in6     *dst6;
5351         struct route_in6         ro;
5352 #else
5353         struct route             ro;
5354 #endif
5355         struct radix_node       *rn;
5356         struct rtentry          *rt;
5357         struct ifnet            *ifp;
5358
5359         check_mpath = 0;
5360         /* XXX: stick to table 0 for now */
5361         rnh = rt_tables_get_rnh(0, af);
5362         if (rnh != NULL && rn_mpath_capable(rnh))
5363                 check_mpath = 1;
5364         bzero(&ro, sizeof(ro));
5365         switch (af) {
5366         case AF_INET:
5367                 dst = satosin(&ro.ro_dst);
5368                 dst->sin_family = AF_INET;
5369                 dst->sin_len = sizeof(*dst);
5370                 dst->sin_addr = addr->v4;
5371                 break;
5372 #ifdef INET6
5373         case AF_INET6:
5374                 /*
5375                  * Skip check for addresses with embedded interface scope,
5376                  * as they would always match anyway.
5377                  */
5378                 if (IN6_IS_SCOPE_EMBED(&addr->v6))
5379                         goto out;
5380                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5381                 dst6->sin6_family = AF_INET6;
5382                 dst6->sin6_len = sizeof(*dst6);
5383                 dst6->sin6_addr = addr->v6;
5384                 break;
5385 #endif /* INET6 */
5386         default:
5387                 return (0);
5388         }
5389
5390         /* Skip checks for ipsec interfaces */
5391         if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5392                 goto out;
5393
5394         switch (af) {
5395 #ifdef INET6
5396         case AF_INET6:
5397                 in6_rtalloc_ign(&ro, 0, rtableid);
5398                 break;
5399 #endif
5400 #ifdef INET
5401         case AF_INET:
5402                 in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5403                 break;
5404 #endif
5405         }
5406
5407         if (ro.ro_rt != NULL) {
5408                 /* No interface given, this is a no-route check */
5409                 if (kif == NULL)
5410                         goto out;
5411
5412                 if (kif->pfik_ifp == NULL) {
5413                         ret = 0;
5414                         goto out;
5415                 }
5416
5417                 /* Perform uRPF check if passed input interface */
5418                 ret = 0;
5419                 rn = (struct radix_node *)ro.ro_rt;
5420                 do {
5421                         rt = (struct rtentry *)rn;
5422                         ifp = rt->rt_ifp;
5423
5424                         if (kif->pfik_ifp == ifp)
5425                                 ret = 1;
5426                         rn = rn_mpath_next(rn);
5427                 } while (check_mpath == 1 && rn != NULL && ret == 0);
5428         } else
5429                 ret = 0;
5430 out:
5431         if (ro.ro_rt != NULL)
5432                 RTFREE(ro.ro_rt);
5433         return (ret);
5434 }
5435 #endif
5436
5437 int
5438 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
5439     int rtableid)
5440 {
5441 #ifdef INET
5442         struct nhop4_basic      nh4;
5443 #endif
5444 #ifdef INET6
5445         struct nhop6_basic      nh6;
5446 #endif
5447         struct ifnet            *ifp;
5448 #ifdef RADIX_MPATH
5449         struct radix_node_head  *rnh;
5450
5451         /* XXX: stick to table 0 for now */
5452         rnh = rt_tables_get_rnh(0, af);
5453         if (rnh != NULL && rn_mpath_capable(rnh))
5454                 return (pf_routable_oldmpath(addr, af, kif, rtableid));
5455 #endif
5456         /*
5457          * Skip check for addresses with embedded interface scope,
5458          * as they would always match anyway.
5459          */
5460         if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
5461                 return (1);
5462
5463         if (af != AF_INET && af != AF_INET6)
5464                 return (0);
5465
5466         /* Skip checks for ipsec interfaces */
5467         if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5468                 return (1);
5469
5470         ifp = NULL;
5471
5472         switch (af) {
5473 #ifdef INET6
5474         case AF_INET6:
5475                 if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0)
5476                         return (0);
5477                 ifp = nh6.nh_ifp;
5478                 break;
5479 #endif
5480 #ifdef INET
5481         case AF_INET:
5482                 if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
5483                         return (0);
5484                 ifp = nh4.nh_ifp;
5485                 break;
5486 #endif
5487         }
5488
5489         /* No interface given, this is a no-route check */
5490         if (kif == NULL)
5491                 return (1);
5492
5493         if (kif->pfik_ifp == NULL)
5494                 return (0);
5495
5496         /* Perform uRPF check if passed input interface */
5497         if (kif->pfik_ifp == ifp)
5498                 return (1);
5499         return (0);
5500 }
5501
5502 #ifdef INET
5503 static void
5504 pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5505     struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp)
5506 {
5507         struct mbuf             *m0, *m1;
5508         struct sockaddr_in      dst;
5509         struct ip               *ip;
5510         struct ifnet            *ifp = NULL;
5511         struct pf_addr           naddr;
5512         struct pf_src_node      *sn = NULL;
5513         int                      error = 0;
5514         uint16_t                 ip_len, ip_off;
5515
5516         KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5517         KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5518             __func__));
5519
5520         if ((pd->pf_mtag == NULL &&
5521             ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5522             pd->pf_mtag->routed++ > 3) {
5523                 m0 = *m;
5524                 *m = NULL;
5525                 goto bad_locked;
5526         }
5527
5528         if (r->rt == PF_DUPTO) {
5529                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5530                         if (s)
5531                                 PF_STATE_UNLOCK(s);
5532                         return;
5533                 }
5534         } else {
5535                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5536                         if (s)
5537                                 PF_STATE_UNLOCK(s);
5538                         return;
5539                 }
5540                 m0 = *m;
5541         }
5542
5543         ip = mtod(m0, struct ip *);
5544
5545         bzero(&dst, sizeof(dst));
5546         dst.sin_family = AF_INET;
5547         dst.sin_len = sizeof(dst);
5548         dst.sin_addr = ip->ip_dst;
5549
5550         bzero(&naddr, sizeof(naddr));
5551
5552         if (TAILQ_EMPTY(&r->rpool.list)) {
5553                 DPFPRINTF(PF_DEBUG_URGENT,
5554                     ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5555                 goto bad_locked;
5556         }
5557         if (s == NULL) {
5558                 pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5559                     &naddr, NULL, &sn);
5560                 if (!PF_AZERO(&naddr, AF_INET))
5561                         dst.sin_addr.s_addr = naddr.v4.s_addr;
5562                 ifp = r->rpool.cur->kif ?
5563                     r->rpool.cur->kif->pfik_ifp : NULL;
5564         } else {
5565                 if (!PF_AZERO(&s->rt_addr, AF_INET))
5566                         dst.sin_addr.s_addr =
5567                             s->rt_addr.v4.s_addr;
5568                 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5569                 PF_STATE_UNLOCK(s);
5570         }
5571         if (ifp == NULL)
5572                 goto bad;
5573
5574         if (oifp != ifp) {
5575                 if (pf_test(PF_OUT, 0, ifp, &m0, inp) != PF_PASS)
5576                         goto bad;
5577                 else if (m0 == NULL)
5578                         goto done;
5579                 if (m0->m_len < sizeof(struct ip)) {
5580                         DPFPRINTF(PF_DEBUG_URGENT,
5581                             ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5582                         goto bad;
5583                 }
5584                 ip = mtod(m0, struct ip *);
5585         }
5586
5587         if (ifp->if_flags & IFF_LOOPBACK)
5588                 m0->m_flags |= M_SKIP_FIREWALL;
5589
5590         ip_len = ntohs(ip->ip_len);
5591         ip_off = ntohs(ip->ip_off);
5592
5593         /* Copied from FreeBSD 10.0-CURRENT ip_output. */
5594         m0->m_pkthdr.csum_flags |= CSUM_IP;
5595         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
5596                 in_delayed_cksum(m0);
5597                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
5598         }
5599 #ifdef SCTP
5600         if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
5601                 sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
5602                 m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
5603         }
5604 #endif
5605
5606         /*
5607          * If small enough for interface, or the interface will take
5608          * care of the fragmentation for us, we can just send directly.
5609          */
5610         if (ip_len <= ifp->if_mtu ||
5611             (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
5612                 ip->ip_sum = 0;
5613                 if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
5614                         ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5615                         m0->m_pkthdr.csum_flags &= ~CSUM_IP;
5616                 }
5617                 m_clrprotoflags(m0);    /* Avoid confusing lower layers. */
5618                 error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5619                 goto done;
5620         }
5621
5622         /* Balk when DF bit is set or the interface didn't support TSO. */
5623         if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5624                 error = EMSGSIZE;
5625                 KMOD_IPSTAT_INC(ips_cantfrag);
5626                 if (r->rt != PF_DUPTO) {
5627                         icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5628                             ifp->if_mtu);
5629                         goto done;
5630                 } else
5631                         goto bad;
5632         }
5633
5634         error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
5635         if (error)
5636                 goto bad;
5637
5638         for (; m0; m0 = m1) {
5639                 m1 = m0->m_nextpkt;
5640                 m0->m_nextpkt = NULL;
5641                 if (error == 0) {
5642                         m_clrprotoflags(m0);
5643                         error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5644                 } else
5645                         m_freem(m0);
5646         }
5647
5648         if (error == 0)
5649                 KMOD_IPSTAT_INC(ips_fragmented);
5650
5651 done:
5652         if (r->rt != PF_DUPTO)
5653                 *m = NULL;
5654         return;
5655
5656 bad_locked:
5657         if (s)
5658                 PF_STATE_UNLOCK(s);
5659 bad:
5660         m_freem(m0);
5661         goto done;
5662 }
5663 #endif /* INET */
5664
5665 #ifdef INET6
5666 static void
5667 pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5668     struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp)
5669 {
5670         struct mbuf             *m0;
5671         struct sockaddr_in6     dst;
5672         struct ip6_hdr          *ip6;
5673         struct ifnet            *ifp = NULL;
5674         struct pf_addr           naddr;
5675         struct pf_src_node      *sn = NULL;
5676
5677         KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5678         KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5679             __func__));
5680
5681         if ((pd->pf_mtag == NULL &&
5682             ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5683             pd->pf_mtag->routed++ > 3) {
5684                 m0 = *m;
5685                 *m = NULL;
5686                 goto bad_locked;
5687         }
5688
5689         if (r->rt == PF_DUPTO) {
5690                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5691                         if (s)
5692                                 PF_STATE_UNLOCK(s);
5693                         return;
5694                 }
5695         } else {
5696                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5697                         if (s)
5698                                 PF_STATE_UNLOCK(s);
5699                         return;
5700                 }
5701                 m0 = *m;
5702         }
5703
5704         ip6 = mtod(m0, struct ip6_hdr *);
5705
5706         bzero(&dst, sizeof(dst));
5707         dst.sin6_family = AF_INET6;
5708         dst.sin6_len = sizeof(dst);
5709         dst.sin6_addr = ip6->ip6_dst;
5710
5711         bzero(&naddr, sizeof(naddr));
5712
5713         if (TAILQ_EMPTY(&r->rpool.list)) {
5714                 DPFPRINTF(PF_DEBUG_URGENT,
5715                     ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5716                 goto bad_locked;
5717         }
5718         if (s == NULL) {
5719                 pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5720                     &naddr, NULL, &sn);
5721                 if (!PF_AZERO(&naddr, AF_INET6))
5722                         PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5723                             &naddr, AF_INET6);
5724                 ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5725         } else {
5726                 if (!PF_AZERO(&s->rt_addr, AF_INET6))
5727                         PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5728                             &s->rt_addr, AF_INET6);
5729                 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5730         }
5731
5732         if (s)
5733                 PF_STATE_UNLOCK(s);
5734
5735         if (ifp == NULL)
5736                 goto bad;
5737
5738         if (oifp != ifp) {
5739                 if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, inp) != PF_PASS)
5740                         goto bad;
5741                 else if (m0 == NULL)
5742                         goto done;
5743                 if (m0->m_len < sizeof(struct ip6_hdr)) {
5744                         DPFPRINTF(PF_DEBUG_URGENT,
5745                             ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5746                             __func__));
5747                         goto bad;
5748                 }
5749                 ip6 = mtod(m0, struct ip6_hdr *);
5750         }
5751
5752         if (ifp->if_flags & IFF_LOOPBACK)
5753                 m0->m_flags |= M_SKIP_FIREWALL;
5754
5755         if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
5756             ~ifp->if_hwassist) {
5757                 uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
5758                 in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
5759                 m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
5760         }
5761
5762         /*
5763          * If the packet is too large for the outgoing interface,
5764          * send back an icmp6 error.
5765          */
5766         if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5767                 dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5768         if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5769                 nd6_output_ifp(ifp, ifp, m0, &dst, NULL);
5770         else {
5771                 in6_ifstat_inc(ifp, ifs6_in_toobig);
5772                 if (r->rt != PF_DUPTO)
5773                         icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5774                 else
5775                         goto bad;
5776         }
5777
5778 done:
5779         if (r->rt != PF_DUPTO)
5780                 *m = NULL;
5781         return;
5782
5783 bad_locked:
5784         if (s)
5785                 PF_STATE_UNLOCK(s);
5786 bad:
5787         m_freem(m0);
5788         goto done;
5789 }
5790 #endif /* INET6 */
5791
5792 /*
5793  * FreeBSD supports cksum offloads for the following drivers.
5794  *  em(4), fxp(4), lge(4), ndis(4), nge(4), re(4), ti(4), txp(4), xl(4)
5795  *
5796  * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5797  *  network driver performed cksum including pseudo header, need to verify
5798  *   csum_data
5799  * CSUM_DATA_VALID :
5800  *  network driver performed cksum, needs to additional pseudo header
5801  *  cksum computation with partial csum_data(i.e. lack of H/W support for
5802  *  pseudo header, for instance hme(4), sk(4) and possibly gem(4))
5803  *
5804  * After validating the cksum of packet, set both flag CSUM_DATA_VALID and
5805  * CSUM_PSEUDO_HDR in order to avoid recomputation of the cksum in upper
5806  * TCP/UDP layer.
5807  * Also, set csum_data to 0xffff to force cksum validation.
5808  */
5809 static int
5810 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5811 {
5812         u_int16_t sum = 0;
5813         int hw_assist = 0;
5814         struct ip *ip;
5815
5816         if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5817                 return (1);
5818         if (m->m_pkthdr.len < off + len)
5819                 return (1);
5820
5821         switch (p) {
5822         case IPPROTO_TCP:
5823                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5824                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5825                                 sum = m->m_pkthdr.csum_data;
5826                         } else {
5827                                 ip = mtod(m, struct ip *);
5828                                 sum = in_pseudo(ip->ip_src.s_addr,
5829                                 ip->ip_dst.s_addr, htonl((u_short)len +
5830                                 m->m_pkthdr.csum_data + IPPROTO_TCP));
5831                         }
5832                         sum ^= 0xffff;
5833                         ++hw_assist;
5834                 }
5835                 break;
5836         case IPPROTO_UDP:
5837                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5838                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5839                                 sum = m->m_pkthdr.csum_data;
5840                         } else {
5841                                 ip = mtod(m, struct ip *);
5842                                 sum = in_pseudo(ip->ip_src.s_addr,
5843                                 ip->ip_dst.s_addr, htonl((u_short)len +
5844                                 m->m_pkthdr.csum_data + IPPROTO_UDP));
5845                         }
5846                         sum ^= 0xffff;
5847                         ++hw_assist;
5848                 }
5849                 break;
5850         case IPPROTO_ICMP:
5851 #ifdef INET6
5852         case IPPROTO_ICMPV6:
5853 #endif /* INET6 */
5854                 break;
5855         default:
5856                 return (1);
5857         }
5858
5859         if (!hw_assist) {
5860                 switch (af) {
5861                 case AF_INET:
5862                         if (p == IPPROTO_ICMP) {
5863                                 if (m->m_len < off)
5864                                         return (1);
5865                                 m->m_data += off;
5866                                 m->m_len -= off;
5867                                 sum = in_cksum(m, len);
5868                                 m->m_data -= off;
5869                                 m->m_len += off;
5870                         } else {
5871                                 if (m->m_len < sizeof(struct ip))
5872                                         return (1);
5873                                 sum = in4_cksum(m, p, off, len);
5874                         }
5875                         break;
5876 #ifdef INET6
5877                 case AF_INET6:
5878                         if (m->m_len < sizeof(struct ip6_hdr))
5879                                 return (1);
5880                         sum = in6_cksum(m, p, off, len);
5881                         break;
5882 #endif /* INET6 */
5883                 default:
5884                         return (1);
5885                 }
5886         }
5887         if (sum) {
5888                 switch (p) {
5889                 case IPPROTO_TCP:
5890                     {
5891                         KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5892                         break;
5893                     }
5894                 case IPPROTO_UDP:
5895                     {
5896                         KMOD_UDPSTAT_INC(udps_badsum);
5897                         break;
5898                     }
5899 #ifdef INET
5900                 case IPPROTO_ICMP:
5901                     {
5902                         KMOD_ICMPSTAT_INC(icps_checksum);
5903                         break;
5904                     }
5905 #endif
5906 #ifdef INET6
5907                 case IPPROTO_ICMPV6:
5908                     {
5909                         KMOD_ICMP6STAT_INC(icp6s_checksum);
5910                         break;
5911                     }
5912 #endif /* INET6 */
5913                 }
5914                 return (1);
5915         } else {
5916                 if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5917                         m->m_pkthdr.csum_flags |=
5918                             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5919                         m->m_pkthdr.csum_data = 0xffff;
5920                 }
5921         }
5922         return (0);
5923 }
5924
5925
5926 #ifdef INET
5927 int
5928 pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5929 {
5930         struct pfi_kif          *kif;
5931         u_short                  action, reason = 0, log = 0;
5932         struct mbuf             *m = *m0;
5933         struct ip               *h = NULL;
5934         struct m_tag            *ipfwtag;
5935         struct pf_rule          *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5936         struct pf_state         *s = NULL;
5937         struct pf_ruleset       *ruleset = NULL;
5938         struct pf_pdesc          pd;
5939         int                      off, dirndx, pqid = 0;
5940
5941         PF_RULES_RLOCK_TRACKER;
5942
5943         M_ASSERTPKTHDR(m);
5944
5945         if (!V_pf_status.running)
5946                 return (PF_PASS);
5947
5948         memset(&pd, 0, sizeof(pd));
5949
5950         kif = (struct pfi_kif *)ifp->if_pf_kif;
5951
5952         if (kif == NULL) {
5953                 DPFPRINTF(PF_DEBUG_URGENT,
5954                     ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5955                 return (PF_DROP);
5956         }
5957         if (kif->pfik_flags & PFI_IFLAG_SKIP)
5958                 return (PF_PASS);
5959
5960         if (m->m_flags & M_SKIP_FIREWALL)
5961                 return (PF_PASS);
5962
5963         pd.pf_mtag = pf_find_mtag(m);
5964
5965         PF_RULES_RLOCK();
5966
5967         if (ip_divert_ptr != NULL &&
5968             ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5969                 struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
5970                 if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
5971                         if (pd.pf_mtag == NULL &&
5972                             ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5973                                 action = PF_DROP;
5974                                 goto done;
5975                         }
5976                         pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5977                         m_tag_delete(m, ipfwtag);
5978                 }
5979                 if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5980                         m->m_flags |= M_FASTFWD_OURS;
5981                         pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5982                 }
5983         } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5984                 /* We do IP header normalization and packet reassembly here */
5985                 action = PF_DROP;
5986                 goto done;
5987         }
5988         m = *m0;        /* pf_normalize messes with m0 */
5989         h = mtod(m, struct ip *);
5990
5991         off = h->ip_hl << 2;
5992         if (off < (int)sizeof(struct ip)) {
5993                 action = PF_DROP;
5994                 REASON_SET(&reason, PFRES_SHORT);
5995                 log = 1;
5996                 goto done;
5997         }
5998
5999         pd.src = (struct pf_addr *)&h->ip_src;
6000         pd.dst = (struct pf_addr *)&h->ip_dst;
6001         pd.sport = pd.dport = NULL;
6002         pd.ip_sum = &h->ip_sum;
6003         pd.proto_sum = NULL;
6004         pd.proto = h->ip_p;
6005         pd.dir = dir;
6006         pd.sidx = (dir == PF_IN) ? 0 : 1;
6007         pd.didx = (dir == PF_IN) ? 1 : 0;
6008         pd.af = AF_INET;
6009         pd.tos = h->ip_tos & ~IPTOS_ECN_MASK;
6010         pd.tot_len = ntohs(h->ip_len);
6011
6012         /* handle fragments that didn't get reassembled by normalization */
6013         if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
6014                 action = pf_test_fragment(&r, dir, kif, m, h,
6015                     &pd, &a, &ruleset);
6016                 goto done;
6017         }
6018
6019         switch (h->ip_p) {
6020
6021         case IPPROTO_TCP: {
6022                 struct tcphdr   th;
6023
6024                 pd.hdr.tcp = &th;
6025                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6026                     &action, &reason, AF_INET)) {
6027                         log = action != PF_PASS;
6028                         goto done;
6029                 }
6030                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6031                 if ((th.th_flags & TH_ACK) && pd.p_len == 0)
6032                         pqid = 1;
6033                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6034                 if (action == PF_DROP)
6035                         goto done;
6036                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6037                     &reason);
6038                 if (action == PF_PASS) {
6039                         if (V_pfsync_update_state_ptr != NULL)
6040                                 V_pfsync_update_state_ptr(s);
6041                         r = s->rule.ptr;
6042                         a = s->anchor.ptr;
6043                         log = s->log;
6044                 } else if (s == NULL)
6045                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6046                             &a, &ruleset, inp);
6047                 break;
6048         }
6049
6050         case IPPROTO_UDP: {
6051                 struct udphdr   uh;
6052
6053                 pd.hdr.udp = &uh;
6054                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6055                     &action, &reason, AF_INET)) {
6056                         log = action != PF_PASS;
6057                         goto done;
6058                 }
6059                 if (uh.uh_dport == 0 ||
6060                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6061                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6062                         action = PF_DROP;
6063                         REASON_SET(&reason, PFRES_SHORT);
6064                         goto done;
6065                 }
6066                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6067                 if (action == PF_PASS) {
6068                         if (V_pfsync_update_state_ptr != NULL)
6069                                 V_pfsync_update_state_ptr(s);
6070                         r = s->rule.ptr;
6071                         a = s->anchor.ptr;
6072                         log = s->log;
6073                 } else if (s == NULL)
6074                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6075                             &a, &ruleset, inp);
6076                 break;
6077         }
6078
6079         case IPPROTO_ICMP: {
6080                 struct icmp     ih;
6081
6082                 pd.hdr.icmp = &ih;
6083                 if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
6084                     &action, &reason, AF_INET)) {
6085                         log = action != PF_PASS;
6086                         goto done;
6087                 }
6088                 action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
6089                     &reason);
6090                 if (action == PF_PASS) {
6091                         if (V_pfsync_update_state_ptr != NULL)
6092                                 V_pfsync_update_state_ptr(s);
6093                         r = s->rule.ptr;
6094                         a = s->anchor.ptr;
6095                         log = s->log;
6096                 } else if (s == NULL)
6097                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6098                             &a, &ruleset, inp);
6099                 break;
6100         }
6101
6102 #ifdef INET6
6103         case IPPROTO_ICMPV6: {
6104                 action = PF_DROP;
6105                 DPFPRINTF(PF_DEBUG_MISC,
6106                     ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
6107                 goto done;
6108         }
6109 #endif
6110
6111         default:
6112                 action = pf_test_state_other(&s, dir, kif, m, &pd);
6113                 if (action == PF_PASS) {
6114                         if (V_pfsync_update_state_ptr != NULL)
6115                                 V_pfsync_update_state_ptr(s);
6116                         r = s->rule.ptr;
6117                         a = s->anchor.ptr;
6118                         log = s->log;
6119                 } else if (s == NULL)
6120                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6121                             &a, &ruleset, inp);
6122                 break;
6123         }
6124
6125 done:
6126         PF_RULES_RUNLOCK();
6127         if (action == PF_PASS && h->ip_hl > 5 &&
6128             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6129                 action = PF_DROP;
6130                 REASON_SET(&reason, PFRES_IPOPTIONS);
6131                 log = r->log;
6132                 DPFPRINTF(PF_DEBUG_MISC,
6133                     ("pf: dropping packet with ip options\n"));
6134         }
6135
6136         if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6137                 action = PF_DROP;
6138                 REASON_SET(&reason, PFRES_MEMORY);
6139         }
6140         if (r->rtableid >= 0)
6141                 M_SETFIB(m, r->rtableid);
6142
6143         if (r->scrub_flags & PFSTATE_SETPRIO) {
6144                 if (pd.tos & IPTOS_LOWDELAY)
6145                         pqid = 1;
6146                 if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
6147                         action = PF_DROP;
6148                         REASON_SET(&reason, PFRES_MEMORY);
6149                         log = 1;
6150                         DPFPRINTF(PF_DEBUG_MISC,
6151                             ("pf: failed to allocate 802.1q mtag\n"));
6152                 }
6153         }
6154
6155 #ifdef ALTQ
6156         if (action == PF_PASS && r->qid) {
6157                 if (pd.pf_mtag == NULL &&
6158                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6159                         action = PF_DROP;
6160                         REASON_SET(&reason, PFRES_MEMORY);
6161                 } else {
6162                         if (s != NULL)
6163                                 pd.pf_mtag->qid_hash = pf_state_hash(s);
6164                         if (pqid || (pd.tos & IPTOS_LOWDELAY))
6165                                 pd.pf_mtag->qid = r->pqid;
6166                         else
6167                                 pd.pf_mtag->qid = r->qid;
6168                         /* Add hints for ecn. */
6169                         pd.pf_mtag->hdr = h;
6170                 }
6171
6172         }
6173 #endif /* ALTQ */
6174
6175         /*
6176          * connections redirected to loopback should not match sockets
6177          * bound specifically to loopback due to security implications,
6178          * see tcp_input() and in_pcblookup_listen().
6179          */
6180         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6181             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6182             (s->nat_rule.ptr->action == PF_RDR ||
6183             s->nat_rule.ptr->action == PF_BINAT) &&
6184             IN_LOOPBACK(ntohl(pd.dst->v4.s_addr)))
6185                 m->m_flags |= M_SKIP_FIREWALL;
6186
6187         if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
6188             !PACKET_LOOPED(&pd)) {
6189
6190                 ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
6191                     sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
6192                 if (ipfwtag != NULL) {
6193                         ((struct ipfw_rule_ref *)(ipfwtag+1))->info =
6194                             ntohs(r->divert.port);
6195                         ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
6196
6197                         if (s)
6198                                 PF_STATE_UNLOCK(s);
6199
6200                         m_tag_prepend(m, ipfwtag);
6201                         if (m->m_flags & M_FASTFWD_OURS) {
6202                                 if (pd.pf_mtag == NULL &&
6203                                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6204                                         action = PF_DROP;
6205                                         REASON_SET(&reason, PFRES_MEMORY);
6206                                         log = 1;
6207                                         DPFPRINTF(PF_DEBUG_MISC,
6208                                             ("pf: failed to allocate tag\n"));
6209                                 } else {
6210                                         pd.pf_mtag->flags |=
6211                                             PF_FASTFWD_OURS_PRESENT;
6212                                         m->m_flags &= ~M_FASTFWD_OURS;
6213                                 }
6214                         }
6215                         ip_divert_ptr(*m0, dir == PF_IN);
6216                         *m0 = NULL;
6217
6218                         return (action);
6219                 } else {
6220                         /* XXX: ipfw has the same behaviour! */
6221                         action = PF_DROP;
6222                         REASON_SET(&reason, PFRES_MEMORY);
6223                         log = 1;
6224                         DPFPRINTF(PF_DEBUG_MISC,
6225                             ("pf: failed to allocate divert tag\n"));
6226                 }
6227         }
6228
6229         if (log) {
6230                 struct pf_rule *lr;
6231
6232                 if (s != NULL && s->nat_rule.ptr != NULL &&
6233                     s->nat_rule.ptr->log & PF_LOG_ALL)
6234                         lr = s->nat_rule.ptr;
6235                 else
6236                         lr = r;
6237                 PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
6238                     (s == NULL));
6239         }
6240
6241         kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6242         kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
6243
6244         if (action == PF_PASS || r->action == PF_DROP) {
6245                 dirndx = (dir == PF_OUT);
6246                 r->packets[dirndx]++;
6247                 r->bytes[dirndx] += pd.tot_len;
6248                 if (a != NULL) {
6249                         a->packets[dirndx]++;
6250                         a->bytes[dirndx] += pd.tot_len;
6251                 }
6252                 if (s != NULL) {
6253                         if (s->nat_rule.ptr != NULL) {
6254                                 s->nat_rule.ptr->packets[dirndx]++;
6255                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6256                         }
6257                         if (s->src_node != NULL) {
6258                                 s->src_node->packets[dirndx]++;
6259                                 s->src_node->bytes[dirndx] += pd.tot_len;
6260                         }
6261                         if (s->nat_src_node != NULL) {
6262                                 s->nat_src_node->packets[dirndx]++;
6263                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6264                         }
6265                         dirndx = (dir == s->direction) ? 0 : 1;
6266                         s->packets[dirndx]++;
6267                         s->bytes[dirndx] += pd.tot_len;
6268                 }
6269                 tr = r;
6270                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6271                 if (nr != NULL && r == &V_pf_default_rule)
6272                         tr = nr;
6273                 if (tr->src.addr.type == PF_ADDR_TABLE)
6274                         pfr_update_stats(tr->src.addr.p.tbl,
6275                             (s == NULL) ? pd.src :
6276                             &s->key[(s->direction == PF_IN)]->
6277                                 addr[(s->direction == PF_OUT)],
6278                             pd.af, pd.tot_len, dir == PF_OUT,
6279                             r->action == PF_PASS, tr->src.neg);
6280                 if (tr->dst.addr.type == PF_ADDR_TABLE)
6281                         pfr_update_stats(tr->dst.addr.p.tbl,
6282                             (s == NULL) ? pd.dst :
6283                             &s->key[(s->direction == PF_IN)]->
6284                                 addr[(s->direction == PF_IN)],
6285                             pd.af, pd.tot_len, dir == PF_OUT,
6286                             r->action == PF_PASS, tr->dst.neg);
6287         }
6288
6289         switch (action) {
6290         case PF_SYNPROXY_DROP:
6291                 m_freem(*m0);
6292         case PF_DEFER:
6293                 *m0 = NULL;
6294                 action = PF_PASS;
6295                 break;
6296         case PF_DROP:
6297                 m_freem(*m0);
6298                 *m0 = NULL;
6299                 break;
6300         default:
6301                 /* pf_route() returns unlocked. */
6302                 if (r->rt) {
6303                         pf_route(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
6304                         return (action);
6305                 }
6306                 break;
6307         }
6308         if (s)
6309                 PF_STATE_UNLOCK(s);
6310
6311         return (action);
6312 }
6313 #endif /* INET */
6314
6315 #ifdef INET6
6316 int
6317 pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
6318 {
6319         struct pfi_kif          *kif;
6320         u_short                  action, reason = 0, log = 0;
6321         struct mbuf             *m = *m0, *n = NULL;
6322         struct m_tag            *mtag;
6323         struct ip6_hdr          *h = NULL;
6324         struct pf_rule          *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
6325         struct pf_state         *s = NULL;
6326         struct pf_ruleset       *ruleset = NULL;
6327         struct pf_pdesc          pd;
6328         int                      off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0;
6329
6330         PF_RULES_RLOCK_TRACKER;
6331         M_ASSERTPKTHDR(m);
6332
6333         if (!V_pf_status.running)
6334                 return (PF_PASS);
6335
6336         memset(&pd, 0, sizeof(pd));
6337         pd.pf_mtag = pf_find_mtag(m);
6338
6339         if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
6340                 return (PF_PASS);
6341
6342         kif = (struct pfi_kif *)ifp->if_pf_kif;
6343         if (kif == NULL) {
6344                 DPFPRINTF(PF_DEBUG_URGENT,
6345                     ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
6346                 return (PF_DROP);
6347         }
6348         if (kif->pfik_flags & PFI_IFLAG_SKIP)
6349                 return (PF_PASS);
6350
6351         if (m->m_flags & M_SKIP_FIREWALL)
6352                 return (PF_PASS);
6353
6354         PF_RULES_RLOCK();
6355
6356         /* We do IP header normalization and packet reassembly here */
6357         if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
6358                 action = PF_DROP;
6359                 goto done;
6360         }
6361         m = *m0;        /* pf_normalize messes with m0 */
6362         h = mtod(m, struct ip6_hdr *);
6363
6364         /*
6365          * we do not support jumbogram.  if we keep going, zero ip6_plen
6366          * will do something bad, so drop the packet for now.
6367          */
6368         if (htons(h->ip6_plen) == 0) {
6369                 action = PF_DROP;
6370                 REASON_SET(&reason, PFRES_NORM);        /*XXX*/
6371                 goto done;
6372         }
6373
6374         pd.src = (struct pf_addr *)&h->ip6_src;
6375         pd.dst = (struct pf_addr *)&h->ip6_dst;
6376         pd.sport = pd.dport = NULL;
6377         pd.ip_sum = NULL;
6378         pd.proto_sum = NULL;
6379         pd.dir = dir;
6380         pd.sidx = (dir == PF_IN) ? 0 : 1;
6381         pd.didx = (dir == PF_IN) ? 1 : 0;
6382         pd.af = AF_INET6;
6383         pd.tos = 0;
6384         pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
6385
6386         off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
6387         pd.proto = h->ip6_nxt;
6388         do {
6389                 switch (pd.proto) {
6390                 case IPPROTO_FRAGMENT:
6391                         action = pf_test_fragment(&r, dir, kif, m, h,
6392                             &pd, &a, &ruleset);
6393                         if (action == PF_DROP)
6394                                 REASON_SET(&reason, PFRES_FRAG);
6395                         goto done;
6396                 case IPPROTO_ROUTING: {
6397                         struct ip6_rthdr rthdr;
6398
6399                         if (rh_cnt++) {
6400                                 DPFPRINTF(PF_DEBUG_MISC,
6401                                     ("pf: IPv6 more than one rthdr\n"));
6402                                 action = PF_DROP;
6403                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6404                                 log = 1;
6405                                 goto done;
6406                         }
6407                         if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
6408                             &reason, pd.af)) {
6409                                 DPFPRINTF(PF_DEBUG_MISC,
6410                                     ("pf: IPv6 short rthdr\n"));
6411                                 action = PF_DROP;
6412                                 REASON_SET(&reason, PFRES_SHORT);
6413                                 log = 1;
6414                                 goto done;
6415                         }
6416                         if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
6417                                 DPFPRINTF(PF_DEBUG_MISC,
6418                                     ("pf: IPv6 rthdr0\n"));
6419                                 action = PF_DROP;
6420                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6421                                 log = 1;
6422                                 goto done;
6423                         }
6424                         /* FALLTHROUGH */
6425                 }
6426                 case IPPROTO_AH:
6427                 case IPPROTO_HOPOPTS:
6428                 case IPPROTO_DSTOPTS: {
6429                         /* get next header and header length */
6430                         struct ip6_ext  opt6;
6431
6432                         if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6433                             NULL, &reason, pd.af)) {
6434                                 DPFPRINTF(PF_DEBUG_MISC,
6435                                     ("pf: IPv6 short opt\n"));
6436                                 action = PF_DROP;
6437                                 log = 1;
6438                                 goto done;
6439                         }
6440                         if (pd.proto == IPPROTO_AH)
6441                                 off += (opt6.ip6e_len + 2) * 4;
6442                         else
6443                                 off += (opt6.ip6e_len + 1) * 8;
6444                         pd.proto = opt6.ip6e_nxt;
6445                         /* goto the next header */
6446                         break;
6447                 }
6448                 default:
6449                         terminal++;
6450                         break;
6451                 }
6452         } while (!terminal);
6453
6454         /* if there's no routing header, use unmodified mbuf for checksumming */
6455         if (!n)
6456                 n = m;
6457
6458         switch (pd.proto) {
6459
6460         case IPPROTO_TCP: {
6461                 struct tcphdr   th;
6462
6463                 pd.hdr.tcp = &th;
6464                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6465                     &action, &reason, AF_INET6)) {
6466                         log = action != PF_PASS;
6467                         goto done;
6468                 }
6469                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6470                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6471                 if (action == PF_DROP)
6472                         goto done;
6473                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6474                     &reason);
6475                 if (action == PF_PASS) {
6476                         if (V_pfsync_update_state_ptr != NULL)
6477                                 V_pfsync_update_state_ptr(s);
6478                         r = s->rule.ptr;
6479                         a = s->anchor.ptr;
6480                         log = s->log;
6481                 } else if (s == NULL)
6482                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6483                             &a, &ruleset, inp);
6484                 break;
6485         }
6486
6487         case IPPROTO_UDP: {
6488                 struct udphdr   uh;
6489
6490                 pd.hdr.udp = &uh;
6491                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6492                     &action, &reason, AF_INET6)) {
6493                         log = action != PF_PASS;
6494                         goto done;
6495                 }
6496                 if (uh.uh_dport == 0 ||
6497                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6498                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6499                         action = PF_DROP;
6500                         REASON_SET(&reason, PFRES_SHORT);
6501                         goto done;
6502                 }
6503                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6504                 if (action == PF_PASS) {
6505                         if (V_pfsync_update_state_ptr != NULL)
6506                                 V_pfsync_update_state_ptr(s);
6507                         r = s->rule.ptr;
6508                         a = s->anchor.ptr;
6509                         log = s->log;
6510                 } else if (s == NULL)
6511                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6512                             &a, &ruleset, inp);
6513                 break;
6514         }
6515
6516         case IPPROTO_ICMP: {
6517                 action = PF_DROP;
6518                 DPFPRINTF(PF_DEBUG_MISC,
6519                     ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6520                 goto done;
6521         }
6522
6523         case IPPROTO_ICMPV6: {
6524                 struct icmp6_hdr        ih;
6525
6526                 pd.hdr.icmp6 = &ih;
6527                 if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6528                     &action, &reason, AF_INET6)) {
6529                         log = action != PF_PASS;
6530                         goto done;
6531                 }
6532                 action = pf_test_state_icmp(&s, dir, kif,
6533                     m, off, h, &pd, &reason);
6534                 if (action == PF_PASS) {
6535                         if (V_pfsync_update_state_ptr != NULL)
6536                                 V_pfsync_update_state_ptr(s);
6537                         r = s->rule.ptr;
6538                         a = s->anchor.ptr;
6539                         log = s->log;
6540                 } else if (s == NULL)
6541                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6542                             &a, &ruleset, inp);
6543                 break;
6544         }
6545
6546         default:
6547                 action = pf_test_state_other(&s, dir, kif, m, &pd);
6548                 if (action == PF_PASS) {
6549                         if (V_pfsync_update_state_ptr != NULL)
6550                                 V_pfsync_update_state_ptr(s);
6551                         r = s->rule.ptr;
6552                         a = s->anchor.ptr;
6553                         log = s->log;
6554                 } else if (s == NULL)
6555                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6556                             &a, &ruleset, inp);
6557                 break;
6558         }
6559
6560 done:
6561         PF_RULES_RUNLOCK();
6562         if (n != m) {
6563                 m_freem(n);
6564                 n = NULL;
6565         }
6566
6567         /* handle dangerous IPv6 extension headers. */
6568         if (action == PF_PASS && rh_cnt &&
6569             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6570                 action = PF_DROP;
6571                 REASON_SET(&reason, PFRES_IPOPTIONS);
6572                 log = r->log;
6573                 DPFPRINTF(PF_DEBUG_MISC,
6574                     ("pf: dropping packet with dangerous v6 headers\n"));
6575         }
6576
6577         if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6578                 action = PF_DROP;
6579                 REASON_SET(&reason, PFRES_MEMORY);
6580         }
6581         if (r->rtableid >= 0)
6582                 M_SETFIB(m, r->rtableid);
6583
6584         if (r->scrub_flags & PFSTATE_SETPRIO) {
6585                 if (pd.tos & IPTOS_LOWDELAY)
6586                         pqid = 1;
6587                 if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
6588                         action = PF_DROP;
6589                         REASON_SET(&reason, PFRES_MEMORY);
6590                         log = 1;
6591                         DPFPRINTF(PF_DEBUG_MISC,
6592                             ("pf: failed to allocate 802.1q mtag\n"));
6593                 }
6594         }
6595
6596 #ifdef ALTQ
6597         if (action == PF_PASS && r->qid) {
6598                 if (pd.pf_mtag == NULL &&
6599                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6600                         action = PF_DROP;
6601                         REASON_SET(&reason, PFRES_MEMORY);
6602                 } else {
6603                         if (s != NULL)
6604                                 pd.pf_mtag->qid_hash = pf_state_hash(s);
6605                         if (pd.tos & IPTOS_LOWDELAY)
6606                                 pd.pf_mtag->qid = r->pqid;
6607                         else
6608                                 pd.pf_mtag->qid = r->qid;
6609                         /* Add hints for ecn. */
6610                         pd.pf_mtag->hdr = h;
6611                 }
6612         }
6613 #endif /* ALTQ */
6614
6615         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6616             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6617             (s->nat_rule.ptr->action == PF_RDR ||
6618             s->nat_rule.ptr->action == PF_BINAT) &&
6619             IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6620                 m->m_flags |= M_SKIP_FIREWALL;
6621
6622         /* XXX: Anybody working on it?! */
6623         if (r->divert.port)
6624                 printf("pf: divert(9) is not supported for IPv6\n");
6625
6626         if (log) {
6627                 struct pf_rule *lr;
6628
6629                 if (s != NULL && s->nat_rule.ptr != NULL &&
6630                     s->nat_rule.ptr->log & PF_LOG_ALL)
6631                         lr = s->nat_rule.ptr;
6632                 else
6633                         lr = r;
6634                 PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6635                     &pd, (s == NULL));
6636         }
6637
6638         kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6639         kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6640
6641         if (action == PF_PASS || r->action == PF_DROP) {
6642                 dirndx = (dir == PF_OUT);
6643                 r->packets[dirndx]++;
6644                 r->bytes[dirndx] += pd.tot_len;
6645                 if (a != NULL) {
6646                         a->packets[dirndx]++;
6647                         a->bytes[dirndx] += pd.tot_len;
6648                 }
6649                 if (s != NULL) {
6650                         if (s->nat_rule.ptr != NULL) {
6651                                 s->nat_rule.ptr->packets[dirndx]++;
6652                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6653                         }
6654                         if (s->src_node != NULL) {
6655                                 s->src_node->packets[dirndx]++;
6656                                 s->src_node->bytes[dirndx] += pd.tot_len;
6657                         }
6658                         if (s->nat_src_node != NULL) {
6659                                 s->nat_src_node->packets[dirndx]++;
6660                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6661                         }
6662                         dirndx = (dir == s->direction) ? 0 : 1;
6663                         s->packets[dirndx]++;
6664                         s->bytes[dirndx] += pd.tot_len;
6665                 }
6666                 tr = r;
6667                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6668                 if (nr != NULL && r == &V_pf_default_rule)
6669                         tr = nr;
6670                 if (tr->src.addr.type == PF_ADDR_TABLE)
6671                         pfr_update_stats(tr->src.addr.p.tbl,
6672                             (s == NULL) ? pd.src :
6673                             &s->key[(s->direction == PF_IN)]->addr[0],
6674                             pd.af, pd.tot_len, dir == PF_OUT,
6675                             r->action == PF_PASS, tr->src.neg);
6676                 if (tr->dst.addr.type == PF_ADDR_TABLE)
6677                         pfr_update_stats(tr->dst.addr.p.tbl,
6678                             (s == NULL) ? pd.dst :
6679                             &s->key[(s->direction == PF_IN)]->addr[1],
6680                             pd.af, pd.tot_len, dir == PF_OUT,
6681                             r->action == PF_PASS, tr->dst.neg);
6682         }
6683
6684         switch (action) {
6685         case PF_SYNPROXY_DROP:
6686                 m_freem(*m0);
6687         case PF_DEFER:
6688                 *m0 = NULL;
6689                 action = PF_PASS;
6690                 break;
6691         case PF_DROP:
6692                 m_freem(*m0);
6693                 *m0 = NULL;
6694                 break;
6695         default:
6696                 /* pf_route6() returns unlocked. */
6697                 if (r->rt) {
6698                         pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
6699                         return (action);
6700                 }
6701                 break;
6702         }
6703
6704         if (s)
6705                 PF_STATE_UNLOCK(s);
6706
6707         /* If reassembled packet passed, create new fragments. */
6708         if (action == PF_PASS && *m0 && (pflags & PFIL_FWD) &&
6709             (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL)
6710                 action = pf_refragment6(ifp, m0, mtag);
6711
6712         return (action);
6713 }
6714 #endif /* INET6 */