sys/contrib/pf/net/pf.c
1 /*      $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ */
2
3 /*
4  * Copyright (c) 2001 Daniel Hartmeier
5  * Copyright (c) 2002 - 2008 Henning Brauer
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  *    - Redistributions of source code must retain the above copyright
13  *      notice, this list of conditions and the following disclaimer.
14  *    - Redistributions in binary form must reproduce the above
15  *      copyright notice, this list of conditions and the following
16  *      disclaimer in the documentation and/or other materials provided
17  *      with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
22  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
23  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
25  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
26  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  *
32  * Effort sponsored in part by the Defense Advanced Research Projects
33  * Agency (DARPA) and Air Force Research Laboratory, Air Force
34  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
35  *
36  */
37
38 #include <sys/cdefs.h>
39
40 __FBSDID("$FreeBSD$");
41
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_bpf.h"
45 #include "opt_pf.h"
46
47 #include <sys/param.h>
48 #include <sys/bus.h>
49 #include <sys/endian.h>
50 #include <sys/hash.h>
51 #include <sys/interrupt.h>
52 #include <sys/kernel.h>
53 #include <sys/kthread.h>
54 #include <sys/limits.h>
55 #include <sys/mbuf.h>
56 #include <sys/md5.h>
57 #include <sys/random.h>
58 #include <sys/refcount.h>
59 #include <sys/socket.h>
60 #include <sys/sysctl.h>
61 #include <sys/taskqueue.h>
62 #include <sys/ucred.h>
63
64 #include <net/if.h>
65 #include <net/if_types.h>
66 #include <net/route.h>
67 #include <net/radix_mpath.h>
68 #include <net/vnet.h>
69
70 #include <net/pfvar.h>
71 #include <net/pf_mtag.h>
72 #include <net/if_pflog.h>
73 #include <net/if_pfsync.h>
74
75 #include <netinet/in_pcb.h>
76 #include <netinet/in_var.h>
77 #include <netinet/ip.h>
78 #include <netinet/ip_fw.h>
79 #include <netinet/ip_icmp.h>
80 #include <netinet/icmp_var.h>
81 #include <netinet/ip_var.h>
82 #include <netinet/ipfw/ip_fw_private.h> /* XXX: only for DIR_IN/DIR_OUT */
83 #include <netinet/tcp.h>
84 #include <netinet/tcp_fsm.h>
85 #include <netinet/tcp_seq.h>
86 #include <netinet/tcp_timer.h>
87 #include <netinet/tcp_var.h>
88 #include <netinet/udp.h>
89 #include <netinet/udp_var.h>
90
91 #ifdef INET6
92 #include <netinet/ip6.h>
93 #include <netinet/icmp6.h>
94 #include <netinet6/nd6.h>
95 #include <netinet6/ip6_var.h>
96 #include <netinet6/in6_pcb.h>
97 #endif /* INET6 */
98
99 #include <machine/in_cksum.h>
100 #include <security/mac/mac_framework.h>
101
102 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
103
104 /*
105  * Global variables
106  */
107
108 /* state tables */
109 VNET_DEFINE(struct pf_altqqueue,         pf_altqs[2]);
110 VNET_DEFINE(struct pf_palist,            pf_pabuf);
111 VNET_DEFINE(struct pf_altqqueue *,       pf_altqs_active);
112 VNET_DEFINE(struct pf_altqqueue *,       pf_altqs_inactive);
113 VNET_DEFINE(struct pf_status,            pf_status);
114
115 VNET_DEFINE(u_int32_t,                   ticket_altqs_active);
116 VNET_DEFINE(u_int32_t,                   ticket_altqs_inactive);
117 VNET_DEFINE(int,                         altqs_inactive_open);
118 VNET_DEFINE(u_int32_t,                   ticket_pabuf);
119
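/*
 * Secret state used by pf_tcp_iss() when generating initial sequence
 * numbers for the TCP proxy (synproxy) handshake; pf_tcp_secret_init
 * tracks whether the secret has been set up yet.
 */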
120 VNET_DEFINE(MD5_CTX,                     pf_tcp_secret_ctx);
121 #define V_pf_tcp_secret_ctx              VNET(pf_tcp_secret_ctx)
122 VNET_DEFINE(u_char,                      pf_tcp_secret[16]);
123 #define V_pf_tcp_secret                  VNET(pf_tcp_secret)
124 VNET_DEFINE(int,                         pf_tcp_secret_init);
125 #define V_pf_tcp_secret_init             VNET(pf_tcp_secret_init)
126 VNET_DEFINE(int,                         pf_tcp_iss_off);
127 #define V_pf_tcp_iss_off                 VNET(pf_tcp_iss_off)
128
129 struct pf_anchor_stackframe {
130         struct pf_ruleset               *rs;
131         struct pf_rule                  *r;
132         struct pf_anchor_node           *parent;
133         struct pf_anchor                *child;
134 };
135 VNET_DEFINE(struct pf_anchor_stackframe, pf_anchor_stack[64]);
136 #define V_pf_anchor_stack                VNET(pf_anchor_stack)
137
138 /*
139  * Queue for pf_intr() sends.
140  */
141 static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
142 struct pf_send_entry {
143         STAILQ_ENTRY(pf_send_entry)     pfse_next;
144         struct mbuf                     *pfse_m;
145         enum {
146                 PFSE_IP,
147                 PFSE_IP6,
148                 PFSE_ICMP,
149                 PFSE_ICMP6,
150         }                               pfse_type;
151         union {
152                 struct route            ro;
153                 struct {
154                         int             type;
155                         int             code;
156                         int             mtu;
157                 } icmpopts;
158         } u;
159 #define pfse_ro         u.ro
160 #define pfse_icmp_type  u.icmpopts.type
161 #define pfse_icmp_code  u.icmpopts.code
162 #define pfse_icmp_mtu   u.icmpopts.mtu
163 };
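/*
 * Packets queued via pf_send() (e.g. TCP RSTs and ICMP errors) are not
 * transmitted from the filtering path itself; they are put on
 * V_pf_sendqueue and sent later from the pf_intr() software interrupt.
 */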
164
165 STAILQ_HEAD(pf_send_head, pf_send_entry);
166 static VNET_DEFINE(struct pf_send_head, pf_sendqueue);
167 #define V_pf_sendqueue  VNET(pf_sendqueue)
168
169 static struct mtx pf_sendqueue_mtx;
170 #define PF_SENDQ_LOCK()         mtx_lock(&pf_sendqueue_mtx)
171 #define PF_SENDQ_UNLOCK()       mtx_unlock(&pf_sendqueue_mtx)
172
173 /*
174  * Queue for pf_flush_task() tasks.
175  */
176 struct pf_flush_entry {
177         SLIST_ENTRY(pf_flush_entry)     next;
178         struct pf_addr                  addr;
179         sa_family_t                     af;
180         uint8_t                         dir;
181         struct pf_rule                  *rule;  /* never dereferenced */
182 };
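/*
 * When a source exceeds its connection limits and the matching rule has
 * the "flush" option, pf_src_connlimit() queues the offending address
 * here and pf_flush_task() later marks all states from that source for
 * purging.
 */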
183
184 SLIST_HEAD(pf_flush_head, pf_flush_entry);
185 static VNET_DEFINE(struct pf_flush_head, pf_flushqueue);
186 #define V_pf_flushqueue VNET(pf_flushqueue)
187 static VNET_DEFINE(struct task, pf_flushtask);
188 #define V_pf_flushtask  VNET(pf_flushtask)
189
190 static struct mtx pf_flushqueue_mtx;
191 #define PF_FLUSHQ_LOCK()        mtx_lock(&pf_flushqueue_mtx)
192 #define PF_FLUSHQ_UNLOCK()      mtx_unlock(&pf_flushqueue_mtx)
193
194 VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
195 struct mtx pf_unlnkdrules_mtx;
196
197 static VNET_DEFINE(uma_zone_t,  pf_sources_z);
198 #define V_pf_sources_z  VNET(pf_sources_z)
199 static VNET_DEFINE(uma_zone_t,  pf_mtag_z);
200 #define V_pf_mtag_z     VNET(pf_mtag_z)
201 VNET_DEFINE(uma_zone_t,  pf_state_z);
202 VNET_DEFINE(uma_zone_t,  pf_state_key_z);
203
204 VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
205 #define PFID_CPUBITS    8
206 #define PFID_CPUSHIFT   (sizeof(uint64_t) * NBBY - PFID_CPUBITS)
207 #define PFID_CPUMASK    ((uint64_t)((1 << PFID_CPUBITS) - 1) << PFID_CPUSHIFT)
208 #define PFID_MAXID      (~PFID_CPUMASK)
209 CTASSERT((1 << PFID_CPUBITS) > MAXCPU);
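/*
 * 64-bit state IDs are allocated per-CPU without locking: the top
 * PFID_CPUBITS bits hold the CPU number and the remaining bits a per-CPU
 * counter (see pf_state_insert()), so two CPUs can never hand out the
 * same ID.
 */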
210
211 static void              pf_src_tree_remove_state(struct pf_state *);
212 static void              pf_init_threshold(struct pf_threshold *, u_int32_t,
213                             u_int32_t);
214 static void              pf_add_threshold(struct pf_threshold *);
215 static int               pf_check_threshold(struct pf_threshold *);
216
217 static void              pf_change_ap(struct pf_addr *, u_int16_t *,
218                             u_int16_t *, u_int16_t *, struct pf_addr *,
219                             u_int16_t, u_int8_t, sa_family_t);
220 static int               pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
221                             struct tcphdr *, struct pf_state_peer *);
222 static void              pf_change_icmp(struct pf_addr *, u_int16_t *,
223                             struct pf_addr *, struct pf_addr *, u_int16_t,
224                             u_int16_t *, u_int16_t *, u_int16_t *,
225                             u_int16_t *, u_int8_t, sa_family_t);
226 static void              pf_send_tcp(struct mbuf *,
227                             const struct pf_rule *, sa_family_t,
228                             const struct pf_addr *, const struct pf_addr *,
229                             u_int16_t, u_int16_t, u_int32_t, u_int32_t,
230                             u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
231                             u_int16_t, struct ifnet *);
232 static void              pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
233                             sa_family_t, struct pf_rule *);
234 static void              pf_detach_state(struct pf_state *);
235 static int               pf_state_key_attach(struct pf_state_key *,
236                             struct pf_state_key *, struct pf_state *);
237 static void              pf_state_key_detach(struct pf_state *, int);
238 static int               pf_state_key_ctor(void *, int, void *, int);
239 static u_int32_t         pf_tcp_iss(struct pf_pdesc *);
240 static int               pf_test_rule(struct pf_rule **, struct pf_state **,
241                             int, struct pfi_kif *, struct mbuf *, int,
242                             struct pf_pdesc *, struct pf_rule **,
243                             struct pf_ruleset **, struct inpcb *);
244 static int               pf_create_state(struct pf_rule *, struct pf_rule *,
245                             struct pf_rule *, struct pf_pdesc *,
246                             struct pf_src_node *, struct pf_state_key *,
247                             struct pf_state_key *, struct mbuf *, int,
248                             u_int16_t, u_int16_t, int *, struct pfi_kif *,
249                             struct pf_state **, int, u_int16_t, u_int16_t,
250                             int);
251 static int               pf_test_fragment(struct pf_rule **, int,
252                             struct pfi_kif *, struct mbuf *, void *,
253                             struct pf_pdesc *, struct pf_rule **,
254                             struct pf_ruleset **);
255 static int               pf_tcp_track_full(struct pf_state_peer *,
256                             struct pf_state_peer *, struct pf_state **,
257                             struct pfi_kif *, struct mbuf *, int,
258                             struct pf_pdesc *, u_short *, int *);
259 static int               pf_tcp_track_sloppy(struct pf_state_peer *,
260                             struct pf_state_peer *, struct pf_state **,
261                             struct pf_pdesc *, u_short *);
262 static int               pf_test_state_tcp(struct pf_state **, int,
263                             struct pfi_kif *, struct mbuf *, int,
264                             void *, struct pf_pdesc *, u_short *);
265 static int               pf_test_state_udp(struct pf_state **, int,
266                             struct pfi_kif *, struct mbuf *, int,
267                             void *, struct pf_pdesc *);
268 static int               pf_test_state_icmp(struct pf_state **, int,
269                             struct pfi_kif *, struct mbuf *, int,
270                             void *, struct pf_pdesc *, u_short *);
271 static int               pf_test_state_other(struct pf_state **, int,
272                             struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
273 static u_int8_t          pf_get_wscale(struct mbuf *, int, u_int16_t,
274                             sa_family_t);
275 static u_int16_t         pf_get_mss(struct mbuf *, int, u_int16_t,
276                             sa_family_t);
277 static u_int16_t         pf_calc_mss(struct pf_addr *, sa_family_t,
278                                 int, u_int16_t);
279 static void              pf_set_rt_ifp(struct pf_state *,
280                             struct pf_addr *);
281 static int               pf_check_proto_cksum(struct mbuf *, int, int,
282                             u_int8_t, sa_family_t);
283 static void              pf_print_state_parts(struct pf_state *,
284                             struct pf_state_key *, struct pf_state_key *);
285 static int               pf_addr_wrap_neq(struct pf_addr_wrap *,
286                             struct pf_addr_wrap *);
287 static struct pf_state  *pf_find_state(struct pfi_kif *,
288                             struct pf_state_key_cmp *, u_int);
289 static int               pf_src_connlimit(struct pf_state **);
290 static void              pf_flush_task(void *c, int pending);
291 static int               pf_insert_src_node(struct pf_src_node **,
292                             struct pf_rule *, struct pf_addr *, sa_family_t);
293 static int               pf_purge_expired_states(int);
294 static void              pf_purge_unlinked_rules(void);
295 static int               pf_mtag_init(void *, int, int);
296 static void              pf_mtag_free(struct m_tag *);
297 #ifdef INET
298 static void              pf_route(struct mbuf **, struct pf_rule *, int,
299                             struct ifnet *, struct pf_state *,
300                             struct pf_pdesc *);
301 #endif /* INET */
302 #ifdef INET6
303 static void              pf_change_a6(struct pf_addr *, u_int16_t *,
304                             struct pf_addr *, u_int8_t);
305 static void              pf_route6(struct mbuf **, struct pf_rule *, int,
306                             struct ifnet *, struct pf_state *,
307                             struct pf_pdesc *);
308 #endif /* INET6 */
309
310 int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
311
312 VNET_DECLARE(int, pf_end_threads);
313
314 VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
315
316 #define PACKET_LOOPED(pd)       ((pd)->pf_mtag &&                       \
317                                  (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
318
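/*
 * STATE_LOOKUP() finds the state matching a key and direction.  Packets
 * whose state is already scheduled for purge are dropped; packets that pf
 * has already processed once (looped) and outbound packets that will be
 * filtered again on a route-to/reply-to interface are passed.
 */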
319 #define STATE_LOOKUP(i, k, d, s, pd)                                    \
320         do {                                                            \
321                 (s) = pf_find_state((i), (k), (d));                     \
322                 if ((s) == NULL || (s)->timeout == PFTM_PURGE)          \
323                         return (PF_DROP);                               \
324                 if (PACKET_LOOPED(pd))                                  \
325                         return (PF_PASS);                               \
326                 if ((d) == PF_OUT &&                                    \
327                     (((s)->rule.ptr->rt == PF_ROUTETO &&                \
328                     (s)->rule.ptr->direction == PF_OUT) ||              \
329                     ((s)->rule.ptr->rt == PF_REPLYTO &&                 \
330                     (s)->rule.ptr->direction == PF_IN)) &&              \
331                     (s)->rt_kif != NULL &&                              \
332                     (s)->rt_kif != (i))                                 \
333                         return (PF_PASS);                               \
334         } while (0)
335
336 #define BOUND_IFACE(r, k) \
337         ((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all
338
339 #define STATE_INC_COUNTERS(s)                           \
340         do {                                            \
341                 s->rule.ptr->states_cur++;              \
342                 s->rule.ptr->states_tot++;              \
343                 if (s->anchor.ptr != NULL) {            \
344                         s->anchor.ptr->states_cur++;    \
345                         s->anchor.ptr->states_tot++;    \
346                 }                                       \
347                 if (s->nat_rule.ptr != NULL) {          \
348                         s->nat_rule.ptr->states_cur++;  \
349                         s->nat_rule.ptr->states_tot++;  \
350                 }                                       \
351         } while (0)
352
353 #define STATE_DEC_COUNTERS(s)                           \
354         do {                                            \
355                 if (s->nat_rule.ptr != NULL)            \
356                         s->nat_rule.ptr->states_cur--;  \
357                 if (s->anchor.ptr != NULL)              \
358                         s->anchor.ptr->states_cur--;    \
359                 s->rule.ptr->states_cur--;              \
360         } while (0)
361
362 static MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
363 VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
364 VNET_DEFINE(struct pf_idhash *, pf_idhash);
365 VNET_DEFINE(u_long, pf_hashmask);
366 VNET_DEFINE(struct pf_srchash *, pf_srchash);
367 VNET_DEFINE(u_long, pf_srchashmask);
368
369 SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");
370
371 VNET_DEFINE(u_long, pf_hashsize);
372 #define V_pf_hashsize   VNET(pf_hashsize)
373 SYSCTL_VNET_UINT(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
374     &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");
375
376 VNET_DEFINE(u_long, pf_srchashsize);
377 #define V_pf_srchashsize        VNET(pf_srchashsize)
378 SYSCTL_VNET_UINT(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
379     &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");
380
381 VNET_DEFINE(void *, pf_swi_cookie);
382
383 VNET_DEFINE(uint32_t, pf_hashseed);
384 #define V_pf_hashseed   VNET(pf_hashseed)
385
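/*
 * Hash a state key into a row of the key hash table.  Only the comparable
 * prefix of the key (addresses, ports, address family, protocol) is fed
 * to jenkins_hash32(), seeded with a random value chosen at pf
 * initialization, and the result is masked down to the table size.
 */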
386 static __inline uint32_t
387 pf_hashkey(struct pf_state_key *sk)
388 {
389         uint32_t h;
390
391         h = jenkins_hash32((uint32_t *)sk,
392             sizeof(struct pf_state_key_cmp)/sizeof(uint32_t),
393             V_pf_hashseed);
394
395         return (h & V_pf_hashmask);
396 }
397
398 #ifdef INET6
399 void
400 pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
401 {
402         switch (af) {
403 #ifdef INET
404         case AF_INET:
405                 dst->addr32[0] = src->addr32[0];
406                 break;
407 #endif /* INET */
408         case AF_INET6:
409                 dst->addr32[0] = src->addr32[0];
410                 dst->addr32[1] = src->addr32[1];
411                 dst->addr32[2] = src->addr32[2];
412                 dst->addr32[3] = src->addr32[3];
413                 break;
414         }
415 }
416 #endif /* INET6 */
417
418 static void
419 pf_init_threshold(struct pf_threshold *threshold,
420     u_int32_t limit, u_int32_t seconds)
421 {
422         threshold->limit = limit * PF_THRESHOLD_MULT;
423         threshold->seconds = seconds;
424         threshold->count = 0;
425         threshold->last = time_uptime;
426 }
427
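/*
 * Charge one event against a rate threshold.  The count is kept scaled by
 * PF_THRESHOLD_MULT and decays linearly with the time since the last
 * update: e.g. with seconds = 10, a 5 second gap halves the accumulated
 * count before the new event is added, and a gap of 10 seconds or more
 * clears it.
 */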
428 static void
429 pf_add_threshold(struct pf_threshold *threshold)
430 {
431         u_int32_t t = time_uptime, diff = t - threshold->last;
432
433         if (diff >= threshold->seconds)
434                 threshold->count = 0;
435         else
436                 threshold->count -= threshold->count * diff /
437                     threshold->seconds;
438         threshold->count += PF_THRESHOLD_MULT;
439         threshold->last = t;
440 }
441
442 static int
443 pf_check_threshold(struct pf_threshold *threshold)
444 {
445         return (threshold->count > threshold->limit);
446 }
447
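/*
 * Enforce max-src-conn and max-src-conn-rate for a state's source node.
 * On a violation the state is marked for purging and, if the rule has an
 * overload table, the source address is inserted into it; with the
 * "flush" option a task is scheduled to kill the source's other states
 * as well.
 */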
448 static int
449 pf_src_connlimit(struct pf_state **state)
450 {
451         struct pfr_addr p;
452         struct pf_flush_entry *pffe;
453         int bad = 0;
454
455         PF_STATE_LOCK_ASSERT(*state);
456
457         (*state)->src_node->conn++;
458         (*state)->src.tcp_est = 1;
459         pf_add_threshold(&(*state)->src_node->conn_rate);
460
461         if ((*state)->rule.ptr->max_src_conn &&
462             (*state)->rule.ptr->max_src_conn <
463             (*state)->src_node->conn) {
464                 V_pf_status.lcounters[LCNT_SRCCONN]++;
465                 bad++;
466         }
467
468         if ((*state)->rule.ptr->max_src_conn_rate.limit &&
469             pf_check_threshold(&(*state)->src_node->conn_rate)) {
470                 V_pf_status.lcounters[LCNT_SRCCONNRATE]++;
471                 bad++;
472         }
473
474         if (!bad)
475                 return (0);
476
477         /* Kill this state. */
478         (*state)->timeout = PFTM_PURGE;
479         (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
480
481         if ((*state)->rule.ptr->overload_tbl == NULL)
482                 return (1);
483
484         V_pf_status.lcounters[LCNT_OVERLOAD_TABLE]++;
485         if (V_pf_status.debug >= PF_DEBUG_MISC) {
486                 printf("%s: blocking address ", __func__);
487                 pf_print_host(&(*state)->src_node->addr, 0,
488                     (*state)->key[PF_SK_WIRE]->af);
489                 printf("\n");
490         }
491
492         bzero(&p, sizeof(p));
493         p.pfra_af = (*state)->key[PF_SK_WIRE]->af;
494         switch ((*state)->key[PF_SK_WIRE]->af) {
495 #ifdef INET
496         case AF_INET:
497                 p.pfra_net = 32;
498                 p.pfra_ip4addr = (*state)->src_node->addr.v4;
499                 break;
500 #endif /* INET */
501 #ifdef INET6
502         case AF_INET6:
503                 p.pfra_net = 128;
504                 p.pfra_ip6addr = (*state)->src_node->addr.v6;
505                 break;
506 #endif /* INET6 */
507         }
508
509         pfr_insert_kentry((*state)->rule.ptr->overload_tbl, &p, time_second);
510
511         if ((*state)->rule.ptr->flush == 0)
512                 return (1);
513
514         /* Schedule flushing task. */
515         pffe = malloc(sizeof(*pffe), M_PFTEMP, M_NOWAIT);
516         if (pffe == NULL)
517                 return (1);     /* too bad :( */
518
519         bcopy(&(*state)->src_node->addr, &pffe->addr, sizeof(pffe->addr));
520         pffe->af = (*state)->key[PF_SK_WIRE]->af;
521         pffe->dir = (*state)->direction;
522         if ((*state)->rule.ptr->flush & PF_FLUSH_GLOBAL)
523                 pffe->rule = NULL;
524         else
525                 pffe->rule = (*state)->rule.ptr;
526         PF_FLUSHQ_LOCK();
527         SLIST_INSERT_HEAD(&V_pf_flushqueue, pffe, next);
528         PF_FLUSHQ_UNLOCK();
529         taskqueue_enqueue(taskqueue_swi, &V_pf_flushtask);
530
531         return (1);
532 }
533
534 static void
535 pf_flush_task(void *c, int pending)
536 {
537         struct pf_flush_head queue;
538         struct pf_flush_entry *pffe, *pffe1;
539         uint32_t killed = 0;
540
541         PF_FLUSHQ_LOCK();
542         queue = *(struct pf_flush_head *)c;
543         SLIST_INIT((struct pf_flush_head *)c);
544         PF_FLUSHQ_UNLOCK();
545
546         V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
547
548         for (int i = 0; i <= V_pf_hashmask; i++) {
549                 struct pf_idhash *ih = &V_pf_idhash[i];
550                 struct pf_state_key *sk;
551                 struct pf_state *s;
552
553                 PF_HASHROW_LOCK(ih);
554                 LIST_FOREACH(s, &ih->states, entry) {
555                     sk = s->key[PF_SK_WIRE];
556                     SLIST_FOREACH(pffe, &queue, next)
557                         if (sk->af == pffe->af && (pffe->rule == NULL ||
558                             pffe->rule == s->rule.ptr) &&
559                             ((pffe->dir == PF_OUT &&
560                             PF_AEQ(&pffe->addr, &sk->addr[1], sk->af)) ||
561                             (pffe->dir == PF_IN &&
562                             PF_AEQ(&pffe->addr, &sk->addr[0], sk->af)))) {
563                                 s->timeout = PFTM_PURGE;
564                                 s->src.state = s->dst.state = TCPS_CLOSED;
565                                 killed++;
566                         }
567                 }
568                 PF_HASHROW_UNLOCK(ih);
569         }
570         SLIST_FOREACH_SAFE(pffe, &queue, next, pffe1)
571                 free(pffe, M_PFTEMP);
572         if (V_pf_status.debug >= PF_DEBUG_MISC)
573                 printf("%s: %u states killed\n", __func__, killed);
574 }
575
576 /*
577  * May return with the hash row locked on failure (when returnlocked is
578  * set), so that the caller can consistently allocate and insert a new one.
579  */
580 struct pf_src_node *
581 pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
582         int returnlocked)
583 {
584         struct pf_srchash *sh;
585         struct pf_src_node *n;
586
587         V_pf_status.scounters[SCNT_SRC_NODE_SEARCH]++;
588
589         sh = &V_pf_srchash[pf_hashsrc(src, af)];
590         PF_HASHROW_LOCK(sh);
591         LIST_FOREACH(n, &sh->nodes, entry)
592                 if (n->rule.ptr == rule && n->af == af &&
593                     ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
594                     (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
595                         break;
596         if (n != NULL || returnlocked == 0)
597                 PF_HASHROW_UNLOCK(sh);
598
599         return (n);
600 }
601
602 static int
603 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
604     struct pf_addr *src, sa_family_t af)
605 {
606
607         KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
608             rule->rpool.opts & PF_POOL_STICKYADDR),
609             ("%s for non-tracking rule %p", __func__, rule));
610
611         if (*sn == NULL)
612                 *sn = pf_find_src_node(src, rule, af, 1);
613
614         if (*sn == NULL) {
615                 struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
616
617                 PF_HASHROW_ASSERT(sh);
618
619                 if (!rule->max_src_nodes ||
620                     rule->src_nodes < rule->max_src_nodes)
621                         (*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
622                 else
623                         V_pf_status.lcounters[LCNT_SRCNODES]++;
624                 if ((*sn) == NULL) {
625                         PF_HASHROW_UNLOCK(sh);
626                         return (-1);
627                 }
628
629                 pf_init_threshold(&(*sn)->conn_rate,
630                     rule->max_src_conn_rate.limit,
631                     rule->max_src_conn_rate.seconds);
632
633                 (*sn)->af = af;
634                 (*sn)->rule.ptr = rule;
635                 PF_ACPY(&(*sn)->addr, src, af);
636                 LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
637                 (*sn)->creation = time_uptime;
638                 (*sn)->ruletype = rule->action;
639                 if ((*sn)->rule.ptr != NULL)
640                         (*sn)->rule.ptr->src_nodes++;
641                 PF_HASHROW_UNLOCK(sh);
642                 V_pf_status.scounters[SCNT_SRC_NODE_INSERT]++;
643                 V_pf_status.src_nodes++;
644         } else {
645                 if (rule->max_src_states &&
646                     (*sn)->states >= rule->max_src_states) {
647                         V_pf_status.lcounters[LCNT_SRCSTATES]++;
648                         return (-1);
649                 }
650         }
651         return (0);
652 }
653
654 static void
655 pf_remove_src_node(struct pf_src_node *src)
656 {
657         struct pf_srchash *sh;
658
659         sh = &V_pf_srchash[pf_hashsrc(&src->addr, src->af)];
660         PF_HASHROW_LOCK(sh);
661         LIST_REMOVE(src, entry);
662         PF_HASHROW_UNLOCK(sh);
663 }
664
665 /* Data storage structures initialization. */
666 void
667 pf_initialize()
668 {
669         struct pf_keyhash       *kh;
670         struct pf_idhash        *ih;
671         struct pf_srchash       *sh;
672         u_int i;
673
674         TUNABLE_ULONG_FETCH("net.pf.states_hashsize", &V_pf_hashsize);
675         if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
676                 V_pf_hashsize = PF_HASHSIZ;
677         TUNABLE_ULONG_FETCH("net.pf.source_nodes_hashsize", &V_pf_srchashsize);
678         if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
679                 V_pf_srchashsize = PF_HASHSIZ / 4;
680
681         V_pf_hashseed = arc4random();
682
683         /* States and state keys storage. */
684         V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
685             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
686         V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
687         uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
688
689         V_pf_state_key_z = uma_zcreate("pf state keys",
690             sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
691             UMA_ALIGN_PTR, 0);
692         V_pf_keyhash = malloc(V_pf_hashsize * sizeof(struct pf_keyhash),
693             M_PFHASH, M_WAITOK | M_ZERO);
694         V_pf_idhash = malloc(V_pf_hashsize * sizeof(struct pf_idhash),
695             M_PFHASH, M_WAITOK | M_ZERO);
696         V_pf_hashmask = V_pf_hashsize - 1;
697         for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
698             i++, kh++, ih++) {
699                 mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
700                 mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
701         }
702
703         /* Source nodes. */
704         V_pf_sources_z = uma_zcreate("pf source nodes",
705             sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
706             0);
707         V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
708         uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
709         V_pf_srchash = malloc(V_pf_srchashsize * sizeof(struct pf_srchash),
710           M_PFHASH, M_WAITOK|M_ZERO);
711         V_pf_srchashmask = V_pf_srchashsize - 1;
712         for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
713                 mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
714
715         /* ALTQ */
716         TAILQ_INIT(&V_pf_altqs[0]);
717         TAILQ_INIT(&V_pf_altqs[1]);
718         TAILQ_INIT(&V_pf_pabuf);
719         V_pf_altqs_active = &V_pf_altqs[0];
720         V_pf_altqs_inactive = &V_pf_altqs[1];
721
722         /* Mbuf tags */
723         V_pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
724             sizeof(struct pf_mtag), NULL, NULL, pf_mtag_init, NULL,
725             UMA_ALIGN_PTR, 0);
726
727         /* Send & flush queues. */
728         STAILQ_INIT(&V_pf_sendqueue);
729         SLIST_INIT(&V_pf_flushqueue);
730         TASK_INIT(&V_pf_flushtask, 0, pf_flush_task, &V_pf_flushqueue);
731         mtx_init(&pf_sendqueue_mtx, "pf send queue", NULL, MTX_DEF);
732         mtx_init(&pf_flushqueue_mtx, "pf flush queue", NULL, MTX_DEF);
733
734         /* Rules that have been unlinked but may still be referenced. */
735         TAILQ_INIT(&V_pf_unlinked_rules);
736         mtx_init(&pf_unlnkdrules_mtx, "pf unlinked rules", NULL, MTX_DEF);
737 }
738
739 void
740 pf_cleanup()
741 {
742         struct pf_keyhash       *kh;
743         struct pf_idhash        *ih;
744         struct pf_srchash       *sh;
745         struct pf_send_entry    *pfse, *next;
746         u_int i;
747
748         for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
749             i++, kh++, ih++) {
750                 KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
751                     __func__));
752                 KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
753                     __func__));
754                 mtx_destroy(&kh->lock);
755                 mtx_destroy(&ih->lock);
756         }
757         free(V_pf_keyhash, M_PFHASH);
758         free(V_pf_idhash, M_PFHASH);
759
760         for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
761                 KASSERT(LIST_EMPTY(&sh->nodes),
762                     ("%s: source node hash not empty", __func__));
763                 mtx_destroy(&sh->lock);
764         }
765         free(V_pf_srchash, M_PFHASH);
766
767         STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
768                 m_freem(pfse->pfse_m);
769                 free(pfse, M_PFTEMP);
770         }
771
772         mtx_destroy(&pf_sendqueue_mtx);
773         mtx_destroy(&pf_flushqueue_mtx);
774         mtx_destroy(&pf_unlnkdrules_mtx);
775
776         uma_zdestroy(V_pf_mtag_z);
777         uma_zdestroy(V_pf_sources_z);
778         uma_zdestroy(V_pf_state_z);
779         uma_zdestroy(V_pf_state_key_z);
780 }
781
782 static int
783 pf_mtag_init(void *mem, int size, int how)
784 {
785         struct m_tag *t;
786
787         t = (struct m_tag *)mem;
788         t->m_tag_cookie = MTAG_ABI_COMPAT;
789         t->m_tag_id = PACKET_TAG_PF;
790         t->m_tag_len = sizeof(struct pf_mtag);
791         t->m_tag_free = pf_mtag_free;
792
793         return (0);
794 }
795
796 static void
797 pf_mtag_free(struct m_tag *t)
798 {
799
800         uma_zfree(V_pf_mtag_z, t);
801 }
802
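/*
 * pf mbuf tags are allocated from V_pf_mtag_z as a struct m_tag header
 * immediately followed by the struct pf_mtag payload, hence the
 * (mtag + 1) pointer arithmetic below.
 */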
803 struct pf_mtag *
804 pf_get_mtag(struct mbuf *m)
805 {
806         struct m_tag *mtag;
807
808         if ((mtag = m_tag_find(m, PACKET_TAG_PF, NULL)) != NULL)
809                 return ((struct pf_mtag *)(mtag + 1));
810
811         mtag = uma_zalloc(V_pf_mtag_z, M_NOWAIT);
812         if (mtag == NULL)
813                 return (NULL);
814         bzero(mtag + 1, sizeof(struct pf_mtag));
815         m_tag_prepend(m, mtag);
816
817         return ((struct pf_mtag *)(mtag + 1));
818 }
819
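/*
 * Attach the wire and stack keys to a state and hash it in.  When no
 * address or port translation applies, sks == skw and both slots share a
 * single key.  A conflicting TCP state that is already at or beyond
 * FIN_WAIT_2 in both directions is scheduled for purge and replaced; any
 * other conflict makes the attach fail.
 */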
820 static int
821 pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
822     struct pf_state *s)
823 {
824         struct pf_keyhash       *kh;
825         struct pf_state_key     *sk, *cur;
826         struct pf_state         *si, *olds = NULL;
827         int idx;
828
829         KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
830         KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
831         KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
832
833         /*
834          * First run: start with wire key.
835          */
836         sk = skw;
837         idx = PF_SK_WIRE;
838
839 keyattach:
840         kh = &V_pf_keyhash[pf_hashkey(sk)];
841
842         PF_HASHROW_LOCK(kh);
843         LIST_FOREACH(cur, &kh->keys, entry)
844                 if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
845                         break;
846
847         if (cur != NULL) {
848                 /* Key exists. Check for a conflicting state on this kif; if none, add to key. */
849                 TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
850                         struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
851
852                         PF_HASHROW_LOCK(ih);
853                         if (si->kif == s->kif &&
854                             si->direction == s->direction) {
855                                 if (sk->proto == IPPROTO_TCP &&
856                                     si->src.state >= TCPS_FIN_WAIT_2 &&
857                                     si->dst.state >= TCPS_FIN_WAIT_2) {
858                                         si->src.state = si->dst.state =
859                                             TCPS_CLOSED;
860                                         /* Unlink later or cur can go away. */
861                                         pf_ref_state(si);
862                                         olds = si;
863                                 } else {
864                                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
865                                                 printf("pf: %s key attach "
866                                                     "failed on %s: ",
867                                                     (idx == PF_SK_WIRE) ?
868                                                     "wire" : "stack",
869                                                     s->kif->pfik_name);
870                                                 pf_print_state_parts(s,
871                                                     (idx == PF_SK_WIRE) ?
872                                                     sk : NULL,
873                                                     (idx == PF_SK_STACK) ?
874                                                     sk : NULL);
875                                                 printf(", existing: ");
876                                                 pf_print_state_parts(si,
877                                                     (idx == PF_SK_WIRE) ?
878                                                     sk : NULL,
879                                                     (idx == PF_SK_STACK) ?
880                                                     sk : NULL);
881                                                 printf("\n");
882                                         }
883                                         PF_HASHROW_UNLOCK(ih);
884                                         PF_HASHROW_UNLOCK(kh);
885                                         uma_zfree(V_pf_state_key_z, sk);
886                                         if (idx == PF_SK_STACK)
887                                                 pf_detach_state(s);
888                                         return (-1);    /* collision! */
889                                 }
890                         }
891                         PF_HASHROW_UNLOCK(ih);
892                 }
893                 uma_zfree(V_pf_state_key_z, sk);
894                 s->key[idx] = cur;
895         } else {
896                 LIST_INSERT_HEAD(&kh->keys, sk, entry);
897                 s->key[idx] = sk;
898         }
899
900 stateattach:
901         /* List is sorted, if-bound states before floating. */
902         if (s->kif == V_pfi_all)
903                 TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
904         else
905                 TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
906
907         /*
908          * Attach done. Now decide whether (and how) we should
909          * attach a second key.
910          */
911         if (sks == skw) {
912                 s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
913                 idx = PF_SK_STACK;
914                 sks = NULL;
915                 goto stateattach;
916         } else if (sks != NULL) {
917                 PF_HASHROW_UNLOCK(kh);
918                 if (olds) {
919                         pf_unlink_state(olds, 0);
920                         pf_release_state(olds);
921                         olds = NULL;
922                 }
923                 /*
924                  * Continue attaching with stack key.
925                  */
926                 sk = sks;
927                 idx = PF_SK_STACK;
928                 sks = NULL;
929                 goto keyattach;
930         } else
931                 PF_HASHROW_UNLOCK(kh);
932
933         if (olds) {
934                 pf_unlink_state(olds, 0);
935                 pf_release_state(olds);
936         }
937
938         KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
939             ("%s failure", __func__));
940
941         return (0);
942 }
943
944 static void
945 pf_detach_state(struct pf_state *s)
946 {
947         struct pf_state_key *sks = s->key[PF_SK_STACK];
948         struct pf_keyhash *kh;
949
950         if (sks != NULL) {
951                 kh = &V_pf_keyhash[pf_hashkey(sks)];
952                 PF_HASHROW_LOCK(kh);
953                 if (s->key[PF_SK_STACK] != NULL)
954                         pf_state_key_detach(s, PF_SK_STACK);
955                 /*
956                  * If both point to same key, then we are done.
957                  */
958                 if (sks == s->key[PF_SK_WIRE]) {
959                         pf_state_key_detach(s, PF_SK_WIRE);
960                         PF_HASHROW_UNLOCK(kh);
961                         return;
962                 }
963                 PF_HASHROW_UNLOCK(kh);
964         }
965
966         if (s->key[PF_SK_WIRE] != NULL) {
967                 kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
968                 PF_HASHROW_LOCK(kh);
969                 if (s->key[PF_SK_WIRE] != NULL)
970                         pf_state_key_detach(s, PF_SK_WIRE);
971                 PF_HASHROW_UNLOCK(kh);
972         }
973 }
974
975 static void
976 pf_state_key_detach(struct pf_state *s, int idx)
977 {
978         struct pf_state_key *sk = s->key[idx];
979 #ifdef INVARIANTS
980         struct pf_keyhash *kh = &V_pf_keyhash[pf_hashkey(sk)];
981
982         PF_HASHROW_ASSERT(kh);
983 #endif
984         TAILQ_REMOVE(&sk->states[idx], s, key_list[idx]);
985         s->key[idx] = NULL;
986
987         if (TAILQ_EMPTY(&sk->states[0]) && TAILQ_EMPTY(&sk->states[1])) {
988                 LIST_REMOVE(sk, entry);
989                 uma_zfree(V_pf_state_key_z, sk);
990         }
991 }
992
993 static int
994 pf_state_key_ctor(void *mem, int size, void *arg, int flags)
995 {
996         struct pf_state_key *sk = mem;
997
998         bzero(sk, sizeof(struct pf_state_key_cmp));
999         TAILQ_INIT(&sk->states[PF_SK_WIRE]);
1000         TAILQ_INIT(&sk->states[PF_SK_STACK]);
1001
1002         return (0);
1003 }
1004
1005 struct pf_state_key *
1006 pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
1007         struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
1008 {
1009         struct pf_state_key *sk;
1010
1011         sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1012         if (sk == NULL)
1013                 return (NULL);
1014
1015         PF_ACPY(&sk->addr[pd->sidx], saddr, pd->af);
1016         PF_ACPY(&sk->addr[pd->didx], daddr, pd->af);
1017         sk->port[pd->sidx] = sport;
1018         sk->port[pd->didx] = dport;
1019         sk->proto = pd->proto;
1020         sk->af = pd->af;
1021
1022         return (sk);
1023 }
1024
1025 struct pf_state_key *
1026 pf_state_key_clone(struct pf_state_key *orig)
1027 {
1028         struct pf_state_key *sk;
1029
1030         sk = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
1031         if (sk == NULL)
1032                 return (NULL);
1033
1034         bcopy(orig, sk, sizeof(struct pf_state_key_cmp));
1035
1036         return (sk);
1037 }
1038
1039 int
1040 pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
1041     struct pf_state_key *sks, struct pf_state *s)
1042 {
1043         struct pf_idhash *ih;
1044         struct pf_state *cur;
1045
1046         KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
1047             ("%s: sks not pristine", __func__));
1048         KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
1049             ("%s: skw not pristine", __func__));
1050         KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
1051
1052         s->kif = kif;
1053
1054         if (pf_state_key_attach(skw, sks, s))
1055                 return (-1);
1056
1057         if (s->id == 0 && s->creatorid == 0) {
1058                 /* XXX: should be atomic, but probability of collision low */
1059                 if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
1060                         V_pf_stateid[curcpu] = 1;
1061                 s->id |= (uint64_t )curcpu << PFID_CPUSHIFT;
1062                 s->id = htobe64(s->id);
1063                 s->creatorid = V_pf_status.hostid;
1064         }
1065
1066         ih = &V_pf_idhash[PF_IDHASH(s)];
1067         PF_HASHROW_LOCK(ih);
1068         LIST_FOREACH(cur, &ih->states, entry)
1069                 if (cur->id == s->id && cur->creatorid == s->creatorid)
1070                         break;
1071
1072         if (cur != NULL) {
1073                 PF_HASHROW_UNLOCK(ih);
1074                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
1075                         printf("pf: state insert failed: "
1076                             "id: %016llx creatorid: %08x",
1077                             (unsigned long long)be64toh(s->id),
1078                             ntohl(s->creatorid));
1079                         printf("\n");
1080                 }
1081                 pf_detach_state(s);
1082                 return (-1);
1083         }
1084         LIST_INSERT_HEAD(&ih->states, s, entry);
1085         /* One for keys, one for ID hash. */
1086         refcount_init(&s->refs, 2);
1087
1088         V_pf_status.fcounters[FCNT_STATE_INSERT]++;
1089         if (pfsync_insert_state_ptr != NULL)
1090                 pfsync_insert_state_ptr(s);
1091
1092         /* Return with the ID hash row locked. */
1093         return (0);
1094 }
1095
1096 /*
1097  * Find state by ID: returns with locked row on success.
1098  */
1099 struct pf_state *
1100 pf_find_state_byid(uint64_t id, uint32_t creatorid)
1101 {
1102         struct pf_idhash *ih;
1103         struct pf_state *s;
1104
1105         V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
1106
1107         ih = &V_pf_idhash[(be64toh(id) % (V_pf_hashmask + 1))];
1108
1109         PF_HASHROW_LOCK(ih);
1110         LIST_FOREACH(s, &ih->states, entry)
1111                 if (s->id == id && s->creatorid == creatorid)
1112                         break;
1113
1114         if (s == NULL)
1115                 PF_HASHROW_UNLOCK(ih);
1116
1117         return (s);
1118 }
1119
1120 /*
1121  * Find state by key.
1122  * Returns with ID hash slot locked on success.
1123  */
1124 static struct pf_state *
1125 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
1126 {
1127         struct pf_keyhash       *kh;
1128         struct pf_state_key     *sk;
1129         struct pf_state         *s;
1130         int idx;
1131
1132         V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
1133
1134         kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1135
1136         PF_HASHROW_LOCK(kh);
1137         LIST_FOREACH(sk, &kh->keys, entry)
1138                 if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1139                         break;
1140         if (sk == NULL) {
1141                 PF_HASHROW_UNLOCK(kh);
1142                 return (NULL);
1143         }
1144
1145         idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
1146
1147         /* List is sorted, if-bound states before floating ones. */
1148         TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
1149                 if (s->kif == V_pfi_all || s->kif == kif) {
1150                         PF_STATE_LOCK(s);
1151                         PF_HASHROW_UNLOCK(kh);
1152                         if (s->timeout == PFTM_UNLINKED) {
1153                                 /*
1154                                  * State is being processed
1155                                  * by pf_unlink_state() in
1156                          * another thread.
1157                                  */
1158                                 PF_STATE_UNLOCK(s);
1159                                 return (NULL);
1160                         }
1161                         return (s);
1162                 }
1163         PF_HASHROW_UNLOCK(kh);
1164
1165         return (NULL);
1166 }
1167
1168 struct pf_state *
1169 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
1170 {
1171         struct pf_keyhash       *kh;
1172         struct pf_state_key     *sk;
1173         struct pf_state         *s, *ret = NULL;
1174         int                      idx, inout = 0;
1175
1176         V_pf_status.fcounters[FCNT_STATE_SEARCH]++;
1177
1178         kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
1179
1180         PF_HASHROW_LOCK(kh);
1181         LIST_FOREACH(sk, &kh->keys, entry)
1182                 if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
1183                         break;
1184         if (sk == NULL) {
1185                 PF_HASHROW_UNLOCK(kh);
1186                 return (NULL);
1187         }
1188         switch (dir) {
1189         case PF_IN:
1190                 idx = PF_SK_WIRE;
1191                 break;
1192         case PF_OUT:
1193                 idx = PF_SK_STACK;
1194                 break;
1195         case PF_INOUT:
1196                 idx = PF_SK_WIRE;
1197                 inout = 1;
1198                 break;
1199         default:
1200                 panic("%s: dir %u", __func__, dir);
1201         }
1202 second_run:
1203         TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
1204                 if (more == NULL) {
1205                         PF_HASHROW_UNLOCK(kh);
1206                         return (s);
1207                 }
1208
1209                 if (ret)
1210                         (*more)++;
1211                 else
1212                         ret = s;
1213         }
1214         if (inout == 1) {
1215                 inout = 0;
1216                 idx = PF_SK_STACK;
1217                 goto second_run;
1218         }
1219         PF_HASHROW_UNLOCK(kh);
1220
1221         return (ret);
1222 }
1223
1224 /* END state table stuff */
1225
1226 static void
1227 pf_send(struct pf_send_entry *pfse)
1228 {
1229
1230         PF_SENDQ_LOCK();
1231         STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
1232         PF_SENDQ_UNLOCK();
1233         swi_sched(V_pf_swi_cookie, 0);
1234 }
1235
1236 void
1237 pf_intr(void *v)
1238 {
1239         struct pf_send_head queue;
1240         struct pf_send_entry *pfse, *next;
1241
1242         CURVNET_SET((struct vnet *)v);
1243
1244         PF_SENDQ_LOCK();
1245         queue = V_pf_sendqueue;
1246         STAILQ_INIT(&V_pf_sendqueue);
1247         PF_SENDQ_UNLOCK();
1248
1249         STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
1250                 switch (pfse->pfse_type) {
1251 #ifdef INET
1252                 case PFSE_IP:
1253                         ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
1254                         break;
1255                 case PFSE_ICMP:
1256                         icmp_error(pfse->pfse_m, pfse->pfse_icmp_type,
1257                             pfse->pfse_icmp_code, 0, pfse->pfse_icmp_mtu);
1258                         break;
1259 #endif /* INET */
1260 #ifdef INET6
1261                 case PFSE_IP6:
1262                         ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
1263                             NULL);
1264                         break;
1265                 case PFSE_ICMP6:
1266                         icmp6_error(pfse->pfse_m, pfse->pfse_icmp_type,
1267                             pfse->pfse_icmp_code, pfse->pfse_icmp_mtu);
1268                         break;
1269 #endif /* INET6 */
1270                 default:
1271                         panic("%s: unknown type", __func__);
1272                 }
1273                 free(pfse, M_PFTEMP);
1274         }
1275         CURVNET_RESTORE();
1276 }
1277
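/*
 * Purge thread: wakes up roughly ten times per second and expires a
 * fraction of the state table on each pass, so the whole table is covered
 * once per PFTM_INTERVAL; fragments, source nodes, unlinked rules and
 * kifs are purged once per full sweep.  On shutdown it performs two
 * complete passes so that all references are dropped before the
 * structures are freed.
 */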
1278 void
1279 pf_purge_thread(void *v)
1280 {
1281         int fullrun;
1282
1283         CURVNET_SET((struct vnet *)v);
1284
1285         for (;;) {
1286                 PF_RULES_RLOCK();
1287                 rw_sleep(pf_purge_thread, &pf_rules_lock, 0, "pftm", hz / 10);
1288
1289                 if (V_pf_end_threads) {
1290                         /*
1291                          * To clean up all kifs and rules we need
1292                          * two runs: the first one clears reference
1293                          * flags, then pf_purge_expired_states() does
1294                          * not raise them again, and the second frees.
1295                          */
1296                         PF_RULES_RUNLOCK();
1297                         pf_purge_unlinked_rules();
1298                         pfi_kif_purge();
1299
1300                         /*
1301                          * Now purge everything.
1302                          */
1303                         pf_purge_expired_states(V_pf_hashmask + 1);
1304                         pf_purge_expired_fragments();
1305                         pf_purge_expired_src_nodes();
1306
1307                         /*
1308                          * Now all kifs & rules should be unreferenced,
1309                          * thus should be successfully freed.
1310                          */
1311                         pf_purge_unlinked_rules();
1312                         pfi_kif_purge();
1313
1314                         /*
1315                          * Announce success and exit.
1316                          */
1317                         PF_RULES_RLOCK();
1318                         V_pf_end_threads++;
1319                         PF_RULES_RUNLOCK();
1320                         wakeup(pf_purge_thread);
1321                         kproc_exit(0);
1322                 }
1323                 PF_RULES_RUNLOCK();
1324
1325                 /* Process 1/interval fraction of the state table every run. */
1326                 fullrun = pf_purge_expired_states(V_pf_hashmask /
1327                             (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
1328
1329                 /* Purge other expired types every PFTM_INTERVAL seconds. */
1330                 if (fullrun) {
1331                         /*
1332                          * Order is important:
1333                          * - states and src nodes reference rules
1334                          * - states and rules reference kifs
1335                          */
1336                         pf_purge_expired_fragments();
1337                         pf_purge_expired_src_nodes();
1338                         pf_purge_unlinked_rules();
1339                         pfi_kif_purge();
1340                 }
1341         }
1342         /* not reached */
1343         CURVNET_RESTORE();
1344 }
1345
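/*
 * Compute the absolute expiry time (in time_uptime terms) of a state.
 * With adaptive timeouts configured, the base timeout is scaled down
 * linearly once the state count passes adaptive.start:
 *
 *	expire + timeout * (adaptive.end - states) / (adaptive.end - adaptive.start)
 *
 * and once the count reaches adaptive.end the state expires immediately.
 */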
1346 u_int32_t
1347 pf_state_expires(const struct pf_state *state)
1348 {
1349         u_int32_t       timeout;
1350         u_int32_t       start;
1351         u_int32_t       end;
1352         u_int32_t       states;
1353
1354         /* handle all PFTM_* > PFTM_MAX here */
1355         if (state->timeout == PFTM_PURGE)
1356                 return (time_uptime);
1357         if (state->timeout == PFTM_UNTIL_PACKET)
1358                 return (0);
1359         KASSERT(state->timeout != PFTM_UNLINKED,
1360             ("pf_state_expires: timeout == PFTM_UNLINKED"));
1361         KASSERT((state->timeout < PFTM_MAX),
1362             ("pf_state_expires: timeout > PFTM_MAX"));
1363         timeout = state->rule.ptr->timeout[state->timeout];
1364         if (!timeout)
1365                 timeout = V_pf_default_rule.timeout[state->timeout];
1366         start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
1367         if (start) {
1368                 end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
1369                 states = state->rule.ptr->states_cur;   /* XXXGL */
1370         } else {
1371                 start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
1372                 end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
1373                 states = V_pf_status.states;
1374         }
1375         if (end && states > start && start < end) {
1376                 if (states < end)
1377                         return (state->expire + timeout * (end - states) /
1378                             (end - start));
1379                 else
1380                         return (time_uptime);
1381         }
1382         return (state->expire + timeout);
1383 }
1384
1385 void
1386 pf_purge_expired_src_nodes()
1387 {
1388         struct pf_srchash       *sh;
1389         struct pf_src_node      *cur, *next;
1390         int i;
1391
1392         for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++) {
1393             PF_HASHROW_LOCK(sh);
1394             LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
1395                 if (cur->states <= 0 && cur->expire <= time_uptime) {
1396                         if (cur->rule.ptr != NULL)
1397                                 cur->rule.ptr->src_nodes--;
1398                         LIST_REMOVE(cur, entry);
1399                         V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
1400                         V_pf_status.src_nodes--;
1401                         uma_zfree(V_pf_sources_z, cur);
1402                 } else if (cur->rule.ptr != NULL)
1403                         cur->rule.ptr->rule_flag |= PFRULE_REFS;
1404             PF_HASHROW_UNLOCK(sh);
1405         }
1406 }
1407
1408 static void
1409 pf_src_tree_remove_state(struct pf_state *s)
1410 {
1411         u_int32_t timeout;
1412
1413         if (s->src_node != NULL) {
1414                 if (s->src.tcp_est)
1415                         --s->src_node->conn;
1416                 if (--s->src_node->states <= 0) {
1417                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1418                         if (!timeout)
1419                                 timeout =
1420                                     V_pf_default_rule.timeout[PFTM_SRC_NODE];
1421                         s->src_node->expire = time_uptime + timeout;
1422                 }
1423         }
1424         if (s->nat_src_node != s->src_node && s->nat_src_node != NULL) {
1425                 if (--s->nat_src_node->states <= 0) {
1426                         timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
1427                         if (!timeout)
1428                                 timeout =
1429                                     V_pf_default_rule.timeout[PFTM_SRC_NODE];
1430                         s->nat_src_node->expire = time_uptime + timeout;
1431                 }
1432         }
1433         s->src_node = s->nat_src_node = NULL;
1434 }
1435
1436 /*
1437  * Unlink and potentially free a state. The function may be
1438  * called with the ID hash row locked, but always returns
1439  * unlocked, since it needs to go through key hash locking.
1440  */
1441 int
1442 pf_unlink_state(struct pf_state *s, u_int flags)
1443 {
1444         struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
1445
1446         if ((flags & PF_ENTER_LOCKED) == 0)
1447                 PF_HASHROW_LOCK(ih);
1448         else
1449                 PF_HASHROW_ASSERT(ih);
1450
1451         if (s->timeout == PFTM_UNLINKED) {
1452                 /*
1453                  * State is being processed
1454                  * by pf_unlink_state() in
1455                  * another thread.
1456                  */
1457                 PF_HASHROW_UNLOCK(ih);
1458                 return (0);     /* XXXGL: undefined actually */
1459         }
1460
1461         s->timeout = PFTM_UNLINKED;
1462
1463         if (s->src.state == PF_TCPS_PROXY_DST) {
1464                 /* XXX wire key the right one? */
1465                 pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
1466                     &s->key[PF_SK_WIRE]->addr[1],
1467                     &s->key[PF_SK_WIRE]->addr[0],
1468                     s->key[PF_SK_WIRE]->port[1],
1469                     s->key[PF_SK_WIRE]->port[0],
1470                     s->src.seqhi, s->src.seqlo + 1,
1471                     TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
1472         }
1473
1474         LIST_REMOVE(s, entry);
1475         pf_src_tree_remove_state(s);
1476         PF_HASHROW_UNLOCK(ih);
1477
1478         if (pfsync_delete_state_ptr != NULL)
1479                 pfsync_delete_state_ptr(s);
1480
1481         pf_detach_state(s);
1482         refcount_release(&s->refs);
1483
1484         return (pf_release_state(s));
1485 }
1486
1487 void
1488 pf_free_state(struct pf_state *cur)
1489 {
1490
1491         KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
1492         KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
1493             cur->timeout));
1494         --cur->rule.ptr->states_cur;
1495         if (cur->nat_rule.ptr != NULL)
1496                 --cur->nat_rule.ptr->states_cur;
1497         if (cur->anchor.ptr != NULL)
1498                 --cur->anchor.ptr->states_cur;
1499         pf_normalize_tcp_cleanup(cur);
1500         uma_zfree(V_pf_state_z, cur);
1501         V_pf_status.fcounters[FCNT_STATE_REMOVALS]++;
1502 }
1503
1504 /*
1505  * Called only from pf_purge_thread(), thus serialized.
1506  */
1507 static int
1508 pf_purge_expired_states(int maxcheck)
1509 {
1510         static u_int i = 0;
1511
1512         struct pf_idhash *ih;
1513         struct pf_state *s;
1514         int rv = 0;
1515
1516         V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
1517
1518         /*
1519          * Go through hash and unlink states that expire now.
1520          */
1521         while (maxcheck > 0) {
1522
1523                 /* Wrap to start of hash when we hit the end. */
1524                 if (i > V_pf_hashmask) {
1525                         i = 0;
1526                         rv = 1;
1527                 }
1528
1529                 ih = &V_pf_idhash[i];
1530 relock:
1531                 PF_HASHROW_LOCK(ih);
1532                 LIST_FOREACH(s, &ih->states, entry) {
1533                         if (pf_state_expires(s) <= time_uptime) {
1534                                 V_pf_status.states -=
1535                                     pf_unlink_state(s, PF_ENTER_LOCKED);
1536                                 goto relock;
1537                         }
1538                         s->rule.ptr->rule_flag |= PFRULE_REFS;
1539                         if (s->nat_rule.ptr != NULL)
1540                                 s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
1541                         if (s->anchor.ptr != NULL)
1542                                 s->anchor.ptr->rule_flag |= PFRULE_REFS;
1543                         s->kif->pfik_flags |= PFI_IFLAG_REFS;
1544                         if (s->rt_kif)
1545                                 s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
1546                 }
1547                 PF_HASHROW_UNLOCK(ih);
1548                 i++;
1549                 maxcheck--;
1550         }
1551
1552         V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
1553
1554         return (rv);
1555 }
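
/*
 * Note on the relock dance above: pf_unlink_state(s, PF_ENTER_LOCKED)
 * consumes the row lock and returns with it dropped, so the list walk
 * cannot simply continue; the row is re-locked and scanned again from
 * the top.  States that survive a sweep mark their rules and kifs with
 * PFRULE_REFS/PFI_IFLAG_REFS, which is what the mark-and-sweep in
 * pf_purge_unlinked_rules() and pfi_kif_purge() relies on.
 */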
1556
1557 static void
1558 pf_purge_unlinked_rules()
1559 {
1560         struct pf_rulequeue tmpq;
1561         struct pf_rule *r, *r1;
1562
1563         /*
1564          * Do naive mark-and-sweep garbage collecting of old rules.
1565          * Reference flag is raised by pf_purge_expired_states()
1566          * and pf_purge_expired_src_nodes().
1567          *
1568          * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
1569          * use a temporary queue.
1570          */
1571         TAILQ_INIT(&tmpq);
1572         PF_UNLNKDRULES_LOCK();
1573         TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
1574                 if (!(r->rule_flag & PFRULE_REFS)) {
1575                         TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
1576                         TAILQ_INSERT_TAIL(&tmpq, r, entries);
1577                 } else
1578                         r->rule_flag &= ~PFRULE_REFS;
1579         }
1580         PF_UNLNKDRULES_UNLOCK();
1581
1582         if (!TAILQ_EMPTY(&tmpq)) {
1583                 PF_RULES_WLOCK();
1584                 TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
1585                         TAILQ_REMOVE(&tmpq, r, entries);
1586                         pf_free_rule(r);
1587                 }
1588                 PF_RULES_WUNLOCK();
1589         }
1590 }
1591
1592 void
1593 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
1594 {
1595         switch (af) {
1596 #ifdef INET
1597         case AF_INET: {
1598                 u_int32_t a = ntohl(addr->addr32[0]);
1599                 printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
1600                     (a>>8)&255, a&255);
1601                 if (p) {
1602                         p = ntohs(p);
1603                         printf(":%u", p);
1604                 }
1605                 break;
1606         }
1607 #endif /* INET */
1608 #ifdef INET6
1609         case AF_INET6: {
1610                 u_int16_t b;
1611                 u_int8_t i, curstart, curend, maxstart, maxend;
1612                 curstart = curend = maxstart = maxend = 255;
1613                 for (i = 0; i < 8; i++) {
1614                         if (!addr->addr16[i]) {
1615                                 if (curstart == 255)
1616                                         curstart = i;
1617                                 curend = i;
1618                         } else {
1619                                 if ((curend - curstart) >
1620                                     (maxend - maxstart)) {
1621                                         maxstart = curstart;
1622                                         maxend = curend;
1623                                 }
1624                                 curstart = curend = 255;
1625                         }
1626                 }
1627                 if ((curend - curstart) >
1628                     (maxend - maxstart)) {
1629                         maxstart = curstart;
1630                         maxend = curend;
1631                 }
1632                 for (i = 0; i < 8; i++) {
1633                         if (i >= maxstart && i <= maxend) {
1634                                 if (i == 0)
1635                                         printf(":");
1636                                 if (i == maxend)
1637                                         printf(":");
1638                         } else {
1639                                 b = ntohs(addr->addr16[i]);
1640                                 printf("%x", b);
1641                                 if (i < 7)
1642                                         printf(":");
1643                         }
1644                 }
1645                 if (p) {
1646                         p = ntohs(p);
1647                         printf("[%u]", p);
1648                 }
1649                 break;
1650         }
1651 #endif /* INET6 */
1652         }
1653 }
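
/*
 * Example output: 192.0.2.1 with port 80 prints as "192.0.2.1:80",
 * while 2001:db8::1 with port 80 prints as "2001:db8::1[80]"; the
 * longest run of zero groups in an IPv6 address is collapsed to "::"
 * by the scan above.
 */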
1654
1655 void
1656 pf_print_state(struct pf_state *s)
1657 {
1658         pf_print_state_parts(s, NULL, NULL);
1659 }
1660
1661 static void
1662 pf_print_state_parts(struct pf_state *s,
1663     struct pf_state_key *skwp, struct pf_state_key *sksp)
1664 {
1665         struct pf_state_key *skw, *sks;
1666         u_int8_t proto, dir;
1667
1668         /* Do our best to fill these, but they're skipped if NULL */
1669         skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
1670         sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
1671         proto = skw ? skw->proto : (sks ? sks->proto : 0);
1672         dir = s ? s->direction : 0;
1673
1674         switch (proto) {
1675         case IPPROTO_IPV4:
1676                 printf("IPv4");
1677                 break;
1678         case IPPROTO_IPV6:
1679                 printf("IPv6");
1680                 break;
1681         case IPPROTO_TCP:
1682                 printf("TCP");
1683                 break;
1684         case IPPROTO_UDP:
1685                 printf("UDP");
1686                 break;
1687         case IPPROTO_ICMP:
1688                 printf("ICMP");
1689                 break;
1690         case IPPROTO_ICMPV6:
1691                 printf("ICMPv6");
1692                 break;
1693         default:
1694                 printf("%u", proto);
1695                 break;
1696         }
1697         switch (dir) {
1698         case PF_IN:
1699                 printf(" in");
1700                 break;
1701         case PF_OUT:
1702                 printf(" out");
1703                 break;
1704         }
1705         if (skw) {
1706                 printf(" wire: ");
1707                 pf_print_host(&skw->addr[0], skw->port[0], skw->af);
1708                 printf(" ");
1709                 pf_print_host(&skw->addr[1], skw->port[1], skw->af);
1710         }
1711         if (sks) {
1712                 printf(" stack: ");
1713                 if (sks != skw) {
1714                         pf_print_host(&sks->addr[0], sks->port[0], sks->af);
1715                         printf(" ");
1716                         pf_print_host(&sks->addr[1], sks->port[1], sks->af);
1717                 } else
1718                         printf("-");
1719         }
1720         if (s) {
1721                 if (proto == IPPROTO_TCP) {
1722                         printf(" [lo=%u high=%u win=%u modulator=%u",
1723                             s->src.seqlo, s->src.seqhi,
1724                             s->src.max_win, s->src.seqdiff);
1725                         if (s->src.wscale && s->dst.wscale)
1726                                 printf(" wscale=%u",
1727                                     s->src.wscale & PF_WSCALE_MASK);
1728                         printf("]");
1729                         printf(" [lo=%u high=%u win=%u modulator=%u",
1730                             s->dst.seqlo, s->dst.seqhi,
1731                             s->dst.max_win, s->dst.seqdiff);
1732                         if (s->src.wscale && s->dst.wscale)
1733                                 printf(" wscale=%u",
1734                                     s->dst.wscale & PF_WSCALE_MASK);
1735                         printf("]");
1736                 }
1737                 printf(" %u:%u", s->src.state, s->dst.state);
1738         }
1739 }
1740
1741 void
1742 pf_print_flags(u_int8_t f)
1743 {
1744         if (f)
1745                 printf(" ");
1746         if (f & TH_FIN)
1747                 printf("F");
1748         if (f & TH_SYN)
1749                 printf("S");
1750         if (f & TH_RST)
1751                 printf("R");
1752         if (f & TH_PUSH)
1753                 printf("P");
1754         if (f & TH_ACK)
1755                 printf("A");
1756         if (f & TH_URG)
1757                 printf("U");
1758         if (f & TH_ECE)
1759                 printf("E");
1760         if (f & TH_CWR)
1761                 printf("W");
1762 }
1763
1764 #define PF_SET_SKIP_STEPS(i)                                    \
1765         do {                                                    \
1766                 while (head[i] != cur) {                        \
1767                         head[i]->skip[i].ptr = cur;             \
1768                         head[i] = TAILQ_NEXT(head[i], entries); \
1769                 }                                               \
1770         } while (0)
1771
1772 void
1773 pf_calc_skip_steps(struct pf_rulequeue *rules)
1774 {
1775         struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
1776         int i;
1777
1778         cur = TAILQ_FIRST(rules);
1779         prev = cur;
1780         for (i = 0; i < PF_SKIP_COUNT; ++i)
1781                 head[i] = cur;
1782         while (cur != NULL) {
1783
1784                 if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
1785                         PF_SET_SKIP_STEPS(PF_SKIP_IFP);
1786                 if (cur->direction != prev->direction)
1787                         PF_SET_SKIP_STEPS(PF_SKIP_DIR);
1788                 if (cur->af != prev->af)
1789                         PF_SET_SKIP_STEPS(PF_SKIP_AF);
1790                 if (cur->proto != prev->proto)
1791                         PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
1792                 if (cur->src.neg != prev->src.neg ||
1793                     pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
1794                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
1795                 if (cur->src.port[0] != prev->src.port[0] ||
1796                     cur->src.port[1] != prev->src.port[1] ||
1797                     cur->src.port_op != prev->src.port_op)
1798                         PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
1799                 if (cur->dst.neg != prev->dst.neg ||
1800                     pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
1801                         PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
1802                 if (cur->dst.port[0] != prev->dst.port[0] ||
1803                     cur->dst.port[1] != prev->dst.port[1] ||
1804                     cur->dst.port_op != prev->dst.port_op)
1805                         PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
1806
1807                 prev = cur;
1808                 cur = TAILQ_NEXT(cur, entries);
1809         }
1810         for (i = 0; i < PF_SKIP_COUNT; ++i)
1811                 PF_SET_SKIP_STEPS(i);
1812 }
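
/*
 * The skip steps computed above are pf's ruleset evaluation shortcut:
 * for each PF_SKIP_* criterion a rule points at the next rule that
 * differs in that criterion, so a mismatch lets the evaluator jump
 * over the whole run of rules sharing the same value instead of
 * testing them one by one.  For instance, if rules 1-3 are bound to
 * em0 and rules 4-5 to em1, rule 1's skip[PF_SKIP_IFP] points at rule
 * 4 and a packet arriving on em1 skips rules 1-3 with one comparison.
 */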
1813
1814 static int
1815 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
1816 {
1817         if (aw1->type != aw2->type)
1818                 return (1);
1819         switch (aw1->type) {
1820         case PF_ADDR_ADDRMASK:
1821         case PF_ADDR_RANGE:
1822                 if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, 0))
1823                         return (1);
1824                 if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, 0))
1825                         return (1);
1826                 return (0);
1827         case PF_ADDR_DYNIFTL:
1828                 return (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
1829         case PF_ADDR_NOROUTE:
1830         case PF_ADDR_URPFFAILED:
1831                 return (0);
1832         case PF_ADDR_TABLE:
1833                 return (aw1->p.tbl != aw2->p.tbl);
1834         default:
1835                 printf("invalid address type: %d\n", aw1->type);
1836                 return (1);
1837         }
1838 }
1839
1840 u_int16_t
1841 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
1842 {
1843         u_int32_t       l;
1844
1845         if (udp && !cksum)
1846                 return (0x0000);
1847         l = cksum + old - new;
1848         l = (l >> 16) + (l & 65535);
1849         l = l & 65535;
1850         if (udp && !l)
1851                 return (0xFFFF);
1852         return (l);
1853 }
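
/*
 * pf_cksum_fixup() is the incremental Internet checksum update (in the
 * spirit of RFC 1141/1624): since the stored checksum is the one's
 * complement of the sum, replacing a 16-bit word "old" with "new" only
 * requires folding (old - new) into the existing checksum.  The udp
 * flag preserves the UDP conventions that a transmitted checksum of 0
 * means "no checksum" and that a computed zero is sent as 0xffff.
 * A minimal sketch of a port rewrite (illustrative only, compiled out;
 * th is assumed to point at a valid TCP header):
 */
#if 0
	struct tcphdr	*th;			/* assumed valid */
	u_int16_t	 oport, nport;

	oport = th->th_sport;
	nport = htons(8080);
	th->th_sum = pf_cksum_fixup(th->th_sum, oport, nport, 0);
	th->th_sport = nport;
#endif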
1854
1855 static void
1856 pf_change_ap(struct pf_addr *a, u_int16_t *p, u_int16_t *ic, u_int16_t *pc,
1857     struct pf_addr *an, u_int16_t pn, u_int8_t u, sa_family_t af)
1858 {
1859         struct pf_addr  ao;
1860         u_int16_t       po = *p;
1861
1862         PF_ACPY(&ao, a, af);
1863         PF_ACPY(a, an, af);
1864
1865         *p = pn;
1866
1867         switch (af) {
1868 #ifdef INET
1869         case AF_INET:
1870                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1871                     ao.addr16[0], an->addr16[0], 0),
1872                     ao.addr16[1], an->addr16[1], 0);
1873                 *p = pn;
1874                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1875                     ao.addr16[0], an->addr16[0], u),
1876                     ao.addr16[1], an->addr16[1], u),
1877                     po, pn, u);
1878                 break;
1879 #endif /* INET */
1880 #ifdef INET6
1881         case AF_INET6:
1882                 *pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1883                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1884                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(*pc,
1885                     ao.addr16[0], an->addr16[0], u),
1886                     ao.addr16[1], an->addr16[1], u),
1887                     ao.addr16[2], an->addr16[2], u),
1888                     ao.addr16[3], an->addr16[3], u),
1889                     ao.addr16[4], an->addr16[4], u),
1890                     ao.addr16[5], an->addr16[5], u),
1891                     ao.addr16[6], an->addr16[6], u),
1892                     ao.addr16[7], an->addr16[7], u),
1893                     po, pn, u);
1894                 break;
1895 #endif /* INET6 */
1896         }
1897 }
1898
1899
1900 /* Changes a u_int32_t.  Uses a void * so there are no align restrictions */
1901 void
1902 pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
1903 {
1904         u_int32_t       ao;
1905
1906         memcpy(&ao, a, sizeof(ao));
1907         memcpy(a, &an, sizeof(u_int32_t));
1908         *c = pf_cksum_fixup(pf_cksum_fixup(*c, ao / 65536, an / 65536, u),
1909             ao % 65536, an % 65536, u);
1910 }
1911
1912 #ifdef INET6
1913 static void
1914 pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
1915 {
1916         struct pf_addr  ao;
1917
1918         PF_ACPY(&ao, a, AF_INET6);
1919         PF_ACPY(a, an, AF_INET6);
1920
1921         *c = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1922             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1923             pf_cksum_fixup(pf_cksum_fixup(*c,
1924             ao.addr16[0], an->addr16[0], u),
1925             ao.addr16[1], an->addr16[1], u),
1926             ao.addr16[2], an->addr16[2], u),
1927             ao.addr16[3], an->addr16[3], u),
1928             ao.addr16[4], an->addr16[4], u),
1929             ao.addr16[5], an->addr16[5], u),
1930             ao.addr16[6], an->addr16[6], u),
1931             ao.addr16[7], an->addr16[7], u);
1932 }
1933 #endif /* INET6 */
1934
1935 static void
1936 pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
1937     struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
1938     u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
1939 {
1940         struct pf_addr  oia, ooa;
1941
1942         PF_ACPY(&oia, ia, af);
1943         if (oa)
1944                 PF_ACPY(&ooa, oa, af);
1945
1946         /* Change inner protocol port, fix inner protocol checksum. */
1947         if (ip != NULL) {
1948                 u_int16_t       oip = *ip;
1949                 u_int32_t       opc;
1950
1951                 if (pc != NULL)
1952                         opc = *pc;
1953                 *ip = np;
1954                 if (pc != NULL)
1955                         *pc = pf_cksum_fixup(*pc, oip, *ip, u);
1956                 *ic = pf_cksum_fixup(*ic, oip, *ip, 0);
1957                 if (pc != NULL)
1958                         *ic = pf_cksum_fixup(*ic, opc, *pc, 0);
1959         }
1960         /* Change inner ip address, fix inner ip and icmp checksums. */
1961         PF_ACPY(ia, na, af);
1962         switch (af) {
1963 #ifdef INET
1964         case AF_INET: {
1965                 u_int32_t        oh2c = *h2c;
1966
1967                 *h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
1968                     oia.addr16[0], ia->addr16[0], 0),
1969                     oia.addr16[1], ia->addr16[1], 0);
1970                 *ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
1971                     oia.addr16[0], ia->addr16[0], 0),
1972                     oia.addr16[1], ia->addr16[1], 0);
1973                 *ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
1974                 break;
1975         }
1976 #endif /* INET */
1977 #ifdef INET6
1978         case AF_INET6:
1979                 *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1980                     pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
1981                     pf_cksum_fixup(pf_cksum_fixup(*ic,
1982                     oia.addr16[0], ia->addr16[0], u),
1983                     oia.addr16[1], ia->addr16[1], u),
1984                     oia.addr16[2], ia->addr16[2], u),
1985                     oia.addr16[3], ia->addr16[3], u),
1986                     oia.addr16[4], ia->addr16[4], u),
1987                     oia.addr16[5], ia->addr16[5], u),
1988                     oia.addr16[6], ia->addr16[6], u),
1989                     oia.addr16[7], ia->addr16[7], u);
1990                 break;
1991 #endif /* INET6 */
1992         }
1993         /* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
1994         if (oa) {
1995                 PF_ACPY(oa, na, af);
1996                 switch (af) {
1997 #ifdef INET
1998                 case AF_INET:
1999                         *hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
2000                             ooa.addr16[0], oa->addr16[0], 0),
2001                             ooa.addr16[1], oa->addr16[1], 0);
2002                         break;
2003 #endif /* INET */
2004 #ifdef INET6
2005                 case AF_INET6:
2006                         *ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2007                             pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
2008                             pf_cksum_fixup(pf_cksum_fixup(*ic,
2009                             ooa.addr16[0], oa->addr16[0], u),
2010                             ooa.addr16[1], oa->addr16[1], u),
2011                             ooa.addr16[2], oa->addr16[2], u),
2012                             ooa.addr16[3], oa->addr16[3], u),
2013                             ooa.addr16[4], oa->addr16[4], u),
2014                             ooa.addr16[5], oa->addr16[5], u),
2015                             ooa.addr16[6], oa->addr16[6], u),
2016                             ooa.addr16[7], oa->addr16[7], u);
2017                         break;
2018 #endif /* INET6 */
2019                 }
2020         }
2021 }
2022
2023
2024 /*
2025  * Need to modulate the sequence numbers in the TCP SACK option
2026  * (credits to Krzysztof Pfaff for report and patch)
2027  */
2028 static int
2029 pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
2030     struct tcphdr *th, struct pf_state_peer *dst)
2031 {
2032         int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
2033         u_int8_t opts[TCP_MAXOLEN], *opt = opts;
2034         int copyback = 0, i, olen;
2035         struct sackblk sack;
2036
2037 #define TCPOLEN_SACKLEN (TCPOLEN_SACK + 2)
2038         if (hlen < TCPOLEN_SACKLEN ||
2039             !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
2040                 return (0);
2041
2042         while (hlen >= TCPOLEN_SACKLEN) {
2043                 olen = opt[1];
2044                 switch (*opt) {
2045                 case TCPOPT_EOL:        /* FALLTHROUGH */
2046                 case TCPOPT_NOP:
2047                         opt++;
2048                         hlen--;
2049                         break;
2050                 case TCPOPT_SACK:
2051                         if (olen > hlen)
2052                                 olen = hlen;
2053                         if (olen >= TCPOLEN_SACKLEN) {
2054                                 for (i = 2; i + TCPOLEN_SACK <= olen;
2055                                     i += TCPOLEN_SACK) {
2056                                         memcpy(&sack, &opt[i], sizeof(sack));
2057                                         pf_change_a(&sack.start, &th->th_sum,
2058                                             htonl(ntohl(sack.start) -
2059                                             dst->seqdiff), 0);
2060                                         pf_change_a(&sack.end, &th->th_sum,
2061                                             htonl(ntohl(sack.end) -
2062                                             dst->seqdiff), 0);
2063                                         memcpy(&opt[i], &sack, sizeof(sack));
2064                                 }
2065                                 copyback = 1;
2066                         }
2067                         /* FALLTHROUGH */
2068                 default:
2069                         if (olen < 2)
2070                                 olen = 2;
2071                         hlen -= olen;
2072                         opt += olen;
2073                 }
2074         }
2075
2076         if (copyback)
2077                 m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
2078         return (copyback);
2079 }
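
/*
 * Rationale: SACK blocks carry absolute sequence numbers, so when a
 * state modulates sequence numbers (dst->seqdiff != 0) the left and
 * right edges of every SACK block must be shifted by the same amount,
 * with the TCP checksum fixed up via pf_change_a() for each edge.
 */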
2080
2081 static void
2082 pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
2083     const struct pf_addr *saddr, const struct pf_addr *daddr,
2084     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
2085     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
2086     u_int16_t rtag, struct ifnet *ifp)
2087 {
2088         struct pf_send_entry *pfse;
2089         struct mbuf     *m;
2090         int              len, tlen;
2091 #ifdef INET
2092         struct ip       *h = NULL;
2093 #endif /* INET */
2094 #ifdef INET6
2095         struct ip6_hdr  *h6 = NULL;
2096 #endif /* INET6 */
2097         struct tcphdr   *th;
2098         char            *opt;
2099         struct pf_mtag  *pf_mtag;
2100
2101         len = 0;
2102         th = NULL;
2103
2104         /* maximum segment size tcp option */
2105         tlen = sizeof(struct tcphdr);
2106         if (mss)
2107                 tlen += 4;
2108
2109         switch (af) {
2110 #ifdef INET
2111         case AF_INET:
2112                 len = sizeof(struct ip) + tlen;
2113                 break;
2114 #endif /* INET */
2115 #ifdef INET6
2116         case AF_INET6:
2117                 len = sizeof(struct ip6_hdr) + tlen;
2118                 break;
2119 #endif /* INET6 */
2120         default:
2121                 panic("%s: unsupported af %d", __func__, af);
2122         }
2123
2124         /* Allocate outgoing queue entry, mbuf and mbuf tag. */
2125         pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2126         if (pfse == NULL)
2127                 return;
2128         m = m_gethdr(M_NOWAIT, MT_HEADER);
2129         if (m == NULL) {
2130                 free(pfse, M_PFTEMP);
2131                 return;
2132         }
2133 #ifdef MAC
2134         mac_netinet_firewall_send(m);
2135 #endif
2136         if ((pf_mtag = pf_get_mtag(m)) == NULL) {
2137                 free(pfse, M_PFTEMP);
2138                 m_freem(m);
2139                 return;
2140         }
2141         if (tag)
2142                 m->m_flags |= M_SKIP_FIREWALL;
2143         pf_mtag->tag = rtag;
2144
2145         if (r != NULL && r->rtableid >= 0)
2146                 M_SETFIB(m, r->rtableid);
2147
2148 #ifdef ALTQ
2149         if (r != NULL && r->qid) {
2150                 pf_mtag->qid = r->qid;
2151
2152                 /* add hints for ecn */
2153                 pf_mtag->hdr = mtod(m, struct ip *);
2154         }
2155 #endif /* ALTQ */
2156         m->m_data += max_linkhdr;
2157         m->m_pkthdr.len = m->m_len = len;
2158         m->m_pkthdr.rcvif = NULL;
2159         bzero(m->m_data, len);
2160         switch (af) {
2161 #ifdef INET
2162         case AF_INET:
2163                 h = mtod(m, struct ip *);
2164
2165                 /* IP header fields included in the TCP checksum */
2166                 h->ip_p = IPPROTO_TCP;
2167                 h->ip_len = htons(tlen);
2168                 h->ip_src.s_addr = saddr->v4.s_addr;
2169                 h->ip_dst.s_addr = daddr->v4.s_addr;
2170
2171                 th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
2172                 break;
2173 #endif /* INET */
2174 #ifdef INET6
2175         case AF_INET6:
2176                 h6 = mtod(m, struct ip6_hdr *);
2177
2178                 /* IP header fields included in the TCP checksum */
2179                 h6->ip6_nxt = IPPROTO_TCP;
2180                 h6->ip6_plen = htons(tlen);
2181                 memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
2182                 memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
2183
2184                 th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
2185                 break;
2186 #endif /* INET6 */
2187         }
2188
2189         /* TCP header */
2190         th->th_sport = sport;
2191         th->th_dport = dport;
2192         th->th_seq = htonl(seq);
2193         th->th_ack = htonl(ack);
2194         th->th_off = tlen >> 2;
2195         th->th_flags = flags;
2196         th->th_win = htons(win);
2197
2198         if (mss) {
2199                 opt = (char *)(th + 1);
2200                 opt[0] = TCPOPT_MAXSEG;
2201                 opt[1] = 4;
2202                 HTONS(mss);
2203                 bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
2204         }
2205
2206         switch (af) {
2207 #ifdef INET
2208         case AF_INET:
2209                 /* TCP checksum */
2210                 th->th_sum = in_cksum(m, len);
2211
2212                 /* Finish the IP header */
2213                 h->ip_v = 4;
2214                 h->ip_hl = sizeof(*h) >> 2;
2215                 h->ip_tos = IPTOS_LOWDELAY;
2216                 h->ip_off = V_path_mtu_discovery ? IP_DF : 0;
2217                 h->ip_len = len;
2218                 h->ip_ttl = ttl ? ttl : V_ip_defttl;
2219                 h->ip_sum = 0;
2220
2221                 pfse->pfse_type = PFSE_IP;
2222                 break;
2223 #endif /* INET */
2224 #ifdef INET6
2225         case AF_INET6:
2226                 /* TCP checksum */
2227                 th->th_sum = in6_cksum(m, IPPROTO_TCP,
2228                     sizeof(struct ip6_hdr), tlen);
2229
2230                 h6->ip6_vfc |= IPV6_VERSION;
2231                 h6->ip6_hlim = IPV6_DEFHLIM;
2232
2233                 pfse->pfse_type = PFSE_IP6;
2234                 break;
2235 #endif /* INET6 */
2236         }
2237         pfse->pfse_m = m;
2238         pf_send(pfse);
2239 }
2240
2241 static void
2242 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
2243     struct pf_rule *r)
2244 {
2245         struct pf_send_entry *pfse;
2246         struct mbuf *m0;
2247         struct pf_mtag *pf_mtag;
2248
2249         /* Allocate outgoing queue entry, mbuf and mbuf tag. */
2250         pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
2251         if (pfse == NULL)
2252                 return;
2253
2254         if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
2255                 free(pfse, M_PFTEMP);
2256                 return;
2257         }
2258
2259         if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
2260                 free(pfse, M_PFTEMP);
2261                 return;
2262         }
2263         /* XXX: revisit */
2264         m0->m_flags |= M_SKIP_FIREWALL;
2265
2266         if (r->rtableid >= 0)
2267                 M_SETFIB(m0, r->rtableid);
2268
2269 #ifdef ALTQ
2270         if (r->qid) {
2271                 pf_mtag->qid = r->qid;
2272                 /* add hints for ecn */
2273                 pf_mtag->hdr = mtod(m0, struct ip *);
2274         }
2275 #endif /* ALTQ */
2276
2277         switch (af) {
2278 #ifdef INET
2279         case AF_INET:
2280             {
2281                 struct ip *ip;
2282
2283                 /* icmp_error() expects host byte ordering */
2284                 ip = mtod(m0, struct ip *);
2285                 NTOHS(ip->ip_len);
2286                 NTOHS(ip->ip_off);
2287
2288                 pfse->pfse_type = PFSE_ICMP;
2289                 break;
2290             }
2291 #endif /* INET */
2292 #ifdef INET6
2293         case AF_INET6:
2294                 pfse->pfse_type = PFSE_ICMP6;
2295                 break;
2296 #endif /* INET6 */
2297         }
2298         pfse->pfse_m = m0;
2299         pfse->pfse_icmp_type = type;
2300         pfse->pfse_icmp_code = code;
2301         pf_send(pfse);
2302 }
2303
2304 /*
2305  * Return 1 if the addresses a and b match (with mask m), otherwise return 0.
2306  * If n is 0, they match if they are equal. If n is nonzero, they match if
2307  * they are different.
2308  */
2309 int
2310 pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
2311     struct pf_addr *b, sa_family_t af)
2312 {
2313         int     match = 0;
2314
2315         switch (af) {
2316 #ifdef INET
2317         case AF_INET:
2318                 if ((a->addr32[0] & m->addr32[0]) ==
2319                     (b->addr32[0] & m->addr32[0]))
2320                         match++;
2321                 break;
2322 #endif /* INET */
2323 #ifdef INET6
2324         case AF_INET6:
2325                 if (((a->addr32[0] & m->addr32[0]) ==
2326                      (b->addr32[0] & m->addr32[0])) &&
2327                     ((a->addr32[1] & m->addr32[1]) ==
2328                      (b->addr32[1] & m->addr32[1])) &&
2329                     ((a->addr32[2] & m->addr32[2]) ==
2330                      (b->addr32[2] & m->addr32[2])) &&
2331                     ((a->addr32[3] & m->addr32[3]) ==
2332                      (b->addr32[3] & m->addr32[3])))
2333                         match++;
2334                 break;
2335 #endif /* INET6 */
2336         }
2337         if (match) {
2338                 if (n)
2339                         return (0);
2340                 else
2341                         return (1);
2342         } else {
2343                 if (n)
2344                         return (1);
2345                 else
2346                         return (0);
2347         }
2348 }
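
/*
 * A small usage sketch (illustrative only, compiled out): testing
 * whether 192.0.2.10 falls inside 192.0.2.0/24.  With n == 0 the call
 * returns 1 (the masked addresses are equal); with n != 0, modelling a
 * negated ("!") rule address, the same comparison returns 0.
 */
#if 0
	struct pf_addr	net, mask, host;

	net.v4.s_addr  = htonl(0xc0000200);	/* 192.0.2.0     */
	mask.v4.s_addr = htonl(0xffffff00);	/* 255.255.255.0 */
	host.v4.s_addr = htonl(0xc000020a);	/* 192.0.2.10    */
	/* pf_match_addr(0, &net, &mask, &host, AF_INET) == 1 */
	/* pf_match_addr(1, &net, &mask, &host, AF_INET) == 0 */
#endif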
2349
2350 /*
2351  * Return 1 if b <= a <= e, otherwise return 0.
2352  */
2353 int
2354 pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
2355     struct pf_addr *a, sa_family_t af)
2356 {
2357         switch (af) {
2358 #ifdef INET
2359         case AF_INET:
2360                 if ((a->addr32[0] < b->addr32[0]) ||
2361                     (a->addr32[0] > e->addr32[0]))
2362                         return (0);
2363                 break;
2364 #endif /* INET */
2365 #ifdef INET6
2366         case AF_INET6: {
2367                 int     i;
2368
2369                 /* check a >= b */
2370                 for (i = 0; i < 4; ++i)
2371                         if (a->addr32[i] > b->addr32[i])
2372                                 break;
2373                         else if (a->addr32[i] < b->addr32[i])
2374                                 return (0);
2375                 /* check a <= e */
2376                 for (i = 0; i < 4; ++i)
2377                         if (a->addr32[i] < e->addr32[i])
2378                                 break;
2379                         else if (a->addr32[i] > e->addr32[i])
2380                                 return (0);
2381                 break;
2382         }
2383 #endif /* INET6 */
2384         }
2385         return (1);
2386 }
2387
2388 static int
2389 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
2390 {
2391         switch (op) {
2392         case PF_OP_IRG:
2393                 return ((p > a1) && (p < a2));
2394         case PF_OP_XRG:
2395                 return ((p < a1) || (p > a2));
2396         case PF_OP_RRG:
2397                 return ((p >= a1) && (p <= a2));
2398         case PF_OP_EQ:
2399                 return (p == a1);
2400         case PF_OP_NE:
2401                 return (p != a1);
2402         case PF_OP_LT:
2403                 return (p < a1);
2404         case PF_OP_LE:
2405                 return (p <= a1);
2406         case PF_OP_GT:
2407                 return (p > a1);
2408         case PF_OP_GE:
2409                 return (p >= a1);
2410         }
2411         return (0); /* never reached */
2412 }
2413
2414 int
2415 pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
2416 {
2417         NTOHS(a1);
2418         NTOHS(a2);
2419         NTOHS(p);
2420         return (pf_match(op, a1, a2, p));
2421 }
2422
2423 static int
2424 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
2425 {
2426         if (u == UID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2427                 return (0);
2428         return (pf_match(op, a1, a2, u));
2429 }
2430
2431 static int
2432 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
2433 {
2434         if (g == GID_MAX && op != PF_OP_EQ && op != PF_OP_NE)
2435                 return (0);
2436         return (pf_match(op, a1, a2, g));
2437 }
2438
2439 int
2440 pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
2441 {
2442         if (*tag == -1)
2443                 *tag = mtag;
2444
2445         return ((!r->match_tag_not && r->match_tag == *tag) ||
2446             (r->match_tag_not && r->match_tag != *tag));
2447 }
2448
2449 int
2450 pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
2451 {
2452
2453         KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
2454
2455         if (pd->pf_mtag == NULL && ((pd->pf_mtag = pf_get_mtag(m)) == NULL))
2456                 return (ENOMEM);
2457
2458         pd->pf_mtag->tag = tag;
2459
2460         return (0);
2461 }
2462
2463 void
2464 pf_step_into_anchor(int *depth, struct pf_ruleset **rs, int n,
2465     struct pf_rule **r, struct pf_rule **a, int *match)
2466 {
2467         struct pf_anchor_stackframe     *f;
2468
2469         PF_RULES_RASSERT();
2470
2471         (*r)->anchor->match = 0;
2472         if (match)
2473                 *match = 0;
2474         if (*depth >= sizeof(V_pf_anchor_stack) /
2475             sizeof(V_pf_anchor_stack[0])) {
2476                 printf("pf_step_into_anchor: stack overflow\n");
2477                 *r = TAILQ_NEXT(*r, entries);
2478                 return;
2479         } else if (*depth == 0 && a != NULL)
2480                 *a = *r;
2481         f = V_pf_anchor_stack + (*depth)++;
2482         f->rs = *rs;
2483         f->r = *r;
2484         if ((*r)->anchor_wildcard) {
2485                 f->parent = &(*r)->anchor->children;
2486                 if ((f->child = RB_MIN(pf_anchor_node, f->parent)) ==
2487                     NULL) {
2488                         *r = NULL;
2489                         return;
2490                 }
2491                 *rs = &f->child->ruleset;
2492         } else {
2493                 f->parent = NULL;
2494                 f->child = NULL;
2495                 *rs = &(*r)->anchor->ruleset;
2496         }
2497         *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2498 }
2499
2500 int
2501 pf_step_out_of_anchor(int *depth, struct pf_ruleset **rs, int n,
2502     struct pf_rule **r, struct pf_rule **a, int *match)
2503 {
2504         struct pf_anchor_stackframe     *f;
2505         int quick = 0;
2506
2507         PF_RULES_RASSERT();
2508
2509         do {
2510                 if (*depth <= 0)
2511                         break;
2512                 f = V_pf_anchor_stack + *depth - 1;
2513                 if (f->parent != NULL && f->child != NULL) {
2514                         if (f->child->match ||
2515                             (match != NULL && *match)) {
2516                                 f->r->anchor->match = 1;
2517                                 *match = 0;
2518                         }
2519                         f->child = RB_NEXT(pf_anchor_node, f->parent, f->child);
2520                         if (f->child != NULL) {
2521                                 *rs = &f->child->ruleset;
2522                                 *r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
2523                                 if (*r == NULL)
2524                                         continue;
2525                                 else
2526                                         break;
2527                         }
2528                 }
2529                 (*depth)--;
2530                 if (*depth == 0 && a != NULL)
2531                         *a = NULL;
2532                 *rs = f->rs;
2533                 if (f->r->anchor->match || (match != NULL && *match))
2534                         quick = f->r->quick;
2535                 *r = TAILQ_NEXT(f->r, entries);
2536         } while (*r == NULL);
2537
2538         return (quick);
2539 }
2540
2541 #ifdef INET6
2542 void
2543 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
2544     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
2545 {
2546         switch (af) {
2547 #ifdef INET
2548         case AF_INET:
2549                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2550                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2551                 break;
2552 #endif /* INET */
2553         case AF_INET6:
2554                 naddr->addr32[0] = (raddr->addr32[0] & rmask->addr32[0]) |
2555                 ((rmask->addr32[0] ^ 0xffffffff ) & saddr->addr32[0]);
2556                 naddr->addr32[1] = (raddr->addr32[1] & rmask->addr32[1]) |
2557                 ((rmask->addr32[1] ^ 0xffffffff ) & saddr->addr32[1]);
2558                 naddr->addr32[2] = (raddr->addr32[2] & rmask->addr32[2]) |
2559                 ((rmask->addr32[2] ^ 0xffffffff ) & saddr->addr32[2]);
2560                 naddr->addr32[3] = (raddr->addr32[3] & rmask->addr32[3]) |
2561                 ((rmask->addr32[3] ^ 0xffffffff ) & saddr->addr32[3]);
2562                 break;
2563         }
2564 }
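
/*
 * pf_poolmask() implements the bitmask pool option: the network part
 * of the translation address is taken from raddr, the host part from
 * the original address.  For example, raddr 10.0.0.0 with rmask
 * 255.255.255.0 and saddr 198.51.100.7 yields 10.0.0.7.
 */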
2565
2566 void
2567 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
2568 {
2569         switch (af) {
2570 #ifdef INET
2571         case AF_INET:
2572                 addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
2573                 break;
2574 #endif /* INET */
2575         case AF_INET6:
2576                 if (addr->addr32[3] == 0xffffffff) {
2577                         addr->addr32[3] = 0;
2578                         if (addr->addr32[2] == 0xffffffff) {
2579                                 addr->addr32[2] = 0;
2580                                 if (addr->addr32[1] == 0xffffffff) {
2581                                         addr->addr32[1] = 0;
2582                                         addr->addr32[0] =
2583                                             htonl(ntohl(addr->addr32[0]) + 1);
2584                                 } else
2585                                         addr->addr32[1] =
2586                                             htonl(ntohl(addr->addr32[1]) + 1);
2587                         } else
2588                                 addr->addr32[2] =
2589                                     htonl(ntohl(addr->addr32[2]) + 1);
2590                 } else
2591                         addr->addr32[3] =
2592                             htonl(ntohl(addr->addr32[3]) + 1);
2593                 break;
2594         }
2595 }
2596 #endif /* INET6 */
2597
2598 int
2599 pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
2600 {
2601         struct pf_addr          *saddr, *daddr;
2602         u_int16_t                sport, dport;
2603         struct inpcbinfo        *pi;
2604         struct inpcb            *inp;
2605
2606         pd->lookup.uid = UID_MAX;
2607         pd->lookup.gid = GID_MAX;
2608
2609         switch (pd->proto) {
2610         case IPPROTO_TCP:
2611                 if (pd->hdr.tcp == NULL)
2612                         return (-1);
2613                 sport = pd->hdr.tcp->th_sport;
2614                 dport = pd->hdr.tcp->th_dport;
2615                 pi = &V_tcbinfo;
2616                 break;
2617         case IPPROTO_UDP:
2618                 if (pd->hdr.udp == NULL)
2619                         return (-1);
2620                 sport = pd->hdr.udp->uh_sport;
2621                 dport = pd->hdr.udp->uh_dport;
2622                 pi = &V_udbinfo;
2623                 break;
2624         default:
2625                 return (-1);
2626         }
2627         if (direction == PF_IN) {
2628                 saddr = pd->src;
2629                 daddr = pd->dst;
2630         } else {
2631                 u_int16_t       p;
2632
2633                 p = sport;
2634                 sport = dport;
2635                 dport = p;
2636                 saddr = pd->dst;
2637                 daddr = pd->src;
2638         }
2639         switch (pd->af) {
2640 #ifdef INET
2641         case AF_INET:
2642                 inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
2643                     dport, INPLOOKUP_RLOCKPCB, NULL, m);
2644                 if (inp == NULL) {
2645                         inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
2646                            daddr->v4, dport, INPLOOKUP_WILDCARD |
2647                            INPLOOKUP_RLOCKPCB, NULL, m);
2648                         if (inp == NULL)
2649                                 return (-1);
2650                 }
2651                 break;
2652 #endif /* INET */
2653 #ifdef INET6
2654         case AF_INET6:
2655                 inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
2656                     dport, INPLOOKUP_RLOCKPCB, NULL, m);
2657                 if (inp == NULL) {
2658                         inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
2659                             &daddr->v6, dport, INPLOOKUP_WILDCARD |
2660                             INPLOOKUP_RLOCKPCB, NULL, m);
2661                         if (inp == NULL)
2662                                 return (-1);
2663                 }
2664                 break;
2665 #endif /* INET6 */
2666
2667         default:
2668                 return (-1);
2669         }
2670         INP_RLOCK_ASSERT(inp);
2671         pd->lookup.uid = inp->inp_cred->cr_uid;
2672         pd->lookup.gid = inp->inp_cred->cr_groups[0];
2673         INP_RUNLOCK(inp);
2674
2675         return (1);
2676 }
2677
2678 static u_int8_t
2679 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2680 {
2681         int              hlen;
2682         u_int8_t         hdr[60];
2683         u_int8_t        *opt, optlen;
2684         u_int8_t         wscale = 0;
2685
2686         hlen = th_off << 2;             /* hlen <= sizeof(hdr) */
2687         if (hlen <= sizeof(struct tcphdr))
2688                 return (0);
2689         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2690                 return (0);
2691         opt = hdr + sizeof(struct tcphdr);
2692         hlen -= sizeof(struct tcphdr);
2693         while (hlen >= 3) {
2694                 switch (*opt) {
2695                 case TCPOPT_EOL:
2696                 case TCPOPT_NOP:
2697                         ++opt;
2698                         --hlen;
2699                         break;
2700                 case TCPOPT_WINDOW:
2701                         wscale = opt[2];
2702                         if (wscale > TCP_MAX_WINSHIFT)
2703                                 wscale = TCP_MAX_WINSHIFT;
2704                         wscale |= PF_WSCALE_FLAG;
2705                         /* FALLTHROUGH */
2706                 default:
2707                         optlen = opt[1];
2708                         if (optlen < 2)
2709                                 optlen = 2;
2710                         hlen -= optlen;
2711                         opt += optlen;
2712                         break;
2713                 }
2714         }
2715         return (wscale);
2716 }
2717
2718 static u_int16_t
2719 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
2720 {
2721         int              hlen;
2722         u_int8_t         hdr[60];
2723         u_int8_t        *opt, optlen;
2724         u_int16_t        mss = V_tcp_mssdflt;
2725
2726         hlen = th_off << 2;     /* hlen <= sizeof(hdr) */
2727         if (hlen <= sizeof(struct tcphdr))
2728                 return (0);
2729         if (!pf_pull_hdr(m, off, hdr, hlen, NULL, NULL, af))
2730                 return (0);
2731         opt = hdr + sizeof(struct tcphdr);
2732         hlen -= sizeof(struct tcphdr);
2733         while (hlen >= TCPOLEN_MAXSEG) {
2734                 switch (*opt) {
2735                 case TCPOPT_EOL:
2736                 case TCPOPT_NOP:
2737                         ++opt;
2738                         --hlen;
2739                         break;
2740                 case TCPOPT_MAXSEG:
2741                         bcopy((caddr_t)(opt + 2), (caddr_t)&mss, 2);
2742                         NTOHS(mss);
2743                         /* FALLTHROUGH */
2744                 default:
2745                         optlen = opt[1];
2746                         if (optlen < 2)
2747                                 optlen = 2;
2748                         hlen -= optlen;
2749                         opt += optlen;
2750                         break;
2751                 }
2752         }
2753         return (mss);
2754 }
2755
2756 static u_int16_t
2757 pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
2758 {
2759 #ifdef INET
2760         struct sockaddr_in      *dst;
2761         struct route             ro;
2762 #endif /* INET */
2763 #ifdef INET6
2764         struct sockaddr_in6     *dst6;
2765         struct route_in6         ro6;
2766 #endif /* INET6 */
2767         struct rtentry          *rt = NULL;
2768         int                      hlen = 0;
2769         u_int16_t                mss = V_tcp_mssdflt;
2770
2771         switch (af) {
2772 #ifdef INET
2773         case AF_INET:
2774                 hlen = sizeof(struct ip);
2775                 bzero(&ro, sizeof(ro));
2776                 dst = (struct sockaddr_in *)&ro.ro_dst;
2777                 dst->sin_family = AF_INET;
2778                 dst->sin_len = sizeof(*dst);
2779                 dst->sin_addr = addr->v4;
2780                 in_rtalloc_ign(&ro, 0, rtableid);
2781                 rt = ro.ro_rt;
2782                 break;
2783 #endif /* INET */
2784 #ifdef INET6
2785         case AF_INET6:
2786                 hlen = sizeof(struct ip6_hdr);
2787                 bzero(&ro6, sizeof(ro6));
2788                 dst6 = (struct sockaddr_in6 *)&ro6.ro_dst;
2789                 dst6->sin6_family = AF_INET6;
2790                 dst6->sin6_len = sizeof(*dst6);
2791                 dst6->sin6_addr = addr->v6;
2792                 in6_rtalloc_ign(&ro6, 0, rtableid);
2793                 rt = ro6.ro_rt;
2794                 break;
2795 #endif /* INET6 */
2796         }
2797
2798         if (rt && rt->rt_ifp) {
2799                 mss = rt->rt_ifp->if_mtu - hlen - sizeof(struct tcphdr);
2800                 mss = max(V_tcp_mssdflt, mss);
2801                 RTFREE(rt);
2802         }
2803         mss = min(mss, offer);
2804         mss = max(mss, 64);             /* sanity - at least max opt space */
2805         return (mss);
2806 }
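
/*
 * Example: for an IPv4 peer reached over a 1500-byte MTU interface the
 * route lookup above yields 1500 - 20 - 20 = 1460 bytes, which is then
 * clamped to the MSS the peer offered; if no usable route is found the
 * stack's default MSS (net.inet.tcp.mssdflt) is used instead.
 */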
2807
2808 static void
2809 pf_set_rt_ifp(struct pf_state *s, struct pf_addr *saddr)
2810 {
2811         struct pf_rule *r = s->rule.ptr;
2812         struct pf_src_node *sn = NULL;
2813
2814         s->rt_kif = NULL;
2815         if (!r->rt || r->rt == PF_FASTROUTE)
2816                 return;
2817         switch (s->key[PF_SK_WIRE]->af) {
2818 #ifdef INET
2819         case AF_INET:
2820                 pf_map_addr(AF_INET, r, saddr, &s->rt_addr, NULL, &sn);
2821                 s->rt_kif = r->rpool.cur->kif;
2822                 break;
2823 #endif /* INET */
2824 #ifdef INET6
2825         case AF_INET6:
2826                 pf_map_addr(AF_INET6, r, saddr, &s->rt_addr, NULL, &sn);
2827                 s->rt_kif = r->rpool.cur->kif;
2828                 break;
2829 #endif /* INET6 */
2830         }
2831 }
2832
2833 static u_int32_t
2834 pf_tcp_iss(struct pf_pdesc *pd)
2835 {
2836         MD5_CTX ctx;
2837         u_int32_t digest[4];
2838
2839         if (V_pf_tcp_secret_init == 0) {
2840                 read_random(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
2841                 MD5Init(&V_pf_tcp_secret_ctx);
2842                 MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
2843                     sizeof(V_pf_tcp_secret));
2844                 V_pf_tcp_secret_init = 1;
2845         }
2846
2847         ctx = V_pf_tcp_secret_ctx;
2848
2849         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
2850         MD5Update(&ctx, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
2851         if (pd->af == AF_INET6) {
2852                 MD5Update(&ctx, (char *)&pd->src->v6, sizeof(struct in6_addr));
2853                 MD5Update(&ctx, (char *)&pd->dst->v6, sizeof(struct in6_addr));
2854         } else {
2855                 MD5Update(&ctx, (char *)&pd->src->v4, sizeof(struct in_addr));
2856                 MD5Update(&ctx, (char *)&pd->dst->v4, sizeof(struct in_addr));
2857         }
2858         MD5Final((u_char *)digest, &ctx);
2859         V_pf_tcp_iss_off += 4096;
2860 #define ISN_RANDOM_INCREMENT (4096 - 1)
2861         return (digest[0] + (arc4random() & ISN_RANDOM_INCREMENT) +
2862             V_pf_tcp_iss_off);
2863 #undef  ISN_RANDOM_INCREMENT
2864 }
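
/*
 * The ISN is generated much like the RFC 1948/6528 scheme: an MD5 hash
 * over a lazily initialized random secret and the connection 4-tuple,
 * plus a per-call 4096 step offset and a small random increment, so
 * the sequence numbers pf generates are hard to predict yet keep
 * advancing over time.
 */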
2865
2866 static int
2867 pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
2868     struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
2869     struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
2870 {
2871         struct pf_rule          *nr = NULL;
2872         struct pf_addr          * const saddr = pd->src;
2873         struct pf_addr          * const daddr = pd->dst;
2874         sa_family_t              af = pd->af;
2875         struct pf_rule          *r, *a = NULL;
2876         struct pf_ruleset       *ruleset = NULL;
2877         struct pf_src_node      *nsn = NULL;
2878         struct tcphdr           *th = pd->hdr.tcp;
2879         struct pf_state_key     *sk = NULL, *nk = NULL;
2880         u_short                  reason;
2881         int                      rewrite = 0, hdrlen = 0;
2882         int                      tag = -1, rtableid = -1;
2883         int                      asd = 0;
2884         int                      match = 0;
2885         int                      state_icmp = 0;
2886         u_int16_t                sport = 0, dport = 0;
2887         u_int16_t                bproto_sum = 0, bip_sum = 0;
2888         u_int8_t                 icmptype = 0, icmpcode = 0;
2889
2890         PF_RULES_RASSERT();
2891
2892         if (inp != NULL) {
2893                 INP_LOCK_ASSERT(inp);
2894                 pd->lookup.uid = inp->inp_cred->cr_uid;
2895                 pd->lookup.gid = inp->inp_cred->cr_groups[0];
2896                 pd->lookup.done = 1;
2897         }
2898
2899         switch (pd->proto) {
2900         case IPPROTO_TCP:
2901                 sport = th->th_sport;
2902                 dport = th->th_dport;
2903                 hdrlen = sizeof(*th);
2904                 break;
2905         case IPPROTO_UDP:
2906                 sport = pd->hdr.udp->uh_sport;
2907                 dport = pd->hdr.udp->uh_dport;
2908                 hdrlen = sizeof(*pd->hdr.udp);
2909                 break;
2910 #ifdef INET
2911         case IPPROTO_ICMP:
2912                 if (pd->af != AF_INET)
2913                         break;
2914                 sport = dport = pd->hdr.icmp->icmp_id;
2915                 hdrlen = sizeof(*pd->hdr.icmp);
2916                 icmptype = pd->hdr.icmp->icmp_type;
2917                 icmpcode = pd->hdr.icmp->icmp_code;
2918
2919                 if (icmptype == ICMP_UNREACH ||
2920                     icmptype == ICMP_SOURCEQUENCH ||
2921                     icmptype == ICMP_REDIRECT ||
2922                     icmptype == ICMP_TIMXCEED ||
2923                     icmptype == ICMP_PARAMPROB)
2924                         state_icmp++;
2925                 break;
2926 #endif /* INET */
2927 #ifdef INET6
2928         case IPPROTO_ICMPV6:
2929                 if (af != AF_INET6)
2930                         break;
2931                 sport = dport = pd->hdr.icmp6->icmp6_id;
2932                 hdrlen = sizeof(*pd->hdr.icmp6);
2933                 icmptype = pd->hdr.icmp6->icmp6_type;
2934                 icmpcode = pd->hdr.icmp6->icmp6_code;
2935
2936                 if (icmptype == ICMP6_DST_UNREACH ||
2937                     icmptype == ICMP6_PACKET_TOO_BIG ||
2938                     icmptype == ICMP6_TIME_EXCEEDED ||
2939                     icmptype == ICMP6_PARAM_PROB)
2940                         state_icmp++;
2941                 break;
2942 #endif /* INET6 */
2943         default:
2944                 sport = dport = hdrlen = 0;
2945                 break;
2946         }
2947
2948         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
2949
2950         /* check packet for BINAT/NAT/RDR */
2951         if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
2952             &nk, saddr, daddr, sport, dport)) != NULL) {
2953                 KASSERT(sk != NULL, ("%s: null sk", __func__));
2954                 KASSERT(nk != NULL, ("%s: null nk", __func__));
2955
2956                 if (pd->ip_sum)
2957                         bip_sum = *pd->ip_sum;
2958
2959                 switch (pd->proto) {
2960                 case IPPROTO_TCP:
2961                         bproto_sum = th->th_sum;
2962                         pd->proto_sum = &th->th_sum;
2963
2964                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
2965                             nk->port[pd->sidx] != sport) {
2966                                 pf_change_ap(saddr, &th->th_sport, pd->ip_sum,
2967                                     &th->th_sum, &nk->addr[pd->sidx],
2968                                     nk->port[pd->sidx], 0, af);
2969                                 pd->sport = &th->th_sport;
2970                                 sport = th->th_sport;
2971                         }
2972
2973                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
2974                             nk->port[pd->didx] != dport) {
2975                                 pf_change_ap(daddr, &th->th_dport, pd->ip_sum,
2976                                     &th->th_sum, &nk->addr[pd->didx],
2977                                     nk->port[pd->didx], 0, af);
2978                                 dport = th->th_dport;
2979                                 pd->dport = &th->th_dport;
2980                         }
2981                         rewrite++;
2982                         break;
2983                 case IPPROTO_UDP:
2984                         bproto_sum = pd->hdr.udp->uh_sum;
2985                         pd->proto_sum = &pd->hdr.udp->uh_sum;
2986
2987                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
2988                             nk->port[pd->sidx] != sport) {
2989                                 pf_change_ap(saddr, &pd->hdr.udp->uh_sport,
2990                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
2991                                     &nk->addr[pd->sidx],
2992                                     nk->port[pd->sidx], 1, af);
2993                                 sport = pd->hdr.udp->uh_sport;
2994                                 pd->sport = &pd->hdr.udp->uh_sport;
2995                         }
2996
2997                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
2998                             nk->port[pd->didx] != dport) {
2999                                 pf_change_ap(daddr, &pd->hdr.udp->uh_dport,
3000                                     pd->ip_sum, &pd->hdr.udp->uh_sum,
3001                                     &nk->addr[pd->didx],
3002                                     nk->port[pd->didx], 1, af);
3003                                 dport = pd->hdr.udp->uh_dport;
3004                                 pd->dport = &pd->hdr.udp->uh_dport;
3005                         }
3006                         rewrite++;
3007                         break;
3008 #ifdef INET
3009                 case IPPROTO_ICMP:
3010                         nk->port[0] = nk->port[1];
3011                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
3012                                 pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
3013                                     nk->addr[pd->sidx].v4.s_addr, 0);
3014
3015                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
3016                                 pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
3017                                     nk->addr[pd->didx].v4.s_addr, 0);
3018
3019                         if (nk->port[1] != pd->hdr.icmp->icmp_id) {
3020                                 pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
3021                                     pd->hdr.icmp->icmp_cksum, sport,
3022                                     nk->port[1], 0);
3023                                 pd->hdr.icmp->icmp_id = nk->port[1];
3024                                 pd->sport = &pd->hdr.icmp->icmp_id;
3025                         }
3026                         m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
3027                         break;
3028 #endif /* INET */
3029 #ifdef INET6
3030                 case IPPROTO_ICMPV6:
3031                         nk->port[0] = nk->port[1];
3032                         if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
3033                                 pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
3034                                     &nk->addr[pd->sidx], 0);
3035
3036                         if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
3037                                 pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
3038                                     &nk->addr[pd->didx], 0);
3039                         rewrite++;
3040                         break;
3041 #endif /* INET6 */
3042                 default:
3043                         switch (af) {
3044 #ifdef INET
3045                         case AF_INET:
3046                                 if (PF_ANEQ(saddr,
3047                                     &nk->addr[pd->sidx], AF_INET))
3048                                         pf_change_a(&saddr->v4.s_addr,
3049                                             pd->ip_sum,
3050                                             nk->addr[pd->sidx].v4.s_addr, 0);
3051
3052                                 if (PF_ANEQ(daddr,
3053                                     &nk->addr[pd->didx], AF_INET))
3054                                         pf_change_a(&daddr->v4.s_addr,
3055                                             pd->ip_sum,
3056                                             nk->addr[pd->didx].v4.s_addr, 0);
3057                                 break;
3058 #endif /* INET */
3059 #ifdef INET6
3060                         case AF_INET6:
3061                                 if (PF_ANEQ(saddr,
3062                                     &nk->addr[pd->sidx], AF_INET6))
3063                                         PF_ACPY(saddr, &nk->addr[pd->sidx], af);
3064
3065                                 if (PF_ANEQ(daddr,
3066                                     &nk->addr[pd->didx], AF_INET6))
3067                                         PF_ACPY(daddr, &nk->addr[pd->didx], af);
3068                                 break;
3069 #endif /* INET6 */
3070                         }
3071                         break;
3072                 }
3073                 if (nr->natpass)
3074                         r = NULL;
3075                 pd->nat_rule = nr;
3076         }
3077
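        /*
         * Walk the filter rules.  The skip steps (r->skip[PF_SKIP_*])
         * let whole runs of rules sharing a non-matching criterion
         * (interface, direction, af, proto, src/dst address or port) be
         * skipped at once; the remaining criteria fall through to
         * TAILQ_NEXT().  The last matching rule wins; a 'quick' rule
         * stops the evaluation immediately.
         */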
3078         while (r != NULL) {
3079                 r->evaluations++;
3080                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3081                         r = r->skip[PF_SKIP_IFP].ptr;
3082                 else if (r->direction && r->direction != direction)
3083                         r = r->skip[PF_SKIP_DIR].ptr;
3084                 else if (r->af && r->af != af)
3085                         r = r->skip[PF_SKIP_AF].ptr;
3086                 else if (r->proto && r->proto != pd->proto)
3087                         r = r->skip[PF_SKIP_PROTO].ptr;
3088                 else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
3089                     r->src.neg, kif, M_GETFIB(m)))
3090                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3091                 /* tcp/udp only. port_op always 0 in other cases */
3092                 else if (r->src.port_op && !pf_match_port(r->src.port_op,
3093                     r->src.port[0], r->src.port[1], sport))
3094                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
3095                 else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
3096                     r->dst.neg, NULL, M_GETFIB(m)))
3097                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3098                 /* tcp/udp only. port_op always 0 in other cases */
3099                 else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
3100                     r->dst.port[0], r->dst.port[1], dport))
3101                         r = r->skip[PF_SKIP_DST_PORT].ptr;
3102                 /* icmp only. type always 0 in other cases */
3103                 else if (r->type && r->type != icmptype + 1)
3104                         r = TAILQ_NEXT(r, entries);
3105                 /* icmp only. code always 0 in other cases */
3106                 else if (r->code && r->code != icmpcode + 1)
3107                         r = TAILQ_NEXT(r, entries);
3108                 else if (r->tos && r->tos != pd->tos)
3109                         r = TAILQ_NEXT(r, entries);
3110                 else if (r->rule_flag & PFRULE_FRAGMENT)
3111                         r = TAILQ_NEXT(r, entries);
3112                 else if (pd->proto == IPPROTO_TCP &&
3113                     (r->flagset & th->th_flags) != r->flags)
3114                         r = TAILQ_NEXT(r, entries);
3115                 /* tcp/udp only. uid.op always 0 in other cases */
3116                 else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
3117                     pf_socket_lookup(direction, pd, m), 1)) &&
3118                     !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
3119                     pd->lookup.uid))
3120                         r = TAILQ_NEXT(r, entries);
3121                 /* tcp/udp only. gid.op always 0 in other cases */
3122                 else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
3123                     pf_socket_lookup(direction, pd, m), 1)) &&
3124                     !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
3125                     pd->lookup.gid))
3126                         r = TAILQ_NEXT(r, entries);
3127                 else if (r->prob &&
3128                     r->prob <= arc4random())
3129                         r = TAILQ_NEXT(r, entries);
3130                 else if (r->match_tag && !pf_match_tag(m, r, &tag,
3131                     pd->pf_mtag ? pd->pf_mtag->tag : 0))
3132                         r = TAILQ_NEXT(r, entries);
3133                 else if (r->os_fingerprint != PF_OSFP_ANY &&
3134                     (pd->proto != IPPROTO_TCP || !pf_osfp_match(
3135                     pf_osfp_fingerprint(pd, m, off, th),
3136                     r->os_fingerprint)))
3137                         r = TAILQ_NEXT(r, entries);
3138                 else {
3139                         if (r->tag)
3140                                 tag = r->tag;
3141                         if (r->rtableid >= 0)
3142                                 rtableid = r->rtableid;
3143                         if (r->anchor == NULL) {
3144                                 match = 1;
3145                                 *rm = r;
3146                                 *am = a;
3147                                 *rsm = ruleset;
3148                                 if ((*rm)->quick)
3149                                         break;
3150                                 r = TAILQ_NEXT(r, entries);
3151                         } else
3152                                 pf_step_into_anchor(&asd, &ruleset,
3153                                     PF_RULESET_FILTER, &r, &a, &match);
3154                 }
3155                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3156                     PF_RULESET_FILTER, &r, &a, &match))
3157                         break;
3158         }
3159         r = *rm;
3160         a = *am;
3161         ruleset = *rsm;
3162
3163         REASON_SET(&reason, PFRES_MATCH);
3164
3165         if (r->log || (nr != NULL && nr->log)) {
3166                 if (rewrite)
3167                         m_copyback(m, off, hdrlen, pd->hdr.any);
3168                 PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
3169                     ruleset, pd, 1);
3170         }
3171
3172         if ((r->action == PF_DROP) &&
3173             ((r->rule_flag & PFRULE_RETURNRST) ||
3174             (r->rule_flag & PFRULE_RETURNICMP) ||
3175             (r->rule_flag & PFRULE_RETURN))) {
3176                 /* undo NAT changes, if they have taken place */
3177                 if (nr != NULL) {
3178                         PF_ACPY(saddr, &sk->addr[pd->sidx], af);
3179                         PF_ACPY(daddr, &sk->addr[pd->didx], af);
3180                         if (pd->sport)
3181                                 *pd->sport = sk->port[pd->sidx];
3182                         if (pd->dport)
3183                                 *pd->dport = sk->port[pd->didx];
3184                         if (pd->proto_sum)
3185                                 *pd->proto_sum = bproto_sum;
3186                         if (pd->ip_sum)
3187                                 *pd->ip_sum = bip_sum;
3188                         m_copyback(m, off, hdrlen, pd->hdr.any);
3189                 }
3190                 if (pd->proto == IPPROTO_TCP &&
3191                     ((r->rule_flag & PFRULE_RETURNRST) ||
3192                     (r->rule_flag & PFRULE_RETURN)) &&
3193                     !(th->th_flags & TH_RST)) {
3194                         u_int32_t        ack = ntohl(th->th_seq) + pd->p_len;
3195                         int              len = 0;
3196 #ifdef INET
3197                         struct ip       *h4;
3198 #endif
3199 #ifdef INET6
3200                         struct ip6_hdr  *h6;
3201 #endif
3202
3203                         switch (af) {
3204 #ifdef INET
3205                         case AF_INET:
3206                                 h4 = mtod(m, struct ip *);
3207                                 len = ntohs(h4->ip_len) - off;
3208                                 break;
3209 #endif
3210 #ifdef INET6
3211                         case AF_INET6:
3212                                 h6 = mtod(m, struct ip6_hdr *);
3213                                 len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
3214                                 break;
3215 #endif
3216                         }
3217
3218                         if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af))
3219                                 REASON_SET(&reason, PFRES_PROTCKSUM);
3220                         else {
3221                                 if (th->th_flags & TH_SYN)
3222                                         ack++;
3223                                 if (th->th_flags & TH_FIN)
3224                                         ack++;
3225                                 pf_send_tcp(m, r, af, pd->dst,
3226                                     pd->src, th->th_dport, th->th_sport,
3227                                     ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
3228                                     r->return_ttl, 1, 0, kif->pfik_ifp);
3229                         }
3230                 } else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
3231                     r->return_icmp)
3232                         pf_send_icmp(m, r->return_icmp >> 8,
3233                             r->return_icmp & 255, af, r);
3234                 else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
3235                     r->return_icmp6)
3236                         pf_send_icmp(m, r->return_icmp6 >> 8,
3237                             r->return_icmp6 & 255, af, r);
3238         }
3239
3240         if (r->action == PF_DROP)
3241                 goto cleanup;
3242
3243         if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3244                 REASON_SET(&reason, PFRES_MEMORY);
3245                 goto cleanup;
3246         }
3247         if (rtableid >= 0)
3248                 M_SETFIB(m, rtableid);
3249
3250         if (!state_icmp && (r->keep_state || nr != NULL ||
3251             (pd->flags & PFDESC_TCP_NORM))) {
3252                 int action;
3253                 action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
3254                     sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
3255                     hdrlen);
3256                 if (action != PF_PASS)
3257                         return (action);
3258         } else {
3259                 if (sk != NULL)
3260                         uma_zfree(V_pf_state_key_z, sk);
3261                 if (nk != NULL)
3262                         uma_zfree(V_pf_state_key_z, nk);
3263         }
3264
3265         /* copy back packet headers if we performed NAT operations */
3266         if (rewrite)
3267                 m_copyback(m, off, hdrlen, pd->hdr.any);
3268
3269         if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
3270             direction == PF_OUT &&
3271             pfsync_defer_ptr != NULL && pfsync_defer_ptr(*sm, m))
3272                 /*
3273                  * We want the state created, but we don't
3274                  * want to send this packet yet, since a partner
3275                  * firewall may first have to learn about the state
3276                  * to allow replies through it.
3277                  */
3278                 return (PF_DEFER);
3279
3280         return (PF_PASS);
3281
3282 cleanup:
3283         if (sk != NULL)
3284                 uma_zfree(V_pf_state_key_z, sk);
3285         if (nk != NULL)
3286                 uma_zfree(V_pf_state_key_z, nk);
3287         return (PF_DROP);
3288 }
3289
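/*
 * Editorial summary: create a state entry for a rule that keeps state.
 * This sets up source-tracking nodes, initializes the per-protocol peer
 * state (including the TCP sequence number modulator for 'modulate
 * state'), inserts the wire/stack state keys and, for 'synproxy state'
 * rules, answers the client's SYN itself before the server is contacted.
 */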
3290 static int
3291 pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
3292     struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
3293     struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
3294     u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
3295     int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
3296 {
3297         struct pf_state         *s = NULL;
3298         struct pf_src_node      *sn = NULL;
3299         struct tcphdr           *th = pd->hdr.tcp;
3300         u_int16_t                mss = V_tcp_mssdflt;
3301         u_short                  reason;
3302
3303         /* check maximums */
3304         if (r->max_states && (r->states_cur >= r->max_states)) {
3305                 V_pf_status.lcounters[LCNT_STATES]++;
3306                 REASON_SET(&reason, PFRES_MAXSTATES);
3307                 return (PF_DROP);
3308         }
3309         /* src node for filter rule */
3310         if ((r->rule_flag & PFRULE_SRCTRACK ||
3311             r->rpool.opts & PF_POOL_STICKYADDR) &&
3312             pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
3313                 REASON_SET(&reason, PFRES_SRCLIMIT);
3314                 goto csfailed;
3315         }
3316         /* src node for translation rule */
3317         if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
3318             pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
3319                 REASON_SET(&reason, PFRES_SRCLIMIT);
3320                 goto csfailed;
3321         }
3322         s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
3323         if (s == NULL) {
3324                 REASON_SET(&reason, PFRES_MEMORY);
3325                 goto csfailed;
3326         }
3327         s->rule.ptr = r;
3328         s->nat_rule.ptr = nr;
3329         s->anchor.ptr = a;
3330         STATE_INC_COUNTERS(s);
3331         if (r->allow_opts)
3332                 s->state_flags |= PFSTATE_ALLOWOPTS;
3333         if (r->rule_flag & PFRULE_STATESLOPPY)
3334                 s->state_flags |= PFSTATE_SLOPPY;
3335         s->log = r->log & PF_LOG_ALL;
3336         s->sync_state = PFSYNC_S_NONE;
3337         if (nr != NULL)
3338                 s->log |= nr->log & PF_LOG_ALL;
3339         switch (pd->proto) {
3340         case IPPROTO_TCP:
3341                 s->src.seqlo = ntohl(th->th_seq);
3342                 s->src.seqhi = s->src.seqlo + pd->p_len + 1;
3343                 if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
3344                     r->keep_state == PF_STATE_MODULATE) {
3345                         /* Generate sequence number modulator */
3346                         if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
3347                             0)
3348                                 s->src.seqdiff = 1;
3349                         pf_change_a(&th->th_seq, &th->th_sum,
3350                             htonl(s->src.seqlo + s->src.seqdiff), 0);
3351                         *rewrite = 1;
3352                 } else
3353                         s->src.seqdiff = 0;
3354                 if (th->th_flags & TH_SYN) {
3355                         s->src.seqhi++;
3356                         s->src.wscale = pf_get_wscale(m, off,
3357                             th->th_off, pd->af);
3358                 }
3359                 s->src.max_win = MAX(ntohs(th->th_win), 1);
3360                 if (s->src.wscale & PF_WSCALE_MASK) {
3361                         /* Remove scale factor from initial window */
3362                         int win = s->src.max_win;
3363                         win += 1 << (s->src.wscale & PF_WSCALE_MASK);
3364                         s->src.max_win = (win - 1) >>
3365                             (s->src.wscale & PF_WSCALE_MASK);
3366                 }
3367                 if (th->th_flags & TH_FIN)
3368                         s->src.seqhi++;
3369                 s->dst.seqhi = 1;
3370                 s->dst.max_win = 1;
3371                 s->src.state = TCPS_SYN_SENT;
3372                 s->dst.state = TCPS_CLOSED;
3373                 s->timeout = PFTM_TCP_FIRST_PACKET;
3374                 break;
3375         case IPPROTO_UDP:
3376                 s->src.state = PFUDPS_SINGLE;
3377                 s->dst.state = PFUDPS_NO_TRAFFIC;
3378                 s->timeout = PFTM_UDP_FIRST_PACKET;
3379                 break;
3380         case IPPROTO_ICMP:
3381 #ifdef INET6
3382         case IPPROTO_ICMPV6:
3383 #endif
3384                 s->timeout = PFTM_ICMP_FIRST_PACKET;
3385                 break;
3386         default:
3387                 s->src.state = PFOTHERS_SINGLE;
3388                 s->dst.state = PFOTHERS_NO_TRAFFIC;
3389                 s->timeout = PFTM_OTHER_FIRST_PACKET;
3390         }
3391
3392         s->creation = time_uptime;
3393         s->expire = time_uptime;
3394
3395         if (sn != NULL) {
3396                 s->src_node = sn;
3397                 s->src_node->states++;
3398         }
3399         if (nsn != NULL) {
3400                 /* XXX We only modify one side for now. */
3401                 PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
3402                 s->nat_src_node = nsn;
3403                 s->nat_src_node->states++;
3404         }
3405         if (pd->proto == IPPROTO_TCP) {
3406                 if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
3407                     off, pd, th, &s->src, &s->dst)) {
3408                         REASON_SET(&reason, PFRES_MEMORY);
3409                         pf_src_tree_remove_state(s);
3410                         STATE_DEC_COUNTERS(s);
3411                         uma_zfree(V_pf_state_z, s);
3412                         return (PF_DROP);
3413                 }
3414                 if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
3415                     pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
3416                     &s->src, &s->dst, rewrite)) {
3417                         /* This really shouldn't happen!!! */
3418                         DPFPRINTF(PF_DEBUG_URGENT,
3419                             ("pf_normalize_tcp_stateful failed on first pkt"));
3420                         pf_normalize_tcp_cleanup(s);
3421                         pf_src_tree_remove_state(s);
3422                         STATE_DEC_COUNTERS(s);
3423                         uma_zfree(V_pf_state_z, s);
3424                         return (PF_DROP);
3425                 }
3426         }
3427         s->direction = pd->dir;
3428
3429         /*
3430          * sk/nk may already have been set up by pf_get_translation().
3431          */
3432         if (nr == NULL) {
3433                 KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
3434                     __func__, nr, sk, nk));
3435                 sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
3436                 if (sk == NULL)
3437                         goto csfailed;
3438                 nk = sk;
3439         } else
3440                 KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
3441                     __func__, nr, sk, nk));
3442
3443         /* Swap sk/nk for PF_OUT. */
3444         if (pf_state_insert(BOUND_IFACE(r, kif),
3445             (pd->dir == PF_IN) ? sk : nk,
3446             (pd->dir == PF_IN) ? nk : sk, s)) {
3447                 if (pd->proto == IPPROTO_TCP)
3448                         pf_normalize_tcp_cleanup(s);
3449                 REASON_SET(&reason, PFRES_STATEINS);
3450                 pf_src_tree_remove_state(s);
3451                 STATE_DEC_COUNTERS(s);
3452                 uma_zfree(V_pf_state_z, s);
3453                 return (PF_DROP);
3454         } else
3455                 *sm = s;
3456
3457         pf_set_rt_ifp(s, pd->src);      /* needs s->state_key set */
3458         if (tag > 0)
3459                 s->tag = tag;
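        /*
         * Synproxy: complete the TCP handshake with the client using a
         * random ISN of our own before the backend sees the connection.
         * Any NAT rewrite is undone first so the SYN|ACK below is built
         * from the original addresses, and the advertised mss is clamped
         * against the routes to both endpoints.
         */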
3460         if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
3461             TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
3462                 s->src.state = PF_TCPS_PROXY_SRC;
3463                 /* undo NAT changes, if they have taken place */
3464                 if (nr != NULL) {
3465                         struct pf_state_key *skt = s->key[PF_SK_WIRE];
3466                         if (pd->dir == PF_OUT)
3467                                 skt = s->key[PF_SK_STACK];
3468                         PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
3469                         PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
3470                         if (pd->sport)
3471                                 *pd->sport = skt->port[pd->sidx];
3472                         if (pd->dport)
3473                                 *pd->dport = skt->port[pd->didx];
3474                         if (pd->proto_sum)
3475                                 *pd->proto_sum = bproto_sum;
3476                         if (pd->ip_sum)
3477                                 *pd->ip_sum = bip_sum;
3478                         m_copyback(m, off, hdrlen, pd->hdr.any);
3479                 }
3480                 s->src.seqhi = htonl(arc4random());
3481                 /* Find mss option */
3482                 int rtid = M_GETFIB(m);
3483                 mss = pf_get_mss(m, off, th->th_off, pd->af);
3484                 mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
3485                 mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
3486                 s->src.mss = mss;
3487                 pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
3488                     th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
3489                     TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
3490                 REASON_SET(&reason, PFRES_SYNPROXY);
3491                 return (PF_SYNPROXY_DROP);
3492         }
3493
3494         return (PF_PASS);
3495
3496 csfailed:
3497         if (sk != NULL)
3498                 uma_zfree(V_pf_state_key_z, sk);
3499         if (nk != NULL)
3500                 uma_zfree(V_pf_state_key_z, nk);
3501
3502         if (sn != NULL && sn->states == 0 && sn->expire == 0) {
3503                 pf_remove_src_node(sn);
3504                 V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
3505                 V_pf_status.src_nodes--;
3506                 uma_zfree(V_pf_sources_z, sn);
3507         }
3508         if (nsn != sn && nsn != NULL && nsn->states == 0 && nsn->expire == 0) {
3509                 pf_remove_src_node(nsn);
3510                 V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS]++;
3511                 V_pf_status.src_nodes--;
3512                 uma_zfree(V_pf_sources_z, nsn);
3513         }
3514         return (PF_DROP);
3515 }
3516
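/*
 * Editorial summary: match non-first fragments against the filter rules.
 * No transport header is available here, so rules relying on ports, TCP
 * flags, ICMP type/code or OS fingerprints are skipped; fragments are
 * only passed or dropped, never given state.
 */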
3517 static int
3518 pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
3519     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
3520     struct pf_ruleset **rsm)
3521 {
3522         struct pf_rule          *r, *a = NULL;
3523         struct pf_ruleset       *ruleset = NULL;
3524         sa_family_t              af = pd->af;
3525         u_short                  reason;
3526         int                      tag = -1;
3527         int                      asd = 0;
3528         int                      match = 0;
3529
3530         PF_RULES_RASSERT();
3531
3532         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
3533         while (r != NULL) {
3534                 r->evaluations++;
3535                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
3536                         r = r->skip[PF_SKIP_IFP].ptr;
3537                 else if (r->direction && r->direction != direction)
3538                         r = r->skip[PF_SKIP_DIR].ptr;
3539                 else if (r->af && r->af != af)
3540                         r = r->skip[PF_SKIP_AF].ptr;
3541                 else if (r->proto && r->proto != pd->proto)
3542                         r = r->skip[PF_SKIP_PROTO].ptr;
3543                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
3544                     r->src.neg, kif, M_GETFIB(m)))
3545                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
3546                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
3547                     r->dst.neg, NULL, M_GETFIB(m)))
3548                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
3549                 else if (r->tos && r->tos != pd->tos)
3550                         r = TAILQ_NEXT(r, entries);
3551                 else if (r->os_fingerprint != PF_OSFP_ANY)
3552                         r = TAILQ_NEXT(r, entries);
3553                 else if (pd->proto == IPPROTO_UDP &&
3554                     (r->src.port_op || r->dst.port_op))
3555                         r = TAILQ_NEXT(r, entries);
3556                 else if (pd->proto == IPPROTO_TCP &&
3557                     (r->src.port_op || r->dst.port_op || r->flagset))
3558                         r = TAILQ_NEXT(r, entries);
3559                 else if ((pd->proto == IPPROTO_ICMP ||
3560                     pd->proto == IPPROTO_ICMPV6) &&
3561                     (r->type || r->code))
3562                         r = TAILQ_NEXT(r, entries);
3563                 else if (r->prob && r->prob <=
3564                     (arc4random() % (UINT_MAX - 1) + 1))
3565                         r = TAILQ_NEXT(r, entries);
3566                 else if (r->match_tag && !pf_match_tag(m, r, &tag,
3567                     pd->pf_mtag ? pd->pf_mtag->tag : 0))
3568                         r = TAILQ_NEXT(r, entries);
3569                 else {
3570                         if (r->anchor == NULL) {
3571                                 match = 1;
3572                                 *rm = r;
3573                                 *am = a;
3574                                 *rsm = ruleset;
3575                                 if ((*rm)->quick)
3576                                         break;
3577                                 r = TAILQ_NEXT(r, entries);
3578                         } else
3579                                 pf_step_into_anchor(&asd, &ruleset,
3580                                     PF_RULESET_FILTER, &r, &a, &match);
3581                 }
3582                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
3583                     PF_RULESET_FILTER, &r, &a, &match))
3584                         break;
3585         }
3586         r = *rm;
3587         a = *am;
3588         ruleset = *rsm;
3589
3590         REASON_SET(&reason, PFRES_MATCH);
3591
3592         if (r->log)
3593                 PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
3594                     1);
3595
3596         if (r->action != PF_PASS)
3597                 return (PF_DROP);
3598
3599         if (tag > 0 && pf_tag_packet(m, pd, tag)) {
3600                 REASON_SET(&reason, PFRES_MEMORY);
3601                 return (PF_DROP);
3602         }
3603
3604         return (PF_PASS);
3605 }
3606
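/*
 * Editorial summary: full TCP state tracking, enforcing the
 * sliding-window checks from the van Rooij paper referenced below.
 * Sequence and ACK numbers must fall within the windows advertised by
 * the two peers (with slack for retransmissions and old ACKs), sequence
 * modulation is applied where configured, a looser match is accepted for
 * connections picked up mid-stream, and the connection's state and
 * timeout are advanced from the observed flags.
 */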
3607 static int
3608 pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
3609         struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
3610         struct pf_pdesc *pd, u_short *reason, int *copyback)
3611 {
3612         struct tcphdr           *th = pd->hdr.tcp;
3613         u_int16_t                win = ntohs(th->th_win);
3614         u_int32_t                ack, end, seq, orig_seq;
3615         u_int8_t                 sws, dws;
3616         int                      ackskew;
3617
3618         if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
3619                 sws = src->wscale & PF_WSCALE_MASK;
3620                 dws = dst->wscale & PF_WSCALE_MASK;
3621         } else
3622                 sws = dws = 0;
3623
3624         /*
3625          * Sequence tracking algorithm from Guido van Rooij's paper:
3626          *   http://www.madison-gurkha.com/publications/tcp_filtering/
3627          *      tcp_filtering.ps
3628          */
3629
3630         orig_seq = seq = ntohl(th->th_seq);
3631         if (src->seqlo == 0) {
3632                 /* First packet from this end. Set its state */
3633
3634                 if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
3635                     src->scrub == NULL) {
3636                         if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
3637                                 REASON_SET(reason, PFRES_MEMORY);
3638                                 return (PF_DROP);
3639                         }
3640                 }
3641
3642                 /* Deferred generation of sequence number modulator */
3643                 if (dst->seqdiff && !src->seqdiff) {
3644                         /* use random iss for the TCP server */
3645                         while ((src->seqdiff = arc4random() - seq) == 0)
3646                                 ;
3647                         ack = ntohl(th->th_ack) - dst->seqdiff;
3648                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3649                             src->seqdiff), 0);
3650                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3651                         *copyback = 1;
3652                 } else {
3653                         ack = ntohl(th->th_ack);
3654                 }
3655
3656                 end = seq + pd->p_len;
3657                 if (th->th_flags & TH_SYN) {
3658                         end++;
3659                         if (dst->wscale & PF_WSCALE_FLAG) {
3660                                 src->wscale = pf_get_wscale(m, off, th->th_off,
3661                                     pd->af);
3662                                 if (src->wscale & PF_WSCALE_FLAG) {
3663                                         /* Remove scale factor from initial
3664                                          * window */
3665                                         sws = src->wscale & PF_WSCALE_MASK;
3666                                         win = ((u_int32_t)win + (1 << sws) - 1)
3667                                             >> sws;
3668                                         dws = dst->wscale & PF_WSCALE_MASK;
3669                                 } else {
3670                                         /* fixup other window */
3671                                         dst->max_win <<= dst->wscale &
3672                                             PF_WSCALE_MASK;
3673                                         /* in case of a retrans SYN|ACK */
3674                                         dst->wscale = 0;
3675                                 }
3676                         }
3677                 }
3678                 if (th->th_flags & TH_FIN)
3679                         end++;
3680
3681                 src->seqlo = seq;
3682                 if (src->state < TCPS_SYN_SENT)
3683                         src->state = TCPS_SYN_SENT;
3684
3685                 /*
3686                  * May need to slide the window (seqhi may have been set by
3687                  * the crappy stack check or if we picked up the connection
3688                  * after establishment)
3689                  */
3690                 if (src->seqhi == 1 ||
3691                     SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
3692                         src->seqhi = end + MAX(1, dst->max_win << dws);
3693                 if (win > src->max_win)
3694                         src->max_win = win;
3695
3696         } else {
3697                 ack = ntohl(th->th_ack) - dst->seqdiff;
3698                 if (src->seqdiff) {
3699                         /* Modulate sequence numbers */
3700                         pf_change_a(&th->th_seq, &th->th_sum, htonl(seq +
3701                             src->seqdiff), 0);
3702                         pf_change_a(&th->th_ack, &th->th_sum, htonl(ack), 0);
3703                         *copyback = 1;
3704                 }
3705                 end = seq + pd->p_len;
3706                 if (th->th_flags & TH_SYN)
3707                         end++;
3708                 if (th->th_flags & TH_FIN)
3709                         end++;
3710         }
3711
3712         if ((th->th_flags & TH_ACK) == 0) {
3713                 /* Let it pass through the ack skew check */
3714                 ack = dst->seqlo;
3715         } else if ((ack == 0 &&
3716             (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
3717             /* broken tcp stacks do not set ack */
3718             (dst->state < TCPS_SYN_SENT)) {
3719                 /*
3720                  * Many stacks (ours included) will set the ACK number in an
3721                  * FIN|ACK if the SYN times out -- no sequence to ACK.
3722                  */
3723                 ack = dst->seqlo;
3724         }
3725
3726         if (seq == end) {
3727                 /* Ease sequencing restrictions on packets with no data */
3728                 seq = src->seqlo;
3729                 end = seq;
3730         }
3731
3732         ackskew = dst->seqlo - ack;
3733
3734
3735         /*
3736          * Need to demodulate the sequence numbers in any TCP SACK options
3737          * (Selective ACK). We could optionally validate the SACK values
3738          * against the current ACK window, either forwards or backwards, but
3739          * I'm not confident that SACK has been implemented properly
3740          * everywhere. It wouldn't surprise me if several stacks accidently
3741          * everywhere. It wouldn't surprise me if several stacks accidentally
3742          * any security implications of bad SACKing unless the target stack
3743          * doesn't validate the option length correctly. Someone trying to
3744          * spoof into a TCP connection won't bother blindly sending SACK
3745          * options anyway.
3746          */
3747         if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
3748                 if (pf_modulate_sack(m, off, pd, th, dst))
3749                         *copyback = 1;
3750         }
3751
3752
3753 #define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
3754         if (SEQ_GEQ(src->seqhi, end) &&
3755             /* Last octet inside other's window space */
3756             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
3757             /* Retrans: not more than one window back */
3758             (ackskew >= -MAXACKWINDOW) &&
3759             /* Acking not more than one reassembled fragment backwards */
3760             (ackskew <= (MAXACKWINDOW << sws)) &&
3761             /* Acking not more than one window forward */
3762             ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
3763             (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
3764             (pd->flags & PFDESC_IP_REAS) == 0)) {
3765             /* Require an exact/+1 sequence match on resets when possible */
3766
3767                 if (dst->scrub || src->scrub) {
3768                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3769                             *state, src, dst, copyback))
3770                                 return (PF_DROP);
3771                 }
3772
3773                 /* update max window */
3774                 if (src->max_win < win)
3775                         src->max_win = win;
3776                 /* synchronize sequencing */
3777                 if (SEQ_GT(end, src->seqlo))
3778                         src->seqlo = end;
3779                 /* slide the window of what the other end can send */
3780                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3781                         dst->seqhi = ack + MAX((win << sws), 1);
3782
3783
3784                 /* update states */
3785                 if (th->th_flags & TH_SYN)
3786                         if (src->state < TCPS_SYN_SENT)
3787                                 src->state = TCPS_SYN_SENT;
3788                 if (th->th_flags & TH_FIN)
3789                         if (src->state < TCPS_CLOSING)
3790                                 src->state = TCPS_CLOSING;
3791                 if (th->th_flags & TH_ACK) {
3792                         if (dst->state == TCPS_SYN_SENT) {
3793                                 dst->state = TCPS_ESTABLISHED;
3794                                 if (src->state == TCPS_ESTABLISHED &&
3795                                     (*state)->src_node != NULL &&
3796                                     pf_src_connlimit(state)) {
3797                                         REASON_SET(reason, PFRES_SRCLIMIT);
3798                                         return (PF_DROP);
3799                                 }
3800                         } else if (dst->state == TCPS_CLOSING)
3801                                 dst->state = TCPS_FIN_WAIT_2;
3802                 }
3803                 if (th->th_flags & TH_RST)
3804                         src->state = dst->state = TCPS_TIME_WAIT;
3805
3806                 /* update expire time */
3807                 (*state)->expire = time_uptime;
3808                 if (src->state >= TCPS_FIN_WAIT_2 &&
3809                     dst->state >= TCPS_FIN_WAIT_2)
3810                         (*state)->timeout = PFTM_TCP_CLOSED;
3811                 else if (src->state >= TCPS_CLOSING &&
3812                     dst->state >= TCPS_CLOSING)
3813                         (*state)->timeout = PFTM_TCP_FIN_WAIT;
3814                 else if (src->state < TCPS_ESTABLISHED ||
3815                     dst->state < TCPS_ESTABLISHED)
3816                         (*state)->timeout = PFTM_TCP_OPENING;
3817                 else if (src->state >= TCPS_CLOSING ||
3818                     dst->state >= TCPS_CLOSING)
3819                         (*state)->timeout = PFTM_TCP_CLOSING;
3820                 else
3821                         (*state)->timeout = PFTM_TCP_ESTABLISHED;
3822
3823                 /* Fall through to PASS packet */
3824
3825         } else if ((dst->state < TCPS_SYN_SENT ||
3826                 dst->state >= TCPS_FIN_WAIT_2 ||
3827                 src->state >= TCPS_FIN_WAIT_2) &&
3828             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
3829             /* Within a window forward of the originating packet */
3830             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
3831             /* Within a window backward of the originating packet */
3832
3833                 /*
3834                  * This currently handles three situations:
3835                  *  1) Stupid stacks will shotgun SYNs before their peer
3836                  *     replies.
3837                  *  2) When PF catches an already established stream (the
3838                  *     firewall rebooted, the state table was flushed, routes
3839                  *     changed...)
3840                  *  3) Packets get funky immediately after the connection
3841                  *     closes (this should catch Solaris spurious ACK|FINs
3842                  *     that web servers like to spew after a close)
3843                  *
3844                  * This must be a little more careful than the above code
3845                  * since packet floods will also be caught here. We don't
3846                  * update the TTL here to mitigate the damage of a packet
3847                  * flood and so the same code can handle awkward establishment
3848                  * and a loosened connection close.
3849                  * In the establishment case, a correct peer response will
3850                  * validate the connection, go through the normal state code
3851                  * and keep updating the state TTL.
3852                  */
3853
3854                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
3855                         printf("pf: loose state match: ");
3856                         pf_print_state(*state);
3857                         pf_print_flags(th->th_flags);
3858                         printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3859                             "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
3860                             pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
3861                             (unsigned long long)(*state)->packets[1],
3862                             pd->dir == PF_IN ? "in" : "out",
3863                             pd->dir == (*state)->direction ? "fwd" : "rev");
3864                 }
3865
3866                 if (dst->scrub || src->scrub) {
3867                         if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
3868                             *state, src, dst, copyback))
3869                                 return (PF_DROP);
3870                 }
3871
3872                 /* update max window */
3873                 if (src->max_win < win)
3874                         src->max_win = win;
3875                 /* synchronize sequencing */
3876                 if (SEQ_GT(end, src->seqlo))
3877                         src->seqlo = end;
3878                 /* slide the window of what the other end can send */
3879                 if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
3880                         dst->seqhi = ack + MAX((win << sws), 1);
3881
3882                 /*
3883                  * Cannot set dst->seqhi here since this could be a shotgunned
3884                  * SYN and not an already established connection.
3885                  */
3886
3887                 if (th->th_flags & TH_FIN)
3888                         if (src->state < TCPS_CLOSING)
3889                                 src->state = TCPS_CLOSING;
3890                 if (th->th_flags & TH_RST)
3891                         src->state = dst->state = TCPS_TIME_WAIT;
3892
3893                 /* Fall through to PASS packet */
3894
3895         } else {
3896                 if ((*state)->dst.state == TCPS_SYN_SENT &&
3897                     (*state)->src.state == TCPS_SYN_SENT) {
3898                         /* Send RST for state mismatches during handshake */
3899                         if (!(th->th_flags & TH_RST))
3900                                 pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
3901                                     pd->dst, pd->src, th->th_dport,
3902                                     th->th_sport, ntohl(th->th_ack), 0,
3903                                     TH_RST, 0, 0,
3904                                     (*state)->rule.ptr->return_ttl, 1, 0,
3905                                     kif->pfik_ifp);
3906                         src->seqlo = 0;
3907                         src->seqhi = 1;
3908                         src->max_win = 1;
3909                 } else if (V_pf_status.debug >= PF_DEBUG_MISC) {
3910                         printf("pf: BAD state: ");
3911                         pf_print_state(*state);
3912                         pf_print_flags(th->th_flags);
3913                         printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
3914                             "pkts=%llu:%llu dir=%s,%s\n",
3915                             seq, orig_seq, ack, pd->p_len, ackskew,
3916                             (unsigned long long)(*state)->packets[0],
3917                             (unsigned long long)(*state)->packets[1],
3918                             pd->dir == PF_IN ? "in" : "out",
3919                             pd->dir == (*state)->direction ? "fwd" : "rev");
3920                         printf("pf: State failure on: %c %c %c %c | %c %c\n",
3921                             SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
3922                             SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
3923                             ' ': '2',
3924                             (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
3925                             (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
3926                             SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
3927                             SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
3928                 }
3929                 REASON_SET(reason, PFRES_BADSTATE);
3930                 return (PF_DROP);
3931         }
3932
3933         return (PF_PASS);
3934 }
3935
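/*
 * Editorial summary: sloppy TCP tracking (the 'sloppy' state option).
 * Only the TCP flags are used to move the peers through the state
 * machine and pick timeouts; no sequence window checks are done, which
 * tolerates asymmetric paths where only one half of the connection is
 * seen.
 */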
3936 static int
3937 pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
3938         struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
3939 {
3940         struct tcphdr           *th = pd->hdr.tcp;
3941
3942         if (th->th_flags & TH_SYN)
3943                 if (src->state < TCPS_SYN_SENT)
3944                         src->state = TCPS_SYN_SENT;
3945         if (th->th_flags & TH_FIN)
3946                 if (src->state < TCPS_CLOSING)
3947                         src->state = TCPS_CLOSING;
3948         if (th->th_flags & TH_ACK) {
3949                 if (dst->state == TCPS_SYN_SENT) {
3950                         dst->state = TCPS_ESTABLISHED;
3951                         if (src->state == TCPS_ESTABLISHED &&
3952                             (*state)->src_node != NULL &&
3953                             pf_src_connlimit(state)) {
3954                                 REASON_SET(reason, PFRES_SRCLIMIT);
3955                                 return (PF_DROP);
3956                         }
3957                 } else if (dst->state == TCPS_CLOSING) {
3958                         dst->state = TCPS_FIN_WAIT_2;
3959                 } else if (src->state == TCPS_SYN_SENT &&
3960                     dst->state < TCPS_SYN_SENT) {
3961                         /*
3962                          * Handle a special sloppy case where we only see one
3963                          * half of the connection. If there is an ACK after
3964                          * the initial SYN without ever seeing a packet from
3965                          * the destination, set the connection to established.
3966                          */
3967                         dst->state = src->state = TCPS_ESTABLISHED;
3968                         if ((*state)->src_node != NULL &&
3969                             pf_src_connlimit(state)) {
3970                                 REASON_SET(reason, PFRES_SRCLIMIT);
3971                                 return (PF_DROP);
3972                         }
3973                 } else if (src->state == TCPS_CLOSING &&
3974                     dst->state == TCPS_ESTABLISHED &&
3975                     dst->seqlo == 0) {
3976                         /*
3977                          * Handle the closing of half connections where we
3978                          * don't see the full bidirectional FIN/ACK+ACK
3979                          * handshake.
3980                          */
3981                         dst->state = TCPS_CLOSING;
3982                 }
3983         }
3984         if (th->th_flags & TH_RST)
3985                 src->state = dst->state = TCPS_TIME_WAIT;
3986
3987         /* update expire time */
3988         (*state)->expire = time_uptime;
3989         if (src->state >= TCPS_FIN_WAIT_2 &&
3990             dst->state >= TCPS_FIN_WAIT_2)
3991                 (*state)->timeout = PFTM_TCP_CLOSED;
3992         else if (src->state >= TCPS_CLOSING &&
3993             dst->state >= TCPS_CLOSING)
3994                 (*state)->timeout = PFTM_TCP_FIN_WAIT;
3995         else if (src->state < TCPS_ESTABLISHED ||
3996             dst->state < TCPS_ESTABLISHED)
3997                 (*state)->timeout = PFTM_TCP_OPENING;
3998         else if (src->state >= TCPS_CLOSING ||
3999             dst->state >= TCPS_CLOSING)
4000                 (*state)->timeout = PFTM_TCP_CLOSING;
4001         else
4002                 (*state)->timeout = PFTM_TCP_ESTABLISHED;
4003
4004         return (PF_PASS);
4005 }
4006
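/*
 * Editorial summary: look up and update TCP state for a packet.  The key
 * is built straight for wire-side (inbound) packets and reversed for
 * stack-side ones; synproxy handshakes in progress are completed here,
 * after which the packet is run through sloppy or full tracking
 * depending on whether PFSTATE_SLOPPY is set on the state.
 */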
4007 static int
4008 pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
4009     struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
4010     u_short *reason)
4011 {
4012         struct pf_state_key_cmp  key;
4013         struct tcphdr           *th = pd->hdr.tcp;
4014         int                      copyback = 0;
4015         struct pf_state_peer    *src, *dst;
4016         struct pf_state_key     *sk;
4017
4018         bzero(&key, sizeof(key));
4019         key.af = pd->af;
4020         key.proto = IPPROTO_TCP;
4021         if (direction == PF_IN) {       /* wire side, straight */
4022                 PF_ACPY(&key.addr[0], pd->src, key.af);
4023                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4024                 key.port[0] = th->th_sport;
4025                 key.port[1] = th->th_dport;
4026         } else {                        /* stack side, reverse */
4027                 PF_ACPY(&key.addr[1], pd->src, key.af);
4028                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4029                 key.port[1] = th->th_sport;
4030                 key.port[0] = th->th_dport;
4031         }
4032
4033         STATE_LOOKUP(kif, &key, direction, *state, pd);
4034
4035         if (direction == (*state)->direction) {
4036                 src = &(*state)->src;
4037                 dst = &(*state)->dst;
4038         } else {
4039                 src = &(*state)->dst;
4040                 dst = &(*state)->src;
4041         }
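        /*
         * src/dst above are relative to the direction the state was
         * created in: a packet travelling against the state's direction
         * has the peer roles swapped.
         */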
4042
4043         sk = (*state)->key[pd->didx];
4044
4045         if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
4046                 if (direction != (*state)->direction) {
4047                         REASON_SET(reason, PFRES_SYNPROXY);
4048                         return (PF_SYNPROXY_DROP);
4049                 }
4050                 if (th->th_flags & TH_SYN) {
4051                         if (ntohl(th->th_seq) != (*state)->src.seqlo) {
4052                                 REASON_SET(reason, PFRES_SYNPROXY);
4053                                 return (PF_DROP);
4054                         }
4055                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4056                             pd->src, th->th_dport, th->th_sport,
4057                             (*state)->src.seqhi, ntohl(th->th_seq) + 1,
4058                             TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
4059                         REASON_SET(reason, PFRES_SYNPROXY);
4060                         return (PF_SYNPROXY_DROP);
4061                 } else if (!(th->th_flags & TH_ACK) ||
4062                     (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4063                     (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4064                         REASON_SET(reason, PFRES_SYNPROXY);
4065                         return (PF_DROP);
4066                 } else if ((*state)->src_node != NULL &&
4067                     pf_src_connlimit(state)) {
4068                         REASON_SET(reason, PFRES_SRCLIMIT);
4069                         return (PF_DROP);
4070                 } else
4071                         (*state)->src.state = PF_TCPS_PROXY_DST;
4072         }
4073         if ((*state)->src.state == PF_TCPS_PROXY_DST) {
4074                 if (direction == (*state)->direction) {
4075                         if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
4076                             (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
4077                             (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
4078                                 REASON_SET(reason, PFRES_SYNPROXY);
4079                                 return (PF_DROP);
4080                         }
4081                         (*state)->src.max_win = MAX(ntohs(th->th_win), 1);
4082                         if ((*state)->dst.seqhi == 1)
4083                                 (*state)->dst.seqhi = htonl(arc4random());
4084                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4085                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4086                             sk->port[pd->sidx], sk->port[pd->didx],
4087                             (*state)->dst.seqhi, 0, TH_SYN, 0,
4088                             (*state)->src.mss, 0, 0, (*state)->tag, NULL);
4089                         REASON_SET(reason, PFRES_SYNPROXY);
4090                         return (PF_SYNPROXY_DROP);
4091                 } else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
4092                     (TH_SYN|TH_ACK)) ||
4093                     (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
4094                         REASON_SET(reason, PFRES_SYNPROXY);
4095                         return (PF_DROP);
4096                 } else {
4097                         (*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
4098                         (*state)->dst.seqlo = ntohl(th->th_seq);
4099                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
4100                             pd->src, th->th_dport, th->th_sport,
4101                             ntohl(th->th_ack), ntohl(th->th_seq) + 1,
4102                             TH_ACK, (*state)->src.max_win, 0, 0, 0,
4103                             (*state)->tag, NULL);
4104                         pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
4105                             &sk->addr[pd->sidx], &sk->addr[pd->didx],
4106                             sk->port[pd->sidx], sk->port[pd->didx],
4107                             (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
4108                             TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
4109                         (*state)->src.seqdiff = (*state)->dst.seqhi -
4110                             (*state)->src.seqlo;
4111                         (*state)->dst.seqdiff = (*state)->src.seqhi -
4112                             (*state)->dst.seqlo;
4113                         (*state)->src.seqhi = (*state)->src.seqlo +
4114                             (*state)->dst.max_win;
4115                         (*state)->dst.seqhi = (*state)->dst.seqlo +
4116                             (*state)->src.max_win;
4117                         (*state)->src.wscale = (*state)->dst.wscale = 0;
4118                         (*state)->src.state = (*state)->dst.state =
4119                             TCPS_ESTABLISHED;
4120                         REASON_SET(reason, PFRES_SYNPROXY);
4121                         return (PF_SYNPROXY_DROP);
4122                 }
4123         }
4124
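             /*
              * A new SYN on a connection both peers have already closed down
              * means the old state is stale: unlink it and drop this packet,
              * letting the retransmitted SYN create a fresh state.
              */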
4125         if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
4126             dst->state >= TCPS_FIN_WAIT_2 &&
4127             src->state >= TCPS_FIN_WAIT_2) {
4128                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4129                         printf("pf: state reuse ");
4130                         pf_print_state(*state);
4131                         pf_print_flags(th->th_flags);
4132                         printf("\n");
4133                 }
4134                 /* XXX make sure it's the same direction ?? */
4135                 (*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
4136                 pf_unlink_state(*state, PF_ENTER_LOCKED);
4137                 *state = NULL;
4138                 return (PF_DROP);
4139         }
4140
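             /*
              * Sloppy states skip the sequence window checks; full tracking
              * validates the windows and may modify the header (copyback)
              * for sequence number modulation.
              */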
4141         if ((*state)->state_flags & PFSTATE_SLOPPY) {
4142                 if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
4143                         return (PF_DROP);
4144         } else {
4145                 if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
4146                     &copyback) == PF_DROP)
4147                         return (PF_DROP);
4148         }
4149
4150         /* translate source/destination address, if necessary */
4151         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4152                 struct pf_state_key *nk = (*state)->key[pd->didx];
4153
4154                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4155                     nk->port[pd->sidx] != th->th_sport)
4156                         pf_change_ap(pd->src, &th->th_sport, pd->ip_sum,
4157                             &th->th_sum, &nk->addr[pd->sidx],
4158                             nk->port[pd->sidx], 0, pd->af);
4159
4160                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4161                     nk->port[pd->didx] != th->th_dport)
4162                         pf_change_ap(pd->dst, &th->th_dport, pd->ip_sum,
4163                             &th->th_sum, &nk->addr[pd->didx],
4164                             nk->port[pd->didx], 0, pd->af);
4165                 copyback = 1;
4166         }
4167
4168         /* Copyback sequence modulation or stateful scrub changes if needed */
4169         if (copyback)
4170                 m_copyback(m, off, sizeof(*th), (caddr_t)th);
4171
4172         return (PF_PASS);
4173 }
4174
4175 static int
4176 pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
4177     struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
4178 {
4179         struct pf_state_peer    *src, *dst;
4180         struct pf_state_key_cmp  key;
4181         struct udphdr           *uh = pd->hdr.udp;
4182
4183         bzero(&key, sizeof(key));
4184         key.af = pd->af;
4185         key.proto = IPPROTO_UDP;
4186         if (direction == PF_IN) {       /* wire side, straight */
4187                 PF_ACPY(&key.addr[0], pd->src, key.af);
4188                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4189                 key.port[0] = uh->uh_sport;
4190                 key.port[1] = uh->uh_dport;
4191         } else {                        /* stack side, reverse */
4192                 PF_ACPY(&key.addr[1], pd->src, key.af);
4193                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4194                 key.port[1] = uh->uh_sport;
4195                 key.port[0] = uh->uh_dport;
4196         }
4197
4198         STATE_LOOKUP(kif, &key, direction, *state, pd);
4199
4200         if (direction == (*state)->direction) {
4201                 src = &(*state)->src;
4202                 dst = &(*state)->dst;
4203         } else {
4204                 src = &(*state)->dst;
4205                 dst = &(*state)->src;
4206         }
4207
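             /*
              * UDP pseudo-states: the first packet from a peer marks it
              * SINGLE; a reply promotes the original sender to MULTIPLE.
              * Once both peers are MULTIPLE the longer idle timeout below
              * applies.
              */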
4208         /* update states */
4209         if (src->state < PFUDPS_SINGLE)
4210                 src->state = PFUDPS_SINGLE;
4211         if (dst->state == PFUDPS_SINGLE)
4212                 dst->state = PFUDPS_MULTIPLE;
4213
4214         /* update expire time */
4215         (*state)->expire = time_uptime;
4216         if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
4217                 (*state)->timeout = PFTM_UDP_MULTIPLE;
4218         else
4219                 (*state)->timeout = PFTM_UDP_SINGLE;
4220
4221         /* translate source/destination address, if necessary */
4222         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4223                 struct pf_state_key *nk = (*state)->key[pd->didx];
4224
4225                 if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
4226                     nk->port[pd->sidx] != uh->uh_sport)
4227                         pf_change_ap(pd->src, &uh->uh_sport, pd->ip_sum,
4228                             &uh->uh_sum, &nk->addr[pd->sidx],
4229                             nk->port[pd->sidx], 1, pd->af);
4230
4231                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
4232                     nk->port[pd->didx] != uh->uh_dport)
4233                         pf_change_ap(pd->dst, &uh->uh_dport, pd->ip_sum,
4234                             &uh->uh_sum, &nk->addr[pd->didx],
4235                             nk->port[pd->didx], 1, pd->af);
4236                 m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
4237         }
4238
4239         return (PF_PASS);
4240 }
4241
4242 static int
4243 pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
4244     struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
4245 {
4246         struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
4247         u_int16_t        icmpid = 0, *icmpsum;
4248         u_int8_t         icmptype;
4249         int              state_icmp = 0;
4250         struct pf_state_key_cmp key;
4251
4252         bzero(&key, sizeof(key));
4253         switch (pd->proto) {
4254 #ifdef INET
4255         case IPPROTO_ICMP:
4256                 icmptype = pd->hdr.icmp->icmp_type;
4257                 icmpid = pd->hdr.icmp->icmp_id;
4258                 icmpsum = &pd->hdr.icmp->icmp_cksum;
4259
4260                 if (icmptype == ICMP_UNREACH ||
4261                     icmptype == ICMP_SOURCEQUENCH ||
4262                     icmptype == ICMP_REDIRECT ||
4263                     icmptype == ICMP_TIMXCEED ||
4264                     icmptype == ICMP_PARAMPROB)
4265                         state_icmp++;
4266                 break;
4267 #endif /* INET */
4268 #ifdef INET6
4269         case IPPROTO_ICMPV6:
4270                 icmptype = pd->hdr.icmp6->icmp6_type;
4271                 icmpid = pd->hdr.icmp6->icmp6_id;
4272                 icmpsum = &pd->hdr.icmp6->icmp6_cksum;
4273
4274                 if (icmptype == ICMP6_DST_UNREACH ||
4275                     icmptype == ICMP6_PACKET_TOO_BIG ||
4276                     icmptype == ICMP6_TIME_EXCEEDED ||
4277                     icmptype == ICMP6_PARAM_PROB)
4278                         state_icmp++;
4279                 break;
4280 #endif /* INET6 */
4281         }
4282
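             /*
              * state_icmp is set for ICMP error messages.  Queries and
              * replies are matched directly against an ICMP state keyed by
              * the id; errors are matched against the state of the packet
              * quoted inside them, handled in the else branch below.
              */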
4283         if (!state_icmp) {
4284
4285                 /*
4286                  * ICMP query/reply message not related to a TCP/UDP packet.
4287                  * Search for an ICMP state.
4288                  */
4289                 key.af = pd->af;
4290                 key.proto = pd->proto;
4291                 key.port[0] = key.port[1] = icmpid;
4292                 if (direction == PF_IN) {       /* wire side, straight */
4293                         PF_ACPY(&key.addr[0], pd->src, key.af);
4294                         PF_ACPY(&key.addr[1], pd->dst, key.af);
4295                 } else {                        /* stack side, reverse */
4296                         PF_ACPY(&key.addr[1], pd->src, key.af);
4297                         PF_ACPY(&key.addr[0], pd->dst, key.af);
4298                 }
4299
4300                 STATE_LOOKUP(kif, &key, direction, *state, pd);
4301
4302                 (*state)->expire = time_uptime;
4303                 (*state)->timeout = PFTM_ICMP_ERROR_REPLY;
4304
4305                 /* translate source/destination address, if necessary */
4306                 if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4307                         struct pf_state_key *nk = (*state)->key[pd->didx];
4308
4309                         switch (pd->af) {
4310 #ifdef INET
4311                         case AF_INET:
4312                                 if (PF_ANEQ(pd->src,
4313                                     &nk->addr[pd->sidx], AF_INET))
4314                                         pf_change_a(&saddr->v4.s_addr,
4315                                             pd->ip_sum,
4316                                             nk->addr[pd->sidx].v4.s_addr, 0);
4317
4318                                 if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
4319                                     AF_INET))
4320                                         pf_change_a(&daddr->v4.s_addr,
4321                                             pd->ip_sum,
4322                                             nk->addr[pd->didx].v4.s_addr, 0);
4323
4324                                 if (nk->port[0] !=
4325                                     pd->hdr.icmp->icmp_id) {
4326                                         pd->hdr.icmp->icmp_cksum =
4327                                             pf_cksum_fixup(
4328                                             pd->hdr.icmp->icmp_cksum, icmpid,
4329                                             nk->port[pd->sidx], 0);
4330                                         pd->hdr.icmp->icmp_id =
4331                                             nk->port[pd->sidx];
4332                                 }
4333
4334                                 m_copyback(m, off, ICMP_MINLEN,
4335                                     (caddr_t )pd->hdr.icmp);
4336                                 break;
4337 #endif /* INET */
4338 #ifdef INET6
4339                         case AF_INET6:
4340                                 if (PF_ANEQ(pd->src,
4341                                     &nk->addr[pd->sidx], AF_INET6))
4342                                         pf_change_a6(saddr,
4343                                             &pd->hdr.icmp6->icmp6_cksum,
4344                                             &nk->addr[pd->sidx], 0);
4345
4346                                 if (PF_ANEQ(pd->dst,
4347                                     &nk->addr[pd->didx], AF_INET6))
4348                                         pf_change_a6(daddr,
4349                                             &pd->hdr.icmp6->icmp6_cksum,
4350                                             &nk->addr[pd->didx], 0);
4351
4352                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
4353                                     (caddr_t )pd->hdr.icmp6);
4354                                 break;
4355 #endif /* INET6 */
4356                         }
4357                 }
4358                 return (PF_PASS);
4359
4360         } else {
4361                 /*
4362                  * ICMP error message in response to a TCP/UDP packet.
4363                  * Extract the inner TCP/UDP header and search for that state.
4364                  */
4365
4366                 struct pf_pdesc pd2;
4367                 bzero(&pd2, sizeof pd2);
4368 #ifdef INET
4369                 struct ip       h2;
4370 #endif /* INET */
4371 #ifdef INET6
4372                 struct ip6_hdr  h2_6;
4373                 int             terminal = 0;
4374 #endif /* INET6 */
4375                 int             ipoff2 = 0;
4376                 int             off2 = 0;
4377
4378                 pd2.af = pd->af;
4379                 /* Payload packet is from the opposite direction. */
4380                 pd2.sidx = (direction == PF_IN) ? 1 : 0;
4381                 pd2.didx = (direction == PF_IN) ? 0 : 1;
4382                 switch (pd->af) {
4383 #ifdef INET
4384                 case AF_INET:
4385                         /* offset of h2 in mbuf chain */
4386                         ipoff2 = off + ICMP_MINLEN;
4387
4388                         if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
4389                             NULL, reason, pd2.af)) {
4390                                 DPFPRINTF(PF_DEBUG_MISC,
4391                                     ("pf: ICMP error message too short "
4392                                     "(ip)\n"));
4393                                 return (PF_DROP);
4394                         }
4395                         /*
4396                          * ICMP error messages don't refer to non-first
4397                          * fragments
4398                          */
4399                         if (h2.ip_off & htons(IP_OFFMASK)) {
4400                                 REASON_SET(reason, PFRES_FRAG);
4401                                 return (PF_DROP);
4402                         }
4403
4404                         /* offset of protocol header that follows h2 */
4405                         off2 = ipoff2 + (h2.ip_hl << 2);
4406
4407                         pd2.proto = h2.ip_p;
4408                         pd2.src = (struct pf_addr *)&h2.ip_src;
4409                         pd2.dst = (struct pf_addr *)&h2.ip_dst;
4410                         pd2.ip_sum = &h2.ip_sum;
4411                         break;
4412 #endif /* INET */
4413 #ifdef INET6
4414                 case AF_INET6:
4415                         ipoff2 = off + sizeof(struct icmp6_hdr);
4416
4417                         if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
4418                             NULL, reason, pd2.af)) {
4419                                 DPFPRINTF(PF_DEBUG_MISC,
4420                                     ("pf: ICMP error message too short "
4421                                     "(ip6)\n"));
4422                                 return (PF_DROP);
4423                         }
4424                         pd2.proto = h2_6.ip6_nxt;
4425                         pd2.src = (struct pf_addr *)&h2_6.ip6_src;
4426                         pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
4427                         pd2.ip_sum = NULL;
4428                         off2 = ipoff2 + sizeof(h2_6);
4429                         do {
4430                                 switch (pd2.proto) {
4431                                 case IPPROTO_FRAGMENT:
4432                                         /*
4433                                          * ICMPv6 error messages for
4434                                          * non-first fragments
4435                                          */
4436                                         REASON_SET(reason, PFRES_FRAG);
4437                                         return (PF_DROP);
4438                                 case IPPROTO_AH:
4439                                 case IPPROTO_HOPOPTS:
4440                                 case IPPROTO_ROUTING:
4441                                 case IPPROTO_DSTOPTS: {
4442                                         /* get next header and header length */
4443                                         struct ip6_ext opt6;
4444
4445                                         if (!pf_pull_hdr(m, off2, &opt6,
4446                                             sizeof(opt6), NULL, reason,
4447                                             pd2.af)) {
4448                                                 DPFPRINTF(PF_DEBUG_MISC,
4449                                                     ("pf: ICMPv6 short opt\n"));
4450                                                 return (PF_DROP);
4451                                         }
4452                                         if (pd2.proto == IPPROTO_AH)
4453                                                 off2 += (opt6.ip6e_len + 2) * 4;
4454                                         else
4455                                                 off2 += (opt6.ip6e_len + 1) * 8;
4456                                         pd2.proto = opt6.ip6e_nxt;
4457                                         /* go to the next header */
4458                                         break;
4459                                 }
4460                                 default:
4461                                         terminal++;
4462                                         break;
4463                                 }
4464                         } while (!terminal);
4465                         break;
4466 #endif /* INET6 */
4467                 }
4468
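                     /*
                      * Look up the state owned by the quoted packet and, if
                      * that state is NATed, rewrite both the outer ICMP
                      * header and the quoted headers before copying them
                      * back into the mbuf.
                      */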
4469                 switch (pd2.proto) {
4470                 case IPPROTO_TCP: {
4471                         struct tcphdr            th;
4472                         u_int32_t                seq;
4473                         struct pf_state_peer    *src, *dst;
4474                         u_int8_t                 dws;
4475                         int                      copyback = 0;
4476
4477                         /*
4478                          * Only the first 8 bytes of the TCP header can be
4479                          * expected to be present. Don't access any TCP header
4480                          * fields after th_seq; an ackskew test is not possible.
4481                          */
4482                         if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
4483                             pd2.af)) {
4484                                 DPFPRINTF(PF_DEBUG_MISC,
4485                                     ("pf: ICMP error message too short "
4486                                     "(tcp)\n"));
4487                                 return (PF_DROP);
4488                         }
4489
4490                         key.af = pd2.af;
4491                         key.proto = IPPROTO_TCP;
4492                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4493                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4494                         key.port[pd2.sidx] = th.th_sport;
4495                         key.port[pd2.didx] = th.th_dport;
4496
4497                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4498
4499                         if (direction == (*state)->direction) {
4500                                 src = &(*state)->dst;
4501                                 dst = &(*state)->src;
4502                         } else {
4503                                 src = &(*state)->src;
4504                                 dst = &(*state)->dst;
4505                         }
4506
4507                         if (src->wscale && dst->wscale)
4508                                 dws = dst->wscale & PF_WSCALE_MASK;
4509                         else
4510                                 dws = 0;
4511
4512                         /* Demodulate sequence number */
4513                         seq = ntohl(th.th_seq) - src->seqdiff;
4514                         if (src->seqdiff) {
4515                                 pf_change_a(&th.th_seq, icmpsum,
4516                                     htonl(seq), 0);
4517                                 copyback = 1;
4518                         }
4519
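                             /*
                              * Unless the state is sloppy, the quoted sequence
                              * number must fall inside the window we have seen
                              * from this peer; otherwise treat the ICMP error
                              * as bogus.
                              */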
4520                         if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
4521                             (!SEQ_GEQ(src->seqhi, seq) ||
4522                             !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
4523                                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4524                                         printf("pf: BAD ICMP %d:%d ",
4525                                             icmptype, pd->hdr.icmp->icmp_code);
4526                                         pf_print_host(pd->src, 0, pd->af);
4527                                         printf(" -> ");
4528                                         pf_print_host(pd->dst, 0, pd->af);
4529                                         printf(" state: ");
4530                                         pf_print_state(*state);
4531                                         printf(" seq=%u\n", seq);
4532                                 }
4533                                 REASON_SET(reason, PFRES_BADSTATE);
4534                                 return (PF_DROP);
4535                         } else {
4536                                 if (V_pf_status.debug >= PF_DEBUG_MISC) {
4537                                         printf("pf: OK ICMP %d:%d ",
4538                                             icmptype, pd->hdr.icmp->icmp_code);
4539                                         pf_print_host(pd->src, 0, pd->af);
4540                                         printf(" -> ");
4541                                         pf_print_host(pd->dst, 0, pd->af);
4542                                         printf(" state: ");
4543                                         pf_print_state(*state);
4544                                         printf(" seq=%u\n", seq);
4545                                 }
4546                         }
4547
4548                         /* translate source/destination address, if necessary */
4549                         if ((*state)->key[PF_SK_WIRE] !=
4550                             (*state)->key[PF_SK_STACK]) {
4551                                 struct pf_state_key *nk =
4552                                     (*state)->key[pd->didx];
4553
4554                                 if (PF_ANEQ(pd2.src,
4555                                     &nk->addr[pd2.sidx], pd2.af) ||
4556                                     nk->port[pd2.sidx] != th.th_sport)
4557                                         pf_change_icmp(pd2.src, &th.th_sport,
4558                                             daddr, &nk->addr[pd2.sidx],
4559                                             nk->port[pd2.sidx], NULL,
4560                                             pd2.ip_sum, icmpsum,
4561                                             pd->ip_sum, 0, pd2.af);
4562
4563                                 if (PF_ANEQ(pd2.dst,
4564                                     &nk->addr[pd2.didx], pd2.af) ||
4565                                     nk->port[pd2.didx] != th.th_dport)
4566                                         pf_change_icmp(pd2.dst, &th.th_dport,
4567                                             NULL, /* XXX Inbound NAT? */
4568                                             &nk->addr[pd2.didx],
4569                                             nk->port[pd2.didx], NULL,
4570                                             pd2.ip_sum, icmpsum,
4571                                             pd->ip_sum, 0, pd2.af);
4572                                 copyback = 1;
4573                         }
4574
4575                         if (copyback) {
4576                                 switch (pd2.af) {
4577 #ifdef INET
4578                                 case AF_INET:
4579                                         m_copyback(m, off, ICMP_MINLEN,
4580                                             (caddr_t )pd->hdr.icmp);
4581                                         m_copyback(m, ipoff2, sizeof(h2),
4582                                             (caddr_t )&h2);
4583                                         break;
4584 #endif /* INET */
4585 #ifdef INET6
4586                                 case AF_INET6:
4587                                         m_copyback(m, off,
4588                                             sizeof(struct icmp6_hdr),
4589                                             (caddr_t )pd->hdr.icmp6);
4590                                         m_copyback(m, ipoff2, sizeof(h2_6),
4591                                             (caddr_t )&h2_6);
4592                                         break;
4593 #endif /* INET6 */
4594                                 }
4595                                 m_copyback(m, off2, 8, (caddr_t)&th);
4596                         }
4597
4598                         return (PF_PASS);
4599                         break;
4600                 }
4601                 case IPPROTO_UDP: {
4602                         struct udphdr           uh;
4603
4604                         if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
4605                             NULL, reason, pd2.af)) {
4606                                 DPFPRINTF(PF_DEBUG_MISC,
4607                                     ("pf: ICMP error message too short "
4608                                     "(udp)\n"));
4609                                 return (PF_DROP);
4610                         }
4611
4612                         key.af = pd2.af;
4613                         key.proto = IPPROTO_UDP;
4614                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4615                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4616                         key.port[pd2.sidx] = uh.uh_sport;
4617                         key.port[pd2.didx] = uh.uh_dport;
4618
4619                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4620
4621                         /* translate source/destination address, if necessary */
4622                         if ((*state)->key[PF_SK_WIRE] !=
4623                             (*state)->key[PF_SK_STACK]) {
4624                                 struct pf_state_key *nk =
4625                                     (*state)->key[pd->didx];
4626
4627                                 if (PF_ANEQ(pd2.src,
4628                                     &nk->addr[pd2.sidx], pd2.af) ||
4629                                     nk->port[pd2.sidx] != uh.uh_sport)
4630                                         pf_change_icmp(pd2.src, &uh.uh_sport,
4631                                             daddr, &nk->addr[pd2.sidx],
4632                                             nk->port[pd2.sidx], &uh.uh_sum,
4633                                             pd2.ip_sum, icmpsum,
4634                                             pd->ip_sum, 1, pd2.af);
4635
4636                                 if (PF_ANEQ(pd2.dst,
4637                                     &nk->addr[pd2.didx], pd2.af) ||
4638                                     nk->port[pd2.didx] != uh.uh_dport)
4639                                         pf_change_icmp(pd2.dst, &uh.uh_dport,
4640                                             NULL, /* XXX Inbound NAT? */
4641                                             &nk->addr[pd2.didx],
4642                                             nk->port[pd2.didx], &uh.uh_sum,
4643                                             pd2.ip_sum, icmpsum,
4644                                             pd->ip_sum, 1, pd2.af);
4645
4646                                 switch (pd2.af) {
4647 #ifdef INET
4648                                 case AF_INET:
4649                                         m_copyback(m, off, ICMP_MINLEN,
4650                                             (caddr_t )pd->hdr.icmp);
4651                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4652                                         break;
4653 #endif /* INET */
4654 #ifdef INET6
4655                                 case AF_INET6:
4656                                         m_copyback(m, off,
4657                                             sizeof(struct icmp6_hdr),
4658                                             (caddr_t )pd->hdr.icmp6);
4659                                         m_copyback(m, ipoff2, sizeof(h2_6),
4660                                             (caddr_t )&h2_6);
4661                                         break;
4662 #endif /* INET6 */
4663                                 }
4664                                 m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
4665                         }
4666                         return (PF_PASS);
4667                         break;
4668                 }
4669 #ifdef INET
4670                 case IPPROTO_ICMP: {
4671                         struct icmp             iih;
4672
4673                         if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
4674                             NULL, reason, pd2.af)) {
4675                                 DPFPRINTF(PF_DEBUG_MISC,
4676                                     ("pf: ICMP error message too short "
4677                                     "(icmp)\n"));
4678                                 return (PF_DROP);
4679                         }
4680
4681                         key.af = pd2.af;
4682                         key.proto = IPPROTO_ICMP;
4683                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4684                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4685                         key.port[0] = key.port[1] = iih.icmp_id;
4686
4687                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4688
4689                         /* translate source/destination address, if necessary */
4690                         if ((*state)->key[PF_SK_WIRE] !=
4691                             (*state)->key[PF_SK_STACK]) {
4692                                 struct pf_state_key *nk =
4693                                     (*state)->key[pd->didx];
4694
4695                                 if (PF_ANEQ(pd2.src,
4696                                     &nk->addr[pd2.sidx], pd2.af) ||
4697                                     nk->port[pd2.sidx] != iih.icmp_id)
4698                                         pf_change_icmp(pd2.src, &iih.icmp_id,
4699                                             daddr, &nk->addr[pd2.sidx],
4700                                             nk->port[pd2.sidx], NULL,
4701                                             pd2.ip_sum, icmpsum,
4702                                             pd->ip_sum, 0, AF_INET);
4703
4704                                 if (PF_ANEQ(pd2.dst,
4705                                     &nk->addr[pd2.didx], pd2.af) ||
4706                                     nk->port[pd2.didx] != iih.icmp_id)
4707                                         pf_change_icmp(pd2.dst, &iih.icmp_id,
4708                                             NULL, /* XXX Inbound NAT? */
4709                                             &nk->addr[pd2.didx],
4710                                             nk->port[pd2.didx], NULL,
4711                                             pd2.ip_sum, icmpsum,
4712                                             pd->ip_sum, 0, AF_INET);
4713
4714                                 m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
4715                                 m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4716                                 m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
4717                         }
4718                         return (PF_PASS);
4719                         break;
4720                 }
4721 #endif /* INET */
4722 #ifdef INET6
4723                 case IPPROTO_ICMPV6: {
4724                         struct icmp6_hdr        iih;
4725
4726                         if (!pf_pull_hdr(m, off2, &iih,
4727                             sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
4728                                 DPFPRINTF(PF_DEBUG_MISC,
4729                                     ("pf: ICMP error message too short "
4730                                     "(icmp6)\n"));
4731                                 return (PF_DROP);
4732                         }
4733
4734                         key.af = pd2.af;
4735                         key.proto = IPPROTO_ICMPV6;
4736                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4737                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4738                         key.port[0] = key.port[1] = iih.icmp6_id;
4739
4740                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4741
4742                         /* translate source/destination address, if necessary */
4743                         if ((*state)->key[PF_SK_WIRE] !=
4744                             (*state)->key[PF_SK_STACK]) {
4745                                 struct pf_state_key *nk =
4746                                     (*state)->key[pd->didx];
4747
4748                                 if (PF_ANEQ(pd2.src,
4749                                     &nk->addr[pd2.sidx], pd2.af) ||
4750                                     nk->port[pd2.sidx] != iih.icmp6_id)
4751                                         pf_change_icmp(pd2.src, &iih.icmp6_id,
4752                                             daddr, &nk->addr[pd2.sidx],
4753                                             nk->port[pd2.sidx], NULL,
4754                                             pd2.ip_sum, icmpsum,
4755                                             pd->ip_sum, 0, AF_INET6);
4756
4757                                 if (PF_ANEQ(pd2.dst,
4758                                     &nk->addr[pd2.didx], pd2.af) ||
4759                                     nk->port[pd2.didx] != iih.icmp6_id)
4760                                         pf_change_icmp(pd2.dst, &iih.icmp6_id,
4761                                             NULL, /* XXX Inbound NAT? */
4762                                             &nk->addr[pd2.didx],
4763                                             nk->port[pd2.didx], NULL,
4764                                             pd2.ip_sum, icmpsum,
4765                                             pd->ip_sum, 0, AF_INET6);
4766
4767                                 m_copyback(m, off, sizeof(struct icmp6_hdr),
4768                                     (caddr_t)pd->hdr.icmp6);
4769                                 m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
4770                                 m_copyback(m, off2, sizeof(struct icmp6_hdr),
4771                                     (caddr_t)&iih);
4772                         }
4773                         return (PF_PASS);
4774                         break;
4775                 }
4776 #endif /* INET6 */
4777                 default: {
4778                         key.af = pd2.af;
4779                         key.proto = pd2.proto;
4780                         PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
4781                         PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
4782                         key.port[0] = key.port[1] = 0;
4783
4784                         STATE_LOOKUP(kif, &key, direction, *state, pd);
4785
4786                         /* translate source/destination address, if necessary */
4787                         if ((*state)->key[PF_SK_WIRE] !=
4788                             (*state)->key[PF_SK_STACK]) {
4789                                 struct pf_state_key *nk =
4790                                     (*state)->key[pd->didx];
4791
4792                                 if (PF_ANEQ(pd2.src,
4793                                     &nk->addr[pd2.sidx], pd2.af))
4794                                         pf_change_icmp(pd2.src, NULL, daddr,
4795                                             &nk->addr[pd2.sidx], 0, NULL,
4796                                             pd2.ip_sum, icmpsum,
4797                                             pd->ip_sum, 0, pd2.af);
4798
4799                                 if (PF_ANEQ(pd2.dst,
4800                                     &nk->addr[pd2.didx], pd2.af))
4801                                         pf_change_icmp(pd2.dst, NULL,
4802                                             NULL, /* XXX Inbound NAT? */
4803                                             &nk->addr[pd2.didx], 0, NULL,
4804                                             pd2.ip_sum, icmpsum,
4805                                             pd->ip_sum, 0, pd2.af);
4806
4807                                 switch (pd2.af) {
4808 #ifdef INET
4809                                 case AF_INET:
4810                                         m_copyback(m, off, ICMP_MINLEN,
4811                                             (caddr_t)pd->hdr.icmp);
4812                                         m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
4813                                         break;
4814 #endif /* INET */
4815 #ifdef INET6
4816                                 case AF_INET6:
4817                                         m_copyback(m, off,
4818                                             sizeof(struct icmp6_hdr),
4819                                             (caddr_t )pd->hdr.icmp6);
4820                                         m_copyback(m, ipoff2, sizeof(h2_6),
4821                                             (caddr_t )&h2_6);
4822                                         break;
4823 #endif /* INET6 */
4824                                 }
4825                         }
4826                         return (PF_PASS);
4827                         break;
4828                 }
4829                 }
4830         }
4831 }
4832
4833 static int
4834 pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
4835     struct mbuf *m, struct pf_pdesc *pd)
4836 {
4837         struct pf_state_peer    *src, *dst;
4838         struct pf_state_key_cmp  key;
4839
4840         bzero(&key, sizeof(key));
4841         key.af = pd->af;
4842         key.proto = pd->proto;
4843         if (direction == PF_IN) {
4844                 PF_ACPY(&key.addr[0], pd->src, key.af);
4845                 PF_ACPY(&key.addr[1], pd->dst, key.af);
4846                 key.port[0] = key.port[1] = 0;
4847         } else {
4848                 PF_ACPY(&key.addr[1], pd->src, key.af);
4849                 PF_ACPY(&key.addr[0], pd->dst, key.af);
4850                 key.port[1] = key.port[0] = 0;
4851         }
4852
4853         STATE_LOOKUP(kif, &key, direction, *state, pd);
4854
4855         if (direction == (*state)->direction) {
4856                 src = &(*state)->src;
4857                 dst = &(*state)->dst;
4858         } else {
4859                 src = &(*state)->dst;
4860                 dst = &(*state)->src;
4861         }
4862
4863         /* update states */
4864         if (src->state < PFOTHERS_SINGLE)
4865                 src->state = PFOTHERS_SINGLE;
4866         if (dst->state == PFOTHERS_SINGLE)
4867                 dst->state = PFOTHERS_MULTIPLE;
4868
4869         /* update expire time */
4870         (*state)->expire = time_uptime;
4871         if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
4872                 (*state)->timeout = PFTM_OTHER_MULTIPLE;
4873         else
4874                 (*state)->timeout = PFTM_OTHER_SINGLE;
4875
4876         /* translate source/destination address, if necessary */
4877         if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
4878                 struct pf_state_key *nk = (*state)->key[pd->didx];
4879
4880                 KASSERT(nk, ("%s: nk is null", __func__));
4881                 KASSERT(pd, ("%s: pd is null", __func__));
4882                 KASSERT(pd->src, ("%s: pd->src is null", __func__));
4883                 KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
4884                 switch (pd->af) {
4885 #ifdef INET
4886                 case AF_INET:
4887                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
4888                                 pf_change_a(&pd->src->v4.s_addr,
4889                                     pd->ip_sum,
4890                                     nk->addr[pd->sidx].v4.s_addr,
4891                                     0);
4892
4893
4894                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
4895                                 pf_change_a(&pd->dst->v4.s_addr,
4896                                     pd->ip_sum,
4897                                     nk->addr[pd->didx].v4.s_addr,
4898                                     0);
4899
4900                         break;
4901 #endif /* INET */
4902 #ifdef INET6
4903                 case AF_INET6:
4904                         if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET6))
4905                                 PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
4906
4907                         if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET6))
4908                                 PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
4909 #endif /* INET6 */
4910                 }
4911         }
4912         return (PF_PASS);
4913 }
4914
4915 /*
4916  * ipoff and off are measured from the start of the mbuf chain.
4917  * h must be at "ipoff" on the mbuf chain.
4918  */
4919 void *
4920 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
4921     u_short *actionp, u_short *reasonp, sa_family_t af)
4922 {
4923         switch (af) {
4924 #ifdef INET
4925         case AF_INET: {
4926                 struct ip       *h = mtod(m, struct ip *);
4927                 u_int16_t        fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
4928
4929                 if (fragoff) {
4930                         if (fragoff >= len)
4931                                 ACTION_SET(actionp, PF_PASS);
4932                         else {
4933                                 ACTION_SET(actionp, PF_DROP);
4934                                 REASON_SET(reasonp, PFRES_FRAG);
4935                         }
4936                         return (NULL);
4937                 }
4938                 if (m->m_pkthdr.len < off + len ||
4939                     ntohs(h->ip_len) < off + len) {
4940                         ACTION_SET(actionp, PF_DROP);
4941                         REASON_SET(reasonp, PFRES_SHORT);
4942                         return (NULL);
4943                 }
4944                 break;
4945         }
4946 #endif /* INET */
4947 #ifdef INET6
4948         case AF_INET6: {
4949                 struct ip6_hdr  *h = mtod(m, struct ip6_hdr *);
4950
4951                 if (m->m_pkthdr.len < off + len ||
4952                     (ntohs(h->ip6_plen) + sizeof(struct ip6_hdr)) <
4953                     (unsigned)(off + len)) {
4954                         ACTION_SET(actionp, PF_DROP);
4955                         REASON_SET(reasonp, PFRES_SHORT);
4956                         return (NULL);
4957                 }
4958                 break;
4959         }
4960 #endif /* INET6 */
4961         }
4962         m_copydata(m, off, len, p);
4963         return (p);
4964 }
4965
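     /*
      * Return 1 if addr has a route.  When an input interface (kif) is
      * given this doubles as a loose uRPF check: at least one route back
      * to addr must point at that interface, walking multipath routes
      * when RADIX_MPATH is compiled in.
      */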
4966 int
4967 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
4968     int rtableid)
4969 {
4970 #ifdef RADIX_MPATH
4971         struct radix_node_head  *rnh;
4972 #endif
4973         struct sockaddr_in      *dst;
4974         int                      ret = 1;
4975         int                      check_mpath;
4976 #ifdef INET6
4977         struct sockaddr_in6     *dst6;
4978         struct route_in6         ro;
4979 #else
4980         struct route             ro;
4981 #endif
4982         struct radix_node       *rn;
4983         struct rtentry          *rt;
4984         struct ifnet            *ifp;
4985
4986         check_mpath = 0;
4987 #ifdef RADIX_MPATH
4988         /* XXX: stick to table 0 for now */
4989         rnh = rt_tables_get_rnh(0, af);
4990         if (rnh != NULL && rn_mpath_capable(rnh))
4991                 check_mpath = 1;
4992 #endif
4993         bzero(&ro, sizeof(ro));
4994         switch (af) {
4995         case AF_INET:
4996                 dst = satosin(&ro.ro_dst);
4997                 dst->sin_family = AF_INET;
4998                 dst->sin_len = sizeof(*dst);
4999                 dst->sin_addr = addr->v4;
5000                 break;
5001 #ifdef INET6
5002         case AF_INET6:
5003                 /*
5004                  * Skip check for addresses with embedded interface scope,
5005                  * as they would always match anyway.
5006                  */
5007                 if (IN6_IS_SCOPE_EMBED(&addr->v6))
5008                         goto out;
5009                 dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
5010                 dst6->sin6_family = AF_INET6;
5011                 dst6->sin6_len = sizeof(*dst6);
5012                 dst6->sin6_addr = addr->v6;
5013                 break;
5014 #endif /* INET6 */
5015         default:
5016                 return (0);
5017         }
5018
5019         /* Skip checks for ipsec interfaces */
5020         if (kif != NULL && kif->pfik_ifp->if_type == IFT_ENC)
5021                 goto out;
5022
5023         switch (af) {
5024 #ifdef INET6
5025         case AF_INET6:
5026                 in6_rtalloc_ign(&ro, 0, rtableid);
5027                 break;
5028 #endif
5029 #ifdef INET
5030         case AF_INET:
5031                 in_rtalloc_ign((struct route *)&ro, 0, rtableid);
5032                 break;
5033 #endif
5034         default:
5035                 rtalloc_ign((struct route *)&ro, 0);    /* No/default FIB. */
5036                 break;
5037         }
5038
5039         if (ro.ro_rt != NULL) {
5040                 /* No interface given, this is a no-route check */
5041                 if (kif == NULL)
5042                         goto out;
5043
5044                 if (kif->pfik_ifp == NULL) {
5045                         ret = 0;
5046                         goto out;
5047                 }
5048
5049                 /* Perform uRPF check if an input interface was given */
5050                 ret = 0;
5051                 rn = (struct radix_node *)ro.ro_rt;
5052                 do {
5053                         rt = (struct rtentry *)rn;
5054                         ifp = rt->rt_ifp;
5055
5056                         if (kif->pfik_ifp == ifp)
5057                                 ret = 1;
5058 #ifdef RADIX_MPATH
5059                         rn = rn_mpath_next(rn);
5060 #endif
5061                 } while (check_mpath == 1 && rn != NULL && ret == 0);
5062         } else
5063                 ret = 0;
5064 out:
5065         if (ro.ro_rt != NULL)
5066                 RTFREE(ro.ro_rt);
5067         return (ret);
5068 }
5069
5070 #ifdef INET
5071 static void
5072 pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5073     struct pf_state *s, struct pf_pdesc *pd)
5074 {
5075         struct mbuf             *m0, *m1;
5076         struct sockaddr_in      dst;
5077         struct ip               *ip;
5078         struct ifnet            *ifp = NULL;
5079         struct pf_addr           naddr;
5080         struct pf_src_node      *sn = NULL;
5081         int                      error = 0;
5082         int sw_csum;
5083
5084         KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5085         KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5086             __func__));
5087
5088         if ((pd->pf_mtag == NULL &&
5089             ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5090             pd->pf_mtag->routed++ > 3) {
5091                 m0 = *m;
5092                 *m = NULL;
5093                 goto bad_locked;
5094         }
5095
5096         if (r->rt == PF_DUPTO) {
5097                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5098                         if (s)
5099                                 PF_STATE_UNLOCK(s);
5100                         return;
5101                 }
5102         } else {
5103                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5104                         if (s)
5105                                 PF_STATE_UNLOCK(s);
5106                         return;
5107                 }
5108                 m0 = *m;
5109         }
5110
5111         ip = mtod(m0, struct ip *);
5112
5113         bzero(&dst, sizeof(dst));
5114         dst.sin_family = AF_INET;
5115         dst.sin_len = sizeof(dst);
5116         dst.sin_addr = ip->ip_dst;
5117
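             /*
              * PF_FASTROUTE just consults the FIB.  For route-to/reply-to/
              * dup-to the next hop comes from the rule's address pool, or
              * from the address cached in the state (s->rt_addr) when one
              * exists.
              */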
5118         if (r->rt == PF_FASTROUTE) {
5119                 struct rtentry *rt;
5120
5121                 if (s)
5122                         PF_STATE_UNLOCK(s);
5123                 rt = rtalloc1_fib(sintosa(&dst), 0, 0, M_GETFIB(m0));
5124                 if (rt == NULL) {
5125                         /* No route to the destination host. */
5126                         KMOD_IPSTAT_INC(ips_noroute);
5127                         error = EHOSTUNREACH;
5128                         goto bad;
5129                 }
5130
5131                 ifp = rt->rt_ifp;
5132                 rt->rt_rmx.rmx_pksent++;
5133
5134                 if (rt->rt_flags & RTF_GATEWAY)
5135                         bcopy(satosin(rt->rt_gateway), &dst, sizeof(dst));
5136                 RTFREE_LOCKED(rt);
5137         } else {
5138                 if (TAILQ_EMPTY(&r->rpool.list)) {
5139                         DPFPRINTF(PF_DEBUG_URGENT,
5140                             ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5141                         goto bad_locked;
5142                 }
5143                 if (s == NULL) {
5144                         pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
5145                             &naddr, NULL, &sn);
5146                         if (!PF_AZERO(&naddr, AF_INET))
5147                                 dst.sin_addr.s_addr = naddr.v4.s_addr;
5148                         ifp = r->rpool.cur->kif ?
5149                             r->rpool.cur->kif->pfik_ifp : NULL;
5150                 } else {
5151                         if (!PF_AZERO(&s->rt_addr, AF_INET))
5152                                 dst.sin_addr.s_addr =
5153                                     s->rt_addr.v4.s_addr;
5154                         ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5155                         PF_STATE_UNLOCK(s);
5156                 }
5157         }
5158         if (ifp == NULL)
5159                 goto bad;
5160
5161         if (oifp != ifp) {
5162                 if (pf_test(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5163                         goto bad;
5164                 else if (m0 == NULL)
5165                         goto done;
5166                 if (m0->m_len < sizeof(struct ip)) {
5167                         DPFPRINTF(PF_DEBUG_URGENT,
5168                             ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
5169                         goto bad;
5170                 }
5171                 ip = mtod(m0, struct ip *);
5172         }
5173
5174         if (ifp->if_flags & IFF_LOOPBACK)
5175                 m0->m_flags |= M_SKIP_FIREWALL;
5176
5177         /* Back to host byte order. */
5178         ip->ip_len = ntohs(ip->ip_len);
5179         ip->ip_off = ntohs(ip->ip_off);
5180
5181         /* Copied from FreeBSD 10.0-CURRENT ip_output. */
5182         m0->m_pkthdr.csum_flags |= CSUM_IP;
5183         sw_csum = m0->m_pkthdr.csum_flags & ~ifp->if_hwassist;
5184         if (sw_csum & CSUM_DELAY_DATA) {
5185                 in_delayed_cksum(m0);
5186                 sw_csum &= ~CSUM_DELAY_DATA;
5187         }
5188 #ifdef SCTP
5189         if (sw_csum & CSUM_SCTP) {
5190                 sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
5191                 sw_csum &= ~CSUM_SCTP;
5192         }
5193 #endif
5194         m0->m_pkthdr.csum_flags &= ifp->if_hwassist;
5195
5196         /*
5197          * If small enough for interface, or the interface will take
5198          * care of the fragmentation for us, we can just send directly.
5199          */
5200         if (ip->ip_len <= ifp->if_mtu ||
5201             (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0 ||
5202             ((ip->ip_off & IP_DF) == 0 && (ifp->if_hwassist & CSUM_FRAGMENT))) {
5203                 ip->ip_len = htons(ip->ip_len);
5204                 ip->ip_off = htons(ip->ip_off);
5205                 ip->ip_sum = 0;
5206                 if (sw_csum & CSUM_DELAY_IP)
5207                         ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
5208                 m0->m_flags &= ~(M_PROTOFLAGS);
5209                 error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5210                 goto done;
5211         }
5212
5213         /* Balk when DF bit is set or the interface doesn't support TSO. */
5214         if ((ip->ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
5215                 error = EMSGSIZE;
5216                 KMOD_IPSTAT_INC(ips_cantfrag);
5217                 if (r->rt != PF_DUPTO) {
5218                         icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
5219                             ifp->if_mtu);
5220                         goto done;
5221                 } else
5222                         goto bad;
5223         }
5224
5225         error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist, sw_csum);
5226         if (error)
5227                 goto bad;
5228
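             /*
              * ip_fragment() leaves the fragments chained via m_nextpkt;
              * send them one at a time and free the remainder once an
              * output error occurs.
              */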
5229         for (; m0; m0 = m1) {
5230                 m1 = m0->m_nextpkt;
5231                 m0->m_nextpkt = NULL;
5232                 if (error == 0) {
5233                         m0->m_flags &= ~(M_PROTOFLAGS);
5234                         error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
5235                 } else
5236                         m_freem(m0);
5237         }
5238
5239         if (error == 0)
5240                 KMOD_IPSTAT_INC(ips_fragmented);
5241
5242 done:
5243         if (r->rt != PF_DUPTO)
5244                 *m = NULL;
5245         return;
5246
5247 bad_locked:
5248         if (s)
5249                 PF_STATE_UNLOCK(s);
5250 bad:
5251         m_freem(m0);
5252         goto done;
5253 }
5254 #endif /* INET */
5255
5256 #ifdef INET6
5257 static void
5258 pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
5259     struct pf_state *s, struct pf_pdesc *pd)
5260 {
5261         struct mbuf             *m0;
5262         struct sockaddr_in6     dst;
5263         struct ip6_hdr          *ip6;
5264         struct ifnet            *ifp = NULL;
5265         struct pf_addr           naddr;
5266         struct pf_src_node      *sn = NULL;
5267
5268         KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
5269         KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
5270             __func__));
5271
5272         if ((pd->pf_mtag == NULL &&
5273             ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
5274             pd->pf_mtag->routed++ > 3) {
5275                 m0 = *m;
5276                 *m = NULL;
5277                 goto bad_locked;
5278         }
5279
5280         if (r->rt == PF_DUPTO) {
5281                 if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
5282                         if (s)
5283                                 PF_STATE_UNLOCK(s);
5284                         return;
5285                 }
5286         } else {
5287                 if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
5288                         if (s)
5289                                 PF_STATE_UNLOCK(s);
5290                         return;
5291                 }
5292                 m0 = *m;
5293         }
5294
5295         ip6 = mtod(m0, struct ip6_hdr *);
5296
5297         bzero(&dst, sizeof(dst));
5298         dst.sin6_family = AF_INET6;
5299         dst.sin6_len = sizeof(dst);
5300         dst.sin6_addr = ip6->ip6_dst;
5301
5302         /* Cheat. XXX why only in the v6 case??? */
5303         if (r->rt == PF_FASTROUTE) {
5304                 if (s)
5305                         PF_STATE_UNLOCK(s);
5306                 m0->m_flags |= M_SKIP_FIREWALL;
5307                 ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
5308                 return;
5309         }
5310
5311         if (TAILQ_EMPTY(&r->rpool.list)) {
5312                 DPFPRINTF(PF_DEBUG_URGENT,
5313                     ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
5314                 goto bad_locked;
5315         }
5316         if (s == NULL) {
5317                 pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
5318                     &naddr, NULL, &sn);
5319                 if (!PF_AZERO(&naddr, AF_INET6))
5320                         PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5321                             &naddr, AF_INET6);
5322                 ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
5323         } else {
5324                 if (!PF_AZERO(&s->rt_addr, AF_INET6))
5325                         PF_ACPY((struct pf_addr *)&dst.sin6_addr,
5326                             &s->rt_addr, AF_INET6);
5327                 ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
5328         }
5329
5330         if (s)
5331                 PF_STATE_UNLOCK(s);
5332
5333         if (ifp == NULL)
5334                 goto bad;
5335
5336         if (oifp != ifp) {
5337                 if (pf_test6(PF_OUT, ifp, &m0, NULL) != PF_PASS)
5338                         goto bad;
5339                 else if (m0 == NULL)
5340                         goto done;
5341                 if (m0->m_len < sizeof(struct ip6_hdr)) {
5342                         DPFPRINTF(PF_DEBUG_URGENT,
5343                             ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
5344                             __func__));
5345                         goto bad;
5346                 }
5347                 ip6 = mtod(m0, struct ip6_hdr *);
5348         }
5349
5350         if (ifp->if_flags & IFF_LOOPBACK)
5351                 m0->m_flags |= M_SKIP_FIREWALL;
5352
5353         /*
5354          * If the packet fits the outgoing interface, send it out; otherwise
5355          * send back an icmp6 packet-too-big error.
5356          */
5357         if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
5358                 dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
5359         if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu)
5360                 nd6_output(ifp, ifp, m0, &dst, NULL);
5361         else {
5362                 in6_ifstat_inc(ifp, ifs6_in_toobig);
5363                 if (r->rt != PF_DUPTO)
5364                         icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
5365                 else
5366                         goto bad;
5367         }
5368
5369 done:
5370         if (r->rt != PF_DUPTO)
5371                 *m = NULL;
5372         return;
5373
5374 bad_locked:
5375         if (s)
5376                 PF_STATE_UNLOCK(s);
5377 bad:
5378         m_freem(m0);
5379         goto done;
5380 }
5381 #endif /* INET6 */
5382
5383 /*
5384  * FreeBSD supports cksum offloads for the following drivers.
5385  *  em(4), fxp(4), ixgb(4), lge(4), ndis(4), nge(4), re(4),
5386  *   ti(4), txp(4), xl(4)
5387  *
5388  * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
5389  *  the network driver performed the cksum including the pseudo header;
5390  *   we only need to verify csum_data
5391  * CSUM_DATA_VALID :
5392  *  the network driver performed the cksum, but an additional pseudo header
5393  *  cksum computation with the partial csum_data is needed (i.e. no H/W
5394  *  support for the pseudo header, for instance hme(4), sk(4) and possibly
5395  *  gem(4))
5396  *
5397  * After validating the cksum of the packet, set both the CSUM_DATA_VALID
5398  * and CSUM_PSEUDO_HDR flags to avoid recomputing the cksum in the upper
5399  * TCP/UDP layer.  Also, set csum_data to 0xffff to force cksum validation.
5400  */
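/*
 * Illustrative sketch (not compiled): how a driver-supplied partial checksum
 * (CSUM_DATA_VALID without CSUM_PSEUDO_HDR) is folded with the pseudo header,
 * mirroring the TCP/UDP branches of pf_check_proto_cksum() below.  The helper
 * name is hypothetical and exists only for exposition.
 */
#if 0
static u_int16_t
example_fold_partial_cksum(struct mbuf *m, struct ip *ip, int len, u_int8_t p)
{
	u_int16_t sum;

	/*
	 * csum_data holds the one's complement sum of the TCP/UDP segment
	 * computed by the NIC, without the pseudo header.
	 */
	sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
	    htonl((u_short)len + m->m_pkthdr.csum_data + p));
	/* A correct checksum verifies to 0xffff, so this returns 0 on success. */
	return (sum ^ 0xffff);
}
#endif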
5401 static int
5402 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
5403 {
5404         u_int16_t sum = 0;
5405         int hw_assist = 0;
5406         struct ip *ip;
5407
5408         if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
5409                 return (1);
5410         if (m->m_pkthdr.len < off + len)
5411                 return (1);
5412
5413         switch (p) {
5414         case IPPROTO_TCP:
5415                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5416                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5417                                 sum = m->m_pkthdr.csum_data;
5418                         } else {
5419                                 ip = mtod(m, struct ip *);
5420                                 sum = in_pseudo(ip->ip_src.s_addr,
5421                                     ip->ip_dst.s_addr, htonl((u_short)len +
5422                                     m->m_pkthdr.csum_data + IPPROTO_TCP));
5423                         }
5424                         sum ^= 0xffff;
5425                         ++hw_assist;
5426                 }
5427                 break;
5428         case IPPROTO_UDP:
5429                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
5430                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
5431                                 sum = m->m_pkthdr.csum_data;
5432                         } else {
5433                                 ip = mtod(m, struct ip *);
5434                                 sum = in_pseudo(ip->ip_src.s_addr,
5435                                     ip->ip_dst.s_addr, htonl((u_short)len +
5436                                     m->m_pkthdr.csum_data + IPPROTO_UDP));
5437                         }
5438                         sum ^= 0xffff;
5439                         ++hw_assist;
5440                 }
5441                 break;
5442         case IPPROTO_ICMP:
5443 #ifdef INET6
5444         case IPPROTO_ICMPV6:
5445 #endif /* INET6 */
5446                 break;
5447         default:
5448                 return (1);
5449         }
5450
5451         if (!hw_assist) {
5452                 switch (af) {
5453                 case AF_INET:
5454                         if (p == IPPROTO_ICMP) {
5455                                 if (m->m_len < off)
5456                                         return (1);
5457                                 m->m_data += off;
5458                                 m->m_len -= off;
5459                                 sum = in_cksum(m, len);
5460                                 m->m_data -= off;
5461                                 m->m_len += off;
5462                         } else {
5463                                 if (m->m_len < sizeof(struct ip))
5464                                         return (1);
5465                                 sum = in4_cksum(m, p, off, len);
5466                         }
5467                         break;
5468 #ifdef INET6
5469                 case AF_INET6:
5470                         if (m->m_len < sizeof(struct ip6_hdr))
5471                                 return (1);
5472                         sum = in6_cksum(m, p, off, len);
5473                         break;
5474 #endif /* INET6 */
5475                 default:
5476                         return (1);
5477                 }
5478         }
5479         if (sum) {
5480                 switch (p) {
5481                 case IPPROTO_TCP:
5482                     {
5483                         KMOD_TCPSTAT_INC(tcps_rcvbadsum);
5484                         break;
5485                     }
5486                 case IPPROTO_UDP:
5487                     {
5488                         KMOD_UDPSTAT_INC(udps_badsum);
5489                         break;
5490                     }
5491 #ifdef INET
5492                 case IPPROTO_ICMP:
5493                     {
5494                         KMOD_ICMPSTAT_INC(icps_checksum);
5495                         break;
5496                     }
5497 #endif
5498 #ifdef INET6
5499                 case IPPROTO_ICMPV6:
5500                     {
5501                         KMOD_ICMP6STAT_INC(icp6s_checksum);
5502                         break;
5503                     }
5504 #endif /* INET6 */
5505                 }
5506                 return (1);
5507         } else {
5508                 if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
5509                         m->m_pkthdr.csum_flags |=
5510                             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
5511                         m->m_pkthdr.csum_data = 0xffff;
5512                 }
5513         }
5514         return (0);
5515 }
5516
5518 #ifdef INET
5519 int
5520 pf_test(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5521 {
5522         struct pfi_kif          *kif;
5523         u_short                  action, reason = 0, log = 0;
5524         struct mbuf             *m = *m0;
5525         struct ip               *h = NULL;
5526         struct m_tag            *ipfwtag;
5527         struct pf_rule          *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5528         struct pf_state         *s = NULL;
5529         struct pf_ruleset       *ruleset = NULL;
5530         struct pf_pdesc          pd;
5531         int                      off, dirndx, pqid = 0;
5532
5533         M_ASSERTPKTHDR(m);
5534
5535         if (!V_pf_status.running)
5536                 return (PF_PASS);
5537
5538         memset(&pd, 0, sizeof(pd));
5539
5540         kif = (struct pfi_kif *)ifp->if_pf_kif;
5541
5542         if (kif == NULL) {
5543                 DPFPRINTF(PF_DEBUG_URGENT,
5544                     ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
5545                 return (PF_DROP);
5546         }
5547         if (kif->pfik_flags & PFI_IFLAG_SKIP)
5548                 return (PF_PASS);
5549
5550         if (m->m_flags & M_SKIP_FIREWALL)
5551                 return (PF_PASS);
5552
5553         pd.pf_mtag = pf_find_mtag(m);
5554
5555         PF_RULES_RLOCK();
5556
5557         if (m->m_pkthdr.len < (int)sizeof(struct ip)) {
5558                 action = PF_DROP;
5559                 REASON_SET(&reason, PFRES_SHORT);
5560                 log = 1;
5561                 goto done;
5562         }
5563
5564         if (ip_divert_ptr != NULL &&
5565             ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
5566                 struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
5567                 if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
5568                         if (pd.pf_mtag == NULL &&
5569                             ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5570                                 action = PF_DROP;
5571                                 goto done;
5572                         }
5573                         pd.pf_mtag->flags |= PF_PACKET_LOOPED;
5574                         m_tag_delete(m, ipfwtag);
5575                 }
5576                 if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
5577                         m->m_flags |= M_FASTFWD_OURS;
5578                         pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
5579                 }
5580         } else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
5581                 /* We do IP header normalization and packet reassembly here */
5582                 action = PF_DROP;
5583                 goto done;
5584         }
5585         m = *m0;        /* pf_normalize messes with m0 */
5586         h = mtod(m, struct ip *);
5587
5588         off = h->ip_hl << 2;
5589         if (off < (int)sizeof(struct ip)) {
5590                 action = PF_DROP;
5591                 REASON_SET(&reason, PFRES_SHORT);
5592                 log = 1;
5593                 goto done;
5594         }
5595
5596         pd.src = (struct pf_addr *)&h->ip_src;
5597         pd.dst = (struct pf_addr *)&h->ip_dst;
5598         pd.sport = pd.dport = NULL;
5599         pd.ip_sum = &h->ip_sum;
5600         pd.proto_sum = NULL;
5601         pd.proto = h->ip_p;
5602         pd.dir = dir;
5603         pd.sidx = (dir == PF_IN) ? 0 : 1;
5604         pd.didx = (dir == PF_IN) ? 1 : 0;
5605         pd.af = AF_INET;
5606         pd.tos = h->ip_tos;
5607         pd.tot_len = ntohs(h->ip_len);
5608
5609         /* handle fragments that didn't get reassembled by normalization */
5610         if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
5611                 action = pf_test_fragment(&r, dir, kif, m, h,
5612                     &pd, &a, &ruleset);
5613                 goto done;
5614         }
5615
5616         switch (h->ip_p) {
5617
5618         case IPPROTO_TCP: {
5619                 struct tcphdr   th;
5620
5621                 pd.hdr.tcp = &th;
5622                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
5623                     &action, &reason, AF_INET)) {
5624                         log = action != PF_PASS;
5625                         goto done;
5626                 }
5627                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
5628                 if ((th.th_flags & TH_ACK) && pd.p_len == 0)
5629                         pqid = 1;
5630                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
5631                 if (action == PF_DROP)
5632                         goto done;
5633                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
5634                     &reason);
5635                 if (action == PF_PASS) {
5636                         if (pfsync_update_state_ptr != NULL)
5637                                 pfsync_update_state_ptr(s);
5638                         r = s->rule.ptr;
5639                         a = s->anchor.ptr;
5640                         log = s->log;
5641                 } else if (s == NULL)
5642                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5643                             &a, &ruleset, inp);
5644                 break;
5645         }
5646
5647         case IPPROTO_UDP: {
5648                 struct udphdr   uh;
5649
5650                 pd.hdr.udp = &uh;
5651                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
5652                     &action, &reason, AF_INET)) {
5653                         log = action != PF_PASS;
5654                         goto done;
5655                 }
5656                 if (uh.uh_dport == 0 ||
5657                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
5658                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
5659                         action = PF_DROP;
5660                         REASON_SET(&reason, PFRES_SHORT);
5661                         goto done;
5662                 }
5663                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
5664                 if (action == PF_PASS) {
5665                         if (pfsync_update_state_ptr != NULL)
5666                                 pfsync_update_state_ptr(s);
5667                         r = s->rule.ptr;
5668                         a = s->anchor.ptr;
5669                         log = s->log;
5670                 } else if (s == NULL)
5671                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5672                             &a, &ruleset, inp);
5673                 break;
5674         }
5675
5676         case IPPROTO_ICMP: {
5677                 struct icmp     ih;
5678
5679                 pd.hdr.icmp = &ih;
5680                 if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
5681                     &action, &reason, AF_INET)) {
5682                         log = action != PF_PASS;
5683                         goto done;
5684                 }
5685                 action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
5686                     &reason);
5687                 if (action == PF_PASS) {
5688                         if (pfsync_update_state_ptr != NULL)
5689                                 pfsync_update_state_ptr(s);
5690                         r = s->rule.ptr;
5691                         a = s->anchor.ptr;
5692                         log = s->log;
5693                 } else if (s == NULL)
5694                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5695                             &a, &ruleset, inp);
5696                 break;
5697         }
5698
5699 #ifdef INET6
5700         case IPPROTO_ICMPV6: {
5701                 action = PF_DROP;
5702                 DPFPRINTF(PF_DEBUG_MISC,
5703                     ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
5704                 goto done;
5705         }
5706 #endif
5707
5708         default:
5709                 action = pf_test_state_other(&s, dir, kif, m, &pd);
5710                 if (action == PF_PASS) {
5711                         if (pfsync_update_state_ptr != NULL)
5712                                 pfsync_update_state_ptr(s);
5713                         r = s->rule.ptr;
5714                         a = s->anchor.ptr;
5715                         log = s->log;
5716                 } else if (s == NULL)
5717                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
5718                             &a, &ruleset, inp);
5719                 break;
5720         }
5721
5722 done:
5723         PF_RULES_RUNLOCK();
5724         if (action == PF_PASS && h->ip_hl > 5 &&
5725             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
5726                 action = PF_DROP;
5727                 REASON_SET(&reason, PFRES_IPOPTIONS);
5728                 log = 1;
5729                 DPFPRINTF(PF_DEBUG_MISC,
5730                     ("pf: dropping packet with ip options\n"));
5731         }
5732
5733         if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
5734                 action = PF_DROP;
5735                 REASON_SET(&reason, PFRES_MEMORY);
5736         }
5737         if (r->rtableid >= 0)
5738                 M_SETFIB(m, r->rtableid);
5739
5740 #ifdef ALTQ
5741         if (action == PF_PASS && r->qid) {
5742                 if (pd.pf_mtag == NULL &&
5743                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5744                         action = PF_DROP;
5745                         REASON_SET(&reason, PFRES_MEMORY);
5746                 } else {
5747                         if (pqid || (pd.tos & IPTOS_LOWDELAY))
5748                                 pd.pf_mtag->qid = r->pqid;
5749                         else
5750                                 pd.pf_mtag->qid = r->qid;
5751                         /* Add hints for ECN. */
5752                         pd.pf_mtag->hdr = h;
5753                 }
5754         }
5755 #endif /* ALTQ */
5756
5757         /*
5758          * connections redirected to loopback should not match sockets
5759          * bound specifically to loopback due to security implications,
5760          * see tcp_input() and in_pcblookup_listen().
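         * The address check below matches any destination in 127.0.0.0/8.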
5761          */
5762         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
5763             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
5764             (s->nat_rule.ptr->action == PF_RDR ||
5765             s->nat_rule.ptr->action == PF_BINAT) &&
5766             (ntohl(pd.dst->v4.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
5767                 m->m_flags |= M_SKIP_FIREWALL;
5768
5769         if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
5770             !PACKET_LOOPED(&pd)) {
5771
5772                 ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
5773                     sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
5774                 if (ipfwtag != NULL) {
5775                         ((struct ipfw_rule_ref *)(ipfwtag+1))->info =
5776                             ntohs(r->divert.port);
5777                         ((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
5778
5779                         if (s)
5780                                 PF_STATE_UNLOCK(s);
5781
5782                         m_tag_prepend(m, ipfwtag);
5783                         if (m->m_flags & M_FASTFWD_OURS) {
5784                                 if (pd.pf_mtag == NULL &&
5785                                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
5786                                         action = PF_DROP;
5787                                         REASON_SET(&reason, PFRES_MEMORY);
5788                                         log = 1;
5789                                         DPFPRINTF(PF_DEBUG_MISC,
5790                                             ("pf: failed to allocate tag\n"));
5791                                 } else {
5792                                         pd.pf_mtag->flags |= PF_FASTFWD_OURS_PRESENT;
5793                                         m->m_flags &= ~M_FASTFWD_OURS;
5794                                 }
5795                         ip_divert_ptr(*m0, dir ==  PF_IN ? DIR_IN : DIR_OUT);
5796                         *m0 = NULL;
5797
5798                         return (action);
5799                 } else {
5800                         /* XXX: ipfw has the same behaviour! */
5801                         action = PF_DROP;
5802                         REASON_SET(&reason, PFRES_MEMORY);
5803                         log = 1;
5804                         DPFPRINTF(PF_DEBUG_MISC,
5805                             ("pf: failed to allocate divert tag\n"));
5806                 }
5807         }
5808
5809         if (log) {
5810                 struct pf_rule *lr;
5811
5812                 if (s != NULL && s->nat_rule.ptr != NULL &&
5813                     s->nat_rule.ptr->log & PF_LOG_ALL)
5814                         lr = s->nat_rule.ptr;
5815                 else
5816                         lr = r;
5817                 PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
5818                     (s == NULL));
5819         }
5820
5821         kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
5822         kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
5823
5824         if (action == PF_PASS || r->action == PF_DROP) {
5825                 dirndx = (dir == PF_OUT);
5826                 r->packets[dirndx]++;
5827                 r->bytes[dirndx] += pd.tot_len;
5828                 if (a != NULL) {
5829                         a->packets[dirndx]++;
5830                         a->bytes[dirndx] += pd.tot_len;
5831                 }
5832                 if (s != NULL) {
5833                         if (s->nat_rule.ptr != NULL) {
5834                                 s->nat_rule.ptr->packets[dirndx]++;
5835                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
5836                         }
5837                         if (s->src_node != NULL) {
5838                                 s->src_node->packets[dirndx]++;
5839                                 s->src_node->bytes[dirndx] += pd.tot_len;
5840                         }
5841                         if (s->nat_src_node != NULL) {
5842                                 s->nat_src_node->packets[dirndx]++;
5843                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
5844                         }
5845                         dirndx = (dir == s->direction) ? 0 : 1;
5846                         s->packets[dirndx]++;
5847                         s->bytes[dirndx] += pd.tot_len;
5848                 }
5849                 tr = r;
5850                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
5851                 if (nr != NULL && r == &V_pf_default_rule)
5852                         tr = nr;
5853                 if (tr->src.addr.type == PF_ADDR_TABLE)
5854                         pfr_update_stats(tr->src.addr.p.tbl,
5855                             (s == NULL) ? pd.src :
5856                             &s->key[(s->direction == PF_IN)]->
5857                                 addr[(s->direction == PF_OUT)],
5858                             pd.af, pd.tot_len, dir == PF_OUT,
5859                             r->action == PF_PASS, tr->src.neg);
5860                 if (tr->dst.addr.type == PF_ADDR_TABLE)
5861                         pfr_update_stats(tr->dst.addr.p.tbl,
5862                             (s == NULL) ? pd.dst :
5863                             &s->key[(s->direction == PF_IN)]->
5864                                 addr[(s->direction == PF_IN)],
5865                             pd.af, pd.tot_len, dir == PF_OUT,
5866                             r->action == PF_PASS, tr->dst.neg);
5867         }
5868
5869         switch (action) {
5870         case PF_SYNPROXY_DROP:
5871                 m_freem(*m0);
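                /* FALLTHROUGH */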
5872         case PF_DEFER:
5873                 *m0 = NULL;
5874                 action = PF_PASS;
5875                 break;
5876         default:
5877                 /* pf_route() returns unlocked. */
5878                 if (r->rt) {
5879                         pf_route(m0, r, dir, kif->pfik_ifp, s, &pd);
5880                         return (action);
5881                 }
5882                 break;
5883         }
5884         if (s)
5885                 PF_STATE_UNLOCK(s);
5886
5887         return (action);
5888 }
5889 #endif /* INET */
5890
5891 #ifdef INET6
5892 int
5893 pf_test6(int dir, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
5894 {
5895         struct pfi_kif          *kif;
5896         u_short                  action, reason = 0, log = 0;
5897         struct mbuf             *m = *m0, *n = NULL;
5898         struct ip6_hdr          *h = NULL;
5899         struct pf_rule          *a = NULL, *r = &V_pf_default_rule, *tr, *nr;
5900         struct pf_state         *s = NULL;
5901         struct pf_ruleset       *ruleset = NULL;
5902         struct pf_pdesc          pd;
5903         int                      off, terminal = 0, dirndx, rh_cnt = 0;
5904
5905         M_ASSERTPKTHDR(m);
5906
5907         if (!V_pf_status.running)
5908                 return (PF_PASS);
5909
5910         memset(&pd, 0, sizeof(pd));
5911         pd.pf_mtag = pf_find_mtag(m);
5912
5913         if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
5914                 return (PF_PASS);
5915
5916         kif = (struct pfi_kif *)ifp->if_pf_kif;
5917         if (kif == NULL) {
5918                 DPFPRINTF(PF_DEBUG_URGENT,
5919                     ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
5920                 return (PF_DROP);
5921         }
5922         if (kif->pfik_flags & PFI_IFLAG_SKIP)
5923                 return (PF_PASS);
5924
5925         PF_RULES_RLOCK();
5926
5927         if (m->m_pkthdr.len < (int)sizeof(*h)) {
5928                 action = PF_DROP;
5929                 REASON_SET(&reason, PFRES_SHORT);
5930                 log = 1;
5931                 goto done;
5932         }
5933
5934         /* We do IP header normalization and packet reassembly here */
5935         if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
5936                 action = PF_DROP;
5937                 goto done;
5938         }
5939         m = *m0;        /* pf_normalize messes with m0 */
5940         h = mtod(m, struct ip6_hdr *);
5941
5942 #if 1
5943         /*
5944          * We do not support jumbograms yet.  If we keep going, a zero ip6_plen
5945          * will break the length calculations below, so drop the packet for now.
5946          */
5947         if (htons(h->ip6_plen) == 0) {
5948                 action = PF_DROP;
5949                 REASON_SET(&reason, PFRES_NORM);        /*XXX*/
5950                 goto done;
5951         }
5952 #endif
5953
5954         pd.src = (struct pf_addr *)&h->ip6_src;
5955         pd.dst = (struct pf_addr *)&h->ip6_dst;
5956         pd.sport = pd.dport = NULL;
5957         pd.ip_sum = NULL;
5958         pd.proto_sum = NULL;
5959         pd.dir = dir;
5960         pd.sidx = (dir == PF_IN) ? 0 : 1;
5961         pd.didx = (dir == PF_IN) ? 1 : 0;
5962         pd.af = AF_INET6;
5963         pd.tos = 0;
5964         pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
5965
5966         off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
5967         pd.proto = h->ip6_nxt;
5968         do {
5969                 switch (pd.proto) {
5970                 case IPPROTO_FRAGMENT:
5971                         action = pf_test_fragment(&r, dir, kif, m, h,
5972                             &pd, &a, &ruleset);
5973                         if (action == PF_DROP)
5974                                 REASON_SET(&reason, PFRES_FRAG);
5975                         goto done;
5976                 case IPPROTO_ROUTING: {
5977                         struct ip6_rthdr rthdr;
5978
5979                         if (rh_cnt++) {
5980                                 DPFPRINTF(PF_DEBUG_MISC,
5981                                     ("pf: IPv6 more than one rthdr\n"));
5982                                 action = PF_DROP;
5983                                 REASON_SET(&reason, PFRES_IPOPTIONS);
5984                                 log = 1;
5985                                 goto done;
5986                         }
5987                         if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
5988                             &reason, pd.af)) {
5989                                 DPFPRINTF(PF_DEBUG_MISC,
5990                                     ("pf: IPv6 short rthdr\n"));
5991                                 action = PF_DROP;
5992                                 REASON_SET(&reason, PFRES_SHORT);
5993                                 log = 1;
5994                                 goto done;
5995                         }
5996                         if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
5997                                 DPFPRINTF(PF_DEBUG_MISC,
5998                                     ("pf: IPv6 rthdr0\n"));
5999                                 action = PF_DROP;
6000                                 REASON_SET(&reason, PFRES_IPOPTIONS);
6001                                 log = 1;
6002                                 goto done;
6003                         }
6004                         /* FALLTHROUGH */
6005                 }
6006                 case IPPROTO_AH:
6007                 case IPPROTO_HOPOPTS:
6008                 case IPPROTO_DSTOPTS: {
6009                         /* get next header and header length */
6010                         struct ip6_ext  opt6;
6011
6012                         if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
6013                             NULL, &reason, pd.af)) {
6014                                 DPFPRINTF(PF_DEBUG_MISC,
6015                                     ("pf: IPv6 short opt\n"));
6016                                 action = PF_DROP;
6017                                 log = 1;
6018                                 goto done;
6019                         }
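                        /*
                         * ip6e_len counts 8-octet units beyond the first 8
                         * octets, except for AH, where it counts 32-bit words
                         * beyond the first two; e.g. an AH with ip6e_len == 4
                         * spans (4 + 2) * 4 == 24 bytes, while a dstopts
                         * header with ip6e_len == 0 spans 8 bytes.
                         */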
6020                         if (pd.proto == IPPROTO_AH)
6021                                 off += (opt6.ip6e_len + 2) * 4;
6022                         else
6023                                 off += (opt6.ip6e_len + 1) * 8;
6024                         pd.proto = opt6.ip6e_nxt;
6025                         /* go to the next header */
6026                         break;
6027                 }
6028                 default:
6029                         terminal++;
6030                         break;
6031                 }
6032         } while (!terminal);
6033
6034         /* if there's no routing header, use unmodified mbuf for checksumming */
6035         if (!n)
6036                 n = m;
6037
6038         switch (pd.proto) {
6039
6040         case IPPROTO_TCP: {
6041                 struct tcphdr   th;
6042
6043                 pd.hdr.tcp = &th;
6044                 if (!pf_pull_hdr(m, off, &th, sizeof(th),
6045                     &action, &reason, AF_INET6)) {
6046                         log = action != PF_PASS;
6047                         goto done;
6048                 }
6049                 pd.p_len = pd.tot_len - off - (th.th_off << 2);
6050                 action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
6051                 if (action == PF_DROP)
6052                         goto done;
6053                 action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
6054                     &reason);
6055                 if (action == PF_PASS) {
6056                         if (pfsync_update_state_ptr != NULL)
6057                                 pfsync_update_state_ptr(s);
6058                         r = s->rule.ptr;
6059                         a = s->anchor.ptr;
6060                         log = s->log;
6061                 } else if (s == NULL)
6062                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6063                             &a, &ruleset, inp);
6064                 break;
6065         }
6066
6067         case IPPROTO_UDP: {
6068                 struct udphdr   uh;
6069
6070                 pd.hdr.udp = &uh;
6071                 if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
6072                     &action, &reason, AF_INET6)) {
6073                         log = action != PF_PASS;
6074                         goto done;
6075                 }
6076                 if (uh.uh_dport == 0 ||
6077                     ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
6078                     ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
6079                         action = PF_DROP;
6080                         REASON_SET(&reason, PFRES_SHORT);
6081                         goto done;
6082                 }
6083                 action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
6084                 if (action == PF_PASS) {
6085                         if (pfsync_update_state_ptr != NULL)
6086                                 pfsync_update_state_ptr(s);
6087                         r = s->rule.ptr;
6088                         a = s->anchor.ptr;
6089                         log = s->log;
6090                 } else if (s == NULL)
6091                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6092                             &a, &ruleset, inp);
6093                 break;
6094         }
6095
6096         case IPPROTO_ICMP: {
6097                 action = PF_DROP;
6098                 DPFPRINTF(PF_DEBUG_MISC,
6099                     ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
6100                 goto done;
6101         }
6102
6103         case IPPROTO_ICMPV6: {
6104                 struct icmp6_hdr        ih;
6105
6106                 pd.hdr.icmp6 = &ih;
6107                 if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
6108                     &action, &reason, AF_INET6)) {
6109                         log = action != PF_PASS;
6110                         goto done;
6111                 }
6112                 action = pf_test_state_icmp(&s, dir, kif,
6113                     m, off, h, &pd, &reason);
6114                 if (action == PF_PASS) {
6115                         if (pfsync_update_state_ptr != NULL)
6116                                 pfsync_update_state_ptr(s);
6117                         r = s->rule.ptr;
6118                         a = s->anchor.ptr;
6119                         log = s->log;
6120                 } else if (s == NULL)
6121                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6122                             &a, &ruleset, inp);
6123                 break;
6124         }
6125
6126         default:
6127                 action = pf_test_state_other(&s, dir, kif, m, &pd);
6128                 if (action == PF_PASS) {
6129                         if (pfsync_update_state_ptr != NULL)
6130                                 pfsync_update_state_ptr(s);
6131                         r = s->rule.ptr;
6132                         a = s->anchor.ptr;
6133                         log = s->log;
6134                 } else if (s == NULL)
6135                         action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
6136                             &a, &ruleset, inp);
6137                 break;
6138         }
6139
6140 done:
6141         PF_RULES_RUNLOCK();
6142         if (n != m) {
6143                 m_freem(n);
6144                 n = NULL;
6145         }
6146
6147         /* handle dangerous IPv6 extension headers. */
6148         if (action == PF_PASS && rh_cnt &&
6149             !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
6150                 action = PF_DROP;
6151                 REASON_SET(&reason, PFRES_IPOPTIONS);
6152                 log = 1;
6153                 DPFPRINTF(PF_DEBUG_MISC,
6154                     ("pf: dropping packet with dangerous v6 headers\n"));
6155         }
6156
6157         if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
6158                 action = PF_DROP;
6159                 REASON_SET(&reason, PFRES_MEMORY);
6160         }
6161         if (r->rtableid >= 0)
6162                 M_SETFIB(m, r->rtableid);
6163
6164 #ifdef ALTQ
6165         if (action == PF_PASS && r->qid) {
6166                 if (pd.pf_mtag == NULL &&
6167                     ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
6168                         action = PF_DROP;
6169                         REASON_SET(&reason, PFRES_MEMORY);
6170                 } else {
6171                         if (pd.tos & IPTOS_LOWDELAY)
6172                                 pd.pf_mtag->qid = r->pqid;
6173                         else
6174                                 pd.pf_mtag->qid = r->qid;
6175                         pd.pf_mtag->hdr = h;    /* Add hints for ECN. */
6176                 }
6177         }
6178 #endif /* ALTQ */
6179
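        /*
         * See the comment in pf_test() about connections redirected to
         * loopback; this is the IPv6 counterpart of that check.
         */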
6180         if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
6181             pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
6182             (s->nat_rule.ptr->action == PF_RDR ||
6183             s->nat_rule.ptr->action == PF_BINAT) &&
6184             IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
6185                 m->m_flags |= M_SKIP_FIREWALL;
6186
6187         /* XXX: Anybody working on it?! */
6188         if (r->divert.port)
6189                 printf("pf: divert(9) is not supported for IPv6\n");
6190
6191         if (log) {
6192                 struct pf_rule *lr;
6193
6194                 if (s != NULL && s->nat_rule.ptr != NULL &&
6195                     s->nat_rule.ptr->log & PF_LOG_ALL)
6196                         lr = s->nat_rule.ptr;
6197                 else
6198                         lr = r;
6199                 PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
6200                     &pd, (s == NULL));
6201         }
6202
6203         kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
6204         kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
6205
6206         if (action == PF_PASS || r->action == PF_DROP) {
6207                 dirndx = (dir == PF_OUT);
6208                 r->packets[dirndx]++;
6209                 r->bytes[dirndx] += pd.tot_len;
6210                 if (a != NULL) {
6211                         a->packets[dirndx]++;
6212                         a->bytes[dirndx] += pd.tot_len;
6213                 }
6214                 if (s != NULL) {
6215                         if (s->nat_rule.ptr != NULL) {
6216                                 s->nat_rule.ptr->packets[dirndx]++;
6217                                 s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
6218                         }
6219                         if (s->src_node != NULL) {
6220                                 s->src_node->packets[dirndx]++;
6221                                 s->src_node->bytes[dirndx] += pd.tot_len;
6222                         }
6223                         if (s->nat_src_node != NULL) {
6224                                 s->nat_src_node->packets[dirndx]++;
6225                                 s->nat_src_node->bytes[dirndx] += pd.tot_len;
6226                         }
6227                         dirndx = (dir == s->direction) ? 0 : 1;
6228                         s->packets[dirndx]++;
6229                         s->bytes[dirndx] += pd.tot_len;
6230                 }
6231                 tr = r;
6232                 nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
6233                 if (nr != NULL && r == &V_pf_default_rule)
6234                         tr = nr;
6235                 if (tr->src.addr.type == PF_ADDR_TABLE)
6236                         pfr_update_stats(tr->src.addr.p.tbl,
6237                             (s == NULL) ? pd.src :
6238                             &s->key[(s->direction == PF_IN)]->addr[0],
6239                             pd.af, pd.tot_len, dir == PF_OUT,
6240                             r->action == PF_PASS, tr->src.neg);
6241                 if (tr->dst.addr.type == PF_ADDR_TABLE)
6242                         pfr_update_stats(tr->dst.addr.p.tbl,
6243                             (s == NULL) ? pd.dst :
6244                             &s->key[(s->direction == PF_IN)]->addr[1],
6245                             pd.af, pd.tot_len, dir == PF_OUT,
6246                             r->action == PF_PASS, tr->dst.neg);
6247         }
6248
6249         switch (action) {
6250         case PF_SYNPROXY_DROP:
6251                 m_freem(*m0);
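                /* FALLTHROUGH */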
6252         case PF_DEFER:
6253                 *m0 = NULL;
6254                 action = PF_PASS;
6255                 break;
6256         default:
6257                 /* pf_route6() returns unlocked. */
6258                 if (r->rt) {
6259                         pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd);
6260                         return (action);
6261                 }
6262                 break;
6263         }
6264
6265         if (s)
6266                 PF_STATE_UNLOCK(s);
6267
6268         return (action);
6269 }
6270 #endif /* INET6 */