/* sys/netpfil/pf/if_pfsync.c — pfsync(4) interface, FreeBSD stable/10 */
1 /*-
2  * Copyright (c) 2002 Michael Shalayeff
3  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
19  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
23  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
24  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
25  * THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /*-
29  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
30  *
31  * Permission to use, copy, modify, and distribute this software for any
32  * purpose with or without fee is hereby granted, provided that the above
33  * copyright notice and this permission notice appear in all copies.
34  *
35  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
36  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
37  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
38  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
39  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
40  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
41  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
42  */
43
44 /*
45  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
46  *
47  * Revisions picked from OpenBSD after revision 1.110 import:
48  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
49  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
50  * 1.120, 1.175 - use monotonic time_uptime
51  * 1.122 - reduce number of updates for non-TCP sessions
52  * 1.125, 1.127 - rewrite merge or stale processing
53  * 1.128 - cleanups
54  * 1.146 - bzero() mbuf before sparsely filling it with data
55  * 1.170 - SIOCSIFMTU checks
56  * 1.126, 1.142 - deferred packets processing
57  * 1.173 - correct expire time processing
58  */
59
60 #include <sys/cdefs.h>
61 __FBSDID("$FreeBSD$");
62
63 #include "opt_inet.h"
64 #include "opt_inet6.h"
65 #include "opt_pf.h"
66
67 #include <sys/param.h>
68 #include <sys/bus.h>
69 #include <sys/endian.h>
70 #include <sys/interrupt.h>
71 #include <sys/kernel.h>
72 #include <sys/lock.h>
73 #include <sys/mbuf.h>
74 #include <sys/module.h>
75 #include <sys/mutex.h>
76 #include <sys/priv.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/sockio.h>
80 #include <sys/sysctl.h>
81
82 #include <net/bpf.h>
83 #include <net/if.h>
84 #include <net/if_clone.h>
85 #include <net/if_types.h>
86 #include <net/pfvar.h>
87 #include <net/if_pfsync.h>
88
89 #include <netinet/if_ether.h>
90 #include <netinet/in.h>
91 #include <netinet/in_var.h>
92 #include <netinet/ip.h>
93 #include <netinet/ip_carp.h>
94 #include <netinet/ip_var.h>
95 #include <netinet/tcp.h>
96 #include <netinet/tcp_fsm.h>
97 #include <netinet/tcp_seq.h>
98
99 #define PFSYNC_MINPKT ( \
100         sizeof(struct ip) + \
101         sizeof(struct pfsync_header) + \
102         sizeof(struct pfsync_subheader) )
103
104 struct pfsync_pkt {
105         struct ip *ip;
106         struct in_addr src;
107         u_int8_t flags;
108 };
109
110 static int      pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
111                     struct pfsync_state_peer *);
112 static int      pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
113 static int      pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
114 static int      pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
115 static int      pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
116 static int      pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
117 static int      pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
118 static int      pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
119 static int      pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
120 static int      pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
121 static int      pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
122 static int      pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
123 static int      pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
124
125 static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
126         pfsync_in_clr,                  /* PFSYNC_ACT_CLR */
127         pfsync_in_ins,                  /* PFSYNC_ACT_INS */
128         pfsync_in_iack,                 /* PFSYNC_ACT_INS_ACK */
129         pfsync_in_upd,                  /* PFSYNC_ACT_UPD */
130         pfsync_in_upd_c,                /* PFSYNC_ACT_UPD_C */
131         pfsync_in_ureq,                 /* PFSYNC_ACT_UPD_REQ */
132         pfsync_in_del,                  /* PFSYNC_ACT_DEL */
133         pfsync_in_del_c,                /* PFSYNC_ACT_DEL_C */
134         pfsync_in_error,                /* PFSYNC_ACT_INS_F */
135         pfsync_in_error,                /* PFSYNC_ACT_DEL_F */
136         pfsync_in_bus,                  /* PFSYNC_ACT_BUS */
137         pfsync_in_tdb,                  /* PFSYNC_ACT_TDB */
138         pfsync_in_eof                   /* PFSYNC_ACT_EOF */
139 };
140
141 struct pfsync_q {
142         void            (*write)(struct pf_state *, void *);
143         size_t          len;
144         u_int8_t        action;
145 };
146
147 /* we have one of these for every PFSYNC_S_ */
148 static void     pfsync_out_state(struct pf_state *, void *);
149 static void     pfsync_out_iack(struct pf_state *, void *);
150 static void     pfsync_out_upd_c(struct pf_state *, void *);
151 static void     pfsync_out_del(struct pf_state *, void *);
152
153 static struct pfsync_q pfsync_qs[] = {
154         { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
155         { pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
156         { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
157         { pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
158         { pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
159 };
160
161 static void     pfsync_q_ins(struct pf_state *, int);
162 static void     pfsync_q_del(struct pf_state *);
163
164 static void     pfsync_update_state(struct pf_state *);
165
166 struct pfsync_upd_req_item {
167         TAILQ_ENTRY(pfsync_upd_req_item)        ur_entry;
168         struct pfsync_upd_req                   ur_msg;
169 };
170
171 struct pfsync_deferral {
172         struct pfsync_softc             *pd_sc;
173         TAILQ_ENTRY(pfsync_deferral)    pd_entry;
174         u_int                           pd_refs;
175         struct callout                  pd_tmo;
176
177         struct pf_state                 *pd_st;
178         struct mbuf                     *pd_m;
179 };
180
181 struct pfsync_softc {
182         /* Configuration */
183         struct ifnet            *sc_ifp;
184         struct ifnet            *sc_sync_if;
185         struct ip_moptions      sc_imo;
186         struct in_addr          sc_sync_peer;
187         uint32_t                sc_flags;
188 #define PFSYNCF_OK              0x00000001
189 #define PFSYNCF_DEFER           0x00000002
190 #define PFSYNCF_PUSH            0x00000004
191         uint8_t                 sc_maxupdates;
192         struct ip               sc_template;
193         struct callout          sc_tmo;
194         struct mtx              sc_mtx;
195
196         /* Queued data */
197         size_t                  sc_len;
198         TAILQ_HEAD(, pf_state)                  sc_qs[PFSYNC_S_COUNT];
199         TAILQ_HEAD(, pfsync_upd_req_item)       sc_upd_req_list;
200         TAILQ_HEAD(, pfsync_deferral)           sc_deferrals;
201         u_int                   sc_deferred;
202         void                    *sc_plus;
203         size_t                  sc_pluslen;
204
205         /* Bulk update info */
206         struct mtx              sc_bulk_mtx;
207         uint32_t                sc_ureq_sent;
208         int                     sc_bulk_tries;
209         uint32_t                sc_ureq_received;
210         int                     sc_bulk_hashid;
211         uint64_t                sc_bulk_stateid;
212         uint32_t                sc_bulk_creatorid;
213         struct callout          sc_bulk_tmo;
214         struct callout          sc_bulkfail_tmo;
215 };
216
217 #define PFSYNC_LOCK(sc)         mtx_lock(&(sc)->sc_mtx)
218 #define PFSYNC_UNLOCK(sc)       mtx_unlock(&(sc)->sc_mtx)
219 #define PFSYNC_LOCK_ASSERT(sc)  mtx_assert(&(sc)->sc_mtx, MA_OWNED)
220
221 #define PFSYNC_BLOCK(sc)        mtx_lock(&(sc)->sc_bulk_mtx)
222 #define PFSYNC_BUNLOCK(sc)      mtx_unlock(&(sc)->sc_bulk_mtx)
223 #define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
224
225 static const char pfsyncname[] = "pfsync";
226 static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
227 static VNET_DEFINE(struct pfsync_softc  *, pfsyncif) = NULL;
228 #define V_pfsyncif              VNET(pfsyncif)
229 static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
230 #define V_pfsync_swi_cookie     VNET(pfsync_swi_cookie)
231 static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
232 #define V_pfsyncstats           VNET(pfsyncstats)
233 static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
234 #define V_pfsync_carp_adj       VNET(pfsync_carp_adj)
235
236 static void     pfsync_timeout(void *);
237 static void     pfsync_push(struct pfsync_softc *);
238 static void     pfsyncintr(void *);
239 static int      pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
240                     void *);
241 static void     pfsync_multicast_cleanup(struct pfsync_softc *);
242 static void     pfsync_pointers_init(void);
243 static void     pfsync_pointers_uninit(void);
244 static int      pfsync_init(void);
245 static void     pfsync_uninit(void);
246
247 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
248 SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
249     &VNET_NAME(pfsyncstats), pfsyncstats,
250     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
251 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
252     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
253
254 static int      pfsync_clone_create(struct if_clone *, int, caddr_t);
255 static void     pfsync_clone_destroy(struct ifnet *);
256 static int      pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
257                     struct pf_state_peer *);
258 static int      pfsyncoutput(struct ifnet *, struct mbuf *,
259                     const struct sockaddr *, struct route *);
260 static int      pfsyncioctl(struct ifnet *, u_long, caddr_t);
261
262 static int      pfsync_defer(struct pf_state *, struct mbuf *);
263 static void     pfsync_undefer(struct pfsync_deferral *, int);
264 static void     pfsync_undefer_state(struct pf_state *, int);
265 static void     pfsync_defer_tmo(void *);
266
267 static void     pfsync_request_update(u_int32_t, u_int64_t);
268 static void     pfsync_update_state_req(struct pf_state *);
269
270 static void     pfsync_drop(struct pfsync_softc *);
271 static void     pfsync_sendout(int);
272 static void     pfsync_send_plus(void *, size_t);
273
274 static void     pfsync_bulk_start(void);
275 static void     pfsync_bulk_status(u_int8_t);
276 static void     pfsync_bulk_update(void *);
277 static void     pfsync_bulk_fail(void *);
278
279 #ifdef IPSEC
280 static void     pfsync_update_net_tdb(struct pfsync_tdb *);
281 #endif
282
283 #define PFSYNC_MAX_BULKTRIES    12
284
285 VNET_DEFINE(struct if_clone *, pfsync_cloner);
286 #define V_pfsync_cloner VNET(pfsync_cloner)
287
288 static int
289 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
290 {
291         struct pfsync_softc *sc;
292         struct ifnet *ifp;
293         int q;
294
295         if (unit != 0)
296                 return (EINVAL);
297
298         sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
299         sc->sc_flags |= PFSYNCF_OK;
300
301         for (q = 0; q < PFSYNC_S_COUNT; q++)
302                 TAILQ_INIT(&sc->sc_qs[q]);
303
304         TAILQ_INIT(&sc->sc_upd_req_list);
305         TAILQ_INIT(&sc->sc_deferrals);
306
307         sc->sc_len = PFSYNC_MINPKT;
308         sc->sc_maxupdates = 128;
309
310         ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
311         if (ifp == NULL) {
312                 free(sc, M_PFSYNC);
313                 return (ENOSPC);
314         }
315         if_initname(ifp, pfsyncname, unit);
316         ifp->if_softc = sc;
317         ifp->if_ioctl = pfsyncioctl;
318         ifp->if_output = pfsyncoutput;
319         ifp->if_type = IFT_PFSYNC;
320         ifp->if_snd.ifq_maxlen = ifqmaxlen;
321         ifp->if_hdrlen = sizeof(struct pfsync_header);
322         ifp->if_mtu = ETHERMTU;
323         mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
324         mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
325         callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
326         callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
327         callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
328
329         if_attach(ifp);
330
331         bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
332
333         V_pfsyncif = sc;
334
335         return (0);
336 }
337
338 static void
339 pfsync_clone_destroy(struct ifnet *ifp)
340 {
341         struct pfsync_softc *sc = ifp->if_softc;
342
343         /*
344          * At this stage, everything should have already been
345          * cleared by pfsync_uninit(), and we have only to
346          * drain callouts.
347          */
348         while (sc->sc_deferred > 0) {
349                 struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
350
351                 TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
352                 sc->sc_deferred--;
353                 if (callout_stop(&pd->pd_tmo)) {
354                         pf_release_state(pd->pd_st);
355                         m_freem(pd->pd_m);
356                         free(pd, M_PFSYNC);
357                 } else {
358                         pd->pd_refs++;
359                         callout_drain(&pd->pd_tmo);
360                         free(pd, M_PFSYNC);
361                 }
362         }
363
364         callout_drain(&sc->sc_tmo);
365         callout_drain(&sc->sc_bulkfail_tmo);
366         callout_drain(&sc->sc_bulk_tmo);
367
368         if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
369                 (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
370         bpfdetach(ifp);
371         if_detach(ifp);
372
373         pfsync_drop(sc);
374
375         if_free(ifp);
376         if (sc->sc_imo.imo_membership)
377                 pfsync_multicast_cleanup(sc);
378         mtx_destroy(&sc->sc_mtx);
379         mtx_destroy(&sc->sc_bulk_mtx);
380         free(sc, M_PFSYNC);
381
382         V_pfsyncif = NULL;
383 }
384
385 static int
386 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
387     struct pf_state_peer *d)
388 {
389         if (s->scrub.scrub_flag && d->scrub == NULL) {
390                 d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
391                 if (d->scrub == NULL)
392                         return (ENOMEM);
393         }
394
395         return (0);
396 }
397
398
399 static int
400 pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
401 {
402         struct pfsync_softc *sc = V_pfsyncif;
403         struct pf_state *st = NULL;
404         struct pf_state_key *skw = NULL, *sks = NULL;
405         struct pf_rule *r = NULL;
406         struct pfi_kif  *kif;
407         int error;
408
409         PF_RULES_RASSERT();
410
411         if (sp->creatorid == 0) {
412                 if (V_pf_status.debug >= PF_DEBUG_MISC)
413                         printf("%s: invalid creator id: %08x\n", __func__,
414                             ntohl(sp->creatorid));
415                 return (EINVAL);
416         }
417
418         if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
419                 if (V_pf_status.debug >= PF_DEBUG_MISC)
420                         printf("%s: unknown interface: %s\n", __func__,
421                             sp->ifname);
422                 if (flags & PFSYNC_SI_IOCTL)
423                         return (EINVAL);
424                 return (0);     /* skip this state */
425         }
426
427         /*
428          * If the ruleset checksums match or the state is coming from the ioctl,
429          * it's safe to associate the state with the rule of that number.
430          */
431         if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
432             (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
433             pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
434                 r = pf_main_ruleset.rules[
435                     PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
436         else
437                 r = &V_pf_default_rule;
438
439         if ((r->max_states &&
440             counter_u64_fetch(r->states_cur) >= r->max_states))
441                 goto cleanup;
442
443         /*
444          * XXXGL: consider M_WAITOK in ioctl path after.
445          */
446         if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
447                 goto cleanup;
448
449         if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
450                 goto cleanup;
451
452         if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
453             &sp->key[PF_SK_STACK].addr[0], sp->af) ||
454             PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
455             &sp->key[PF_SK_STACK].addr[1], sp->af) ||
456             sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
457             sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
458                 sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
459                 if (sks == NULL)
460                         goto cleanup;
461         } else
462                 sks = skw;
463
464         /* allocate memory for scrub info */
465         if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
466             pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
467                 goto cleanup;
468
469         /* copy to state key(s) */
470         skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
471         skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
472         skw->port[0] = sp->key[PF_SK_WIRE].port[0];
473         skw->port[1] = sp->key[PF_SK_WIRE].port[1];
474         skw->proto = sp->proto;
475         skw->af = sp->af;
476         if (sks != skw) {
477                 sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
478                 sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
479                 sks->port[0] = sp->key[PF_SK_STACK].port[0];
480                 sks->port[1] = sp->key[PF_SK_STACK].port[1];
481                 sks->proto = sp->proto;
482                 sks->af = sp->af;
483         }
484
485         /* copy to state */
486         bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
487         st->creation = time_uptime - ntohl(sp->creation);
488         st->expire = time_uptime;
489         if (sp->expire) {
490                 uint32_t timeout;
491
492                 timeout = r->timeout[sp->timeout];
493                 if (!timeout)
494                         timeout = V_pf_default_rule.timeout[sp->timeout];
495
496                 /* sp->expire may have been adaptively scaled by export. */
497                 st->expire -= timeout - ntohl(sp->expire);
498         }
499
500         st->direction = sp->direction;
501         st->log = sp->log;
502         st->timeout = sp->timeout;
503         st->state_flags = sp->state_flags;
504
505         st->id = sp->id;
506         st->creatorid = sp->creatorid;
507         pf_state_peer_ntoh(&sp->src, &st->src);
508         pf_state_peer_ntoh(&sp->dst, &st->dst);
509
510         st->rule.ptr = r;
511         st->nat_rule.ptr = NULL;
512         st->anchor.ptr = NULL;
513         st->rt_kif = NULL;
514
515         st->pfsync_time = time_uptime;
516         st->sync_state = PFSYNC_S_NONE;
517
518         if (!(flags & PFSYNC_SI_IOCTL))
519                 st->state_flags |= PFSTATE_NOSYNC;
520
521         if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
522                 goto cleanup_state;
523
524         /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
525         counter_u64_add(r->states_cur, 1);
526         counter_u64_add(r->states_tot, 1);
527
528         if (!(flags & PFSYNC_SI_IOCTL)) {
529                 st->state_flags &= ~PFSTATE_NOSYNC;
530                 if (st->state_flags & PFSTATE_ACK) {
531                         pfsync_q_ins(st, PFSYNC_S_IACK);
532                         pfsync_push(sc);
533                 }
534         }
535         st->state_flags &= ~PFSTATE_ACK;
536         PF_STATE_UNLOCK(st);
537
538         return (0);
539
540 cleanup:
541         error = ENOMEM;
542         if (skw == sks)
543                 sks = NULL;
544         if (skw != NULL)
545                 uma_zfree(V_pf_state_key_z, skw);
546         if (sks != NULL)
547                 uma_zfree(V_pf_state_key_z, sks);
548
549 cleanup_state:  /* pf_state_insert() frees the state keys. */
550         if (st) {
551                 if (st->dst.scrub)
552                         uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
553                 if (st->src.scrub)
554                         uma_zfree(V_pf_state_scrub_z, st->src.scrub);
555                 uma_zfree(V_pf_state_z, st);
556         }
557         return (error);
558 }
559
560 static void
561 pfsync_input(struct mbuf *m, __unused int off)
562 {
563         struct pfsync_softc *sc = V_pfsyncif;
564         struct pfsync_pkt pkt;
565         struct ip *ip = mtod(m, struct ip *);
566         struct pfsync_header *ph;
567         struct pfsync_subheader subh;
568
569         int offset, len;
570         int rv;
571         uint16_t count;
572
573         V_pfsyncstats.pfsyncs_ipackets++;
574
575         /* Verify that we have a sync interface configured. */
576         if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
577             (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
578                 goto done;
579
580         /* verify that the packet came in on the right interface */
581         if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
582                 V_pfsyncstats.pfsyncs_badif++;
583                 goto done;
584         }
585
586         sc->sc_ifp->if_ipackets++;
587         sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
588         /* verify that the IP TTL is 255. */
589         if (ip->ip_ttl != PFSYNC_DFLTTL) {
590                 V_pfsyncstats.pfsyncs_badttl++;
591                 goto done;
592         }
593
594         offset = ip->ip_hl << 2;
595         if (m->m_pkthdr.len < offset + sizeof(*ph)) {
596                 V_pfsyncstats.pfsyncs_hdrops++;
597                 goto done;
598         }
599
600         if (offset + sizeof(*ph) > m->m_len) {
601                 if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
602                         V_pfsyncstats.pfsyncs_hdrops++;
603                         return;
604                 }
605                 ip = mtod(m, struct ip *);
606         }
607         ph = (struct pfsync_header *)((char *)ip + offset);
608
609         /* verify the version */
610         if (ph->version != PFSYNC_VERSION) {
611                 V_pfsyncstats.pfsyncs_badver++;
612                 goto done;
613         }
614
615         len = ntohs(ph->len) + offset;
616         if (m->m_pkthdr.len < len) {
617                 V_pfsyncstats.pfsyncs_badlen++;
618                 goto done;
619         }
620
621         /* Cheaper to grab this now than having to mess with mbufs later */
622         pkt.ip = ip;
623         pkt.src = ip->ip_src;
624         pkt.flags = 0;
625
626         /*
627          * Trusting pf_chksum during packet processing, as well as seeking
628          * in interface name tree, require holding PF_RULES_RLOCK().
629          */
630         PF_RULES_RLOCK();
631         if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
632                 pkt.flags |= PFSYNC_SI_CKSUM;
633
634         offset += sizeof(*ph);
635         while (offset <= len - sizeof(subh)) {
636                 m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
637                 offset += sizeof(subh);
638
639                 if (subh.action >= PFSYNC_ACT_MAX) {
640                         V_pfsyncstats.pfsyncs_badact++;
641                         PF_RULES_RUNLOCK();
642                         goto done;
643                 }
644
645                 count = ntohs(subh.count);
646                 V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
647                 rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
648                 if (rv == -1) {
649                         PF_RULES_RUNLOCK();
650                         return;
651                 }
652
653                 offset += rv;
654         }
655         PF_RULES_RUNLOCK();
656
657 done:
658         m_freem(m);
659 }
660
661 static int
662 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
663 {
664         struct pfsync_clr *clr;
665         struct mbuf *mp;
666         int len = sizeof(*clr) * count;
667         int i, offp;
668         u_int32_t creatorid;
669
670         mp = m_pulldown(m, offset, len, &offp);
671         if (mp == NULL) {
672                 V_pfsyncstats.pfsyncs_badlen++;
673                 return (-1);
674         }
675         clr = (struct pfsync_clr *)(mp->m_data + offp);
676
677         for (i = 0; i < count; i++) {
678                 creatorid = clr[i].creatorid;
679
680                 if (clr[i].ifname[0] != '\0' &&
681                     pfi_kif_find(clr[i].ifname) == NULL)
682                         continue;
683
684                 for (int i = 0; i <= V_pf_hashmask; i++) {
685                         struct pf_idhash *ih = &V_pf_idhash[i];
686                         struct pf_state *s;
687 relock:
688                         PF_HASHROW_LOCK(ih);
689                         LIST_FOREACH(s, &ih->states, entry) {
690                                 if (s->creatorid == creatorid) {
691                                         s->state_flags |= PFSTATE_NOSYNC;
692                                         pf_unlink_state(s, PF_ENTER_LOCKED);
693                                         goto relock;
694                                 }
695                         }
696                         PF_HASHROW_UNLOCK(ih);
697                 }
698         }
699
700         return (len);
701 }
702
703 static int
704 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
705 {
706         struct mbuf *mp;
707         struct pfsync_state *sa, *sp;
708         int len = sizeof(*sp) * count;
709         int i, offp;
710
711         mp = m_pulldown(m, offset, len, &offp);
712         if (mp == NULL) {
713                 V_pfsyncstats.pfsyncs_badlen++;
714                 return (-1);
715         }
716         sa = (struct pfsync_state *)(mp->m_data + offp);
717
718         for (i = 0; i < count; i++) {
719                 sp = &sa[i];
720
721                 /* Check for invalid values. */
722                 if (sp->timeout >= PFTM_MAX ||
723                     sp->src.state > PF_TCPS_PROXY_DST ||
724                     sp->dst.state > PF_TCPS_PROXY_DST ||
725                     sp->direction > PF_OUT ||
726                     (sp->af != AF_INET && sp->af != AF_INET6)) {
727                         if (V_pf_status.debug >= PF_DEBUG_MISC)
728                                 printf("%s: invalid value\n", __func__);
729                         V_pfsyncstats.pfsyncs_badval++;
730                         continue;
731                 }
732
733                 if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
734                         /* Drop out, but process the rest of the actions. */
735                         break;
736         }
737
738         return (len);
739 }
740
741 static int
742 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
743 {
744         struct pfsync_ins_ack *ia, *iaa;
745         struct pf_state *st;
746
747         struct mbuf *mp;
748         int len = count * sizeof(*ia);
749         int offp, i;
750
751         mp = m_pulldown(m, offset, len, &offp);
752         if (mp == NULL) {
753                 V_pfsyncstats.pfsyncs_badlen++;
754                 return (-1);
755         }
756         iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
757
758         for (i = 0; i < count; i++) {
759                 ia = &iaa[i];
760
761                 st = pf_find_state_byid(ia->id, ia->creatorid);
762                 if (st == NULL)
763                         continue;
764
765                 if (st->state_flags & PFSTATE_ACK) {
766                         PFSYNC_LOCK(V_pfsyncif);
767                         pfsync_undefer_state(st, 0);
768                         PFSYNC_UNLOCK(V_pfsyncif);
769                 }
770                 PF_STATE_UNLOCK(st);
771         }
772         /*
773          * XXX this is not yet implemented, but we know the size of the
774          * message so we can skip it.
775          */
776
777         return (count * sizeof(struct pfsync_ins_ack));
778 }
779
780 static int
781 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
782     struct pfsync_state_peer *dst)
783 {
784         int sync = 0;
785
786         PF_STATE_LOCK_ASSERT(st);
787
788         /*
789          * The state should never go backwards except
790          * for syn-proxy states.  Neither should the
791          * sequence window slide backwards.
792          */
793         if ((st->src.state > src->state &&
794             (st->src.state < PF_TCPS_PROXY_SRC ||
795             src->state >= PF_TCPS_PROXY_SRC)) ||
796
797             (st->src.state == src->state &&
798             SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
799                 sync++;
800         else
801                 pf_state_peer_ntoh(src, &st->src);
802
803         if ((st->dst.state > dst->state) ||
804
805             (st->dst.state >= TCPS_SYN_SENT &&
806             SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
807                 sync++;
808         else
809                 pf_state_peer_ntoh(dst, &st->dst);
810
811         return (sync);
812 }
813
/*
 * Process a PFSYNC_ACT_UPD message: an array of 'count' full
 * struct pfsync_state records.  Unknown states are imported as new
 * ones; known states are merged, and when our local copy turns out to
 * be newer (sync != 0) an update of our own is queued and pushed.
 * Returns the number of payload bytes consumed, or -1 if the packet
 * is too short (caller stops processing).
 */
static int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int sync;

	struct mbuf *mp;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_UPD: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		/* Returns the state locked, when found. */
		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			/* insert the update */
			if (pfsync_state_import(sp, 0))
				V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		/* The peer clearly knows the state; cancel any deferral. */
		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machine always go
			 * forwards
			 */
			if (st->src.state > sp->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->src, &st->src);
			if (st->dst.state > sp->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->dst, &st->dst);
		}
		/* Unless both peers were stale, refresh timers from the update. */
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
			pf_state_peer_ntoh(&sp->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = sp->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			V_pfsyncstats.pfsyncs_stale++;

			/* Our copy is newer: schedule our own update. */
			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}
903
/*
 * Process a PFSYNC_ACT_UPD_C message: an array of 'count' compressed
 * updates (struct pfsync_upd_c).  Same merge logic as pfsync_in_upd(),
 * except an unknown state cannot be imported from the compressed form,
 * so a full update is requested from the peer instead.
 * Returns the number of payload bytes consumed, or -1 on bad length.
 */
static int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_c *ua, *up;
	struct pf_state *st;
	int len = count * sizeof(*up);
	int sync;
	struct mbuf *mp;
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ua = (struct pfsync_upd_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		up = &ua[i];

		/* check for invalid values */
		if (up->timeout >= PFTM_MAX ||
		    up->src.state > PF_TCPS_PROXY_DST ||
		    up->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: "
				    "PFSYNC_ACT_UPD_C: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(up->id, up->creatorid);
		if (st == NULL) {
			/* We don't have this state. Ask for it. */
			PFSYNC_LOCK(sc);
			pfsync_request_update(up->creatorid, up->id);
			PFSYNC_UNLOCK(sc);
			continue;
		}

		/* The peer clearly knows the state; cancel any deferral. */
		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machine always go
			 * forwards
			 */
			if (st->src.state > up->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->src, &st->src);
			if (st->dst.state > up->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->dst, &st->dst);
		}
		/* Unless both peers were stale, refresh timers from the update. */
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
			pf_state_peer_ntoh(&up->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = up->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			V_pfsyncstats.pfsyncs_stale++;

			/* Our copy is newer: schedule our own update. */
			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}
994
/*
 * Process a PFSYNC_ACT_UPD_REQ message: the peer asks us to send
 * updates.  An all-zero (id, creatorid) pair requests a full bulk
 * update; otherwise a single state update is queued, unless the state
 * is unknown or flagged PFSTATE_NOSYNC.
 * Returns the number of payload bytes consumed, or -1 on bad length.
 */
static int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_req *ur, *ura;
	struct mbuf *mp;
	int len = count * sizeof(*ur);
	int i, offp;

	struct pf_state *st;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ura = (struct pfsync_upd_req *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ur = &ura[i];

		if (ur->id == 0 && ur->creatorid == 0)
			pfsync_bulk_start();
		else {
			/* Returns the state locked, when found. */
			st = pf_find_state_byid(ur->id, ur->creatorid);
			if (st == NULL) {
				V_pfsyncstats.pfsyncs_badstate++;
				continue;
			}
			if (st->state_flags & PFSTATE_NOSYNC) {
				PF_STATE_UNLOCK(st);
				continue;
			}

			pfsync_update_state_req(st);
			PF_STATE_UNLOCK(st);
		}
	}

	return (len);
}
1035
/*
 * Process a PFSYNC_ACT_DEL message: unlink each referenced state.
 * Returns the number of payload bytes consumed, or -1 on bad length.
 */
static int
pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}
		/* The peer already dropped it; don't echo a delete back. */
		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}
1066
/*
 * Process a PFSYNC_ACT_DEL_C message: compressed deletes carrying only
 * (id, creatorid).  Unlink each referenced state.
 * Returns the number of payload bytes consumed, or -1 on bad length.
 */
static int
pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct mbuf *mp;
	struct pfsync_del_c *sa, *sp;
	struct pf_state *st;
	int len = count * sizeof(*sp);
	int offp, i;

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_del_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		/* The peer already dropped it; don't echo a delete back. */
		st->state_flags |= PFSTATE_NOSYNC;
		pf_unlink_state(st, PF_ENTER_LOCKED);
	}

	return (len);
}
1098
/*
 * Process a PFSYNC_ACT_BUS (bulk update status) message.  Only
 * meaningful while we have an outstanding bulk update request
 * (sc_ureq_sent != 0): BUS_START re-arms the failure timer scaled to
 * the expected transfer size, BUS_END with a plausible timestamp ends
 * the bulk update and withdraws the carp demotion.
 * Returns the number of payload bytes consumed, or -1 on bad length.
 */
static int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_bus *bus;
	struct mbuf *mp;
	int len = count * sizeof(*bus);
	int offp;

	PFSYNC_BLOCK(sc);

	/* If we're not waiting for a bulk update, who cares. */
	if (sc->sc_ureq_sent == 0) {
		PFSYNC_BUNLOCK(sc);
		return (len);
	}

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		PFSYNC_BUNLOCK(sc);
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	bus = (struct pfsync_bus *)(mp->m_data + offp);

	switch (bus->status) {
	case PFSYNC_BUS_START:
		/*
		 * Allow 4 seconds plus one tick per state that fits in a
		 * packet, given the state limit and the syncdev MTU.
		 */
		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
		    V_pf_limits[PF_LIMIT_STATES].limit /
		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
		    sizeof(struct pfsync_state)),
		    pfsync_bulk_fail, sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: received bulk update start\n");
		break;

	case PFSYNC_BUS_END:
		/* Accept only an end stamped after our request was sent. */
		if (time_uptime - ntohl(bus->endtime) >=
		    sc->sc_ureq_sent) {
			/* that's it, we're happy */
			sc->sc_ureq_sent = 0;
			sc->sc_bulk_tries = 0;
			callout_stop(&sc->sc_bulkfail_tmo);
			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
				    "pfsync bulk done");
			sc->sc_flags |= PFSYNCF_OK;
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received valid "
				    "bulk update end\n");
		} else {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received invalid "
				    "bulk update end: bad timestamp\n");
		}
		break;
	}
	PFSYNC_BUNLOCK(sc);

	return (len);
}
1160
1161 static int
1162 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1163 {
1164         int len = count * sizeof(struct pfsync_tdb);
1165
1166 #if defined(IPSEC)
1167         struct pfsync_tdb *tp;
1168         struct mbuf *mp;
1169         int offp;
1170         int i;
1171         int s;
1172
1173         mp = m_pulldown(m, offset, len, &offp);
1174         if (mp == NULL) {
1175                 V_pfsyncstats.pfsyncs_badlen++;
1176                 return (-1);
1177         }
1178         tp = (struct pfsync_tdb *)(mp->m_data + offp);
1179
1180         for (i = 0; i < count; i++)
1181                 pfsync_update_net_tdb(&tp[i]);
1182 #endif
1183
1184         return (len);
1185 }
1186
#if defined(IPSEC)
/*
 * Update an in-kernel tdb. Silently fail if no tdb is found.
 *
 * NOTE(review): gettdb() and struct tdb are OpenBSD IPsec interfaces;
 * this block appears inherited from OpenBSD and presumably does not
 * compile on FreeBSD with IPSEC defined — verify before enabling.
 */
static void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
	struct tdb		*tdb;
	/* Removed unused 'int s;' (leftover spl variable from OpenBSD). */

	/* check for invalid values */
	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
	    (pt->dst.sa.sa_family != AF_INET &&
	    pt->dst.sa.sa_family != AF_INET6))
		goto bad;

	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
	if (tdb) {
		pt->rpl = ntohl(pt->rpl);
		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);

		/* Neither replay nor byte counter should ever decrease. */
		if (pt->rpl < tdb->tdb_rpl ||
		    pt->cur_bytes < tdb->tdb_cur_bytes) {
			goto bad;
		}

		tdb->tdb_rpl = pt->rpl;
		tdb->tdb_cur_bytes = pt->cur_bytes;
	}
	return;

bad:
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
		    "invalid value\n");
	V_pfsyncstats.pfsyncs_badstate++;
	return;
}
#endif
1225
1226
1227 static int
1228 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1229 {
1230         /* check if we are at the right place in the packet */
1231         if (offset != m->m_pkthdr.len)
1232                 V_pfsyncstats.pfsyncs_badlen++;
1233
1234         /* we're done. free and let the caller return */
1235         m_freem(m);
1236         return (-1);
1237 }
1238
1239 static int
1240 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1241 {
1242         V_pfsyncstats.pfsyncs_badact++;
1243
1244         m_freem(m);
1245         return (-1);
1246 }
1247
/*
 * if_output method: pfsync never transmits packets handed to it through
 * the generic interface output path; they are silently discarded.
 */
static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
	struct route *rt)
{

	m_freem(m);

	return (0);
}
1255
/* ARGSUSED */
/*
 * Interface ioctl handler: interface up/down, MTU changes, and
 * getting (SIOCGETPFSYNC) / setting (SIOCSETPFSYNC) the pfsync
 * configuration: syncdev, syncpeer, maxupdates and the defer flag.
 * Setting a syncdev also kicks off a bulk update request.
 */
static int
pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct pfsync_softc *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	struct pfsyncreq pfsyncr;
	int error;

	switch (cmd) {
	case SIOCSIFFLAGS:
		PFSYNC_LOCK(sc);
		if (ifp->if_flags & IFF_UP) {
			ifp->if_drv_flags |= IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			/* Hook pfsync into pf via the function pointers. */
			pfsync_pointers_init();
		} else {
			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
			PFSYNC_UNLOCK(sc);
			pfsync_pointers_uninit();
		}
		break;
	case SIOCSIFMTU:
		if (!sc->sc_sync_if ||
		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
			return (EINVAL);
		if (ifr->ifr_mtu < ifp->if_mtu) {
			/* Flush the pending packet before shrinking the MTU. */
			PFSYNC_LOCK(sc);
			if (sc->sc_len > PFSYNC_MINPKT)
				pfsync_sendout(1);
			PFSYNC_UNLOCK(sc);
		}
		ifp->if_mtu = ifr->ifr_mtu;
		break;
	case SIOCGETPFSYNC:
		bzero(&pfsyncr, sizeof(pfsyncr));
		PFSYNC_LOCK(sc);
		if (sc->sc_sync_if) {
			strlcpy(pfsyncr.pfsyncr_syncdev,
			    sc->sc_sync_if->if_xname, IFNAMSIZ);
		}
		pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
		pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
		    (sc->sc_flags & PFSYNCF_DEFER));
		PFSYNC_UNLOCK(sc);
		return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));

	case SIOCSETPFSYNC:
	    {
		struct ip_moptions *imo = &sc->sc_imo;
		struct ifnet *sifp;
		struct ip *ip;
		void *mship = NULL;

		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
			return (error);
		if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
			return (error);

		if (pfsyncr.pfsyncr_maxupdates > 255)
			return (EINVAL);

		if (pfsyncr.pfsyncr_syncdev[0] == 0)
			sifp = NULL;
		else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
			return (EINVAL);

		/*
		 * Multicast membership storage is allocated up front,
		 * before taking the lock, since malloc may sleep.
		 */
		if (sifp != NULL && (
		    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
		    pfsyncr.pfsyncr_syncpeer.s_addr ==
		    htonl(INADDR_PFSYNC_GROUP)))
			mship = malloc((sizeof(struct in_multi *) *
			    IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);

		PFSYNC_LOCK(sc);
		if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
			sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
		else
			sc->sc_sync_peer.s_addr =
			    pfsyncr.pfsyncr_syncpeer.s_addr;

		sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
		if (pfsyncr.pfsyncr_defer) {
			sc->sc_flags |= PFSYNCF_DEFER;
			pfsync_defer_ptr = pfsync_defer;
		} else {
			sc->sc_flags &= ~PFSYNCF_DEFER;
			pfsync_defer_ptr = NULL;
		}

		if (sifp == NULL) {
			/* Clearing the syncdev: drop references and stop. */
			if (sc->sc_sync_if)
				if_rele(sc->sc_sync_if);
			sc->sc_sync_if = NULL;
			if (imo->imo_membership)
				pfsync_multicast_cleanup(sc);
			PFSYNC_UNLOCK(sc);
			break;
		}

		/* Flush pending data if the new device cannot carry it. */
		if (sc->sc_len > PFSYNC_MINPKT &&
		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
		    (sc->sc_sync_if != NULL &&
		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
			pfsync_sendout(1);

		if (imo->imo_membership)
			pfsync_multicast_cleanup(sc);

		if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
			error = pfsync_multicast_setup(sc, sifp, mship);
			if (error) {
				/*
				 * NOTE(review): unless
				 * pfsync_multicast_setup() drops it on
				 * failure, this return leaves the softc
				 * mutex (taken above) held — verify.
				 */
				if_rele(sifp);
				free(mship, M_PFSYNC);
				return (error);
			}
		}
		if (sc->sc_sync_if)
			if_rele(sc->sc_sync_if);
		sc->sc_sync_if = sifp;

		ip = &sc->sc_template;
		bzero(ip, sizeof(*ip));
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(sc->sc_template) >> 2;
		ip->ip_tos = IPTOS_LOWDELAY;
		/* len and id are set later. */
		ip->ip_off = htons(IP_DF);
		ip->ip_ttl = PFSYNC_DFLTTL;
		ip->ip_p = IPPROTO_PFSYNC;
		ip->ip_src.s_addr = INADDR_ANY;
		ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;

		/* Request a full state table update. */
		if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(V_pfsync_carp_adj,
			    "pfsync bulk start");
		sc->sc_flags &= ~PFSYNCF_OK;
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: requesting bulk update\n");
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
		PFSYNC_BLOCK(sc);
		sc->sc_ureq_sent = time_uptime;
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
		    sc);
		PFSYNC_BUNLOCK(sc);

		break;
	    }
	default:
		return (ENOTTY);
	}

	return (0);
}
1415
/* Queue writer: serialize a full state record for 'st' into 'buf'. */
static void
pfsync_out_state(struct pf_state *st, void *buf)
{

	pfsync_state_export(buf, st);
}
1423
1424 static void
1425 pfsync_out_iack(struct pf_state *st, void *buf)
1426 {
1427         struct pfsync_ins_ack *iack = buf;
1428
1429         iack->id = st->id;
1430         iack->creatorid = st->creatorid;
1431 }
1432
1433 static void
1434 pfsync_out_upd_c(struct pf_state *st, void *buf)
1435 {
1436         struct pfsync_upd_c *up = buf;
1437
1438         bzero(up, sizeof(*up));
1439         up->id = st->id;
1440         pf_state_peer_hton(&st->src, &up->src);
1441         pf_state_peer_hton(&st->dst, &up->dst);
1442         up->creatorid = st->creatorid;
1443         up->timeout = st->timeout;
1444 }
1445
1446 static void
1447 pfsync_out_del(struct pf_state *st, void *buf)
1448 {
1449         struct pfsync_del_c *dp = buf;
1450
1451         dp->id = st->id;
1452         dp->creatorid = st->creatorid;
1453         st->state_flags |= PFSTATE_NOSYNC;
1454 }
1455
1456 static void
1457 pfsync_drop(struct pfsync_softc *sc)
1458 {
1459         struct pf_state *st, *next;
1460         struct pfsync_upd_req_item *ur;
1461         int q;
1462
1463         for (q = 0; q < PFSYNC_S_COUNT; q++) {
1464                 if (TAILQ_EMPTY(&sc->sc_qs[q]))
1465                         continue;
1466
1467                 TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
1468                         KASSERT(st->sync_state == q,
1469                                 ("%s: st->sync_state == q",
1470                                         __func__));
1471                         st->sync_state = PFSYNC_S_NONE;
1472                         pf_release_state(st);
1473                 }
1474                 TAILQ_INIT(&sc->sc_qs[q]);
1475         }
1476
1477         while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1478                 TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1479                 free(ur, M_PFSYNC);
1480         }
1481
1482         sc->sc_plus = NULL;
1483         sc->sc_len = PFSYNC_MINPKT;
1484 }
1485
/*
 * Assemble one pfsync packet out of everything currently queued — the
 * per-type state queues, pending update requests, an optional caller
 * supplied "plus" region and a trailing EOF subheader — then tap it to
 * bpf and place it on the syncdev send queue (the swi handler does the
 * actual ip_output).  Called with the softc lock held; resets sc_len
 * to the empty-packet size.  'schedswi' schedules the send software
 * interrupt when non-zero.
 */
static void
pfsync_sendout(int schedswi)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct ifnet *ifp = sc->sc_ifp;
	struct mbuf *m;
	struct ip *ip;
	struct pfsync_header *ph;
	struct pfsync_subheader *subh;
	struct pf_state *st;
	struct pfsync_upd_req_item *ur;
	int offset;
	int q, count = 0;

	KASSERT(sc != NULL, ("%s: null sc", __func__));
	KASSERT(sc->sc_len > PFSYNC_MINPKT,
	    ("%s: sc_len %zu", __func__, sc->sc_len));
	PFSYNC_LOCK_ASSERT(sc);

	/* Nobody to deliver to: neither a bpf listener nor a syncdev. */
	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
		pfsync_drop(sc);
		return;
	}

	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL) {
		sc->sc_ifp->if_oerrors++;
		V_pfsyncstats.pfsyncs_onomem++;
		return;
	}
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = sc->sc_len;

	/* build the ip header */
	ip = (struct ip *)m->m_data;
	bcopy(&sc->sc_template, ip, sizeof(*ip));
	offset = sizeof(*ip);

	ip->ip_len = htons(m->m_pkthdr.len);
	ip->ip_id = htons(ip_randomid());

	/* build the pfsync header */
	ph = (struct pfsync_header *)(m->m_data + offset);
	bzero(ph, sizeof(*ph));
	offset += sizeof(*ph);

	ph->version = PFSYNC_VERSION;
	ph->len = htons(sc->sc_len - sizeof(*ip));
	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);

	/* walk the queues */
	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		/* One subheader per non-empty queue, filled in afterwards. */
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			/*
			 * XXXGL: some of write methods do unlocked reads
			 * of state data :(
			 */
			pfsync_qs[q].write(st, m->m_data + offset);
			offset += pfsync_qs[q].len;
			st->sync_state = PFSYNC_S_NONE;
			pf_release_state(st);
			count++;
		}
		TAILQ_INIT(&sc->sc_qs[q]);

		bzero(subh, sizeof(*subh));
		subh->action = pfsync_qs[q].action;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
	}

	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);

			bcopy(&ur->ur_msg, m->m_data + offset,
			    sizeof(ur->ur_msg));
			offset += sizeof(ur->ur_msg);
			free(ur, M_PFSYNC);
			count++;
		}

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_UPD_REQ;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
	}

	/* has someone built a custom region for us to add? */
	if (sc->sc_plus != NULL) {
		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
		offset += sc->sc_pluslen;

		sc->sc_plus = NULL;
	}

	/* Terminate the packet with an EOF subheader. */
	subh = (struct pfsync_subheader *)(m->m_data + offset);
	offset += sizeof(*subh);

	bzero(subh, sizeof(*subh));
	subh->action = PFSYNC_ACT_EOF;
	subh->count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;

	/* we're done, let's put it on the wire */
	if (ifp->if_bpf) {
		/* Tap without the IP header, then restore it. */
		m->m_data += sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
		BPF_MTAP(ifp, m);
		m->m_data -= sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len;
	}

	if (sc->sc_sync_if == NULL) {
		sc->sc_len = PFSYNC_MINPKT;
		m_freem(m);
		return;
	}

	sc->sc_ifp->if_opackets++;
	sc->sc_ifp->if_obytes += m->m_pkthdr.len;
	sc->sc_len = PFSYNC_MINPKT;

	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
	else {
		m_freem(m);
		sc->sc_ifp->if_snd.ifq_drops++;
	}
	if (schedswi)
		swi_sched(V_pfsync_swi_cookie, 0);
}
1632
/*
 * pf hook: a new state was inserted locally; queue a PFSYNC_S_INS
 * record for it unless the state (or its rule, or the pfsync protocol
 * itself) is marked no-sync.  Arms the one-second flush timer when the
 * pending packet was previously empty.
 */
static void
pfsync_insert_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (st->state_flags & PFSTATE_NOSYNC)
		return;

	/* Never sync states created by no-sync rules or by pfsync itself. */
	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
		st->state_flags |= PFSTATE_NOSYNC;
		return;
	}

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state %u", __func__, st->sync_state));

	PFSYNC_LOCK(sc);
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	pfsync_q_ins(st, PFSYNC_S_INS);
	PFSYNC_UNLOCK(sc);

	st->sync_updates = 0;
}
1659
1660 static int
1661 pfsync_defer(struct pf_state *st, struct mbuf *m)
1662 {
1663         struct pfsync_softc *sc = V_pfsyncif;
1664         struct pfsync_deferral *pd;
1665
1666         if (m->m_flags & (M_BCAST|M_MCAST))
1667                 return (0);
1668
1669         PFSYNC_LOCK(sc);
1670
1671         if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
1672             !(sc->sc_flags & PFSYNCF_DEFER)) {
1673                 PFSYNC_UNLOCK(sc);
1674                 return (0);
1675         }
1676
1677          if (sc->sc_deferred >= 128)
1678                 pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
1679
1680         pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
1681         if (pd == NULL)
1682                 return (0);
1683         sc->sc_deferred++;
1684
1685         m->m_flags |= M_SKIP_FIREWALL;
1686         st->state_flags |= PFSTATE_ACK;
1687
1688         pd->pd_sc = sc;
1689         pd->pd_refs = 0;
1690         pd->pd_st = st;
1691         pf_ref_state(st);
1692         pd->pd_m = m;
1693
1694         TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1695         callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1696         callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1697
1698         pfsync_push(sc);
1699
1700         return (1);
1701 }
1702
/*
 * Complete a deferral: unlink it, release its state reference and
 * either drop the held packet or hand it to pfsyncintr for
 * transmission.  Called with the softc lock held.
 */
static void
pfsync_undefer(struct pfsync_deferral *pd, int drop)
{
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
	free(pd, M_PFSYNC);
	pf_release_state(st);

	if (drop)
		m_freem(m);
	else {
		/* Queue on the pfsync interface; the swi sends it. */
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
		pfsync_push(sc);
	}
}
1725
/*
 * Deferral timeout: the peer did not acknowledge the state in time,
 * so stop holding the packet and transmit it ourselves.  Runs as a
 * callout with the softc lock held; per CALLOUT_RETURNUNLOCKED the
 * lock is dropped here before returning.
 */
static void
pfsync_defer_tmo(void *arg)
{
	struct pfsync_deferral *pd = arg;
	struct pfsync_softc *sc = pd->pd_sc;
	/* Cache m/st now: pd may be freed below before they are used. */
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
	/*
	 * NOTE(review): presumably pd_refs != 0 means some other thread
	 * still holds a reference to pd and frees it itself; the place
	 * that increments pd_refs is not visible here -- confirm.
	 */
	if (pd->pd_refs == 0)
		free(pd, M_PFSYNC);
	PFSYNC_UNLOCK(sc);

	/* Send the held packet out now, bypassing pf (M_SKIP_FIREWALL). */
	ip_output(m, NULL, NULL, 0, NULL, NULL);

	pf_release_state(st);

	CURVNET_RESTORE();
}
1751
/*
 * Find the deferral for the given state and complete it.  If
 * callout_stop() fails the timeout is already running (or has run)
 * and pfsync_defer_tmo() finishes the deferral itself.  Called with
 * the softc lock held; panics if no deferral exists for the state.
 */
static void
pfsync_undefer_state(struct pf_state *st, int drop)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_deferral *pd;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
		 if (pd->pd_st == st) {
			if (callout_stop(&pd->pd_tmo))
				pfsync_undefer(pd, drop);
			return;
		}
	}

	panic("%s: unable to find deferred state", __func__);
}
1770
/*
 * Hook called by pf whenever an existing state is updated: queue a
 * compressed update for the peers, throttled for TCP states by
 * sc_maxupdates.
 */
static void
pfsync_update_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int sync = 0;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	/* Activity on the state releases any packet deferred on it. */
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 0);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	/* First data in an empty packet: arm the one-second flush timer. */
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_INS:
		/* we're already handling it */

		/*
		 * For TCP, force a push once the state has accumulated
		 * sc_maxupdates changes while queued.
		 */
		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
			st->sync_updates++;
			if (st->sync_updates >= sc->sc_maxupdates)
				sync = 1;
		}
		break;

	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD_C);
		st->sync_updates = 0;
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	/* Push immediately for forced or rapidly re-updated states. */
	if (sync || (time_uptime - st->pfsync_time) < 2)
		pfsync_push(sc);

	PFSYNC_UNLOCK(sc);
}
1821
/*
 * Queue an update request for a single peer state, identified by
 * creator id + state id (id 0/0 requests a bulk update).  Called with
 * the softc lock held.
 */
static void
pfsync_request_update(u_int32_t creatorid, u_int64_t id)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_req_item *item;
	size_t nlen = sizeof(struct pfsync_upd_req);

	PFSYNC_LOCK_ASSERT(sc);

	/*
	 * This code does a bit to prevent multiple update requests for the
	 * same state being generated. It searches current subheader queue,
	 * but it doesn't lookup into queue of already packed datagrams.
	 */
	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
		if (item->ur_msg.id == id &&
		    item->ur_msg.creatorid == creatorid)
			return;

	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
	if (item == NULL)
		return; /* XXX stats */

	item->ur_msg.id = id;
	item->ur_msg.creatorid = creatorid;

	/* The first request on the queue also needs a subheader. */
	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
		nlen += sizeof(struct pfsync_subheader);

	/*
	 * If the request won't fit into the packet under construction,
	 * flush it first; the queue is then empty, so a subheader is
	 * needed again.
	 */
	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		nlen = sizeof(struct pfsync_subheader) +
		    sizeof(struct pfsync_upd_req);
	}

	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
	sc->sc_len += nlen;
}
1861
/*
 * Queue a full (uncompressed) update of a state, used when answering
 * a peer's update request and during bulk updates.
 */
static void
pfsync_update_state_req(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		/* Upgrade a pending compressed update/iack to a full one. */
		pfsync_q_del(st);
		/* FALLTHROUGH */
	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD);
		pfsync_push(sc);
		break;

	case PFSYNC_S_INS:
	case PFSYNC_S_UPD:
	case PFSYNC_S_DEL:
		/* we're already handling it */
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	PFSYNC_UNLOCK(sc);
}
1898
/*
 * Hook called by pf when a state is removed: queue a delete
 * notification for the peers, or silently cancel an insert that was
 * never transmitted.
 */
static void
pfsync_delete_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK(sc);
	/* Drop any packet still deferred on this state. */
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 1);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st);
		PFSYNC_UNLOCK(sc);
		return;
	}

	/* First data in an empty packet: arm the one-second flush timer. */
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_INS:
		/* We never got to tell the world so just forget about it. */
		pfsync_q_del(st);
		break;

	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_IACK:
		pfsync_q_del(st);
		/* FALLTHROUGH to putting it on the del list */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_DEL);
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}
	PFSYNC_UNLOCK(sc);
}
1938
1939 static void
1940 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1941 {
1942         struct pfsync_softc *sc = V_pfsyncif;
1943         struct {
1944                 struct pfsync_subheader subh;
1945                 struct pfsync_clr clr;
1946         } __packed r;
1947
1948         bzero(&r, sizeof(r));
1949
1950         r.subh.action = PFSYNC_ACT_CLR;
1951         r.subh.count = htons(1);
1952         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
1953
1954         strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
1955         r.clr.creatorid = creatorid;
1956
1957         PFSYNC_LOCK(sc);
1958         pfsync_send_plus(&r, sizeof(r));
1959         PFSYNC_UNLOCK(sc);
1960 }
1961
/*
 * Put a state onto per-action queue q of the packet under
 * construction, taking a reference on the state.  If the record (plus
 * a subheader when the queue was empty) would overflow the interface
 * MTU, the current packet is flushed first.  Called with the softc
 * lock held.
 */
static void
pfsync_q_ins(struct pf_state *st, int q)
{
	struct pfsync_softc *sc = V_pfsyncif;
	size_t nlen = pfsync_qs[q].len;

	PFSYNC_LOCK_ASSERT(sc);

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state %u", __func__, st->sync_state));
	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
	    sc->sc_len));

	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		nlen += sizeof(struct pfsync_subheader);

	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		/* The flush emptied the queue: a subheader is needed again. */
		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
	}

	sc->sc_len += nlen;
	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
	st->sync_state = q;
	pf_ref_state(st);
}
1989
/*
 * Remove a state from whatever per-action queue it is on, releasing
 * the queue's state reference and the packet space accounted for it
 * (including the subheader when the queue becomes empty).  Called
 * with the softc lock held.
 */
static void
pfsync_q_del(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	int q = st->sync_state;

	PFSYNC_LOCK_ASSERT(sc);
	KASSERT(st->sync_state != PFSYNC_S_NONE,
		("%s: st->sync_state != PFSYNC_S_NONE", __func__));

	sc->sc_len -= pfsync_qs[q].len;
	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
	st->sync_state = PFSYNC_S_NONE;
	pf_release_state(st);

	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		sc->sc_len -= sizeof(struct pfsync_subheader);
}
2008
/*
 * A peer requested a bulk update: announce the start of the transfer
 * and kick off the incremental state-table walk via the bulk callout.
 */
static void
pfsync_bulk_start(void)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync: received bulk update request\n");

	PFSYNC_BLOCK(sc);

	sc->sc_ureq_received = time_uptime;
	/* Restart the walk from the beginning of the state table. */
	sc->sc_bulk_hashid = 0;
	sc->sc_bulk_stateid = 0;
	pfsync_bulk_status(PFSYNC_BUS_START);
	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
	PFSYNC_BUNLOCK(sc);
}
2026
/*
 * One step of an outgoing bulk update: walk the state hash table and
 * queue full updates for states that predate the request, until the
 * packet under construction fills up.  Progress (hash slot + state
 * id) is saved in the softc so the walk resumes when this callout is
 * rescheduled; a bulk-end message is sent once the walk completes.
 */
static void
pfsync_bulk_update(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct pf_state *s;
	int i, sent = 0;

	PFSYNC_BLOCK_ASSERT(sc);
	CURVNET_SET(sc->sc_ifp->if_vnet);

	/*
	 * Start with last state from previous invocation.
	 * It may had gone, in this case start from the
	 * hash slot.
	 */
	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);

	if (s != NULL)
		i = PF_IDHASH(s);
	else
		i = sc->sc_bulk_hashid;

	for (; i <= V_pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];

		/*
		 * When resuming at a found state its hash row is
		 * expected to be locked already (asserted below);
		 * otherwise lock the row ourselves.
		 */
		if (s != NULL)
			PF_HASHROW_ASSERT(ih);
		else {
			PF_HASHROW_LOCK(ih);
			s = LIST_FIRST(&ih->states);
		}

		for (; s; s = LIST_NEXT(s, entry)) {

			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
			    sizeof(struct pfsync_state)) {
				/* We've filled a packet. */
				sc->sc_bulk_hashid = i;
				sc->sc_bulk_stateid = s->id;
				sc->sc_bulk_creatorid = s->creatorid;
				PF_HASHROW_UNLOCK(ih);
				callout_reset(&sc->sc_bulk_tmo, 1,
				    pfsync_bulk_update, sc);
				goto full;
			}

			/* Only sync states that existed before the request. */
			if (s->sync_state == PFSYNC_S_NONE &&
			    s->timeout < PFTM_MAX &&
			    s->pfsync_time <= sc->sc_ureq_received) {
				pfsync_update_state_req(s);
				sent++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}

	/* We're done. */
	pfsync_bulk_status(PFSYNC_BUS_END);

full:
	CURVNET_RESTORE();
}
2089
2090 static void
2091 pfsync_bulk_status(u_int8_t status)
2092 {
2093         struct {
2094                 struct pfsync_subheader subh;
2095                 struct pfsync_bus bus;
2096         } __packed r;
2097
2098         struct pfsync_softc *sc = V_pfsyncif;
2099
2100         bzero(&r, sizeof(r));
2101
2102         r.subh.action = PFSYNC_ACT_BUS;
2103         r.subh.count = htons(1);
2104         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
2105
2106         r.bus.creatorid = V_pf_status.hostid;
2107         r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2108         r.bus.status = status;
2109
2110         PFSYNC_LOCK(sc);
2111         pfsync_send_plus(&r, sizeof(r));
2112         PFSYNC_UNLOCK(sc);
2113 }
2114
/*
 * The bulk update we requested did not complete in time: either
 * re-request it (up to PFSYNC_MAX_BULKTRIES) or give up, pretend we
 * are in sync and lift the carp demotion applied while out of sync.
 */
static void
pfsync_bulk_fail(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_BLOCK_ASSERT(sc);

	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again */
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
		    pfsync_bulk_fail, V_pfsyncif);
		PFSYNC_LOCK(sc);
		/* id 0 / creatorid 0 requests a full bulk update. */
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
	} else {
		/* Pretend like the transfer was ok. */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
		PFSYNC_LOCK(sc);
		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
			    "pfsync bulk fail");
		sc->sc_flags |= PFSYNCF_OK;
		PFSYNC_UNLOCK(sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive bulk update\n");
	}

	CURVNET_RESTORE();
}
2147
/*
 * Attach a pre-built (subheader + payload) blob to the packet under
 * construction and transmit it immediately.  Because the packet is
 * flushed before returning, "plus" may point at caller stack memory.
 * Called with the softc lock held.
 */
static void
pfsync_send_plus(void *plus, size_t pluslen)
{
	struct pfsync_softc *sc = V_pfsyncif;

	PFSYNC_LOCK_ASSERT(sc);

	/* Flush first if the blob would not fit into the current packet. */
	if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
		pfsync_sendout(1);

	sc->sc_plus = plus;
	sc->sc_len += (sc->sc_pluslen = pluslen);

	pfsync_sendout(1);
}
2163
/*
 * Flush callout: make sure a partially filled packet does not linger
 * longer than the one-second timer armed when data was first queued.
 */
static void
pfsync_timeout(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);
	PFSYNC_LOCK(sc);
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);
	CURVNET_RESTORE();
}
2175
/*
 * Request transmission of the packet under construction: mark the
 * softc and schedule the pfsync software interrupt, which does the
 * actual sendout.  Called with the softc lock held.
 */
static void
pfsync_push(struct pfsync_softc *sc)
{

	PFSYNC_LOCK_ASSERT(sc);

	sc->sc_flags |= PFSYNCF_PUSH;
	swi_sched(V_pfsync_swi_cookie, 0);
}
2185
/*
 * pfsync software interrupt: finalize the packet under construction
 * if a push was requested, then drain the interface send queue and
 * transmit every packet on it.  ip_output() is called outside the
 * softc lock.
 */
static void
pfsyncintr(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct mbuf *m, *n;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_LOCK(sc);
	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
		pfsync_sendout(0);
		sc->sc_flags &= ~PFSYNCF_PUSH;
	}
	/* Grab the whole queue at once, then send lock-free. */
	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
	PFSYNC_UNLOCK(sc);

	for (; m != NULL; m = n) {

		n = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/*
		 * We distinguish between a deferral packet and our
		 * own pfsync packet based on M_SKIP_FIREWALL
		 * flag. This is XXX.
		 */
		if (m->m_flags & M_SKIP_FIREWALL)
			ip_output(m, NULL, NULL, 0, NULL, NULL);
		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL) == 0)
			V_pfsyncstats.pfsyncs_opackets++;
		else
			V_pfsyncstats.pfsyncs_oerrors++;
	}
	CURVNET_RESTORE();
}
2222
/*
 * Join the pfsync multicast group (sc_sync_peer) on the given sync
 * interface and initialize the multicast options used for outgoing
 * packets.  "mship" is caller-allocated storage for the membership
 * array; on success it is installed into sc_imo.  Returns 0 or an
 * errno.
 */
static int
pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
{
	struct ip_moptions *imo = &sc->sc_imo;
	int error;

	if (!(ifp->if_flags & IFF_MULTICAST))
		return (EADDRNOTAVAIL);

	imo->imo_membership = (struct in_multi **)mship;
	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
	imo->imo_multicast_vif = -1;

	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
	    &imo->imo_membership[0])) != 0) {
		imo->imo_membership = NULL;
		return (error);
	}
	imo->imo_num_memberships++;
	imo->imo_multicast_ifp = ifp;
	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
	/* Don't loop our own transmissions back to ourselves. */
	imo->imo_multicast_loop = 0;

	return (0);
}
2248
/*
 * Leave the pfsync multicast group and free the membership array
 * installed by pfsync_multicast_setup().
 */
static void
pfsync_multicast_cleanup(struct pfsync_softc *sc)
{
	struct ip_moptions *imo = &sc->sc_imo;

	in_leavegroup(imo->imo_membership[0], NULL);
	free(imo->imo_membership, M_PFSYNC);
	imo->imo_membership = NULL;
	imo->imo_multicast_ifp = NULL;
}
2259
#ifdef INET
extern	struct domain inetdomain;
/* Raw-socket protocol switch entry that routes IPPROTO_PFSYNC input. */
static struct protosw in_pfsync_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_PFSYNC,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		pfsync_input,
	.pr_output =		(pr_output_t *)rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs
};
#endif
2273
/*
 * Publish the pfsync hooks so pf starts calling into this module;
 * done under the pf rules write lock so the switch is atomic with
 * respect to packet processing.
 */
static void
pfsync_pointers_init()
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = pfsync_state_import;
	pfsync_insert_state_ptr = pfsync_insert_state;
	pfsync_update_state_ptr = pfsync_update_state;
	pfsync_delete_state_ptr = pfsync_delete_state;
	pfsync_clear_states_ptr = pfsync_clear_states;
	pfsync_defer_ptr = pfsync_defer;
	PF_RULES_WUNLOCK();
}
2287
/*
 * Withdraw the pfsync hooks so pf stops calling into this module;
 * done under the pf rules write lock so no hook can be mid-call when
 * the module unloads.
 */
static void
pfsync_pointers_uninit()
{

	PF_RULES_WLOCK();
	pfsync_state_import_ptr = NULL;
	pfsync_insert_state_ptr = NULL;
	pfsync_update_state_ptr = NULL;
	pfsync_delete_state_ptr = NULL;
	pfsync_clear_states_ptr = NULL;
	pfsync_defer_ptr = NULL;
	PF_RULES_WUNLOCK();
}
2301
2302 static int
2303 pfsync_init()
2304 {
2305         VNET_ITERATOR_DECL(vnet_iter);
2306         int error = 0;
2307
2308         VNET_LIST_RLOCK();
2309         VNET_FOREACH(vnet_iter) {
2310                 CURVNET_SET(vnet_iter);
2311                 V_pfsync_cloner = if_clone_simple(pfsyncname,
2312                     pfsync_clone_create, pfsync_clone_destroy, 1);
2313                 error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
2314                     SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
2315                 CURVNET_RESTORE();
2316                 if (error)
2317                         goto fail_locked;
2318         }
2319         VNET_LIST_RUNLOCK();
2320 #ifdef INET
2321         error = pf_proto_register(PF_INET, &in_pfsync_protosw);
2322         if (error)
2323                 goto fail;
2324         error = ipproto_register(IPPROTO_PFSYNC);
2325         if (error) {
2326                 pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2327                 goto fail;
2328         }
2329 #endif
2330         pfsync_pointers_init();
2331
2332         return (0);
2333
2334 fail:
2335         VNET_LIST_RLOCK();
2336 fail_locked:
2337         VNET_FOREACH(vnet_iter) {
2338                 CURVNET_SET(vnet_iter);
2339                 if (V_pfsync_swi_cookie) {
2340                         swi_remove(V_pfsync_swi_cookie);
2341                         if_clone_detach(V_pfsync_cloner);
2342                 }
2343                 CURVNET_RESTORE();
2344         }
2345         VNET_LIST_RUNLOCK();
2346
2347         return (error);
2348 }
2349
/*
 * Module unload: unhook from pf first (so no new calls arrive), then
 * unregister the protocol and destroy the per-VNET cloner and
 * software interrupt handler.
 */
static void
pfsync_uninit()
{
	VNET_ITERATOR_DECL(vnet_iter);

	pfsync_pointers_uninit();

	ipproto_unregister(IPPROTO_PFSYNC);
	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		if_clone_detach(V_pfsync_cloner);
		swi_remove(V_pfsync_swi_cookie);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
}
2368
2369 static int
2370 pfsync_modevent(module_t mod, int type, void *data)
2371 {
2372         int error = 0;
2373
2374         switch (type) {
2375         case MOD_LOAD:
2376                 error = pfsync_init();
2377                 break;
2378         case MOD_QUIESCE:
2379                 /*
2380                  * Module should not be unloaded due to race conditions.
2381                  */
2382                 error = EBUSY;
2383                 break;
2384         case MOD_UNLOAD:
2385                 pfsync_uninit();
2386                 break;
2387         default:
2388                 error = EINVAL;
2389                 break;
2390         }
2391
2392         return (error);
2393 }
2394
/* Kernel linker glue: register the pfsync module and its dependencies. */
static moduledata_t pfsync_mod = {
	pfsyncname,
	pfsync_modevent,
	0
};

#define PFSYNC_MODVER 1

/* Initialize late (protocol domains are up) so registration succeeds. */
DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);