/*
 * sys/netpfil/pf/if_pfsync.c (FreeBSD)
 */
1 /*-
2  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND ISC)
3  *
4  * Copyright (c) 2002 Michael Shalayeff
5  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
21  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
25  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
26  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27  * THE POSSIBILITY OF SUCH DAMAGE.
28  */
29
30 /*-
31  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
32  *
33  * Permission to use, copy, modify, and distribute this software for any
34  * purpose with or without fee is hereby granted, provided that the above
35  * copyright notice and this permission notice appear in all copies.
36  *
37  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
38  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
39  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
40  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
41  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
42  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
43  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
44  */
45
46 /*
47  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
48  *
49  * Revisions picked from OpenBSD after revision 1.110 import:
50  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
51  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
52  * 1.120, 1.175 - use monotonic time_uptime
53  * 1.122 - reduce number of updates for non-TCP sessions
54  * 1.125, 1.127 - rewrite merge or stale processing
55  * 1.128 - cleanups
56  * 1.146 - bzero() mbuf before sparsely filling it with data
57  * 1.170 - SIOCSIFMTU checks
58  * 1.126, 1.142 - deferred packets processing
59  * 1.173 - correct expire time processing
60  */
61
62 #include <sys/cdefs.h>
63 __FBSDID("$FreeBSD$");
64
65 #include "opt_inet.h"
66 #include "opt_inet6.h"
67 #include "opt_pf.h"
68
69 #include <sys/param.h>
70 #include <sys/bus.h>
71 #include <sys/endian.h>
72 #include <sys/interrupt.h>
73 #include <sys/kernel.h>
74 #include <sys/lock.h>
75 #include <sys/mbuf.h>
76 #include <sys/module.h>
77 #include <sys/mutex.h>
78 #include <sys/priv.h>
79 #include <sys/protosw.h>
80 #include <sys/socket.h>
81 #include <sys/sockio.h>
82 #include <sys/sysctl.h>
83 #include <sys/syslog.h>
84
85 #include <net/bpf.h>
86 #include <net/if.h>
87 #include <net/if_var.h>
88 #include <net/if_clone.h>
89 #include <net/if_types.h>
90 #include <net/vnet.h>
91 #include <net/pfvar.h>
92 #include <net/if_pfsync.h>
93
94 #include <netinet/if_ether.h>
95 #include <netinet/in.h>
96 #include <netinet/in_var.h>
97 #include <netinet/ip.h>
98 #include <netinet/ip_carp.h>
99 #include <netinet/ip_var.h>
100 #include <netinet/tcp.h>
101 #include <netinet/tcp_fsm.h>
102 #include <netinet/tcp_seq.h>
103
/*
 * Smallest valid pfsync packet: IP header, pfsync header and one
 * subheader, carrying no message payload.
 */
#define PFSYNC_MINPKT ( \
	sizeof(struct ip) + \
	sizeof(struct pfsync_header) + \
	sizeof(struct pfsync_subheader) )

/* Per-packet context passed to the pfsync_in_*() input handlers. */
struct pfsync_pkt {
	struct ip *ip;		/* IP header of the received packet */
	struct in_addr src;	/* IP source address of the packet */
	u_int8_t flags;		/* PFSYNC_SI_* flags set while parsing */
};
114
static int	pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
		    struct pfsync_state_peer *);
static int	pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
static int	pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);

/*
 * Input dispatch table, indexed by the PFSYNC_ACT_* action code from a
 * subheader.  A handler is called with (pkt, mbuf, offset, count) and
 * returns the number of payload bytes it consumed, or -1 on a
 * malformed message, which aborts processing of the whole packet.
 */
static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
	pfsync_in_ins,			/* PFSYNC_ACT_INS */
	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
	pfsync_in_del,			/* PFSYNC_ACT_DEL */
	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
	pfsync_in_eof			/* PFSYNC_ACT_EOF */
};
145
/* Descriptor for one class of outgoing pfsync messages. */
struct pfsync_q {
	void		(*write)(struct pf_state *, void *);	/* serializer */
	size_t		len;		/* on-wire size of one message */
	u_int8_t	action;		/* PFSYNC_ACT_* subheader action */
};

/* we have one of these for every PFSYNC_S_ */
static void	pfsync_out_state(struct pf_state *, void *);
static void	pfsync_out_iack(struct pf_state *, void *);
static void	pfsync_out_upd_c(struct pf_state *, void *);
static void	pfsync_out_del(struct pf_state *, void *);

static struct pfsync_q pfsync_qs[] = {
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
};
165
static void	pfsync_q_ins(struct pf_state *, int, bool);
static void	pfsync_q_del(struct pf_state *, bool);

static void	pfsync_update_state(struct pf_state *);

/* A queued "update request" message awaiting transmission. */
struct pfsync_upd_req_item {
	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;
	struct pfsync_upd_req			ur_msg;
};

/*
 * A packet whose transmission has been deferred while we wait for the
 * peer to acknowledge the corresponding state insertion (PFSYNCF_DEFER).
 */
struct pfsync_deferral {
	struct pfsync_softc		*pd_sc;		/* owning softc */
	TAILQ_ENTRY(pfsync_deferral)	pd_entry;
	u_int				pd_refs;
	struct callout			pd_tmo;		/* expiry timer */

	struct pf_state			*pd_st;		/* deferred state */
	struct mbuf			*pd_m;		/* deferred packet */
};

struct pfsync_softc {
	/* Configuration */
	struct ifnet		*sc_ifp;	/* our pfsync0 interface */
	struct ifnet		*sc_sync_if;	/* interface we sync over */
	struct ip_moptions	sc_imo;
	struct in_addr		sc_sync_peer;
	uint32_t		sc_flags;
#define PFSYNCF_OK		0x00000001
#define PFSYNCF_DEFER		0x00000002
#define PFSYNCF_PUSH		0x00000004
	uint8_t			sc_maxupdates;
	struct ip		sc_template;
	struct callout		sc_tmo;
	struct mtx		sc_mtx;

	/* Queued data */
	size_t			sc_len;		/* bytes queued for sendout */
	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
	u_int			sc_deferred;	/* length of sc_deferrals */
	void			*sc_plus;
	size_t			sc_pluslen;

	/* Bulk update info */
	struct mtx		sc_bulk_mtx;	/* protects sc_bulk_*, see PFSYNC_BLOCK() */
	uint32_t		sc_ureq_sent;
	int			sc_bulk_tries;
	uint32_t		sc_ureq_received;
	int			sc_bulk_hashid;
	uint64_t		sc_bulk_stateid;
	uint32_t		sc_bulk_creatorid;
	struct callout		sc_bulk_tmo;
	struct callout		sc_bulkfail_tmo;
};
221
#define PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
#define PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)

/* The "bulk" lock serializes the sc_bulk_* bookkeeping. */
#define PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
#define PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
#define PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)

static const char pfsyncname[] = "pfsync";
static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
/* Per-vnet singleton: the one pfsync0 softc, or NULL if not created. */
static VNET_DEFINE(struct pfsync_softc	*, pfsyncif) = NULL;
#define V_pfsyncif		VNET(pfsyncif)
static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
#define V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
#define V_pfsyncstats		VNET(pfsyncstats)
static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
#define V_pfsync_carp_adj	VNET(pfsync_carp_adj)

static void	pfsync_timeout(void *);
static void	pfsync_push(struct pfsync_softc *);
static void	pfsyncintr(void *);
static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
		    void *);
static void	pfsync_multicast_cleanup(struct pfsync_softc *);
static void	pfsync_pointers_init(void);
static void	pfsync_pointers_uninit(void);
static int	pfsync_init(void);
static void	pfsync_uninit(void);

SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(pfsyncstats), pfsyncstats,
    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");

static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
static void	pfsync_clone_destroy(struct ifnet *);
static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
		    struct pf_state_peer *);
static int	pfsyncoutput(struct ifnet *, struct mbuf *,
		    const struct sockaddr *, struct route *);
static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);

static int	pfsync_defer(struct pf_state *, struct mbuf *);
static void	pfsync_undefer(struct pfsync_deferral *, int);
static void	pfsync_undefer_state(struct pf_state *, int);
static void	pfsync_defer_tmo(void *);

static void	pfsync_request_update(u_int32_t, u_int64_t);
static void	pfsync_update_state_req(struct pf_state *);

static void	pfsync_drop(struct pfsync_softc *);
static void	pfsync_sendout(int);
static void	pfsync_send_plus(void *, size_t);

static void	pfsync_bulk_start(void);
static void	pfsync_bulk_status(u_int8_t);
static void	pfsync_bulk_update(void *);
static void	pfsync_bulk_fail(void *);

#ifdef IPSEC
static void	pfsync_update_net_tdb(struct pfsync_tdb *);
#endif

/* Retry limit for bulk updates (bounds sc_bulk_tries). */
#define PFSYNC_MAX_BULKTRIES	12

VNET_DEFINE(struct if_clone *, pfsync_cloner);
#define V_pfsync_cloner	VNET(pfsync_cloner)
292
/*
 * Create the pfsync interface: allocate and initialize the softc
 * (queues, locks, callouts), then attach the ifnet and the bpf tap.
 * Only unit 0 is allowed, so at most one pfsync interface exists
 * per vnet.
 */
static int
pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
{
	struct pfsync_softc *sc;
	struct ifnet *ifp;
	int q;

	if (unit != 0)
		return (EINVAL);

	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
	sc->sc_flags |= PFSYNCF_OK;

	/* One outgoing queue per PFSYNC_S_* message class. */
	for (q = 0; q < PFSYNC_S_COUNT; q++)
		TAILQ_INIT(&sc->sc_qs[q]);

	TAILQ_INIT(&sc->sc_upd_req_list);
	TAILQ_INIT(&sc->sc_deferrals);

	/* Start with an empty packet: just IP + pfsync headers. */
	sc->sc_len = PFSYNC_MINPKT;
	sc->sc_maxupdates = 128;

	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
	if (ifp == NULL) {
		free(sc, M_PFSYNC);
		return (ENOSPC);
	}
	if_initname(ifp, pfsyncname, unit);
	ifp->if_softc = sc;
	ifp->if_ioctl = pfsyncioctl;
	ifp->if_output = pfsyncoutput;
	ifp->if_type = IFT_PFSYNC;
	ifp->if_snd.ifq_maxlen = ifqmaxlen;
	ifp->if_hdrlen = sizeof(struct pfsync_header);
	ifp->if_mtu = ETHERMTU;
	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
	/* The bulk callouts run with sc_bulk_mtx held. */
	callout_init(&sc->sc_tmo, 1);
	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);

	if_attach(ifp);

	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);

	V_pfsyncif = sc;

	return (0);
}
342
/*
 * Tear down the pfsync interface: release outstanding deferrals,
 * drain all callouts, detach from bpf and the network stack, and
 * free the softc.
 */
static void
pfsync_clone_destroy(struct ifnet *ifp)
{
	struct pfsync_softc *sc = ifp->if_softc;

	/*
	 * At this stage, everything should have already been
	 * cleared by pfsync_uninit(), and we have only to
	 * drain callouts.
	 */
	while (sc->sc_deferred > 0) {
		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);

		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
		sc->sc_deferred--;
		if (callout_stop(&pd->pd_tmo) > 0) {
			/* Timer cancelled: the state and mbuf are ours. */
			pf_release_state(pd->pd_st);
			m_freem(pd->pd_m);
			free(pd, M_PFSYNC);
		} else {
			/*
			 * The callout is already running; take a
			 * reference so it skips freeing pd, wait for it
			 * to finish, then free pd ourselves.
			 */
			pd->pd_refs++;
			callout_drain(&pd->pd_tmo);
			free(pd, M_PFSYNC);
		}
	}

	callout_drain(&sc->sc_tmo);
	callout_drain(&sc->sc_bulkfail_tmo);
	callout_drain(&sc->sc_bulk_tmo);

	/* Revert the CARP demotion applied while we were not in sync. */
	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
	bpfdetach(ifp);
	if_detach(ifp);

	pfsync_drop(sc);

	if_free(ifp);
	if (sc->sc_imo.imo_membership)
		pfsync_multicast_cleanup(sc);
	mtx_destroy(&sc->sc_mtx);
	mtx_destroy(&sc->sc_bulk_mtx);
	free(sc, M_PFSYNC);

	V_pfsyncif = NULL;
}
389
390 static int
391 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
392     struct pf_state_peer *d)
393 {
394         if (s->scrub.scrub_flag && d->scrub == NULL) {
395                 d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
396                 if (d->scrub == NULL)
397                         return (ENOMEM);
398         }
399
400         return (0);
401 }
402
403
/*
 * Build and insert a local pf state from a wire-format pfsync state,
 * received either from a peer or via the pf ioctl interface
 * (PFSYNC_SI_IOCTL set in flags).  Returns 0 on success (including
 * the non-ioctl case where a state naming an unknown interface is
 * silently skipped), EINVAL on malformed input, or ENOMEM when an
 * allocation or the state insertion fails.
 */
static int
pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
{
	struct pfsync_softc *sc = V_pfsyncif;
#ifndef __NO_STRICT_ALIGNMENT
	struct pfsync_state_key key[2];
#endif
	struct pfsync_state_key *kw, *ks;
	struct pf_state *st = NULL;
	struct pf_state_key *skw = NULL, *sks = NULL;
	struct pf_rule *r = NULL;
	struct pfi_kif	*kif;
	int error;

	PF_RULES_RASSERT();

	if (sp->creatorid == 0) {
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("%s: invalid creator id: %08x\n", __func__,
			    ntohl(sp->creatorid));
		return (EINVAL);
	}

	if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("%s: unknown interface: %s\n", __func__,
			    sp->ifname);
		if (flags & PFSYNC_SI_IOCTL)
			return (EINVAL);
		return (0);	/* skip this state */
	}

	/*
	 * If the ruleset checksums match or the state is coming from the ioctl,
	 * it's safe to associate the state with the rule of that number.
	 */
	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
		r = pf_main_ruleset.rules[
		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
	else
		r = &V_pf_default_rule;

	/* Honour the rule's per-rule state limit. */
	if ((r->max_states &&
	    counter_u64_fetch(r->states_cur) >= r->max_states))
		goto cleanup;

	/*
	 * XXXGL: consider M_WAITOK in ioctl path after.
	 */
	if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
		goto cleanup;

	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
		goto cleanup;

#ifndef __NO_STRICT_ALIGNMENT
	/* Copy the keys out so accesses are properly aligned. */
	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
	kw = &key[PF_SK_WIRE];
	ks = &key[PF_SK_STACK];
#else
	kw = &sp->key[PF_SK_WIRE];
	ks = &sp->key[PF_SK_STACK];
#endif

	/*
	 * A separate stack-side key is only needed when it differs from
	 * the wire-side key; otherwise both sides share a single key.
	 */
	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
	    kw->port[0] != ks->port[0] ||
	    kw->port[1] != ks->port[1]) {
		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
		if (sks == NULL)
			goto cleanup;
	} else
		sks = skw;

	/* allocate memory for scrub info */
	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
		goto cleanup;

	/* Copy to state key(s). */
	skw->addr[0] = kw->addr[0];
	skw->addr[1] = kw->addr[1];
	skw->port[0] = kw->port[0];
	skw->port[1] = kw->port[1];
	skw->proto = sp->proto;
	skw->af = sp->af;
	if (sks != skw) {
		sks->addr[0] = ks->addr[0];
		sks->addr[1] = ks->addr[1];
		sks->port[0] = ks->port[0];
		sks->port[1] = ks->port[1];
		sks->proto = sp->proto;
		sks->af = sp->af;
	}

	/* copy to state */
	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
	st->creation = time_uptime - ntohl(sp->creation);
	st->expire = time_uptime;
	if (sp->expire) {
		uint32_t timeout;

		timeout = r->timeout[sp->timeout];
		if (!timeout)
			timeout = V_pf_default_rule.timeout[sp->timeout];

		/* sp->expire may have been adaptively scaled by export. */
		st->expire -= timeout - ntohl(sp->expire);
	}

	st->direction = sp->direction;
	st->log = sp->log;
	st->timeout = sp->timeout;
	st->state_flags = sp->state_flags;

	st->id = sp->id;
	st->creatorid = sp->creatorid;
	pf_state_peer_ntoh(&sp->src, &st->src);
	pf_state_peer_ntoh(&sp->dst, &st->dst);

	st->rule.ptr = r;
	st->nat_rule.ptr = NULL;
	st->anchor.ptr = NULL;
	st->rt_kif = NULL;

	st->pfsync_time = time_uptime;
	st->sync_state = PFSYNC_S_NONE;

	/* Keep the insertion itself from being synced back out. */
	if (!(flags & PFSYNC_SI_IOCTL))
		st->state_flags |= PFSTATE_NOSYNC;

	if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
		goto cleanup_state;

	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
	counter_u64_add(r->states_cur, 1);
	counter_u64_add(r->states_tot, 1);

	if (!(flags & PFSYNC_SI_IOCTL)) {
		st->state_flags &= ~PFSTATE_NOSYNC;
		if (st->state_flags & PFSTATE_ACK) {
			/* The peer wants an ack: queue one and push now. */
			pfsync_q_ins(st, PFSYNC_S_IACK, true);
			pfsync_push(sc);
		}
	}
	st->state_flags &= ~PFSTATE_ACK;
	PF_STATE_UNLOCK(st);

	return (0);

cleanup:
	error = ENOMEM;
	/* Avoid double-freeing a shared wire/stack key. */
	if (skw == sks)
		sks = NULL;
	if (skw != NULL)
		uma_zfree(V_pf_state_key_z, skw);
	if (sks != NULL)
		uma_zfree(V_pf_state_key_z, sks);

cleanup_state:	/* pf_state_insert() frees the state keys. */
	if (st) {
		if (st->dst.scrub)
			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
		if (st->src.scrub)
			uma_zfree(V_pf_state_scrub_z, st->src.scrub);
		uma_zfree(V_pf_state_z, st);
	}
	return (error);
}
575
/*
 * Input path for received pfsync packets.  Validates the packet
 * (sync interface, TTL, protocol version, lengths), then walks the
 * chain of subheaders, dispatching each run of messages to the
 * matching pfsync_acts[] handler.  Always consumes the mbuf and
 * returns IPPROTO_DONE.
 */
static int
pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_pkt pkt;
	struct mbuf *m = *mp;
	struct ip *ip = mtod(m, struct ip *);
	struct pfsync_header *ph;
	struct pfsync_subheader subh;

	int offset, len;
	int rv;
	uint16_t count;

	*mp = NULL;
	V_pfsyncstats.pfsyncs_ipackets++;

	/* Verify that we have a sync interface configured. */
	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		goto done;

	/* verify that the packet came in on the right interface */
	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
		V_pfsyncstats.pfsyncs_badif++;
		goto done;
	}

	if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
	if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
	/* verify that the IP TTL is 255. */
	if (ip->ip_ttl != PFSYNC_DFLTTL) {
		V_pfsyncstats.pfsyncs_badttl++;
		goto done;
	}

	offset = ip->ip_hl << 2;
	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
		V_pfsyncstats.pfsyncs_hdrops++;
		goto done;
	}

	/* Make the pfsync header contiguous in the first mbuf. */
	if (offset + sizeof(*ph) > m->m_len) {
		if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
			/* m_pullup() freed the chain on failure. */
			V_pfsyncstats.pfsyncs_hdrops++;
			return (IPPROTO_DONE);
		}
		ip = mtod(m, struct ip *);
	}
	ph = (struct pfsync_header *)((char *)ip + offset);

	/* verify the version */
	if (ph->version != PFSYNC_VERSION) {
		V_pfsyncstats.pfsyncs_badver++;
		goto done;
	}

	len = ntohs(ph->len) + offset;
	if (m->m_pkthdr.len < len) {
		V_pfsyncstats.pfsyncs_badlen++;
		goto done;
	}

	/* Cheaper to grab this now than having to mess with mbufs later */
	pkt.ip = ip;
	pkt.src = ip->ip_src;
	pkt.flags = 0;

	/*
	 * Trusting pf_chksum during packet processing, as well as seeking
	 * in interface name tree, require holding PF_RULES_RLOCK().
	 */
	PF_RULES_RLOCK();
	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
		pkt.flags |= PFSYNC_SI_CKSUM;

	/* Walk the subheaders, dispatching each batch of messages. */
	offset += sizeof(*ph);
	while (offset <= len - sizeof(subh)) {
		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
		offset += sizeof(subh);

		if (subh.action >= PFSYNC_ACT_MAX) {
			V_pfsyncstats.pfsyncs_badact++;
			PF_RULES_RUNLOCK();
			goto done;
		}

		count = ntohs(subh.count);
		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
		/*
		 * Handlers return bytes consumed, or -1 after the chain
		 * has been disposed of (m_pulldown failure) — in that
		 * case m must not be freed again here.
		 */
		rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
		if (rv == -1) {
			PF_RULES_RUNLOCK();
			return (IPPROTO_DONE);
		}

		offset += rv;
	}
	PF_RULES_RUNLOCK();

done:
	m_freem(m);
	return (IPPROTO_DONE);
}
679
680 static int
681 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
682 {
683         struct pfsync_clr *clr;
684         struct mbuf *mp;
685         int len = sizeof(*clr) * count;
686         int i, offp;
687         u_int32_t creatorid;
688
689         mp = m_pulldown(m, offset, len, &offp);
690         if (mp == NULL) {
691                 V_pfsyncstats.pfsyncs_badlen++;
692                 return (-1);
693         }
694         clr = (struct pfsync_clr *)(mp->m_data + offp);
695
696         for (i = 0; i < count; i++) {
697                 creatorid = clr[i].creatorid;
698
699                 if (clr[i].ifname[0] != '\0' &&
700                     pfi_kif_find(clr[i].ifname) == NULL)
701                         continue;
702
703                 for (int i = 0; i <= pf_hashmask; i++) {
704                         struct pf_idhash *ih = &V_pf_idhash[i];
705                         struct pf_state *s;
706 relock:
707                         PF_HASHROW_LOCK(ih);
708                         LIST_FOREACH(s, &ih->states, entry) {
709                                 if (s->creatorid == creatorid) {
710                                         s->state_flags |= PFSTATE_NOSYNC;
711                                         pf_unlink_state(s, PF_ENTER_LOCKED);
712                                         goto relock;
713                                 }
714                         }
715                         PF_HASHROW_UNLOCK(ih);
716                 }
717         }
718
719         return (len);
720 }
721
722 static int
723 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
724 {
725         struct mbuf *mp;
726         struct pfsync_state *sa, *sp;
727         int len = sizeof(*sp) * count;
728         int i, offp;
729
730         mp = m_pulldown(m, offset, len, &offp);
731         if (mp == NULL) {
732                 V_pfsyncstats.pfsyncs_badlen++;
733                 return (-1);
734         }
735         sa = (struct pfsync_state *)(mp->m_data + offp);
736
737         for (i = 0; i < count; i++) {
738                 sp = &sa[i];
739
740                 /* Check for invalid values. */
741                 if (sp->timeout >= PFTM_MAX ||
742                     sp->src.state > PF_TCPS_PROXY_DST ||
743                     sp->dst.state > PF_TCPS_PROXY_DST ||
744                     sp->direction > PF_OUT ||
745                     (sp->af != AF_INET && sp->af != AF_INET6)) {
746                         if (V_pf_status.debug >= PF_DEBUG_MISC)
747                                 printf("%s: invalid value\n", __func__);
748                         V_pfsyncstats.pfsyncs_badval++;
749                         continue;
750                 }
751
752                 if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
753                         /* Drop out, but process the rest of the actions. */
754                         break;
755         }
756
757         return (len);
758 }
759
760 static int
761 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
762 {
763         struct pfsync_ins_ack *ia, *iaa;
764         struct pf_state *st;
765
766         struct mbuf *mp;
767         int len = count * sizeof(*ia);
768         int offp, i;
769
770         mp = m_pulldown(m, offset, len, &offp);
771         if (mp == NULL) {
772                 V_pfsyncstats.pfsyncs_badlen++;
773                 return (-1);
774         }
775         iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
776
777         for (i = 0; i < count; i++) {
778                 ia = &iaa[i];
779
780                 st = pf_find_state_byid(ia->id, ia->creatorid);
781                 if (st == NULL)
782                         continue;
783
784                 if (st->state_flags & PFSTATE_ACK) {
785                         PFSYNC_LOCK(V_pfsyncif);
786                         pfsync_undefer_state(st, 0);
787                         PFSYNC_UNLOCK(V_pfsyncif);
788                 }
789                 PF_STATE_UNLOCK(st);
790         }
791         /*
792          * XXX this is not yet implemented, but we know the size of the
793          * message so we can skip it.
794          */
795
796         return (count * sizeof(struct pfsync_ins_ack));
797 }
798
/*
 * Merge a peer's TCP state update into local state 'st'.  Returns the
 * number of directions (0..2) whose update was rejected as stale; the
 * callers treat a non-zero result as a cue to advertise the fresher
 * local copy back to the peer.
 */
static int
pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
    struct pfsync_state_peer *dst)
{
	int sync = 0;

	PF_STATE_LOCK_ASSERT(st);

	/*
	 * The state should never go backwards except
	 * for syn-proxy states.  Neither should the
	 * sequence window slide backwards.
	 */
	if ((st->src.state > src->state &&
	    (st->src.state < PF_TCPS_PROXY_SRC ||
	    src->state >= PF_TCPS_PROXY_SRC)) ||

	    (st->src.state == src->state &&
	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(src, &st->src);

	if ((st->dst.state > dst->state) ||

	    (st->dst.state >= TCPS_SYN_SENT &&
	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
		sync++;
	else
		pf_state_peer_ntoh(dst, &st->dst);

	return (sync);
}
832
/*
 * PFSYNC_ACT_UPD: full state updates from the peer.  Each entry is
 * either imported as a new state or reconciled with the local copy;
 * a stale update triggers a compensating update of ours.  Returns the
 * number of bytes consumed, or -1 on a malformed packet.
 */
static int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_state *sa, *sp;
	struct pf_state *st;
	int sync;

	struct mbuf *mp;
	int len = count * sizeof(*sp);
	int offp, i;

	/* Make all 'count' messages contiguous. */
	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	sa = (struct pfsync_state *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		sp = &sa[i];

		/* check for invalid values */
		if (sp->timeout >= PFTM_MAX ||
		    sp->src.state > PF_TCPS_PROXY_DST ||
		    sp->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: PFSYNC_ACT_UPD: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		/* Returns the state locked, or NULL if unknown. */
		st = pf_find_state_byid(sp->id, sp->creatorid);
		if (st == NULL) {
			/* insert the update */
			if (pfsync_state_import(sp, 0))
				V_pfsyncstats.pfsyncs_badstate++;
			continue;
		}

		/* An update from the peer also acknowledges our insert. */
		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machine always go
			 * forwards
			 */
			if (st->src.state > sp->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->src, &st->src);
			if (st->dst.state > sp->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&sp->dst, &st->dst);
		}
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
			pf_state_peer_ntoh(&sp->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = sp->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			/* Our copy is fresher; push it back to the peer. */
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}
922
/*
 * PFSYNC_ACT_UPD_C: compressed state updates.  Same reconciliation
 * logic as pfsync_in_upd(), but the message only carries the state id
 * plus peer/timeout data, so an unknown state cannot be imported —
 * instead we ask the peer to send the full state.
 */
static int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_c *ua, *up;
	struct pf_state *st;
	int len = count * sizeof(*up);
	int sync;
	struct mbuf *mp;
	int offp, i;

	/* Make all 'count' messages contiguous. */
	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ua = (struct pfsync_upd_c *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		up = &ua[i];

		/* check for invalid values */
		if (up->timeout >= PFTM_MAX ||
		    up->src.state > PF_TCPS_PROXY_DST ||
		    up->dst.state > PF_TCPS_PROXY_DST) {
			if (V_pf_status.debug >= PF_DEBUG_MISC) {
				printf("pfsync_input: "
				    "PFSYNC_ACT_UPD_C: "
				    "invalid value\n");
			}
			V_pfsyncstats.pfsyncs_badval++;
			continue;
		}

		st = pf_find_state_byid(up->id, up->creatorid);
		if (st == NULL) {
			/* We don't have this state. Ask for it. */
			PFSYNC_LOCK(sc);
			pfsync_request_update(up->creatorid, up->id);
			PFSYNC_UNLOCK(sc);
			continue;
		}

		/* An update from the peer also acknowledges our insert. */
		if (st->state_flags & PFSTATE_ACK) {
			PFSYNC_LOCK(sc);
			pfsync_undefer_state(st, 1);
			PFSYNC_UNLOCK(sc);
		}

		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
		else {
			sync = 0;

			/*
			 * Non-TCP protocol state machine always go
			 * forwards
			 */
			if (st->src.state > up->src.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->src, &st->src);
			if (st->dst.state > up->dst.state)
				sync++;
			else
				pf_state_peer_ntoh(&up->dst, &st->dst);
		}
		if (sync < 2) {
			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
			pf_state_peer_ntoh(&up->dst, &st->dst);
			st->expire = time_uptime;
			st->timeout = up->timeout;
		}
		st->pfsync_time = time_uptime;

		if (sync) {
			/* Our copy is fresher; push it back to the peer. */
			V_pfsyncstats.pfsyncs_stale++;

			pfsync_update_state(st);
			PF_STATE_UNLOCK(st);
			PFSYNC_LOCK(sc);
			pfsync_push(sc);
			PFSYNC_UNLOCK(sc);
			continue;
		}
		PF_STATE_UNLOCK(st);
	}

	return (len);
}
1013
/*
 * PFSYNC_ACT_UPD_REQ: the peer asks us to (re)send states.  A request
 * with both id and creatorid zero asks for a full bulk update;
 * otherwise the named state is queued for retransmission.
 */
static int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_upd_req *ur, *ura;
	struct mbuf *mp;
	int len = count * sizeof(*ur);
	int i, offp;

	struct pf_state *st;

	/* Make all 'count' requests contiguous. */
	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	ura = (struct pfsync_upd_req *)(mp->m_data + offp);

	for (i = 0; i < count; i++) {
		ur = &ura[i];

		if (ur->id == 0 && ur->creatorid == 0)
			pfsync_bulk_start();
		else {
			st = pf_find_state_byid(ur->id, ur->creatorid);
			if (st == NULL) {
				V_pfsyncstats.pfsyncs_badstate++;
				continue;
			}
			/* Never advertise states flagged no-sync. */
			if (st->state_flags & PFSTATE_NOSYNC) {
				PF_STATE_UNLOCK(st);
				continue;
			}

			pfsync_update_state_req(st);
			PF_STATE_UNLOCK(st);
		}
	}

	return (len);
}
1054
1055 static int
1056 pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1057 {
1058         struct mbuf *mp;
1059         struct pfsync_state *sa, *sp;
1060         struct pf_state *st;
1061         int len = count * sizeof(*sp);
1062         int offp, i;
1063
1064         mp = m_pulldown(m, offset, len, &offp);
1065         if (mp == NULL) {
1066                 V_pfsyncstats.pfsyncs_badlen++;
1067                 return (-1);
1068         }
1069         sa = (struct pfsync_state *)(mp->m_data + offp);
1070
1071         for (i = 0; i < count; i++) {
1072                 sp = &sa[i];
1073
1074                 st = pf_find_state_byid(sp->id, sp->creatorid);
1075                 if (st == NULL) {
1076                         V_pfsyncstats.pfsyncs_badstate++;
1077                         continue;
1078                 }
1079                 st->state_flags |= PFSTATE_NOSYNC;
1080                 pf_unlink_state(st, PF_ENTER_LOCKED);
1081         }
1082
1083         return (len);
1084 }
1085
1086 static int
1087 pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1088 {
1089         struct mbuf *mp;
1090         struct pfsync_del_c *sa, *sp;
1091         struct pf_state *st;
1092         int len = count * sizeof(*sp);
1093         int offp, i;
1094
1095         mp = m_pulldown(m, offset, len, &offp);
1096         if (mp == NULL) {
1097                 V_pfsyncstats.pfsyncs_badlen++;
1098                 return (-1);
1099         }
1100         sa = (struct pfsync_del_c *)(mp->m_data + offp);
1101
1102         for (i = 0; i < count; i++) {
1103                 sp = &sa[i];
1104
1105                 st = pf_find_state_byid(sp->id, sp->creatorid);
1106                 if (st == NULL) {
1107                         V_pfsyncstats.pfsyncs_badstate++;
1108                         continue;
1109                 }
1110
1111                 st->state_flags |= PFSTATE_NOSYNC;
1112                 pf_unlink_state(st, PF_ENTER_LOCKED);
1113         }
1114
1115         return (len);
1116 }
1117
/*
 * PFSYNC_ACT_BUS: bulk update status messages from the peer.  Only
 * meaningful while we have a bulk update request outstanding
 * (sc_ureq_sent != 0); otherwise the message is skipped.
 */
static int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_bus *bus;
	struct mbuf *mp;
	int len = count * sizeof(*bus);
	int offp;

	PFSYNC_BLOCK(sc);

	/* If we're not waiting for a bulk update, who cares. */
	if (sc->sc_ureq_sent == 0) {
		PFSYNC_BUNLOCK(sc);
		return (len);
	}

	mp = m_pulldown(m, offset, len, &offp);
	if (mp == NULL) {
		PFSYNC_BUNLOCK(sc);
		V_pfsyncstats.pfsyncs_badlen++;
		return (-1);
	}
	bus = (struct pfsync_bus *)(mp->m_data + offp);

	switch (bus->status) {
	case PFSYNC_BUS_START:
		/*
		 * Re-arm the failure timeout: 4 seconds of slack plus an
		 * estimate of how long the peer's full state table takes
		 * to transfer at one MTU-sized packet per tick.
		 */
		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
		    V_pf_limits[PF_LIMIT_STATES].limit /
		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
		    sizeof(struct pfsync_state)),
		    pfsync_bulk_fail, sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: received bulk update start\n");
		break;

	case PFSYNC_BUS_END:
		/* Only accept an end that postdates our request. */
		if (time_uptime - ntohl(bus->endtime) >=
		    sc->sc_ureq_sent) {
			/* that's it, we're happy */
			sc->sc_ureq_sent = 0;
			sc->sc_bulk_tries = 0;
			callout_stop(&sc->sc_bulkfail_tmo);
			/* Bulk update done: withdraw the carp demotion. */
			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
				    "pfsync bulk done");
			sc->sc_flags |= PFSYNCF_OK;
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received valid "
				    "bulk update end\n");
		} else {
			if (V_pf_status.debug >= PF_DEBUG_MISC)
				printf("pfsync: received invalid "
				    "bulk update end: bad timestamp\n");
		}
		break;
	}
	PFSYNC_BUNLOCK(sc);

	return (len);
}
1179
1180 static int
1181 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1182 {
1183         int len = count * sizeof(struct pfsync_tdb);
1184
1185 #if defined(IPSEC)
1186         struct pfsync_tdb *tp;
1187         struct mbuf *mp;
1188         int offp;
1189         int i;
1190         int s;
1191
1192         mp = m_pulldown(m, offset, len, &offp);
1193         if (mp == NULL) {
1194                 V_pfsyncstats.pfsyncs_badlen++;
1195                 return (-1);
1196         }
1197         tp = (struct pfsync_tdb *)(mp->m_data + offp);
1198
1199         for (i = 0; i < count; i++)
1200                 pfsync_update_net_tdb(&tp[i]);
1201 #endif
1202
1203         return (len);
1204 }
1205
1206 #if defined(IPSEC)
1207 /* Update an in-kernel tdb. Silently fail if no tdb is found. */
1208 static void
1209 pfsync_update_net_tdb(struct pfsync_tdb *pt)
1210 {
1211         struct tdb              *tdb;
1212         int                      s;
1213
1214         /* check for invalid values */
1215         if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
1216             (pt->dst.sa.sa_family != AF_INET &&
1217             pt->dst.sa.sa_family != AF_INET6))
1218                 goto bad;
1219
1220         tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
1221         if (tdb) {
1222                 pt->rpl = ntohl(pt->rpl);
1223                 pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
1224
1225                 /* Neither replay nor byte counter should ever decrease. */
1226                 if (pt->rpl < tdb->tdb_rpl ||
1227                     pt->cur_bytes < tdb->tdb_cur_bytes) {
1228                         goto bad;
1229                 }
1230
1231                 tdb->tdb_rpl = pt->rpl;
1232                 tdb->tdb_cur_bytes = pt->cur_bytes;
1233         }
1234         return;
1235
1236 bad:
1237         if (V_pf_status.debug >= PF_DEBUG_MISC)
1238                 printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
1239                     "invalid value\n");
1240         V_pfsyncstats.pfsyncs_badstate++;
1241         return;
1242 }
1243 #endif
1244
1245
1246 static int
1247 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1248 {
1249         /* check if we are at the right place in the packet */
1250         if (offset != m->m_pkthdr.len)
1251                 V_pfsyncstats.pfsyncs_badlen++;
1252
1253         /* we're done. free and let the caller return */
1254         m_freem(m);
1255         return (-1);
1256 }
1257
1258 static int
1259 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1260 {
1261         V_pfsyncstats.pfsyncs_badact++;
1262
1263         m_freem(m);
1264         return (-1);
1265 }
1266
/*
 * if_output method.  pfsync builds and sends its own packets; anything
 * handed to us through the generic output path is silently discarded.
 */
static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
	struct route *rt)
{
	m_freem(m);
	return (0);
}
1274
1275 /* ARGSUSED */
1276 static int
1277 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1278 {
1279         struct pfsync_softc *sc = ifp->if_softc;
1280         struct ifreq *ifr = (struct ifreq *)data;
1281         struct pfsyncreq pfsyncr;
1282         int error;
1283
1284         switch (cmd) {
1285         case SIOCSIFFLAGS:
1286                 PFSYNC_LOCK(sc);
1287                 if (ifp->if_flags & IFF_UP) {
1288                         ifp->if_drv_flags |= IFF_DRV_RUNNING;
1289                         PFSYNC_UNLOCK(sc);
1290                         pfsync_pointers_init();
1291                 } else {
1292                         ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1293                         PFSYNC_UNLOCK(sc);
1294                         pfsync_pointers_uninit();
1295                 }
1296                 break;
1297         case SIOCSIFMTU:
1298                 if (!sc->sc_sync_if ||
1299                     ifr->ifr_mtu <= PFSYNC_MINPKT ||
1300                     ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
1301                         return (EINVAL);
1302                 if (ifr->ifr_mtu < ifp->if_mtu) {
1303                         PFSYNC_LOCK(sc);
1304                         if (sc->sc_len > PFSYNC_MINPKT)
1305                                 pfsync_sendout(1);
1306                         PFSYNC_UNLOCK(sc);
1307                 }
1308                 ifp->if_mtu = ifr->ifr_mtu;
1309                 break;
1310         case SIOCGETPFSYNC:
1311                 bzero(&pfsyncr, sizeof(pfsyncr));
1312                 PFSYNC_LOCK(sc);
1313                 if (sc->sc_sync_if) {
1314                         strlcpy(pfsyncr.pfsyncr_syncdev,
1315                             sc->sc_sync_if->if_xname, IFNAMSIZ);
1316                 }
1317                 pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1318                 pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1319                 pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
1320                     (sc->sc_flags & PFSYNCF_DEFER));
1321                 PFSYNC_UNLOCK(sc);
1322                 return (copyout(&pfsyncr, ifr_data_get_ptr(ifr),
1323                     sizeof(pfsyncr)));
1324
1325         case SIOCSETPFSYNC:
1326             {
1327                 struct ip_moptions *imo = &sc->sc_imo;
1328                 struct ifnet *sifp;
1329                 struct ip *ip;
1330                 void *mship = NULL;
1331
1332                 if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
1333                         return (error);
1334                 if ((error = copyin(ifr_data_get_ptr(ifr), &pfsyncr,
1335                     sizeof(pfsyncr))))
1336                         return (error);
1337
1338                 if (pfsyncr.pfsyncr_maxupdates > 255)
1339                         return (EINVAL);
1340
1341                 if (pfsyncr.pfsyncr_syncdev[0] == 0)
1342                         sifp = NULL;
1343                 else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
1344                         return (EINVAL);
1345
1346                 if (sifp != NULL && (
1347                     pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
1348                     pfsyncr.pfsyncr_syncpeer.s_addr ==
1349                     htonl(INADDR_PFSYNC_GROUP)))
1350                         mship = malloc((sizeof(struct in_multi *) *
1351                             IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
1352
1353                 PFSYNC_LOCK(sc);
1354                 if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1355                         sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
1356                 else
1357                         sc->sc_sync_peer.s_addr =
1358                             pfsyncr.pfsyncr_syncpeer.s_addr;
1359
1360                 sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1361                 if (pfsyncr.pfsyncr_defer) {
1362                         sc->sc_flags |= PFSYNCF_DEFER;
1363                         pfsync_defer_ptr = pfsync_defer;
1364                 } else {
1365                         sc->sc_flags &= ~PFSYNCF_DEFER;
1366                         pfsync_defer_ptr = NULL;
1367                 }
1368
1369                 if (sifp == NULL) {
1370                         if (sc->sc_sync_if)
1371                                 if_rele(sc->sc_sync_if);
1372                         sc->sc_sync_if = NULL;
1373                         if (imo->imo_membership)
1374                                 pfsync_multicast_cleanup(sc);
1375                         PFSYNC_UNLOCK(sc);
1376                         break;
1377                 }
1378
1379                 if (sc->sc_len > PFSYNC_MINPKT &&
1380                     (sifp->if_mtu < sc->sc_ifp->if_mtu ||
1381                     (sc->sc_sync_if != NULL &&
1382                     sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
1383                     sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
1384                         pfsync_sendout(1);
1385
1386                 if (imo->imo_membership)
1387                         pfsync_multicast_cleanup(sc);
1388
1389                 if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
1390                         error = pfsync_multicast_setup(sc, sifp, mship);
1391                         if (error) {
1392                                 if_rele(sifp);
1393                                 free(mship, M_PFSYNC);
1394                                 return (error);
1395                         }
1396                 }
1397                 if (sc->sc_sync_if)
1398                         if_rele(sc->sc_sync_if);
1399                 sc->sc_sync_if = sifp;
1400
1401                 ip = &sc->sc_template;
1402                 bzero(ip, sizeof(*ip));
1403                 ip->ip_v = IPVERSION;
1404                 ip->ip_hl = sizeof(sc->sc_template) >> 2;
1405                 ip->ip_tos = IPTOS_LOWDELAY;
1406                 /* len and id are set later. */
1407                 ip->ip_off = htons(IP_DF);
1408                 ip->ip_ttl = PFSYNC_DFLTTL;
1409                 ip->ip_p = IPPROTO_PFSYNC;
1410                 ip->ip_src.s_addr = INADDR_ANY;
1411                 ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1412
1413                 /* Request a full state table update. */
1414                 if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1415                         (*carp_demote_adj_p)(V_pfsync_carp_adj,
1416                             "pfsync bulk start");
1417                 sc->sc_flags &= ~PFSYNCF_OK;
1418                 if (V_pf_status.debug >= PF_DEBUG_MISC)
1419                         printf("pfsync: requesting bulk update\n");
1420                 pfsync_request_update(0, 0);
1421                 PFSYNC_UNLOCK(sc);
1422                 PFSYNC_BLOCK(sc);
1423                 sc->sc_ureq_sent = time_uptime;
1424                 callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
1425                     sc);
1426                 PFSYNC_BUNLOCK(sc);
1427
1428                 break;
1429             }
1430         default:
1431                 return (ENOTTY);
1432         }
1433
1434         return (0);
1435 }
1436
/* Queue-write method: serialize a full pfsync_state message into buf. */
static void
pfsync_out_state(struct pf_state *st, void *buf)
{
	struct pfsync_state *sp = buf;

	pfsync_state_export(sp, st);
}
1444
/* Queue-write method: serialize an insert-ack message into buf. */
static void
pfsync_out_iack(struct pf_state *st, void *buf)
{
	struct pfsync_ins_ack *iack = buf;

	iack->id = st->id;
	iack->creatorid = st->creatorid;
}
1453
1454 static void
1455 pfsync_out_upd_c(struct pf_state *st, void *buf)
1456 {
1457         struct pfsync_upd_c *up = buf;
1458
1459         bzero(up, sizeof(*up));
1460         up->id = st->id;
1461         pf_state_peer_hton(&st->src, &up->src);
1462         pf_state_peer_hton(&st->dst, &up->dst);
1463         up->creatorid = st->creatorid;
1464         up->timeout = st->timeout;
1465 }
1466
1467 static void
1468 pfsync_out_del(struct pf_state *st, void *buf)
1469 {
1470         struct pfsync_del_c *dp = buf;
1471
1472         dp->id = st->id;
1473         dp->creatorid = st->creatorid;
1474         st->state_flags |= PFSTATE_NOSYNC;
1475 }
1476
/*
 * Discard everything queued for transmission: release all queued
 * states, free pending update requests and reset the packet length to
 * an empty header.  Used when the packet cannot be sent at all (no
 * syncdev and no bpf(4) listener).
 */
static void
pfsync_drop(struct pfsync_softc *sc)
{
	struct pf_state *st, *next;
	struct pfsync_upd_req_item *ur;
	int q;

	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			st->sync_state = PFSYNC_S_NONE;
			/* Drop the reference the queue held on the state. */
			pf_release_state(st);
		}
		/* Empty the whole queue in one go. */
		TAILQ_INIT(&sc->sc_qs[q]);
	}

	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
		free(ur, M_PFSYNC);
	}

	sc->sc_plus = NULL;
	sc->sc_len = PFSYNC_MINPKT;
}
1506
/*
 * Build all queued pfsync messages into one packet and enqueue it on
 * the sync interface's send queue (and/or hand it to bpf(4)).  Called
 * with the softc lock held and with at least one message queued
 * (sc_len > PFSYNC_MINPKT).  If 'schedswi' is set, the transmit
 * software interrupt is scheduled to push the queue out.
 */
static void
pfsync_sendout(int schedswi)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct ifnet *ifp = sc->sc_ifp;
	struct mbuf *m;
	struct ip *ip;
	struct pfsync_header *ph;
	struct pfsync_subheader *subh;
	struct pf_state *st, *st_next;
	struct pfsync_upd_req_item *ur;
	int offset;
	int q, count = 0;

	KASSERT(sc != NULL, ("%s: null sc", __func__));
	KASSERT(sc->sc_len > PFSYNC_MINPKT,
	    ("%s: sc_len %zu", __func__, sc->sc_len));
	PFSYNC_LOCK_ASSERT(sc);

	/* Nowhere to send it: throw the queued work away. */
	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
		pfsync_drop(sc);
		return;
	}

	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL) {
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
		V_pfsyncstats.pfsyncs_onomem++;
		return;
	}
	m->m_data += max_linkhdr;
	m->m_len = m->m_pkthdr.len = sc->sc_len;

	/* build the ip header */
	ip = (struct ip *)m->m_data;
	bcopy(&sc->sc_template, ip, sizeof(*ip));
	offset = sizeof(*ip);

	ip->ip_len = htons(m->m_pkthdr.len);
	ip_fillid(ip);

	/* build the pfsync header */
	ph = (struct pfsync_header *)(m->m_data + offset);
	bzero(ph, sizeof(*ph));
	offset += sizeof(*ph);

	ph->version = PFSYNC_VERSION;
	ph->len = htons(sc->sc_len - sizeof(*ip));
	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);

	/* walk the queues */
	for (q = 0; q < PFSYNC_S_COUNT; q++) {
		if (TAILQ_EMPTY(&sc->sc_qs[q]))
			continue;

		/* One subheader per non-empty queue, count patched below. */
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, st_next) {
			KASSERT(st->sync_state == q,
				("%s: st->sync_state == q",
					__func__));
			/*
			 * XXXGL: some of write methods do unlocked reads
			 * of state data :(
			 */
			pfsync_qs[q].write(st, m->m_data + offset);
			offset += pfsync_qs[q].len;
			st->sync_state = PFSYNC_S_NONE;
			/* Drop the reference the queue held on the state. */
			pf_release_state(st);
			count++;
		}
		TAILQ_INIT(&sc->sc_qs[q]);

		bzero(subh, sizeof(*subh));
		subh->action = pfsync_qs[q].action;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
	}

	/* Append any queued update requests. */
	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
		subh = (struct pfsync_subheader *)(m->m_data + offset);
		offset += sizeof(*subh);

		count = 0;
		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);

			bcopy(&ur->ur_msg, m->m_data + offset,
			    sizeof(ur->ur_msg));
			offset += sizeof(ur->ur_msg);
			free(ur, M_PFSYNC);
			count++;
		}

		bzero(subh, sizeof(*subh));
		subh->action = PFSYNC_ACT_UPD_REQ;
		subh->count = htons(count);
		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
	}

	/* has someone built a custom region for us to add? */
	if (sc->sc_plus != NULL) {
		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
		offset += sc->sc_pluslen;

		sc->sc_plus = NULL;
	}

	/* Terminate the packet with an EOF subheader. */
	subh = (struct pfsync_subheader *)(m->m_data + offset);
	offset += sizeof(*subh);

	bzero(subh, sizeof(*subh));
	subh->action = PFSYNC_ACT_EOF;
	subh->count = htons(1);
	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;

	/* we're done, let's put it on the wire */
	if (ifp->if_bpf) {
		/* Tap the payload without the IP header, then restore. */
		m->m_data += sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
		BPF_MTAP(ifp, m);
		m->m_data -= sizeof(*ip);
		m->m_len = m->m_pkthdr.len = sc->sc_len;
	}

	if (sc->sc_sync_if == NULL) {
		/* bpf-only: nothing to transmit. */
		sc->sc_len = PFSYNC_MINPKT;
		m_freem(m);
		return;
	}

	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
	sc->sc_len = PFSYNC_MINPKT;

	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
	else {
		m_freem(m);
		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
	}
	if (schedswi)
		swi_sched(V_pfsync_swi_cookie, 0);
}
1653
/*
 * pf hook: a new state has been created.  Queue it for transmission to
 * the peer as an insert message, unless syncing is disabled for this
 * state or its rule.
 */
static void
pfsync_insert_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (st->state_flags & PFSTATE_NOSYNC)
		return;

	/*
	 * Never sync states created by a NOSYNC rule, or states of the
	 * pfsync protocol itself; mark them so later hooks skip them too.
	 */
	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
		st->state_flags |= PFSTATE_NOSYNC;
		return;
	}

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state %u", __func__, st->sync_state));

	PFSYNC_LOCK(sc);
	/* First data queued into an empty packet: arm the flush timer. */
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	pfsync_q_ins(st, PFSYNC_S_INS, true);
	PFSYNC_UNLOCK(sc);

	st->sync_updates = 0;
}
1680
1681 static int
1682 pfsync_defer(struct pf_state *st, struct mbuf *m)
1683 {
1684         struct pfsync_softc *sc = V_pfsyncif;
1685         struct pfsync_deferral *pd;
1686
1687         if (m->m_flags & (M_BCAST|M_MCAST))
1688                 return (0);
1689
1690         PFSYNC_LOCK(sc);
1691
1692         if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
1693             !(sc->sc_flags & PFSYNCF_DEFER)) {
1694                 PFSYNC_UNLOCK(sc);
1695                 return (0);
1696         }
1697
1698          if (sc->sc_deferred >= 128)
1699                 pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
1700
1701         pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
1702         if (pd == NULL)
1703                 return (0);
1704         sc->sc_deferred++;
1705
1706         m->m_flags |= M_SKIP_FIREWALL;
1707         st->state_flags |= PFSTATE_ACK;
1708
1709         pd->pd_sc = sc;
1710         pd->pd_refs = 0;
1711         pd->pd_st = st;
1712         pf_ref_state(st);
1713         pd->pd_m = m;
1714
1715         TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1716         callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1717         callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1718
1719         pfsync_push(sc);
1720
1721         return (1);
1722 }
1723
/*
 * Remove a deferral entry and either drop its held packet or queue it
 * for transmission on the pfsync interface.  Caller holds the softc
 * lock and must already have stopped pd's timeout callout.
 */
static void
pfsync_undefer(struct pfsync_deferral *pd, int drop)
{
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	free(pd, M_PFSYNC);
	pf_release_state(st);

	if (drop)
		m_freem(m);
	else {
		/* Hand the packet to pfsyncintr() for transmission. */
		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
		pfsync_push(sc);
	}
}
1746
/*
 * Deferral timeout: the peer never acknowledged the state in time, so
 * stop waiting and transmit the deferred packet ourselves.
 *
 * Runs with the softc mutex held (callout_init_mtx) and returns with
 * it released (CALLOUT_RETURNUNLOCKED).
 */
static void
pfsync_defer_tmo(void *arg)
{
	struct pfsync_deferral *pd = arg;
	struct pfsync_softc *sc = pd->pd_sc;
	struct mbuf *m = pd->pd_m;
	struct pf_state *st = pd->pd_st;

	PFSYNC_LOCK_ASSERT(sc);

	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);

	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
	sc->sc_deferred--;
	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
	/* Only free the entry if nobody else still holds a reference. */
	if (pd->pd_refs == 0)
		free(pd, M_PFSYNC);
	PFSYNC_UNLOCK(sc);

	/* Send the packet that the state creation had been holding back. */
	ip_output(m, NULL, NULL, 0, NULL, NULL);

	pf_release_state(st);

	CURVNET_RESTORE();
}
1772
1773 static void
1774 pfsync_undefer_state(struct pf_state *st, int drop)
1775 {
1776         struct pfsync_softc *sc = V_pfsyncif;
1777         struct pfsync_deferral *pd;
1778
1779         PFSYNC_LOCK_ASSERT(sc);
1780
1781         TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
1782                  if (pd->pd_st == st) {
1783                         if (callout_stop(&pd->pd_tmo) > 0)
1784                                 pfsync_undefer(pd, drop);
1785                         return;
1786                 }
1787         }
1788
1789         panic("%s: unable to find deferred state", __func__);
1790 }
1791
/*
 * pf hook: a state has been updated.  Schedule a (compressed) update
 * message depending on what is already queued for this state, and push
 * immediately when the state is being updated at a high rate.
 */
static void
pfsync_update_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	bool sync = false, ref = true;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	/* An ack from the peer is moot now: release the deferred packet. */
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 0);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st, true);
		PFSYNC_UNLOCK(sc);
		return;
	}

	/* First data queued into an empty packet: arm the flush timer. */
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_INS:
		/* we're already handling it */

		/* For TCP, force a push after sc_maxupdates piggybacks. */
		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
			st->sync_updates++;
			if (st->sync_updates >= sc->sc_maxupdates)
				sync = true;
		}
		break;

	case PFSYNC_S_IACK:
		/* Replace the queued ack; keep its state reference. */
		pfsync_q_del(st, false);
		ref = false;
		/* FALLTHROUGH */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD_C, ref);
		st->sync_updates = 0;
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	/* Push if forced, or if this state was last synced <2s ago. */
	if (sync || (time_uptime - st->pfsync_time) < 2)
		pfsync_push(sc);

	PFSYNC_UNLOCK(sc);
}
1845
/*
 * Queue an update-request message asking the peer to send us a full
 * update for the state identified by (creatorid, id).
 */
static void
pfsync_request_update(u_int32_t creatorid, u_int64_t id)
{
	struct pfsync_softc *sc = V_pfsyncif;
	struct pfsync_upd_req_item *item;
	size_t nlen = sizeof(struct pfsync_upd_req);

	PFSYNC_LOCK_ASSERT(sc);

	/*
	 * This code does a bit to prevent multiple update requests for the
	 * same state being generated. It searches current subheader queue,
	 * but it doesn't lookup into queue of already packed datagrams.
	 */
	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
		if (item->ur_msg.id == id &&
		    item->ur_msg.creatorid == creatorid)
			return;

	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
	if (item == NULL)
		return; /* XXX stats */

	item->ur_msg.id = id;
	item->ur_msg.creatorid = creatorid;

	/* The first request in the queue also costs a subheader. */
	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
		nlen += sizeof(struct pfsync_subheader);

	/* No room in the packet being built: flush and start a new one. */
	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		nlen = sizeof(struct pfsync_subheader) +
		    sizeof(struct pfsync_upd_req);
	}

	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
	sc->sc_len += nlen;
}
1885
/*
 * Queue a full (uncompressed) update for a state, e.g. in answer to a
 * peer's update request or during a bulk transfer, and push it.
 */
static void
pfsync_update_state_req(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	bool ref = true;

	PF_STATE_LOCK_ASSERT(st);
	PFSYNC_LOCK(sc);

	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st, true);
		PFSYNC_UNLOCK(sc);
		return;
	}

	switch (st->sync_state) {
	case PFSYNC_S_UPD_C:
	case PFSYNC_S_IACK:
		/* Upgrade to a full update; keep the state reference. */
		pfsync_q_del(st, false);
		ref = false;
		/* FALLTHROUGH */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
		pfsync_push(sc);
		break;

	case PFSYNC_S_INS:
	case PFSYNC_S_UPD:
	case PFSYNC_S_DEL:
		/* we're already handling it */
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	PFSYNC_UNLOCK(sc);
}
1926
/*
 * pf hook: a state is being removed.  Queue a delete message for the
 * peer, or quietly forget states that were never announced.
 */
static void
pfsync_delete_state(struct pf_state *st)
{
	struct pfsync_softc *sc = V_pfsyncif;
	bool ref = true;

	PFSYNC_LOCK(sc);
	/* A deferred creation packet is pointless now: drop it. */
	if (st->state_flags & PFSTATE_ACK)
		pfsync_undefer_state(st, 1);
	if (st->state_flags & PFSTATE_NOSYNC) {
		if (st->sync_state != PFSYNC_S_NONE)
			pfsync_q_del(st, true);
		PFSYNC_UNLOCK(sc);
		return;
	}

	/* First data queued into an empty packet: arm the flush timer. */
	if (sc->sc_len == PFSYNC_MINPKT)
		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);

	switch (st->sync_state) {
	case PFSYNC_S_INS:
		/* We never got to tell the world so just forget about it. */
		pfsync_q_del(st, true);
		break;

	case PFSYNC_S_UPD_C:
	case PFSYNC_S_UPD:
	case PFSYNC_S_IACK:
		/* Replace the pending message; keep its state reference. */
		pfsync_q_del(st, false);
		ref = false;
		/* FALLTHROUGH */

	case PFSYNC_S_NONE:
		pfsync_q_ins(st, PFSYNC_S_DEL, ref);
		break;

	default:
		panic("%s: unexpected sync state %d", __func__, st->sync_state);
	}

	PFSYNC_UNLOCK(sc);
}
1969
1970 static void
1971 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1972 {
1973         struct pfsync_softc *sc = V_pfsyncif;
1974         struct {
1975                 struct pfsync_subheader subh;
1976                 struct pfsync_clr clr;
1977         } __packed r;
1978
1979         bzero(&r, sizeof(r));
1980
1981         r.subh.action = PFSYNC_ACT_CLR;
1982         r.subh.count = htons(1);
1983         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
1984
1985         strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
1986         r.clr.creatorid = creatorid;
1987
1988         PFSYNC_LOCK(sc);
1989         pfsync_send_plus(&r, sizeof(r));
1990         PFSYNC_UNLOCK(sc);
1991 }
1992
/*
 * Append a state to action queue 'q' of the packet being built,
 * accounting for its on-wire length (plus a subheader when the queue
 * was empty).  Flushes the current packet first if it would not fit.
 * Takes a state reference when 'ref' is true.
 */
static void
pfsync_q_ins(struct pf_state *st, int q, bool ref)
{
	struct pfsync_softc *sc = V_pfsyncif;
	size_t nlen = pfsync_qs[q].len;

	PFSYNC_LOCK_ASSERT(sc);

	KASSERT(st->sync_state == PFSYNC_S_NONE,
		("%s: st->sync_state %u", __func__, st->sync_state));
	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
	    sc->sc_len));

	/* First entry in this queue also costs a subheader. */
	if (TAILQ_EMPTY(&sc->sc_qs[q]))
		nlen += sizeof(struct pfsync_subheader);

	/* No room in the packet being built: flush and start a new one. */
	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
		pfsync_sendout(1);

		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
	}

	sc->sc_len += nlen;
	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
	st->sync_state = q;
	if (ref)
		pf_ref_state(st);
}
2021
2022 static void
2023 pfsync_q_del(struct pf_state *st, bool unref)
2024 {
2025         struct pfsync_softc *sc = V_pfsyncif;
2026         int q = st->sync_state;
2027
2028         PFSYNC_LOCK_ASSERT(sc);
2029         KASSERT(st->sync_state != PFSYNC_S_NONE,
2030                 ("%s: st->sync_state != PFSYNC_S_NONE", __func__));
2031
2032         sc->sc_len -= pfsync_qs[q].len;
2033         TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2034         st->sync_state = PFSYNC_S_NONE;
2035         if (unref)
2036                 pf_release_state(st);
2037
2038         if (TAILQ_EMPTY(&sc->sc_qs[q]))
2039                 sc->sc_len -= sizeof(struct pfsync_subheader);
2040 }
2041
/*
 * Start answering a bulk update request from the peer: announce the
 * start, reset the resumable walk position, and kick off the first
 * pfsync_bulk_update() pass.
 */
static void
pfsync_bulk_start(void)
{
	struct pfsync_softc *sc = V_pfsyncif;

	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync: received bulk update request\n");

	PFSYNC_BLOCK(sc);

	sc->sc_ureq_received = time_uptime;
	/* Restart the state-table walk from the first hash slot. */
	sc->sc_bulk_hashid = 0;
	sc->sc_bulk_stateid = 0;
	pfsync_bulk_status(PFSYNC_BUS_START);
	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
	PFSYNC_BUNLOCK(sc);
}
2059
/*
 * Bulk update callout: walk the state table and queue an update for
 * every eligible state.  The walk is resumable: when the packet fills
 * up we record our position and reschedule ourselves.
 */
static void
pfsync_bulk_update(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct pf_state *s;
	int i, sent = 0;

	PFSYNC_BLOCK_ASSERT(sc);
	CURVNET_SET(sc->sc_ifp->if_vnet);

	/*
	 * Start with last state from previous invocation.
	 * It may had gone, in this case start from the
	 * hash slot.
	 */
	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);

	if (s != NULL)
		i = PF_IDHASH(s);
	else
		i = sc->sc_bulk_hashid;

	for (; i <= pf_hashmask; i++) {
		struct pf_idhash *ih = &V_pf_idhash[i];

		/* pf_find_state_byid() returned with the row locked. */
		if (s != NULL)
			PF_HASHROW_ASSERT(ih);
		else {
			PF_HASHROW_LOCK(ih);
			s = LIST_FIRST(&ih->states);
		}

		for (; s; s = LIST_NEXT(s, entry)) {

			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
			    sizeof(struct pfsync_state)) {
				/* We've filled a packet. */
				sc->sc_bulk_hashid = i;
				sc->sc_bulk_stateid = s->id;
				sc->sc_bulk_creatorid = s->creatorid;
				PF_HASHROW_UNLOCK(ih);
				callout_reset(&sc->sc_bulk_tmo, 1,
				    pfsync_bulk_update, sc);
				goto full;
			}

			/* Skip states created after the bulk request. */
			if (s->sync_state == PFSYNC_S_NONE &&
			    s->timeout < PFTM_MAX &&
			    s->pfsync_time <= sc->sc_ureq_received) {
				pfsync_update_state_req(s);
				sent++;
			}
		}
		PF_HASHROW_UNLOCK(ih);
	}

	/* We're done. */
	pfsync_bulk_status(PFSYNC_BUS_END);

full:
	CURVNET_RESTORE();
}
2122
2123 static void
2124 pfsync_bulk_status(u_int8_t status)
2125 {
2126         struct {
2127                 struct pfsync_subheader subh;
2128                 struct pfsync_bus bus;
2129         } __packed r;
2130
2131         struct pfsync_softc *sc = V_pfsyncif;
2132
2133         bzero(&r, sizeof(r));
2134
2135         r.subh.action = PFSYNC_ACT_BUS;
2136         r.subh.count = htons(1);
2137         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
2138
2139         r.bus.creatorid = V_pf_status.hostid;
2140         r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2141         r.bus.status = status;
2142
2143         PFSYNC_LOCK(sc);
2144         pfsync_send_plus(&r, sizeof(r));
2145         PFSYNC_UNLOCK(sc);
2146 }
2147
/*
 * Bulk-fail timer: the bulk update we requested did not complete in
 * time.  Retry up to PFSYNC_MAX_BULKTRIES, then give up and declare
 * ourselves OK anyway (undoing the carp demotion).
 */
static void
pfsync_bulk_fail(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_BLOCK_ASSERT(sc);

	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
		/* Try again */
		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
		    pfsync_bulk_fail, V_pfsyncif);
		PFSYNC_LOCK(sc);
		/* id 0 / creator 0: presumably "send everything" — see
		 * the peer's update-request handling to confirm. */
		pfsync_request_update(0, 0);
		PFSYNC_UNLOCK(sc);
	} else {
		/* Pretend like the transfer was ok. */
		sc->sc_ureq_sent = 0;
		sc->sc_bulk_tries = 0;
		PFSYNC_LOCK(sc);
		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
			    "pfsync bulk fail");
		sc->sc_flags |= PFSYNCF_OK;
		PFSYNC_UNLOCK(sc);
		if (V_pf_status.debug >= PF_DEBUG_MISC)
			printf("pfsync: failed to receive bulk update\n");
	}

	CURVNET_RESTORE();
}
2180
2181 static void
2182 pfsync_send_plus(void *plus, size_t pluslen)
2183 {
2184         struct pfsync_softc *sc = V_pfsyncif;
2185
2186         PFSYNC_LOCK_ASSERT(sc);
2187
2188         if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
2189                 pfsync_sendout(1);
2190
2191         sc->sc_plus = plus;
2192         sc->sc_len += (sc->sc_pluslen = pluslen);
2193
2194         pfsync_sendout(1);
2195 }
2196
/*
 * Flush timer: armed when the first data is queued into an empty
 * packet; requests a push of whatever has accumulated since.
 */
static void
pfsync_timeout(void *arg)
{
	struct pfsync_softc *sc = arg;

	CURVNET_SET(sc->sc_ifp->if_vnet);
	PFSYNC_LOCK(sc);
	pfsync_push(sc);
	PFSYNC_UNLOCK(sc);
	CURVNET_RESTORE();
}
2208
/*
 * Request a packet flush: mark the softc and schedule the pfsync
 * software interrupt, which performs the actual sendout.
 */
static void
pfsync_push(struct pfsync_softc *sc)
{

	PFSYNC_LOCK_ASSERT(sc);

	sc->sc_flags |= PFSYNCF_PUSH;
	swi_sched(V_pfsync_swi_cookie, 0);
}
2218
/*
 * Software interrupt handler: flush the packet under construction if a
 * push was requested, then transmit everything queued on the pfsync
 * interface's send queue.
 */
static void
pfsyncintr(void *arg)
{
	struct pfsync_softc *sc = arg;
	struct mbuf *m, *n;

	CURVNET_SET(sc->sc_ifp->if_vnet);

	PFSYNC_LOCK(sc);
	if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
		pfsync_sendout(0);
		sc->sc_flags &= ~PFSYNCF_PUSH;
	}
	/* Grab the whole queue in one go; transmit outside the lock. */
	_IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
	PFSYNC_UNLOCK(sc);

	for (; m != NULL; m = n) {

		n = m->m_nextpkt;
		m->m_nextpkt = NULL;

		/*
		 * We distinguish between a deferral packet and our
		 * own pfsync packet based on M_SKIP_FIREWALL
		 * flag. This is XXX.
		 */
		if (m->m_flags & M_SKIP_FIREWALL)
			ip_output(m, NULL, NULL, 0, NULL, NULL);
		else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
		    NULL) == 0)
			V_pfsyncstats.pfsyncs_opackets++;
		else
			V_pfsyncstats.pfsyncs_oerrors++;
	}
	CURVNET_RESTORE();
}
2255
/*
 * Join the pfsync peer's multicast group on the sync interface and set
 * up the IP multicast options used for transmission.  'mship' is a
 * caller-allocated membership array; on success it is owned by the
 * softc and later released by pfsync_multicast_cleanup().
 */
static int
pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
{
	struct ip_moptions *imo = &sc->sc_imo;
	int error;

	if (!(ifp->if_flags & IFF_MULTICAST))
		return (EADDRNOTAVAIL);

	imo->imo_membership = (struct in_multi **)mship;
	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
	imo->imo_multicast_vif = -1;

	if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
	    &imo->imo_membership[0])) != 0) {
		imo->imo_membership = NULL;
		return (error);
	}
	imo->imo_num_memberships++;
	imo->imo_multicast_ifp = ifp;
	imo->imo_multicast_ttl = PFSYNC_DFLTTL;
	/* Don't loop our own updates back to ourselves. */
	imo->imo_multicast_loop = 0;

	return (0);
}
2281
/*
 * Leave the pfsync multicast group and release the membership array
 * taken over in pfsync_multicast_setup().
 */
static void
pfsync_multicast_cleanup(struct pfsync_softc *sc)
{
	struct ip_moptions *imo = &sc->sc_imo;

	in_leavegroup(imo->imo_membership[0], NULL);
	free(imo->imo_membership, M_PFSYNC);
	imo->imo_membership = NULL;
	imo->imo_multicast_ifp = NULL;
}
2292
#ifdef INET
extern  struct domain inetdomain;
/* Raw-socket protocol switch entry for IPPROTO_PFSYNC over IPv4. */
static struct protosw in_pfsync_protosw = {
	.pr_type =              SOCK_RAW,
	.pr_domain =            &inetdomain,
	.pr_protocol =          IPPROTO_PFSYNC,
	.pr_flags =             PR_ATOMIC|PR_ADDR,
	.pr_input =             pfsync_input,
	.pr_output =            rip_output,
	.pr_ctloutput =         rip_ctloutput,
	.pr_usrreqs =           &rip_usrreqs
};
#endif
2306
2307 static void
2308 pfsync_pointers_init()
2309 {
2310
2311         PF_RULES_WLOCK();
2312         pfsync_state_import_ptr = pfsync_state_import;
2313         pfsync_insert_state_ptr = pfsync_insert_state;
2314         pfsync_update_state_ptr = pfsync_update_state;
2315         pfsync_delete_state_ptr = pfsync_delete_state;
2316         pfsync_clear_states_ptr = pfsync_clear_states;
2317         pfsync_defer_ptr = pfsync_defer;
2318         PF_RULES_WUNLOCK();
2319 }
2320
2321 static void
2322 pfsync_pointers_uninit()
2323 {
2324
2325         PF_RULES_WLOCK();
2326         pfsync_state_import_ptr = NULL;
2327         pfsync_insert_state_ptr = NULL;
2328         pfsync_update_state_ptr = NULL;
2329         pfsync_delete_state_ptr = NULL;
2330         pfsync_clear_states_ptr = NULL;
2331         pfsync_defer_ptr = NULL;
2332         PF_RULES_WUNLOCK();
2333 }
2334
/*
 * Per-VNET initialization: register the "pfsync" interface cloner and
 * the software interrupt that drains the transmit queue.
 */
static void
vnet_pfsync_init(const void *unused __unused)
{
	int error;

	V_pfsync_cloner = if_clone_simple(pfsyncname,
	    pfsync_clone_create, pfsync_clone_destroy, 1);
	error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
	    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
	if (error) {
		/* Undo the cloner registration; pfsync stays unusable. */
		if_clone_detach(V_pfsync_cloner);
		log(LOG_INFO, "swi_add() failed in %s\n", __func__);
	}
}
VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
    vnet_pfsync_init, NULL);
2351
/*
 * Per-VNET teardown: destroy cloned interfaces and remove the swi.
 */
static void
vnet_pfsync_uninit(const void *unused __unused)
{

	if_clone_detach(V_pfsync_cloner);
	swi_remove(V_pfsync_swi_cookie);
}
/*
 * Detach after pf is gone; otherwise we might touch pfsync memory
 * from within pf after freeing pfsync.
 */
VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
    vnet_pfsync_uninit, NULL);
2365
/*
 * Module-load initialization: register the pfsync protocol with the
 * inet stack and publish the pf(4) hook pointers.  Returns 0 on
 * success or an errno from the protocol registration.
 */
static int
pfsync_init(void)
{
#ifdef INET
	int error;

	error = pf_proto_register(PF_INET, &in_pfsync_protosw);
	if (error)
		return (error);
	error = ipproto_register(IPPROTO_PFSYNC);
	if (error) {
		/* Roll back the protosw registration on partial failure. */
		pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
		return (error);
	}
#endif
	pfsync_pointers_init();

	return (0);
}
2385
/*
 * Module-unload teardown: unhook from pf(4) first so no new calls
 * arrive, then unregister the protocol from the inet stack.
 */
static void
pfsync_uninit(void)
{

	pfsync_pointers_uninit();

#ifdef INET
	ipproto_unregister(IPPROTO_PFSYNC);
	pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
#endif
}
2397
2398 static int
2399 pfsync_modevent(module_t mod, int type, void *data)
2400 {
2401         int error = 0;
2402
2403         switch (type) {
2404         case MOD_LOAD:
2405                 error = pfsync_init();
2406                 break;
2407         case MOD_QUIESCE:
2408                 /*
2409                  * Module should not be unloaded due to race conditions.
2410                  */
2411                 error = EBUSY;
2412                 break;
2413         case MOD_UNLOAD:
2414                 pfsync_uninit();
2415                 break;
2416         default:
2417                 error = EINVAL;
2418                 break;
2419         }
2420
2421         return (error);
2422 }
2423
/* Module glue: name, event handler, no extra argument. */
static moduledata_t pfsync_mod = {
	pfsyncname,
	pfsync_modevent,
	0
};

#define PFSYNC_MODVER 1

/* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
MODULE_VERSION(pfsync, PFSYNC_MODVER);
MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);