/*-
 * Copyright (c) 2002 Michael Shalayeff
 * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
 *
 * Revisions picked from OpenBSD after revision 1.110 import:
 * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
 * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
 * 1.120, 1.175 - use monotonic time_uptime
 * 1.122 - reduce number of updates for non-TCP sessions
 * 1.125, 1.127 - rewrite merge or stale processing
 * 1.128 - cleanups
 * 1.146 - bzero() mbuf before sparsely filling it with data
 * 1.170 - SIOCSIFMTU checks
 * 1.126, 1.142 - deferred packets processing
 * 1.173 - correct expire time processing
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_pf.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/endian.h>
#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <net/bpf.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_clone.h>
#include <net/if_types.h>
#include <net/vnet.h>
#include <net/pfvar.h>
#include <net/if_pfsync.h>

#include <netinet/if_ether.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_carp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>

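/*
 * The smallest packet pfsync will ever build: an IP header, the pfsync
 * header and a single subheader.  sc_len starts at this value and grows
 * as messages are queued for transmission.
 */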
#define PFSYNC_MINPKT ( \
        sizeof(struct ip) + \
        sizeof(struct pfsync_header) + \
        sizeof(struct pfsync_subheader) )

struct pfsync_pkt {
        struct ip *ip;
        struct in_addr src;
        u_int8_t flags;
};

static int      pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
                    struct pfsync_state_peer *);
static int      pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
static int      pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);

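/*
 * Input dispatch table, indexed by the action field of each subheader.
 * A handler returns the number of bytes it consumed from the packet, or
 * -1 when the mbuf is no longer valid and processing must stop.
 */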
static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
        pfsync_in_clr,                  /* PFSYNC_ACT_CLR */
        pfsync_in_ins,                  /* PFSYNC_ACT_INS */
        pfsync_in_iack,                 /* PFSYNC_ACT_INS_ACK */
        pfsync_in_upd,                  /* PFSYNC_ACT_UPD */
        pfsync_in_upd_c,                /* PFSYNC_ACT_UPD_C */
        pfsync_in_ureq,                 /* PFSYNC_ACT_UPD_REQ */
        pfsync_in_del,                  /* PFSYNC_ACT_DEL */
        pfsync_in_del_c,                /* PFSYNC_ACT_DEL_C */
        pfsync_in_error,                /* PFSYNC_ACT_INS_F */
        pfsync_in_error,                /* PFSYNC_ACT_DEL_F */
        pfsync_in_bus,                  /* PFSYNC_ACT_BUS */
        pfsync_in_tdb,                  /* PFSYNC_ACT_TDB */
        pfsync_in_eof                   /* PFSYNC_ACT_EOF */
};

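/*
 * Descriptor for each output queue: the routine that serializes a
 * queued state, the on-wire size of one such message, and the action
 * code placed in the subheader by pfsync_sendout().
 */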
struct pfsync_q {
        void            (*write)(struct pf_state *, void *);
        size_t          len;
        u_int8_t        action;
};

/* we have one of these for every PFSYNC_S_ */
static void     pfsync_out_state(struct pf_state *, void *);
static void     pfsync_out_iack(struct pf_state *, void *);
static void     pfsync_out_upd_c(struct pf_state *, void *);
static void     pfsync_out_del(struct pf_state *, void *);

static struct pfsync_q pfsync_qs[] = {
        { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
        { pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
        { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
        { pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
        { pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
};

static void     pfsync_q_ins(struct pf_state *, int, bool);
static void     pfsync_q_del(struct pf_state *, bool);

static void     pfsync_update_state(struct pf_state *);

struct pfsync_upd_req_item {
        TAILQ_ENTRY(pfsync_upd_req_item)        ur_entry;
        struct pfsync_upd_req                   ur_msg;
};

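/*
 * A deferred packet: when deferral is enabled (PFSYNCF_DEFER), the
 * packet that created a state is held here (pd_m) until the peer
 * acknowledges the state insertion or the pd_tmo callout expires,
 * whichever comes first.
 */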
struct pfsync_deferral {
        struct pfsync_softc             *pd_sc;
        TAILQ_ENTRY(pfsync_deferral)    pd_entry;
        u_int                           pd_refs;
        struct callout                  pd_tmo;

        struct pf_state                 *pd_st;
        struct mbuf                     *pd_m;
};

struct pfsync_softc {
        /* Configuration */
        struct ifnet            *sc_ifp;
        struct ifnet            *sc_sync_if;
        struct ip_moptions      sc_imo;
        struct in_addr          sc_sync_peer;
        uint32_t                sc_flags;
#define PFSYNCF_OK              0x00000001
#define PFSYNCF_DEFER           0x00000002
#define PFSYNCF_PUSH            0x00000004
        uint8_t                 sc_maxupdates;
        struct ip               sc_template;
        struct callout          sc_tmo;
        struct mtx              sc_mtx;

        /* Queued data */
        size_t                  sc_len;
        TAILQ_HEAD(, pf_state)                  sc_qs[PFSYNC_S_COUNT];
        TAILQ_HEAD(, pfsync_upd_req_item)       sc_upd_req_list;
        TAILQ_HEAD(, pfsync_deferral)           sc_deferrals;
        u_int                   sc_deferred;
        void                    *sc_plus;
        size_t                  sc_pluslen;

        /* Bulk update info */
        struct mtx              sc_bulk_mtx;
        uint32_t                sc_ureq_sent;
        int                     sc_bulk_tries;
        uint32_t                sc_ureq_received;
        int                     sc_bulk_hashid;
        uint64_t                sc_bulk_stateid;
        uint32_t                sc_bulk_creatorid;
        struct callout          sc_bulk_tmo;
        struct callout          sc_bulkfail_tmo;
};

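/*
 * Two mutexes: sc_mtx protects the interface configuration and the
 * queued data, while sc_bulk_mtx serializes the bulk update state
 * machine.
 */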
#define PFSYNC_LOCK(sc)         mtx_lock(&(sc)->sc_mtx)
#define PFSYNC_UNLOCK(sc)       mtx_unlock(&(sc)->sc_mtx)
#define PFSYNC_LOCK_ASSERT(sc)  mtx_assert(&(sc)->sc_mtx, MA_OWNED)

#define PFSYNC_BLOCK(sc)        mtx_lock(&(sc)->sc_bulk_mtx)
#define PFSYNC_BUNLOCK(sc)      mtx_unlock(&(sc)->sc_bulk_mtx)
#define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)

static const char pfsyncname[] = "pfsync";
static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
static VNET_DEFINE(struct pfsync_softc  *, pfsyncif) = NULL;
#define V_pfsyncif              VNET(pfsyncif)
static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
#define V_pfsync_swi_cookie     VNET(pfsync_swi_cookie)
static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
#define V_pfsyncstats           VNET(pfsyncstats)
static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
#define V_pfsync_carp_adj       VNET(pfsync_carp_adj)

static void     pfsync_timeout(void *);
static void     pfsync_push(struct pfsync_softc *);
static void     pfsyncintr(void *);
static int      pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
                    void *);
static void     pfsync_multicast_cleanup(struct pfsync_softc *);
static void     pfsync_pointers_init(void);
static void     pfsync_pointers_uninit(void);
static int      pfsync_init(void);
static void     pfsync_uninit(void);

SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(pfsyncstats), pfsyncstats,
    "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
    &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");

static int      pfsync_clone_create(struct if_clone *, int, caddr_t);
static void     pfsync_clone_destroy(struct ifnet *);
static int      pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
                    struct pf_state_peer *);
static int      pfsyncoutput(struct ifnet *, struct mbuf *,
                    const struct sockaddr *, struct route *);
static int      pfsyncioctl(struct ifnet *, u_long, caddr_t);

static int      pfsync_defer(struct pf_state *, struct mbuf *);
static void     pfsync_undefer(struct pfsync_deferral *, int);
static void     pfsync_undefer_state(struct pf_state *, int);
static void     pfsync_defer_tmo(void *);

static void     pfsync_request_update(u_int32_t, u_int64_t);
static void     pfsync_update_state_req(struct pf_state *);

static void     pfsync_drop(struct pfsync_softc *);
static void     pfsync_sendout(int);
static void     pfsync_send_plus(void *, size_t);

static void     pfsync_bulk_start(void);
static void     pfsync_bulk_status(u_int8_t);
static void     pfsync_bulk_update(void *);
static void     pfsync_bulk_fail(void *);

#ifdef IPSEC
static void     pfsync_update_net_tdb(struct pfsync_tdb *);
#endif

#define PFSYNC_MAX_BULKTRIES    12

VNET_DEFINE(struct if_clone *, pfsync_cloner);
#define V_pfsync_cloner VNET(pfsync_cloner)

static int
pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
{
        struct pfsync_softc *sc;
        struct ifnet *ifp;
        int q;

        if (unit != 0)
                return (EINVAL);

        sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
        sc->sc_flags |= PFSYNCF_OK;

        for (q = 0; q < PFSYNC_S_COUNT; q++)
                TAILQ_INIT(&sc->sc_qs[q]);

        TAILQ_INIT(&sc->sc_upd_req_list);
        TAILQ_INIT(&sc->sc_deferrals);

        sc->sc_len = PFSYNC_MINPKT;
        sc->sc_maxupdates = 128;

        ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
        if (ifp == NULL) {
                free(sc, M_PFSYNC);
                return (ENOSPC);
        }
        if_initname(ifp, pfsyncname, unit);
        ifp->if_softc = sc;
        ifp->if_ioctl = pfsyncioctl;
        ifp->if_output = pfsyncoutput;
        ifp->if_type = IFT_PFSYNC;
        ifp->if_snd.ifq_maxlen = ifqmaxlen;
        ifp->if_hdrlen = sizeof(struct pfsync_header);
        ifp->if_mtu = ETHERMTU;
        mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
        mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
        callout_init(&sc->sc_tmo, 1);
        callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
        callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);

        if_attach(ifp);

        bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);

        V_pfsyncif = sc;

        return (0);
}

static void
pfsync_clone_destroy(struct ifnet *ifp)
{
        struct pfsync_softc *sc = ifp->if_softc;

        /*
         * At this stage, everything should have already been
         * cleared by pfsync_uninit(), and we have only to
         * drain callouts.
         */
        while (sc->sc_deferred > 0) {
                struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);

                TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
                sc->sc_deferred--;
                if (callout_stop(&pd->pd_tmo) > 0) {
                        pf_release_state(pd->pd_st);
                        m_freem(pd->pd_m);
                        free(pd, M_PFSYNC);
                } else {
                        pd->pd_refs++;
                        callout_drain(&pd->pd_tmo);
                        free(pd, M_PFSYNC);
                }
        }

        callout_drain(&sc->sc_tmo);
        callout_drain(&sc->sc_bulkfail_tmo);
        callout_drain(&sc->sc_bulk_tmo);

        if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
                (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
        bpfdetach(ifp);
        if_detach(ifp);

        pfsync_drop(sc);

        if_free(ifp);
        if (sc->sc_imo.imo_membership)
                pfsync_multicast_cleanup(sc);
        mtx_destroy(&sc->sc_mtx);
        mtx_destroy(&sc->sc_bulk_mtx);
        free(sc, M_PFSYNC);

        V_pfsyncif = NULL;
}

static int
pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
    struct pf_state_peer *d)
{
        if (s->scrub.scrub_flag && d->scrub == NULL) {
                d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
                if (d->scrub == NULL)
                        return (ENOMEM);
        }

        return (0);
}

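/*
 * Import a wire-format state (network byte order) into a freshly
 * allocated pf_state and insert it into the local state table.
 * Returns 0 on success, or an errno if the state is invalid or
 * memory allocation fails.
 */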
static int
pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
{
        struct pfsync_softc *sc = V_pfsyncif;
#ifndef __NO_STRICT_ALIGNMENT
        struct pfsync_state_key key[2];
#endif
        struct pfsync_state_key *kw, *ks;
        struct pf_state *st = NULL;
        struct pf_state_key *skw = NULL, *sks = NULL;
        struct pf_rule *r = NULL;
        struct pfi_kif  *kif;
        int error;

        PF_RULES_RASSERT();

        if (sp->creatorid == 0) {
                if (V_pf_status.debug >= PF_DEBUG_MISC)
                        printf("%s: invalid creator id: %08x\n", __func__,
                            ntohl(sp->creatorid));
                return (EINVAL);
        }

        if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
                if (V_pf_status.debug >= PF_DEBUG_MISC)
                        printf("%s: unknown interface: %s\n", __func__,
                            sp->ifname);
                if (flags & PFSYNC_SI_IOCTL)
                        return (EINVAL);
                return (0);     /* skip this state */
        }

        /*
         * If the ruleset checksums match or the state is coming from the ioctl,
         * it's safe to associate the state with the rule of that number.
         */
        if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
            (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
            pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
                r = pf_main_ruleset.rules[
                    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
        else
                r = &V_pf_default_rule;

        if ((r->max_states &&
            counter_u64_fetch(r->states_cur) >= r->max_states))
                goto cleanup;

        /*
         * XXXGL: consider M_WAITOK in ioctl path after.
         */
        if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
                goto cleanup;

        if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
                goto cleanup;

#ifndef __NO_STRICT_ALIGNMENT
        bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
        kw = &key[PF_SK_WIRE];
        ks = &key[PF_SK_STACK];
#else
        kw = &sp->key[PF_SK_WIRE];
        ks = &sp->key[PF_SK_STACK];
#endif

        if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
            PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
            kw->port[0] != ks->port[0] ||
            kw->port[1] != ks->port[1]) {
                sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
                if (sks == NULL)
                        goto cleanup;
        } else
                sks = skw;

        /* allocate memory for scrub info */
        if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
            pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
                goto cleanup;

        /* Copy to state key(s). */
        skw->addr[0] = kw->addr[0];
        skw->addr[1] = kw->addr[1];
        skw->port[0] = kw->port[0];
        skw->port[1] = kw->port[1];
        skw->proto = sp->proto;
        skw->af = sp->af;
        if (sks != skw) {
                sks->addr[0] = ks->addr[0];
                sks->addr[1] = ks->addr[1];
                sks->port[0] = ks->port[0];
                sks->port[1] = ks->port[1];
                sks->proto = sp->proto;
                sks->af = sp->af;
        }

        /* copy to state */
        bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
        st->creation = time_uptime - ntohl(sp->creation);
        st->expire = time_uptime;
        if (sp->expire) {
                uint32_t timeout;

                timeout = r->timeout[sp->timeout];
                if (!timeout)
                        timeout = V_pf_default_rule.timeout[sp->timeout];

                /* sp->expire may have been adaptively scaled by export. */
                st->expire -= timeout - ntohl(sp->expire);
        }

        st->direction = sp->direction;
        st->log = sp->log;
        st->timeout = sp->timeout;
        st->state_flags = sp->state_flags;

        st->id = sp->id;
        st->creatorid = sp->creatorid;
        pf_state_peer_ntoh(&sp->src, &st->src);
        pf_state_peer_ntoh(&sp->dst, &st->dst);

        st->rule.ptr = r;
        st->nat_rule.ptr = NULL;
        st->anchor.ptr = NULL;
        st->rt_kif = NULL;

        st->pfsync_time = time_uptime;
        st->sync_state = PFSYNC_S_NONE;

        if (!(flags & PFSYNC_SI_IOCTL))
                st->state_flags |= PFSTATE_NOSYNC;

        if ((error = pf_state_insert(kif, skw, sks, st)) != 0)
                goto cleanup_state;

        /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
        counter_u64_add(r->states_cur, 1);
        counter_u64_add(r->states_tot, 1);

        if (!(flags & PFSYNC_SI_IOCTL)) {
                st->state_flags &= ~PFSTATE_NOSYNC;
                if (st->state_flags & PFSTATE_ACK) {
                        pfsync_q_ins(st, PFSYNC_S_IACK, true);
                        pfsync_push(sc);
                }
        }
        st->state_flags &= ~PFSTATE_ACK;
        PF_STATE_UNLOCK(st);

        return (0);

cleanup:
        error = ENOMEM;
        if (skw == sks)
                sks = NULL;
        if (skw != NULL)
                uma_zfree(V_pf_state_key_z, skw);
        if (sks != NULL)
                uma_zfree(V_pf_state_key_z, sks);

cleanup_state:  /* pf_state_insert() frees the state keys. */
        if (st) {
                if (st->dst.scrub)
                        uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
                if (st->src.scrub)
                        uma_zfree(V_pf_state_scrub_z, st->src.scrub);
                uma_zfree(V_pf_state_z, st);
        }
        return (error);
}

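/*
 * IPPROTO_PFSYNC input path: sanity check the packet (interface, TTL,
 * version, length), then walk the chain of subheaders and dispatch
 * each batch of messages through pfsync_acts[].
 */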
static int
pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
{
        struct pfsync_softc *sc = V_pfsyncif;
        struct pfsync_pkt pkt;
        struct mbuf *m = *mp;
        struct ip *ip = mtod(m, struct ip *);
        struct pfsync_header *ph;
        struct pfsync_subheader subh;

        int offset, len;
        int rv;
        uint16_t count;

        *mp = NULL;
        V_pfsyncstats.pfsyncs_ipackets++;

        /* Verify that we have a sync interface configured. */
        if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
            (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
                goto done;

        /* verify that the packet came in on the right interface */
        if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
                V_pfsyncstats.pfsyncs_badif++;
                goto done;
        }

        if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
        if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
        /* verify that the IP TTL is 255. */
        if (ip->ip_ttl != PFSYNC_DFLTTL) {
                V_pfsyncstats.pfsyncs_badttl++;
                goto done;
        }

        offset = ip->ip_hl << 2;
        if (m->m_pkthdr.len < offset + sizeof(*ph)) {
                V_pfsyncstats.pfsyncs_hdrops++;
                goto done;
        }

        if (offset + sizeof(*ph) > m->m_len) {
                if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
                        V_pfsyncstats.pfsyncs_hdrops++;
                        return (IPPROTO_DONE);
                }
                ip = mtod(m, struct ip *);
        }
        ph = (struct pfsync_header *)((char *)ip + offset);

        /* verify the version */
        if (ph->version != PFSYNC_VERSION) {
                V_pfsyncstats.pfsyncs_badver++;
                goto done;
        }

        len = ntohs(ph->len) + offset;
        if (m->m_pkthdr.len < len) {
                V_pfsyncstats.pfsyncs_badlen++;
                goto done;
        }

        /* Cheaper to grab this now than having to mess with mbufs later */
        pkt.ip = ip;
        pkt.src = ip->ip_src;
        pkt.flags = 0;

        /*
         * Trusting pf_chksum during packet processing, as well as seeking
         * in interface name tree, require holding PF_RULES_RLOCK().
         */
        PF_RULES_RLOCK();
        if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
                pkt.flags |= PFSYNC_SI_CKSUM;

        offset += sizeof(*ph);
        while (offset <= len - sizeof(subh)) {
                m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
                offset += sizeof(subh);

                if (subh.action >= PFSYNC_ACT_MAX) {
                        V_pfsyncstats.pfsyncs_badact++;
                        PF_RULES_RUNLOCK();
                        goto done;
                }

                count = ntohs(subh.count);
                V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
                rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
                if (rv == -1) {
                        PF_RULES_RUNLOCK();
                        return (IPPROTO_DONE);
                }

                offset += rv;
        }
        PF_RULES_RUNLOCK();

done:
        m_freem(m);
        return (IPPROTO_DONE);
}

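/*
 * PFSYNC_ACT_CLR: the peer flushed states; unlink every local state
 * with a matching creator id.  Entries naming an interface unknown to
 * us are ignored.
 */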
static int
pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_clr *clr;
        struct mbuf *mp;
        int len = sizeof(*clr) * count;
        int i, offp;
        u_int32_t creatorid;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        clr = (struct pfsync_clr *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                creatorid = clr[i].creatorid;

                if (clr[i].ifname[0] != '\0' &&
                    pfi_kif_find(clr[i].ifname) == NULL)
                        continue;

                for (int i = 0; i <= pf_hashmask; i++) {
                        struct pf_idhash *ih = &V_pf_idhash[i];
                        struct pf_state *s;
relock:
                        PF_HASHROW_LOCK(ih);
                        LIST_FOREACH(s, &ih->states, entry) {
                                if (s->creatorid == creatorid) {
                                        s->state_flags |= PFSTATE_NOSYNC;
                                        pf_unlink_state(s, PF_ENTER_LOCKED);
                                        goto relock;
                                }
                        }
                        PF_HASHROW_UNLOCK(ih);
                }
        }

        return (len);
}

static int
pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct mbuf *mp;
        struct pfsync_state *sa, *sp;
        int len = sizeof(*sp) * count;
        int i, offp;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        sa = (struct pfsync_state *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                sp = &sa[i];

                /* Check for invalid values. */
                if (sp->timeout >= PFTM_MAX ||
                    sp->src.state > PF_TCPS_PROXY_DST ||
                    sp->dst.state > PF_TCPS_PROXY_DST ||
                    sp->direction > PF_OUT ||
                    (sp->af != AF_INET && sp->af != AF_INET6)) {
                        if (V_pf_status.debug >= PF_DEBUG_MISC)
                                printf("%s: invalid value\n", __func__);
                        V_pfsyncstats.pfsyncs_badval++;
                        continue;
                }

                if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
                        /* Drop out, but process the rest of the actions. */
                        break;
        }

        return (len);
}

static int
pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_ins_ack *ia, *iaa;
        struct pf_state *st;

        struct mbuf *mp;
        int len = count * sizeof(*ia);
        int offp, i;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                ia = &iaa[i];

                st = pf_find_state_byid(ia->id, ia->creatorid);
                if (st == NULL)
                        continue;

                if (st->state_flags & PFSTATE_ACK) {
                        PFSYNC_LOCK(V_pfsyncif);
                        pfsync_undefer_state(st, 0);
                        PFSYNC_UNLOCK(V_pfsyncif);
                }
                PF_STATE_UNLOCK(st);
        }
        /*
         * XXX this is not yet implemented, but we know the size of the
         * message so we can skip it.
         */

        return (count * sizeof(struct pfsync_ins_ack));
}

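/*
 * Merge a peer's view of a TCP state into the local one.  The return
 * value counts the directions (0, 1 or 2) in which the local state is
 * more advanced than the peer's, i.e. where the peer is stale and a
 * fresh update should be sent back.
 */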
static int
pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
    struct pfsync_state_peer *dst)
{
        int sync = 0;

        PF_STATE_LOCK_ASSERT(st);

        /*
         * The state should never go backwards except
         * for syn-proxy states.  Neither should the
         * sequence window slide backwards.
         */
        if ((st->src.state > src->state &&
            (st->src.state < PF_TCPS_PROXY_SRC ||
            src->state >= PF_TCPS_PROXY_SRC)) ||

            (st->src.state == src->state &&
            SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
                sync++;
        else
                pf_state_peer_ntoh(src, &st->src);

        if ((st->dst.state > dst->state) ||

            (st->dst.state >= TCPS_SYN_SENT &&
            SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
                sync++;
        else
                pf_state_peer_ntoh(dst, &st->dst);

        return (sync);
}

static int
pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_softc *sc = V_pfsyncif;
        struct pfsync_state *sa, *sp;
        struct pf_state *st;
        int sync;

        struct mbuf *mp;
        int len = count * sizeof(*sp);
        int offp, i;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        sa = (struct pfsync_state *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                sp = &sa[i];

                /* check for invalid values */
                if (sp->timeout >= PFTM_MAX ||
                    sp->src.state > PF_TCPS_PROXY_DST ||
                    sp->dst.state > PF_TCPS_PROXY_DST) {
                        if (V_pf_status.debug >= PF_DEBUG_MISC) {
                                printf("pfsync_input: PFSYNC_ACT_UPD: "
                                    "invalid value\n");
                        }
                        V_pfsyncstats.pfsyncs_badval++;
                        continue;
                }

                st = pf_find_state_byid(sp->id, sp->creatorid);
                if (st == NULL) {
                        /* insert the update */
                        if (pfsync_state_import(sp, 0))
                                V_pfsyncstats.pfsyncs_badstate++;
                        continue;
                }

                if (st->state_flags & PFSTATE_ACK) {
                        PFSYNC_LOCK(sc);
                        pfsync_undefer_state(st, 1);
                        PFSYNC_UNLOCK(sc);
                }

                if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
                        sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
                else {
                        sync = 0;

                        /*
                         * Non-TCP protocol state machines always go
                         * forward.
                         */
                        if (st->src.state > sp->src.state)
                                sync++;
                        else
                                pf_state_peer_ntoh(&sp->src, &st->src);
                        if (st->dst.state > sp->dst.state)
                                sync++;
                        else
                                pf_state_peer_ntoh(&sp->dst, &st->dst);
                }
                if (sync < 2) {
                        pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
                        pf_state_peer_ntoh(&sp->dst, &st->dst);
                        st->expire = time_uptime;
                        st->timeout = sp->timeout;
                }
                st->pfsync_time = time_uptime;

                if (sync) {
                        V_pfsyncstats.pfsyncs_stale++;

                        pfsync_update_state(st);
                        PF_STATE_UNLOCK(st);
                        PFSYNC_LOCK(sc);
                        pfsync_push(sc);
                        PFSYNC_UNLOCK(sc);
                        continue;
                }
                PF_STATE_UNLOCK(st);
        }

        return (len);
}

static int
pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_softc *sc = V_pfsyncif;
        struct pfsync_upd_c *ua, *up;
        struct pf_state *st;
        int len = count * sizeof(*up);
        int sync;
        struct mbuf *mp;
        int offp, i;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        ua = (struct pfsync_upd_c *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                up = &ua[i];

                /* check for invalid values */
                if (up->timeout >= PFTM_MAX ||
                    up->src.state > PF_TCPS_PROXY_DST ||
                    up->dst.state > PF_TCPS_PROXY_DST) {
                        if (V_pf_status.debug >= PF_DEBUG_MISC) {
                                printf("pfsync_input: "
                                    "PFSYNC_ACT_UPD_C: "
                                    "invalid value\n");
                        }
                        V_pfsyncstats.pfsyncs_badval++;
                        continue;
                }

                st = pf_find_state_byid(up->id, up->creatorid);
                if (st == NULL) {
                        /* We don't have this state. Ask for it. */
                        PFSYNC_LOCK(sc);
                        pfsync_request_update(up->creatorid, up->id);
                        PFSYNC_UNLOCK(sc);
                        continue;
                }

                if (st->state_flags & PFSTATE_ACK) {
                        PFSYNC_LOCK(sc);
                        pfsync_undefer_state(st, 1);
                        PFSYNC_UNLOCK(sc);
                }

                if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
                        sync = pfsync_upd_tcp(st, &up->src, &up->dst);
                else {
                        sync = 0;

                        /*
                         * Non-TCP protocol state machines always go
                         * forward.
                         */
                        if (st->src.state > up->src.state)
                                sync++;
                        else
                                pf_state_peer_ntoh(&up->src, &st->src);
                        if (st->dst.state > up->dst.state)
                                sync++;
                        else
                                pf_state_peer_ntoh(&up->dst, &st->dst);
                }
                if (sync < 2) {
                        pfsync_alloc_scrub_memory(&up->dst, &st->dst);
                        pf_state_peer_ntoh(&up->dst, &st->dst);
                        st->expire = time_uptime;
                        st->timeout = up->timeout;
                }
                st->pfsync_time = time_uptime;

                if (sync) {
                        V_pfsyncstats.pfsyncs_stale++;

                        pfsync_update_state(st);
                        PF_STATE_UNLOCK(st);
                        PFSYNC_LOCK(sc);
                        pfsync_push(sc);
                        PFSYNC_UNLOCK(sc);
                        continue;
                }
                PF_STATE_UNLOCK(st);
        }

        return (len);
}

static int
pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_upd_req *ur, *ura;
        struct mbuf *mp;
        int len = count * sizeof(*ur);
        int i, offp;

        struct pf_state *st;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        ura = (struct pfsync_upd_req *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                ur = &ura[i];

                if (ur->id == 0 && ur->creatorid == 0)
                        pfsync_bulk_start();
                else {
                        st = pf_find_state_byid(ur->id, ur->creatorid);
                        if (st == NULL) {
                                V_pfsyncstats.pfsyncs_badstate++;
                                continue;
                        }
                        if (st->state_flags & PFSTATE_NOSYNC) {
                                PF_STATE_UNLOCK(st);
                                continue;
                        }

                        pfsync_update_state_req(st);
                        PF_STATE_UNLOCK(st);
                }
        }

        return (len);
}

static int
pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct mbuf *mp;
        struct pfsync_state *sa, *sp;
        struct pf_state *st;
        int len = count * sizeof(*sp);
        int offp, i;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        sa = (struct pfsync_state *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                sp = &sa[i];

                st = pf_find_state_byid(sp->id, sp->creatorid);
                if (st == NULL) {
                        V_pfsyncstats.pfsyncs_badstate++;
                        continue;
                }
                st->state_flags |= PFSTATE_NOSYNC;
                pf_unlink_state(st, PF_ENTER_LOCKED);
        }

        return (len);
}

static int
pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct mbuf *mp;
        struct pfsync_del_c *sa, *sp;
        struct pf_state *st;
        int len = count * sizeof(*sp);
        int offp, i;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        sa = (struct pfsync_del_c *)(mp->m_data + offp);

        for (i = 0; i < count; i++) {
                sp = &sa[i];

                st = pf_find_state_byid(sp->id, sp->creatorid);
                if (st == NULL) {
                        V_pfsyncstats.pfsyncs_badstate++;
                        continue;
                }

                st->state_flags |= PFSTATE_NOSYNC;
                pf_unlink_state(st, PF_ENTER_LOCKED);
        }

        return (len);
}

static int
pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        struct pfsync_softc *sc = V_pfsyncif;
        struct pfsync_bus *bus;
        struct mbuf *mp;
        int len = count * sizeof(*bus);
        int offp;

        PFSYNC_BLOCK(sc);

        /* If we're not waiting for a bulk update, who cares. */
        if (sc->sc_ureq_sent == 0) {
                PFSYNC_BUNLOCK(sc);
                return (len);
        }

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                PFSYNC_BUNLOCK(sc);
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        bus = (struct pfsync_bus *)(mp->m_data + offp);

        switch (bus->status) {
        case PFSYNC_BUS_START:
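                /*
                 * Scale the failure timeout with the size of the state
                 * table: four seconds of slack plus one tick for every
                 * full packet's worth of states the peer may send.
                 */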
                callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
                    V_pf_limits[PF_LIMIT_STATES].limit /
                    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
                    sizeof(struct pfsync_state)),
                    pfsync_bulk_fail, sc);
                if (V_pf_status.debug >= PF_DEBUG_MISC)
                        printf("pfsync: received bulk update start\n");
                break;

        case PFSYNC_BUS_END:
                if (time_uptime - ntohl(bus->endtime) >=
                    sc->sc_ureq_sent) {
                        /* that's it, we're happy */
                        sc->sc_ureq_sent = 0;
                        sc->sc_bulk_tries = 0;
                        callout_stop(&sc->sc_bulkfail_tmo);
                        if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
                                (*carp_demote_adj_p)(-V_pfsync_carp_adj,
                                    "pfsync bulk done");
                        sc->sc_flags |= PFSYNCF_OK;
                        if (V_pf_status.debug >= PF_DEBUG_MISC)
                                printf("pfsync: received valid "
                                    "bulk update end\n");
                } else {
                        if (V_pf_status.debug >= PF_DEBUG_MISC)
                                printf("pfsync: received invalid "
                                    "bulk update end: bad timestamp\n");
                }
                break;
        }
        PFSYNC_BUNLOCK(sc);

        return (len);
}

static int
pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        int len = count * sizeof(struct pfsync_tdb);

#if defined(IPSEC)
        struct pfsync_tdb *tp;
        struct mbuf *mp;
        int offp;
        int i;
        int s;

        mp = m_pulldown(m, offset, len, &offp);
        if (mp == NULL) {
                V_pfsyncstats.pfsyncs_badlen++;
                return (-1);
        }
        tp = (struct pfsync_tdb *)(mp->m_data + offp);

        for (i = 0; i < count; i++)
                pfsync_update_net_tdb(&tp[i]);
#endif

        return (len);
}

#if defined(IPSEC)
/* Update an in-kernel tdb. Silently fail if no tdb is found. */
static void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
        struct tdb              *tdb;
        int                      s;

        /* check for invalid values */
        if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
            (pt->dst.sa.sa_family != AF_INET &&
            pt->dst.sa.sa_family != AF_INET6))
                goto bad;

        tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
        if (tdb) {
                pt->rpl = ntohl(pt->rpl);
                pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);

                /* Neither replay nor byte counter should ever decrease. */
                if (pt->rpl < tdb->tdb_rpl ||
                    pt->cur_bytes < tdb->tdb_cur_bytes) {
                        goto bad;
                }

                tdb->tdb_rpl = pt->rpl;
                tdb->tdb_cur_bytes = pt->cur_bytes;
        }
        return;

bad:
        if (V_pf_status.debug >= PF_DEBUG_MISC)
                printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
                    "invalid value\n");
        V_pfsyncstats.pfsyncs_badstate++;
        return;
}
#endif

static int
pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        /* check if we are at the right place in the packet */
        if (offset != m->m_pkthdr.len)
                V_pfsyncstats.pfsyncs_badlen++;

        /* we're done. free and let the caller return */
        m_freem(m);
        return (-1);
}

static int
pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
{
        V_pfsyncstats.pfsyncs_badact++;

        m_freem(m);
        return (-1);
}

static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
        struct route *rt)
{
        m_freem(m);
        return (0);
}

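/*
 * Interface ioctl handler.  SIOCSETPFSYNC (re)configures the sync
 * interface and peer and kicks off a bulk update request;
 * SIOCGETPFSYNC reports the current configuration.
 */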
/* ARGSUSED */
static int
pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
        struct pfsync_softc *sc = ifp->if_softc;
        struct ifreq *ifr = (struct ifreq *)data;
        struct pfsyncreq pfsyncr;
        int error;

        switch (cmd) {
        case SIOCSIFFLAGS:
                PFSYNC_LOCK(sc);
                if (ifp->if_flags & IFF_UP) {
                        ifp->if_drv_flags |= IFF_DRV_RUNNING;
                        PFSYNC_UNLOCK(sc);
                        pfsync_pointers_init();
                } else {
                        ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
                        PFSYNC_UNLOCK(sc);
                        pfsync_pointers_uninit();
                }
                break;
        case SIOCSIFMTU:
                if (!sc->sc_sync_if ||
                    ifr->ifr_mtu <= PFSYNC_MINPKT ||
                    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
                        return (EINVAL);
                if (ifr->ifr_mtu < ifp->if_mtu) {
                        PFSYNC_LOCK(sc);
                        if (sc->sc_len > PFSYNC_MINPKT)
                                pfsync_sendout(1);
                        PFSYNC_UNLOCK(sc);
                }
                ifp->if_mtu = ifr->ifr_mtu;
                break;
        case SIOCGETPFSYNC:
                bzero(&pfsyncr, sizeof(pfsyncr));
                PFSYNC_LOCK(sc);
                if (sc->sc_sync_if) {
                        strlcpy(pfsyncr.pfsyncr_syncdev,
                            sc->sc_sync_if->if_xname, IFNAMSIZ);
                }
                pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
                pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
                pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
                    (sc->sc_flags & PFSYNCF_DEFER));
                PFSYNC_UNLOCK(sc);
                return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));

        case SIOCSETPFSYNC:
            {
                struct ip_moptions *imo = &sc->sc_imo;
                struct ifnet *sifp;
                struct ip *ip;
                void *mship = NULL;

                if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
                        return (error);
                if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
                        return (error);

                if (pfsyncr.pfsyncr_maxupdates > 255)
                        return (EINVAL);

                if (pfsyncr.pfsyncr_syncdev[0] == 0)
                        sifp = NULL;
                else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
                        return (EINVAL);

                if (sifp != NULL && (
                    pfsyncr.pfsyncr_syncpeer.s_addr == 0 ||
                    pfsyncr.pfsyncr_syncpeer.s_addr ==
                    htonl(INADDR_PFSYNC_GROUP)))
                        mship = malloc((sizeof(struct in_multi *) *
                            IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);

                PFSYNC_LOCK(sc);
                if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
                        sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
                else
                        sc->sc_sync_peer.s_addr =
                            pfsyncr.pfsyncr_syncpeer.s_addr;

                sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
                if (pfsyncr.pfsyncr_defer) {
                        sc->sc_flags |= PFSYNCF_DEFER;
                        pfsync_defer_ptr = pfsync_defer;
                } else {
                        sc->sc_flags &= ~PFSYNCF_DEFER;
                        pfsync_defer_ptr = NULL;
                }

                if (sifp == NULL) {
                        if (sc->sc_sync_if)
                                if_rele(sc->sc_sync_if);
                        sc->sc_sync_if = NULL;
                        if (imo->imo_membership)
                                pfsync_multicast_cleanup(sc);
                        PFSYNC_UNLOCK(sc);
                        break;
                }

                if (sc->sc_len > PFSYNC_MINPKT &&
                    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
                    (sc->sc_sync_if != NULL &&
                    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
                    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
                        pfsync_sendout(1);

                if (imo->imo_membership)
                        pfsync_multicast_cleanup(sc);

                if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
                        error = pfsync_multicast_setup(sc, sifp, mship);
                        if (error) {
                                if_rele(sifp);
                                free(mship, M_PFSYNC);
                                return (error);
                        }
                }
                if (sc->sc_sync_if)
                        if_rele(sc->sc_sync_if);
                sc->sc_sync_if = sifp;

                ip = &sc->sc_template;
                bzero(ip, sizeof(*ip));
                ip->ip_v = IPVERSION;
                ip->ip_hl = sizeof(sc->sc_template) >> 2;
                ip->ip_tos = IPTOS_LOWDELAY;
                /* len and id are set later. */
                ip->ip_off = htons(IP_DF);
                ip->ip_ttl = PFSYNC_DFLTTL;
                ip->ip_p = IPPROTO_PFSYNC;
                ip->ip_src.s_addr = INADDR_ANY;
                ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;

                /* Request a full state table update. */
                if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
                        (*carp_demote_adj_p)(V_pfsync_carp_adj,
                            "pfsync bulk start");
                sc->sc_flags &= ~PFSYNCF_OK;
                if (V_pf_status.debug >= PF_DEBUG_MISC)
                        printf("pfsync: requesting bulk update\n");
                pfsync_request_update(0, 0);
                PFSYNC_UNLOCK(sc);
                PFSYNC_BLOCK(sc);
                sc->sc_ureq_sent = time_uptime;
                callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
                    sc);
                PFSYNC_BUNLOCK(sc);

                break;
            }
        default:
                return (ENOTTY);
        }

        return (0);
}

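/*
 * Serializers for the output queues; each writes one wire-format
 * message for the given state into the packet buffer.
 */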
static void
pfsync_out_state(struct pf_state *st, void *buf)
{
        struct pfsync_state *sp = buf;

        pfsync_state_export(sp, st);
}

static void
pfsync_out_iack(struct pf_state *st, void *buf)
{
        struct pfsync_ins_ack *iack = buf;

        iack->id = st->id;
        iack->creatorid = st->creatorid;
}

static void
pfsync_out_upd_c(struct pf_state *st, void *buf)
{
        struct pfsync_upd_c *up = buf;

        bzero(up, sizeof(*up));
        up->id = st->id;
        pf_state_peer_hton(&st->src, &up->src);
        pf_state_peer_hton(&st->dst, &up->dst);
        up->creatorid = st->creatorid;
        up->timeout = st->timeout;
}

static void
pfsync_out_del(struct pf_state *st, void *buf)
{
        struct pfsync_del_c *dp = buf;

        dp->id = st->id;
        dp->creatorid = st->creatorid;
        st->state_flags |= PFSTATE_NOSYNC;
}

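/*
 * Throw away everything that is currently queued: release the states
 * on each queue, free pending update requests and reset sc_len back
 * to an empty packet.
 */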
static void
pfsync_drop(struct pfsync_softc *sc)
{
        struct pf_state *st, *next;
        struct pfsync_upd_req_item *ur;
        int q;

        for (q = 0; q < PFSYNC_S_COUNT; q++) {
                if (TAILQ_EMPTY(&sc->sc_qs[q]))
                        continue;

                TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
                        KASSERT(st->sync_state == q,
                            ("%s: st->sync_state %u != queue %d",
                            __func__, st->sync_state, q));
1488                         st->sync_state = PFSYNC_S_NONE;
1489                         pf_release_state(st);
1490                 }
1491                 TAILQ_INIT(&sc->sc_qs[q]);
1492         }
1493
1494         while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1495                 TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1496                 free(ur, M_PFSYNC);
1497         }
1498
1499         sc->sc_plus = NULL;
1500         sc->sc_len = PFSYNC_MINPKT;
1501 }
1502
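/*
 * Serialize the queued work into a single pfsync datagram and hand it
 * to the interface send queue.  The packet layout is: IP header (copied
 * from sc_template), pfsync header, one subheader per non-empty queue
 * followed by its entries, any pending update requests, an optional
 * "plus" region, and a terminating EOF subheader.  Runs with the softc
 * lock held; if schedswi is set, the software interrupt is scheduled to
 * actually transmit.
 */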
1503 static void
1504 pfsync_sendout(int schedswi)
1505 {
1506         struct pfsync_softc *sc = V_pfsyncif;
1507         struct ifnet *ifp = sc->sc_ifp;
1508         struct mbuf *m;
1509         struct ip *ip;
1510         struct pfsync_header *ph;
1511         struct pfsync_subheader *subh;
1512         struct pf_state *st, *st_next;
1513         struct pfsync_upd_req_item *ur;
1514         int offset;
1515         int q, count = 0;
1516
1517         KASSERT(sc != NULL, ("%s: null sc", __func__));
1518         KASSERT(sc->sc_len > PFSYNC_MINPKT,
1519             ("%s: sc_len %zu", __func__, sc->sc_len));
1520         PFSYNC_LOCK_ASSERT(sc);
1521
1522         if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
1523                 pfsync_drop(sc);
1524                 return;
1525         }
1526
1527         m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
1528         if (m == NULL) {
1529                 if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
1530                 V_pfsyncstats.pfsyncs_onomem++;
1531                 return;
1532         }
1533         m->m_data += max_linkhdr;
1534         m->m_len = m->m_pkthdr.len = sc->sc_len;
1535
1536         /* build the ip header */
1537         ip = (struct ip *)m->m_data;
1538         bcopy(&sc->sc_template, ip, sizeof(*ip));
1539         offset = sizeof(*ip);
1540
1541         ip->ip_len = htons(m->m_pkthdr.len);
1542         ip_fillid(ip);
1543
1544         /* build the pfsync header */
1545         ph = (struct pfsync_header *)(m->m_data + offset);
1546         bzero(ph, sizeof(*ph));
1547         offset += sizeof(*ph);
1548
1549         ph->version = PFSYNC_VERSION;
1550         ph->len = htons(sc->sc_len - sizeof(*ip));
1551         bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1552
1553         /* walk the queues */
1554         for (q = 0; q < PFSYNC_S_COUNT; q++) {
1555                 if (TAILQ_EMPTY(&sc->sc_qs[q]))
1556                         continue;
1557
1558                 subh = (struct pfsync_subheader *)(m->m_data + offset);
1559                 offset += sizeof(*subh);
1560
1561                 count = 0;
1562                 TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, st_next) {
                        KASSERT(st->sync_state == q,
                            ("%s: st->sync_state %u != queue %d",
                            __func__, st->sync_state, q));
                        /*
                         * XXXGL: some of the write methods do unlocked
                         * reads of state data :(
                         */
1570                         pfsync_qs[q].write(st, m->m_data + offset);
1571                         offset += pfsync_qs[q].len;
1572                         st->sync_state = PFSYNC_S_NONE;
1573                         pf_release_state(st);
1574                         count++;
1575                 }
1576                 TAILQ_INIT(&sc->sc_qs[q]);
1577
1578                 bzero(subh, sizeof(*subh));
1579                 subh->action = pfsync_qs[q].action;
1580                 subh->count = htons(count);
1581                 V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
1582         }
1583
1584         if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
1585                 subh = (struct pfsync_subheader *)(m->m_data + offset);
1586                 offset += sizeof(*subh);
1587
1588                 count = 0;
1589                 while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1590                         TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1591
1592                         bcopy(&ur->ur_msg, m->m_data + offset,
1593                             sizeof(ur->ur_msg));
1594                         offset += sizeof(ur->ur_msg);
1595                         free(ur, M_PFSYNC);
1596                         count++;
1597                 }
1598
1599                 bzero(subh, sizeof(*subh));
1600                 subh->action = PFSYNC_ACT_UPD_REQ;
1601                 subh->count = htons(count);
1602                 V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
1603         }
1604
1605         /* has someone built a custom region for us to add? */
1606         if (sc->sc_plus != NULL) {
1607                 bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
1608                 offset += sc->sc_pluslen;
1609
1610                 sc->sc_plus = NULL;
1611         }
1612
1613         subh = (struct pfsync_subheader *)(m->m_data + offset);
1614         offset += sizeof(*subh);
1615
1616         bzero(subh, sizeof(*subh));
1617         subh->action = PFSYNC_ACT_EOF;
1618         subh->count = htons(1);
1619         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
1620
1621         /* we're done, let's put it on the wire */
1622         if (ifp->if_bpf) {
1623                 m->m_data += sizeof(*ip);
1624                 m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
1625                 BPF_MTAP(ifp, m);
1626                 m->m_data -= sizeof(*ip);
1627                 m->m_len = m->m_pkthdr.len = sc->sc_len;
1628         }
1629
1630         if (sc->sc_sync_if == NULL) {
1631                 sc->sc_len = PFSYNC_MINPKT;
1632                 m_freem(m);
1633                 return;
1634         }
1635
1636         if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
1637         if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
1638         sc->sc_len = PFSYNC_MINPKT;
1639
1640         if (!_IF_QFULL(&sc->sc_ifp->if_snd))
1641                 _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1642         else {
1643                 m_freem(m);
1644                 if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
1645         }
1646         if (schedswi)
1647                 swi_sched(V_pfsync_swi_cookie, 0);
1648 }
1649
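/*
 * Called by pf when a state is created: queue it for an insert message,
 * unless the state or its rule is marked no-sync.  States created by
 * pfsync traffic itself are flagged PFSTATE_NOSYNC so that pfsync's own
 * sessions are never synced back out.
 */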
1650 static void
1651 pfsync_insert_state(struct pf_state *st)
1652 {
1653         struct pfsync_softc *sc = V_pfsyncif;
1654
1655         if (st->state_flags & PFSTATE_NOSYNC)
1656                 return;
1657
1658         if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
1659             st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1660                 st->state_flags |= PFSTATE_NOSYNC;
1661                 return;
1662         }
1663
1664         KASSERT(st->sync_state == PFSYNC_S_NONE,
1665                 ("%s: st->sync_state %u", __func__, st->sync_state));
1666
1667         PFSYNC_LOCK(sc);
1668         if (sc->sc_len == PFSYNC_MINPKT)
1669                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1670
1671         pfsync_q_ins(st, PFSYNC_S_INS, true);
1672         PFSYNC_UNLOCK(sc);
1673
1674         st->sync_updates = 0;
1675 }
1676
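/*
 * Defer transmission of the packet that created this state until the
 * peer has had a chance to acknowledge the insert.  Returns 1 if the
 * mbuf was taken over (tagged M_SKIP_FIREWALL and parked on
 * sc_deferrals with a short timeout), 0 if the caller should transmit
 * it immediately.
 */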
1677 static int
1678 pfsync_defer(struct pf_state *st, struct mbuf *m)
1679 {
1680         struct pfsync_softc *sc = V_pfsyncif;
1681         struct pfsync_deferral *pd;
1682
1683         if (m->m_flags & (M_BCAST|M_MCAST))
1684                 return (0);
1685
        if (sc == NULL)
                return (0);

        PFSYNC_LOCK(sc);

        if (!(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
            !(sc->sc_flags & PFSYNCF_DEFER)) {
                PFSYNC_UNLOCK(sc);
                return (0);
        }

        if (sc->sc_deferred >= 128)
                pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);

        pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
        if (pd == NULL) {
                /* Out of memory: don't defer, just let the packet go. */
                PFSYNC_UNLOCK(sc);
                return (0);
        }
        sc->sc_deferred++;
1701
1702         m->m_flags |= M_SKIP_FIREWALL;
1703         st->state_flags |= PFSTATE_ACK;
1704
1705         pd->pd_sc = sc;
1706         pd->pd_refs = 0;
1707         pd->pd_st = st;
1708         pf_ref_state(st);
1709         pd->pd_m = m;
1710
1711         TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1712         callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1713         callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1714
        pfsync_push(sc);
        PFSYNC_UNLOCK(sc);

        return (1);
1718 }
1719
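/*
 * Take a deferral off the list and either drop the parked mbuf or push
 * it onto the interface send queue for transmission.  Called with the
 * softc lock held.
 */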
1720 static void
1721 pfsync_undefer(struct pfsync_deferral *pd, int drop)
1722 {
1723         struct pfsync_softc *sc = pd->pd_sc;
1724         struct mbuf *m = pd->pd_m;
1725         struct pf_state *st = pd->pd_st;
1726
1727         PFSYNC_LOCK_ASSERT(sc);
1728
1729         TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1730         sc->sc_deferred--;
1731         pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
1732         free(pd, M_PFSYNC);
1733         pf_release_state(st);
1734
1735         if (drop)
1736                 m_freem(m);
1737         else {
1738                 _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1739                 pfsync_push(sc);
1740         }
1741 }
1742
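/*
 * Deferral timer: no acknowledgement arrived in time, so send the
 * parked packet out anyway via ip_output().
 */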
1743 static void
1744 pfsync_defer_tmo(void *arg)
1745 {
1746         struct pfsync_deferral *pd = arg;
1747         struct pfsync_softc *sc = pd->pd_sc;
1748         struct mbuf *m = pd->pd_m;
1749         struct pf_state *st = pd->pd_st;
1750
1751         PFSYNC_LOCK_ASSERT(sc);
1752
1753         CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
1754
1755         TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1756         sc->sc_deferred--;
1757         pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
1758         if (pd->pd_refs == 0)
1759                 free(pd, M_PFSYNC);
1760         PFSYNC_UNLOCK(sc);
1761
1762         ip_output(m, NULL, NULL, 0, NULL, NULL);
1763
1764         pf_release_state(st);
1765
1766         CURVNET_RESTORE();
1767 }
1768
1769 static void
1770 pfsync_undefer_state(struct pf_state *st, int drop)
1771 {
1772         struct pfsync_softc *sc = V_pfsyncif;
1773         struct pfsync_deferral *pd;
1774
1775         PFSYNC_LOCK_ASSERT(sc);
1776
1777         TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
                if (pd->pd_st == st) {
1779                         if (callout_stop(&pd->pd_tmo) > 0)
1780                                 pfsync_undefer(pd, drop);
1781                         return;
1782                 }
1783         }
1784
1785         panic("%s: unable to find deferred state", __func__);
1786 }
1787
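/*
 * Called by pf on every state change.  If an update for the state is
 * already queued, only the update counter is bumped; otherwise the
 * state is queued as a compressed update (PFSYNC_S_UPD_C).  A push is
 * scheduled once a TCP state accumulates sc_maxupdates updates, or
 * when the state has seen pfsync activity within the last two seconds.
 */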
1788 static void
1789 pfsync_update_state(struct pf_state *st)
1790 {
1791         struct pfsync_softc *sc = V_pfsyncif;
1792         bool sync = false, ref = true;
1793
1794         PF_STATE_LOCK_ASSERT(st);
1795         PFSYNC_LOCK(sc);
1796
1797         if (st->state_flags & PFSTATE_ACK)
1798                 pfsync_undefer_state(st, 0);
1799         if (st->state_flags & PFSTATE_NOSYNC) {
1800                 if (st->sync_state != PFSYNC_S_NONE)
1801                         pfsync_q_del(st, true);
1802                 PFSYNC_UNLOCK(sc);
1803                 return;
1804         }
1805
1806         if (sc->sc_len == PFSYNC_MINPKT)
1807                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1808
1809         switch (st->sync_state) {
1810         case PFSYNC_S_UPD_C:
1811         case PFSYNC_S_UPD:
1812         case PFSYNC_S_INS:
1813                 /* we're already handling it */
1814
1815                 if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1816                         st->sync_updates++;
1817                         if (st->sync_updates >= sc->sc_maxupdates)
1818                                 sync = true;
1819                 }
1820                 break;
1821
1822         case PFSYNC_S_IACK:
1823                 pfsync_q_del(st, false);
1824                 ref = false;
1825                 /* FALLTHROUGH */
1826
1827         case PFSYNC_S_NONE:
1828                 pfsync_q_ins(st, PFSYNC_S_UPD_C, ref);
1829                 st->sync_updates = 0;
1830                 break;
1831
1832         default:
1833                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1834         }
1835
1836         if (sync || (time_uptime - st->pfsync_time) < 2)
1837                 pfsync_push(sc);
1838
1839         PFSYNC_UNLOCK(sc);
1840 }
1841
1842 static void
1843 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
1844 {
1845         struct pfsync_softc *sc = V_pfsyncif;
1846         struct pfsync_upd_req_item *item;
1847         size_t nlen = sizeof(struct pfsync_upd_req);
1848
1849         PFSYNC_LOCK_ASSERT(sc);
1850
        /*
         * This code tries to prevent generating multiple update requests
         * for the same state.  It searches the current subheader queue,
         * but it does not look into the queue of already packed datagrams.
         */
1856         TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
1857                 if (item->ur_msg.id == id &&
1858                     item->ur_msg.creatorid == creatorid)
1859                         return;
1860
1861         item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
1862         if (item == NULL)
1863                 return; /* XXX stats */
1864
1865         item->ur_msg.id = id;
1866         item->ur_msg.creatorid = creatorid;
1867
1868         if (TAILQ_EMPTY(&sc->sc_upd_req_list))
1869                 nlen += sizeof(struct pfsync_subheader);
1870
1871         if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1872                 pfsync_sendout(1);
1873
1874                 nlen = sizeof(struct pfsync_subheader) +
1875                     sizeof(struct pfsync_upd_req);
1876         }
1877
1878         TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
1879         sc->sc_len += nlen;
1880 }
1881
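/*
 * Queue a state in response to a peer's update request (also used by
 * the bulk update walker).  Unlike pfsync_update_state(), this puts the
 * state on the full-update queue (PFSYNC_S_UPD) rather than the
 * compressed one.
 */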
1882 static void
1883 pfsync_update_state_req(struct pf_state *st)
1884 {
1885         struct pfsync_softc *sc = V_pfsyncif;
1886         bool ref = true;
1887
1888         PF_STATE_LOCK_ASSERT(st);
1889         PFSYNC_LOCK(sc);
1890
1891         if (st->state_flags & PFSTATE_NOSYNC) {
1892                 if (st->sync_state != PFSYNC_S_NONE)
1893                         pfsync_q_del(st, true);
1894                 PFSYNC_UNLOCK(sc);
1895                 return;
1896         }
1897
1898         switch (st->sync_state) {
1899         case PFSYNC_S_UPD_C:
1900         case PFSYNC_S_IACK:
1901                 pfsync_q_del(st, false);
1902                 ref = false;
1903                 /* FALLTHROUGH */
1904
1905         case PFSYNC_S_NONE:
1906                 pfsync_q_ins(st, PFSYNC_S_UPD, ref);
1907                 pfsync_push(sc);
1908                 break;
1909
1910         case PFSYNC_S_INS:
1911         case PFSYNC_S_UPD:
1912         case PFSYNC_S_DEL:
1913                 /* we're already handling it */
1914                 break;
1915
1916         default:
1917                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1918         }
1919
1920         PFSYNC_UNLOCK(sc);
1921 }
1922
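/*
 * Called by pf when a state is removed: queue a delete notification.
 * If the state was still waiting for an insert acknowledgement, its
 * deferred packet is dropped first.
 */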
1923 static void
1924 pfsync_delete_state(struct pf_state *st)
1925 {
1926         struct pfsync_softc *sc = V_pfsyncif;
1927         bool ref = true;
1928
1929         PFSYNC_LOCK(sc);
1930         if (st->state_flags & PFSTATE_ACK)
1931                 pfsync_undefer_state(st, 1);
1932         if (st->state_flags & PFSTATE_NOSYNC) {
1933                 if (st->sync_state != PFSYNC_S_NONE)
1934                         pfsync_q_del(st, true);
1935                 PFSYNC_UNLOCK(sc);
1936                 return;
1937         }
1938
1939         if (sc->sc_len == PFSYNC_MINPKT)
1940                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1941
1942         switch (st->sync_state) {
1943         case PFSYNC_S_INS:
1944                 /* We never got to tell the world so just forget about it. */
1945                 pfsync_q_del(st, true);
1946                 break;
1947
1948         case PFSYNC_S_UPD_C:
1949         case PFSYNC_S_UPD:
1950         case PFSYNC_S_IACK:
1951                 pfsync_q_del(st, false);
1952                 ref = false;
1953                 /* FALLTHROUGH */
1954
1955         case PFSYNC_S_NONE:
1956                 pfsync_q_ins(st, PFSYNC_S_DEL, ref);
1957                 break;
1958
1959         default:
1960                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1961         }
1962
1963         PFSYNC_UNLOCK(sc);
1964 }
1965
1966 static void
1967 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1968 {
1969         struct pfsync_softc *sc = V_pfsyncif;
1970         struct {
1971                 struct pfsync_subheader subh;
1972                 struct pfsync_clr clr;
1973         } __packed r;
1974
1975         bzero(&r, sizeof(r));
1976
1977         r.subh.action = PFSYNC_ACT_CLR;
1978         r.subh.count = htons(1);
1979         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
1980
1981         strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
1982         r.clr.creatorid = creatorid;
1983
1984         PFSYNC_LOCK(sc);
1985         pfsync_send_plus(&r, sizeof(r));
1986         PFSYNC_UNLOCK(sc);
1987 }
1988
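/*
 * Put a state on queue q.  The length accounting includes the
 * subheader that the first entry on a queue adds; if the new entry
 * would push the packet past the interface MTU, the current packet is
 * flushed first.  The ref argument tells whether to take a new
 * reference on the state.
 */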
1989 static void
1990 pfsync_q_ins(struct pf_state *st, int q, bool ref)
1991 {
1992         struct pfsync_softc *sc = V_pfsyncif;
1993         size_t nlen = pfsync_qs[q].len;
1994
1995         PFSYNC_LOCK_ASSERT(sc);
1996
1997         KASSERT(st->sync_state == PFSYNC_S_NONE,
1998                 ("%s: st->sync_state %u", __func__, st->sync_state));
1999         KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
2000             sc->sc_len));
2001
2002         if (TAILQ_EMPTY(&sc->sc_qs[q]))
2003                 nlen += sizeof(struct pfsync_subheader);
2004
2005         if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
2006                 pfsync_sendout(1);
2007
2008                 nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
2009         }
2010
2011         sc->sc_len += nlen;
2012         TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
2013         st->sync_state = q;
2014         if (ref)
2015                 pf_ref_state(st);
2016 }
2017
2018 static void
2019 pfsync_q_del(struct pf_state *st, bool unref)
2020 {
2021         struct pfsync_softc *sc = V_pfsyncif;
2022         int q = st->sync_state;
2023
2024         PFSYNC_LOCK_ASSERT(sc);
        KASSERT(st->sync_state != PFSYNC_S_NONE,
            ("%s: st->sync_state is PFSYNC_S_NONE", __func__));
2027
2028         sc->sc_len -= pfsync_qs[q].len;
2029         TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2030         st->sync_state = PFSYNC_S_NONE;
2031         if (unref)
2032                 pf_release_state(st);
2033
2034         if (TAILQ_EMPTY(&sc->sc_qs[q]))
2035                 sc->sc_len -= sizeof(struct pfsync_subheader);
2036 }
2037
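/*
 * A peer asked for a bulk update: remember when the request arrived,
 * restart the walk from the beginning of the state table, and announce
 * the start of the transfer with a BUS message.
 */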
2038 static void
2039 pfsync_bulk_start(void)
2040 {
2041         struct pfsync_softc *sc = V_pfsyncif;
2042
2043         if (V_pf_status.debug >= PF_DEBUG_MISC)
2044                 printf("pfsync: received bulk update request\n");
2045
2046         PFSYNC_BLOCK(sc);
2047
2048         sc->sc_ureq_received = time_uptime;
2049         sc->sc_bulk_hashid = 0;
2050         sc->sc_bulk_stateid = 0;
2051         pfsync_bulk_status(PFSYNC_BUS_START);
2052         callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
2053         PFSYNC_BUNLOCK(sc);
2054 }
2055
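/*
 * Bulk update walker: traverse the state table hash row by hash row,
 * queueing a full update for every state the peer may be missing.
 * When the packet fills up, the current position is saved and the
 * callout rescheduled for the next tick; when the walk completes,
 * end-of-bulk is signalled with a BUS message.
 */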
2056 static void
2057 pfsync_bulk_update(void *arg)
2058 {
2059         struct pfsync_softc *sc = arg;
2060         struct pf_state *s;
2061         int i, sent = 0;
2062
2063         PFSYNC_BLOCK_ASSERT(sc);
2064         CURVNET_SET(sc->sc_ifp->if_vnet);
2065
        /*
         * Start with the last state from the previous invocation.
         * It may have gone away, in which case we start from the
         * saved hash slot.
         */
2071         s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
2072
2073         if (s != NULL)
2074                 i = PF_IDHASH(s);
2075         else
2076                 i = sc->sc_bulk_hashid;
2077
2078         for (; i <= pf_hashmask; i++) {
2079                 struct pf_idhash *ih = &V_pf_idhash[i];
2080
2081                 if (s != NULL)
2082                         PF_HASHROW_ASSERT(ih);
2083                 else {
2084                         PF_HASHROW_LOCK(ih);
2085                         s = LIST_FIRST(&ih->states);
2086                 }
2087
2088                 for (; s; s = LIST_NEXT(s, entry)) {
2090                         if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
2091                             sizeof(struct pfsync_state)) {
2092                                 /* We've filled a packet. */
2093                                 sc->sc_bulk_hashid = i;
2094                                 sc->sc_bulk_stateid = s->id;
2095                                 sc->sc_bulk_creatorid = s->creatorid;
2096                                 PF_HASHROW_UNLOCK(ih);
2097                                 callout_reset(&sc->sc_bulk_tmo, 1,
2098                                     pfsync_bulk_update, sc);
2099                                 goto full;
2100                         }
2101
2102                         if (s->sync_state == PFSYNC_S_NONE &&
2103                             s->timeout < PFTM_MAX &&
2104                             s->pfsync_time <= sc->sc_ureq_received) {
2105                                 pfsync_update_state_req(s);
2106                                 sent++;
2107                         }
2108                 }
2109                 PF_HASHROW_UNLOCK(ih);
2110         }
2111
2112         /* We're done. */
2113         pfsync_bulk_status(PFSYNC_BUS_END);
2114
2115 full:
2116         CURVNET_RESTORE();
2117 }
2118
2119 static void
2120 pfsync_bulk_status(u_int8_t status)
2121 {
2122         struct {
2123                 struct pfsync_subheader subh;
2124                 struct pfsync_bus bus;
2125         } __packed r;
2126
2127         struct pfsync_softc *sc = V_pfsyncif;
2128
2129         bzero(&r, sizeof(r));
2130
2131         r.subh.action = PFSYNC_ACT_BUS;
2132         r.subh.count = htons(1);
2133         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
2134
2135         r.bus.creatorid = V_pf_status.hostid;
2136         r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2137         r.bus.status = status;
2138
2139         PFSYNC_LOCK(sc);
2140         pfsync_send_plus(&r, sizeof(r));
2141         PFSYNC_UNLOCK(sc);
2142 }
2143
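/*
 * Bulk update failure timer: retry the request up to
 * PFSYNC_MAX_BULKTRIES times, then give up, clear the carp demotion,
 * and carry on as if the transfer had succeeded.
 */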
2144 static void
2145 pfsync_bulk_fail(void *arg)
2146 {
2147         struct pfsync_softc *sc = arg;
2148
2149         CURVNET_SET(sc->sc_ifp->if_vnet);
2150
2151         PFSYNC_BLOCK_ASSERT(sc);
2152
2153         if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2154                 /* Try again */
2155                 callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
2156                     pfsync_bulk_fail, V_pfsyncif);
2157                 PFSYNC_LOCK(sc);
2158                 pfsync_request_update(0, 0);
2159                 PFSYNC_UNLOCK(sc);
2160         } else {
                /* Pretend the transfer was ok. */
2162                 sc->sc_ureq_sent = 0;
2163                 sc->sc_bulk_tries = 0;
2164                 PFSYNC_LOCK(sc);
2165                 if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
2166                         (*carp_demote_adj_p)(-V_pfsync_carp_adj,
2167                             "pfsync bulk fail");
2168                 sc->sc_flags |= PFSYNCF_OK;
2169                 PFSYNC_UNLOCK(sc);
2170                 if (V_pf_status.debug >= PF_DEBUG_MISC)
2171                         printf("pfsync: failed to receive bulk update\n");
2172         }
2173
2174         CURVNET_RESTORE();
2175 }
2176
2177 static void
2178 pfsync_send_plus(void *plus, size_t pluslen)
2179 {
2180         struct pfsync_softc *sc = V_pfsyncif;
2181
2182         PFSYNC_LOCK_ASSERT(sc);
2183
2184         if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
2185                 pfsync_sendout(1);
2186
2187         sc->sc_plus = plus;
2188         sc->sc_len += (sc->sc_pluslen = pluslen);
2189
2190         pfsync_sendout(1);
2191 }
2192
2193 static void
2194 pfsync_timeout(void *arg)
2195 {
2196         struct pfsync_softc *sc = arg;
2197
2198         CURVNET_SET(sc->sc_ifp->if_vnet);
2199         PFSYNC_LOCK(sc);
2200         pfsync_push(sc);
2201         PFSYNC_UNLOCK(sc);
2202         CURVNET_RESTORE();
2203 }
2204
2205 static void
2206 pfsync_push(struct pfsync_softc *sc)
2207 {
2208
2209         PFSYNC_LOCK_ASSERT(sc);
2210
2211         sc->sc_flags |= PFSYNCF_PUSH;
2212         swi_sched(V_pfsync_swi_cookie, 0);
2213 }
2214
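/*
 * Software interrupt handler: flush the pending packet if a push was
 * requested, then drain the interface send queue, sending deferred
 * packets through the normal output path and pfsync's own packets as
 * raw IP with the configured multicast options.
 */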
2215 static void
2216 pfsyncintr(void *arg)
2217 {
2218         struct pfsync_softc *sc = arg;
2219         struct mbuf *m, *n;
2220
2221         CURVNET_SET(sc->sc_ifp->if_vnet);
2222
2223         PFSYNC_LOCK(sc);
2224         if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
2225                 pfsync_sendout(0);
2226                 sc->sc_flags &= ~PFSYNCF_PUSH;
2227         }
2228         _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
2229         PFSYNC_UNLOCK(sc);
2230
2231         for (; m != NULL; m = n) {
2233                 n = m->m_nextpkt;
2234                 m->m_nextpkt = NULL;
2235
                /*
                 * We distinguish between a deferral packet and our
                 * own pfsync packet based on the M_SKIP_FIREWALL
                 * flag.  This is XXX.
                 */
2241                 if (m->m_flags & M_SKIP_FIREWALL)
2242                         ip_output(m, NULL, NULL, 0, NULL, NULL);
2243                 else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
2244                     NULL) == 0)
2245                         V_pfsyncstats.pfsyncs_opackets++;
2246                 else
2247                         V_pfsyncstats.pfsyncs_oerrors++;
2248         }
2249         CURVNET_RESTORE();
2250 }
2251
2252 static int
2253 pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
2254 {
2255         struct ip_moptions *imo = &sc->sc_imo;
2256         int error;
2257
2258         if (!(ifp->if_flags & IFF_MULTICAST))
2259                 return (EADDRNOTAVAIL);
2260
2261         imo->imo_membership = (struct in_multi **)mship;
2262         imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
2263         imo->imo_multicast_vif = -1;
2264
2265         if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
2266             &imo->imo_membership[0])) != 0) {
2267                 imo->imo_membership = NULL;
2268                 return (error);
2269         }
2270         imo->imo_num_memberships++;
2271         imo->imo_multicast_ifp = ifp;
2272         imo->imo_multicast_ttl = PFSYNC_DFLTTL;
2273         imo->imo_multicast_loop = 0;
2274
2275         return (0);
2276 }
2277
2278 static void
2279 pfsync_multicast_cleanup(struct pfsync_softc *sc)
2280 {
2281         struct ip_moptions *imo = &sc->sc_imo;
2282
2283         in_leavegroup(imo->imo_membership[0], NULL);
2284         free(imo->imo_membership, M_PFSYNC);
2285         imo->imo_membership = NULL;
2286         imo->imo_multicast_ifp = NULL;
2287 }
2288
2289 #ifdef INET
2290 extern  struct domain inetdomain;
2291 static struct protosw in_pfsync_protosw = {
2292         .pr_type =              SOCK_RAW,
2293         .pr_domain =            &inetdomain,
2294         .pr_protocol =          IPPROTO_PFSYNC,
2295         .pr_flags =             PR_ATOMIC|PR_ADDR,
2296         .pr_input =             pfsync_input,
2297         .pr_output =            rip_output,
2298         .pr_ctloutput =         rip_ctloutput,
2299         .pr_usrreqs =           &rip_usrreqs
2300 };
2301 #endif
2302
2303 static void
pfsync_pointers_init(void)
2305 {
2306
2307         PF_RULES_WLOCK();
2308         pfsync_state_import_ptr = pfsync_state_import;
2309         pfsync_insert_state_ptr = pfsync_insert_state;
2310         pfsync_update_state_ptr = pfsync_update_state;
2311         pfsync_delete_state_ptr = pfsync_delete_state;
2312         pfsync_clear_states_ptr = pfsync_clear_states;
2313         pfsync_defer_ptr = pfsync_defer;
2314         PF_RULES_WUNLOCK();
2315 }
2316
2317 static void
pfsync_pointers_uninit(void)
2319 {
2320
2321         PF_RULES_WLOCK();
2322         pfsync_state_import_ptr = NULL;
2323         pfsync_insert_state_ptr = NULL;
2324         pfsync_update_state_ptr = NULL;
2325         pfsync_delete_state_ptr = NULL;
2326         pfsync_clear_states_ptr = NULL;
2327         pfsync_defer_ptr = NULL;
2328         PF_RULES_WUNLOCK();
2329 }
2330
2331 static void
2332 vnet_pfsync_init(const void *unused __unused)
2333 {
2334         int error;
2335
2336         V_pfsync_cloner = if_clone_simple(pfsyncname,
2337             pfsync_clone_create, pfsync_clone_destroy, 1);
2338         error = swi_add(NULL, pfsyncname, pfsyncintr, V_pfsyncif,
2339             SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
2340         if (error) {
2341                 if_clone_detach(V_pfsync_cloner);
2342                 log(LOG_INFO, "swi_add() failed in %s\n", __func__);
2343         }
2344 }
2345 VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
2346     vnet_pfsync_init, NULL);
2347
2348 static void
2349 vnet_pfsync_uninit(const void *unused __unused)
2350 {
2351
2352         if_clone_detach(V_pfsync_cloner);
2353         swi_remove(V_pfsync_swi_cookie);
2354 }
2355 /*
2356  * Detach after pf is gone; otherwise we might touch pfsync memory
2357  * from within pf after freeing pfsync.
2358  */
2359 VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
2360     vnet_pfsync_uninit, NULL);
2361
2362 static int
pfsync_init(void)
2364 {
2365 #ifdef INET
2366         int error;
2367
2368         error = pf_proto_register(PF_INET, &in_pfsync_protosw);
2369         if (error)
2370                 return (error);
2371         error = ipproto_register(IPPROTO_PFSYNC);
2372         if (error) {
2373                 pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2374                 return (error);
2375         }
2376 #endif
2377         pfsync_pointers_init();
2378
2379         return (0);
2380 }
2381
2382 static void
pfsync_uninit(void)
2384 {
2385
2386         pfsync_pointers_uninit();
2387
2388 #ifdef INET
2389         ipproto_unregister(IPPROTO_PFSYNC);
2390         pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2391 #endif
2392 }
2393
2394 static int
2395 pfsync_modevent(module_t mod, int type, void *data)
2396 {
2397         int error = 0;
2398
2399         switch (type) {
2400         case MOD_LOAD:
2401                 error = pfsync_init();
2402                 break;
2403         case MOD_QUIESCE:
2404                 /*
2405                  * Module should not be unloaded due to race conditions.
2406                  */
2407                 error = EBUSY;
2408                 break;
2409         case MOD_UNLOAD:
2410                 pfsync_uninit();
2411                 break;
2412         default:
2413                 error = EINVAL;
2414                 break;
2415         }
2416
2417         return (error);
2418 }
2419
2420 static moduledata_t pfsync_mod = {
2421         pfsyncname,
2422         pfsync_modevent,
2423         0
2424 };
2425
2426 #define PFSYNC_MODVER 1
2427
2428 /* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
2429 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
2430 MODULE_VERSION(pfsync, PFSYNC_MODVER);
2431 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);