]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netpfil/pf/if_pfsync.c
Prefer NULL over 0 for pointers
[FreeBSD/FreeBSD.git] / sys / netpfil / pf / if_pfsync.c
1 /*      $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $     */
2
3 /*
4  * Copyright (c) 2002 Michael Shalayeff
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28
29 /*
30  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
31  *
32  * Permission to use, copy, modify, and distribute this software for any
33  * purpose with or without fee is hereby granted, provided that the above
34  * copyright notice and this permission notice appear in all copies.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
37  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
38  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
39  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
40  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
41  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
42  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
43  */
44
45 /*
46  * Revisions picked from OpenBSD after revision 1.110 import:
47  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
48  * 1.120, 1.175 - use monotonic time_uptime
49  * 1.122 - reduce number of updates for non-TCP sessions
50  * 1.128 - cleanups
51  * 1.146 - bzero() mbuf before sparsely filling it with data
52  * 1.170 - SIOCSIFMTU checks
53  * 1.126, 1.142 - deferred packets processing
54  * 1.173 - correct expire time processing
55  */
56
57 #include <sys/cdefs.h>
58 __FBSDID("$FreeBSD$");
59
60 #include "opt_inet.h"
61 #include "opt_inet6.h"
62 #include "opt_pf.h"
63
64 #include <sys/param.h>
65 #include <sys/bus.h>
66 #include <sys/endian.h>
67 #include <sys/interrupt.h>
68 #include <sys/kernel.h>
69 #include <sys/lock.h>
70 #include <sys/mbuf.h>
71 #include <sys/module.h>
72 #include <sys/mutex.h>
73 #include <sys/priv.h>
74 #include <sys/protosw.h>
75 #include <sys/socket.h>
76 #include <sys/sockio.h>
77 #include <sys/sysctl.h>
78
79 #include <net/bpf.h>
80 #include <net/if.h>
81 #include <net/if_clone.h>
82 #include <net/if_types.h>
83 #include <net/pfvar.h>
84 #include <net/if_pfsync.h>
85
86 #include <netinet/if_ether.h>
87 #include <netinet/in.h>
88 #include <netinet/in_var.h>
89 #include <netinet/ip.h>
90 #include <netinet/ip_carp.h>
91 #include <netinet/ip_var.h>
92 #include <netinet/tcp.h>
93 #include <netinet/tcp_fsm.h>
94 #include <netinet/tcp_seq.h>
95
/*
 * Size of a pfsync packet that carries no payload messages: the IP header,
 * the pfsync header, one subheader and a struct pfsync_eof.
 * pfsync_clone_create() uses it as the initial value of sc_len.
 */
96 #define PFSYNC_MINPKT ( \
97         sizeof(struct ip) + \
98         sizeof(struct pfsync_header) + \
99         sizeof(struct pfsync_subheader) + \
100         sizeof(struct pfsync_eof))
101
/*
 * Per-packet context filled in by pfsync_input() and handed to every
 * pfsync_in_*() action handler.
 */
102 struct pfsync_pkt {
103         struct ip *ip;          /* IP header of the received packet */
104         struct in_addr src;     /* copy of ip->ip_src */
105         u_int8_t flags;         /* PFSYNC_SI_* flags, e.g. PFSYNC_SI_CKSUM */
106 };
107
108 static int      pfsync_upd_tcp(struct pf_state *, struct pfsync_state_peer *,
109                     struct pfsync_state_peer *);
110 static int      pfsync_in_clr(struct pfsync_pkt *, struct mbuf *, int, int);
111 static int      pfsync_in_ins(struct pfsync_pkt *, struct mbuf *, int, int);
112 static int      pfsync_in_iack(struct pfsync_pkt *, struct mbuf *, int, int);
113 static int      pfsync_in_upd(struct pfsync_pkt *, struct mbuf *, int, int);
114 static int      pfsync_in_upd_c(struct pfsync_pkt *, struct mbuf *, int, int);
115 static int      pfsync_in_ureq(struct pfsync_pkt *, struct mbuf *, int, int);
116 static int      pfsync_in_del(struct pfsync_pkt *, struct mbuf *, int, int);
117 static int      pfsync_in_del_c(struct pfsync_pkt *, struct mbuf *, int, int);
118 static int      pfsync_in_bus(struct pfsync_pkt *, struct mbuf *, int, int);
119 static int      pfsync_in_tdb(struct pfsync_pkt *, struct mbuf *, int, int);
120 static int      pfsync_in_eof(struct pfsync_pkt *, struct mbuf *, int, int);
121 static int      pfsync_in_error(struct pfsync_pkt *, struct mbuf *, int, int);
122
/*
 * Input dispatch table, indexed by the action field of a received
 * subheader (see pfsync_input()).  Each handler is called with the packet
 * context, the mbuf, the offset of its messages and the message count.
 * It returns the number of bytes it consumed, or -1 when pfsync_input()
 * must stop processing the packet.
 * NOTE(review): the entry order is assumed to follow the PFSYNC_ACT_*
 * numbering named in the trailing comments -- confirm against if_pfsync.h.
 */
123 static int (*pfsync_acts[])(struct pfsync_pkt *, struct mbuf *, int, int) = {
124         pfsync_in_clr,                  /* PFSYNC_ACT_CLR */
125         pfsync_in_ins,                  /* PFSYNC_ACT_INS */
126         pfsync_in_iack,                 /* PFSYNC_ACT_INS_ACK */
127         pfsync_in_upd,                  /* PFSYNC_ACT_UPD */
128         pfsync_in_upd_c,                /* PFSYNC_ACT_UPD_C */
129         pfsync_in_ureq,                 /* PFSYNC_ACT_UPD_REQ */
130         pfsync_in_del,                  /* PFSYNC_ACT_DEL */
131         pfsync_in_del_c,                /* PFSYNC_ACT_DEL_C */
132         pfsync_in_error,                /* PFSYNC_ACT_INS_F */
133         pfsync_in_error,                /* PFSYNC_ACT_DEL_F */
134         pfsync_in_bus,                  /* PFSYNC_ACT_BUS */
135         pfsync_in_tdb,                  /* PFSYNC_ACT_TDB */
136         pfsync_in_eof                   /* PFSYNC_ACT_EOF */
137 };
138
/*
 * Description of one outgoing message queue; the instances live in
 * pfsync_qs[].
 */
139 struct pfsync_q {
140         void            (*write)(struct pf_state *, void *);    /* pfsync_out_*() routine; presumably serialises the state into the buffer */
141         size_t          len;            /* size of one message of this kind */
142         u_int8_t        action;         /* PFSYNC_ACT_* value for this queue */
143 };
144
145 /* we have one of these for every PFSYNC_S_ */
146 static void     pfsync_out_state(struct pf_state *, void *);
147 static void     pfsync_out_iack(struct pf_state *, void *);
148 static void     pfsync_out_upd_c(struct pf_state *, void *);
149 static void     pfsync_out_del(struct pf_state *, void *);
150
/*
 * Output queue descriptors: writer routine, message size and action code.
 * NOTE(review): the table is presumably indexed by the PFSYNC_S_* queue
 * id (pfsync_state_import() queues with PFSYNC_S_IACK) -- confirm that the
 * order matches the PFSYNC_S_* values in if_pfsync.h.
 */
151 static struct pfsync_q pfsync_qs[] = {
152         { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
153         { pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
154         { pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
155         { pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
156         { pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
157 };
158
159 static void     pfsync_q_ins(struct pf_state *, int);
160 static void     pfsync_q_del(struct pf_state *);
161
162 static void     pfsync_update_state(struct pf_state *);
163
/* Element of the softc's sc_upd_req_list: one update-request message. */
164 struct pfsync_upd_req_item {
165         TAILQ_ENTRY(pfsync_upd_req_item)        ur_entry;       /* list linkage */
166         struct pfsync_upd_req                   ur_msg;         /* the request message itself */
167 };
168
/*
 * A deferred packet, kept on the softc's sc_deferrals list together with
 * the state it belongs to and a per-deferral callout.
 */
169 struct pfsync_deferral {
170         struct pfsync_softc             *pd_sc;         /* owning softc */
171         TAILQ_ENTRY(pfsync_deferral)    pd_entry;       /* linkage on sc_deferrals */
172         u_int                           pd_refs;        /* bumped by pfsync_clone_destroy() before draining pd_tmo */
173         struct callout                  pd_tmo;         /* deferral timeout */
174
175         struct pf_state                 *pd_st;         /* state reference; dropped with pf_release_state() */
176         struct mbuf                     *pd_m;          /* held packet; freed with m_freem() */
177 };
178
/*
 * Per-interface pfsync state.  pfsync_clone_create() allocates it zeroed
 * and publishes the single instance through V_pfsyncif.
 */
179 struct pfsync_softc {
180         /* Configuration */
181         struct ifnet            *sc_ifp;        /* the pfsync interface itself */
182         struct ifnet            *sc_sync_if;    /* interface sync packets must arrive on */
183         struct ip_moptions      sc_imo;
184         struct in_addr          sc_sync_peer;
185         uint32_t                sc_flags;       /* PFSYNCF_* bits below */
186 #define PFSYNCF_OK              0x00000001
187 #define PFSYNCF_DEFER           0x00000002
188 #define PFSYNCF_PUSH            0x00000004
189         uint8_t                 sc_maxupdates;  /* initialised to 128 */
190         struct ip               sc_template;
191         struct callout          sc_tmo;
192         struct mtx              sc_mtx;         /* taken by PFSYNC_LOCK() */
193
194         /* Queued data */
195         size_t                  sc_len;         /* starts at PFSYNC_MINPKT */
196         TAILQ_HEAD(, pf_state)                  sc_qs[PFSYNC_S_COUNT];
197         TAILQ_HEAD(, pfsync_upd_req_item)       sc_upd_req_list;
198         TAILQ_HEAD(, pfsync_deferral)           sc_deferrals;
199         u_int                   sc_deferred;    /* counter kept alongside sc_deferrals */
200         void                    *sc_plus;
201         size_t                  sc_pluslen;
202
203         /* Bulk update info */
204         struct mtx              sc_bulk_mtx;    /* taken by PFSYNC_BLOCK(); tied to both bulk callouts */
205         uint32_t                sc_ureq_sent;
206         int                     sc_bulk_tries;
207         uint32_t                sc_ureq_received;
208         int                     sc_bulk_hashid;
209         uint64_t                sc_bulk_stateid;
210         uint32_t                sc_bulk_creatorid;
211         struct callout          sc_bulk_tmo;
212         struct callout          sc_bulkfail_tmo;
213 };
214
/* Lock, unlock and ownership-assert helpers for the softc mutex sc_mtx. */
215 #define PFSYNC_LOCK(sc)         mtx_lock(&(sc)->sc_mtx)
216 #define PFSYNC_UNLOCK(sc)       mtx_unlock(&(sc)->sc_mtx)
217 #define PFSYNC_LOCK_ASSERT(sc)  mtx_assert(&(sc)->sc_mtx, MA_OWNED)
218
/* The same three helpers for the bulk-update mutex sc_bulk_mtx. */
219 #define PFSYNC_BLOCK(sc)        mtx_lock(&(sc)->sc_bulk_mtx)
220 #define PFSYNC_BUNLOCK(sc)      mtx_unlock(&(sc)->sc_bulk_mtx)
221 #define PFSYNC_BLOCK_ASSERT(sc) mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
222
/*
 * Malloc type for pfsync allocations, followed by the VNET-scoped
 * variables of this file and their V_ accessor macros.
 */
223 static MALLOC_DEFINE(M_PFSYNC, "pfsync", "pfsync(4) data");
/* The pfsync softc; set by pfsync_clone_create(), cleared on destroy. */
224 static VNET_DEFINE(struct pfsync_softc  *, pfsyncif) = NULL;
225 #define V_pfsyncif              VNET(pfsyncif)
226 static VNET_DEFINE(void *, pfsync_swi_cookie) = NULL;
227 #define V_pfsync_swi_cookie     VNET(pfsync_swi_cookie)
/* Counters bumped on the input path (ipackets, badif, badttl, ...). */
228 static VNET_DEFINE(struct pfsyncstats, pfsyncstats);
229 #define V_pfsyncstats           VNET(pfsyncstats)
/* CARP demotion adjustment; defaults to CARP_MAXSKEW. */
230 static VNET_DEFINE(int, pfsync_carp_adj) = CARP_MAXSKEW;
231 #define V_pfsync_carp_adj       VNET(pfsync_carp_adj)
232
233 static void     pfsync_timeout(void *);
234 static void     pfsync_push(struct pfsync_softc *);
235 static void     pfsyncintr(void *);
236 static int      pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
237                     void *);
238 static void     pfsync_multicast_cleanup(struct pfsync_softc *);
239 static void     pfsync_pointers_init(void);
240 static void     pfsync_pointers_uninit(void);
241 static int      pfsync_init(void);
242 static void     pfsync_uninit(void);
243
/*
 * Sysctl declarations: a "pfsync" node under _net, carrying the
 * statistics structure ("stats") and the CARP demotion adjustment
 * ("carp_demotion_factor").
 */
244 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
245 SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
246     &VNET_NAME(pfsyncstats), pfsyncstats,
247     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
248 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
249     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
250
251 static int      pfsync_clone_create(struct if_clone *, int, caddr_t);
252 static void     pfsync_clone_destroy(struct ifnet *);
253 static int      pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
254                     struct pf_state_peer *);
255 static int      pfsyncoutput(struct ifnet *, struct mbuf *, struct sockaddr *,
256                     struct route *);
257 static int      pfsyncioctl(struct ifnet *, u_long, caddr_t);
258
259 static int      pfsync_defer(struct pf_state *, struct mbuf *);
260 static void     pfsync_undefer(struct pfsync_deferral *, int);
261 static void     pfsync_undefer_state(struct pf_state *, int);
262 static void     pfsync_defer_tmo(void *);
263
264 static void     pfsync_request_update(u_int32_t, u_int64_t);
265 static void     pfsync_update_state_req(struct pf_state *);
266
267 static void     pfsync_drop(struct pfsync_softc *);
268 static void     pfsync_sendout(int);
269 static void     pfsync_send_plus(void *, size_t);
270
271 static void     pfsync_bulk_start(void);
272 static void     pfsync_bulk_status(u_int8_t);
273 static void     pfsync_bulk_update(void *);
274 static void     pfsync_bulk_fail(void *);
275
276 #ifdef IPSEC
277 static void     pfsync_update_net_tdb(struct pfsync_tdb *);
278 #endif
279
280 #define PFSYNC_MAX_BULKTRIES    12
281
282 VNET_DEFINE(struct ifc_simple_data, pfsync_cloner_data);
283 VNET_DEFINE(struct if_clone, pfsync_cloner);
284 #define V_pfsync_cloner_data    VNET(pfsync_cloner_data)
285 #define V_pfsync_cloner         VNET(pfsync_cloner)
286 IFC_SIMPLE_DECLARE(pfsync, 1);
287
288 static int
289 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
290 {
291         struct pfsync_softc *sc;
292         struct ifnet *ifp;
293         int q;
294
295         if (unit != 0)
296                 return (EINVAL);
297
298         sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
299         sc->sc_flags |= PFSYNCF_OK;
300
301         for (q = 0; q < PFSYNC_S_COUNT; q++)
302                 TAILQ_INIT(&sc->sc_qs[q]);
303
304         TAILQ_INIT(&sc->sc_upd_req_list);
305         TAILQ_INIT(&sc->sc_deferrals);
306
307         sc->sc_len = PFSYNC_MINPKT;
308         sc->sc_maxupdates = 128;
309
310         ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
311         if (ifp == NULL) {
312                 free(sc, M_PFSYNC);
313                 return (ENOSPC);
314         }
315         if_initname(ifp, ifc->ifc_name, unit);
316         ifp->if_softc = sc;
317         ifp->if_ioctl = pfsyncioctl;
318         ifp->if_output = pfsyncoutput;
319         ifp->if_type = IFT_PFSYNC;
320         ifp->if_snd.ifq_maxlen = ifqmaxlen;
321         ifp->if_hdrlen = sizeof(struct pfsync_header);
322         ifp->if_mtu = ETHERMTU;
323         mtx_init(&sc->sc_mtx, "pfsync", NULL, MTX_DEF);
324         mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
325         callout_init(&sc->sc_tmo, CALLOUT_MPSAFE);
326         callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
327         callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
328
329         if_attach(ifp);
330
331         bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
332
333         V_pfsyncif = sc;
334
335         return (0);
336 }
337
338 static void
339 pfsync_clone_destroy(struct ifnet *ifp)
340 {
341         struct pfsync_softc *sc = ifp->if_softc;
342
343         /*
344          * At this stage, everything should have already been
345          * cleared by pfsync_uninit(), and we have only to
346          * drain callouts.
347          */
348         while (sc->sc_deferred > 0) {
349                 struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
350
351                 TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
352                 sc->sc_deferred--;
353                 if (callout_stop(&pd->pd_tmo)) {
354                         pf_release_state(pd->pd_st);
355                         m_freem(pd->pd_m);
356                         free(pd, M_PFSYNC);
357                 } else {
358                         pd->pd_refs++;
359                         callout_drain(&pd->pd_tmo);
360                         free(pd, M_PFSYNC);
361                 }
362         }
363
364         callout_drain(&sc->sc_tmo);
365         callout_drain(&sc->sc_bulkfail_tmo);
366         callout_drain(&sc->sc_bulk_tmo);
367
368         if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
369                 (*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
370         bpfdetach(ifp);
371         if_detach(ifp);
372
373         pfsync_drop(sc);
374
375         if_free(ifp);
376         if (sc->sc_imo.imo_membership)
377                 pfsync_multicast_cleanup(sc);
378         mtx_destroy(&sc->sc_mtx);
379         mtx_destroy(&sc->sc_bulk_mtx);
380         free(sc, M_PFSYNC);
381
382         V_pfsyncif = NULL;
383 }
384
385 static int
386 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
387     struct pf_state_peer *d)
388 {
389         if (s->scrub.scrub_flag && d->scrub == NULL) {
390                 d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
391                 if (d->scrub == NULL)
392                         return (ENOMEM);
393         }
394
395         return (0);
396 }
397
398
399 static int
400 pfsync_state_import(struct pfsync_state *sp, u_int8_t flags)
401 {
402         struct pfsync_softc *sc = V_pfsyncif;
403         struct pf_state *st = NULL;
404         struct pf_state_key *skw = NULL, *sks = NULL;
405         struct pf_rule *r = NULL;
406         struct pfi_kif  *kif;
407         int error;
408
409         PF_RULES_RASSERT();
410
411         if (sp->creatorid == 0 && V_pf_status.debug >= PF_DEBUG_MISC) {
412                 printf("%s: invalid creator id: %08x\n", __func__,
413                     ntohl(sp->creatorid));
414                 return (EINVAL);
415         }
416
417         if ((kif = pfi_kif_find(sp->ifname)) == NULL) {
418                 if (V_pf_status.debug >= PF_DEBUG_MISC)
419                         printf("%s: unknown interface: %s\n", __func__,
420                             sp->ifname);
421                 if (flags & PFSYNC_SI_IOCTL)
422                         return (EINVAL);
423                 return (0);     /* skip this state */
424         }
425
426         /*
427          * If the ruleset checksums match or the state is coming from the ioctl,
428          * it's safe to associate the state with the rule of that number.
429          */
430         if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
431             (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
432             pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
433                 r = pf_main_ruleset.rules[
434                     PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
435         else
436                 r = &V_pf_default_rule;
437
438         if ((r->max_states && r->states_cur >= r->max_states))
439                 goto cleanup;
440
441         /*
442          * XXXGL: consider M_WAITOK in ioctl path after.
443          */
444         if ((st = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO)) == NULL)
445                 goto cleanup;
446
447         if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
448                 goto cleanup;
449
450         if (PF_ANEQ(&sp->key[PF_SK_WIRE].addr[0],
451             &sp->key[PF_SK_STACK].addr[0], sp->af) ||
452             PF_ANEQ(&sp->key[PF_SK_WIRE].addr[1],
453             &sp->key[PF_SK_STACK].addr[1], sp->af) ||
454             sp->key[PF_SK_WIRE].port[0] != sp->key[PF_SK_STACK].port[0] ||
455             sp->key[PF_SK_WIRE].port[1] != sp->key[PF_SK_STACK].port[1]) {
456                 sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
457                 if (sks == NULL)
458                         goto cleanup;
459         } else
460                 sks = skw;
461
462         /* allocate memory for scrub info */
463         if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
464             pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
465                 goto cleanup;
466
467         /* copy to state key(s) */
468         skw->addr[0] = sp->key[PF_SK_WIRE].addr[0];
469         skw->addr[1] = sp->key[PF_SK_WIRE].addr[1];
470         skw->port[0] = sp->key[PF_SK_WIRE].port[0];
471         skw->port[1] = sp->key[PF_SK_WIRE].port[1];
472         skw->proto = sp->proto;
473         skw->af = sp->af;
474         if (sks != skw) {
475                 sks->addr[0] = sp->key[PF_SK_STACK].addr[0];
476                 sks->addr[1] = sp->key[PF_SK_STACK].addr[1];
477                 sks->port[0] = sp->key[PF_SK_STACK].port[0];
478                 sks->port[1] = sp->key[PF_SK_STACK].port[1];
479                 sks->proto = sp->proto;
480                 sks->af = sp->af;
481         }
482
483         /* copy to state */
484         bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
485         st->creation = time_uptime - ntohl(sp->creation);
486         st->expire = time_uptime;
487         if (sp->expire) {
488                 uint32_t timeout;
489
490                 timeout = r->timeout[sp->timeout];
491                 if (!timeout)
492                         timeout = V_pf_default_rule.timeout[sp->timeout];
493
494                 /* sp->expire may have been adaptively scaled by export. */
495                 st->expire -= timeout - ntohl(sp->expire);
496         }
497
498         st->direction = sp->direction;
499         st->log = sp->log;
500         st->timeout = sp->timeout;
501         st->state_flags = sp->state_flags;
502
503         st->id = sp->id;
504         st->creatorid = sp->creatorid;
505         pf_state_peer_ntoh(&sp->src, &st->src);
506         pf_state_peer_ntoh(&sp->dst, &st->dst);
507
508         st->rule.ptr = r;
509         st->nat_rule.ptr = NULL;
510         st->anchor.ptr = NULL;
511         st->rt_kif = NULL;
512
513         st->pfsync_time = time_uptime;
514         st->sync_state = PFSYNC_S_NONE;
515
516         /* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
517         r->states_cur++;
518         r->states_tot++;
519
520         if (!(flags & PFSYNC_SI_IOCTL))
521                 st->state_flags |= PFSTATE_NOSYNC;
522
523         if ((error = pf_state_insert(kif, skw, sks, st)) != 0) {
524                 /* XXX when we have nat_rule/anchors, use STATE_DEC_COUNTERS */
525                 r->states_cur--;
526                 goto cleanup_state;
527         }
528
529         if (!(flags & PFSYNC_SI_IOCTL)) {
530                 st->state_flags &= ~PFSTATE_NOSYNC;
531                 if (st->state_flags & PFSTATE_ACK) {
532                         pfsync_q_ins(st, PFSYNC_S_IACK);
533                         pfsync_push(sc);
534                 }
535         }
536         st->state_flags &= ~PFSTATE_ACK;
537         PF_STATE_UNLOCK(st);
538
539         return (0);
540
541 cleanup:
542         error = ENOMEM;
543         if (skw == sks)
544                 sks = NULL;
545         if (skw != NULL)
546                 uma_zfree(V_pf_state_key_z, skw);
547         if (sks != NULL)
548                 uma_zfree(V_pf_state_key_z, sks);
549
550 cleanup_state:  /* pf_state_insert() frees the state keys. */
551         if (st) {
552                 if (st->dst.scrub)
553                         uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
554                 if (st->src.scrub)
555                         uma_zfree(V_pf_state_scrub_z, st->src.scrub);
556                 uma_zfree(V_pf_state_z, st);
557         }
558         return (error);
559 }
560
561 static void
562 pfsync_input(struct mbuf *m, __unused int off)
563 {
564         struct pfsync_softc *sc = V_pfsyncif;
565         struct pfsync_pkt pkt;
566         struct ip *ip = mtod(m, struct ip *);
567         struct pfsync_header *ph;
568         struct pfsync_subheader subh;
569
570         int offset;
571         int rv;
572         uint16_t count;
573
574         V_pfsyncstats.pfsyncs_ipackets++;
575
576         /* Verify that we have a sync interface configured. */
577         if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
578             (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
579                 goto done;
580
581         /* verify that the packet came in on the right interface */
582         if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
583                 V_pfsyncstats.pfsyncs_badif++;
584                 goto done;
585         }
586
587         sc->sc_ifp->if_ipackets++;
588         sc->sc_ifp->if_ibytes += m->m_pkthdr.len;
589         /* verify that the IP TTL is 255. */
590         if (ip->ip_ttl != PFSYNC_DFLTTL) {
591                 V_pfsyncstats.pfsyncs_badttl++;
592                 goto done;
593         }
594
595         offset = ip->ip_hl << 2;
596         if (m->m_pkthdr.len < offset + sizeof(*ph)) {
597                 V_pfsyncstats.pfsyncs_hdrops++;
598                 goto done;
599         }
600
601         if (offset + sizeof(*ph) > m->m_len) {
602                 if (m_pullup(m, offset + sizeof(*ph)) == NULL) {
603                         V_pfsyncstats.pfsyncs_hdrops++;
604                         return;
605                 }
606                 ip = mtod(m, struct ip *);
607         }
608         ph = (struct pfsync_header *)((char *)ip + offset);
609
610         /* verify the version */
611         if (ph->version != PFSYNC_VERSION) {
612                 V_pfsyncstats.pfsyncs_badver++;
613                 goto done;
614         }
615
616         /* Cheaper to grab this now than having to mess with mbufs later */
617         pkt.ip = ip;
618         pkt.src = ip->ip_src;
619         pkt.flags = 0;
620
621         /*
622          * Trusting pf_chksum during packet processing, as well as seeking
623          * in interface name tree, require holding PF_RULES_RLOCK().
624          */
625         PF_RULES_RLOCK();
626         if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
627                 pkt.flags |= PFSYNC_SI_CKSUM;
628
629         offset += sizeof(*ph);
630         for (;;) {
631                 m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
632                 offset += sizeof(subh);
633
634                 if (subh.action >= PFSYNC_ACT_MAX) {
635                         V_pfsyncstats.pfsyncs_badact++;
636                         PF_RULES_RUNLOCK();
637                         goto done;
638                 }
639
640                 count = ntohs(subh.count);
641                 V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
642                 rv = (*pfsync_acts[subh.action])(&pkt, m, offset, count);
643                 if (rv == -1) {
644                         PF_RULES_RUNLOCK();
645                         return;
646                 }
647
648                 offset += rv;
649         }
650         PF_RULES_RUNLOCK();
651
652 done:
653         m_freem(m);
654 }
655
656 static int
657 pfsync_in_clr(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
658 {
659         struct pfsync_clr *clr;
660         struct mbuf *mp;
661         int len = sizeof(*clr) * count;
662         int i, offp;
663         u_int32_t creatorid;
664
665         mp = m_pulldown(m, offset, len, &offp);
666         if (mp == NULL) {
667                 V_pfsyncstats.pfsyncs_badlen++;
668                 return (-1);
669         }
670         clr = (struct pfsync_clr *)(mp->m_data + offp);
671
672         for (i = 0; i < count; i++) {
673                 creatorid = clr[i].creatorid;
674
675                 if (clr[i].ifname[0] != '\0' &&
676                     pfi_kif_find(clr[i].ifname) == NULL)
677                         continue;
678
679                 for (int i = 0; i <= V_pf_hashmask; i++) {
680                         struct pf_idhash *ih = &V_pf_idhash[i];
681                         struct pf_state *s;
682 relock:
683                         PF_HASHROW_LOCK(ih);
684                         LIST_FOREACH(s, &ih->states, entry) {
685                                 if (s->creatorid == creatorid) {
686                                         s->state_flags |= PFSTATE_NOSYNC;
687                                         pf_unlink_state(s, PF_ENTER_LOCKED);
688                                         goto relock;
689                                 }
690                         }
691                         PF_HASHROW_UNLOCK(ih);
692                 }
693         }
694
695         return (len);
696 }
697
698 static int
699 pfsync_in_ins(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
700 {
701         struct mbuf *mp;
702         struct pfsync_state *sa, *sp;
703         int len = sizeof(*sp) * count;
704         int i, offp;
705
706         mp = m_pulldown(m, offset, len, &offp);
707         if (mp == NULL) {
708                 V_pfsyncstats.pfsyncs_badlen++;
709                 return (-1);
710         }
711         sa = (struct pfsync_state *)(mp->m_data + offp);
712
713         for (i = 0; i < count; i++) {
714                 sp = &sa[i];
715
716                 /* Check for invalid values. */
717                 if (sp->timeout >= PFTM_MAX ||
718                     sp->src.state > PF_TCPS_PROXY_DST ||
719                     sp->dst.state > PF_TCPS_PROXY_DST ||
720                     sp->direction > PF_OUT ||
721                     (sp->af != AF_INET && sp->af != AF_INET6)) {
722                         if (V_pf_status.debug >= PF_DEBUG_MISC)
723                                 printf("%s: invalid value\n", __func__);
724                         V_pfsyncstats.pfsyncs_badval++;
725                         continue;
726                 }
727
728                 if (pfsync_state_import(sp, pkt->flags) == ENOMEM)
729                         /* Drop out, but process the rest of the actions. */
730                         break;
731         }
732
733         return (len);
734 }
735
736 static int
737 pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
738 {
739         struct pfsync_ins_ack *ia, *iaa;
740         struct pf_state *st;
741
742         struct mbuf *mp;
743         int len = count * sizeof(*ia);
744         int offp, i;
745
746         mp = m_pulldown(m, offset, len, &offp);
747         if (mp == NULL) {
748                 V_pfsyncstats.pfsyncs_badlen++;
749                 return (-1);
750         }
751         iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
752
753         for (i = 0; i < count; i++) {
754                 ia = &iaa[i];
755
756                 st = pf_find_state_byid(ia->id, ia->creatorid);
757                 if (st == NULL)
758                         continue;
759
760                 if (st->state_flags & PFSTATE_ACK) {
761                         PFSYNC_LOCK(V_pfsyncif);
762                         pfsync_undefer_state(st, 0);
763                         PFSYNC_UNLOCK(V_pfsyncif);
764                 }
765                 PF_STATE_UNLOCK(st);
766         }
767         /*
768          * XXX this is not yet implemented, but we know the size of the
769          * message so we can skip it.
770          */
771
772         return (count * sizeof(struct pfsync_ins_ack));
773 }
774
775 static int
776 pfsync_upd_tcp(struct pf_state *st, struct pfsync_state_peer *src,
777     struct pfsync_state_peer *dst)
778 {
779         int sfail = 0;
780
781         PF_STATE_LOCK_ASSERT(st);
782
783         /*
784          * The state should never go backwards except
785          * for syn-proxy states.  Neither should the
786          * sequence window slide backwards.
787          */
788         if (st->src.state > src->state &&
789             (st->src.state < PF_TCPS_PROXY_SRC ||
790             src->state >= PF_TCPS_PROXY_SRC))
791                 sfail = 1;
792         else if (SEQ_GT(st->src.seqlo, ntohl(src->seqlo)))
793                 sfail = 3;
794         else if (st->dst.state > dst->state) {
795                 /* There might still be useful
796                  * information about the src state here,
797                  * so import that part of the update,
798                  * then "fail" so we send the updated
799                  * state back to the peer who is missing
800                  * our what we know. */
801                 pf_state_peer_ntoh(src, &st->src);
802                 /* XXX do anything with timeouts? */
803                 sfail = 7;
804         } else if (st->dst.state >= TCPS_SYN_SENT &&
805             SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo)))
806                 sfail = 4;
807
808         return (sfail);
809 }
810
811 static int
812 pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
813 {
814         struct pfsync_softc *sc = V_pfsyncif;
815         struct pfsync_state *sa, *sp;
816         struct pf_state_key *sk;
817         struct pf_state *st;
818         int sfail;
819
820         struct mbuf *mp;
821         int len = count * sizeof(*sp);
822         int offp, i;
823
824         mp = m_pulldown(m, offset, len, &offp);
825         if (mp == NULL) {
826                 V_pfsyncstats.pfsyncs_badlen++;
827                 return (-1);
828         }
829         sa = (struct pfsync_state *)(mp->m_data + offp);
830
831         for (i = 0; i < count; i++) {
832                 sp = &sa[i];
833
834                 /* check for invalid values */
835                 if (sp->timeout >= PFTM_MAX ||
836                     sp->src.state > PF_TCPS_PROXY_DST ||
837                     sp->dst.state > PF_TCPS_PROXY_DST) {
838                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
839                                 printf("pfsync_input: PFSYNC_ACT_UPD: "
840                                     "invalid value\n");
841                         }
842                         V_pfsyncstats.pfsyncs_badval++;
843                         continue;
844                 }
845
846                 st = pf_find_state_byid(sp->id, sp->creatorid);
847                 if (st == NULL) {
848                         /* insert the update */
849                         if (pfsync_state_import(sp, 0))
850                                 V_pfsyncstats.pfsyncs_badstate++;
851                         continue;
852                 }
853
854                 if (st->state_flags & PFSTATE_ACK) {
855                         PFSYNC_LOCK(sc);
856                         pfsync_undefer_state(st, 1);
857                         PFSYNC_UNLOCK(sc);
858                 }
859
860                 sk = st->key[PF_SK_WIRE];       /* XXX right one? */
861                 sfail = 0;
862                 if (sk->proto == IPPROTO_TCP)
863                         sfail = pfsync_upd_tcp(st, &sp->src, &sp->dst);
864                 else {
865                         /*
866                          * Non-TCP protocol state machine always go
867                          * forwards
868                          */
869                         if (st->src.state > sp->src.state)
870                                 sfail = 5;
871                         else if (st->dst.state > sp->dst.state)
872                                 sfail = 6;
873                 }
874
875                 if (sfail) {
876                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
877                                 printf("pfsync: %s stale update (%d)"
878                                     " id: %016llx creatorid: %08x\n",
879                                     (sfail < 7 ?  "ignoring" : "partial"),
880                                     sfail, (unsigned long long)be64toh(st->id),
881                                     ntohl(st->creatorid));
882                         }
883                         V_pfsyncstats.pfsyncs_stale++;
884
885                         pfsync_update_state(st);
886                         PF_STATE_UNLOCK(st);
887                         PFSYNC_LOCK(sc);
888                         pfsync_push(sc);
889                         PFSYNC_UNLOCK(sc);
890                         continue;
891                 }
892                 pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
893                 pf_state_peer_ntoh(&sp->src, &st->src);
894                 pf_state_peer_ntoh(&sp->dst, &st->dst);
895                 st->expire = time_uptime;
896                 st->timeout = sp->timeout;
897                 st->pfsync_time = time_uptime;
898                 PF_STATE_UNLOCK(st);
899         }
900
901         return (len);
902 }
903
904 static int
905 pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
906 {
907         struct pfsync_softc *sc = V_pfsyncif;
908         struct pfsync_upd_c *ua, *up;
909         struct pf_state_key *sk;
910         struct pf_state *st;
911
912         int len = count * sizeof(*up);
913         int sfail;
914
915         struct mbuf *mp;
916         int offp, i;
917
918         mp = m_pulldown(m, offset, len, &offp);
919         if (mp == NULL) {
920                 V_pfsyncstats.pfsyncs_badlen++;
921                 return (-1);
922         }
923         ua = (struct pfsync_upd_c *)(mp->m_data + offp);
924
925         for (i = 0; i < count; i++) {
926                 up = &ua[i];
927
928                 /* check for invalid values */
929                 if (up->timeout >= PFTM_MAX ||
930                     up->src.state > PF_TCPS_PROXY_DST ||
931                     up->dst.state > PF_TCPS_PROXY_DST) {
932                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
933                                 printf("pfsync_input: "
934                                     "PFSYNC_ACT_UPD_C: "
935                                     "invalid value\n");
936                         }
937                         V_pfsyncstats.pfsyncs_badval++;
938                         continue;
939                 }
940
941                 st = pf_find_state_byid(up->id, up->creatorid);
942                 if (st == NULL) {
943                         /* We don't have this state. Ask for it. */
944                         PFSYNC_LOCK(sc);
945                         pfsync_request_update(up->creatorid, up->id);
946                         PFSYNC_UNLOCK(sc);
947                         continue;
948                 }
949
950                 if (st->state_flags & PFSTATE_ACK) {
951                         PFSYNC_LOCK(sc);
952                         pfsync_undefer_state(st, 1);
953                         PFSYNC_UNLOCK(sc);
954                 }
955
956                 sk = st->key[PF_SK_WIRE]; /* XXX right one? */
957                 sfail = 0;
958                 if (sk->proto == IPPROTO_TCP)
959                         sfail = pfsync_upd_tcp(st, &up->src, &up->dst);
960                 else {
961                         /*
962                          * Non-TCP protocol state machine always go forwards
963                          */
964                         if (st->src.state > up->src.state)
965                                 sfail = 5;
966                         else if (st->dst.state > up->dst.state)
967                                 sfail = 6;
968                 }
969
970                 if (sfail) {
971                         if (V_pf_status.debug >= PF_DEBUG_MISC) {
972                                 printf("pfsync: ignoring stale update "
973                                     "(%d) id: %016llx "
974                                     "creatorid: %08x\n", sfail,
975                                     (unsigned long long)be64toh(st->id),
976                                     ntohl(st->creatorid));
977                         }
978                         V_pfsyncstats.pfsyncs_stale++;
979
980                         pfsync_update_state(st);
981                         PF_STATE_UNLOCK(st);
982                         PFSYNC_LOCK(sc);
983                         pfsync_push(sc);
984                         PFSYNC_UNLOCK(sc);
985                         continue;
986                 }
987                 pfsync_alloc_scrub_memory(&up->dst, &st->dst);
988                 pf_state_peer_ntoh(&up->src, &st->src);
989                 pf_state_peer_ntoh(&up->dst, &st->dst);
990                 st->expire = time_uptime;
991                 st->timeout = up->timeout;
992                 st->pfsync_time = time_uptime;
993                 PF_STATE_UNLOCK(st);
994         }
995
996         return (len);
997 }
998
999 static int
1000 pfsync_in_ureq(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1001 {
1002         struct pfsync_upd_req *ur, *ura;
1003         struct mbuf *mp;
1004         int len = count * sizeof(*ur);
1005         int i, offp;
1006
1007         struct pf_state *st;
1008
1009         mp = m_pulldown(m, offset, len, &offp);
1010         if (mp == NULL) {
1011                 V_pfsyncstats.pfsyncs_badlen++;
1012                 return (-1);
1013         }
1014         ura = (struct pfsync_upd_req *)(mp->m_data + offp);
1015
1016         for (i = 0; i < count; i++) {
1017                 ur = &ura[i];
1018
1019                 if (ur->id == 0 && ur->creatorid == 0)
1020                         pfsync_bulk_start();
1021                 else {
1022                         st = pf_find_state_byid(ur->id, ur->creatorid);
1023                         if (st == NULL) {
1024                                 V_pfsyncstats.pfsyncs_badstate++;
1025                                 continue;
1026                         }
1027                         if (st->state_flags & PFSTATE_NOSYNC) {
1028                                 PF_STATE_UNLOCK(st);
1029                                 continue;
1030                         }
1031
1032                         pfsync_update_state_req(st);
1033                         PF_STATE_UNLOCK(st);
1034                 }
1035         }
1036
1037         return (len);
1038 }
1039
1040 static int
1041 pfsync_in_del(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1042 {
1043         struct mbuf *mp;
1044         struct pfsync_state *sa, *sp;
1045         struct pf_state *st;
1046         int len = count * sizeof(*sp);
1047         int offp, i;
1048
1049         mp = m_pulldown(m, offset, len, &offp);
1050         if (mp == NULL) {
1051                 V_pfsyncstats.pfsyncs_badlen++;
1052                 return (-1);
1053         }
1054         sa = (struct pfsync_state *)(mp->m_data + offp);
1055
1056         for (i = 0; i < count; i++) {
1057                 sp = &sa[i];
1058
1059                 st = pf_find_state_byid(sp->id, sp->creatorid);
1060                 if (st == NULL) {
1061                         V_pfsyncstats.pfsyncs_badstate++;
1062                         continue;
1063                 }
1064                 st->state_flags |= PFSTATE_NOSYNC;
1065                 pf_unlink_state(st, PF_ENTER_LOCKED);
1066         }
1067
1068         return (len);
1069 }
1070
1071 static int
1072 pfsync_in_del_c(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1073 {
1074         struct mbuf *mp;
1075         struct pfsync_del_c *sa, *sp;
1076         struct pf_state *st;
1077         int len = count * sizeof(*sp);
1078         int offp, i;
1079
1080         mp = m_pulldown(m, offset, len, &offp);
1081         if (mp == NULL) {
1082                 V_pfsyncstats.pfsyncs_badlen++;
1083                 return (-1);
1084         }
1085         sa = (struct pfsync_del_c *)(mp->m_data + offp);
1086
1087         for (i = 0; i < count; i++) {
1088                 sp = &sa[i];
1089
1090                 st = pf_find_state_byid(sp->id, sp->creatorid);
1091                 if (st == NULL) {
1092                         V_pfsyncstats.pfsyncs_badstate++;
1093                         continue;
1094                 }
1095
1096                 st->state_flags |= PFSTATE_NOSYNC;
1097                 pf_unlink_state(st, PF_ENTER_LOCKED);
1098         }
1099
1100         return (len);
1101 }
1102
1103 static int
1104 pfsync_in_bus(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1105 {
1106         struct pfsync_softc *sc = V_pfsyncif;
1107         struct pfsync_bus *bus;
1108         struct mbuf *mp;
1109         int len = count * sizeof(*bus);
1110         int offp;
1111
1112         PFSYNC_BLOCK(sc);
1113
1114         /* If we're not waiting for a bulk update, who cares. */
1115         if (sc->sc_ureq_sent == 0) {
1116                 PFSYNC_BUNLOCK(sc);
1117                 return (len);
1118         }
1119
1120         mp = m_pulldown(m, offset, len, &offp);
1121         if (mp == NULL) {
1122                 PFSYNC_BUNLOCK(sc);
1123                 V_pfsyncstats.pfsyncs_badlen++;
1124                 return (-1);
1125         }
1126         bus = (struct pfsync_bus *)(mp->m_data + offp);
1127
1128         switch (bus->status) {
1129         case PFSYNC_BUS_START:
1130                 callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
1131                     V_pf_limits[PF_LIMIT_STATES].limit /
1132                     ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
1133                     sizeof(struct pfsync_state)),
1134                     pfsync_bulk_fail, sc);
1135                 if (V_pf_status.debug >= PF_DEBUG_MISC)
1136                         printf("pfsync: received bulk update start\n");
1137                 break;
1138
1139         case PFSYNC_BUS_END:
1140                 if (time_uptime - ntohl(bus->endtime) >=
1141                     sc->sc_ureq_sent) {
1142                         /* that's it, we're happy */
1143                         sc->sc_ureq_sent = 0;
1144                         sc->sc_bulk_tries = 0;
1145                         callout_stop(&sc->sc_bulkfail_tmo);
1146                         if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1147                                 (*carp_demote_adj_p)(-V_pfsync_carp_adj,
1148                                     "pfsync bulk done");
1149                         sc->sc_flags |= PFSYNCF_OK;
1150                         if (V_pf_status.debug >= PF_DEBUG_MISC)
1151                                 printf("pfsync: received valid "
1152                                     "bulk update end\n");
1153                 } else {
1154                         if (V_pf_status.debug >= PF_DEBUG_MISC)
1155                                 printf("pfsync: received invalid "
1156                                     "bulk update end: bad timestamp\n");
1157                 }
1158                 break;
1159         }
1160         PFSYNC_BUNLOCK(sc);
1161
1162         return (len);
1163 }
1164
1165 static int
1166 pfsync_in_tdb(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1167 {
1168         int len = count * sizeof(struct pfsync_tdb);
1169
1170 #if defined(IPSEC)
1171         struct pfsync_tdb *tp;
1172         struct mbuf *mp;
1173         int offp;
1174         int i;
1175         int s;
1176
1177         mp = m_pulldown(m, offset, len, &offp);
1178         if (mp == NULL) {
1179                 V_pfsyncstats.pfsyncs_badlen++;
1180                 return (-1);
1181         }
1182         tp = (struct pfsync_tdb *)(mp->m_data + offp);
1183
1184         for (i = 0; i < count; i++)
1185                 pfsync_update_net_tdb(&tp[i]);
1186 #endif
1187
1188         return (len);
1189 }
1190
#if defined(IPSEC)
/*
 * Update an in-kernel tdb from a received record.  Silently fail if no
 * tdb is found.
 *
 * The record is rejected (and counted as a bad state) when its SPI is in
 * the reserved range, its destination address family is neither AF_INET
 * nor AF_INET6, or its replay/byte counters are lower than the tdb's.
 * The rpl and cur_bytes fields of "pt" are converted to host byte order
 * in place.
 */
static void
pfsync_update_net_tdb(struct pfsync_tdb *pt)
{
	struct tdb		*tdb;

	/* check for invalid values */
	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
	    (pt->dst.sa.sa_family != AF_INET &&
	    pt->dst.sa.sa_family != AF_INET6))
		goto bad;

	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
	if (tdb == NULL)
		return;

	pt->rpl = ntohl(pt->rpl);
	pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);

	/* Neither replay nor byte counter should ever decrease. */
	if (pt->rpl < tdb->tdb_rpl ||
	    pt->cur_bytes < tdb->tdb_cur_bytes)
		goto bad;

	tdb->tdb_rpl = pt->rpl;
	tdb->tdb_cur_bytes = pt->cur_bytes;
	return;

bad:
	if (V_pf_status.debug >= PF_DEBUG_MISC)
		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
		    "invalid value\n");
	V_pfsyncstats.pfsyncs_badstate++;
}
#endif
1229
1230
1231 static int
1232 pfsync_in_eof(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1233 {
1234         /* check if we are at the right place in the packet */
1235         if (offset != m->m_pkthdr.len - sizeof(struct pfsync_eof))
1236                 V_pfsyncstats.pfsyncs_badact++;
1237
1238         /* we're done. free and let the caller return */
1239         m_freem(m);
1240         return (-1);
1241 }
1242
1243 static int
1244 pfsync_in_error(struct pfsync_pkt *pkt, struct mbuf *m, int offset, int count)
1245 {
1246         V_pfsyncstats.pfsyncs_badact++;
1247
1248         m_freem(m);
1249         return (-1);
1250 }
1251
/*
 * Output routine of the pfsync interface: the packet handed in is
 * discarded and success is reported.
 */
static int
pfsyncoutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
    struct route *rt)
{

	m_freem(m);

	return (0);
}
1259
1260 /* ARGSUSED */
1261 static int
1262 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1263 {
1264         struct pfsync_softc *sc = ifp->if_softc;
1265         struct ifreq *ifr = (struct ifreq *)data;
1266         struct pfsyncreq pfsyncr;
1267         int error;
1268
1269         switch (cmd) {
1270         case SIOCSIFFLAGS:
1271                 PFSYNC_LOCK(sc);
1272                 if (ifp->if_flags & IFF_UP) {
1273                         ifp->if_drv_flags |= IFF_DRV_RUNNING;
1274                         PFSYNC_UNLOCK(sc);
1275                         pfsync_pointers_init();
1276                 } else {
1277                         ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1278                         PFSYNC_UNLOCK(sc);
1279                         pfsync_pointers_uninit();
1280                 }
1281                 break;
1282         case SIOCSIFMTU:
1283                 if (!sc->sc_sync_if ||
1284                     ifr->ifr_mtu <= PFSYNC_MINPKT ||
1285                     ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
1286                         return (EINVAL);
1287                 if (ifr->ifr_mtu < ifp->if_mtu) {
1288                         PFSYNC_LOCK(sc);
1289                         if (sc->sc_len > PFSYNC_MINPKT)
1290                                 pfsync_sendout(1);
1291                         PFSYNC_UNLOCK(sc);
1292                 }
1293                 ifp->if_mtu = ifr->ifr_mtu;
1294                 break;
1295         case SIOCGETPFSYNC:
1296                 bzero(&pfsyncr, sizeof(pfsyncr));
1297                 PFSYNC_LOCK(sc);
1298                 if (sc->sc_sync_if) {
1299                         strlcpy(pfsyncr.pfsyncr_syncdev,
1300                             sc->sc_sync_if->if_xname, IFNAMSIZ);
1301                 }
1302                 pfsyncr.pfsyncr_syncpeer = sc->sc_sync_peer;
1303                 pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
1304                 pfsyncr.pfsyncr_defer = (PFSYNCF_DEFER ==
1305                     (sc->sc_flags & PFSYNCF_DEFER));
1306                 PFSYNC_UNLOCK(sc);
1307                 return (copyout(&pfsyncr, ifr->ifr_data, sizeof(pfsyncr)));
1308
1309         case SIOCSETPFSYNC:
1310             {
1311                 struct ip_moptions *imo = &sc->sc_imo;
1312                 struct ifnet *sifp;
1313                 struct ip *ip;
1314                 void *mship = NULL;
1315
1316                 if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
1317                         return (error);
1318                 if ((error = copyin(ifr->ifr_data, &pfsyncr, sizeof(pfsyncr))))
1319                         return (error);
1320
1321                 if (pfsyncr.pfsyncr_maxupdates > 255)
1322                         return (EINVAL);
1323
1324                 if (pfsyncr.pfsyncr_syncdev[0] == 0)
1325                         sifp = NULL;
1326                 else if ((sifp = ifunit_ref(pfsyncr.pfsyncr_syncdev)) == NULL)
1327                         return (EINVAL);
1328
1329                 if (pfsyncr.pfsyncr_syncpeer.s_addr == 0 && sifp != NULL)
1330                         mship = malloc((sizeof(struct in_multi *) *
1331                             IP_MIN_MEMBERSHIPS), M_PFSYNC, M_WAITOK | M_ZERO);
1332
1333                 PFSYNC_LOCK(sc);
1334                 if (pfsyncr.pfsyncr_syncpeer.s_addr == 0)
1335                         sc->sc_sync_peer.s_addr = htonl(INADDR_PFSYNC_GROUP);
1336                 else
1337                         sc->sc_sync_peer.s_addr =
1338                             pfsyncr.pfsyncr_syncpeer.s_addr;
1339
1340                 sc->sc_maxupdates = pfsyncr.pfsyncr_maxupdates;
1341                 if (pfsyncr.pfsyncr_defer) {
1342                         sc->sc_flags |= PFSYNCF_DEFER;
1343                         pfsync_defer_ptr = pfsync_defer;
1344                 } else {
1345                         sc->sc_flags &= ~PFSYNCF_DEFER;
1346                         pfsync_defer_ptr = NULL;
1347                 }
1348
1349                 if (sifp == NULL) {
1350                         if (sc->sc_sync_if)
1351                                 if_rele(sc->sc_sync_if);
1352                         sc->sc_sync_if = NULL;
1353                         if (imo->imo_membership)
1354                                 pfsync_multicast_cleanup(sc);
1355                         PFSYNC_UNLOCK(sc);
1356                         break;
1357                 }
1358
1359                 if (sc->sc_len > PFSYNC_MINPKT &&
1360                     (sifp->if_mtu < sc->sc_ifp->if_mtu ||
1361                     (sc->sc_sync_if != NULL &&
1362                     sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
1363                     sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
1364                         pfsync_sendout(1);
1365
1366                 if (imo->imo_membership)
1367                         pfsync_multicast_cleanup(sc);
1368
1369                 if (sc->sc_sync_peer.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
1370                         error = pfsync_multicast_setup(sc, sifp, mship);
1371                         if (error) {
1372                                 if_rele(sifp);
1373                                 free(mship, M_PFSYNC);
1374                                 return (error);
1375                         }
1376                 }
1377                 if (sc->sc_sync_if)
1378                         if_rele(sc->sc_sync_if);
1379                 sc->sc_sync_if = sifp;
1380
1381                 ip = &sc->sc_template;
1382                 bzero(ip, sizeof(*ip));
1383                 ip->ip_v = IPVERSION;
1384                 ip->ip_hl = sizeof(sc->sc_template) >> 2;
1385                 ip->ip_tos = IPTOS_LOWDELAY;
1386                 /* len and id are set later. */
1387                 ip->ip_off = IP_DF;
1388                 ip->ip_ttl = PFSYNC_DFLTTL;
1389                 ip->ip_p = IPPROTO_PFSYNC;
1390                 ip->ip_src.s_addr = INADDR_ANY;
1391                 ip->ip_dst.s_addr = sc->sc_sync_peer.s_addr;
1392
1393                 /* Request a full state table update. */
1394                 if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
1395                         (*carp_demote_adj_p)(V_pfsync_carp_adj,
1396                             "pfsync bulk start");
1397                 sc->sc_flags &= ~PFSYNCF_OK;
1398                 if (V_pf_status.debug >= PF_DEBUG_MISC)
1399                         printf("pfsync: requesting bulk update\n");
1400                 pfsync_request_update(0, 0);
1401                 PFSYNC_UNLOCK(sc);
1402                 PFSYNC_BLOCK(sc);
1403                 sc->sc_ureq_sent = time_uptime;
1404                 callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
1405                     sc);
1406                 PFSYNC_BUNLOCK(sc);
1407
1408                 break;
1409             }
1410         default:
1411                 return (ENOTTY);
1412         }
1413
1414         return (0);
1415 }
1416
/*
 * Queue write method: serialize "st" into "buf" as a full
 * struct pfsync_state by way of pfsync_state_export().
 */
static void
pfsync_out_state(struct pf_state *st, void *buf)
{

	pfsync_state_export((struct pfsync_state *)buf, st);
}
1424
1425 static void
1426 pfsync_out_iack(struct pf_state *st, void *buf)
1427 {
1428         struct pfsync_ins_ack *iack = buf;
1429
1430         iack->id = st->id;
1431         iack->creatorid = st->creatorid;
1432 }
1433
1434 static void
1435 pfsync_out_upd_c(struct pf_state *st, void *buf)
1436 {
1437         struct pfsync_upd_c *up = buf;
1438
1439         bzero(up, sizeof(*up));
1440         up->id = st->id;
1441         pf_state_peer_hton(&st->src, &up->src);
1442         pf_state_peer_hton(&st->dst, &up->dst);
1443         up->creatorid = st->creatorid;
1444         up->timeout = st->timeout;
1445 }
1446
1447 static void
1448 pfsync_out_del(struct pf_state *st, void *buf)
1449 {
1450         struct pfsync_del_c *dp = buf;
1451
1452         dp->id = st->id;
1453         dp->creatorid = st->creatorid;
1454         st->state_flags |= PFSTATE_NOSYNC;
1455 }
1456
1457 static void
1458 pfsync_drop(struct pfsync_softc *sc)
1459 {
1460         struct pf_state *st, *next;
1461         struct pfsync_upd_req_item *ur;
1462         int q;
1463
1464         for (q = 0; q < PFSYNC_S_COUNT; q++) {
1465                 if (TAILQ_EMPTY(&sc->sc_qs[q]))
1466                         continue;
1467
1468                 TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
1469                         KASSERT(st->sync_state == q,
1470                                 ("%s: st->sync_state == q",
1471                                         __func__));
1472                         st->sync_state = PFSYNC_S_NONE;
1473                         pf_release_state(st);
1474                 }
1475                 TAILQ_INIT(&sc->sc_qs[q]);
1476         }
1477
1478         while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1479                 TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1480                 free(ur, M_PFSYNC);
1481         }
1482
1483         sc->sc_plus = NULL;
1484         sc->sc_len = PFSYNC_MINPKT;
1485 }
1486
1487 static void
1488 pfsync_sendout(int schedswi)
1489 {
1490         struct pfsync_softc *sc = V_pfsyncif;
1491         struct ifnet *ifp = sc->sc_ifp;
1492         struct mbuf *m;
1493         struct ip *ip;
1494         struct pfsync_header *ph;
1495         struct pfsync_subheader *subh;
1496         struct pf_state *st;
1497         struct pfsync_upd_req_item *ur;
1498         int offset;
1499         int q, count = 0;
1500
1501         KASSERT(sc != NULL, ("%s: null sc", __func__));
1502         KASSERT(sc->sc_len > PFSYNC_MINPKT,
1503             ("%s: sc_len %zu", __func__, sc->sc_len));
1504         PFSYNC_LOCK_ASSERT(sc);
1505
1506         if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
1507                 pfsync_drop(sc);
1508                 return;
1509         }
1510
1511         m = m_get2(M_NOWAIT, MT_DATA, M_PKTHDR, max_linkhdr + sc->sc_len);
1512         if (m == NULL) {
1513                 sc->sc_ifp->if_oerrors++;
1514                 V_pfsyncstats.pfsyncs_onomem++;
1515                 return;
1516         }
1517         m->m_data += max_linkhdr;
1518         m->m_len = m->m_pkthdr.len = sc->sc_len;
1519
1520         /* build the ip header */
1521         ip = (struct ip *)m->m_data;
1522         bcopy(&sc->sc_template, ip, sizeof(*ip));
1523         offset = sizeof(*ip);
1524
1525         ip->ip_len = m->m_pkthdr.len;
1526         ip->ip_id = htons(ip_randomid());
1527
1528         /* build the pfsync header */
1529         ph = (struct pfsync_header *)(m->m_data + offset);
1530         bzero(ph, sizeof(*ph));
1531         offset += sizeof(*ph);
1532
1533         ph->version = PFSYNC_VERSION;
1534         ph->len = htons(sc->sc_len - sizeof(*ip));
1535         bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
1536
1537         /* walk the queues */
1538         for (q = 0; q < PFSYNC_S_COUNT; q++) {
1539                 if (TAILQ_EMPTY(&sc->sc_qs[q]))
1540                         continue;
1541
1542                 subh = (struct pfsync_subheader *)(m->m_data + offset);
1543                 offset += sizeof(*subh);
1544
1545                 count = 0;
1546                 TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
1547                         KASSERT(st->sync_state == q,
1548                                 ("%s: st->sync_state == q",
1549                                         __func__));
1550                         /*
1551                          * XXXGL: some of write methods do unlocked reads
1552                          * of state data :(
1553                          */
1554                         pfsync_qs[q].write(st, m->m_data + offset);
1555                         offset += pfsync_qs[q].len;
1556                         st->sync_state = PFSYNC_S_NONE;
1557                         pf_release_state(st);
1558                         count++;
1559                 }
1560                 TAILQ_INIT(&sc->sc_qs[q]);
1561
1562                 bzero(subh, sizeof(*subh));
1563                 subh->action = pfsync_qs[q].action;
1564                 subh->count = htons(count);
1565                 V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
1566         }
1567
1568         if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
1569                 subh = (struct pfsync_subheader *)(m->m_data + offset);
1570                 offset += sizeof(*subh);
1571
1572                 count = 0;
1573                 while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
1574                         TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
1575
1576                         bcopy(&ur->ur_msg, m->m_data + offset,
1577                             sizeof(ur->ur_msg));
1578                         offset += sizeof(ur->ur_msg);
1579                         free(ur, M_PFSYNC);
1580                         count++;
1581                 }
1582
1583                 bzero(subh, sizeof(*subh));
1584                 subh->action = PFSYNC_ACT_UPD_REQ;
1585                 subh->count = htons(count);
1586                 V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
1587         }
1588
1589         /* has someone built a custom region for us to add? */
1590         if (sc->sc_plus != NULL) {
1591                 bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
1592                 offset += sc->sc_pluslen;
1593
1594                 sc->sc_plus = NULL;
1595         }
1596
1597         subh = (struct pfsync_subheader *)(m->m_data + offset);
1598         offset += sizeof(*subh);
1599
1600         bzero(subh, sizeof(*subh));
1601         subh->action = PFSYNC_ACT_EOF;
1602         subh->count = htons(1);
1603         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
1604
1605         /* XXX write checksum in EOF here */
1606
1607         /* we're done, let's put it on the wire */
1608         if (ifp->if_bpf) {
1609                 m->m_data += sizeof(*ip);
1610                 m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
1611                 BPF_MTAP(ifp, m);
1612                 m->m_data -= sizeof(*ip);
1613                 m->m_len = m->m_pkthdr.len = sc->sc_len;
1614         }
1615
1616         if (sc->sc_sync_if == NULL) {
1617                 sc->sc_len = PFSYNC_MINPKT;
1618                 m_freem(m);
1619                 return;
1620         }
1621
1622         sc->sc_ifp->if_opackets++;
1623         sc->sc_ifp->if_obytes += m->m_pkthdr.len;
1624         sc->sc_len = PFSYNC_MINPKT;
1625
1626         if (!_IF_QFULL(&sc->sc_ifp->if_snd))
1627                 _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1628         else {
1629                 m_freem(m);
1630                 sc->sc_ifp->if_snd.ifq_drops++;
1631         }
1632         if (schedswi)
1633                 swi_sched(V_pfsync_swi_cookie, 0);
1634 }
1635
1636 static void
1637 pfsync_insert_state(struct pf_state *st)
1638 {
1639         struct pfsync_softc *sc = V_pfsyncif;
1640
1641         if (st->state_flags & PFSTATE_NOSYNC)
1642                 return;
1643
1644         if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
1645             st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
1646                 st->state_flags |= PFSTATE_NOSYNC;
1647                 return;
1648         }
1649
1650         KASSERT(st->sync_state == PFSYNC_S_NONE,
1651                 ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
1652
1653         PFSYNC_LOCK(sc);
1654         if (sc->sc_len == PFSYNC_MINPKT)
1655                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1656
1657         pfsync_q_ins(st, PFSYNC_S_INS);
1658         PFSYNC_UNLOCK(sc);
1659
1660         st->sync_updates = 0;
1661 }
1662
1663 static int
1664 pfsync_defer(struct pf_state *st, struct mbuf *m)
1665 {
1666         struct pfsync_softc *sc = V_pfsyncif;
1667         struct pfsync_deferral *pd;
1668
1669         if (m->m_flags & (M_BCAST|M_MCAST))
1670                 return (0);
1671
1672         PFSYNC_LOCK(sc);
1673
1674         if (sc == NULL || !(sc->sc_ifp->if_flags & IFF_DRV_RUNNING) ||
1675             !(sc->sc_flags & PFSYNCF_DEFER)) {
1676                 PFSYNC_UNLOCK(sc);
1677                 return (0);
1678         }
1679
1680          if (sc->sc_deferred >= 128)
1681                 pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
1682
1683         pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
1684         if (pd == NULL)
1685                 return (0);
1686         sc->sc_deferred++;
1687
1688         m->m_flags |= M_SKIP_FIREWALL;
1689         st->state_flags |= PFSTATE_ACK;
1690
1691         pd->pd_sc = sc;
1692         pd->pd_refs = 0;
1693         pd->pd_st = st;
1694         pf_ref_state(st);
1695         pd->pd_m = m;
1696
1697         TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
1698         callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1699         callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
1700
1701         pfsync_push(sc);
1702
1703         return (1);
1704 }
1705
1706 static void
1707 pfsync_undefer(struct pfsync_deferral *pd, int drop)
1708 {
1709         struct pfsync_softc *sc = pd->pd_sc;
1710         struct mbuf *m = pd->pd_m;
1711         struct pf_state *st = pd->pd_st;
1712
1713         PFSYNC_LOCK_ASSERT(sc);
1714
1715         TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1716         sc->sc_deferred--;
1717         pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
1718         free(pd, M_PFSYNC);
1719         pf_release_state(st);
1720
1721         if (drop)
1722                 m_freem(m);
1723         else {
1724                 _IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
1725                 pfsync_push(sc);
1726         }
1727 }
1728
1729 static void
1730 pfsync_defer_tmo(void *arg)
1731 {
1732         struct pfsync_deferral *pd = arg;
1733         struct pfsync_softc *sc = pd->pd_sc;
1734         struct mbuf *m = pd->pd_m;
1735         struct pf_state *st = pd->pd_st;
1736
1737         PFSYNC_LOCK_ASSERT(sc);
1738
1739         CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
1740
1741         TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
1742         sc->sc_deferred--;
1743         pd->pd_st->state_flags &= ~PFSTATE_ACK; /* XXX: locking! */
1744         if (pd->pd_refs == 0)
1745                 free(pd, M_PFSYNC);
1746         PFSYNC_UNLOCK(sc);
1747
1748         ip_output(m, NULL, NULL, 0, NULL, NULL);
1749
1750         pf_release_state(st);
1751
1752         CURVNET_RESTORE();
1753 }
1754
1755 static void
1756 pfsync_undefer_state(struct pf_state *st, int drop)
1757 {
1758         struct pfsync_softc *sc = V_pfsyncif;
1759         struct pfsync_deferral *pd;
1760
1761         PFSYNC_LOCK_ASSERT(sc);
1762
1763         TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
1764                  if (pd->pd_st == st) {
1765                         if (callout_stop(&pd->pd_tmo))
1766                                 pfsync_undefer(pd, drop);
1767                         return;
1768                 }
1769         }
1770
1771         panic("%s: unable to find deferred state", __func__);
1772 }
1773
1774 static void
1775 pfsync_update_state(struct pf_state *st)
1776 {
1777         struct pfsync_softc *sc = V_pfsyncif;
1778         int sync = 0;
1779
1780         PF_STATE_LOCK_ASSERT(st);
1781         PFSYNC_LOCK(sc);
1782
1783         if (st->state_flags & PFSTATE_ACK)
1784                 pfsync_undefer_state(st, 0);
1785         if (st->state_flags & PFSTATE_NOSYNC) {
1786                 if (st->sync_state != PFSYNC_S_NONE)
1787                         pfsync_q_del(st);
1788                 PFSYNC_UNLOCK(sc);
1789                 return;
1790         }
1791
1792         if (sc->sc_len == PFSYNC_MINPKT)
1793                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1794
1795         switch (st->sync_state) {
1796         case PFSYNC_S_UPD_C:
1797         case PFSYNC_S_UPD:
1798         case PFSYNC_S_INS:
1799                 /* we're already handling it */
1800
1801                 if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
1802                         st->sync_updates++;
1803                         if (st->sync_updates >= sc->sc_maxupdates)
1804                                 sync = 1;
1805                 }
1806                 break;
1807
1808         case PFSYNC_S_IACK:
1809                 pfsync_q_del(st);
1810         case PFSYNC_S_NONE:
1811                 pfsync_q_ins(st, PFSYNC_S_UPD_C);
1812                 st->sync_updates = 0;
1813                 break;
1814
1815         default:
1816                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1817         }
1818
1819         if (sync || (time_uptime - st->pfsync_time) < 2)
1820                 pfsync_push(sc);
1821
1822         PFSYNC_UNLOCK(sc);
1823 }
1824
1825 static void
1826 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
1827 {
1828         struct pfsync_softc *sc = V_pfsyncif;
1829         struct pfsync_upd_req_item *item;
1830         size_t nlen = sizeof(struct pfsync_upd_req);
1831
1832         PFSYNC_LOCK_ASSERT(sc);
1833
1834         /*
1835          * This code does a bit to prevent multiple update requests for the
1836          * same state being generated. It searches current subheader queue,
1837          * but it doesn't lookup into queue of already packed datagrams.
1838          */
1839         TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
1840                 if (item->ur_msg.id == id &&
1841                     item->ur_msg.creatorid == creatorid)
1842                         return;
1843
1844         item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
1845         if (item == NULL)
1846                 return; /* XXX stats */
1847
1848         item->ur_msg.id = id;
1849         item->ur_msg.creatorid = creatorid;
1850
1851         if (TAILQ_EMPTY(&sc->sc_upd_req_list))
1852                 nlen += sizeof(struct pfsync_subheader);
1853
1854         if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1855                 pfsync_sendout(1);
1856
1857                 nlen = sizeof(struct pfsync_subheader) +
1858                     sizeof(struct pfsync_upd_req);
1859         }
1860
1861         TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
1862         sc->sc_len += nlen;
1863 }
1864
1865 static void
1866 pfsync_update_state_req(struct pf_state *st)
1867 {
1868         struct pfsync_softc *sc = V_pfsyncif;
1869
1870         PF_STATE_LOCK_ASSERT(st);
1871         PFSYNC_LOCK(sc);
1872
1873         if (st->state_flags & PFSTATE_NOSYNC) {
1874                 if (st->sync_state != PFSYNC_S_NONE)
1875                         pfsync_q_del(st);
1876                 PFSYNC_UNLOCK(sc);
1877                 return;
1878         }
1879
1880         switch (st->sync_state) {
1881         case PFSYNC_S_UPD_C:
1882         case PFSYNC_S_IACK:
1883                 pfsync_q_del(st);
1884         case PFSYNC_S_NONE:
1885                 pfsync_q_ins(st, PFSYNC_S_UPD);
1886                 pfsync_push(sc);
1887                 break;
1888
1889         case PFSYNC_S_INS:
1890         case PFSYNC_S_UPD:
1891         case PFSYNC_S_DEL:
1892                 /* we're already handling it */
1893                 break;
1894
1895         default:
1896                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1897         }
1898
1899         PFSYNC_UNLOCK(sc);
1900 }
1901
1902 static void
1903 pfsync_delete_state(struct pf_state *st)
1904 {
1905         struct pfsync_softc *sc = V_pfsyncif;
1906
1907         PFSYNC_LOCK(sc);
1908         if (st->state_flags & PFSTATE_ACK)
1909                 pfsync_undefer_state(st, 1);
1910         if (st->state_flags & PFSTATE_NOSYNC) {
1911                 if (st->sync_state != PFSYNC_S_NONE)
1912                         pfsync_q_del(st);
1913                 PFSYNC_UNLOCK(sc);
1914                 return;
1915         }
1916
1917         if (sc->sc_len == PFSYNC_MINPKT)
1918                 callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
1919
1920         switch (st->sync_state) {
1921         case PFSYNC_S_INS:
1922                 /* We never got to tell the world so just forget about it. */
1923                 pfsync_q_del(st);
1924                 break;
1925
1926         case PFSYNC_S_UPD_C:
1927         case PFSYNC_S_UPD:
1928         case PFSYNC_S_IACK:
1929                 pfsync_q_del(st);
1930                 /* FALLTHROUGH to putting it on the del list */
1931
1932         case PFSYNC_S_NONE:
1933                 pfsync_q_ins(st, PFSYNC_S_DEL);
1934                 break;
1935
1936         default:
1937                 panic("%s: unexpected sync state %d", __func__, st->sync_state);
1938         }
1939         PFSYNC_UNLOCK(sc);
1940 }
1941
1942 static void
1943 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
1944 {
1945         struct pfsync_softc *sc = V_pfsyncif;
1946         struct {
1947                 struct pfsync_subheader subh;
1948                 struct pfsync_clr clr;
1949         } __packed r;
1950
1951         bzero(&r, sizeof(r));
1952
1953         r.subh.action = PFSYNC_ACT_CLR;
1954         r.subh.count = htons(1);
1955         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
1956
1957         strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
1958         r.clr.creatorid = creatorid;
1959
1960         PFSYNC_LOCK(sc);
1961         pfsync_send_plus(&r, sizeof(r));
1962         PFSYNC_UNLOCK(sc);
1963 }
1964
1965 static void
1966 pfsync_q_ins(struct pf_state *st, int q)
1967 {
1968         struct pfsync_softc *sc = V_pfsyncif;
1969         size_t nlen = pfsync_qs[q].len;
1970
1971         PFSYNC_LOCK_ASSERT(sc);
1972
1973         KASSERT(st->sync_state == PFSYNC_S_NONE,
1974                 ("%s: st->sync_state == PFSYNC_S_NONE", __func__));
1975         KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
1976             sc->sc_len));
1977
1978         if (TAILQ_EMPTY(&sc->sc_qs[q]))
1979                 nlen += sizeof(struct pfsync_subheader);
1980
1981         if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
1982                 pfsync_sendout(1);
1983
1984                 nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
1985         }
1986
1987         sc->sc_len += nlen;
1988         TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
1989         st->sync_state = q;
1990         pf_ref_state(st);
1991 }
1992
1993 static void
1994 pfsync_q_del(struct pf_state *st)
1995 {
1996         struct pfsync_softc *sc = V_pfsyncif;
1997         int q = st->sync_state;
1998
1999         PFSYNC_LOCK_ASSERT(sc);
2000         KASSERT(st->sync_state != PFSYNC_S_NONE,
2001                 ("%s: st->sync_state != PFSYNC_S_NONE", __func__));
2002
2003         sc->sc_len -= pfsync_qs[q].len;
2004         TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
2005         st->sync_state = PFSYNC_S_NONE;
2006         pf_release_state(st);
2007
2008         if (TAILQ_EMPTY(&sc->sc_qs[q]))
2009                 sc->sc_len -= sizeof(struct pfsync_subheader);
2010 }
2011
2012 static void
2013 pfsync_bulk_start(void)
2014 {
2015         struct pfsync_softc *sc = V_pfsyncif;
2016
2017         if (V_pf_status.debug >= PF_DEBUG_MISC)
2018                 printf("pfsync: received bulk update request\n");
2019
2020         PFSYNC_BLOCK(sc);
2021
2022         sc->sc_ureq_received = time_uptime;
2023         sc->sc_bulk_hashid = 0;
2024         sc->sc_bulk_stateid = 0;
2025         pfsync_bulk_status(PFSYNC_BUS_START);
2026         callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
2027         PFSYNC_BUNLOCK(sc);
2028 }
2029
2030 static void
2031 pfsync_bulk_update(void *arg)
2032 {
2033         struct pfsync_softc *sc = arg;
2034         struct pf_state *s;
2035         int i, sent = 0;
2036
2037         PFSYNC_BLOCK_ASSERT(sc);
2038         CURVNET_SET(sc->sc_ifp->if_vnet);
2039
2040         /*
2041          * Start with last state from previous invocation.
2042          * It may had gone, in this case start from the
2043          * hash slot.
2044          */
2045         s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
2046
2047         if (s != NULL)
2048                 i = PF_IDHASH(s);
2049         else
2050                 i = sc->sc_bulk_hashid;
2051
2052         for (; i <= V_pf_hashmask; i++) {
2053                 struct pf_idhash *ih = &V_pf_idhash[i];
2054
2055                 if (s != NULL)
2056                         PF_HASHROW_ASSERT(ih);
2057                 else {
2058                         PF_HASHROW_LOCK(ih);
2059                         s = LIST_FIRST(&ih->states);
2060                 }
2061
2062                 for (; s; s = LIST_NEXT(s, entry)) {
2063
2064                         if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
2065                             sizeof(struct pfsync_state)) {
2066                                 /* We've filled a packet. */
2067                                 sc->sc_bulk_hashid = i;
2068                                 sc->sc_bulk_stateid = s->id;
2069                                 sc->sc_bulk_creatorid = s->creatorid;
2070                                 PF_HASHROW_UNLOCK(ih);
2071                                 callout_reset(&sc->sc_bulk_tmo, 1,
2072                                     pfsync_bulk_update, sc);
2073                                 goto full;
2074                         }
2075
2076                         if (s->sync_state == PFSYNC_S_NONE &&
2077                             s->timeout < PFTM_MAX &&
2078                             s->pfsync_time <= sc->sc_ureq_received) {
2079                                 PFSYNC_LOCK(sc);
2080                                 pfsync_update_state_req(s);
2081                                 PFSYNC_UNLOCK(sc);
2082                                 sent++;
2083                         }
2084                 }
2085                 PF_HASHROW_UNLOCK(ih);
2086         }
2087
2088         /* We're done. */
2089         pfsync_bulk_status(PFSYNC_BUS_END);
2090
2091 full:
2092         CURVNET_RESTORE();
2093 }
2094
2095 static void
2096 pfsync_bulk_status(u_int8_t status)
2097 {
2098         struct {
2099                 struct pfsync_subheader subh;
2100                 struct pfsync_bus bus;
2101         } __packed r;
2102
2103         struct pfsync_softc *sc = V_pfsyncif;
2104
2105         bzero(&r, sizeof(r));
2106
2107         r.subh.action = PFSYNC_ACT_BUS;
2108         r.subh.count = htons(1);
2109         V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
2110
2111         r.bus.creatorid = V_pf_status.hostid;
2112         r.bus.endtime = htonl(time_uptime - sc->sc_ureq_received);
2113         r.bus.status = status;
2114
2115         PFSYNC_LOCK(sc);
2116         pfsync_send_plus(&r, sizeof(r));
2117         PFSYNC_UNLOCK(sc);
2118 }
2119
2120 static void
2121 pfsync_bulk_fail(void *arg)
2122 {
2123         struct pfsync_softc *sc = arg;
2124
2125         CURVNET_SET(sc->sc_ifp->if_vnet);
2126
2127         PFSYNC_BLOCK_ASSERT(sc);
2128
2129         if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
2130                 /* Try again */
2131                 callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
2132                     pfsync_bulk_fail, V_pfsyncif);
2133                 PFSYNC_LOCK(sc);
2134                 pfsync_request_update(0, 0);
2135                 PFSYNC_UNLOCK(sc);
2136         } else {
2137                 /* Pretend like the transfer was ok. */
2138                 sc->sc_ureq_sent = 0;
2139                 sc->sc_bulk_tries = 0;
2140                 PFSYNC_LOCK(sc);
2141                 if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
2142                         (*carp_demote_adj_p)(-V_pfsync_carp_adj,
2143                             "pfsync bulk fail");
2144                 sc->sc_flags |= PFSYNCF_OK;
2145                 PFSYNC_UNLOCK(sc);
2146                 if (V_pf_status.debug >= PF_DEBUG_MISC)
2147                         printf("pfsync: failed to receive bulk update\n");
2148         }
2149
2150         CURVNET_RESTORE();
2151 }
2152
2153 static void
2154 pfsync_send_plus(void *plus, size_t pluslen)
2155 {
2156         struct pfsync_softc *sc = V_pfsyncif;
2157
2158         PFSYNC_LOCK_ASSERT(sc);
2159
2160         if (sc->sc_len + pluslen > sc->sc_ifp->if_mtu)
2161                 pfsync_sendout(1);
2162
2163         sc->sc_plus = plus;
2164         sc->sc_len += (sc->sc_pluslen = pluslen);
2165
2166         pfsync_sendout(1);
2167 }
2168
2169 static void
2170 pfsync_timeout(void *arg)
2171 {
2172         struct pfsync_softc *sc = arg;
2173
2174         CURVNET_SET(sc->sc_ifp->if_vnet);
2175         PFSYNC_LOCK(sc);
2176         pfsync_push(sc);
2177         PFSYNC_UNLOCK(sc);
2178         CURVNET_RESTORE();
2179 }
2180
2181 static void
2182 pfsync_push(struct pfsync_softc *sc)
2183 {
2184
2185         PFSYNC_LOCK_ASSERT(sc);
2186
2187         sc->sc_flags |= PFSYNCF_PUSH;
2188         swi_sched(V_pfsync_swi_cookie, 0);
2189 }
2190
2191 static void
2192 pfsyncintr(void *arg)
2193 {
2194         struct pfsync_softc *sc = arg;
2195         struct mbuf *m, *n;
2196
2197         CURVNET_SET(sc->sc_ifp->if_vnet);
2198
2199         PFSYNC_LOCK(sc);
2200         if ((sc->sc_flags & PFSYNCF_PUSH) && sc->sc_len > PFSYNC_MINPKT) {
2201                 pfsync_sendout(0);
2202                 sc->sc_flags &= ~PFSYNCF_PUSH;
2203         }
2204         _IF_DEQUEUE_ALL(&sc->sc_ifp->if_snd, m);
2205         PFSYNC_UNLOCK(sc);
2206
2207         for (; m != NULL; m = n) {
2208
2209                 n = m->m_nextpkt;
2210                 m->m_nextpkt = NULL;
2211
2212                 /*
2213                  * We distinguish between a deferral packet and our
2214                  * own pfsync packet based on M_SKIP_FIREWALL
2215                  * flag. This is XXX.
2216                  */
2217                 if (m->m_flags & M_SKIP_FIREWALL)
2218                         ip_output(m, NULL, NULL, 0, NULL, NULL);
2219                 else if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo,
2220                     NULL) == 0)
2221                         V_pfsyncstats.pfsyncs_opackets++;
2222                 else
2223                         V_pfsyncstats.pfsyncs_oerrors++;
2224         }
2225         CURVNET_RESTORE();
2226 }
2227
2228 static int
2229 pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp, void *mship)
2230 {
2231         struct ip_moptions *imo = &sc->sc_imo;
2232         int error;
2233
2234         if (!(ifp->if_flags & IFF_MULTICAST))
2235                 return (EADDRNOTAVAIL);
2236
2237         imo->imo_membership = (struct in_multi **)mship;
2238         imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
2239         imo->imo_multicast_vif = -1;
2240
2241         if ((error = in_joingroup(ifp, &sc->sc_sync_peer, NULL,
2242             &imo->imo_membership[0])) != 0) {
2243                 imo->imo_membership = NULL;
2244                 return (error);
2245         }
2246         imo->imo_num_memberships++;
2247         imo->imo_multicast_ifp = ifp;
2248         imo->imo_multicast_ttl = PFSYNC_DFLTTL;
2249         imo->imo_multicast_loop = 0;
2250
2251         return (0);
2252 }
2253
2254 static void
2255 pfsync_multicast_cleanup(struct pfsync_softc *sc)
2256 {
2257         struct ip_moptions *imo = &sc->sc_imo;
2258
2259         in_leavegroup(imo->imo_membership[0], NULL);
2260         free(imo->imo_membership, M_PFSYNC);
2261         imo->imo_membership = NULL;
2262         imo->imo_multicast_ifp = NULL;
2263 }
2264
#ifdef INET
extern	struct domain inetdomain;

/*
 * Protocol switch entry for IPPROTO_PFSYNC in the inet domain.  Input is
 * handled by pfsync_input(); output, socket options and user requests
 * use the raw IP (rip_*) routines.
 */
static struct protosw in_pfsync_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_PFSYNC,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		pfsync_input,
	.pr_output =		(pr_output_t *)rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs,
};
#endif
2278
2279 static void
2280 pfsync_pointers_init()
2281 {
2282
2283         PF_RULES_WLOCK();
2284         pfsync_state_import_ptr = pfsync_state_import;
2285         pfsync_insert_state_ptr = pfsync_insert_state;
2286         pfsync_update_state_ptr = pfsync_update_state;
2287         pfsync_delete_state_ptr = pfsync_delete_state;
2288         pfsync_clear_states_ptr = pfsync_clear_states;
2289         pfsync_defer_ptr = pfsync_defer;
2290         PF_RULES_WUNLOCK();
2291 }
2292
2293 static void
2294 pfsync_pointers_uninit()
2295 {
2296
2297         PF_RULES_WLOCK();
2298         pfsync_state_import_ptr = NULL;
2299         pfsync_insert_state_ptr = NULL;
2300         pfsync_update_state_ptr = NULL;
2301         pfsync_delete_state_ptr = NULL;
2302         pfsync_clear_states_ptr = NULL;
2303         pfsync_defer_ptr = NULL;
2304         PF_RULES_WUNLOCK();
2305 }
2306
2307 static int
2308 pfsync_init()
2309 {
2310         VNET_ITERATOR_DECL(vnet_iter);
2311         int error = 0;
2312
2313         VNET_LIST_RLOCK();
2314         VNET_FOREACH(vnet_iter) {
2315                 CURVNET_SET(vnet_iter);
2316                 V_pfsync_cloner = pfsync_cloner;
2317                 V_pfsync_cloner_data = pfsync_cloner_data;
2318                 V_pfsync_cloner.ifc_data = &V_pfsync_cloner_data;
2319                 if_clone_attach(&V_pfsync_cloner);
2320                 error = swi_add(NULL, "pfsync", pfsyncintr, V_pfsyncif,
2321                     SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
2322                 CURVNET_RESTORE();
2323                 if (error)
2324                         goto fail_locked;
2325         }
2326         VNET_LIST_RUNLOCK();
2327 #ifdef INET
2328         error = pf_proto_register(PF_INET, &in_pfsync_protosw);
2329         if (error)
2330                 goto fail;
2331         error = ipproto_register(IPPROTO_PFSYNC);
2332         if (error) {
2333                 pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2334                 goto fail;
2335         }
2336 #endif
2337         pfsync_pointers_init();
2338
2339         return (0);
2340
2341 fail:
2342         VNET_LIST_RLOCK();
2343 fail_locked:
2344         VNET_FOREACH(vnet_iter) {
2345                 CURVNET_SET(vnet_iter);
2346                 if (V_pfsync_swi_cookie) {
2347                         swi_remove(V_pfsync_swi_cookie);
2348                         if_clone_detach(&V_pfsync_cloner);
2349                 }
2350                 CURVNET_RESTORE();
2351         }
2352         VNET_LIST_RUNLOCK();
2353
2354         return (error);
2355 }
2356
2357 static void
2358 pfsync_uninit()
2359 {
2360         VNET_ITERATOR_DECL(vnet_iter);
2361
2362         pfsync_pointers_uninit();
2363
2364         ipproto_unregister(IPPROTO_PFSYNC);
2365         pf_proto_unregister(PF_INET, IPPROTO_PFSYNC, SOCK_RAW);
2366         VNET_LIST_RLOCK();
2367         VNET_FOREACH(vnet_iter) {
2368                 CURVNET_SET(vnet_iter);
2369                 if_clone_detach(&V_pfsync_cloner);
2370                 swi_remove(V_pfsync_swi_cookie);
2371                 CURVNET_RESTORE();
2372         }
2373         VNET_LIST_RUNLOCK();
2374 }
2375
2376 static int
2377 pfsync_modevent(module_t mod, int type, void *data)
2378 {
2379         int error = 0;
2380
2381         switch (type) {
2382         case MOD_LOAD:
2383                 error = pfsync_init();
2384                 break;
2385         case MOD_QUIESCE:
2386                 /*
2387                  * Module should not be unloaded due to race conditions.
2388                  */
2389                 error = EBUSY;
2390                 break;
2391         case MOD_UNLOAD:
2392                 pfsync_uninit();
2393                 break;
2394         default:
2395                 error = EINVAL;
2396                 break;
2397         }
2398
2399         return (error);
2400 }
2401
2402 static moduledata_t pfsync_mod = {
2403         "pfsync",
2404         pfsync_modevent,
2405         NULL
2406 };
2407
2408 #define PFSYNC_MODVER 1
2409
2410 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
2411 MODULE_VERSION(pfsync, PFSYNC_MODVER);
2412 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);