]> CyberLeo.Net >> Repos - FreeBSD/releng/8.1.git/blob - sys/netinet/ip_carp.c
Copy stable/8 to releng/8.1 in preparation for 8.1-RC1.
[FreeBSD/releng/8.1.git] / sys / netinet / ip_carp.c
1 /*
2  * Copyright (c) 2002 Michael Shalayeff. All rights reserved.
3  * Copyright (c) 2003 Ryan McBride. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
18  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
22  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
23  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
24  * THE POSSIBILITY OF SUCH DAMAGE.
25  */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 #include "opt_carp.h"
31 #include "opt_bpf.h"
32 #include "opt_inet.h"
33 #include "opt_inet6.h"
34
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/conf.h>
39 #include <sys/kernel.h>
40 #include <sys/limits.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/module.h>
44 #include <sys/time.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/signalvar.h>
50 #include <sys/filio.h>
51 #include <sys/sockio.h>
52
53 #include <sys/socket.h>
54 #include <sys/vnode.h>
55
56 #include <machine/stdarg.h>
57
58 #include <net/bpf.h>
59 #include <net/ethernet.h>
60 #include <net/fddi.h>
61 #include <net/iso88025.h>
62 #include <net/if.h>
63 #include <net/if_clone.h>
64 #include <net/if_dl.h>
65 #include <net/if_types.h>
66 #include <net/route.h>
67 #include <net/vnet.h>
68
69 #ifdef INET
70 #include <netinet/in.h>
71 #include <netinet/in_var.h>
72 #include <netinet/in_systm.h>
73 #include <netinet/ip.h>
74 #include <netinet/ip_var.h>
75 #include <netinet/if_ether.h>
76 #include <machine/in_cksum.h>
77 #endif
78
79 #ifdef INET6
80 #include <netinet/icmp6.h>
81 #include <netinet/ip6.h>
82 #include <netinet6/ip6_var.h>
83 #include <netinet6/scope6_var.h>
84 #include <netinet6/nd6.h>
85 #endif
86
87 #include <crypto/sha1.h>
88 #include <netinet/ip_carp.h>
89
/* Name prefix handed to if_initname() when a carp interface is cloned. */
90 #define CARP_IFNAME     "carp"
/* malloc(9) type used for the softc, carp_if and membership arrays below. */
91 static MALLOC_DEFINE(M_CARP, "CARP", "CARP interfaces");
/* Parent node for the SYSCTL_INT/SYSCTL_STRUCT entries declared below. */
92 SYSCTL_DECL(_net_inet_carp);
93
/*
 * Per-interface state of one carp(4) pseudo-interface.
 *
 * A softc is allocated zeroed in carp_clone_create() and linked onto the
 * global carpif_list through sc_next.  While it is attached to a parent
 * interface (sc_carpdev != NULL) it is also linked onto that parent's
 * carp_if queue through sc_list, and the parent's carp_if mutex is the
 * lock taken by CARP_SCLOCK().
 */
94 struct carp_softc {
95         struct ifnet            *sc_ifp;        /* our own ifnet, from if_alloc() */
96         struct ifnet            *sc_carpdev;    /* Pointer to parent interface */
97         struct in_ifaddr        *sc_ia;         /* primary iface address */
98         struct ip_moptions       sc_imo;
99 #ifdef INET6
100         struct in6_ifaddr       *sc_ia6;        /* primary iface address v6 */
101         struct ip6_moptions      sc_im6o;
102 #endif /* INET6 */
103         TAILQ_ENTRY(carp_softc)  sc_list;       /* linkage on carp_if vhif_vrs */
104
105         enum { INIT = 0, BACKUP, MASTER }       sc_state;
106
107         int                      sc_flags_backup;
108         int                      sc_suppress;
109
110         int                      sc_sendad_errors;
111 #define CARP_SENDAD_MAX_ERRORS  3
112         int                      sc_sendad_success;
113 #define CARP_SENDAD_MIN_SUCCESS 3
114
115         int                      sc_vhid;       /* -1 until configured */
116         int                      sc_advskew;    /* scaled by 1/256 s in carp_input_c() */
117         int                      sc_naddrs;
118         int                      sc_naddrs6;
119         int                      sc_advbase;    /* seconds */
120         int                      sc_init_counter; /* non-zero: pick a random sc_counter */
121         u_int64_t                sc_counter;
122
123         /* authentication */
124 #define CARP_HMAC_PAD   64
125         unsigned char sc_key[CARP_KEY_LEN];
126         unsigned char sc_pad[CARP_HMAC_PAD];    /* HMAC opad after carp_hmac_prepare() */
127         SHA1_CTX sc_sha1;                       /* precomputed inner-hash prefix */
128
129         struct callout           sc_ad_tmo;     /* advertisement timeout */
130         struct callout           sc_md_tmo;     /* master down timeout */
131         struct callout           sc_md6_tmo;    /* master down timeout (presumably the IPv6 one; confirm in carp_setrun) */
132         
133         LIST_ENTRY(carp_softc)   sc_next;       /* linkage on global carpif_list */
134 };
/* Map a softc to its own (carp) ifnet. */
135 #define SC2IFP(sc)      ((sc)->sc_ifp)
136
/*
 * Global counter of conditions that currently suppress preemption.
 * carpdetach() decrements it for a softc that had sc_suppress set or had
 * reached CARP_SENDAD_MAX_ERRORS; while it is non-zero carp_input_c()
 * compares with an advskew of 240 for any softc whose own skew is lower.
 * Exported read-only as net.inet.carp.suppress_preempt.
 */
137 int carp_suppress_preempt = 0;
/*
 * Tunables indexed by the CARPCTL_* identifiers; each SYSCTL_INT below
 * exposes one slot read-write.  The initialiser is positional, so its
 * meaning depends on the CARPCTL_* values in netinet/ip_carp.h
 * (NOTE(review): confirm the index mapping there).
 */
138 int carp_opts[CARPCTL_MAXID] = { 0, 1, 0, 1, 0, 0 };    /* XXX for now */
139 SYSCTL_INT(_net_inet_carp, CARPCTL_ALLOW, allow, CTLFLAG_RW,
140     &carp_opts[CARPCTL_ALLOW], 0, "Accept incoming CARP packets");
141 SYSCTL_INT(_net_inet_carp, CARPCTL_PREEMPT, preempt, CTLFLAG_RW,
142     &carp_opts[CARPCTL_PREEMPT], 0, "high-priority backup preemption mode");
143 SYSCTL_INT(_net_inet_carp, CARPCTL_LOG, log, CTLFLAG_RW,
144     &carp_opts[CARPCTL_LOG], 0, "log bad carp packets");
145 SYSCTL_INT(_net_inet_carp, CARPCTL_ARPBALANCE, arpbalance, CTLFLAG_RW,
146     &carp_opts[CARPCTL_ARPBALANCE], 0, "balance arp responses");
147 SYSCTL_INT(_net_inet_carp, OID_AUTO, suppress_preempt, CTLFLAG_RD,
148     &carp_suppress_preempt, 0, "Preemption is suppressed");
149
/* Packet/error counters bumped through CARPSTATS_INC() in the input paths. */
150 struct carpstats carpstats;
151 SYSCTL_STRUCT(_net_inet_carp, CARPCTL_STATS, stats, CTLFLAG_RW,
152     &carpstats, carpstats,
153     "CARP statistics (struct carpstats, netinet/ip_carp.h)");
154
/*
 * Per-parent-interface CARP state, reached through the parent ifnet's
 * if_carp pointer.  carpdetach() frees it (and destroys vhif_mtx) when the
 * last softc is removed from vhif_vrs.
 */
155 struct carp_if {
156         TAILQ_HEAD(, carp_softc) vhif_vrs;      /* attached softcs, via sc_list */
157         int vhif_nvrs;                          /* number of entries on vhif_vrs */
158
159         struct ifnet    *vhif_ifp;              /* presumably the parent ifnet; not used in this part of the file */
160         struct mtx       vhif_mtx;              /* taken by CARP_LOCK()/CARP_SCLOCK() */
161 };
162
163 /* Get carp_if from softc. Valid after carp_set_addr{,6}. */
/* Dereferences sc_carpdev unconditionally: callers must know it is non-NULL. */
164 #define SC2CIF(sc)              ((struct carp_if *)(sc)->sc_carpdev->if_carp)
165
166 /* lock per carp_if queue */
/* The CARP_LOCK* family takes a struct carp_if pointer. */
167 #define CARP_LOCK_INIT(cif)     mtx_init(&(cif)->vhif_mtx, "carp_if",   \
168         NULL, MTX_DEF)
169 #define CARP_LOCK_DESTROY(cif)  mtx_destroy(&(cif)->vhif_mtx)
170 #define CARP_LOCK_ASSERT(cif)   mtx_assert(&(cif)->vhif_mtx, MA_OWNED)
171 #define CARP_LOCK(cif)          mtx_lock(&(cif)->vhif_mtx)
172 #define CARP_UNLOCK(cif)        mtx_unlock(&(cif)->vhif_mtx)
173
/*
 * The CARP_SC* family takes a softc and operates on the same mutex, found
 * through SC2CIF(); it therefore requires sc->sc_carpdev to be non-NULL.
 */
174 #define CARP_SCLOCK(sc)         mtx_lock(&SC2CIF(sc)->vhif_mtx)
175 #define CARP_SCUNLOCK(sc)       mtx_unlock(&SC2CIF(sc)->vhif_mtx)
176 #define CARP_SCLOCK_ASSERT(sc)  mtx_assert(&SC2CIF(sc)->vhif_mtx, MA_OWNED)
177
/* Emit at LOG_INFO when the net.inet.carp.log tunable is greater than 0. */
178 #define CARP_LOG(...)   do {                            \
179         if (carp_opts[CARPCTL_LOG] > 0)                 \
180                 log(LOG_INFO, __VA_ARGS__);             \
181 } while (0)
182
/* Emit at LOG_DEBUG when the net.inet.carp.log tunable is greater than 1. */
183 #define CARP_DEBUG(...) do {                            \
184         if (carp_opts[CARPCTL_LOG] > 1)                 \
185                 log(LOG_DEBUG, __VA_ARGS__);            \
186 } while (0)
187
/* Forward declarations of the file-local functions defined below. */
188 static void     carp_hmac_prepare(struct carp_softc *);
189 static void     carp_hmac_generate(struct carp_softc *, u_int32_t *,
190                     unsigned char *);
191 static int      carp_hmac_verify(struct carp_softc *, u_int32_t *,
192                     unsigned char *);
193 static void     carp_setroute(struct carp_softc *, int);
194 static void     carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
195 static int      carp_clone_create(struct if_clone *, int, caddr_t);
196 static void     carp_clone_destroy(struct ifnet *);
197 static void     carpdetach(struct carp_softc *, int);
198 static int      carp_prepare_ad(struct mbuf *, struct carp_softc *,
199                     struct carp_header *);
200 static void     carp_send_ad_all(void);
201 static void     carp_send_ad(void *);
202 static void     carp_send_ad_locked(struct carp_softc *);
203 static void     carp_send_arp(struct carp_softc *);
204 static void     carp_master_down(void *);
205 static void     carp_master_down_locked(struct carp_softc *);
206 static int      carp_ioctl(struct ifnet *, u_long, caddr_t);
207 static int      carp_looutput(struct ifnet *, struct mbuf *, struct sockaddr *,
208                     struct route *);
209 static void     carp_start(struct ifnet *);
210 static void     carp_setrun(struct carp_softc *, sa_family_t);
211 static void     carp_set_state(struct carp_softc *, int);
212 static int      carp_addrcount(struct carp_if *, struct in_ifaddr *, int);
/* Selector passed as the last argument of carp_addrcount(). */
213 enum    { CARP_COUNT_MASTER, CARP_COUNT_RUNNING };
214
215 static void     carp_multicast_cleanup(struct carp_softc *);
216 static int      carp_set_addr(struct carp_softc *, struct sockaddr_in *);
217 static int      carp_del_addr(struct carp_softc *, struct sockaddr_in *);
218 static void     carp_carpdev_state_locked(struct carp_if *);
219 static void     carp_sc_state_locked(struct carp_softc *);
220 #ifdef INET6
221 static void     carp_send_na(struct carp_softc *);
222 static int      carp_set_addr6(struct carp_softc *, struct sockaddr_in6 *);
223 static int      carp_del_addr6(struct carp_softc *, struct sockaddr_in6 *);
224 static void     carp_multicast6_cleanup(struct carp_softc *);
225 #endif
226
/*
 * List of every carp softc, linked through sc_next.  Insertions, removals
 * and the walk in carp_send_ad_all() are done with carp_mtx held.
 */
227 static LIST_HEAD(, carp_softc) carpif_list;
228 static struct mtx carp_mtx;
229 IFC_SIMPLE_DECLARE(carp, 0);
230
/* Handle for an eventhandler registration; presumably for carp_ifdetach() (registration is not in this part of the file). */
231 static eventhandler_tag if_detach_event_tag;
232
233 static __inline u_int16_t
234 carp_cksum(struct mbuf *m, int len)
235 {
236         return (in_cksum(m, len));
237 }
238
239 static void
240 carp_hmac_prepare(struct carp_softc *sc)
241 {
242         u_int8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
243         u_int8_t vhid = sc->sc_vhid & 0xff;
244         struct ifaddr *ifa;
245         int i, found;
246 #ifdef INET
247         struct in_addr last, cur, in;
248 #endif
249 #ifdef INET6
250         struct in6_addr last6, cur6, in6;
251 #endif
252
253         if (sc->sc_carpdev)
254                 CARP_SCLOCK(sc);
255
256         /* XXX: possible race here */
257
258         /* compute ipad from key */
259         bzero(sc->sc_pad, sizeof(sc->sc_pad));
260         bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
261         for (i = 0; i < sizeof(sc->sc_pad); i++)
262                 sc->sc_pad[i] ^= 0x36;
263
264         /* precompute first part of inner hash */
265         SHA1Init(&sc->sc_sha1);
266         SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
267         SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
268         SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
269         SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
270 #ifdef INET
271         cur.s_addr = 0;
272         do {
273                 found = 0;
274                 last = cur;
275                 cur.s_addr = 0xffffffff;
276                 IF_ADDR_LOCK(SC2IFP(sc));
277                 TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
278                         in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
279                         if (ifa->ifa_addr->sa_family == AF_INET &&
280                             ntohl(in.s_addr) > ntohl(last.s_addr) &&
281                             ntohl(in.s_addr) < ntohl(cur.s_addr)) {
282                                 cur.s_addr = in.s_addr;
283                                 found++;
284                         }
285                 }
286                 IF_ADDR_UNLOCK(SC2IFP(sc));
287                 if (found)
288                         SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
289         } while (found);
290 #endif /* INET */
291 #ifdef INET6
292         memset(&cur6, 0, sizeof(cur6));
293         do {
294                 found = 0;
295                 last6 = cur6;
296                 memset(&cur6, 0xff, sizeof(cur6));
297                 IF_ADDR_LOCK(SC2IFP(sc));
298                 TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
299                         in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
300                         if (IN6_IS_SCOPE_EMBED(&in6))
301                                 in6.s6_addr16[1] = 0;
302                         if (ifa->ifa_addr->sa_family == AF_INET6 &&
303                             memcmp(&in6, &last6, sizeof(in6)) > 0 &&
304                             memcmp(&in6, &cur6, sizeof(in6)) < 0) {
305                                 cur6 = in6;
306                                 found++;
307                         }
308                 }
309                 IF_ADDR_UNLOCK(SC2IFP(sc));
310                 if (found)
311                         SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
312         } while (found);
313 #endif /* INET6 */
314
315         /* convert ipad to opad */
316         for (i = 0; i < sizeof(sc->sc_pad); i++)
317                 sc->sc_pad[i] ^= 0x36 ^ 0x5c;
318
319         if (sc->sc_carpdev)
320                 CARP_SCUNLOCK(sc);
321 }
322
323 static void
324 carp_hmac_generate(struct carp_softc *sc, u_int32_t counter[2],
325     unsigned char md[20])
326 {
327         SHA1_CTX sha1ctx;
328
329         /* fetch first half of inner hash */
330         bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
331
332         SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
333         SHA1Final(md, &sha1ctx);
334
335         /* outer hash */
336         SHA1Init(&sha1ctx);
337         SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
338         SHA1Update(&sha1ctx, md, 20);
339         SHA1Final(md, &sha1ctx);
340 }
341
342 static int
343 carp_hmac_verify(struct carp_softc *sc, u_int32_t counter[2],
344     unsigned char md[20])
345 {
346         unsigned char md2[20];
347
348         CARP_SCLOCK_ASSERT(sc);
349
350         carp_hmac_generate(sc, counter, md2);
351
352         return (bcmp(md, md2, sizeof(md2)));
353 }
354
355 static void
356 carp_setroute(struct carp_softc *sc, int cmd)
357 {
358         struct ifaddr *ifa;
359         int s;
360
361         if (sc->sc_carpdev)
362                 CARP_SCLOCK_ASSERT(sc);
363
364         s = splnet();
365         TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
366                 if (ifa->ifa_addr->sa_family == AF_INET &&
367                     sc->sc_carpdev != NULL) {
368                         int count = carp_addrcount(
369                             (struct carp_if *)sc->sc_carpdev->if_carp,
370                             ifatoia(ifa), CARP_COUNT_MASTER);
371
372                         if ((cmd == RTM_ADD && count == 1) ||
373                             (cmd == RTM_DELETE && count == 0))
374                                 rtinit(ifa, cmd, RTF_UP | RTF_HOST);
375                 }
376         }
377         splx(s);
378 }
379
380 static int
381 carp_clone_create(struct if_clone *ifc, int unit, caddr_t params)
382 {
383
384         struct carp_softc *sc;
385         struct ifnet *ifp;
386
387         sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
388         ifp = SC2IFP(sc) = if_alloc(IFT_ETHER);
389         if (ifp == NULL) {
390                 free(sc, M_CARP);
391                 return (ENOSPC);
392         }
393         
394         sc->sc_flags_backup = 0;
395         sc->sc_suppress = 0;
396         sc->sc_advbase = CARP_DFLTINTV;
397         sc->sc_vhid = -1;       /* required setting */
398         sc->sc_advskew = 0;
399         sc->sc_init_counter = 1;
400         sc->sc_naddrs = sc->sc_naddrs6 = 0; /* M_ZERO? */
401         sc->sc_imo.imo_membership = (struct in_multi **)malloc(
402             (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
403             M_WAITOK);
404         sc->sc_imo.imo_mfilters = NULL;
405         sc->sc_imo.imo_max_memberships = IP_MIN_MEMBERSHIPS;
406         sc->sc_imo.imo_multicast_vif = -1;
407 #ifdef INET6
408         sc->sc_im6o.im6o_membership = (struct in6_multi **)malloc(
409             (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
410             M_WAITOK);
411         sc->sc_im6o.im6o_mfilters = NULL;
412         sc->sc_im6o.im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
413         sc->sc_im6o.im6o_multicast_hlim = CARP_DFLTTL;
414 #endif
415
416         callout_init(&sc->sc_ad_tmo, CALLOUT_MPSAFE);
417         callout_init(&sc->sc_md_tmo, CALLOUT_MPSAFE);
418         callout_init(&sc->sc_md6_tmo, CALLOUT_MPSAFE);
419         
420         ifp->if_softc = sc;
421         if_initname(ifp, CARP_IFNAME, unit);
422         ifp->if_mtu = ETHERMTU;
423         ifp->if_flags = IFF_LOOPBACK;
424         ifp->if_ioctl = carp_ioctl;
425         ifp->if_output = carp_looutput;
426         ifp->if_start = carp_start;
427         ifp->if_type = IFT_CARP;
428         ifp->if_snd.ifq_maxlen = ifqmaxlen;
429         ifp->if_hdrlen = 0;
430         if_attach(ifp);
431         bpfattach(SC2IFP(sc), DLT_NULL, sizeof(u_int32_t));
432         mtx_lock(&carp_mtx);
433         LIST_INSERT_HEAD(&carpif_list, sc, sc_next);
434         mtx_unlock(&carp_mtx);
435         return (0);
436 }
437
438 static void
439 carp_clone_destroy(struct ifnet *ifp)
440 {
441         struct carp_softc *sc = ifp->if_softc;
442
443         if (sc->sc_carpdev)
444                 CARP_SCLOCK(sc);
445         carpdetach(sc, 1);      /* Returns unlocked. */
446
447         mtx_lock(&carp_mtx);
448         LIST_REMOVE(sc, sc_next);
449         mtx_unlock(&carp_mtx);
450         bpfdetach(ifp);
451         if_detach(ifp);
452         if_free_type(ifp, IFT_ETHER);
453         free(sc->sc_imo.imo_membership, M_CARP);
454 #ifdef INET6
455         free(sc->sc_im6o.im6o_membership, M_CARP);
456 #endif
457         free(sc, M_CARP);
458 }
459
460 /*
461  * This function can be called on CARP interface destroy path,
462  * and in case of the removal of the underlying interface as
463  * well. We differentiate these two cases. In the latter case
464  * we do not cleanup our multicast memberships, since they
465  * are already freed. Also, in the latter case we do not
466  * release the lock on return, because the function will be
467  * called once more, for another CARP instance on the same
468  * interface.
469  */
470 static void
471 carpdetach(struct carp_softc *sc, int unlock)
472 {
473         struct carp_if *cif;
474
475         callout_stop(&sc->sc_ad_tmo);
476         callout_stop(&sc->sc_md_tmo);
477         callout_stop(&sc->sc_md6_tmo);
478
479         if (sc->sc_suppress)
480                 carp_suppress_preempt--;
481         sc->sc_suppress = 0;
482
483         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS)
484                 carp_suppress_preempt--;
485         sc->sc_sendad_errors = 0;
486
487         carp_set_state(sc, INIT);
488         SC2IFP(sc)->if_flags &= ~IFF_UP;
489         carp_setrun(sc, 0);
490         if (unlock)
491                 carp_multicast_cleanup(sc);
492 #ifdef INET6
493         carp_multicast6_cleanup(sc);
494 #endif
495
496         if (sc->sc_carpdev != NULL) {
497                 cif = (struct carp_if *)sc->sc_carpdev->if_carp;
498                 CARP_LOCK_ASSERT(cif);
499                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
500                 if (!--cif->vhif_nvrs) {
501                         ifpromisc(sc->sc_carpdev, 0);
502                         sc->sc_carpdev->if_carp = NULL;
503                         CARP_LOCK_DESTROY(cif);
504                         free(cif, M_CARP);
505                 } else if (unlock)
506                         CARP_UNLOCK(cif);
507                 sc->sc_carpdev = NULL;
508         }
509 }
510
511 /* Detach an interface from the carp. */
512 static void
513 carp_ifdetach(void *arg __unused, struct ifnet *ifp)
514 {
515         struct carp_if *cif = (struct carp_if *)ifp->if_carp;
516         struct carp_softc *sc, *nextsc;
517
518         if (cif == NULL)
519                 return;
520
521         /*
522          * XXX: At the end of for() cycle the lock will be destroyed.
523          */
524         CARP_LOCK(cif);
525         for (sc = TAILQ_FIRST(&cif->vhif_vrs); sc; sc = nextsc) {
526                 nextsc = TAILQ_NEXT(sc, sc_list);
527                 carpdetach(sc, 0);
528         }
529 }
530
531 /*
532  * process input packet.
533  * we have rearranged checks order compared to the rfc,
534  * but it seems more efficient this way or not possible otherwise.
535  */
536 void
537 carp_input(struct mbuf *m, int hlen)
538 {
539         struct ip *ip = mtod(m, struct ip *);
540         struct carp_header *ch;
541         int iplen, len;
542
543         CARPSTATS_INC(carps_ipackets);
544
545         if (!carp_opts[CARPCTL_ALLOW]) {
546                 m_freem(m);
547                 return;
548         }
549
550         /* check if received on a valid carp interface */
551         if (m->m_pkthdr.rcvif->if_carp == NULL) {
552                 CARPSTATS_INC(carps_badif);
553                 CARP_DEBUG("carp_input: packet received on non-carp "
554                     "interface: %s\n",
555                     m->m_pkthdr.rcvif->if_xname);
556                 m_freem(m);
557                 return;
558         }
559
560         /* verify that the IP TTL is 255.  */
561         if (ip->ip_ttl != CARP_DFLTTL) {
562                 CARPSTATS_INC(carps_badttl);
563                 CARP_DEBUG("carp_input: received ttl %d != 255 on %s\n",
564                     ip->ip_ttl,
565                     m->m_pkthdr.rcvif->if_xname);
566                 m_freem(m);
567                 return;
568         }
569
570         iplen = ip->ip_hl << 2;
571
572         if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
573                 CARPSTATS_INC(carps_badlen);
574                 CARP_DEBUG("carp_input: received len %zd < "
575                     "sizeof(struct carp_header) on %s\n",
576                     m->m_len - sizeof(struct ip),
577                     m->m_pkthdr.rcvif->if_xname);
578                 m_freem(m);
579                 return;
580         }
581
582         if (iplen + sizeof(*ch) < m->m_len) {
583                 if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
584                         CARPSTATS_INC(carps_hdrops);
585                         CARP_DEBUG("carp_input: pullup failed\n");
586                         return;
587                 }
588                 ip = mtod(m, struct ip *);
589         }
590         ch = (struct carp_header *)((char *)ip + iplen);
591
592         /*
593          * verify that the received packet length is
594          * equal to the CARP header
595          */
596         len = iplen + sizeof(*ch);
597         if (len > m->m_pkthdr.len) {
598                 CARPSTATS_INC(carps_badlen);
599                 CARP_DEBUG("carp_input: packet too short %d on %s\n",
600                     m->m_pkthdr.len,
601                     m->m_pkthdr.rcvif->if_xname);
602                 m_freem(m);
603                 return;
604         }
605
606         if ((m = m_pullup(m, len)) == NULL) {
607                 CARPSTATS_INC(carps_hdrops);
608                 return;
609         }
610         ip = mtod(m, struct ip *);
611         ch = (struct carp_header *)((char *)ip + iplen);
612
613         /* verify the CARP checksum */
614         m->m_data += iplen;
615         if (carp_cksum(m, len - iplen)) {
616                 CARPSTATS_INC(carps_badsum);
617                 CARP_DEBUG("carp_input: checksum failed on %s\n",
618                     m->m_pkthdr.rcvif->if_xname);
619                 m_freem(m);
620                 return;
621         }
622         m->m_data -= iplen;
623
624         carp_input_c(m, ch, AF_INET);
625 }
626
627 #ifdef INET6
628 int
629 carp6_input(struct mbuf **mp, int *offp, int proto)
630 {
631         struct mbuf *m = *mp;
632         struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
633         struct carp_header *ch;
634         u_int len;
635
636         CARPSTATS_INC(carps_ipackets6);
637
638         if (!carp_opts[CARPCTL_ALLOW]) {
639                 m_freem(m);
640                 return (IPPROTO_DONE);
641         }
642
643         /* check if received on a valid carp interface */
644         if (m->m_pkthdr.rcvif->if_carp == NULL) {
645                 CARPSTATS_INC(carps_badif);
646                 CARP_DEBUG("carp6_input: packet received on non-carp "
647                     "interface: %s\n",
648                     m->m_pkthdr.rcvif->if_xname);
649                 m_freem(m);
650                 return (IPPROTO_DONE);
651         }
652
653         /* verify that the IP TTL is 255 */
654         if (ip6->ip6_hlim != CARP_DFLTTL) {
655                 CARPSTATS_INC(carps_badttl);
656                 CARP_DEBUG("carp6_input: received ttl %d != 255 on %s\n",
657                     ip6->ip6_hlim,
658                     m->m_pkthdr.rcvif->if_xname);
659                 m_freem(m);
660                 return (IPPROTO_DONE);
661         }
662
663         /* verify that we have a complete carp packet */
664         len = m->m_len;
665         IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
666         if (ch == NULL) {
667                 CARPSTATS_INC(carps_badlen);
668                 CARP_DEBUG("carp6_input: packet size %u too small\n", len);
669                 return (IPPROTO_DONE);
670         }
671
672
673         /* verify the CARP checksum */
674         m->m_data += *offp;
675         if (carp_cksum(m, sizeof(*ch))) {
676                 CARPSTATS_INC(carps_badsum);
677                 CARP_DEBUG("carp6_input: checksum failed, on %s\n",
678                     m->m_pkthdr.rcvif->if_xname);
679                 m_freem(m);
680                 return (IPPROTO_DONE);
681         }
682         m->m_data -= *offp;
683
684         carp_input_c(m, ch, AF_INET6);
685         return (IPPROTO_DONE);
686 }
687 #endif /* INET6 */
688
689 static void
690 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
691 {
692         struct ifnet *ifp = m->m_pkthdr.rcvif;
693         struct carp_softc *sc;
694         u_int64_t tmp_counter;
695         struct timeval sc_tv, ch_tv;
696
697         /* verify that the VHID is valid on the receiving interface */
698         CARP_LOCK(ifp->if_carp);
699         TAILQ_FOREACH(sc, &((struct carp_if *)ifp->if_carp)->vhif_vrs, sc_list)
700                 if (sc->sc_vhid == ch->carp_vhid)
701                         break;
702
703         if (!sc || !((SC2IFP(sc)->if_flags & IFF_UP) &&
704             (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
705                 CARPSTATS_INC(carps_badvhid);
706                 CARP_UNLOCK(ifp->if_carp);
707                 m_freem(m);
708                 return;
709         }
710
711         getmicrotime(&SC2IFP(sc)->if_lastchange);
712         SC2IFP(sc)->if_ipackets++;
713         SC2IFP(sc)->if_ibytes += m->m_pkthdr.len;
714
715         if (bpf_peers_present(SC2IFP(sc)->if_bpf)) {
716                 struct ip *ip = mtod(m, struct ip *);
717                 uint32_t af1 = af;
718
719                 /* BPF wants net byte order */
720                 ip->ip_len = htons(ip->ip_len + (ip->ip_hl << 2));
721                 ip->ip_off = htons(ip->ip_off);
722                 bpf_mtap2(SC2IFP(sc)->if_bpf, &af1, sizeof(af1), m);
723         }
724
725         /* verify the CARP version. */
726         if (ch->carp_version != CARP_VERSION) {
727                 CARPSTATS_INC(carps_badver);
728                 SC2IFP(sc)->if_ierrors++;
729                 CARP_UNLOCK(ifp->if_carp);
730                 CARP_DEBUG("%s; invalid version %d\n",
731                     SC2IFP(sc)->if_xname,
732                     ch->carp_version);
733                 m_freem(m);
734                 return;
735         }
736
737         /* verify the hash */
738         if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
739                 CARPSTATS_INC(carps_badauth);
740                 SC2IFP(sc)->if_ierrors++;
741                 CARP_UNLOCK(ifp->if_carp);
742                 CARP_DEBUG("%s: incorrect hash\n", SC2IFP(sc)->if_xname);
743                 m_freem(m);
744                 return;
745         }
746
747         tmp_counter = ntohl(ch->carp_counter[0]);
748         tmp_counter = tmp_counter<<32;
749         tmp_counter += ntohl(ch->carp_counter[1]);
750
751         /* XXX Replay protection goes here */
752
753         sc->sc_init_counter = 0;
754         sc->sc_counter = tmp_counter;
755
756         sc_tv.tv_sec = sc->sc_advbase;
757         if (carp_suppress_preempt && sc->sc_advskew <  240)
758                 sc_tv.tv_usec = 240 * 1000000 / 256;
759         else
760                 sc_tv.tv_usec = sc->sc_advskew * 1000000 / 256;
761         ch_tv.tv_sec = ch->carp_advbase;
762         ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
763
764         switch (sc->sc_state) {
765         case INIT:
766                 break;
767         case MASTER:
768                 /*
769                  * If we receive an advertisement from a master who's going to
770                  * be more frequent than us, go into BACKUP state.
771                  */
772                 if (timevalcmp(&sc_tv, &ch_tv, >) ||
773                     timevalcmp(&sc_tv, &ch_tv, ==)) {
774                         callout_stop(&sc->sc_ad_tmo);
775                         CARP_LOG("%s: MASTER -> BACKUP "
776                            "(more frequent advertisement received)\n",
777                            SC2IFP(sc)->if_xname);
778                         carp_set_state(sc, BACKUP);
779                         carp_setrun(sc, 0);
780                         carp_setroute(sc, RTM_DELETE);
781                 }
782                 break;
783         case BACKUP:
784                 /*
785                  * If we're pre-empting masters who advertise slower than us,
786                  * and this one claims to be slower, treat him as down.
787                  */
788                 if (carp_opts[CARPCTL_PREEMPT] &&
789                     timevalcmp(&sc_tv, &ch_tv, <)) {
790                         CARP_LOG("%s: BACKUP -> MASTER "
791                             "(preempting a slower master)\n",
792                             SC2IFP(sc)->if_xname);
793                         carp_master_down_locked(sc);
794                         break;
795                 }
796
797                 /*
798                  *  If the master is going to advertise at such a low frequency
799                  *  that he's guaranteed to time out, we'd might as well just
800                  *  treat him as timed out now.
801                  */
802                 sc_tv.tv_sec = sc->sc_advbase * 3;
803                 if (timevalcmp(&sc_tv, &ch_tv, <)) {
804                         CARP_LOG("%s: BACKUP -> MASTER "
805                             "(master timed out)\n",
806                             SC2IFP(sc)->if_xname);
807                         carp_master_down_locked(sc);
808                         break;
809                 }
810
811                 /*
812                  * Otherwise, we reset the counter and wait for the next
813                  * advertisement.
814                  */
815                 carp_setrun(sc, af);
816                 break;
817         }
818
819         CARP_UNLOCK(ifp->if_carp);
820
821         m_freem(m);
822         return;
823 }
824
825 static int
826 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
827 {
828         struct m_tag *mtag;
829         struct ifnet *ifp = SC2IFP(sc);
830
831         if (sc->sc_init_counter) {
832                 /* this could also be seconds since unix epoch */
833                 sc->sc_counter = arc4random();
834                 sc->sc_counter = sc->sc_counter << 32;
835                 sc->sc_counter += arc4random();
836         } else
837                 sc->sc_counter++;
838
839         ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
840         ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
841
842         carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
843
844         /* Tag packet for carp_output */
845         mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct ifnet *), M_NOWAIT);
846         if (mtag == NULL) {
847                 m_freem(m);
848                 SC2IFP(sc)->if_oerrors++;
849                 return (ENOMEM);
850         }
851         bcopy(&ifp, (caddr_t)(mtag + 1), sizeof(struct ifnet *));
852         m_tag_prepend(m, mtag);
853
854         return (0);
855 }
856
857 static void
858 carp_send_ad_all(void)
859 {
860         struct carp_softc *sc;
861
862         mtx_lock(&carp_mtx);
863         LIST_FOREACH(sc, &carpif_list, sc_next) {
864                 if (sc->sc_carpdev == NULL)
865                         continue;
866                 CARP_SCLOCK(sc);
867                 if ((SC2IFP(sc)->if_flags & IFF_UP) &&
868                     (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING) &&
869                      sc->sc_state == MASTER)
870                         carp_send_ad_locked(sc);
871                 CARP_SCUNLOCK(sc);
872         }
873         mtx_unlock(&carp_mtx);
874 }
875
/*
 * Callout-compatible wrapper: takes the softc lock, sends the
 * advertisement(s) and drops the lock again.  v is the carp softc.
 */
static void
carp_send_ad(void *v)
{
        struct carp_softc *softc;

        softc = v;
        CARP_SCLOCK(softc);
        carp_send_ad_locked(softc);
        CARP_SCUNLOCK(softc);
}
885
886 static void
887 carp_send_ad_locked(struct carp_softc *sc)
888 {
889         struct carp_header ch;
890         struct timeval tv;
891         struct carp_header *ch_ptr;
892         struct mbuf *m;
893         int len, advbase, advskew;
894
895         CARP_SCLOCK_ASSERT(sc);
896
897         /* bow out if we've lost our UPness or RUNNINGuiness */
898         if (!((SC2IFP(sc)->if_flags & IFF_UP) &&
899             (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING))) {
900                 advbase = 255;
901                 advskew = 255;
902         } else {
903                 advbase = sc->sc_advbase;
904                 if (!carp_suppress_preempt || sc->sc_advskew > 240)
905                         advskew = sc->sc_advskew;
906                 else
907                         advskew = 240;
908                 tv.tv_sec = advbase;
909                 tv.tv_usec = advskew * 1000000 / 256;
910         }
911
912         ch.carp_version = CARP_VERSION;
913         ch.carp_type = CARP_ADVERTISEMENT;
914         ch.carp_vhid = sc->sc_vhid;
915         ch.carp_advbase = advbase;
916         ch.carp_advskew = advskew;
917         ch.carp_authlen = 7;    /* XXX DEFINE */
918         ch.carp_pad1 = 0;       /* must be zero */
919         ch.carp_cksum = 0;
920
921 #ifdef INET
922         if (sc->sc_ia) {
923                 struct ip *ip;
924
925                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
926                 if (m == NULL) {
927                         SC2IFP(sc)->if_oerrors++;
928                         CARPSTATS_INC(carps_onomem);
929                         /* XXX maybe less ? */
930                         if (advbase != 255 || advskew != 255)
931                                 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
932                                     carp_send_ad, sc);
933                         return;
934                 }
935                 len = sizeof(*ip) + sizeof(ch);
936                 m->m_pkthdr.len = len;
937                 m->m_pkthdr.rcvif = NULL;
938                 m->m_len = len;
939                 MH_ALIGN(m, m->m_len);
940                 m->m_flags |= M_MCAST;
941                 ip = mtod(m, struct ip *);
942                 ip->ip_v = IPVERSION;
943                 ip->ip_hl = sizeof(*ip) >> 2;
944                 ip->ip_tos = IPTOS_LOWDELAY;
945                 ip->ip_len = len;
946                 ip->ip_id = ip_newid();
947                 ip->ip_off = IP_DF;
948                 ip->ip_ttl = CARP_DFLTTL;
949                 ip->ip_p = IPPROTO_CARP;
950                 ip->ip_sum = 0;
951                 ip->ip_src.s_addr = sc->sc_ia->ia_addr.sin_addr.s_addr;
952                 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
953
954                 ch_ptr = (struct carp_header *)(&ip[1]);
955                 bcopy(&ch, ch_ptr, sizeof(ch));
956                 if (carp_prepare_ad(m, sc, ch_ptr))
957                         return;
958
959                 m->m_data += sizeof(*ip);
960                 ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip));
961                 m->m_data -= sizeof(*ip);
962
963                 getmicrotime(&SC2IFP(sc)->if_lastchange);
964                 SC2IFP(sc)->if_opackets++;
965                 SC2IFP(sc)->if_obytes += len;
966                 CARPSTATS_INC(carps_opackets);
967
968                 if (ip_output(m, NULL, NULL, IP_RAWOUTPUT, &sc->sc_imo, NULL)) {
969                         SC2IFP(sc)->if_oerrors++;
970                         if (sc->sc_sendad_errors < INT_MAX)
971                                 sc->sc_sendad_errors++;
972                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
973                                 carp_suppress_preempt++;
974                                 if (carp_suppress_preempt == 1) {
975                                         CARP_SCUNLOCK(sc);
976                                         carp_send_ad_all();
977                                         CARP_SCLOCK(sc);
978                                 }
979                         }
980                         sc->sc_sendad_success = 0;
981                 } else {
982                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
983                                 if (++sc->sc_sendad_success >=
984                                     CARP_SENDAD_MIN_SUCCESS) {
985                                         carp_suppress_preempt--;
986                                         sc->sc_sendad_errors = 0;
987                                 }
988                         } else
989                                 sc->sc_sendad_errors = 0;
990                 }
991         }
992 #endif /* INET */
993 #ifdef INET6
994         if (sc->sc_ia6) {
995                 struct ip6_hdr *ip6;
996
997                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
998                 if (m == NULL) {
999                         SC2IFP(sc)->if_oerrors++;
1000                         CARPSTATS_INC(carps_onomem);
1001                         /* XXX maybe less ? */
1002                         if (advbase != 255 || advskew != 255)
1003                                 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1004                                     carp_send_ad, sc);
1005                         return;
1006                 }
1007                 len = sizeof(*ip6) + sizeof(ch);
1008                 m->m_pkthdr.len = len;
1009                 m->m_pkthdr.rcvif = NULL;
1010                 m->m_len = len;
1011                 MH_ALIGN(m, m->m_len);
1012                 m->m_flags |= M_MCAST;
1013                 ip6 = mtod(m, struct ip6_hdr *);
1014                 bzero(ip6, sizeof(*ip6));
1015                 ip6->ip6_vfc |= IPV6_VERSION;
1016                 ip6->ip6_hlim = CARP_DFLTTL;
1017                 ip6->ip6_nxt = IPPROTO_CARP;
1018                 bcopy(&sc->sc_ia6->ia_addr.sin6_addr, &ip6->ip6_src,
1019                     sizeof(struct in6_addr));
1020                 /* set the multicast destination */
1021
1022                 ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
1023                 ip6->ip6_dst.s6_addr8[15] = 0x12;
1024                 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
1025                         SC2IFP(sc)->if_oerrors++;
1026                         m_freem(m);
1027                         CARP_DEBUG("%s: in6_setscope failed\n", __func__);
1028                         return;
1029                 }
1030
1031                 ch_ptr = (struct carp_header *)(&ip6[1]);
1032                 bcopy(&ch, ch_ptr, sizeof(ch));
1033                 if (carp_prepare_ad(m, sc, ch_ptr))
1034                         return;
1035
1036                 m->m_data += sizeof(*ip6);
1037                 ch_ptr->carp_cksum = carp_cksum(m, len - sizeof(*ip6));
1038                 m->m_data -= sizeof(*ip6);
1039
1040                 getmicrotime(&SC2IFP(sc)->if_lastchange);
1041                 SC2IFP(sc)->if_opackets++;
1042                 SC2IFP(sc)->if_obytes += len;
1043                 CARPSTATS_INC(carps_opackets6);
1044
1045                 if (ip6_output(m, NULL, NULL, 0, &sc->sc_im6o, NULL, NULL)) {
1046                         SC2IFP(sc)->if_oerrors++;
1047                         if (sc->sc_sendad_errors < INT_MAX)
1048                                 sc->sc_sendad_errors++;
1049                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
1050                                 carp_suppress_preempt++;
1051                                 if (carp_suppress_preempt == 1) {
1052                                         CARP_SCUNLOCK(sc);
1053                                         carp_send_ad_all();
1054                                         CARP_SCLOCK(sc);
1055                                 }
1056                         }
1057                         sc->sc_sendad_success = 0;
1058                 } else {
1059                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
1060                                 if (++sc->sc_sendad_success >=
1061                                     CARP_SENDAD_MIN_SUCCESS) {
1062                                         carp_suppress_preempt--;
1063                                         sc->sc_sendad_errors = 0;
1064                                 }
1065                         } else
1066                                 sc->sc_sendad_errors = 0;
1067                 }
1068         }
1069 #endif /* INET6 */
1070
1071         if (advbase != 255 || advskew != 255)
1072                 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1073                     carp_send_ad, sc);
1074
1075 }
1076
1077 /*
1078  * Broadcast a gratuitous ARP request containing
1079  * the virtual router MAC address for each IP address
1080  * associated with the virtual router.
1081  */
1082 static void
1083 carp_send_arp(struct carp_softc *sc)
1084 {
1085         struct ifaddr *ifa;
1086
1087         TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
1088
1089                 if (ifa->ifa_addr->sa_family != AF_INET)
1090                         continue;
1091
1092 /*              arprequest(sc->sc_carpdev, &in, &in, IF_LLADDR(sc->sc_ifp)); */
1093                 arp_ifinit2(sc->sc_carpdev, ifa, IF_LLADDR(sc->sc_ifp));
1094
1095                 DELAY(1000);    /* XXX */
1096         }
1097 }
1098
#ifdef INET6
/*
 * IPv6 counterpart of carp_send_arp(): for every AF_INET6 address on
 * the carp interface, send a neighbor advertisement with the override
 * flag set to the link-local all-nodes group via the parent device.
 * A short DELAY() separates consecutive advertisements.
 */
static void
carp_send_na(struct carp_softc *sc)
{
        static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
        struct ifaddr *cur;

        TAILQ_FOREACH(cur, &SC2IFP(sc)->if_addrlist, ifa_list) {
                /* Only AF_INET6 addresses are handled here. */
                if (cur->ifa_addr->sa_family == AF_INET6) {
                        nd6_na_output(sc->sc_carpdev, &mcast,
                            &ifatoia6(cur)->ia_addr.sin6_addr,
                            ND_NA_FLAG_OVERRIDE, 1, NULL);
                        DELAY(1000);    /* XXX */
                }
        }
}
#endif /* INET6 */
1119
1120 static int
1121 carp_addrcount(struct carp_if *cif, struct in_ifaddr *ia, int type)
1122 {
1123         struct carp_softc *vh;
1124         struct ifaddr *ifa;
1125         int count = 0;
1126
1127         CARP_LOCK_ASSERT(cif);
1128
1129         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1130                 if ((type == CARP_COUNT_RUNNING &&
1131                     (SC2IFP(vh)->if_flags & IFF_UP) &&
1132                     (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) ||
1133                     (type == CARP_COUNT_MASTER && vh->sc_state == MASTER)) {
1134                         IF_ADDR_LOCK(SC2IFP(vh));
1135                         TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
1136                             ifa_list) {
1137                                 if (ifa->ifa_addr->sa_family == AF_INET &&
1138                                     ia->ia_addr.sin_addr.s_addr ==
1139                                     ifatoia(ifa)->ia_addr.sin_addr.s_addr)
1140                                         count++;
1141                         }
1142                         IF_ADDR_UNLOCK(SC2IFP(vh));
1143                 }
1144         }
1145         return (count);
1146 }
1147
1148 int
1149 carp_iamatch(void *v, struct in_ifaddr *ia,
1150     struct in_addr *isaddr, u_int8_t **enaddr)
1151 {
1152         struct carp_if *cif = v;
1153         struct carp_softc *vh;
1154         int index, count = 0;
1155         struct ifaddr *ifa;
1156
1157         CARP_LOCK(cif);
1158
1159         if (carp_opts[CARPCTL_ARPBALANCE]) {
1160                 /*
1161                  * XXX proof of concept implementation.
1162                  * We use the source ip to decide which virtual host should
1163                  * handle the request. If we're master of that virtual host,
1164                  * then we respond, otherwise, just drop the arp packet on
1165                  * the floor.
1166                  */
1167                 count = carp_addrcount(cif, ia, CARP_COUNT_RUNNING);
1168                 if (count == 0) {
1169                         /* should never reach this */
1170                         CARP_UNLOCK(cif);
1171                         return (0);
1172                 }
1173
1174                 /* this should be a hash, like pf_hash() */
1175                 index = ntohl(isaddr->s_addr) % count;
1176                 count = 0;
1177
1178                 TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1179                         if ((SC2IFP(vh)->if_flags & IFF_UP) &&
1180                             (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING)) {
1181                                 IF_ADDR_LOCK(SC2IFP(vh));
1182                                 TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist,
1183                                     ifa_list) {
1184                                         if (ifa->ifa_addr->sa_family ==
1185                                             AF_INET &&
1186                                             ia->ia_addr.sin_addr.s_addr ==
1187                                             ifatoia(ifa)->ia_addr.sin_addr.s_addr) {
1188                                                 if (count == index) {
1189                                                         if (vh->sc_state ==
1190                                                             MASTER) {
1191                                                                 *enaddr = IF_LLADDR(vh->sc_ifp);
1192                                                                 IF_ADDR_UNLOCK(SC2IFP(vh));
1193                                                                 CARP_UNLOCK(cif);
1194                                                                 return (1);
1195                                                         } else {
1196                                                                 IF_ADDR_UNLOCK(SC2IFP(vh));
1197                                                                 CARP_UNLOCK(cif);
1198                                                                 return (0);
1199                                                         }
1200                                                 }
1201                                                 count++;
1202                                         }
1203                                 }
1204                                 IF_ADDR_UNLOCK(SC2IFP(vh));
1205                         }
1206                 }
1207         } else {
1208                 TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1209                         if ((SC2IFP(vh)->if_flags & IFF_UP) &&
1210                             (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
1211                             ia->ia_ifp == SC2IFP(vh) &&
1212                             vh->sc_state == MASTER) {
1213                                 *enaddr = IF_LLADDR(vh->sc_ifp);
1214                                 CARP_UNLOCK(cif);
1215                                 return (1);
1216                         }
1217                 }
1218         }
1219         CARP_UNLOCK(cif);
1220         return (0);
1221 }
1222
1223 #ifdef INET6
1224 struct ifaddr *
1225 carp_iamatch6(void *v, struct in6_addr *taddr)
1226 {
1227         struct carp_if *cif = v;
1228         struct carp_softc *vh;
1229         struct ifaddr *ifa;
1230
1231         CARP_LOCK(cif);
1232         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list) {
1233                 IF_ADDR_LOCK(SC2IFP(vh));
1234                 TAILQ_FOREACH(ifa, &SC2IFP(vh)->if_addrlist, ifa_list) {
1235                         if (IN6_ARE_ADDR_EQUAL(taddr,
1236                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1237                             (SC2IFP(vh)->if_flags & IFF_UP) &&
1238                             (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
1239                             vh->sc_state == MASTER) {
1240                                 ifa_ref(ifa);
1241                                 IF_ADDR_UNLOCK(SC2IFP(vh));
1242                                 CARP_UNLOCK(cif);
1243                                 return (ifa);
1244                         }
1245                 }
1246                 IF_ADDR_UNLOCK(SC2IFP(vh));
1247         }
1248         CARP_UNLOCK(cif);
1249         
1250         return (NULL);
1251 }
1252
1253 void *
1254 carp_macmatch6(void *v, struct mbuf *m, const struct in6_addr *taddr)
1255 {
1256         struct m_tag *mtag;
1257         struct carp_if *cif = v;
1258         struct carp_softc *sc;
1259         struct ifaddr *ifa;
1260
1261         CARP_LOCK(cif);
1262         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list) {
1263                 IF_ADDR_LOCK(SC2IFP(sc));
1264                 TAILQ_FOREACH(ifa, &SC2IFP(sc)->if_addrlist, ifa_list) {
1265                         if (IN6_ARE_ADDR_EQUAL(taddr,
1266                             &ifatoia6(ifa)->ia_addr.sin6_addr) &&
1267                             (SC2IFP(sc)->if_flags & IFF_UP) &&
1268                             (SC2IFP(sc)->if_drv_flags & IFF_DRV_RUNNING)) {
1269                                 struct ifnet *ifp = SC2IFP(sc);
1270                                 mtag = m_tag_get(PACKET_TAG_CARP,
1271                                     sizeof(struct ifnet *), M_NOWAIT);
1272                                 if (mtag == NULL) {
1273                                         /* better a bit than nothing */
1274                                         IF_ADDR_UNLOCK(SC2IFP(sc));
1275                                         CARP_UNLOCK(cif);
1276                                         return (IF_LLADDR(sc->sc_ifp));
1277                                 }
1278                                 bcopy(&ifp, (caddr_t)(mtag + 1),
1279                                     sizeof(struct ifnet *));
1280                                 m_tag_prepend(m, mtag);
1281
1282                                 IF_ADDR_UNLOCK(SC2IFP(sc));
1283                                 CARP_UNLOCK(cif);
1284                                 return (IF_LLADDR(sc->sc_ifp));
1285                         }
1286                 }
1287                 IF_ADDR_UNLOCK(SC2IFP(sc));
1288         }
1289         CARP_UNLOCK(cif);
1290
1291         return (NULL);
1292 }
1293 #endif
1294
1295 struct ifnet *
1296 carp_forus(void *v, void *dhost)
1297 {
1298         struct carp_if *cif = v;
1299         struct carp_softc *vh;
1300         u_int8_t *ena = dhost;
1301
1302         if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
1303                 return (NULL);
1304
1305         CARP_LOCK(cif);
1306         TAILQ_FOREACH(vh, &cif->vhif_vrs, sc_list)
1307                 if ((SC2IFP(vh)->if_flags & IFF_UP) &&
1308                     (SC2IFP(vh)->if_drv_flags & IFF_DRV_RUNNING) &&
1309                     vh->sc_state == MASTER &&
1310                     !bcmp(dhost, IF_LLADDR(vh->sc_ifp), ETHER_ADDR_LEN)) {
1311                         CARP_UNLOCK(cif);
1312                         return (SC2IFP(vh));
1313                 }
1314
1315         CARP_UNLOCK(cif);
1316         return (NULL);
1317 }
1318
/*
 * Callout-compatible wrapper: runs the master-down handling for the
 * softc passed as v with the softc lock held.
 */
static void
carp_master_down(void *v)
{
        struct carp_softc *softc;

        softc = v;
        CARP_SCLOCK(softc);
        carp_master_down_locked(softc);
        CARP_SCUNLOCK(softc);
}
1328
1329 static void
1330 carp_master_down_locked(struct carp_softc *sc)
1331 {
1332         if (sc->sc_carpdev)
1333                 CARP_SCLOCK_ASSERT(sc);
1334
1335         switch (sc->sc_state) {
1336         case INIT:
1337                 printf("%s: master_down event in INIT state\n",
1338                     SC2IFP(sc)->if_xname);
1339                 break;
1340         case MASTER:
1341                 break;
1342         case BACKUP:
1343                 carp_set_state(sc, MASTER);
1344                 carp_send_ad_locked(sc);
1345                 carp_send_arp(sc);
1346 #ifdef INET6
1347                 carp_send_na(sc);
1348 #endif /* INET6 */
1349                 carp_setrun(sc, 0);
1350                 carp_setroute(sc, RTM_ADD);
1351                 break;
1352         }
1353 }
1354
1355 /*
1356  * When in backup state, af indicates whether to reset the master down timer
1357  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1358  */
1359 static void
1360 carp_setrun(struct carp_softc *sc, sa_family_t af)
1361 {
1362         struct timeval tv;
1363
1364         if (sc->sc_carpdev == NULL) {
1365                 SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
1366                 carp_set_state(sc, INIT);
1367                 return;
1368         } else
1369                 CARP_SCLOCK_ASSERT(sc);
1370
1371         if (SC2IFP(sc)->if_flags & IFF_UP &&
1372             sc->sc_vhid > 0 && (sc->sc_naddrs || sc->sc_naddrs6))
1373                 SC2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
1374         else {
1375                 SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
1376                 carp_setroute(sc, RTM_DELETE);
1377                 return;
1378         }
1379
1380         switch (sc->sc_state) {
1381         case INIT:
1382                 if (carp_opts[CARPCTL_PREEMPT] && !carp_suppress_preempt) {
1383                         carp_send_ad_locked(sc);
1384                         carp_send_arp(sc);
1385 #ifdef INET6
1386                         carp_send_na(sc);
1387 #endif /* INET6 */
1388                         CARP_LOG("%s: INIT -> MASTER (preempting)\n",
1389                             SC2IFP(sc)->if_xname);
1390                         carp_set_state(sc, MASTER);
1391                         carp_setroute(sc, RTM_ADD);
1392                 } else {
1393                         CARP_LOG("%s: INIT -> BACKUP\n", SC2IFP(sc)->if_xname);
1394                         carp_set_state(sc, BACKUP);
1395                         carp_setroute(sc, RTM_DELETE);
1396                         carp_setrun(sc, 0);
1397                 }
1398                 break;
1399         case BACKUP:
1400                 callout_stop(&sc->sc_ad_tmo);
1401                 tv.tv_sec = 3 * sc->sc_advbase;
1402                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1403                 switch (af) {
1404 #ifdef INET
1405                 case AF_INET:
1406                         callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1407                             carp_master_down, sc);
1408                         break;
1409 #endif /* INET */
1410 #ifdef INET6
1411                 case AF_INET6:
1412                         callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1413                             carp_master_down, sc);
1414                         break;
1415 #endif /* INET6 */
1416                 default:
1417                         if (sc->sc_naddrs)
1418                                 callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1419                                     carp_master_down, sc);
1420                         if (sc->sc_naddrs6)
1421                                 callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1422                                     carp_master_down, sc);
1423                         break;
1424                 }
1425                 break;
1426         case MASTER:
1427                 tv.tv_sec = sc->sc_advbase;
1428                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1429                 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1430                     carp_send_ad, sc);
1431                 break;
1432         }
1433 }
1434
1435 static void
1436 carp_multicast_cleanup(struct carp_softc *sc)
1437 {
1438         struct ip_moptions *imo = &sc->sc_imo;
1439         u_int16_t n = imo->imo_num_memberships;
1440
1441         /* Clean up our own multicast memberships */
1442         while (n-- > 0) {
1443                 if (imo->imo_membership[n] != NULL) {
1444                         in_delmulti(imo->imo_membership[n]);
1445                         imo->imo_membership[n] = NULL;
1446                 }
1447         }
1448         KASSERT(imo->imo_mfilters == NULL,
1449            ("%s: imo_mfilters != NULL", __func__));
1450         imo->imo_num_memberships = 0;
1451         imo->imo_multicast_ifp = NULL;
1452 }
1453
#ifdef INET6
/*
 * Leave every IPv6 multicast group recorded in sc's multicast options,
 * walking the membership array from the last used slot down to slot 0,
 * then reset the membership count and the multicast interface pointer.
 * No multicast filters are expected to be attached (asserted).
 */
static void
carp_multicast6_cleanup(struct carp_softc *sc)
{
        struct ip6_moptions *im6o = &sc->sc_im6o;
        u_int16_t slot = im6o->im6o_num_memberships;

        while (slot != 0) {
                slot--;
                if (im6o->im6o_membership[slot] == NULL)
                        continue;
                in6_mc_leave(im6o->im6o_membership[slot], NULL);
                im6o->im6o_membership[slot] = NULL;
        }
        KASSERT(im6o->im6o_mfilters == NULL,
           ("%s: im6o_mfilters != NULL", __func__));
        im6o->im6o_num_memberships = 0;
        im6o->im6o_multicast_ifp = NULL;
}
#endif
1473
/*
 * Attach the IPv4 address in sin to the carp softc sc.
 *
 * An all-zero address adds nothing: it only re-evaluates the run state
 * through carp_setrun() and returns 0.
 *
 * Otherwise the global IPv4 address list is searched for a
 * multicast-capable interface, other than the carp interface itself,
 * whose subnet contains the address.  That interface becomes the parent
 * device: the CARP multicast group is joined on it (if sc has no
 * membership yet), a carp_if is created for it if it has none, sc is
 * inserted into the carp_if's list (kept ordered by vhid) and the state
 * machine is kicked.
 *
 * Returns 0 on success, EADDRNOTAVAIL when no suitable interface is
 * found or it conflicts with the one already used for multicast,
 * ENOBUFS when the multicast join fails, EEXIST when another softc on
 * the same parent already uses this vhid, or the error returned by
 * ifpromisc().
 */
static int
carp_set_addr(struct carp_softc *sc, struct sockaddr_in *sin)
{
        struct ifnet *ifp;
        struct carp_if *cif;
        struct in_ifaddr *ia, *ia_if;
        struct ip_moptions *imo = &sc->sc_imo;
        struct in_addr addr;
        /*
         * NOTE(review): the value is compared against ia_subnet below, so
         * this looks like a network-to-host conversion; ntohl() would
         * state the intent better -- confirm.
         */
        u_long iaddr = htonl(sin->sin_addr.s_addr);
        int own, error;

        /* Zero address: just refresh the run state. */
        if (sin->sin_addr.s_addr == 0) {
                if (!(SC2IFP(sc)->if_flags & IFF_UP))
                        carp_set_state(sc, INIT);
                if (sc->sc_naddrs)
                        SC2IFP(sc)->if_flags |= IFF_UP;
                /* The softc lock is only taken when a parent is attached. */
                if (sc->sc_carpdev)
                        CARP_SCLOCK(sc);
                carp_setrun(sc, 0);
                if (sc->sc_carpdev)
                        CARP_SCUNLOCK(sc);
                return (0);
        }

        /* we have to do it by hand to check we won't match on ourselves */
        ia_if = NULL; own = 0;
        IN_IFADDR_RLOCK();
        TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
                /* and, yeah, we need a multicast-capable iface too */
                if (ia->ia_ifp != SC2IFP(sc) &&
                    (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
                    (iaddr & ia->ia_subnetmask) == ia->ia_subnet) {
                        /* The first match decides the parent interface. */
                        if (!ia_if)
                                ia_if = ia;
                        /* Count exact matches: the address is our own. */
                        if (sin->sin_addr.s_addr ==
                            ia->ia_addr.sin_addr.s_addr)
                                own++;
                }
        }

        if (!ia_if) {
                IN_IFADDR_RUNLOCK();
                return (EADDRNOTAVAIL);
        }

        /* Hold a reference on the chosen address before dropping the lock. */
        ia = ia_if;
        ifa_ref(&ia->ia_ifa);
        IN_IFADDR_RUNLOCK();

        ifp = ia->ia_ifp;

        /* Refuse a parent that differs from the one already in use. */
        if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
            (imo->imo_multicast_ifp && imo->imo_multicast_ifp != ifp)) {
                ifa_free(&ia->ia_ifa);
                return (EADDRNOTAVAIL);
        }

        /* First address on this softc: join the CARP multicast group. */
        if (imo->imo_num_memberships == 0) {
                addr.s_addr = htonl(INADDR_CARP_GROUP);
                if ((imo->imo_membership[0] = in_addmulti(&addr, ifp)) ==
                    NULL) {
                        ifa_free(&ia->ia_ifa);
                        return (ENOBUFS);
                }
                imo->imo_num_memberships++;
                imo->imo_multicast_ifp = ifp;
                imo->imo_multicast_ttl = CARP_DFLTTL;
                imo->imo_multicast_loop = 0;
        }

        if (!ifp->if_carp) {

                /* First carp on this parent: create its carp_if. */
                cif = malloc(sizeof(*cif), M_CARP,
                    M_WAITOK|M_ZERO);
                /*
                 * NOTE(review): with M_WAITOK the allocation presumably
                 * cannot fail, which would make this check dead code --
                 * confirm against malloc(9).
                 */
                if (!cif) {
                        error = ENOBUFS;
                        goto cleanup;
                }
                if ((error = ifpromisc(ifp, 1))) {
                        free(cif, M_CARP);
                        goto cleanup;
                }

                CARP_LOCK_INIT(cif);
                CARP_LOCK(cif);
                cif->vhif_ifp = ifp;
                TAILQ_INIT(&cif->vhif_vrs);
                ifp->if_carp = cif;

        } else {
                struct carp_softc *vr;

                /* Existing carp_if: reject a vhid used by another softc. */
                cif = (struct carp_if *)ifp->if_carp;
                CARP_LOCK(cif);
                TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
                        if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
                                CARP_UNLOCK(cif);
                                error = EEXIST;
                                goto cleanup;
                        }
        }
        /* From here on the carp_if lock is held (taken in either branch). */
        sc->sc_ia = ia;
        sc->sc_carpdev = ifp;

        { /* XXX prevent endless loop if already in queue */
        struct carp_softc *vr, *after = NULL;
        int myself = 0;
        cif = (struct carp_if *)ifp->if_carp;

        /* XXX: cif should not change, right? So we still hold the lock */
        CARP_LOCK_ASSERT(cif);

        /*
         * Find out whether sc is already on the list and remember the
         * last entry whose vhid is lower than ours.
         */
        TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
                if (vr == sc)
                        myself = 1;
                if (vr->sc_vhid < sc->sc_vhid)
                        after = vr;
        }

        if (!myself) {
                /* We're trying to keep things in order */
                if (after == NULL) {
                        TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
                } else {
                        TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
                }
                cif->vhif_nvrs++;
        }
        }

        sc->sc_naddrs++;
        SC2IFP(sc)->if_flags |= IFF_UP;
        /* The address is configured on the parent itself: no skew. */
        if (own)
                sc->sc_advskew = 0;
        carp_sc_state_locked(sc);
        carp_setrun(sc, 0);

        CARP_UNLOCK(cif);
        ifa_free(&ia->ia_ifa);  /* XXXRW: should hold reference for softc. */

        return (0);

cleanup:
        /*
         * NOTE(review): this drops the most recent membership even when
         * it was not joined by this call, and imo_multicast_ifp is left
         * pointing at ifp -- confirm that both are intended.
         */
        in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
        ifa_free(&ia->ia_ifa);
        return (error);
}
1621
1622 static int
1623 carp_del_addr(struct carp_softc *sc, struct sockaddr_in *sin)
1624 {
1625         int error = 0;
1626
1627         if (!--sc->sc_naddrs) {
1628                 struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
1629                 struct ip_moptions *imo = &sc->sc_imo;
1630
1631                 CARP_LOCK(cif);
1632                 callout_stop(&sc->sc_ad_tmo);
1633                 SC2IFP(sc)->if_flags &= ~IFF_UP;
1634                 SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
1635                 sc->sc_vhid = -1;
1636                 in_delmulti(imo->imo_membership[--imo->imo_num_memberships]);
1637                 imo->imo_multicast_ifp = NULL;
1638                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
1639                 if (!--cif->vhif_nvrs) {
1640                         sc->sc_carpdev->if_carp = NULL;
1641                         CARP_LOCK_DESTROY(cif);
1642                         free(cif, M_CARP);
1643                 } else {
1644                         CARP_UNLOCK(cif);
1645                 }
1646         }
1647
1648         return (error);
1649 }
1650
1651 #ifdef INET6
1652 static int
1653 carp_set_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1654 {
1655         struct ifnet *ifp;
1656         struct carp_if *cif;
1657         struct in6_ifaddr *ia, *ia_if;
1658         struct ip6_moptions *im6o = &sc->sc_im6o;
1659         struct in6_addr in6;
1660         int own, error;
1661
1662         error = 0;
1663
1664         if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1665                 if (!(SC2IFP(sc)->if_flags & IFF_UP))
1666                         carp_set_state(sc, INIT);
1667                 if (sc->sc_naddrs6)
1668                         SC2IFP(sc)->if_flags |= IFF_UP;
1669                 if (sc->sc_carpdev)
1670                         CARP_SCLOCK(sc);
1671                 carp_setrun(sc, 0);
1672                 if (sc->sc_carpdev)
1673                         CARP_SCUNLOCK(sc);
1674                 return (0);
1675         }
1676
1677         /* we have to do it by hands to check we won't match on us */
1678         ia_if = NULL; own = 0;
1679         IN6_IFADDR_RLOCK();
1680         TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
1681                 int i;
1682
1683                 for (i = 0; i < 4; i++) {
1684                         if ((sin6->sin6_addr.s6_addr32[i] &
1685                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]) !=
1686                             (ia->ia_addr.sin6_addr.s6_addr32[i] &
1687                             ia->ia_prefixmask.sin6_addr.s6_addr32[i]))
1688                                 break;
1689                 }
1690                 /* and, yeah, we need a multicast-capable iface too */
1691                 if (ia->ia_ifp != SC2IFP(sc) &&
1692                     (ia->ia_ifp->if_flags & IFF_MULTICAST) &&
1693                     (i == 4)) {
1694                         if (!ia_if)
1695                                 ia_if = ia;
1696                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
1697                             &ia->ia_addr.sin6_addr))
1698                                 own++;
1699                 }
1700         }
1701
1702         if (!ia_if) {
1703                 IN6_IFADDR_RUNLOCK();
1704                 return (EADDRNOTAVAIL);
1705         }
1706         ia = ia_if;
1707         ifa_ref(&ia->ia_ifa);
1708         IN6_IFADDR_RUNLOCK();
1709         ifp = ia->ia_ifp;
1710
1711         if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0 ||
1712             (im6o->im6o_multicast_ifp && im6o->im6o_multicast_ifp != ifp)) {
1713                 ifa_free(&ia->ia_ifa);
1714                 return (EADDRNOTAVAIL);
1715         }
1716
1717         if (!sc->sc_naddrs6) {
1718                 struct in6_multi *in6m;
1719
1720                 im6o->im6o_multicast_ifp = ifp;
1721
1722                 /* join CARP multicast address */
1723                 bzero(&in6, sizeof(in6));
1724                 in6.s6_addr16[0] = htons(0xff02);
1725                 in6.s6_addr8[15] = 0x12;
1726                 if (in6_setscope(&in6, ifp, NULL) != 0)
1727                         goto cleanup;
1728                 in6m = NULL;
1729                 error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
1730                 if (error)
1731                         goto cleanup;
1732                 im6o->im6o_membership[0] = in6m;
1733                 im6o->im6o_num_memberships++;
1734
1735                 /* join solicited multicast address */
1736                 bzero(&in6, sizeof(in6));
1737                 in6.s6_addr16[0] = htons(0xff02);
1738                 in6.s6_addr32[1] = 0;
1739                 in6.s6_addr32[2] = htonl(1);
1740                 in6.s6_addr32[3] = sin6->sin6_addr.s6_addr32[3];
1741                 in6.s6_addr8[12] = 0xff;
1742                 if (in6_setscope(&in6, ifp, NULL) != 0)
1743                         goto cleanup;
1744                 in6m = NULL;
1745                 error = in6_mc_join(ifp, &in6, NULL, &in6m, 0);
1746                 if (error)
1747                         goto cleanup;
1748                 im6o->im6o_membership[1] = in6m;
1749                 im6o->im6o_num_memberships++;
1750         }
1751
1752         if (!ifp->if_carp) {
1753                 cif = malloc(sizeof(*cif), M_CARP,
1754                     M_WAITOK|M_ZERO);
1755                 if (!cif) {
1756                         error = ENOBUFS;
1757                         goto cleanup;
1758                 }
1759                 if ((error = ifpromisc(ifp, 1))) {
1760                         free(cif, M_CARP);
1761                         goto cleanup;
1762                 }
1763
1764                 CARP_LOCK_INIT(cif);
1765                 CARP_LOCK(cif);
1766                 cif->vhif_ifp = ifp;
1767                 TAILQ_INIT(&cif->vhif_vrs);
1768                 ifp->if_carp = cif;
1769
1770         } else {
1771                 struct carp_softc *vr;
1772
1773                 cif = (struct carp_if *)ifp->if_carp;
1774                 CARP_LOCK(cif);
1775                 TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
1776                         if (vr != sc && vr->sc_vhid == sc->sc_vhid) {
1777                                 CARP_UNLOCK(cif);
1778                                 error = EINVAL;
1779                                 goto cleanup;
1780                         }
1781         }
1782         sc->sc_ia6 = ia;
1783         sc->sc_carpdev = ifp;
1784
1785         { /* XXX prevent endless loop if already in queue */
1786         struct carp_softc *vr, *after = NULL;
1787         int myself = 0;
1788         cif = (struct carp_if *)ifp->if_carp;
1789         CARP_LOCK_ASSERT(cif);
1790
1791         TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list) {
1792                 if (vr == sc)
1793                         myself = 1;
1794                 if (vr->sc_vhid < sc->sc_vhid)
1795                         after = vr;
1796         }
1797
1798         if (!myself) {
1799                 /* We're trying to keep things in order */
1800                 if (after == NULL) {
1801                         TAILQ_INSERT_TAIL(&cif->vhif_vrs, sc, sc_list);
1802                 } else {
1803                         TAILQ_INSERT_AFTER(&cif->vhif_vrs, after, sc, sc_list);
1804                 }
1805                 cif->vhif_nvrs++;
1806         }
1807         }
1808
1809         sc->sc_naddrs6++;
1810         SC2IFP(sc)->if_flags |= IFF_UP;
1811         if (own)
1812                 sc->sc_advskew = 0;
1813         carp_sc_state_locked(sc);
1814         carp_setrun(sc, 0);
1815
1816         CARP_UNLOCK(cif);
1817         ifa_free(&ia->ia_ifa);  /* XXXRW: should hold reference for softc. */
1818
1819         return (0);
1820
1821 cleanup:
1822         if (!sc->sc_naddrs6)
1823                 carp_multicast6_cleanup(sc);
1824         ifa_free(&ia->ia_ifa);
1825         return (error);
1826 }
1827
1828 static int
1829 carp_del_addr6(struct carp_softc *sc, struct sockaddr_in6 *sin6)
1830 {
1831         int error = 0;
1832
1833         if (!--sc->sc_naddrs6) {
1834                 struct carp_if *cif = (struct carp_if *)sc->sc_carpdev->if_carp;
1835
1836                 CARP_LOCK(cif);
1837                 callout_stop(&sc->sc_ad_tmo);
1838                 SC2IFP(sc)->if_flags &= ~IFF_UP;
1839                 SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
1840                 sc->sc_vhid = -1;
1841                 carp_multicast6_cleanup(sc);
1842                 TAILQ_REMOVE(&cif->vhif_vrs, sc, sc_list);
1843                 if (!--cif->vhif_nvrs) {
1844                         CARP_LOCK_DESTROY(cif);
1845                         sc->sc_carpdev->if_carp = NULL;
1846                         free(cif, M_CARP);
1847                 } else
1848                         CARP_UNLOCK(cif);
1849         }
1850
1851         return (error);
1852 }
1853 #endif /* INET6 */
1854
1855 static int
1856 carp_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
1857 {
1858         struct carp_softc *sc = ifp->if_softc, *vr;
1859         struct carpreq carpr;
1860         struct ifaddr *ifa;
1861         struct ifreq *ifr;
1862         struct ifaliasreq *ifra;
1863         int locked = 0, error = 0;
1864
1865         ifa = (struct ifaddr *)addr;
1866         ifra = (struct ifaliasreq *)addr;
1867         ifr = (struct ifreq *)addr;
1868
1869         switch (cmd) {
1870         case SIOCSIFADDR:
1871                 switch (ifa->ifa_addr->sa_family) {
1872 #ifdef INET
1873                 case AF_INET:
1874                         SC2IFP(sc)->if_flags |= IFF_UP;
1875                         bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
1876                             sizeof(struct sockaddr));
1877                         error = carp_set_addr(sc, satosin(ifa->ifa_addr));
1878                         break;
1879 #endif /* INET */
1880 #ifdef INET6
1881                 case AF_INET6:
1882                         SC2IFP(sc)->if_flags |= IFF_UP;
1883                         error = carp_set_addr6(sc, satosin6(ifa->ifa_addr));
1884                         break;
1885 #endif /* INET6 */
1886                 default:
1887                         error = EAFNOSUPPORT;
1888                         break;
1889                 }
1890                 break;
1891
1892         case SIOCAIFADDR:
1893                 switch (ifa->ifa_addr->sa_family) {
1894 #ifdef INET
1895                 case AF_INET:
1896                         SC2IFP(sc)->if_flags |= IFF_UP;
1897                         bcopy(ifa->ifa_addr, ifa->ifa_dstaddr,
1898                             sizeof(struct sockaddr));
1899                         error = carp_set_addr(sc, satosin(&ifra->ifra_addr));
1900                         break;
1901 #endif /* INET */
1902 #ifdef INET6
1903                 case AF_INET6:
1904                         SC2IFP(sc)->if_flags |= IFF_UP;
1905                         error = carp_set_addr6(sc, satosin6(&ifra->ifra_addr));
1906                         break;
1907 #endif /* INET6 */
1908                 default:
1909                         error = EAFNOSUPPORT;
1910                         break;
1911                 }
1912                 break;
1913
1914         case SIOCDIFADDR:
1915                 switch (ifa->ifa_addr->sa_family) {
1916 #ifdef INET
1917                 case AF_INET:
1918                         error = carp_del_addr(sc, satosin(&ifra->ifra_addr));
1919                         break;
1920 #endif /* INET */
1921 #ifdef INET6
1922                 case AF_INET6:
1923                         error = carp_del_addr6(sc, satosin6(&ifra->ifra_addr));
1924                         break;
1925 #endif /* INET6 */
1926                 default:
1927                         error = EAFNOSUPPORT;
1928                         break;
1929                 }
1930                 break;
1931
1932         case SIOCSIFFLAGS:
1933                 if (sc->sc_carpdev) {
1934                         locked = 1;
1935                         CARP_SCLOCK(sc);
1936                 }
1937                 if (sc->sc_state != INIT && !(ifr->ifr_flags & IFF_UP)) {
1938                         callout_stop(&sc->sc_ad_tmo);
1939                         callout_stop(&sc->sc_md_tmo);
1940                         callout_stop(&sc->sc_md6_tmo);
1941                         if (sc->sc_state == MASTER)
1942                                 carp_send_ad_locked(sc);
1943                         carp_set_state(sc, INIT);
1944                         carp_setrun(sc, 0);
1945                 } else if (sc->sc_state == INIT && (ifr->ifr_flags & IFF_UP)) {
1946                         SC2IFP(sc)->if_flags |= IFF_UP;
1947                         carp_setrun(sc, 0);
1948                 }
1949                 break;
1950
1951         case SIOCSVH:
1952                 error = priv_check(curthread, PRIV_NETINET_CARP);
1953                 if (error)
1954                         break;
1955                 if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
1956                         break;
1957                 error = 1;
1958                 if (sc->sc_carpdev) {
1959                         locked = 1;
1960                         CARP_SCLOCK(sc);
1961                 }
1962                 if (sc->sc_state != INIT && carpr.carpr_state != sc->sc_state) {
1963                         switch (carpr.carpr_state) {
1964                         case BACKUP:
1965                                 callout_stop(&sc->sc_ad_tmo);
1966                                 carp_set_state(sc, BACKUP);
1967                                 carp_setrun(sc, 0);
1968                                 carp_setroute(sc, RTM_DELETE);
1969                                 break;
1970                         case MASTER:
1971                                 carp_master_down_locked(sc);
1972                                 break;
1973                         default:
1974                                 break;
1975                         }
1976                 }
1977                 if (carpr.carpr_vhid > 0) {
1978                         if (carpr.carpr_vhid > 255) {
1979                                 error = EINVAL;
1980                                 break;
1981                         }
1982                         if (sc->sc_carpdev) {
1983                                 struct carp_if *cif;
1984                                 cif = (struct carp_if *)sc->sc_carpdev->if_carp;
1985                                 TAILQ_FOREACH(vr, &cif->vhif_vrs, sc_list)
1986                                         if (vr != sc &&
1987                                             vr->sc_vhid == carpr.carpr_vhid) {
1988                                                 error = EEXIST;
1989                                                 break;
1990                                         }
1991                                 if (error == EEXIST)
1992                                         break;
1993                         }
1994                         sc->sc_vhid = carpr.carpr_vhid;
1995                         IF_LLADDR(sc->sc_ifp)[0] = 0;
1996                         IF_LLADDR(sc->sc_ifp)[1] = 0;
1997                         IF_LLADDR(sc->sc_ifp)[2] = 0x5e;
1998                         IF_LLADDR(sc->sc_ifp)[3] = 0;
1999                         IF_LLADDR(sc->sc_ifp)[4] = 1;
2000                         IF_LLADDR(sc->sc_ifp)[5] = sc->sc_vhid;
2001                         error--;
2002                 }
2003                 if (carpr.carpr_advbase > 0 || carpr.carpr_advskew > 0) {
2004                         if (carpr.carpr_advskew >= 255) {
2005                                 error = EINVAL;
2006                                 break;
2007                         }
2008                         if (carpr.carpr_advbase > 255) {
2009                                 error = EINVAL;
2010                                 break;
2011                         }
2012                         sc->sc_advbase = carpr.carpr_advbase;
2013                         sc->sc_advskew = carpr.carpr_advskew;
2014                         error--;
2015                 }
2016                 bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
2017                 if (error > 0)
2018                         error = EINVAL;
2019                 else {
2020                         error = 0;
2021                         carp_setrun(sc, 0);
2022                 }
2023                 break;
2024
2025         case SIOCGVH:
2026                 /* XXX: lockless read */
2027                 bzero(&carpr, sizeof(carpr));
2028                 carpr.carpr_state = sc->sc_state;
2029                 carpr.carpr_vhid = sc->sc_vhid;
2030                 carpr.carpr_advbase = sc->sc_advbase;
2031                 carpr.carpr_advskew = sc->sc_advskew;
2032                 error = priv_check(curthread, PRIV_NETINET_CARP);
2033                 if (error == 0)
2034                         bcopy(sc->sc_key, carpr.carpr_key,
2035                             sizeof(carpr.carpr_key));
2036                 error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
2037                 break;
2038
2039         default:
2040                 error = EINVAL;
2041         }
2042
2043         if (locked)
2044                 CARP_SCUNLOCK(sc);
2045
2046         carp_hmac_prepare(sc);
2047
2048         return (error);
2049 }
2050
2051 /*
2052  * XXX: this is looutput. We should eventually use it from there.
2053  */
2054 static int
2055 carp_looutput(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
2056     struct route *ro)
2057 {
2058         u_int32_t af;
2059         struct rtentry *rt = NULL;
2060
2061         M_ASSERTPKTHDR(m); /* check if we have the packet header */
2062
2063         if (ro != NULL)
2064                 rt = ro->ro_rt;
2065         if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2066                 m_freem(m);
2067                 return (rt->rt_flags & RTF_BLACKHOLE ? 0 :
2068                         rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
2069         }
2070
2071         ifp->if_opackets++;
2072         ifp->if_obytes += m->m_pkthdr.len;
2073
2074         /* BPF writes need to be handled specially. */
2075         if (dst->sa_family == AF_UNSPEC) {
2076                 bcopy(dst->sa_data, &af, sizeof(af));
2077                 dst->sa_family = af;
2078         }
2079
2080 #if 1   /* XXX */
2081         switch (dst->sa_family) {
2082         case AF_INET:
2083         case AF_INET6:
2084         case AF_IPX:
2085         case AF_APPLETALK:
2086                 break;
2087         default:
2088                 printf("carp_looutput: af=%d unexpected\n", dst->sa_family);
2089                 m_freem(m);
2090                 return (EAFNOSUPPORT);
2091         }
2092 #endif
2093         return(if_simloop(ifp, m, dst->sa_family, 0));
2094 }
2095
/*
 * if_start handler of the carp interface.  It is not expected to be
 * called; it does nothing except log the call in DEBUG kernels.
 */
static void
carp_start(struct ifnet *ifp)
{
#ifdef DEBUG
	printf("%s: start called\n", ifp->if_xname);
#else
	(void)ifp;
#endif
}
2106
2107 int
2108 carp_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *sa,
2109     struct rtentry *rt)
2110 {
2111         struct m_tag *mtag;
2112         struct carp_softc *sc;
2113         struct ifnet *carp_ifp;
2114
2115         if (!sa)
2116                 return (0);
2117
2118         switch (sa->sa_family) {
2119 #ifdef INET
2120         case AF_INET:
2121                 break;
2122 #endif /* INET */
2123 #ifdef INET6
2124         case AF_INET6:
2125                 break;
2126 #endif /* INET6 */
2127         default:
2128                 return (0);
2129         }
2130
2131         mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
2132         if (mtag == NULL)
2133                 return (0);
2134
2135         bcopy(mtag + 1, &carp_ifp, sizeof(struct ifnet *));
2136         sc = carp_ifp->if_softc;
2137
2138         /* Set the source MAC address to Virtual Router MAC Address */
2139         switch (ifp->if_type) {
2140         case IFT_ETHER:
2141         case IFT_L2VLAN: {
2142                         struct ether_header *eh;
2143
2144                         eh = mtod(m, struct ether_header *);
2145                         eh->ether_shost[0] = 0;
2146                         eh->ether_shost[1] = 0;
2147                         eh->ether_shost[2] = 0x5e;
2148                         eh->ether_shost[3] = 0;
2149                         eh->ether_shost[4] = 1;
2150                         eh->ether_shost[5] = sc->sc_vhid;
2151                 }
2152                 break;
2153         case IFT_FDDI: {
2154                         struct fddi_header *fh;
2155
2156                         fh = mtod(m, struct fddi_header *);
2157                         fh->fddi_shost[0] = 0;
2158                         fh->fddi_shost[1] = 0;
2159                         fh->fddi_shost[2] = 0x5e;
2160                         fh->fddi_shost[3] = 0;
2161                         fh->fddi_shost[4] = 1;
2162                         fh->fddi_shost[5] = sc->sc_vhid;
2163                 }
2164                 break;
2165         case IFT_ISO88025: {
2166                         struct iso88025_header *th;
2167                         th = mtod(m, struct iso88025_header *);
2168                         th->iso88025_shost[0] = 3;
2169                         th->iso88025_shost[1] = 0;
2170                         th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
2171                         th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
2172                         th->iso88025_shost[4] = 0;
2173                         th->iso88025_shost[5] = 0;
2174                 }
2175                 break;
2176         default:
2177                 printf("%s: carp is not supported for this interface type\n",
2178                     ifp->if_xname);
2179                 return (EOPNOTSUPP);
2180         }
2181
2182         return (0);
2183 }
2184
2185 static void
2186 carp_set_state(struct carp_softc *sc, int state)
2187 {
2188         int link_state;
2189
2190         if (sc->sc_carpdev)
2191                 CARP_SCLOCK_ASSERT(sc);
2192
2193         if (sc->sc_state == state)
2194                 return;
2195
2196         sc->sc_state = state;
2197         switch (state) {
2198         case BACKUP:
2199                 link_state = LINK_STATE_DOWN;
2200                 break;
2201         case MASTER:
2202                 link_state = LINK_STATE_UP;
2203                 break;
2204         default:
2205                 link_state = LINK_STATE_UNKNOWN;
2206                 break;
2207         }
2208         if_link_state_change(SC2IFP(sc), link_state);
2209 }
2210
/*
 * Locking wrapper around carp_carpdev_state_locked().  The argument is
 * the carp_if of the device, passed as an untyped pointer.
 */
void
carp_carpdev_state(void *v)
{
	struct carp_if *cif;

	cif = (struct carp_if *)v;

	CARP_LOCK(cif);
	carp_carpdev_state_locked(cif);
	CARP_UNLOCK(cif);
}
2220
2221 static void
2222 carp_carpdev_state_locked(struct carp_if *cif)
2223 {
2224         struct carp_softc *sc;
2225
2226         TAILQ_FOREACH(sc, &cif->vhif_vrs, sc_list)
2227                 carp_sc_state_locked(sc);
2228 }
2229
2230 static void
2231 carp_sc_state_locked(struct carp_softc *sc)
2232 {
2233         CARP_SCLOCK_ASSERT(sc);
2234
2235         if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
2236             !(sc->sc_carpdev->if_flags & IFF_UP)) {
2237                 sc->sc_flags_backup = SC2IFP(sc)->if_flags;
2238                 SC2IFP(sc)->if_flags &= ~IFF_UP;
2239                 SC2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
2240                 callout_stop(&sc->sc_ad_tmo);
2241                 callout_stop(&sc->sc_md_tmo);
2242                 callout_stop(&sc->sc_md6_tmo);
2243                 carp_set_state(sc, INIT);
2244                 carp_setrun(sc, 0);
2245                 if (!sc->sc_suppress) {
2246                         carp_suppress_preempt++;
2247                         if (carp_suppress_preempt == 1) {
2248                                 CARP_SCUNLOCK(sc);
2249                                 carp_send_ad_all();
2250                                 CARP_SCLOCK(sc);
2251                         }
2252                 }
2253                 sc->sc_suppress = 1;
2254         } else {
2255                 SC2IFP(sc)->if_flags |= sc->sc_flags_backup;
2256                 carp_set_state(sc, INIT);
2257                 carp_setrun(sc, 0);
2258                 if (sc->sc_suppress)
2259                         carp_suppress_preempt--;
2260                 sc->sc_suppress = 0;
2261         }
2262
2263         return;
2264 }
2265
2266 static int
2267 carp_modevent(module_t mod, int type, void *data)
2268 {
2269         switch (type) {
2270         case MOD_LOAD:
2271                 if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
2272                     carp_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
2273                 if (if_detach_event_tag == NULL)
2274                         return (ENOMEM);
2275                 mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
2276                 LIST_INIT(&carpif_list);
2277                 if_clone_attach(&carp_cloner);
2278                 break;
2279
2280         case MOD_UNLOAD:
2281                 EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
2282                 if_clone_detach(&carp_cloner);
2283                 mtx_destroy(&carp_mtx);
2284                 break;
2285
2286         default:
2287                 return (EINVAL);
2288         }
2289
2290         return (0);
2291 }
2292
2293 static moduledata_t carp_mod = {
2294         "carp",
2295         carp_modevent,
2296         0
2297 };
2298
2299 DECLARE_MODULE(carp, carp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);