]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/netinet/ip_carp.c
MFV r254071:
[FreeBSD/FreeBSD.git] / sys / netinet / ip_carp.c
1 /*-
2  * Copyright (c) 2002 Michael Shalayeff.
3  * Copyright (c) 2003 Ryan McBride.
4  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26  * THE POSSIBILITY OF SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include "opt_bpf.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/bus.h>
39 #include <sys/jail.h>
40 #include <sys/kernel.h>
41 #include <sys/limits.h>
42 #include <sys/malloc.h>
43 #include <sys/mbuf.h>
44 #include <sys/module.h>
45 #include <sys/priv.h>
46 #include <sys/proc.h>
47 #include <sys/protosw.h>
48 #include <sys/socket.h>
49 #include <sys/sockio.h>
50 #include <sys/sysctl.h>
51 #include <sys/syslog.h>
52 #include <sys/taskqueue.h>
53 #include <sys/counter.h>
54
55 #include <net/ethernet.h>
56 #include <net/fddi.h>
57 #include <net/if.h>
58 #include <net/if_dl.h>
59 #include <net/if_llatbl.h>
60 #include <net/if_types.h>
61 #include <net/iso88025.h>
62 #include <net/route.h>
63 #include <net/vnet.h>
64
65 #if defined(INET) || defined(INET6)
66 #include <netinet/in.h>
67 #include <netinet/in_var.h>
68 #include <netinet/ip_carp.h>
69 #include <netinet/ip.h>
70 #include <machine/in_cksum.h>
71 #endif
72 #ifdef INET
73 #include <netinet/ip_var.h>
74 #include <netinet/if_ether.h>
75 #endif
76
77 #ifdef INET6
78 #include <netinet/icmp6.h>
79 #include <netinet/ip6.h>
80 #include <netinet6/ip6protosw.h>
81 #include <netinet6/in6_var.h>
82 #include <netinet6/ip6_var.h>
83 #include <netinet6/scope6_var.h>
84 #include <netinet6/nd6.h>
85 #endif
86
87 #include <crypto/sha1.h>
88
/* Declares the file-local malloc type M_CARP (short name "CARP"). */
89 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
90
/*
 * Per-vhid CARP state.  According to the design notes in this file, each
 * softc represents one virtual host id on a parent ifnet, sits on that
 * ifnet's carp_if list (sc_list) and on the global list (sc_next).
 */
91 struct carp_softc {
92         struct ifnet            *sc_carpdev;    /* Pointer to parent ifnet. */
93         struct ifaddr           **sc_ifas;      /* Our ifaddrs. */
94         struct sockaddr_dl      sc_addr;        /* Our link level address. */
95         struct callout          sc_ad_tmo;      /* Advertising timeout. */
96 #ifdef INET
97         struct callout          sc_md_tmo;      /* Master down timeout. */
98 #endif
99 #ifdef INET6
100         struct callout          sc_md6_tmo;     /* XXX: Master down timeout. */
101 #endif
102         struct mtx              sc_mtx;         /* Taken via CARP_LOCK(). */
103
104         int                     sc_vhid;        /* Matched against carp_vhid on input. */
105         int                     sc_advskew;     /* Base skew; see DEMOTE_ADVSKEW(). */
106         int                     sc_advbase;     /* Sent as carp_advbase. */
107
        /* sc_naddrs + sc_naddrs6 bounds the sc_ifas walk in CARP_FOREACH_IFA(). */
108         int                     sc_naddrs;      /* Nonzero enables the INET ad path. */
109         int                     sc_naddrs6;     /* Nonzero enables the INET6 ad path. */
110         int                     sc_ifasiz;      /* NOTE(review): presumably sc_ifas capacity - confirm. */
111         enum { INIT = 0, BACKUP, MASTER }       sc_state;
112         int                     sc_suppress;
113         int                     sc_sendad_errors;       /* Bumped on ip_output() failure. */
114 #define CARP_SENDAD_MAX_ERRORS  3
115         int                     sc_sendad_success;      /* Successes counted while demoted. */
116 #define CARP_SENDAD_MIN_SUCCESS 3
117
118         int                     sc_init_counter;        /* Nonzero: carp_prepare_ad() draws a random counter. */
119         uint64_t                sc_counter;             /* Counter covered by the HMAC. */
120
121         /* authentication */
122 #define CARP_HMAC_PAD   64
123         unsigned char sc_key[CARP_KEY_LEN];
124         unsigned char sc_pad[CARP_HMAC_PAD];    /* Pad block; left as opad by carp_hmac_prepare(). */
125         SHA1_CTX sc_sha1;                       /* Precomputed start of the inner hash. */
126
127         TAILQ_ENTRY(carp_softc) sc_list;        /* On the carp_if list. */
128         LIST_ENTRY(carp_softc)  sc_next;        /* On the global list. */
129 };
130
/*
 * Per-ifnet CARP state, reached through ifp->if_carp.  Holds the list of
 * softcs configured on the interface (cif_vrs, walked by
 * IFNET_FOREACH_CARP()) and the lock taken by CIF_LOCK().
 */
131 struct carp_if {
132 #ifdef INET
133         int     cif_naddrs;
134 #endif
135 #ifdef INET6
136         int     cif_naddrs6;
137 #endif
138         TAILQ_HEAD(, carp_softc) cif_vrs;       /* Softcs, linked via sc_list. */
139 #ifdef INET
140         struct ip_moptions       cif_imo;       /* Passed to ip_output() when sending ads. */
141 #endif
142 #ifdef INET6
143         struct ip6_moptions      cif_im6o;
144 #endif
145         struct ifnet    *cif_ifp;
146         struct mtx      cif_mtx;                /* Taken via CIF_LOCK(). */
147 };
148
/*
 * NOTE(review): CARP_INET/CARP_INET6 look like the indices of the two
 * proto_reg[] slots (both initialised to -1); the code that uses them is
 * not visible from here - confirm.
 */
149 #define CARP_INET       0
150 #define CARP_INET6      1
151 static int proto_reg[] = {-1, -1};
152
153 /*
154  * Brief design of carp(4).
155  *
156  * Any carp-capable ifnet may have a list of carp softcs hanging off
157  * its ifp->if_carp pointer. Each softc represents one unique virtual
158  * host id, or vhid. The softc has a back pointer to the ifnet. All
159  * softcs are joined in a global list, which has quite limited use.
160  *
161  * Any interface address that takes part in CARP negotiation has a
162  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
163  * AF_INET or AF_INET6 address.
164  *
165  * Although, one can get the softc's backpointer to ifnet and traverse
166  * through its ifp->if_addrhead queue to find all interface addresses
167  * involved in CARP, we keep a growable array of ifaddr pointers. This
168  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
169  * do calls into the network stack, thus avoiding LORs.
170  *
171  * Locking:
172  *
173  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
174  * callout-driven events and ioctl()s.
175  *
176  * To traverse the list of softcs on an ifnet we use CIF_LOCK(), to
177  * traverse the global list we use the mutex carp_mtx.
178  *
179  * Known issues with locking:
180  *
181  * - There is no protection for races between two ioctl() requests,
182  *   neither SIOCSVH, nor SIOCAIFADDR & SIOCAIFADDR_IN6. I think that all
183  *   interface ioctl()s should be serialized right in net/if.c.
184  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
185  *   counting is done on the softc.
186  * - On module unload we may race (?) with packet processing thread
187  *   dereferencing our function pointers.
188  */
189
/*
 * Tunables and their sysctl bindings under the net.inet.carp node.
 * All are plain read/write integers except "demotion", which is routed
 * through the carp_demote_adj_sysctl() handler instead of being bound
 * directly to carp_demotion.
 */
190 static int carp_allow = 1;              /* Accept incoming CARP packets. */
191 static int carp_preempt = 0;            /* Preempt slower nodes. */
192 static int carp_log = 1;                /* Log level. */
193 static int carp_demotion = 0;           /* Global advskew demotion. */
194 static int carp_senderr_adj = CARP_MAXSKEW;     /* Send error demotion factor */
195 static int carp_ifdown_adj = CARP_MAXSKEW;      /* Iface down demotion factor */
196 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
197
198 SYSCTL_NODE(_net_inet, IPPROTO_CARP,    carp,   CTLFLAG_RW, 0,  "CARP");
199 SYSCTL_INT(_net_inet_carp, OID_AUTO, allow, CTLFLAG_RW, &carp_allow, 0,
200     "Accept incoming CARP packets");
201 SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_RW, &carp_preempt, 0,
202     "High-priority backup preemption mode");
203 SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_RW, &carp_log, 0,
204     "CARP log level");
205 SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion, CTLTYPE_INT|CTLFLAG_RW,
206     0, 0, carp_demote_adj_sysctl, "I",
207     "Adjust demotion factor (skew of advskew)");
208 SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor, CTLFLAG_RW,
209     &carp_senderr_adj, 0, "Send error demotion factor adjustment");
210 SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor, CTLFLAG_RW,
211     &carp_ifdown_adj, 0, "Interface down demotion factor adjustment");
212
/*
 * Statistics are kept as an array with one counter_u64_t per
 * uint64_t-sized slot of struct carpstats.  CARPSTATS_ADD() turns a field
 * name into its array index with offsetof() and adds val to that counter;
 * CARPSTATS_INC() adds 1.
 */
213 static counter_u64_t carpstats[sizeof(struct carpstats) / sizeof(uint64_t)];
214 #define CARPSTATS_ADD(name, val)        \
215     counter_u64_add(carpstats[offsetof(struct carpstats, name) / \
216         sizeof(uint64_t)], (val))
217 #define CARPSTATS_INC(name)             CARPSTATS_ADD(name, 1)
218
219 static int
220 carpstats_sysctl(SYSCTL_HANDLER_ARGS)
221 {
222         struct carpstats s;
223
224         COUNTER_ARRAY_COPY(carpstats, &s, sizeof(s) / sizeof(uint64_t));
225         if (req->newptr)
226                 COUNTER_ARRAY_ZERO(carpstats, sizeof(s) / sizeof(uint64_t));
227         return (SYSCTL_OUT(req, &s, sizeof(s)));
228 }
229 SYSCTL_PROC(_net_inet_carp, OID_AUTO, stats, CTLTYPE_OPAQUE | CTLFLAG_RW,
230     NULL, 0, carpstats_sysctl, "I",
231     "CARP statistics (struct carpstats, netinet/ip_carp.h)");
232
/* Wrappers around the per-softc mutex, sc_mtx. */
233 #define CARP_LOCK_INIT(sc)      mtx_init(&(sc)->sc_mtx, "carp_softc",   \
234         NULL, MTX_DEF)
235 #define CARP_LOCK_DESTROY(sc)   mtx_destroy(&(sc)->sc_mtx)
236 #define CARP_LOCK_ASSERT(sc)    mtx_assert(&(sc)->sc_mtx, MA_OWNED)
237 #define CARP_LOCK(sc)           mtx_lock(&(sc)->sc_mtx)
238 #define CARP_UNLOCK(sc)         mtx_unlock(&(sc)->sc_mtx)
/* Wrappers around the per-interface mutex, cif_mtx. */
239 #define CIF_LOCK_INIT(cif)      mtx_init(&(cif)->cif_mtx, "carp_if",   \
240         NULL, MTX_DEF)
241 #define CIF_LOCK_DESTROY(cif)   mtx_destroy(&(cif)->cif_mtx)
242 #define CIF_LOCK_ASSERT(cif)    mtx_assert(&(cif)->cif_mtx, MA_OWNED)
243 #define CIF_LOCK(cif)           mtx_lock(&(cif)->cif_mtx)
244 #define CIF_UNLOCK(cif)         mtx_unlock(&(cif)->cif_mtx)
/*
 * Must be entered with the cif lock held.  If no softc is left on the
 * interface the carp_if is handed to carp_free_if(); otherwise the lock is
 * simply dropped.  NOTE(review): carp_free_if() is presumably responsible
 * for the lock in the empty case - its body is not visible here.
 */
245 #define CIF_FREE(cif)   do {                            \
246                 CIF_LOCK_ASSERT(cif);                   \
247                 if (TAILQ_EMPTY(&(cif)->cif_vrs))       \
248                         carp_free_if(cif);              \
249                 else                                    \
250                         CIF_UNLOCK(cif);                \
251 } while (0)
252
/* Logs at LOG_INFO with a "carp: " prefix when carp_log > 0. */
253 #define CARP_LOG(...)   do {                            \
254         if (carp_log > 0)                               \
255                 log(LOG_INFO, "carp: " __VA_ARGS__);    \
256 } while (0)
257
/* Logs at LOG_DEBUG (no prefix added) when carp_log > 1. */
258 #define CARP_DEBUG(...) do {                            \
259         if (carp_log > 1)                               \
260                 log(LOG_DEBUG, __VA_ARGS__);            \
261 } while (0)
262
/*
 * Iteration helpers.
 *
 * All three FOREACH macros expand to a lock assertion *statement* followed
 * by a loop header, so they must not be used as the sole body of an
 * unbraced if/else/for.  Every macro argument is parenthesized so that
 * arbitrary pointer expressions can be passed.
 */

/*
 * Walk the addresses on ifp that have CARP state attached (ifa_carp set).
 * Note that the expansion ends in an "if", so an "else" directly after the
 * loop body would bind to it.
 */
#define IFNET_FOREACH_IFA(ifp, ifa)                                     \
        IF_ADDR_LOCK_ASSERT(ifp);                                       \
        TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)             \
                if ((ifa)->ifa_carp != NULL)

/*
 * Walk the softc's own ifaddr array.  The walk stops at
 * sc_naddrs + sc_naddrs6 entries or at the first NULL slot, whichever
 * comes first.
 */
#define CARP_FOREACH_IFA(sc, ifa)                                       \
        CARP_LOCK_ASSERT(sc);                                           \
        for (int _i = 0;                                                \
                _i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&              \
                ((ifa) = (sc)->sc_ifas[_i]) != NULL;                    \
                ++_i)

/* Walk every softc configured on ifp; the cif lock must be held. */
#define IFNET_FOREACH_CARP(ifp, sc)                                     \
        CIF_LOCK_ASSERT((ifp)->if_carp);                                \
        TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)

/* Effective advskew: sc_advskew plus carp_demotion, capped at CARP_MAXSKEW. */
#define DEMOTE_ADVSKEW(sc)                                      \
    (((sc)->sc_advskew + carp_demotion > CARP_MAXSKEW) ?        \
    CARP_MAXSKEW : ((sc)->sc_advskew + carp_demotion))
282
/* Forward declarations of file-local functions. */
283 static void     carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
284 static struct carp_softc
285                 *carp_alloc(struct ifnet *);
286 static void     carp_detach_locked(struct ifaddr *);
287 static void     carp_destroy(struct carp_softc *);
288 static struct carp_if
289                 *carp_alloc_if(struct ifnet *);
290 static void     carp_free_if(struct carp_if *);
291 static void     carp_set_state(struct carp_softc *, int);
292 static void     carp_sc_state(struct carp_softc *);
293 static void     carp_setrun(struct carp_softc *, sa_family_t);
294 static void     carp_master_down(void *);
295 static void     carp_master_down_locked(struct carp_softc *);
296 static void     carp_send_ad(void *);
297 static void     carp_send_ad_locked(struct carp_softc *);
298 static void     carp_addroute(struct carp_softc *);
299 static void     carp_ifa_addroute(struct ifaddr *);
300 static void     carp_delroute(struct carp_softc *);
301 static void     carp_ifa_delroute(struct ifaddr *);
302 static void     carp_send_ad_all(void *, int);
303 static void     carp_demote_adj(int, char *);
304
/*
 * Global list of all softcs (linked through sc_next) and the mutex used to
 * traverse it, plus the task whose handler, carp_send_ad_all(), sends an
 * advertisement for every MASTER softc.
 */
305 static LIST_HEAD(, carp_softc) carp_list;
306 static struct mtx carp_mtx;
307 static struct task carp_sendall_task =
308     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
309
310 static void
311 carp_hmac_prepare(struct carp_softc *sc)
312 {
313         uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
314         uint8_t vhid = sc->sc_vhid & 0xff;
315         struct ifaddr *ifa;
316         int i, found;
317 #ifdef INET
318         struct in_addr last, cur, in;
319 #endif
320 #ifdef INET6
321         struct in6_addr last6, cur6, in6;
322 #endif
323
324         CARP_LOCK_ASSERT(sc);
325
326         /* Compute ipad from key. */
327         bzero(sc->sc_pad, sizeof(sc->sc_pad));
328         bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
329         for (i = 0; i < sizeof(sc->sc_pad); i++)
330                 sc->sc_pad[i] ^= 0x36;
331
332         /* Precompute first part of inner hash. */
333         SHA1Init(&sc->sc_sha1);
334         SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
335         SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
336         SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
337         SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
338 #ifdef INET
339         cur.s_addr = 0;
340         do {
341                 found = 0;
342                 last = cur;
343                 cur.s_addr = 0xffffffff;
344                 CARP_FOREACH_IFA(sc, ifa) {
345                         in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
346                         if (ifa->ifa_addr->sa_family == AF_INET &&
347                             ntohl(in.s_addr) > ntohl(last.s_addr) &&
348                             ntohl(in.s_addr) < ntohl(cur.s_addr)) {
349                                 cur.s_addr = in.s_addr;
350                                 found++;
351                         }
352                 }
353                 if (found)
354                         SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
355         } while (found);
356 #endif /* INET */
357 #ifdef INET6
358         memset(&cur6, 0, sizeof(cur6));
359         do {
360                 found = 0;
361                 last6 = cur6;
362                 memset(&cur6, 0xff, sizeof(cur6));
363                 CARP_FOREACH_IFA(sc, ifa) {
364                         in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
365                         if (IN6_IS_SCOPE_EMBED(&in6))
366                                 in6.s6_addr16[1] = 0;
367                         if (ifa->ifa_addr->sa_family == AF_INET6 &&
368                             memcmp(&in6, &last6, sizeof(in6)) > 0 &&
369                             memcmp(&in6, &cur6, sizeof(in6)) < 0) {
370                                 cur6 = in6;
371                                 found++;
372                         }
373                 }
374                 if (found)
375                         SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
376         } while (found);
377 #endif /* INET6 */
378
379         /* convert ipad to opad */
380         for (i = 0; i < sizeof(sc->sc_pad); i++)
381                 sc->sc_pad[i] ^= 0x36 ^ 0x5c;
382 }
383
384 static void
385 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
386     unsigned char md[20])
387 {
388         SHA1_CTX sha1ctx;
389
390         CARP_LOCK_ASSERT(sc);
391
392         /* fetch first half of inner hash */
393         bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
394
395         SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
396         SHA1Final(md, &sha1ctx);
397
398         /* outer hash */
399         SHA1Init(&sha1ctx);
400         SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
401         SHA1Update(&sha1ctx, md, 20);
402         SHA1Final(md, &sha1ctx);
403 }
404
405 static int
406 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
407     unsigned char md[20])
408 {
409         unsigned char md2[20];
410
411         CARP_LOCK_ASSERT(sc);
412
413         carp_hmac_generate(sc, counter, md2);
414
415         return (bcmp(md, md2, sizeof(md2)));
416 }
417
418 /*
419  * process input packet.
420  * we have rearranged checks order compared to the rfc,
421  * but it seems more efficient this way or not possible otherwise.
422  */
423 #ifdef INET
424 void
425 carp_input(struct mbuf *m, int hlen)
426 {
427         struct ip *ip = mtod(m, struct ip *);
428         struct carp_header *ch;
429         int iplen, len;
430
431         CARPSTATS_INC(carps_ipackets);
432
433         if (!carp_allow) {
434                 m_freem(m);
435                 return;
436         }
437
438         /* verify that the IP TTL is 255.  */
439         if (ip->ip_ttl != CARP_DFLTTL) {
440                 CARPSTATS_INC(carps_badttl);
441                 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
442                     ip->ip_ttl,
443                     m->m_pkthdr.rcvif->if_xname);
444                 m_freem(m);
445                 return;
446         }
447
448         iplen = ip->ip_hl << 2;
449
450         if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
451                 CARPSTATS_INC(carps_badlen);
452                 CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
453                     "on %s\n", __func__, m->m_len - sizeof(struct ip),
454                     m->m_pkthdr.rcvif->if_xname);
455                 m_freem(m);
456                 return;
457         }
458
459         if (iplen + sizeof(*ch) < m->m_len) {
460                 if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
461                         CARPSTATS_INC(carps_hdrops);
462                         CARP_DEBUG("%s: pullup failed\n", __func__);
463                         return;
464                 }
465                 ip = mtod(m, struct ip *);
466         }
467         ch = (struct carp_header *)((char *)ip + iplen);
468
469         /*
470          * verify that the received packet length is
471          * equal to the CARP header
472          */
473         len = iplen + sizeof(*ch);
474         if (len > m->m_pkthdr.len) {
475                 CARPSTATS_INC(carps_badlen);
476                 CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
477                     m->m_pkthdr.len,
478                     m->m_pkthdr.rcvif->if_xname);
479                 m_freem(m);
480                 return;
481         }
482
483         if ((m = m_pullup(m, len)) == NULL) {
484                 CARPSTATS_INC(carps_hdrops);
485                 return;
486         }
487         ip = mtod(m, struct ip *);
488         ch = (struct carp_header *)((char *)ip + iplen);
489
490         /* verify the CARP checksum */
491         m->m_data += iplen;
492         if (in_cksum(m, len - iplen)) {
493                 CARPSTATS_INC(carps_badsum);
494                 CARP_DEBUG("%s: checksum failed on %s\n", __func__,
495                     m->m_pkthdr.rcvif->if_xname);
496                 m_freem(m);
497                 return;
498         }
499         m->m_data -= iplen;
500
501         carp_input_c(m, ch, AF_INET);
502 }
503 #endif
504
505 #ifdef INET6
506 int
507 carp6_input(struct mbuf **mp, int *offp, int proto)
508 {
509         struct mbuf *m = *mp;
510         struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
511         struct carp_header *ch;
512         u_int len;
513
514         CARPSTATS_INC(carps_ipackets6);
515
516         if (!carp_allow) {
517                 m_freem(m);
518                 return (IPPROTO_DONE);
519         }
520
521         /* check if received on a valid carp interface */
522         if (m->m_pkthdr.rcvif->if_carp == NULL) {
523                 CARPSTATS_INC(carps_badif);
524                 CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
525                     __func__, m->m_pkthdr.rcvif->if_xname);
526                 m_freem(m);
527                 return (IPPROTO_DONE);
528         }
529
530         /* verify that the IP TTL is 255 */
531         if (ip6->ip6_hlim != CARP_DFLTTL) {
532                 CARPSTATS_INC(carps_badttl);
533                 CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
534                     ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
535                 m_freem(m);
536                 return (IPPROTO_DONE);
537         }
538
539         /* verify that we have a complete carp packet */
540         len = m->m_len;
541         IP6_EXTHDR_GET(ch, struct carp_header *, m, *offp, sizeof(*ch));
542         if (ch == NULL) {
543                 CARPSTATS_INC(carps_badlen);
544                 CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
545                 return (IPPROTO_DONE);
546         }
547
548
549         /* verify the CARP checksum */
550         m->m_data += *offp;
551         if (in_cksum(m, sizeof(*ch))) {
552                 CARPSTATS_INC(carps_badsum);
553                 CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
554                     m->m_pkthdr.rcvif->if_xname);
555                 m_freem(m);
556                 return (IPPROTO_DONE);
557         }
558         m->m_data -= *offp;
559
560         carp_input_c(m, ch, AF_INET6);
561         return (IPPROTO_DONE);
562 }
563 #endif /* INET6 */
564
565 static void
566 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
567 {
568         struct ifnet *ifp = m->m_pkthdr.rcvif;
569         struct ifaddr *ifa;
570         struct carp_softc *sc;
571         uint64_t tmp_counter;
572         struct timeval sc_tv, ch_tv;
573
574         /* verify that the VHID is valid on the receiving interface */
575         IF_ADDR_RLOCK(ifp);
576         IFNET_FOREACH_IFA(ifp, ifa)
577                 if (ifa->ifa_addr->sa_family == af &&
578                     ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
579                         ifa_ref(ifa);
580                         break;
581                 }
582         IF_ADDR_RUNLOCK(ifp);
583
584         if (ifa == NULL) {
585                 CARPSTATS_INC(carps_badvhid);
586                 m_freem(m);
587                 return;
588         }
589
590         /* verify the CARP version. */
591         if (ch->carp_version != CARP_VERSION) {
592                 CARPSTATS_INC(carps_badver);
593                 CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
594                     ch->carp_version);
595                 ifa_free(ifa);
596                 m_freem(m);
597                 return;
598         }
599
600         sc = ifa->ifa_carp;
601         CARP_LOCK(sc);
602         ifa_free(ifa);
603
604         if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
605                 CARPSTATS_INC(carps_badauth);
606                 CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
607                     sc->sc_vhid, ifp->if_xname);
608                 goto out;
609         }
610
611         tmp_counter = ntohl(ch->carp_counter[0]);
612         tmp_counter = tmp_counter<<32;
613         tmp_counter += ntohl(ch->carp_counter[1]);
614
615         /* XXX Replay protection goes here */
616
617         sc->sc_init_counter = 0;
618         sc->sc_counter = tmp_counter;
619
620         sc_tv.tv_sec = sc->sc_advbase;
621         sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
622         ch_tv.tv_sec = ch->carp_advbase;
623         ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
624
625         switch (sc->sc_state) {
626         case INIT:
627                 break;
628         case MASTER:
629                 /*
630                  * If we receive an advertisement from a master who's going to
631                  * be more frequent than us, go into BACKUP state.
632                  */
633                 if (timevalcmp(&sc_tv, &ch_tv, >) ||
634                     timevalcmp(&sc_tv, &ch_tv, ==)) {
635                         callout_stop(&sc->sc_ad_tmo);
636                         CARP_LOG("VHID %u@%s: MASTER -> BACKUP "
637                             "(more frequent advertisement received)\n",
638                             sc->sc_vhid,
639                             sc->sc_carpdev->if_xname);
640                         carp_set_state(sc, BACKUP);
641                         carp_setrun(sc, 0);
642                         carp_delroute(sc);
643                 }
644                 break;
645         case BACKUP:
646                 /*
647                  * If we're pre-empting masters who advertise slower than us,
648                  * and this one claims to be slower, treat him as down.
649                  */
650                 if (carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
651                         CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
652                             "(preempting a slower master)\n",
653                             sc->sc_vhid,
654                             sc->sc_carpdev->if_xname);
655                         carp_master_down_locked(sc);
656                         break;
657                 }
658
659                 /*
660                  *  If the master is going to advertise at such a low frequency
661                  *  that he's guaranteed to time out, we'd might as well just
662                  *  treat him as timed out now.
663                  */
664                 sc_tv.tv_sec = sc->sc_advbase * 3;
665                 if (timevalcmp(&sc_tv, &ch_tv, <)) {
666                         CARP_LOG("VHID %u@%s: BACKUP -> MASTER "
667                             "(master timed out)\n",
668                             sc->sc_vhid,
669                             sc->sc_carpdev->if_xname);
670                         carp_master_down_locked(sc);
671                         break;
672                 }
673
674                 /*
675                  * Otherwise, we reset the counter and wait for the next
676                  * advertisement.
677                  */
678                 carp_setrun(sc, af);
679                 break;
680         }
681
682 out:
683         CARP_UNLOCK(sc);
684         m_freem(m);
685 }
686
687 static int
688 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
689 {
690         struct m_tag *mtag;
691
692         if (sc->sc_init_counter) {
693                 /* this could also be seconds since unix epoch */
694                 sc->sc_counter = arc4random();
695                 sc->sc_counter = sc->sc_counter << 32;
696                 sc->sc_counter += arc4random();
697         } else
698                 sc->sc_counter++;
699
700         ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
701         ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
702
703         carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
704
705         /* Tag packet for carp_output */
706         if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
707             M_NOWAIT)) == NULL) {
708                 m_freem(m);
709                 CARPSTATS_INC(carps_onomem);
710                 return (ENOMEM);
711         }
712         bcopy(&sc, mtag + 1, sizeof(sc));
713         m_tag_prepend(m, mtag);
714
715         return (0);
716 }
717
718 /*
719  * To avoid LORs and possible recursions this function shouldn't
720  * be called directly, but scheduled via taskqueue.
721  */
722 static void
723 carp_send_ad_all(void *ctx __unused, int pending __unused)
724 {
725         struct carp_softc *sc;
726
727         mtx_lock(&carp_mtx);
728         LIST_FOREACH(sc, &carp_list, sc_next)
729                 if (sc->sc_state == MASTER) {
730                         CARP_LOCK(sc);
731                         CURVNET_SET(sc->sc_carpdev->if_vnet);
732                         carp_send_ad_locked(sc);
733                         CURVNET_RESTORE();
734                         CARP_UNLOCK(sc);
735                 }
736         mtx_unlock(&carp_mtx);
737 }
738
739 /* Send a periodic advertisement, executed in callout context. */
740 static void
741 carp_send_ad(void *v)
742 {
743         struct carp_softc *sc = v;
744
745         CARP_LOCK_ASSERT(sc);
746         CURVNET_SET(sc->sc_carpdev->if_vnet);
747         carp_send_ad_locked(sc);
748         CURVNET_RESTORE();
749         CARP_UNLOCK(sc);
750 }
751
752 static void
753 carp_send_ad_locked(struct carp_softc *sc)
754 {
755         struct carp_header ch;
756         struct timeval tv;
757         struct sockaddr sa;
758         struct ifaddr *ifa;
759         struct carp_header *ch_ptr;
760         struct mbuf *m;
761         int len, advskew;
762
763         CARP_LOCK_ASSERT(sc);
764
765         advskew = DEMOTE_ADVSKEW(sc);
766         tv.tv_sec = sc->sc_advbase;
767         tv.tv_usec = advskew * 1000000 / 256;
768
769         ch.carp_version = CARP_VERSION;
770         ch.carp_type = CARP_ADVERTISEMENT;
771         ch.carp_vhid = sc->sc_vhid;
772         ch.carp_advbase = sc->sc_advbase;
773         ch.carp_advskew = advskew;
774         ch.carp_authlen = 7;    /* XXX DEFINE */
775         ch.carp_pad1 = 0;       /* must be zero */
776         ch.carp_cksum = 0;
777
778         /* XXXGL: OpenBSD picks first ifaddr with needed family. */
779
780 #ifdef INET
781         if (sc->sc_naddrs) {
782                 struct ip *ip;
783
784                 m = m_gethdr(M_NOWAIT, MT_DATA);
785                 if (m == NULL) {
786                         CARPSTATS_INC(carps_onomem);
787                         goto resched;
788                 }
789                 len = sizeof(*ip) + sizeof(ch);
790                 m->m_pkthdr.len = len;
791                 m->m_pkthdr.rcvif = NULL;
792                 m->m_len = len;
793                 MH_ALIGN(m, m->m_len);
794                 m->m_flags |= M_MCAST;
795                 ip = mtod(m, struct ip *);
796                 ip->ip_v = IPVERSION;
797                 ip->ip_hl = sizeof(*ip) >> 2;
798                 ip->ip_tos = IPTOS_LOWDELAY;
799                 ip->ip_len = htons(len);
800                 ip->ip_id = ip_newid();
801                 ip->ip_off = htons(IP_DF);
802                 ip->ip_ttl = CARP_DFLTTL;
803                 ip->ip_p = IPPROTO_CARP;
804                 ip->ip_sum = 0;
805
806                 bzero(&sa, sizeof(sa));
807                 sa.sa_family = AF_INET;
808                 ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
809                 if (ifa != NULL) {
810                         ip->ip_src.s_addr =
811                             ifatoia(ifa)->ia_addr.sin_addr.s_addr;
812                         ifa_free(ifa);
813                 } else
814                         ip->ip_src.s_addr = 0;
815                 ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
816
817                 ch_ptr = (struct carp_header *)(&ip[1]);
818                 bcopy(&ch, ch_ptr, sizeof(ch));
819                 if (carp_prepare_ad(m, sc, ch_ptr))
820                         goto resched;
821
822                 m->m_data += sizeof(*ip);
823                 ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
824                 m->m_data -= sizeof(*ip);
825
826                 CARPSTATS_INC(carps_opackets);
827
828                 if (ip_output(m, NULL, NULL, IP_RAWOUTPUT,
829                     &sc->sc_carpdev->if_carp->cif_imo, NULL)) {
830                         if (sc->sc_sendad_errors < INT_MAX)
831                                 sc->sc_sendad_errors++;
832                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS)
833                                 carp_demote_adj(carp_senderr_adj, "send error");
834                         sc->sc_sendad_success = 0;
835                 } else {
836                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
837                                 if (++sc->sc_sendad_success >=
838                                     CARP_SENDAD_MIN_SUCCESS) {
839                                         carp_demote_adj(-carp_senderr_adj,
840                                             "send ok");
841                                         sc->sc_sendad_errors = 0;
842                                 }
843                         } else
844                                 sc->sc_sendad_errors = 0;
845                 }
846         }
847 #endif /* INET */
848 #ifdef INET6
849         if (sc->sc_naddrs6) {
850                 struct ip6_hdr *ip6;
851
852                 m = m_gethdr(M_NOWAIT, MT_DATA);
853                 if (m == NULL) {
854                         CARPSTATS_INC(carps_onomem);
855                         goto resched;
856                 }
857                 len = sizeof(*ip6) + sizeof(ch);
858                 m->m_pkthdr.len = len;
859                 m->m_pkthdr.rcvif = NULL;
860                 m->m_len = len;
861                 MH_ALIGN(m, m->m_len);
862                 m->m_flags |= M_MCAST;
863                 ip6 = mtod(m, struct ip6_hdr *);
864                 bzero(ip6, sizeof(*ip6));
865                 ip6->ip6_vfc |= IPV6_VERSION;
866                 ip6->ip6_hlim = CARP_DFLTTL;
867                 ip6->ip6_nxt = IPPROTO_CARP;
868                 bzero(&sa, sizeof(sa));
869
870                 /* set the source address */
871                 sa.sa_family = AF_INET6;
872                 ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
873                 if (ifa != NULL) {
874                         bcopy(IFA_IN6(ifa), &ip6->ip6_src,
875                             sizeof(struct in6_addr));
876                         ifa_free(ifa);
877                 } else
878                         /* This should never happen with IPv6. */
879                         bzero(&ip6->ip6_src, sizeof(struct in6_addr));
880
881                 /* Set the multicast destination. */
882                 ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
883                 ip6->ip6_dst.s6_addr8[15] = 0x12;
884                 if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
885                         m_freem(m);
886                         CARP_DEBUG("%s: in6_setscope failed\n", __func__);
887                         goto resched;
888                 }
889
890                 ch_ptr = (struct carp_header *)(&ip6[1]);
891                 bcopy(&ch, ch_ptr, sizeof(ch));
892                 if (carp_prepare_ad(m, sc, ch_ptr))
893                         goto resched;
894
895                 m->m_data += sizeof(*ip6);
896                 ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
897                 m->m_data -= sizeof(*ip6);
898
899                 CARPSTATS_INC(carps_opackets6);
900
901                 if (ip6_output(m, NULL, NULL, 0,
902                     &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL)) {
903                         if (sc->sc_sendad_errors < INT_MAX)
904                                 sc->sc_sendad_errors++;
905                         if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS)
906                                 carp_demote_adj(carp_senderr_adj,
907                                     "send6 error");
908                         sc->sc_sendad_success = 0;
909                 } else {
910                         if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
911                                 if (++sc->sc_sendad_success >=
912                                     CARP_SENDAD_MIN_SUCCESS) {
913                                         carp_demote_adj(-carp_senderr_adj,
914                                             "send6 ok");
915                                         sc->sc_sendad_errors = 0;
916                                 }
917                         } else
918                                 sc->sc_sendad_errors = 0;
919                 }
920         }
921 #endif /* INET6 */
922
923 resched:
924         callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
925 }
926
927 static void
928 carp_addroute(struct carp_softc *sc)
929 {
930         struct ifaddr *ifa;
931
932         CARP_FOREACH_IFA(sc, ifa)
933                 carp_ifa_addroute(ifa);
934 }
935
/*
 * Add the routes for one address, depending on its address family:
 *   AF_INET:  in_addprefix() followed by the loopback route;
 *   AF_INET6: the loopback route followed by in6_ifaddloop().
 * Any other family (or one that is not compiled in) is left untouched.
 */
static void
carp_ifa_addroute(struct ifaddr *ifa)
{

#ifdef INET
	if (ifa->ifa_addr->sa_family == AF_INET) {
		in_addprefix(ifatoia(ifa), RTF_UP);
		ifa_add_loopback_route(ifa,
		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
		return;
	}
#endif
#ifdef INET6
	if (ifa->ifa_addr->sa_family == AF_INET6) {
		ifa_add_loopback_route(ifa,
		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
		in6_ifaddloop(ifa);
		return;
	}
#endif
}
957
958 static void
959 carp_delroute(struct carp_softc *sc)
960 {
961         struct ifaddr *ifa;
962
963         CARP_FOREACH_IFA(sc, ifa)
964                 carp_ifa_delroute(ifa);
965 }
966
/*
 * Remove the routes for one address, depending on its address family:
 *   AF_INET:  the loopback route, then in_scrubprefix();
 *   AF_INET6: the loopback route, then in6_ifremloop().
 * Any other family (or one that is not compiled in) is left untouched.
 */
static void
carp_ifa_delroute(struct ifaddr *ifa)
{

#ifdef INET
	if (ifa->ifa_addr->sa_family == AF_INET) {
		ifa_del_loopback_route(ifa,
		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
		in_scrubprefix(ifatoia(ifa), LLE_STATIC);
		return;
	}
#endif
#ifdef INET6
	if (ifa->ifa_addr->sa_family == AF_INET6) {
		ifa_del_loopback_route(ifa,
		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
		in6_ifremloop(ifa);
		return;
	}
#endif
}
988
989 int
990 carp_master(struct ifaddr *ifa)
991 {
992         struct carp_softc *sc = ifa->ifa_carp;
993
994         return (sc->sc_state == MASTER);
995 }
996
997 #ifdef INET
998 /*
999  * Broadcast a gratuitous ARP request containing
1000  * the virtual router MAC address for each IP address
1001  * associated with the virtual router.
1002  */
1003 static void
1004 carp_send_arp(struct carp_softc *sc)
1005 {
1006         struct ifaddr *ifa;
1007
1008         CARP_FOREACH_IFA(sc, ifa)
1009                 if (ifa->ifa_addr->sa_family == AF_INET)
1010                         arp_ifinit2(sc->sc_carpdev, ifa, LLADDR(&sc->sc_addr));
1011 }
1012
1013 int
1014 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
1015 {
1016         struct carp_softc *sc = ifa->ifa_carp;
1017
1018         if (sc->sc_state == MASTER) {
1019                 *enaddr = LLADDR(&sc->sc_addr);
1020                 return (1);
1021         }
1022
1023         return (0);
1024 }
1025 #endif
1026
1027 #ifdef INET6
1028 static void
1029 carp_send_na(struct carp_softc *sc)
1030 {
1031         static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1032         struct ifaddr *ifa;
1033         struct in6_addr *in6;
1034
1035         CARP_FOREACH_IFA(sc, ifa) {
1036                 if (ifa->ifa_addr->sa_family != AF_INET6)
1037                         continue;
1038
1039                 in6 = IFA_IN6(ifa);
1040                 nd6_na_output(sc->sc_carpdev, &mcast, in6,
1041                     ND_NA_FLAG_OVERRIDE, 1, NULL);
1042                 DELAY(1000);    /* XXX */
1043         }
1044 }
1045
1046 /*
1047  * Returns ifa in case it's a carp address and it is MASTER, or if the address
1048  * matches and is not a carp address.  Returns NULL otherwise.
1049  */
1050 struct ifaddr *
1051 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
1052 {
1053         struct ifaddr *ifa;
1054
1055         ifa = NULL;
1056         IF_ADDR_RLOCK(ifp);
1057         TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1058                 if (ifa->ifa_addr->sa_family != AF_INET6)
1059                         continue;
1060                 if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
1061                         continue;
1062                 if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
1063                         ifa = NULL;
1064                 else
1065                         ifa_ref(ifa);
1066                 break;
1067         }
1068         IF_ADDR_RUNLOCK(ifp);
1069
1070         return (ifa);
1071 }
1072
1073 caddr_t
1074 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
1075 {
1076         struct ifaddr *ifa;
1077
1078         IF_ADDR_RLOCK(ifp);
1079         IFNET_FOREACH_IFA(ifp, ifa)
1080                 if (ifa->ifa_addr->sa_family == AF_INET6 &&
1081                     IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
1082                         struct carp_softc *sc = ifa->ifa_carp;
1083                         struct m_tag *mtag;
1084
1085                         IF_ADDR_RUNLOCK(ifp);
1086
1087                         mtag = m_tag_get(PACKET_TAG_CARP,
1088                             sizeof(struct carp_softc *), M_NOWAIT);
1089                         if (mtag == NULL)
1090                                 /* Better a bit than nothing. */
1091                                 return (LLADDR(&sc->sc_addr));
1092
1093                         bcopy(&sc, mtag + 1, sizeof(sc));
1094                         m_tag_prepend(m, mtag);
1095
1096                         return (LLADDR(&sc->sc_addr));
1097                 }
1098         IF_ADDR_RUNLOCK(ifp);
1099
1100         return (NULL);
1101 }
1102 #endif /* INET6 */
1103
1104 int
1105 carp_forus(struct ifnet *ifp, u_char *dhost)
1106 {
1107         struct carp_softc *sc;
1108         uint8_t *ena = dhost;
1109
1110         if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
1111                 return (0);
1112
1113         CIF_LOCK(ifp->if_carp);
1114         IFNET_FOREACH_CARP(ifp, sc) {
1115                 CARP_LOCK(sc);
1116                 if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
1117                     ETHER_ADDR_LEN)) {
1118                         CARP_UNLOCK(sc);
1119                         CIF_UNLOCK(ifp->if_carp);
1120                         return (1);
1121                 }
1122                 CARP_UNLOCK(sc);
1123         }
1124         CIF_UNLOCK(ifp->if_carp);
1125
1126         return (0);
1127 }
1128
1129 /* Master down timeout event, executed in callout context. */
1130 static void
1131 carp_master_down(void *v)
1132 {
1133         struct carp_softc *sc = v;
1134
1135         CARP_LOCK_ASSERT(sc);
1136
1137         CURVNET_SET(sc->sc_carpdev->if_vnet);
1138         if (sc->sc_state == BACKUP) {
1139                 CARP_LOG("VHID %u@%s: BACKUP -> MASTER (master down)\n",
1140                     sc->sc_vhid,
1141                     sc->sc_carpdev->if_xname);
1142                 carp_master_down_locked(sc);
1143         }
1144         CURVNET_RESTORE();
1145
1146         CARP_UNLOCK(sc);
1147 }
1148
1149 static void
1150 carp_master_down_locked(struct carp_softc *sc)
1151 {
1152
1153         CARP_LOCK_ASSERT(sc);
1154
1155         switch (sc->sc_state) {
1156         case BACKUP:
1157                 carp_set_state(sc, MASTER);
1158                 carp_send_ad_locked(sc);
1159 #ifdef INET
1160                 carp_send_arp(sc);
1161 #endif
1162 #ifdef INET6
1163                 carp_send_na(sc);
1164 #endif
1165                 carp_setrun(sc, 0);
1166                 carp_addroute(sc);
1167                 break;
1168         case INIT:
1169         case MASTER:
1170 #ifdef INVARIANTS
1171                 panic("carp: VHID %u@%s: master_down event in %s state\n",
1172                     sc->sc_vhid,
1173                     sc->sc_carpdev->if_xname,
1174                     sc->sc_state ? "MASTER" : "INIT");
1175 #endif
1176                 break;
1177         }
1178 }
1179
1180 /*
1181  * When in backup state, af indicates whether to reset the master down timer
1182  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
1183  */
1184 static void
1185 carp_setrun(struct carp_softc *sc, sa_family_t af)
1186 {
1187         struct timeval tv;
1188
1189         CARP_LOCK_ASSERT(sc);
1190
1191         if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
1192             sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
1193             (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0))
1194                 return;
1195
1196         switch (sc->sc_state) {
1197         case INIT:
1198                 CARP_LOG("VHID %u@%s: INIT -> BACKUP\n",
1199                     sc->sc_vhid,
1200                     sc->sc_carpdev->if_xname);
1201                 carp_set_state(sc, BACKUP);
1202                 carp_setrun(sc, 0);
1203                 break;
1204         case BACKUP:
1205                 callout_stop(&sc->sc_ad_tmo);
1206                 tv.tv_sec = 3 * sc->sc_advbase;
1207                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1208                 switch (af) {
1209 #ifdef INET
1210                 case AF_INET:
1211                         callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1212                             carp_master_down, sc);
1213                         break;
1214 #endif
1215 #ifdef INET6
1216                 case AF_INET6:
1217                         callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1218                             carp_master_down, sc);
1219                         break;
1220 #endif
1221                 default:
1222 #ifdef INET
1223                         if (sc->sc_naddrs)
1224                                 callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
1225                                     carp_master_down, sc);
1226 #endif
1227 #ifdef INET6
1228                         if (sc->sc_naddrs6)
1229                                 callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
1230                                     carp_master_down, sc);
1231 #endif
1232                         break;
1233                 }
1234                 break;
1235         case MASTER:
1236                 tv.tv_sec = sc->sc_advbase;
1237                 tv.tv_usec = sc->sc_advskew * 1000000 / 256;
1238                 callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
1239                     carp_send_ad, sc);
1240                 break;
1241         }
1242 }
1243
1244 /*
1245  * Setup multicast structures.
1246  */
1247 static int
1248 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
1249 {
1250         struct ifnet *ifp = cif->cif_ifp;
1251         int error = 0;
1252
1253         CIF_LOCK_ASSERT(cif);
1254
1255         switch (sa) {
1256 #ifdef INET
1257         case AF_INET:
1258             {
1259                 struct ip_moptions *imo = &cif->cif_imo;
1260                 struct in_addr addr;
1261
1262                 if (imo->imo_membership)
1263                         return (0);
1264
1265                 imo->imo_membership = (struct in_multi **)malloc(
1266                     (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
1267                     M_NOWAIT);
1268                 if (imo->imo_membership == NULL)
1269                         return (ENOMEM);
1270                 imo->imo_mfilters = NULL;
1271                 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1272                 imo->imo_multicast_vif = -1;
1273
1274                 addr.s_addr = htonl(INADDR_CARP_GROUP);
1275                 if ((error = in_joingroup(ifp, &addr, NULL,
1276                     &imo->imo_membership[0])) != 0) {
1277                         free(imo->imo_membership, M_CARP);
1278                         break;
1279                 }
1280                 imo->imo_num_memberships++;
1281                 imo->imo_multicast_ifp = ifp;
1282                 imo->imo_multicast_ttl = CARP_DFLTTL;
1283                 imo->imo_multicast_loop = 0;
1284                 break;
1285            }
1286 #endif
1287 #ifdef INET6
1288         case AF_INET6:
1289             {
1290                 struct ip6_moptions *im6o = &cif->cif_im6o;
1291                 struct in6_addr in6;
1292                 struct in6_multi *in6m;
1293
1294                 if (im6o->im6o_membership)
1295                         return (0);
1296
1297                 im6o->im6o_membership = (struct in6_multi **)malloc(
1298                     (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
1299                     M_ZERO | M_NOWAIT);
1300                 if (im6o->im6o_membership == NULL)
1301                         return (ENOMEM);
1302                 im6o->im6o_mfilters = NULL;
1303                 im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
1304                 im6o->im6o_multicast_hlim = CARP_DFLTTL;
1305                 im6o->im6o_multicast_ifp = ifp;
1306
1307                 /* Join IPv6 CARP multicast group. */
1308                 bzero(&in6, sizeof(in6));
1309                 in6.s6_addr16[0] = htons(0xff02);
1310                 in6.s6_addr8[15] = 0x12;
1311                 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1312                         free(im6o->im6o_membership, M_CARP);
1313                         break;
1314                 }
1315                 in6m = NULL;
1316                 if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1317                         free(im6o->im6o_membership, M_CARP);
1318                         break;
1319                 }
1320                 im6o->im6o_membership[0] = in6m;
1321                 im6o->im6o_num_memberships++;
1322
1323                 /* Join solicited multicast address. */
1324                 bzero(&in6, sizeof(in6));
1325                 in6.s6_addr16[0] = htons(0xff02);
1326                 in6.s6_addr32[1] = 0;
1327                 in6.s6_addr32[2] = htonl(1);
1328                 in6.s6_addr32[3] = 0;
1329                 in6.s6_addr8[12] = 0xff;
1330                 if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1331                         in6_mc_leave(im6o->im6o_membership[0], NULL);
1332                         free(im6o->im6o_membership, M_CARP);
1333                         break;
1334                 }
1335                 in6m = NULL;
1336                 if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1337                         in6_mc_leave(im6o->im6o_membership[0], NULL);
1338                         free(im6o->im6o_membership, M_CARP);
1339                         break;
1340                 }
1341                 im6o->im6o_membership[1] = in6m;
1342                 im6o->im6o_num_memberships++;
1343                 break;
1344             }
1345 #endif
1346         }
1347
1348         return (error);
1349 }
1350
1351 /*
1352  * Free multicast structures.
1353  */
1354 static void
1355 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
1356 {
1357
1358         CIF_LOCK_ASSERT(cif);
1359         switch (sa) {
1360 #ifdef INET
1361         case AF_INET:
1362                 if (cif->cif_naddrs == 0) {
1363                         struct ip_moptions *imo = &cif->cif_imo;
1364
1365                         in_leavegroup(imo->imo_membership[0], NULL);
1366                         KASSERT(imo->imo_mfilters == NULL,
1367                             ("%s: imo_mfilters != NULL", __func__));
1368                         free(imo->imo_membership, M_CARP);
1369                         imo->imo_membership = NULL;
1370
1371                 }
1372                 break;
1373 #endif
1374 #ifdef INET6
1375         case AF_INET6:
1376                 if (cif->cif_naddrs6 == 0) {
1377                         struct ip6_moptions *im6o = &cif->cif_im6o;
1378
1379                         in6_mc_leave(im6o->im6o_membership[0], NULL);
1380                         in6_mc_leave(im6o->im6o_membership[1], NULL);
1381                         KASSERT(im6o->im6o_mfilters == NULL,
1382                             ("%s: im6o_mfilters != NULL", __func__));
1383                         free(im6o->im6o_membership, M_CARP);
1384                         im6o->im6o_membership = NULL;
1385                 }
1386                 break;
1387 #endif
1388         }
1389 }
1390
1391 int
1392 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
1393 {
1394         struct m_tag *mtag;
1395         struct carp_softc *sc;
1396
1397         if (!sa)
1398                 return (0);
1399
1400         switch (sa->sa_family) {
1401 #ifdef INET
1402         case AF_INET:
1403                 break;
1404 #endif
1405 #ifdef INET6
1406         case AF_INET6:
1407                 break;
1408 #endif
1409         default:
1410                 return (0);
1411         }
1412
1413         mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
1414         if (mtag == NULL)
1415                 return (0);
1416
1417         bcopy(mtag + 1, &sc, sizeof(sc));
1418
1419         /* Set the source MAC address to the Virtual Router MAC Address. */
1420         switch (ifp->if_type) {
1421         case IFT_ETHER:
1422         case IFT_BRIDGE:
1423         case IFT_L2VLAN: {
1424                         struct ether_header *eh;
1425
1426                         eh = mtod(m, struct ether_header *);
1427                         eh->ether_shost[0] = 0;
1428                         eh->ether_shost[1] = 0;
1429                         eh->ether_shost[2] = 0x5e;
1430                         eh->ether_shost[3] = 0;
1431                         eh->ether_shost[4] = 1;
1432                         eh->ether_shost[5] = sc->sc_vhid;
1433                 }
1434                 break;
1435         case IFT_FDDI: {
1436                         struct fddi_header *fh;
1437
1438                         fh = mtod(m, struct fddi_header *);
1439                         fh->fddi_shost[0] = 0;
1440                         fh->fddi_shost[1] = 0;
1441                         fh->fddi_shost[2] = 0x5e;
1442                         fh->fddi_shost[3] = 0;
1443                         fh->fddi_shost[4] = 1;
1444                         fh->fddi_shost[5] = sc->sc_vhid;
1445                 }
1446                 break;
1447         case IFT_ISO88025: {
1448                         struct iso88025_header *th;
1449                         th = mtod(m, struct iso88025_header *);
1450                         th->iso88025_shost[0] = 3;
1451                         th->iso88025_shost[1] = 0;
1452                         th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
1453                         th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
1454                         th->iso88025_shost[4] = 0;
1455                         th->iso88025_shost[5] = 0;
1456                 }
1457                 break;
1458         default:
1459                 printf("%s: carp is not supported for the %d interface type\n",
1460                     ifp->if_xname, ifp->if_type);
1461                 return (EOPNOTSUPP);
1462         }
1463
1464         return (0);
1465 }
1466
1467 static struct carp_softc*
1468 carp_alloc(struct ifnet *ifp)
1469 {
1470         struct carp_softc *sc;
1471         struct carp_if *cif;
1472
1473         if ((cif = ifp->if_carp) == NULL) {
1474                 cif = carp_alloc_if(ifp);
1475                 if (cif == NULL)
1476                         return (NULL);
1477         }
1478
1479         sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
1480
1481         sc->sc_advbase = CARP_DFLTINTV;
1482         sc->sc_vhid = -1;       /* required setting */
1483         sc->sc_init_counter = 1;
1484         sc->sc_state = INIT;
1485
1486         sc->sc_ifasiz = sizeof(struct ifaddr *);
1487         sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
1488         sc->sc_carpdev = ifp;
1489
1490         CARP_LOCK_INIT(sc);
1491 #ifdef INET
1492         callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1493 #endif
1494 #ifdef INET6
1495         callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1496 #endif
1497         callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1498
1499         CIF_LOCK(cif);
1500         TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
1501         CIF_UNLOCK(cif);
1502
1503         mtx_lock(&carp_mtx);
1504         LIST_INSERT_HEAD(&carp_list, sc, sc_next);
1505         mtx_unlock(&carp_mtx);
1506
1507         return (sc);
1508 }
1509
1510 static int
1511 carp_grow_ifas(struct carp_softc *sc)
1512 {
1513         struct ifaddr **new;
1514
1515         CARP_LOCK_ASSERT(sc);
1516
1517         new = malloc(sc->sc_ifasiz * 2, M_CARP, M_NOWAIT|M_ZERO);
1518         if (new == NULL)
1519                 return (ENOMEM);
1520         bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
1521         free(sc->sc_ifas, M_CARP);
1522         sc->sc_ifas = new;
1523         sc->sc_ifasiz *= 2;
1524
1525         return (0);
1526 }
1527
1528 static void
1529 carp_destroy(struct carp_softc *sc)
1530 {
1531         struct ifnet *ifp = sc->sc_carpdev;
1532         struct carp_if *cif = ifp->if_carp;
1533
1534         CIF_LOCK_ASSERT(cif);
1535
1536         TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
1537
1538         mtx_lock(&carp_mtx);
1539         LIST_REMOVE(sc, sc_next);
1540         mtx_unlock(&carp_mtx);
1541
1542         CARP_LOCK(sc);
1543         if (sc->sc_suppress)
1544                 carp_demote_adj(-carp_ifdown_adj, "vhid removed");
1545         callout_drain(&sc->sc_ad_tmo);
1546 #ifdef INET
1547         callout_drain(&sc->sc_md_tmo);
1548 #endif
1549 #ifdef INET6
1550         callout_drain(&sc->sc_md6_tmo);
1551 #endif
1552         CARP_LOCK_DESTROY(sc);
1553
1554         free(sc->sc_ifas, M_CARP);
1555         free(sc, M_CARP);
1556 }
1557
1558 static struct carp_if*
1559 carp_alloc_if(struct ifnet *ifp)
1560 {
1561         struct carp_if *cif;
1562
1563         cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
1564
1565         if (ifpromisc(ifp, 1) != 0)
1566                 goto cleanup;
1567
1568         CIF_LOCK_INIT(cif);
1569         cif->cif_ifp = ifp;
1570         TAILQ_INIT(&cif->cif_vrs);
1571
1572         IF_ADDR_WLOCK(ifp);
1573         ifp->if_carp = cif;
1574         if_ref(ifp);
1575         IF_ADDR_WUNLOCK(ifp);
1576
1577         return (cif);
1578
1579 cleanup:
1580         free(cif, M_CARP);
1581
1582         return (NULL);
1583 }
1584
1585 static void
1586 carp_free_if(struct carp_if *cif)
1587 {
1588         struct ifnet *ifp = cif->cif_ifp;
1589
1590         CIF_LOCK_ASSERT(cif);
1591         KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
1592             __func__));
1593
1594         IF_ADDR_WLOCK(ifp);
1595         ifp->if_carp = NULL;
1596         if_rele(ifp);
1597         IF_ADDR_WUNLOCK(ifp);
1598
1599         CIF_LOCK_DESTROY(cif);
1600
1601         ifpromisc(ifp, 0);
1602
1603         free(cif, M_CARP);
1604 }
1605
1606 static void
1607 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
1608 {
1609
1610         CARP_LOCK(sc);
1611         carpr->carpr_state = sc->sc_state;
1612         carpr->carpr_vhid = sc->sc_vhid;
1613         carpr->carpr_advbase = sc->sc_advbase;
1614         carpr->carpr_advskew = sc->sc_advskew;
1615         if (priv)
1616                 bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
1617         else
1618                 bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
1619         CARP_UNLOCK(sc);
1620 }
1621
1622 int
1623 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
1624 {
1625         struct carpreq carpr;
1626         struct ifnet *ifp;
1627         struct carp_softc *sc = NULL;
1628         int error = 0, locked = 0;
1629
1630         if ((error = copyin(ifr->ifr_data, &carpr, sizeof carpr)))
1631                 return (error);
1632
1633         ifp = ifunit_ref(ifr->ifr_name);
1634         if (ifp == NULL)
1635                 return (ENXIO);
1636
1637         switch (ifp->if_type) {
1638         case IFT_ETHER:
1639         case IFT_L2VLAN:
1640         case IFT_BRIDGE:
1641         case IFT_FDDI:
1642         case IFT_ISO88025:
1643                 break;
1644         default:
1645                 error = EOPNOTSUPP;
1646                 goto out;
1647         }
1648
1649         if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1650                 error = EADDRNOTAVAIL;
1651                 goto out;
1652         }
1653
1654         switch (cmd) {
1655         case SIOCSVH:
1656                 if ((error = priv_check(td, PRIV_NETINET_CARP)))
1657                         break;
1658                 if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
1659                     carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
1660                         error = EINVAL;
1661                         break;
1662                 }
1663
1664                 if (ifp->if_carp) {
1665                         CIF_LOCK(ifp->if_carp);
1666                         IFNET_FOREACH_CARP(ifp, sc)
1667                                 if (sc->sc_vhid == carpr.carpr_vhid)
1668                                         break;
1669                         CIF_UNLOCK(ifp->if_carp);
1670                 }
1671                 if (sc == NULL) {
1672                         sc = carp_alloc(ifp);
1673                         if (sc == NULL) {
1674                                 error = EINVAL; /* XXX: ifpromisc failed */
1675                                 break;
1676                         }
1677
1678                         CARP_LOCK(sc);
1679                         sc->sc_vhid = carpr.carpr_vhid;
1680                         LLADDR(&sc->sc_addr)[0] = 0;
1681                         LLADDR(&sc->sc_addr)[1] = 0;
1682                         LLADDR(&sc->sc_addr)[2] = 0x5e;
1683                         LLADDR(&sc->sc_addr)[3] = 0;
1684                         LLADDR(&sc->sc_addr)[4] = 1;
1685                         LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
1686                 } else
1687                         CARP_LOCK(sc);
1688                 locked = 1;
1689                 if (carpr.carpr_advbase > 0) {
1690                         if (carpr.carpr_advbase > 255 ||
1691                             carpr.carpr_advbase < CARP_DFLTINTV) {
1692                                 error = EINVAL;
1693                                 break;
1694                         }
1695                         sc->sc_advbase = carpr.carpr_advbase;
1696                 }
1697                 if (carpr.carpr_advskew > 0) {
1698                         if (carpr.carpr_advskew >= 255) {
1699                                 error = EINVAL;
1700                                 break;
1701                         }
1702                         sc->sc_advskew = carpr.carpr_advskew;
1703                 }
1704                 if (carpr.carpr_key[0] != '\0') {
1705                         bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
1706                         carp_hmac_prepare(sc);
1707                 }
1708                 if (sc->sc_state != INIT &&
1709                     carpr.carpr_state != sc->sc_state) {
1710                         switch (carpr.carpr_state) {
1711                         case BACKUP:
1712                                 callout_stop(&sc->sc_ad_tmo);
1713                                 carp_set_state(sc, BACKUP);
1714                                 carp_setrun(sc, 0);
1715                                 carp_delroute(sc);
1716                                 break;
1717                         case MASTER:
1718                                 carp_master_down_locked(sc);
1719                                 break;
1720                         default:
1721                                 break;
1722                         }
1723                 }
1724                 break;
1725
1726         case SIOCGVH:
1727             {
1728                 int priveleged;
1729
1730                 if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
1731                         error = EINVAL;
1732                         break;
1733                 }
1734                 if (carpr.carpr_count < 1) {
1735                         error = EMSGSIZE;
1736                         break;
1737                 }
1738                 if (ifp->if_carp == NULL) {
1739                         error = ENOENT;
1740                         break;
1741                 }
1742
1743                 priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
1744                 if (carpr.carpr_vhid != 0) {
1745                         CIF_LOCK(ifp->if_carp);
1746                         IFNET_FOREACH_CARP(ifp, sc)
1747                                 if (sc->sc_vhid == carpr.carpr_vhid)
1748                                         break;
1749                         CIF_UNLOCK(ifp->if_carp);
1750                         if (sc == NULL) {
1751                                 error = ENOENT;
1752                                 break;
1753                         }
1754                         carp_carprcp(&carpr, sc, priveleged);
1755                         error = copyout(&carpr, ifr->ifr_data, sizeof(carpr));
1756                 } else  {
1757                         int i, count;
1758
1759                         count = 0;
1760                         CIF_LOCK(ifp->if_carp);
1761                         IFNET_FOREACH_CARP(ifp, sc)
1762                                 count++;
1763
1764                         if (count > carpr.carpr_count) {
1765                                 CIF_UNLOCK(ifp->if_carp);
1766                                 error = EMSGSIZE;
1767                                 break;
1768                         }
1769
1770                         i = 0;
1771                         IFNET_FOREACH_CARP(ifp, sc) {
1772                                 carp_carprcp(&carpr, sc, priveleged);
1773                                 carpr.carpr_count = count;
1774                                 error = copyout(&carpr, ifr->ifr_data +
1775                                     (i * sizeof(carpr)), sizeof(carpr));
1776                                 if (error) {
1777                                         CIF_UNLOCK(ifp->if_carp);
1778                                         break;
1779                                 }
1780                                 i++;
1781                         }
1782                         CIF_UNLOCK(ifp->if_carp);
1783                 }
1784                 break;
1785             }
1786         default:
1787                 error = EINVAL;
1788         }
1789
1790 out:
1791         if (locked)
1792                 CARP_UNLOCK(sc);
1793         if_rele(ifp);
1794
1795         return (error);
1796 }
1797
1798 static int
1799 carp_get_vhid(struct ifaddr *ifa)
1800 {
1801
1802         if (ifa == NULL || ifa->ifa_carp == NULL)
1803                 return (0);
1804
1805         return (ifa->ifa_carp->sc_vhid);
1806 }
1807
1808 int
1809 carp_attach(struct ifaddr *ifa, int vhid)
1810 {
1811         struct ifnet *ifp = ifa->ifa_ifp;
1812         struct carp_if *cif = ifp->if_carp;
1813         struct carp_softc *sc;
1814         int index, error;
1815
1816         if (ifp->if_carp == NULL)
1817                 return (ENOPROTOOPT);
1818
1819         switch (ifa->ifa_addr->sa_family) {
1820 #ifdef INET
1821         case AF_INET:
1822 #endif
1823 #ifdef INET6
1824         case AF_INET6:
1825 #endif
1826                 break;
1827         default:
1828                 return (EPROTOTYPE);
1829         }
1830
1831         CIF_LOCK(cif);
1832         IFNET_FOREACH_CARP(ifp, sc)
1833                 if (sc->sc_vhid == vhid)
1834                         break;
1835         if (sc == NULL) {
1836                 CIF_UNLOCK(cif);
1837                 return (ENOENT);
1838         }
1839
1840         if (ifa->ifa_carp) {
1841                 if (ifa->ifa_carp->sc_vhid != vhid)
1842                         carp_detach_locked(ifa);
1843                 else {
1844                         CIF_UNLOCK(cif);
1845                         return (0);
1846                 }
1847         }
1848
1849         error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
1850         if (error) {
1851                 CIF_FREE(cif);
1852                 return (error);
1853         }
1854
1855         CARP_LOCK(sc);
1856         index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
1857         if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
1858                 if ((error = carp_grow_ifas(sc)) != 0) {
1859                         carp_multicast_cleanup(cif,
1860                             ifa->ifa_addr->sa_family);
1861                         CARP_UNLOCK(sc);
1862                         CIF_FREE(cif);
1863                         return (error);
1864                 }
1865
1866         switch (ifa->ifa_addr->sa_family) {
1867 #ifdef INET
1868         case AF_INET:
1869                 cif->cif_naddrs++;
1870                 sc->sc_naddrs++;
1871                 break;
1872 #endif
1873 #ifdef INET6
1874         case AF_INET6:
1875                 cif->cif_naddrs6++;
1876                 sc->sc_naddrs6++;
1877                 break;
1878 #endif
1879         }
1880
1881         ifa_ref(ifa);
1882         sc->sc_ifas[index - 1] = ifa;
1883         ifa->ifa_carp = sc;
1884
1885         carp_hmac_prepare(sc);
1886         carp_sc_state(sc);
1887
1888         CARP_UNLOCK(sc);
1889         CIF_UNLOCK(cif);
1890
1891         return (0);
1892 }
1893
1894 void
1895 carp_detach(struct ifaddr *ifa)
1896 {
1897         struct ifnet *ifp = ifa->ifa_ifp;
1898         struct carp_if *cif = ifp->if_carp;
1899
1900         CIF_LOCK(cif);
1901         carp_detach_locked(ifa);
1902         CIF_FREE(cif);
1903 }
1904
1905 static void
1906 carp_detach_locked(struct ifaddr *ifa)
1907 {
1908         struct ifnet *ifp = ifa->ifa_ifp;
1909         struct carp_if *cif = ifp->if_carp;
1910         struct carp_softc *sc = ifa->ifa_carp;
1911         int i, index;
1912
1913         KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
1914
1915         CIF_LOCK_ASSERT(cif);
1916         CARP_LOCK(sc);
1917
1918         /* Shift array. */
1919         index = sc->sc_naddrs + sc->sc_naddrs6;
1920         for (i = 0; i < index; i++)
1921                 if (sc->sc_ifas[i] == ifa)
1922                         break;
1923         KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
1924         for (; i < index - 1; i++)
1925                 sc->sc_ifas[i] = sc->sc_ifas[i+1];
1926         sc->sc_ifas[index - 1] = NULL;
1927
1928         switch (ifa->ifa_addr->sa_family) {
1929 #ifdef INET
1930         case AF_INET:
1931                 cif->cif_naddrs--;
1932                 sc->sc_naddrs--;
1933                 break;
1934 #endif
1935 #ifdef INET6
1936         case AF_INET6:
1937                 cif->cif_naddrs6--;
1938                 sc->sc_naddrs6--;
1939                 break;
1940 #endif
1941         }
1942
1943         carp_ifa_delroute(ifa);
1944         carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
1945
1946         ifa->ifa_carp = NULL;
1947         ifa_free(ifa);
1948
1949         carp_hmac_prepare(sc);
1950         carp_sc_state(sc);
1951
1952         if (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) {
1953                 CARP_UNLOCK(sc);
1954                 carp_destroy(sc);
1955         } else
1956                 CARP_UNLOCK(sc);
1957 }
1958
1959 static void
1960 carp_set_state(struct carp_softc *sc, int state)
1961 {
1962
1963         CARP_LOCK_ASSERT(sc);
1964
1965         if (sc->sc_state != state) {
1966                 const char *carp_states[] = { CARP_STATES };
1967                 char subsys[IFNAMSIZ+5];
1968
1969                 sc->sc_state = state;
1970
1971                 snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
1972                     sc->sc_carpdev->if_xname);
1973                 devctl_notify("CARP", subsys, carp_states[state], NULL);
1974         }
1975 }
1976
1977 static void
1978 carp_linkstate(struct ifnet *ifp)
1979 {
1980         struct carp_softc *sc;
1981
1982         CIF_LOCK(ifp->if_carp);
1983         IFNET_FOREACH_CARP(ifp, sc) {
1984                 CARP_LOCK(sc);
1985                 carp_sc_state(sc);
1986                 CARP_UNLOCK(sc);
1987         }
1988         CIF_UNLOCK(ifp->if_carp);
1989 }
1990
1991 static void
1992 carp_sc_state(struct carp_softc *sc)
1993 {
1994
1995         CARP_LOCK_ASSERT(sc);
1996
1997         if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
1998             !(sc->sc_carpdev->if_flags & IFF_UP)) {
1999                 callout_stop(&sc->sc_ad_tmo);
2000 #ifdef INET
2001                 callout_stop(&sc->sc_md_tmo);
2002 #endif
2003 #ifdef INET6
2004                 callout_stop(&sc->sc_md6_tmo);
2005 #endif
2006                 carp_set_state(sc, INIT);
2007                 carp_setrun(sc, 0);
2008                 if (!sc->sc_suppress)
2009                         carp_demote_adj(carp_ifdown_adj, "interface down");
2010                 sc->sc_suppress = 1;
2011         } else {
2012                 carp_set_state(sc, INIT);
2013                 carp_setrun(sc, 0);
2014                 if (sc->sc_suppress)
2015                         carp_demote_adj(-carp_ifdown_adj, "interface up");
2016                 sc->sc_suppress = 0;
2017         }
2018 }
2019
2020 static void
2021 carp_demote_adj(int adj, char *reason)
2022 {
2023         atomic_add_int(&carp_demotion, adj);
2024         CARP_LOG("demoted by %d to %d (%s)\n", adj, carp_demotion, reason);
2025         taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
2026 }
2027
2028 static int
2029 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
2030 {
2031         int new, error;
2032
2033         new = carp_demotion;
2034         error = sysctl_handle_int(oidp, &new, 0, req);
2035         if (error || !req->newptr)
2036                 return (error);
2037
2038         carp_demote_adj(new, "sysctl");
2039
2040         return (0);
2041 }
2042
#ifdef INET
/*
 * Protocol switch entry registering IPPROTO_CARP as a SOCK_RAW protocol
 * in the inet domain.  Input is handled by carp_input(); output, socket
 * options and user requests use the rip_* handlers.
 */
extern struct domain inetdomain;
static struct protosw in_carp_protosw = {
	.pr_type =	SOCK_RAW,
	.pr_domain =	&inetdomain,
	.pr_protocol =	IPPROTO_CARP,
	.pr_flags =	PR_ATOMIC|PR_ADDR,
	.pr_input =	carp_input,
	.pr_output =	(pr_output_t *)rip_output,
	.pr_ctloutput =	rip_ctloutput,
	.pr_usrreqs =	&rip_usrreqs
};
#endif
2056
#ifdef INET6
/*
 * Protocol switch entry registering IPPROTO_CARP as a SOCK_RAW protocol
 * in the inet6 domain.  Input is handled by carp6_input(); output, socket
 * options and user requests use the rip6_* handlers.
 */
extern struct domain inet6domain;
static struct ip6protosw in6_carp_protosw = {
	.pr_type =	SOCK_RAW,
	.pr_domain =	&inet6domain,
	.pr_protocol =	IPPROTO_CARP,
	.pr_flags =	PR_ATOMIC|PR_ADDR,
	.pr_input =	carp6_input,
	.pr_output =	rip6_output,
	.pr_ctloutput =	rip6_ctloutput,
	.pr_usrreqs =	&rip6_usrreqs
};
#endif
2070
2071 static void
2072 carp_mod_cleanup(void)
2073 {
2074
2075 #ifdef INET
2076         if (proto_reg[CARP_INET] == 0) {
2077                 (void)ipproto_unregister(IPPROTO_CARP);
2078                 pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
2079                 proto_reg[CARP_INET] = -1;
2080         }
2081         carp_iamatch_p = NULL;
2082 #endif
2083 #ifdef INET6
2084         if (proto_reg[CARP_INET6] == 0) {
2085                 (void)ip6proto_unregister(IPPROTO_CARP);
2086                 pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
2087                 proto_reg[CARP_INET6] = -1;
2088         }
2089         carp_iamatch6_p = NULL;
2090         carp_macmatch6_p = NULL;
2091 #endif
2092         carp_ioctl_p = NULL;
2093         carp_attach_p = NULL;
2094         carp_detach_p = NULL;
2095         carp_get_vhid_p = NULL;
2096         carp_linkstate_p = NULL;
2097         carp_forus_p = NULL;
2098         carp_output_p = NULL;
2099         carp_demote_adj_p = NULL;
2100         carp_master_p = NULL;
2101         mtx_unlock(&carp_mtx);
2102         taskqueue_drain(taskqueue_swi, &carp_sendall_task);
2103         mtx_destroy(&carp_mtx);
2104         COUNTER_ARRAY_FREE(carpstats,
2105             sizeof(struct carpstats) / sizeof(uint64_t));
2106 }
2107
2108 static int
2109 carp_mod_load(void)
2110 {
2111         int err;
2112
2113         mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
2114         LIST_INIT(&carp_list);
2115         COUNTER_ARRAY_ALLOC(carpstats,
2116             sizeof(struct carpstats) / sizeof(uint64_t), M_WAITOK);
2117         carp_get_vhid_p = carp_get_vhid;
2118         carp_forus_p = carp_forus;
2119         carp_output_p = carp_output;
2120         carp_linkstate_p = carp_linkstate;
2121         carp_ioctl_p = carp_ioctl;
2122         carp_attach_p = carp_attach;
2123         carp_detach_p = carp_detach;
2124         carp_demote_adj_p = carp_demote_adj;
2125         carp_master_p = carp_master;
2126 #ifdef INET6
2127         carp_iamatch6_p = carp_iamatch6;
2128         carp_macmatch6_p = carp_macmatch6;
2129         proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
2130             (struct protosw *)&in6_carp_protosw);
2131         if (proto_reg[CARP_INET6]) {
2132                 printf("carp: error %d attaching to PF_INET6\n",
2133                     proto_reg[CARP_INET6]);
2134                 carp_mod_cleanup();
2135                 return (proto_reg[CARP_INET6]);
2136         }
2137         err = ip6proto_register(IPPROTO_CARP);
2138         if (err) {
2139                 printf("carp: error %d registering with INET6\n", err);
2140                 carp_mod_cleanup();
2141                 return (err);
2142         }
2143 #endif
2144 #ifdef INET
2145         carp_iamatch_p = carp_iamatch;
2146         proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
2147         if (proto_reg[CARP_INET]) {
2148                 printf("carp: error %d attaching to PF_INET\n",
2149                     proto_reg[CARP_INET]);
2150                 carp_mod_cleanup();
2151                 return (proto_reg[CARP_INET]);
2152         }
2153         err = ipproto_register(IPPROTO_CARP);
2154         if (err) {
2155                 printf("carp: error %d registering with INET\n", err);
2156                 carp_mod_cleanup();
2157                 return (err);
2158         }
2159 #endif
2160         return (0);
2161 }
2162
2163 static int
2164 carp_modevent(module_t mod, int type, void *data)
2165 {
2166         switch (type) {
2167         case MOD_LOAD:
2168                 return carp_mod_load();
2169                 /* NOTREACHED */
2170         case MOD_UNLOAD:
2171                 mtx_lock(&carp_mtx);
2172                 if (LIST_EMPTY(&carp_list))
2173                         carp_mod_cleanup();
2174                 else {
2175                         mtx_unlock(&carp_mtx);
2176                         return (EBUSY);
2177                 }
2178                 break;
2179
2180         default:
2181                 return (EINVAL);
2182         }
2183
2184         return (0);
2185 }
2186
2187 static moduledata_t carp_mod = {
2188         "carp",
2189         carp_modevent,
2190         0
2191 };
2192
2193 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);