2 * Copyright (c) 2011 Chelsio Communications, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/module.h>
37 #include <sys/mutex.h>
38 #include <sys/rwlock.h>
39 #include <sys/socket.h>
42 #include <net/if_types.h>
43 #include <net/ethernet.h>
44 #include <net/if_vlan_var.h>
45 #include <net/if_dl.h>
46 #include <net/if_llatbl.h>
47 #include <net/route.h>
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/if_ether.h>
52 #include "common/common.h"
53 #include "common/jhash.h"
54 #include "common/t4_msg.h"
58 * Module locking notes: There is a RW lock protecting the L2 table as a
59 * whole plus a spinlock per L2T entry. Entry lookups and allocations happen
60 * under the protection of the table lock, individual entry changes happen
61 * while holding that entry's spinlock. The table lock nests outside the
62 * entry locks. Allocations of new entries take the table lock as writers so
63 * no other lookups can happen while allocating new entries. Entry updates
64 * take the table lock as readers so multiple entries can be updated in
65 * parallel. An L2T entry can be dropped by decrementing its reference count
66 * and therefore can happen in parallel with entry allocation but no entry
67 * can change state or increment its ref count during allocation as both of
68 * these perform lookups.
70 * Note: We do not take references to ifnets in this module because both
71 * the TOE and the sockets already hold references to the interfaces and the
72 * lifetime of an L2T entry is fully contained in the lifetime of the TOE.
75 /* identifies sync vs async L2T_WRITE_REQs */
/* NOTE(review): S_SYNC_WR (the bit position shifted into) is defined on an elided line — confirm. */
77 #define V_SYNC_WR(x) ((x) << S_SYNC_WR)
/* Single-bit flag: set in the TID when the write request expects a sync reply. */
78 #define F_SYNC_WR V_SYNC_WR(1)
/*
 * L2T entry lifecycle states.  The enum's opening declaration is on an
 * elided line above this span.  Entries in the first four states live on a
 * hash chain; SWITCHING/UNUSED entries are not hashed (see comment below).
 */
81 L2T_STATE_VALID, /* entry is up to date */
82 L2T_STATE_STALE, /* entry may be used but needs revalidation */
83 L2T_STATE_RESOLVING, /* entry needs address resolution */
84 L2T_STATE_SYNC_WRITE, /* synchronous write of entry underway */
86 /* when state is one of the below the entry is not hashed */
87 L2T_STATE_SWITCHING, /* entry is being used by a switching filter */
88 L2T_STATE_UNUSED /* entry not in use */
/*
 * Tail of struct l2t_data (opening declaration and the rwlock field are on
 * elided lines): free-entry accounting plus the fixed-size entry array.
 * nfree is manipulated with atomic_* ops throughout this file.
 */
93 volatile int nfree; /* number of free entries */
94 struct l2t_entry *rover;/* starting point for next allocation */
95 struct l2t_entry l2tab[L2T_SIZE];
/* Forward declaration: CPL_L2T_WRITE_RPL handler registered in t4_init_l2t(). */
98 static int do_l2t_write_rpl(struct sge_iq *, const struct rss_header *,
/* 0xfff in the entry's vlan field means "no VLAN tag configured". */
101 #define VLAN_NONE 0xfff
/* Sockaddr cast helpers: generic sockaddr, sockaddr_in, and the raw IPv4 address. */
102 #define SA(x) ((struct sockaddr *)(x))
103 #define SIN(x) ((struct sockaddr_in *)(x))
104 #define SINADDR(x) (SIN(x)->sin_addr.s_addr)
107 * Allocate a free L2T entry. Must be called with l2t_data.lock held.
/*
 * Scans for an entry with refcnt == 0, starting at d->rover and wrapping to
 * the table start, then unhashes it if it was still on a chain.
 * NOTE(review): several statements of this function (early return on empty
 * table, loop-break/goto on hit, chain unlink body, closing braces) are on
 * elided lines — the visible lines are only the skeleton.
 */
109 static struct l2t_entry *
110 alloc_l2e(struct l2t_data *d)
112 struct l2t_entry *end, *e, **p;
/* Allocation rewrites entries, so the table lock must be held as writer. */
114 rw_assert(&d->lock, RA_WLOCKED);
116 if (!atomic_load_acq_int(&d->nfree))
/* nfree > 0 here, so one of the two scans below must find a free entry. */
119 /* there's definitely a free entry */
120 for (e = d->rover, end = &d->l2tab[L2T_SIZE]; e != end; ++e)
121 if (atomic_load_acq_int(&e->refcnt) == 0)
/* Wrap-around scan from the table start; guaranteed to terminate (see above). */
124 for (e = d->l2tab; atomic_load_acq_int(&e->refcnt); ++e) ;
127 atomic_subtract_int(&d->nfree, 1);
130 * The entry we found may be an inactive entry that is
131 * presently in the hash table. We need to remove it.
/* States below L2T_STATE_SWITCHING are the hashed states (see enum). */
133 if (e->state < L2T_STATE_SWITCHING) {
134 for (p = &d->l2tab[e->hash].first; *p; p = &(*p)->next) {
143 e->state = L2T_STATE_UNUSED;
148 * Write an L2T entry. Must be called with the entry locked.
149 * The write may be synchronous or asynchronous.
/*
 * Builds a CPL_L2T_WRITE_REQ in a freshly allocated mbuf and (on an elided
 * line) hands it to the hardware work-request queue.  When @sync is set the
 * TID carries F_SYNC_WR so do_l2t_write_rpl() will see the completion.
 */
152 write_l2e(struct adapter *sc, struct l2t_entry *e, int sync)
155 struct cpl_l2t_write_req *req;
157 mtx_assert(&e->lock, MA_OWNED);
/* M_NOWAIT: may fail under memory pressure; error return is on an elided line. */
159 if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
162 req = mtod(m, struct cpl_l2t_write_req *);
163 m->m_pkthdr.len = m->m_len = sizeof(*req);
/* TID encodes entry index, sync flag, and the firmware queue to reply to. */
166 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_L2T_WRITE_REQ, e->idx |
167 V_SYNC_WR(sync) | V_TID_QID(sc->sge.fwq.abs_id)));
168 req->params = htons(V_L2T_W_PORT(e->lport) | V_L2T_W_NOREPLY(!sync));
169 req->l2t_idx = htons(e->idx);
170 req->vlan = htons(e->vlan);
171 memcpy(req->dst_mac, e->dmac, sizeof(req->dst_mac));
/* Switching entries keep their state; others wait for the sync completion. */
175 if (sync && e->state != L2T_STATE_SWITCHING)
176 e->state = L2T_STATE_SYNC_WRITE;
182 * Allocate an L2T entry for use by a switching rule. Such need to be
183 * explicitly freed and while busy they are not on any hash chain, so normal
184 * address resolution updates do not see them.
/*
 * NOTE(review): the function signature line, the alloc_l2e() call, and the
 * lock-acquisition preceding the visible body are on elided lines.
 */
187 t4_l2t_alloc_switching(struct l2t_data *d)
194 mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
195 e->state = L2T_STATE_SWITCHING;
/* refcnt 1: the switching rule itself owns the sole reference. */
196 atomic_store_rel_int(&e->refcnt, 1);
197 mtx_unlock(&e->lock);
199 rw_runlock(&d->lock);
204 * Sets/updates the contents of a switching L2T entry that has been allocated
205 * with an earlier call to @t4_l2t_alloc_switching.
/* Fills vlan/port/dmac (vlan and lport stores are on elided lines) and pushes
 * the entry to hardware with an asynchronous write (sync == 0). */
208 t4_l2t_set_switching(struct adapter *sc, struct l2t_entry *e, uint16_t vlan,
209 uint8_t port, uint8_t *eth_addr)
215 memcpy(e->dmac, eth_addr, ETHER_ADDR_LEN)
217 rc = write_l2e(sc, e, 0);
218 mtx_unlock(&e->lock);
/*
 * Allocate and initialize the software L2 table and register the write-reply
 * CPL handler.  @flags is passed through to malloc (M_WAITOK/M_NOWAIT —
 * presumably; confirm against callers).
 */
223 t4_init_l2t(struct adapter *sc, int flags)
228 d = malloc(sizeof(*d), M_CXGBE, M_ZERO | flags);
/* Every entry starts free and unused. */
233 atomic_store_rel_int(&d->nfree, L2T_SIZE);
234 rw_init(&d->lock, "L2T");
236 for (i = 0; i < L2T_SIZE; i++) {
238 d->l2tab[i].state = L2T_STATE_UNUSED;
239 mtx_init(&d->l2tab[i].lock, "L2T_E", NULL, MTX_DEF);
240 atomic_store_rel_int(&d->l2tab[i].refcnt, 0);
244 t4_register_cpl_handler(sc, CPL_L2T_WRITE_RPL, do_l2t_write_rpl);
/* Tear down all per-entry mutexes and the table rwlock; the free() of @d
 * itself is on an elided line following this span. */
250 t4_free_l2t(struct l2t_data *d)
254 for (i = 0; i < L2T_SIZE; i++)
255 mtx_destroy(&d->l2tab[i].lock);
256 rw_destroy(&d->lock);
/* Extract the 3-bit 802.1p priority from the stored VLAN TCI (bits 15:13). */
263 static inline unsigned int
264 vlan_prio(const struct l2t_entry *e)
266 return e->vlan >> 13;
/* One-character state code for the sysctl dump in sysctl_l2t() below.
 * 'A' = resolving with packets queued, 'R' = resolving with none. */
270 l2e_state(const struct l2t_entry *e)
273 case L2T_STATE_VALID: return 'V'; /* valid, fast-path entry */
274 case L2T_STATE_STALE: return 'S'; /* needs revalidation, but usable */
275 case L2T_STATE_SYNC_WRITE: return 'W';
276 case L2T_STATE_RESOLVING: return e->arpq_head ? 'A' : 'R';
277 case L2T_STATE_SWITCHING: return 'X';
/*
 * Sysctl handler: dump every in-use L2T entry (index, IP, MAC, VLAN/prio,
 * port, state, refcount, ifname) into an sbuf for userland.
 */
283 sysctl_l2t(SYSCTL_HANDLER_ARGS)
285 struct adapter *sc = arg1;
286 struct l2t_data *l2t = sc->l2t;
289 int rc, i, header = 0;
295 rc = sysctl_wire_old_buffer(req, 0);
299 sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
304 for (i = 0; i < L2T_SIZE; i++, e++) {
/* Skip free slots; only populated entries are printed. */
306 if (e->state == L2T_STATE_UNUSED)
/* Print the column header once, before the first live entry. */
310 sbuf_printf(sb, " Idx IP address "
311 "Ethernet address VLAN/P LP State Users Port");
/* Switching and IPv6 entries get no printable IPv4 address. */
314 if (e->state == L2T_STATE_SWITCHING || e->v6)
317 snprintf(ip, sizeof(ip), "%s",
318 inet_ntoa(*(struct in_addr *)&e->addr[0]));
320 /* XXX: accessing lle probably not safe? */
321 sbuf_printf(sb, "\n%4u %-15s %02x:%02x:%02x:%02x:%02x:%02x %4d"
323 e->idx, ip, e->dmac[0], e->dmac[1], e->dmac[2],
324 e->dmac[3], e->dmac[4], e->dmac[5],
325 e->vlan & 0xfff, vlan_prio(e), e->lport,
326 l2e_state(e), atomic_load_acq_int(&e->refcnt),
327 e->lle ? e->lle->lle_tbl->llt_ifp->if_xname : "");
329 mtx_unlock(&e->lock);
332 rc = sbuf_finish(sb);
/* Everything from here down is TOE-only code. */
339 #ifndef TCP_OFFLOAD_DISABLE
/* Take a reference on @e; the first taker removes it from the free count. */
341 l2t_hold(struct l2t_data *d, struct l2t_entry *e)
343 if (atomic_fetchadd_int(&e->refcnt, 1) == 0) /* 0 -> 1 transition */
344 atomic_subtract_int(&d->nfree, 1);
348 * To avoid having to check address families we do not allow v4 and v6
349 * neighbors to be on the same hash chain. We keep v4 entries in the first
350 * half of available hash buckets and v6 in the second.
/* NOTE(review): assumes L2T_SIZE is a power of two so the mask works — confirm. */
353 L2T_SZ_HALF = L2T_SIZE / 2,
354 L2T_HASH_MASK = L2T_SZ_HALF - 1
/* Hash an IPv4 address + ifindex into the lower half of the bucket space. */
357 static inline unsigned int
358 arp_hash(const uint32_t *key, int ifindex)
360 return jhash_2words(*key, ifindex, 0) & L2T_HASH_MASK;
/* Hash an IPv6 address + ifindex into the UPPER half of the bucket space:
 * the four 32-bit words are XOR-folded, then offset by L2T_SZ_HALF. */
363 static inline unsigned int
364 ipv6_hash(const uint32_t *key, int ifindex)
366 uint32_t xor = key[0] ^ key[1] ^ key[2] ^ key[3];
368 return L2T_SZ_HALF + (jhash_2words(xor, ifindex, 0) & L2T_HASH_MASK);
/* Dispatch on address length: 4 bytes -> IPv4 hash, otherwise IPv6 hash. */
371 static inline unsigned int
372 addr_hash(const uint32_t *addr, int addr_len, int ifindex)
374 return addr_len == 4 ? arp_hash(addr, ifindex) :
375 ipv6_hash(addr, ifindex);
379 * Checks if an L2T entry is for the given IP/IPv6 address. It does not check
380 * whether the L2T entry and the address are of the same address family.
381 * Callers ensure an address is only checked against L2T entries of the same
382 * family, something made trivial by the separation of IP and IPv6 hash chains
383 * mentioned above. Returns 0 if there's a match,
/* NOTE(review): the `if (e->v6)` guard selecting the 4-word comparison below
 * is on an elided line — the two returns are the v6 and v4 branches. */
386 addreq(const struct l2t_entry *e, const uint32_t *addr)
389 return (e->addr[0] ^ addr[0]) | (e->addr[1] ^ addr[1]) |
390 (e->addr[2] ^ addr[2]) | (e->addr[3] ^ addr[3]);
391 return e->addr[0] ^ addr[0];
395 * Add a packet to an L2T entry's queue of packets awaiting resolution.
396 * Must be called with the entry's lock held.
399 arpq_enqueue(struct l2t_entry *e, struct mbuf *m)
401 mtx_assert(&e->lock, MA_OWNED);
/* Queue is singly linked via m_nextpkt; @m must not already be chained. */
403 KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt not NULL", __func__));
/* Append at the tail (empty-queue case handled on an elided line). */
405 e->arpq_tail->m_nextpkt = m;
/* Flush the entry's ARP-pending queue to the hardware and empty it.
 * Must be called with the entry's lock held. */
412 send_pending(struct adapter *sc, struct l2t_entry *e)
414 struct mbuf *m, *next;
416 mtx_assert(&e->lock, MA_OWNED);
/* next is saved before transmit (on an elided line) since tx consumes m. */
418 for (m = e->arpq_head; m; m = next) {
421 t4_wrq_tx(sc, MBUF_EQ(m), m);
423 e->arpq_head = e->arpq_tail = NULL;
428 * Looks up and fills up an l2t_entry's lle. We grab all the locks that we need
429 * ourself, and update e->state at the end if e->lle was successfully filled.
431 * The lle passed in comes from arpresolve and is ignored as it does not appear
435 l2t_fill_lle(struct adapter *sc, struct l2t_entry *e, struct llentry *unused)
438 struct sockaddr_in sin;
439 struct ifnet *ifp = e->ifp;
442 bzero(&sin, sizeof(struct sockaddr_in));
/* Only IPv4 is implemented; the e->v6 check guarding this panic is elided. */
444 panic("%s: IPv6 L2 resolution not supported yet.", __func__);
446 sin.sin_family = AF_INET;
447 sin.sin_len = sizeof(struct sockaddr_in);
448 memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
450 mtx_assert(&e->lock, MA_NOTOWNED);
451 KASSERT(e->addr && ifp, ("%s: bad prep before call", __func__));
/* Look up the llentry under the interface's afdata lock. */
454 lle = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, SA(&sin));
455 IF_AFDATA_UNLOCK(ifp);
456 if (!LLE_IS_VALID(lle))
458 if (!(lle->la_flags & LLE_VALID)) {
/* First resolution: install dmac and (on elided lines) store the lle,
 * write the entry to hardware, and flush any queued packets. */
466 if (e->state == L2T_STATE_RESOLVING) {
467 KASSERT(e->lle == NULL, ("%s: lle already valid", __func__));
469 memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
472 KASSERT(e->lle == lle, ("%s: lle changed", __func__));
475 mtx_unlock(&e->lock);
/*
 * Transmit @m via L2T entry @e: fast-path send when VALID, kick off
 * revalidation when STALE, queue on the arpq while RESOLVING / during a
 * sync write.  Several fall-through/queueing statements are on elided lines.
 */
483 t4_l2t_send(struct adapter *sc, struct mbuf *m, struct l2t_entry *e)
488 struct llentry *lle = NULL;
489 struct sockaddr_in sin;
490 struct ifnet *ifp = e->ifp;
/* IPv4 only; the e->v6 guard for this panic is on an elided line. */
493 panic("%s: IPv6 L2 resolution not supported yet.", __func__);
495 bzero(&sin, sizeof(struct sockaddr_in));
496 sin.sin_family = AF_INET;
497 sin.sin_len = sizeof(struct sockaddr_in);
498 memcpy(&sin.sin_addr, e->addr, sizeof(struct sockaddr_in));
502 case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
503 if (arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
504 l2t_fill_lle(sc, e, lle);
/* STALE entries are still usable: fall through and transmit. */
508 case L2T_STATE_VALID: /* fast-path, send the packet on */
509 return t4_wrq_tx(sc, MBUF_EQ(m), m);
511 case L2T_STATE_RESOLVING:
512 case L2T_STATE_SYNC_WRITE:
/* Re-check state under the entry lock: it may have become VALID. */
514 if (e->state != L2T_STATE_SYNC_WRITE &&
515 e->state != L2T_STATE_RESOLVING) {
516 /* state changed by the time we got here */
517 mtx_unlock(&e->lock);
521 mtx_unlock(&e->lock);
/* Only the first queued packet (arpq was empty — check elided) triggers
 * resolution; otherwise one is already in flight. */
523 if (e->state == L2T_STATE_RESOLVING &&
524 arpresolve(ifp, NULL, NULL, SA(&sin), e->dmac, &lle) == 0)
525 l2t_fill_lle(sc, e, lle);
533 * Called when an L2T entry has no more users. The entry is left in the hash
534 * table since it is likely to be reused but we also bump nfree to indicate
535 * that the entry can be reallocated for a different neighbor. We also drop
536 * the existing neighbor reference in case the neighbor is going away and is
537 * waiting on our reference.
539 * Because entries can be reallocated to other neighbors once their ref count
540 * drops to 0 we need to take the entry's lock to avoid races with a new
544 t4_l2e_free(struct l2t_entry *e)
546 struct llentry *lle = NULL;
/* Under the entry lock (acquisition elided): only tear down the lle link
 * if no new user recycled the entry between the refcnt hitting 0 and now. */
550 if (atomic_load_acq_int(&e->refcnt) == 0) { /* hasn't been recycled */
554 * Don't need to worry about the arpq, an L2T entry can't be
555 * released if any packets are waiting for resolution as we
556 * need to be able to communicate with the device to close a
560 mtx_unlock(&e->lock);
/* Recover the containing table from the entry's index to bump nfree. */
562 d = container_of(e, struct l2t_data, l2tab[e->idx]);
563 atomic_add_int(&d->nfree, 1);
/* Drop a reference; the 1 -> 0 transition frees the entry via t4_l2e_free(). */
570 t4_l2t_release(struct l2t_entry *e)
572 if (atomic_fetchadd_int(&e->refcnt, -1) == 1)
/*
 * Handle a CPL_L2T_WRITE_RPL from the chip: log unexpected errors, and for
 * synchronous writes move the entry to VALID and (on elided lines) flush
 * its pending-packet queue.
 */
577 do_l2t_write_rpl(struct sge_iq *iq, const struct rss_header *rss,
580 struct adapter *sc = iq->adapter;
581 const struct cpl_l2t_write_rpl *rpl = (const void *)(rss + 1);
582 unsigned int tid = GET_TID(rpl);
/* The low bits of the TID are the L2T entry index (see write_l2e()). */
583 unsigned int idx = tid & (L2T_SIZE - 1);
585 if (__predict_false(rpl->status != CPL_ERR_NONE)) {
587 "Unexpected L2T_WRITE_RPL status %u for entry %u\n",
/* Only sync writes (F_SYNC_WR set in the TID) need completion handling. */
592 if (tid & F_SYNC_WR) {
593 struct l2t_entry *e = &sc->l2t->l2tab[idx];
/* Switching entries never left their state; everyone else becomes VALID. */
596 if (e->state != L2T_STATE_SWITCHING) {
598 e->state = L2T_STATE_VALID;
600 mtx_unlock(&e->lock);
607 * Reuse an L2T entry that was previously used for the same next hop.
/* Re-derives the entry's state from the freshness of its cached lle
 * (lle local and branch structure partially on elided lines). */
610 reuse_entry(struct l2t_entry *e)
614 mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
617 KASSERT(lle->la_flags & LLE_VALID,
618 ("%s: invalid lle stored in l2t_entry", __func__));
/* NOTE(review): expired lle -> STALE looks inverted relative to the
 * comparison direction usually expected — confirm la_expire semantics. */
620 if (lle->la_expire >= time_uptime)
621 e->state = L2T_STATE_STALE;
623 e->state = L2T_STATE_VALID;
/* No cached lle at all: must resolve from scratch. */
625 e->state = L2T_STATE_RESOLVING;
626 mtx_unlock(&e->lock);
630 * The TOE wants an L2 table entry that it can use to reach the next hop over
631 * the specified port. Produce such an entry - create one if needed.
633 * Note that the ifnet could be a pseudo-device like if_vlan, if_lagg, etc. on
634 * top of the real cxgbe interface.
/*
 * Lookup is keyed on (address, ifp, smt_idx); a hit takes a reference
 * (l2t_hold / reuse_entry calls are on elided lines), a miss allocates a
 * fresh entry via alloc_l2e() and inserts it on the hash chain.
 */
637 t4_l2t_get(struct port_info *pi, struct ifnet *ifp, struct sockaddr *sa)
640 struct l2t_data *d = pi->adapter->l2t;
644 struct sockaddr_in6 *sin6;
/* Each port gets its own source-MAC-table index. */
645 unsigned int smt_idx = pi->port_id;
647 if (sa->sa_family == AF_INET) {
648 addr = (uint32_t *)&SINADDR(sa);
649 addr_len = sizeof(SINADDR(sa));
650 } else if (sa->sa_family == AF_INET6) {
651 sin6 = (struct sockaddr_in6 *)sa;
652 addr = (uint32_t *)&sin6->sin6_addr.s6_addr;
653 addr_len = sizeof(sin6->sin6_addr.s6_addr);
/* VLAN pseudo-interfaces hash/compare by the underlying parent — the
 * VLAN_DEVAT-style lookup body is on an elided line. */
658 if (ifp->if_type == IFT_L2VLAN)
662 hash = addr_hash(addr, addr_len, ifp->if_index);
/* First pass: search the hash chain for an existing matching entry. */
665 for (e = d->l2tab[hash].first; e; e = e->next) {
666 if (!addreq(e, addr) && e->ifp == ifp && e->smt_idx == smt_idx){
/* refcnt 1 after hold means we revived a dormant entry: re-derive state. */
668 if (atomic_load_acq_int(&e->refcnt) == 1)
674 /* Need to allocate a new entry */
677 mtx_lock(&e->lock); /* avoid race with t4_l2t_free */
678 e->state = L2T_STATE_RESOLVING;
679 memcpy(e->addr, addr, addr_len);
680 e->ifindex = ifp->if_index;
681 e->smt_idx = smt_idx;
684 e->lport = pi->lport;
685 e->v6 = (addr_len == 16);
687 atomic_store_rel_int(&e->refcnt, 1);
689 if (ifp->if_type == IFT_L2VLAN)
690 VLAN_TAG(ifp, &e->vlan);
/* Link the new entry at the head of its hash chain. */
694 e->next = d->l2tab[hash].first;
695 d->l2tab[hash].first = e;
696 mtx_unlock(&e->lock);
699 rw_wunlock(&d->lock);
704 * Called when the host's neighbor layer makes a change to some entry that is
705 * loaded into the HW L2 table.
/*
 * Invoked with @lle write-locked by the stack.  Finds the matching L2T
 * entry (IPv4 only — note sizeof(*addr) below) and either propagates the
 * new/deleted link-layer address or marks the entry stale/resolving.
 * Several branch bodies and lock calls in this function are on elided lines.
 */
708 t4_l2t_update(struct adapter *sc, struct llentry *lle)
711 struct l2t_data *d = sc->l2t;
712 struct sockaddr *sa = L3_ADDR(lle);
713 struct llentry *old_lle = NULL;
714 uint32_t *addr = (uint32_t *)&SINADDR(sa);
715 struct ifnet *ifp = lle->lle_tbl->llt_ifp;
/* sizeof(*addr) == 4, so this always hashes as IPv4. */
716 int hash = addr_hash(addr, sizeof(*addr), ifp->if_index);
718 KASSERT(d != NULL, ("%s: no L2 table", __func__));
719 LLE_WLOCK_ASSERT(lle);
720 KASSERT(lle->la_flags & LLE_VALID || lle->la_flags & LLE_DELETED,
721 ("%s: entry neither valid nor deleted.", __func__));
/* Search under the table read lock (acquisition on an elided line). */
724 for (e = d->l2tab[hash].first; e; e = e->next) {
725 if (!addreq(e, addr) && e->ifp == ifp) {
727 if (atomic_load_acq_int(&e->refcnt))
/* Unreferenced entry: just mark it stale and we're done with it here. */
729 e->state = L2T_STATE_STALE;
730 mtx_unlock(&e->lock);
734 rw_runlock(&d->lock);
736 /* The TOE has no interest in this LLE */
740 rw_runlock(&d->lock);
742 if (atomic_load_acq_int(&e->refcnt)) {
744 /* Entry is referenced by at least 1 offloaded connection. */
746 /* Handle deletes first */
747 if (lle->la_flags & LLE_DELETED) {
750 e->state = L2T_STATE_RESOLVING;
762 if (e->state == L2T_STATE_RESOLVING ||
763 memcmp(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN)) {
765 /* unresolved -> resolved; or dmac changed */
767 memcpy(e->dmac, &lle->ll_addr, ETHER_ADDR_LEN);
771 /* +ve reinforcement of a valid or stale entry */
775 e->state = L2T_STATE_VALID;
779 * Entry was used previously but is unreferenced right now.
780 * e->lle has been released and NULL'd out by t4_l2t_free, or
781 * l2t_release is about to call t4_l2t_free and do that.
783 * Either way this is of no interest to us.
788 mtx_unlock(&e->lock);