2 * Copyright (c) 2012 Chelsio Communications, Inc.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
32 #include "opt_inet6.h"
35 #include <sys/param.h>
36 #include <sys/types.h>
37 #include <sys/kernel.h>
39 #include <sys/module.h>
40 #include <sys/protosw.h>
41 #include <sys/refcount.h>
42 #include <sys/domain.h>
43 #include <sys/fnv_hash.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sysctl.h>
47 #include <net/ethernet.h>
49 #include <net/if_types.h>
50 #include <net/if_vlan_var.h>
51 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/in_fib.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/ip.h>
56 #include <netinet/ip6.h>
57 #include <netinet6/in6_fib.h>
58 #include <netinet6/scope6_var.h>
59 #include <netinet/tcp_timer.h>
61 #include <netinet/tcp_fsm.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/toecore.h>
64 #include <netinet/cc/cc.h>
66 #include "common/common.h"
67 #include "common/t4_msg.h"
68 #include "common/t4_regs.h"
69 #include "tom/t4_tom_l2t.h"
70 #include "tom/t4_tom.h"
/*
 * Forward declarations for the static helpers in this file: stid
 * allocation/lookup, listen-context (lctx) lifecycle and hashing, and
 * synq-entry plumbing.
 * NOTE(review): this listing has dropped lines; the alloc_lctx prototype
 * below is missing its continuation line (second half of the parameter
 * list) — confirm against the original file.
 */
static int alloc_stid(struct adapter *, struct listen_ctx *, int);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, struct listen_ctx *);
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *,
    struct offload_settings *);
static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
static void send_reset_synqe(struct toedev *, struct synq_entry *);
/*
 * Allocate a contiguous, aligned range of server tids (stids) for a new
 * listener.  An IPv4 listener uses 1 stid; an IPv6 listener uses 2
 * naturally aligned stids.  The allocated region is recorded in
 * lctx->stid_region and the absolute stid (stid + stid_base) is returned.
 * NOTE(review): lines are missing from this listing (opening brace, the
 * n/mask setup from isipv6, several else/return paths) — the body below
 * is not complete; confirm against the original file before editing.
 */
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
	struct tid_info *t = &sc->tids;
	u_int stid, n, f, mask;
	struct stid_region *sr = &lctx->stid_region;

	/*
	 * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
	 * the TCAM. The start of the stid region is properly aligned (the chip
	 * requires each region to be 128-cell aligned).
	 */
	KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
	    ("%s: stid region (%u, %u) not properly aligned. n = %u",
	    __func__, t->stid_base, t->nstids, n));

	mtx_lock(&t->stid_lock);
	/* Not enough free stids for this request. */
	if (n > t->nstids - t->stids_in_use) {
		mtx_unlock(&t->stid_lock);
	/* Fast path: carve the range off the head free area. */
	if (t->nstids_free_head >= n) {
		/*
		 * This allocation will definitely succeed because the region
		 * starts at a good alignment and we just checked we have enough
		 */
		f = t->nstids_free_head & mask;	/* alignment fixup */
		t->nstids_free_head -= n + f;
		stid = t->nstids_free_head;
		TAILQ_INSERT_HEAD(&t->stids, sr, link);
		/* Slow path: walk the existing regions looking for a hole. */
		struct stid_region *s;

		stid = t->nstids_free_head;
		TAILQ_FOREACH(s, &t->stids, link) {
			stid += s->used + s->free;
			if (s->free >= n + f) {
				TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
		/* Sanity: the walk must have accounted for every stid. */
		if (__predict_false(stid != t->nstids)) {
			panic("%s: stids TAILQ (%p) corrupt."
			    " At %d instead of %d at the end of the queue.",
			    __func__, &t->stids, stid, t->nstids);
		mtx_unlock(&t->stid_lock);

	t->stids_in_use += n;
	t->stid_tab[stid] = lctx;	/* stid -> lctx mapping for lookup_stid() */
	mtx_unlock(&t->stid_lock);

	KASSERT(((stid + t->stid_base) & mask) == 0,
	    ("%s: EDOOFUS.", __func__));
	return (stid + t->stid_base);
163 static struct listen_ctx *
164 lookup_stid(struct adapter *sc, int stid)
166 struct tid_info *t = &sc->tids;
168 return (t->stid_tab[stid - t->stid_base]);
/*
 * Return a listener's stid region to the free pool and unlink it from the
 * adapter's region list.
 * NOTE(review): lines are missing from this listing — in particular the
 * if/else that chooses between merging into the previous region's free
 * space and merging back into the head free area.  Confirm against the
 * original file.
 */
free_stid(struct adapter *sc, struct listen_ctx *lctx)
	struct tid_info *t = &sc->tids;
	struct stid_region *sr = &lctx->stid_region;
	struct stid_region *s;

	KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));

	mtx_lock(&t->stid_lock);
	s = TAILQ_PREV(sr, stid_head, link);
	/* Merge freed cells into the predecessor's free space ... */
	s->free += sr->used + sr->free;
	/* ... or (no predecessor) back into the head free area. */
	t->nstids_free_head += sr->used + sr->free;
	KASSERT(t->stids_in_use >= sr->used,
	    ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
	    t->stids_in_use, sr->used));
	t->stids_in_use -= sr->used;
	TAILQ_REMOVE(&t->stids, sr, link);
	mtx_unlock(&t->stid_lock);
/*
 * Allocate and initialize a listen context for an inpcb: allocate an
 * stid, take a hold on the IPv6 local address (CLIP entry) if the socket
 * is bound to one, pick the control and offload-rx queues from the VI,
 * and initialize the refcount/synq.
 * NOTE(review): the malloc-failure check, the error-unwind bodies, and
 * the trailing lines (presumably lctx->inp assignment and return) are
 * missing from this listing.
 */
static struct listen_ctx *
alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
	struct listen_ctx *lctx;

	INP_WLOCK_ASSERT(inp);

	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
	lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
	if (lctx->stid < 0) {
	/* A specific (non-any) IPv6 bind needs a ref on the CLIP entry. */
	if (inp->inp_vflag & INP_IPV6 &&
	    !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
		struct tom_data *td = sc->tom_softc;

		lctx->ce = hold_lip(td, &inp->in6p_laddr, NULL);
		if (lctx->ce == NULL) {
	lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
	lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
	refcount_init(&lctx->refcount, 1);
	TAILQ_INIT(&lctx->synq);
	lctx->vnet = inp->inp_socket->so_vnet;
/* Don't call this directly, use release_lctx instead */
/*
 * Final teardown once the lctx refcount has hit zero: asserts the synq is
 * drained, releases the CLIP hold, and drops the lctx's reference on the
 * inp.  Returns the result of in_pcbrele_wlocked() so the caller knows
 * whether the inp itself was freed.
 * NOTE(review): the free_stid()/free(lctx) calls and the guard around
 * release_lip() are among the lines missing from this listing.
 */
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
	struct inpcb *inp = lctx->inp;
	struct tom_data *td = sc->tom_softc;

	INP_WLOCK_ASSERT(inp);
	KASSERT(lctx->refcount == 0,
	    ("%s: refcount %d", __func__, lctx->refcount));
	KASSERT(TAILQ_EMPTY(&lctx->synq),
	    ("%s: synq not empty.", __func__));
	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
	    __func__, lctx->stid, lctx, lctx->inp);

	release_lip(td, lctx->ce);
	return (in_pcbrele_wlocked(inp));
260 hold_lctx(struct listen_ctx *lctx)
263 refcount_acquire(&lctx->refcount);
266 static inline uint32_t
267 listen_hashfn(void *key, u_long mask)
270 return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
/*
 * Add a listen_ctx entry to the listen hash table.
 * NOTE(review): this listing is missing at least one line between the
 * insert and the unlock (presumably an lctx counter bump) — confirm
 * against the original file.
 */
listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(lctx->inp, td->listen_mask);

	mtx_lock(&td->lctx_hash_lock);
	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
	mtx_unlock(&td->lctx_hash_lock);
/*
 * Look for the listening socket's context entry in the hash and return it.
 * NOTE(review): the loop's match action (break) and the function's
 * return statement are missing from this listing — confirm against the
 * original file.
 */
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
		if (lctx->inp == inp)
	mtx_unlock(&td->lctx_hash_lock);
/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 * NOTE(review): the break after LIST_REMOVE and the final return (NULL
 * when not found) are missing from this listing.
 */
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
	struct tom_data *td = sc->tom_softc;
	int bucket = listen_hashfn(inp, td->listen_mask);
	struct listen_ctx *lctx, *l;

	mtx_lock(&td->lctx_hash_lock);
	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
		if (lctx->inp == inp) {
			LIST_REMOVE(lctx, link);
	mtx_unlock(&td->lctx_hash_lock);
/*
 * Releases a hold on the lctx. Must be called with the listening socket's inp
 * locked. The inp may be freed by this function and it returns NULL to
 * indicate that (otherwise the still-valid inp is returned).
 * NOTE(review): the declaration/initialization of inp_freed is missing
 * from this listing.
 */
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
	struct inpcb *inp = lctx->inp;

	INP_WLOCK_ASSERT(inp);
	if (refcount_release(&lctx->refcount))
		inp_freed = free_lctx(sc, lctx);	/* last ref: tear down */

	return (inp_freed ? NULL : inp);
/*
 * Abort an embryonic (synq) connection: send a FLOWC work request (the
 * hardware requires one before any other WR on the tid) immediately
 * followed by a CPL_ABORT_REQ, both via the L2T entry for the peer.
 * Idempotent: returns early if an abort is already in flight.
 * NOTE(review): this listing is missing the declaration of 'wr', the
 * alloc_wrqe NULL check around the panic, and the wrtod() that sets
 * 'flowc' — confirm against the original file.
 */
send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
	struct adapter *sc = tod->tod_softc;
	struct mbuf *m = synqe->syn;
	struct ifnet *ifp = m->m_pkthdr.rcvif;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
	struct fw_flowc_wr *flowc;
	struct cpl_abort_req *req;
	int txqid, rxqid, flowclen;
	struct sge_wrq *ofld_txq;
	struct sge_ofld_rxq *ofld_rxq;
	const int nparams = 6;
	unsigned int pfvf = G_FW_VIID_PFN(vi->viid) << S_FW_VIID_PFN;

	INP_WLOCK_ASSERT(synqe->lctx->inp);

	CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
	    __func__, synqe, synqe->flags, synqe->tid,
	    synqe->flags & TPF_ABORT_SHUTDOWN ?
	    " (abort already in progress)" : "");
	if (synqe->flags & TPF_ABORT_SHUTDOWN)
		return;	/* abort already in progress */
	synqe->flags |= TPF_ABORT_SHUTDOWN;

	get_qids_from_mbuf(m, &txqid, &rxqid);
	ofld_txq = &sc->sge.ofld_txq[txqid];
	ofld_rxq = &sc->sge.ofld_rxq[rxqid];

	/* The wrqe will have two WRs - a flowc followed by an abort_req */
	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
	wr = alloc_wrqe(roundup2(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
		panic("%s: allocation failure.", __func__);
	req = (void *)((caddr_t)flowc + roundup2(flowclen, EQ_ESIZE));

	/* First the flowc ... */
	memset(flowc, 0, wr->wr_len);
	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
	    V_FW_FLOWC_WR_NPARAMS(nparams));
	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
	    V_FW_WR_FLOWID(synqe->tid));
	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
	flowc->mnemval[0].val = htobe32(pfvf);
	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
	flowc->mnemval[1].val = htobe32(pi->tx_chan);
	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
	flowc->mnemval[2].val = htobe32(pi->tx_chan);
	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
	flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
	flowc->mnemval[4].val = htobe32(512);
	flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
	flowc->mnemval[5].val = htobe32(512);
	synqe->flags |= TPF_FLOWC_WR_SENT;

	/* ... then ABORT request */
	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
	req->rsvd0 = 0;	/* don't have a snd_nxt */
	req->rsvd1 = 1;	/* no data sent yet */
	req->cmd = CPL_ABORT_SEND_RST;

	t4_l2t_send(sc, wr, e);
/*
 * Send a CPL_PASS_OPEN_REQ to start an IPv4 hardware listener for the
 * lctx's stid, directing SYNs to the lctx's offload rx queue.
 * NOTE(review): the 'wr'/'req' setup lines, the alloc-failure branch
 * structure, the t4_wrq_tx and the returns are missing from this listing.
 */
create_server(struct adapter *sc, struct listen_ctx *lctx)
	struct cpl_pass_open_req *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
		log(LOG_ERR, "%s: allocation failure", __func__);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
	req->local_port = inp->inp_lport;	/* already network byte order */
	req->local_ip = inp->inp_laddr.s_addr;
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
/*
 * IPv6 counterpart of create_server(): send a CPL_PASS_OPEN_REQ6 for the
 * lctx's stid.  The 128-bit local address is split into two 64-bit
 * halves for the CPL.
 * NOTE(review): same missing lines as create_server (wr/req setup,
 * alloc-failure branch, t4_wrq_tx, returns).
 */
create_server6(struct adapter *sc, struct listen_ctx *lctx)
	struct cpl_pass_open_req6 *req;
	struct inpcb *inp = lctx->inp;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
		log(LOG_ERR, "%s: allocation failure", __func__);
	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
	req->local_port = inp->inp_lport;	/* already network byte order */
	req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
	req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
/*
 * Send a CPL_CLOSE_LISTSRV_REQ to stop the hardware listener for this
 * lctx; the reply (do_close_server_rpl) finishes the teardown.
 * NOTE(review): the wr/req setup, alloc-failure branch, the stid argument
 * continuation of MK_OPCODE_TID, t4_wrq_tx and return are missing from
 * this listing.
 */
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
	struct cpl_close_listsvr_req *req;

	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
		panic("%s: allocation failure.", __func__);
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
	req->rsvd = htobe16(0);
/*
 * Start a listening server by sending a passive open request to HW.
 * Can't take adapter lock here and access to sc->flags,
 * sc->offload_map, if_capenable are all race prone.
 * NOTE(review): many lines are missing from this listing (local variable
 * declarations, brace/else lines, the "done:" label and final return) —
 * confirm against the original file before editing.
 */
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
	struct adapter *sc = tod->tod_softc;
	struct port_info *pi;
	struct inpcb *inp = tp->t_inpcb;
	struct listen_ctx *lctx;
	struct offload_settings settings;

	INP_WLOCK_ASSERT(inp);

	/* Consult the offload policy first; bail if listeners aren't offloaded. */
	rw_rlock(&sc->policy_lock);
	settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff,
	rw_runlock(&sc->policy_lock);
	if (!settings.offload)

	/* Don't start a hardware listener for any loopback address. */
	if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
	if (!(inp->inp_vflag & INP_IPV6) &&
	    IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
		log(LOG_ERR, "%s: listen request ignored, %s is busy",
		    __func__, device_get_nameunit(sc->dev));

	KASSERT(uld_active(sc, ULD_TOM),
	    ("%s: TOM not initialized", __func__));

	/*
	 * Find an initialized VI with IFCAP_TOE (4 or 6). We'll use the first
	 * such VI's queues to send the passive open and receive the reply to
	 *
	 * XXX: need a way to mark a port in use by offload. if_cxgbe should
	 * then reject any attempt to bring down such a port (and maybe reject
	 * attempts to disable IFCAP_TOE on that port too?).
	 */
	for_each_port(sc, i) {
		for_each_vi(pi, v, vi) {
			if (vi->flags & VI_INIT_DONE &&
			    vi->ifp->if_capenable & IFCAP_TOE)
		goto done;	/* no port that's UP with IFCAP_TOE enabled */

	if (listen_hash_find(sc, inp) != NULL)
		goto done;	/* already setup */

	lctx = alloc_lctx(sc, inp, vi);
		    "%s: listen request ignored, %s couldn't allocate lctx\n",
		    __func__, device_get_nameunit(sc->dev));
	listen_hash_add(sc, lctx);

	CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
	    __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,

	if (inp->inp_vflag & INP_IPV6)
		rc = create_server6(sc, lctx);
		rc = create_server(sc, lctx);
		log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
		    __func__, device_get_nameunit(sc->dev), rc);
		(void) listen_hash_del(sc, inp);
		inp = release_lctx(sc, lctx);
		/* can't be freed, host stack has a reference */
		KASSERT(inp != NULL, ("%s: inp freed", __func__));
	/* Success path: wait for the CPL_PASS_OPEN_RPL before going live. */
	lctx->flags |= LCTX_RPL_PENDING;
/*
 * Stop the hardware listener for this inpcb.  Removes the lctx from the
 * hash, resets any embryonic connections still on the synq, and sends
 * the close-listener request to the chip.
 * NOTE(review): braces/returns and the tail of the function (after
 * destroy_server) are missing from this listing.
 */
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
	struct listen_ctx *lctx;
	struct adapter *sc = tod->tod_softc;
	struct inpcb *inp = tp->t_inpcb;
	struct synq_entry *synqe;

	INP_WLOCK_ASSERT(inp);

	lctx = listen_hash_del(sc, inp);
		return (ENOENT);	/* no hardware listener for this inp */

	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,

	/*
	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
	 * arrive and clean up when it does.
	 */
	if (lctx->flags & LCTX_RPL_PENDING) {
		KASSERT(TAILQ_EMPTY(&lctx->synq),
		    ("%s: synq not empty.", __func__));
		return (EINPROGRESS);

	/*
	 * The host stack will abort all the connections on the listening
	 * socket's so_comp. It doesn't know about the connections on the synq
	 * so we need to take care of those.
	 */
	TAILQ_FOREACH(synqe, &lctx->synq, link) {
		if (synqe->flags & TPF_SYNQE_HAS_L2TE)
			send_reset_synqe(tod, synqe);

	destroy_server(sc, lctx);
645 hold_synqe(struct synq_entry *synqe)
648 refcount_acquire(&synqe->refcnt);
/*
 * Drop a reference on a synq entry; the last release tears it down.
 * NOTE(review): lines are missing between needfree and free() — the
 * original presumably frees synqe->syn and only calls free() when the
 * entry was malloc'ed (TPF_SYNQE_NEEDFREE).  Confirm before editing.
 */
release_synqe(struct synq_entry *synqe)
	if (refcount_release(&synqe->refcnt)) {
		int needfree = synqe->flags & TPF_SYNQE_NEEDFREE;
		free(synqe, M_CXGBE);
/*
 * Syncache callback: an entry backed by this synqe was added.
 * NOTE(review): the body's action (presumably hold_synqe(synqe)) is
 * missing from this listing.
 */
t4_syncache_added(struct toedev *tod __unused, void *arg)
	struct synq_entry *synqe = arg;
673 t4_syncache_removed(struct toedev *tod __unused, void *arg)
675 struct synq_entry *synqe = arg;
677 release_synqe(synqe);
/*
 * Syncache callback: send the SYN|ACK for this embryonic connection.
 * The pre-built PASS_ACCEPT_RPL wrqe is claimed atomically from the
 * synqe, the peer's ISS/timestamp are saved from the SYN, and the reply
 * is sent via the L2T entry.
 * NOTE(review): declarations of wr/th/to/e/rc, several braces, and the
 * failure/return paths are missing from this listing.
 */
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct ip *ip = mtod(m, struct ip *);

	/* Claim the prepared reply exactly once. */
	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
	if (ip->ip_v == IPVERSION)
		th = (void *)(ip + 1);
		th = (void *)((struct ip6_hdr *)ip + 1);
	bzero(&to, sizeof(to));
	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),

	/* save these for later */
	synqe->iss = be32toh(th->th_seq);
	synqe->ts = to.to_tsval;

	/* T5+ supplies the kernel-chosen ISS in the CPL itself. */
	if (chip_id(sc) >= CHELSIO_T5) {
		struct cpl_t5_pass_accept_rpl *rpl5 = wrtod(wr);

		rpl5->iss = th->th_seq;
	e = &sc->l2t->l2tab[synqe->l2e_idx];
	t4_l2t_send(sc, wr, e);

	m_freem(m);	/* don't need this any more */
/*
 * CPL handler: reply to our CPL_PASS_OPEN_REQ(6).  Clears the
 * RPL-pending flag and reconciles the result with the current state of
 * the listening inpcb (it may have been dropped while we waited).
 * NOTE(review): the INP_WLOCK, several braces/returns and the tail of
 * the function are missing from this listing.
 */
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
	struct adapter *sc = iq->adapter;
	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));

	KASSERT(opcode == CPL_PASS_OPEN_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
	    __func__, stid, status, lctx->flags);

	lctx->flags &= ~LCTX_RPL_PENDING;

	if (status != CPL_ERR_NONE)
		log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

	/*
	 * If the inp has been dropped (listening socket closed) then
	 * listen_stop must have run and taken the inp out of the hash.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		KASSERT(listen_hash_del(sc, inp) == NULL,
		    ("%s: inp %p still in listen hash", __func__, inp));

	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
		if (release_lctx(sc, lctx) != NULL)

	/*
	 * Listening socket stopped listening earlier and now the chip tells us
	 * it has started the hardware listener. Stop it; the lctx will be
	 * released in do_close_server_rpl.
	 */
	if (inp->inp_flags & INP_DROPPED) {
		destroy_server(sc, lctx);

	/*
	 * Failed to start hardware listener. Take inp out of the hash and
	 * release our reference on it. An error message has been logged
	 */
	if (status != CPL_ERR_NONE) {
		listen_hash_del(sc, inp);
		if (release_lctx(sc, lctx) != NULL)

	/* hardware listener open for business */
/*
 * CPL handler: reply to our CPL_CLOSE_LISTSRV_REQ.  On success, drops
 * the lctx's last reference (and the inp along with it, if appropriate).
 * NOTE(review): the INP_WLOCK around release_lctx and the function's
 * returns are missing from this listing.
 */
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
	struct adapter *sc = iq->adapter;
	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
	int stid = GET_TID(cpl);
	unsigned int status = cpl->status;
	struct listen_ctx *lctx = lookup_stid(sc, stid);
	struct inpcb *inp = lctx->inp;
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));

	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

	if (status != CPL_ERR_NONE) {
		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
		    __func__, status, stid);

	inp = release_lctx(sc, lctx);
/*
 * Final disposal of a synq entry: unlink it from the lctx's synq, drop
 * the lctx hold, return the tid(s) to the adapter, and drop the synq
 * list's reference.  IPv6 embryonic connections hold 2 tids, IPv4 holds 1.
 * NOTE(review): the declaration of ntids, the L2T-entry release guard,
 * and the inp unlock are missing from this listing.
 */
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct vi_info *vi = synqe->syn->m_pkthdr.rcvif->if_softc;
	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];

	INP_WLOCK_ASSERT(inp);
	ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;

	TAILQ_REMOVE(&lctx->synq, synqe, link);
	inp = release_lctx(sc, lctx);
	remove_tid(sc, synqe->tid, ntids);
	release_tid(sc, synqe->tid, &sc->sge.ctrlq[vi->pi->port_id]);
	release_synqe(synqe);	/* removed from synq list */
/*
 * CPL handler: peer (or chip) aborted an embryonic connection.  Ignores
 * negative advice; otherwise tears the synqe down (unless we already
 * initiated an abort, in which case the abort reply does the cleanup)
 * and always acks the chip with a CPL_ABORT_RPL.
 * NOTE(review): the txqid declaration, INP_WLOCK, and several
 * braces/else/return lines are missing from this listing.
 */
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	struct sge_wrq *ofld_txq;
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));

	KASSERT(opcode == CPL_ABORT_REQ_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	if (negative_advice(cpl->status))
		return (0);	/* Ignore negative advice */

	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
	ofld_txq = &sc->sge.ofld_txq[txqid];

	/*
	 * If we'd initiated an abort earlier the reply to it is responsible for
	 * cleaning up resources. Otherwise we tear everything down right here
	 * right now. We owe the T4 a CPL_ABORT_RPL no matter what.
	 */
	if (synqe->flags & TPF_ABORT_SHUTDOWN) {
		done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */
	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
/*
 * CPL handler: reply to an abort we initiated on a synq entry; finishes
 * the teardown started by send_reset_synqe().
 * NOTE(review): the INP_WLOCK and the function's return are missing from
 * this listing.
 */
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
	struct adapter *sc = iq->adapter;
	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
	unsigned int tid = GET_TID(cpl);
	struct synq_entry *synqe = lookup_tid(sc, tid);
	struct listen_ctx *lctx = synqe->lctx;
	struct inpcb *inp = lctx->inp;
	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));

	KASSERT(opcode == CPL_ABORT_RPL_RSS,
	    ("%s: unexpected opcode 0x%x", __func__, opcode));
	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

	KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
	    __func__, synqe, synqe->flags));

	done_with_synqe(sc, synqe);
	/* inp lock released by done_with_synqe */
/*
 * TOE callback run when the host stack accepts an offloaded embryonic
 * connection: attach the toepcb (stashed in the mbuf after the
 * CPL_PASS_ESTABLISH) to the new socket, move the TCP state to
 * ESTABLISHED, and re-point the tid at the toepcb.
 * NOTE(review): braces and the tp/so locals are missing from this
 * listing.
 */
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
	struct adapter *sc = tod->tod_softc;
	struct synq_entry *synqe = arg;
	struct inpcb *inp = sotoinpcb(so);
	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
	struct toepcb *toep = *(struct toepcb **)(cpl + 1);

	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);	/* prevents bad race with accept() */
	INP_WLOCK_ASSERT(inp);
	KASSERT(synqe->flags & TPF_SYNQE,
	    ("%s: %p not a synq_entry?", __func__, arg));

	offload_socket(so, toep);
	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
	toep->flags |= TPF_CPL_PENDING;
	update_tid(sc, synqe->tid, toep);	/* tid now maps to the toepcb */
	synqe->flags |= TPF_SYNQE_EXPANDED;
/*
 * Pick the offload tx and rx queues for this embryonic connection (from
 * the policy settings when valid, otherwise at random) and stash both
 * ids in the SYN mbuf's flowid: txq in the high 16 bits, rxq in the low
 * 16.  Retrieved later by get_qids_from_mbuf().
 * NOTE(review): the branches that use s->txq / s->rxq directly (and the
 * matching else lines) are missing from this listing.
 */
save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi,
    struct offload_settings *s)
	uint32_t txqid, rxqid;

	if (s->txq >= 0 && s->txq < vi->nofldtxq)
		txqid = arc4random() % vi->nofldtxq;
	txqid += vi->first_ofld_txq;

	if (s->rxq >= 0 && s->rxq < vi->nofldrxq)
		rxqid = arc4random() % vi->nofldrxq;
	rxqid += vi->first_ofld_rxq;

	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
981 get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
985 *txqid = m->m_pkthdr.flowid >> 16;
987 *rxqid = m->m_pkthdr.flowid & 0xffff;
/*
 * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
 * store some state temporarily.
 * Falls back to malloc (flagged TPF_SYNQE_NEEDFREE) when the mbuf's
 * trailing space is too small.
 * NOTE(review): the tspace-vs-len comparison, malloc NULL check,
 * refcount init and return are missing from this listing.
 */
static struct synq_entry *
mbuf_to_synqe(struct mbuf *m)
	int len = roundup2(sizeof (struct synq_entry), 8);
	int tspace = M_TRAILINGSPACE(m);
	struct synq_entry *synqe = NULL;

	synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
	synqe->flags = TPF_SYNQE | TPF_SYNQE_NEEDFREE;
	/* In-mbuf placement: use the (8-byte aligned) tail of the cluster. */
	synqe = (void *)(m->m_data + m->m_len + tspace - len);
	synqe->flags = TPF_SYNQE;
/*
 * Translate the TCP options the chip parsed out of the SYN
 * (struct tcp_options in the CPL) into the host stack's struct tcpopt.
 * NOTE(review): the guarding conditions for each option (mss, wsf,
 * tstamp, sack) are missing from this listing — each flag/value pair
 * below is set conditionally in the original.
 */
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
	bzero(to, sizeof(*to));

		to->to_flags |= TOF_MSS;
		to->to_mss = be16toh(t4opt->mss);

		to->to_flags |= TOF_SCALE;
		to->to_wscale = t4opt->wsf;

		to->to_flags |= TOF_TS;

		to->to_flags |= TOF_SACKPERM;
1037 * Options2 for passive open.
1040 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
1041 const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode,
1042 struct cc_algo *cc, const struct offload_settings *s)
1044 struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
1048 * rx flow control, rx coalesce, congestion control, and tx pace are all
1049 * explicitly set by the driver. On T5+ the ISS is also set by the
1050 * driver to the value picked by the kernel.
1053 opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
1054 opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
1056 opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */
1057 opt2 |= F_T5_ISS; /* ISS provided in CPL */
1060 if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323)))
1063 if (tcpopt->tstamp &&
1064 (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
1065 opt2 |= F_TSTAMPS_EN;
1067 if (tcpopt->wsf < 15 && V_tcp_do_rfc1323)
1068 opt2 |= F_WND_SCALE_EN;
1070 if (th->th_flags & (TH_ECE | TH_CWR) &&
1071 (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
1072 opt2 |= F_CCTRL_ECN;
1074 /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
1076 opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
1078 /* These defaults are subject to ULP specific fixups later. */
1079 opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
1083 if (s->cong_algo >= 0)
1084 opt2 |= V_CONG_CNTRL(s->cong_algo);
1085 else if (sc->tt.cong_algorithm >= 0)
1086 opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL);
1088 if (strcasecmp(cc->name, "reno") == 0)
1089 opt2 |= V_CONG_CNTRL(CONG_ALG_RENO);
1090 else if (strcasecmp(cc->name, "tahoe") == 0)
1091 opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE);
1092 if (strcasecmp(cc->name, "newreno") == 0)
1093 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
1094 if (strcasecmp(cc->name, "highspeed") == 0)
1095 opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED);
1098 * Use newreno in case the algorithm selected by the
1099 * host stack is not supported by the hardware.
1101 opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO);
1105 if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce))
1106 opt2 |= V_RX_COALESCE(M_RX_COALESCE);
1108 /* Note that ofld_rxq is already set according to s->rxq. */
1109 opt2 |= F_RSS_QUEUE_VALID;
1110 opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id);
1112 #ifdef USE_DDP_RX_FLOW_CONTROL
1113 if (ulp_mode == ULP_MODE_TCPDDP)
1114 opt2 |= F_RX_FC_DDP;
1117 if (ulp_mode == ULP_MODE_TLS) {
1118 opt2 &= ~V_RX_COALESCE(M_RX_COALESCE);
1119 opt2 |= F_RX_FC_DISABLE;
1122 return (htobe32(opt2));
/*
 * Parse the wire headers the chip echoed back in a CPL_PASS_ACCEPT_REQ
 * into an in_conninfo (addresses/ports) and a host-order tcphdr.
 * Header lengths are decoded differently on T6+ vs earlier chips.
 * NOTE(review): the l3hdr declaration and several brace/else lines are
 * missing from this listing.
 */
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th)
	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
	const struct ether_header *eh;
	unsigned int hlen = be32toh(cpl->hdr_len);
	const struct tcphdr *tcp;

	eh = (const void *)(cpl + 1);
	if (chip_id(sc) >= CHELSIO_T6) {
		l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
		l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
		tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));

	bzero(inc, sizeof(*inc));
	inc->inc_fport = tcp->th_sport;	/* kept in network byte order */
	inc->inc_lport = tcp->th_dport;
	if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
		const struct ip *ip = (const void *)l3hdr;

		inc->inc_faddr = ip->ip_src;
		inc->inc_laddr = ip->ip_dst;
		const struct ip6_hdr *ip6 = (const void *)l3hdr;

		inc->inc_flags |= INC_ISIPV6;
		inc->inc6_faddr = ip6->ip6_src;
		inc->inc6_laddr = ip6->ip6_dst;

	bcopy(tcp, th, sizeof(*th));
	tcp_fields_to_host(th);		/* just like tcp_input */
/*
 * Resolve the L2 (next-hop) entry to use when replying to the peer in
 * inc: do a FIB lookup toward the peer, verify the route egresses via
 * ifp, and get an L2T entry for the next-hop address.  Link-local IPv6
 * peers skip the route lookup.
 * NOTE(review): several brace/else/return-NULL lines (including the
 * scope-id handling for link-local) are missing from this listing.
 */
static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
	struct l2t_entry *e;
	struct sockaddr_in6 sin6;
	struct sockaddr *dst = (void *)&sin6;

	if (inc->inc_flags & INC_ISIPV6) {
		struct nhop6_basic nh6;

		bzero(dst, sizeof(struct sockaddr_in6));
		dst->sa_len = sizeof(struct sockaddr_in6);
		dst->sa_family = AF_INET6;

		if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
			/* no need for route lookup */
			e = t4_l2t_get(pi, ifp, dst);

		if (fib6_lookup_nh_basic(RT_DEFAULT_FIB, &inc->inc6_faddr,
		    0, 0, 0, &nh6) != 0)
		if (nh6.nh_ifp != ifp)
		((struct sockaddr_in6 *)dst)->sin6_addr = nh6.nh_addr;
		struct nhop4_basic nh4;

		dst->sa_len = sizeof(struct sockaddr_in);
		dst->sa_family = AF_INET;

		if (fib4_lookup_nh_basic(RT_DEFAULT_FIB, inc->inc_faddr, 0, 0,
		if (nh4.nh_ifp != ifp)
		((struct sockaddr_in *)dst)->sin_addr = nh4.nh_addr;

	e = t4_l2t_get(pi, ifp, dst);
/*
 * Record __LINE__ as the reason for rejecting a PASS_ACCEPT_REQ and bail
 * out of do_pass_accept_req.
 * NOTE(review): the rest of the macro body (the goto and the closing
 * "} while (0)") is missing from this listing.
 */
#define REJECT_PASS_ACCEPT() do { \
	reject_reason = __LINE__; \
/*
 * The context associated with a tid entry via insert_tid could be a synq_entry
 * or a toepcb. The only way CPL handlers can tell is via a bit in these flags.
 * The compile-time assert guarantees the flags word sits at the same
 * offset in both structures so the bit can be tested blindly.
 */
CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
/*
 * do_pass_accept_req: CPL handler for an incoming SYN that matched one of
 * the hardware listening servers.  Runs a series of eligibility checks and
 * either offloads the embryonic connection (inserting a synq_entry for the
 * tid and feeding the SYN to the kernel syncache) or rejects it via
 * REJECT_PASS_ACCEPT(), after which the SYN is handed to the normal
 * non-offloaded stack through hw_ifp->if_input at the bottom.
 * Returns the reject reason (a __LINE__ value; 0/“accepted” path presumably
 * returns via an elided branch — confirm in full source).
 */
1225 * Incoming SYN on a listening socket.
1227 * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
1231 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
1234 struct adapter *sc = iq->adapter;
1236 const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
1237 struct cpl_pass_accept_rpl *rpl;
1239 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1240 unsigned int tid = GET_TID(cpl);
1241 struct listen_ctx *lctx = lookup_stid(sc, stid);
1244 struct in_conninfo inc;
1247 struct port_info *pi;
1249 struct ifnet *hw_ifp, *ifp;
1250 struct l2t_entry *e = NULL;
1251 int rscale, mtu_idx, rx_credits, rxqid, ulp_mode;
1252 struct synq_entry *synqe = NULL;
1253 int reject_reason, v, ntids;
1256 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1258 struct offload_settings settings;
1260 KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
1261 ("%s: unexpected opcode 0x%x", __func__, opcode));
1262 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1264 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
/* Reconstruct the IP/TCP headers and TCP options from the CPL payload. */
1267 pass_accept_req_to_protohdrs(sc, m, &inc, &th);
1268 t4opt_to_tcpopt(&cpl->tcpopt, &to);
1270 pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
/* All checks below run in the vnet of the listening socket. */
1272 CURVNET_SET(lctx->vnet);
1275 * Use the MAC index to lookup the associated VI. If this SYN
1276 * didn't match a perfect MAC filter, punt.
1278 if (!(be16toh(cpl->l2info) & F_SYN_XACT_MATCH)) {
1281 REJECT_PASS_ACCEPT();
1283 for_each_vi(pi, v, vi) {
1284 if (vi->xact_addr_filt == G_SYN_MAC_IDX(be16toh(cpl->l2info)))
/* No VI on this port claims the MAC index the SYN matched. */
1289 REJECT_PASS_ACCEPT();
1292 hw_ifp = vi->ifp; /* the (v)cxgbeX ifnet */
1293 m->m_pkthdr.rcvif = hw_ifp;
1294 tod = TOEDEV(hw_ifp);
1297 * Figure out if there is a pseudo interface (vlan, lagg, etc.)
1298 * involved. Don't offload if the SYN had a VLAN tag and the vid
1299 * doesn't match anything on this interface.
1301 * XXX: lagg support, lagg + vlan support.
1303 vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1305 ifp = VLAN_DEVAT(hw_ifp, vid);
1307 REJECT_PASS_ACCEPT();
1312 * Don't offload if the peer requested a TCP option that's not known to
1315 if (cpl->tcpopt.unknown)
1316 REJECT_PASS_ACCEPT();
/* Per-address-family checks: TOE ifcap enabled and SYN addressed to us. */
1318 if (inc.inc_flags & INC_ISIPV6) {
1320 /* Don't offload if the ifcap isn't enabled */
1321 if ((ifp->if_capenable & IFCAP_TOE6) == 0)
1322 REJECT_PASS_ACCEPT();
1325 * SYN must be directed to an IP6 address on this ifnet. This
1326 * is more restrictive than in6_localip.
1328 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr))
1329 REJECT_PASS_ACCEPT();
1334 /* Don't offload if the ifcap isn't enabled */
1335 if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1336 REJECT_PASS_ACCEPT();
1339 * SYN must be directed to an IP address on this ifnet. This
1340 * is more restrictive than in_localip.
1342 if (!in_ifhasaddr(ifp, inc.inc_laddr))
1343 REJECT_PASS_ACCEPT();
1349 * Don't offload if the ifnet that the SYN came in on is not in the same
1350 * vnet as the listening socket.
1352 if (lctx->vnet != ifp->if_vnet)
1353 REJECT_PASS_ACCEPT();
/* L2 table entry for the return traffic to the peer's next hop. */
1355 e = get_l2te_for_nexthop(pi, ifp, &inc);
1357 REJECT_PASS_ACCEPT();
/* synq entry for the embryonic connection (see mbuf_to_synqe), and the
 * work request that will carry our CPL_PASS_ACCEPT_RPL reply. */
1359 synqe = mbuf_to_synqe(m);
1361 REJECT_PASS_ACCEPT();
1363 wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
1364 sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[pi->port_id]);
1366 REJECT_PASS_ACCEPT();
1369 INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */
1371 /* Don't offload if the 4-tuple is already in use */
1372 if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1373 INP_INFO_RUNLOCK(&V_tcbinfo);
1375 REJECT_PASS_ACCEPT();
1377 INP_INFO_RUNLOCK(&V_tcbinfo);
1379 inp = lctx->inp; /* listening socket, not owned by TOE */
1382 /* Don't offload if the listening socket has closed */
1383 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1385 * The listening socket has closed. The reply from the TOE to
1386 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
1387 * resources tied to this listen context.
1391 REJECT_PASS_ACCEPT();
1393 so = inp->inp_socket;
/* Consult the administrator's offload policy for passive opens. */
1394 rw_rlock(&sc->policy_lock);
1395 settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp);
1396 rw_runlock(&sc->policy_lock);
1397 if (!settings.offload) {
1400 REJECT_PASS_ACCEPT();
/* Pick hardware connection parameters: MTU index, receive window scale,
 * and initial rcv_bufsiz credits (receive window in 1KB units, capped at
 * M_RCV_BUFSIZ — see the opt0 note below). */
1403 mtu_idx = find_best_mtu_idx(sc, &inc, &settings);
1404 rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
1405 SOCKBUF_LOCK(&so->so_rcv);
1406 /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1407 rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
1408 SOCKBUF_UNLOCK(&so->so_rcv);
1410 save_qids_in_mbuf(m, vi, &settings);
1411 get_qids_from_mbuf(m, NULL, &rxqid);
/* Build the CPL_PASS_ACCEPT_RPL (T4 layout, or the larger T5 layout). */
1414 INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1416 struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;
1418 INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
/* Record the chosen ULP mode (DDP/TLS) in the synqe flags so that
 * do_pass_establish can apply the same mode to the final toepcb. */
1420 ulp_mode = select_ulp_mode(so, sc, &settings);
1422 case ULP_MODE_TCPDDP:
1423 synqe->flags |= TPF_SYNQE_TCPDDP;
1426 synqe->flags |= TPF_SYNQE_TLS;
1429 rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode,
1431 rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode,
1432 CC_ALGO(intotcpcb(inp)), &settings);
/* Initialize the synq entry and publish it under the tid. */
1438 refcount_init(&synqe->refcnt, 1); /* 1 means extra hold */
1439 synqe->l2e_idx = e->idx;
1440 synqe->rcv_bufsize = rx_credits;
1441 atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
1443 insert_tid(sc, tid, synqe, ntids);
1444 TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
1445 hold_synqe(synqe); /* hold for the duration it's in the synq */
1446 hold_lctx(lctx); /* A synqe on the list has a ref on its lctx */
1449 * If all goes well t4_syncache_respond will get called during
1450 * syncache_add. Note that syncache_add releases the pcb lock.
1452 toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1453 INP_UNLOCK_ASSERT(inp); /* ok to assert, we have a ref on the inp */
1456 * If we replied during syncache_add (synqe->wr has been consumed),
1457 * good. Otherwise, set it to 0 so that further syncache_respond
1458 * attempts by the kernel will be ignored.
1460 if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
/* The reply was NOT sent: unwind the tid/synq state set up above,
 * duplicate the SYN so the reject path can hand it to the stack. */
1463 * syncache may or may not have a hold on the synqe, which may
1464 * or may not be stashed in the original SYN mbuf passed to us.
1465 * Just copy it over instead of dealing with all possibilities.
1467 m = m_dup(synqe->syn, M_NOWAIT);
1469 m->m_pkthdr.rcvif = hw_ifp;
1471 remove_tid(sc, synqe->tid, ntids);
1474 /* Yank the synqe out of the lctx synq. */
1476 TAILQ_REMOVE(&lctx->synq, synqe, link);
1477 release_synqe(synqe); /* removed from synq list */
1478 inp = release_lctx(sc, lctx);
1482 release_synqe(synqe); /* extra hold */
1483 REJECT_PASS_ACCEPT();
1486 CTR6(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK mode %d",
1487 __func__, stid, tid, lctx, synqe, ulp_mode);
1490 synqe->flags |= TPF_SYNQE_HAS_L2TE;
1491 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1493 * Listening socket closed but tod_listen_stop did not abort
1494 * this tid because there was no L2T entry for the tid at that
1495 * time. Abort it now. The reply to the abort will clean up.
1498 "%s: stid %u, tid %u, lctx %p, synqe %p (0x%x), ABORT",
1499 __func__, stid, tid, lctx, synqe, synqe->flags);
1500 if (!(synqe->flags & TPF_SYNQE_EXPANDED))
1501 send_reset_synqe(tod, synqe);
1505 release_synqe(synqe); /* extra hold */
1511 release_synqe(synqe); /* extra hold */
/* Reject path: log, free the tid, and pass the SYN to the regular stack
 * with csum flags set so the stack does not re-verify checksums the
 * hardware already validated. */
1515 CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
1520 release_tid(sc, tid, lctx->ctrlq);
1522 if (__predict_true(m != NULL)) {
1523 m_adj(m, sizeof(*cpl));
1524 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1525 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1526 m->m_pkthdr.csum_data = 0xffff;
1527 hw_ifp->if_input(hw_ifp, m);
1530 return (reject_reason);
/*
 * synqe_to_protohdrs: synthesize the in_conninfo/tcphdr/tcpopt that the
 * final ACK of the 3-way handshake would have carried, starting from the
 * saved SYN in the synqe and the fields of the CPL_PASS_ESTABLISH.  The
 * result is what do_pass_establish feeds to syncache_expand.
 */
1534 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
1535 const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1536 struct tcphdr *th, struct tcpopt *to)
1538 uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1540 /* start off with the original SYN */
1541 pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th);
1543 /* modify parts to make it look like the ACK to our SYN|ACK */
1544 th->th_flags = TH_ACK;
1545 th->th_ack = synqe->iss + 1;
1546 th->th_seq = be32toh(cpl->rcv_isn);
1547 bzero(to, sizeof(*to));
/* Carry the timestamp option through when the hardware negotiated it;
 * the echo reply is the timestamp we stashed in the synqe. */
1548 if (G_TCPOPT_TSTAMP(tcp_opt)) {
1549 to->to_flags |= TOF_TS;
1550 to->to_tsecr = synqe->ts;
/*
 * do_pass_establish: CPL handler that fires when the hardware has completed
 * the 3-way handshake for an offloaded passive open.  Converts the
 * synq_entry for the tid into a full toepcb, expands the syncache entry
 * into a real connection/socket, and installs the TOE hooks on it.
 */
1555 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1558 struct adapter *sc = iq->adapter;
1561 const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1562 #if defined(KTR) || defined(INVARIANTS)
1563 unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1565 unsigned int tid = GET_TID(cpl);
1566 struct synq_entry *synqe = lookup_tid(sc, tid);
1567 struct listen_ctx *lctx = synqe->lctx;
1568 struct inpcb *inp = lctx->inp, *new_inp;
1572 struct in_conninfo inc;
1573 struct toepcb *toep;
1576 unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1579 KASSERT(opcode == CPL_PASS_ESTABLISH,
1580 ("%s: unexpected opcode 0x%x", __func__, opcode));
1581 KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1582 KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1583 KASSERT(synqe->flags & TPF_SYNQE,
1584 ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1586 CURVNET_SET(lctx->vnet);
1587 INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */
1591 "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1592 __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
/* Listener went away while the handshake was in flight: bail out.  If an
 * L2T entry had been installed for this tid, it must already be aborting
 * (tod_listen_stop's responsibility — asserted below). */
1594 if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1596 if (synqe->flags & TPF_SYNQE_HAS_L2TE) {
1597 KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
1598 ("%s: listen socket closed but tid %u not aborted.",
1603 INP_INFO_RUNLOCK(&V_tcbinfo);
1608 ifp = synqe->syn->m_pkthdr.rcvif;
1610 KASSERT(vi->pi->adapter == sc,
1611 ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
/* Queue ids were stashed in the SYN mbuf by do_pass_accept_req; this CPL
 * must arrive on the offload rxq that was chosen back then. */
1613 get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
1614 KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1615 ("%s: CPL arrived on unexpected rxq. %d %d", __func__, rxqid,
1616 (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1618 toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT);
/* toepcb allocation failed: reset the embryonic connection instead. */
1622 * The reply to this abort will perform final cleanup. There is
1623 * no need to check for HAS_L2TE here. We can be here only if
1624 * we responded to the PASS_ACCEPT_REQ, and our response had the
1627 send_reset_synqe(TOEDEV(ifp), synqe);
1629 INP_INFO_RUNLOCK(&V_tcbinfo);
/* Transfer connection state from the synqe to the new toepcb: L2T entry,
 * ULP mode chosen at accept time, and the initial rx credit count. */
1634 toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
1635 if (synqe->flags & TPF_SYNQE_TCPDDP)
1636 set_ulp_mode(toep, ULP_MODE_TCPDDP);
1637 else if (synqe->flags & TPF_SYNQE_TLS)
1638 set_ulp_mode(toep, ULP_MODE_TLS);
1640 set_ulp_mode(toep, ULP_MODE_NONE);
1641 /* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1642 toep->rx_credits = synqe->rcv_bufsize;
1644 so = inp->inp_socket;
1645 KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1647 /* Come up with something that syncache_expand should be ok with. */
1648 synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
1651 * No more need for anything in the mbuf that carried the
1652 * CPL_PASS_ACCEPT_REQ. Drop the CPL_PASS_ESTABLISH and toep pointer
1653 * there. XXX: bad form but I don't want to increase the size of synqe.
1656 KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
1657 ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
1658 bcopy(cpl, mtod(m, void *), sizeof(*cpl));
1659 *(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
1661 if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1666 /* New connection inpcb is already locked by syncache_expand(). */
1667 new_inp = sotoinpcb(so);
1668 INP_WLOCK_ASSERT(new_inp);
1669 MPASS(so->so_vnet == lctx->vnet);
1670 toep->vnet = lctx->vnet;
/* IPv6: take a reference on the local IP via hold_lip for this
 * connection (released when the connection goes away — confirm in the
 * teardown path, not visible here). */
1671 if (inc.inc_flags & INC_ISIPV6)
1672 toep->ce = hold_lip(sc->tom_softc, &inc.inc6_laddr, lctx->ce);
1675 * This is for the unlikely case where the syncache entry that we added
1676 * has been evicted from the syncache, but the syncache_expand above
1677 * works because of syncookies.
1679 * XXX: we've held the tcbinfo lock throughout so there's no risk of
1680 * anyone accept'ing a connection before we've installed our hooks, but
1681 * this somewhat defeats the purpose of having a tod_offload_socket :-(
1683 if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
1684 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
1685 t4_offload_socket(TOEDEV(ifp), synqe, so);
1688 INP_WUNLOCK(new_inp);
1690 /* Done with the synqe */
1691 TAILQ_REMOVE(&lctx->synq, synqe, link);
1692 inp = release_lctx(sc, lctx);
1695 INP_INFO_RUNLOCK(&V_tcbinfo);
1697 release_synqe(synqe);
/*
 * Register the CPL handlers for the passive-open (listen) side: server
 * open/close replies, the incoming-SYN event, and handshake completion.
 */
1703 t4_init_listen_cpl_handlers(void)
1706 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1707 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1708 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1709 t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
/*
 * Unregister the listen-side CPL handlers registered by
 * t4_init_listen_cpl_handlers (a NULL handler removes the registration).
 */
1713 t4_uninit_listen_cpl_handlers(void)
1716 t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
1717 t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
1718 t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
1719 t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);