2 * Copyright (c) 2012 Chelsio Communications, Inc.
4 * Written by: Navdeep Parhar <np@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/kernel.h>
38 #include <sys/module.h>
39 #include <sys/protosw.h>
40 #include <sys/refcount.h>
41 #include <sys/domain.h>
42 #include <sys/fnv_hash.h>
43 #include <sys/socket.h>
44 #include <sys/socketvar.h>
45 #include <net/ethernet.h>
47 #include <net/if_types.h>
48 #include <net/if_vlan_var.h>
49 #include <net/route.h>
50 #include <netinet/in.h>
51 #include <netinet/in_pcb.h>
52 #include <netinet/ip.h>
53 #include <netinet/tcp_var.h>
55 #include <netinet/tcp_fsm.h>
56 #include <netinet/toecore.h>
58 #include "common/common.h"
59 #include "common/t4_msg.h"
60 #include "common/t4_regs.h"
61 #include "tom/t4_tom_l2t.h"
62 #include "tom/t4_tom.h"
/*
 * Forward declarations for the static helpers in this file: server-TID
 * (stid) management, listen-context (lctx) lifecycle and hashing, mbuf
 * queue-id stashing, and synq entry teardown.
 */
65 static int alloc_stid(struct adapter *, void *);
66 static void *lookup_stid(struct adapter *, int);
67 static void free_stid(struct adapter *, int);
70 static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
72 static int free_lctx(struct adapter *, struct listen_ctx *);
73 static void hold_lctx(struct listen_ctx *);
74 static void listen_hash_add(struct adapter *, struct listen_ctx *);
75 static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
76 static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
77 static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);
79 static inline void save_qids_in_mbuf(struct mbuf *, struct port_info *);
80 static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *);
81 static void send_reset_synqe(struct toedev *, struct synq_entry *);
83 /* XXX: won't work for IPv6 */
/*
 * alloc_stid: take a server TID off the adapter's stid free list.  The
 * stid value is the entry's index within stid_tab.  Serialized by
 * t->stid_lock.  'ctx' is presumably stored in the entry's data field
 * (it is what lookup_stid() returns) -- assignment not visible here,
 * confirm against the full source.
 */
85 alloc_stid(struct adapter *sc, void *ctx)
87 	struct tid_info *t = &sc->tids;
90 	mtx_lock(&t->stid_lock);
/* Head of the free list; stid is its offset from the table base. */
92 		union serv_entry *p = t->sfree;
94 		stid = p - t->stid_tab;
100 	mtx_unlock(&t->stid_lock);
/*
 * lookup_stid: return the context pointer that was associated with this
 * stid.  Note the table is indexed relative to t->stid_base.
 */
105 lookup_stid(struct adapter *sc, int stid)
107 	struct tid_info *t = &sc->tids;
109 	return (t->stid_tab[stid - t->stid_base].data);
/*
 * free_stid: return a server TID to the free list.  Serialized by
 * t->stid_lock; the list-insertion itself is not visible in this
 * excerpt.
 */
113 free_stid(struct adapter *sc, int stid)
115 	struct tid_info *t = &sc->tids;
116 	union serv_entry *p = &t->stid_tab[stid - t->stid_base];
118 	mtx_lock(&t->stid_lock);
122 	mtx_unlock(&t->stid_lock);
/*
 * alloc_lctx: allocate and initialize a listen context for the given
 * listening inpcb, bound to port 'pi'.  Acquires an stid, picks the
 * port's control queue and first offload rx queue, and starts with a
 * refcount of 1.  Called with the inp write-locked.  Returns NULL on
 * allocation/stid failure (error paths not visible in this excerpt).
 */
125 static struct listen_ctx *
126 alloc_lctx(struct adapter *sc, struct inpcb *inp, struct port_info *pi)
128 	struct listen_ctx *lctx;
130 	INP_WLOCK_ASSERT(inp);
132 	lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
136 	lctx->stid = alloc_stid(sc, lctx);
137 	if (lctx->stid < 0) {
/* CPLs for this listener go out on the port's ctrlq and replies
 * arrive on its first offload rx queue. */
142 	lctx->ctrlq = &sc->sge.ctrlq[pi->port_id];
143 	lctx->ofld_rxq = &sc->sge.ofld_rxq[pi->first_ofld_rxq];
144 	refcount_init(&lctx->refcount, 1);
145 	TAILQ_INIT(&lctx->synq);
153 /* Don't call this directly, use release_lctx instead */
/*
 * free_lctx: tear down a listen context whose refcount has dropped to
 * zero.  Frees the stid and drops the lctx's reference on the inp.
 * Returns the result of in_pcbrele_wlocked() (non-zero if the inp was
 * freed), so the caller knows whether the inp lock is still valid.
 */
155 free_lctx(struct adapter *sc, struct listen_ctx *lctx)
157 	struct inpcb *inp = lctx->inp;
159 	INP_WLOCK_ASSERT(inp);
160 	KASSERT(lctx->refcount == 0,
161 	    ("%s: refcount %d", __func__, lctx->refcount));
162 	KASSERT(TAILQ_EMPTY(&lctx->synq),
163 	    ("%s: synq not empty.", __func__));
164 	KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));
166 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
167 	    __func__, lctx->stid, lctx, lctx->inp);
169 	free_stid(sc, lctx->stid);
172 	return (in_pcbrele_wlocked(inp));
/* hold_lctx: take an additional reference on the listen context. */
176 hold_lctx(struct listen_ctx *lctx)
179 	refcount_acquire(&lctx->refcount);
/*
 * listen_hashfn: hash a pointer key (the listening inpcb) into a bucket
 * index.  Note it hashes the pointer value itself (&key, sizeof(key)),
 * not what it points to.
 */
182 static inline uint32_t
183 listen_hashfn(void *key, u_long mask)
186 	return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
190  * Add a listen_ctx entry to the listen hash table.
/* Keyed by the lctx's inpcb pointer; protected by td->lctx_hash_lock. */
193 listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
195 	struct tom_data *td = sc->tom_softc;
196 	int bucket = listen_hashfn(lctx->inp, td->listen_mask);
198 	mtx_lock(&td->lctx_hash_lock);
199 	LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
201 	mtx_unlock(&td->lctx_hash_lock);
205  * Look for the listening socket's context entry in the hash and return it.
/* Returns the matching lctx, or NULL if not present (NULL return path
 * not visible in this excerpt). */
207 static struct listen_ctx *
208 listen_hash_find(struct adapter *sc, struct inpcb *inp)
210 	struct tom_data *td = sc->tom_softc;
211 	int bucket = listen_hashfn(inp, td->listen_mask);
212 	struct listen_ctx *lctx;
214 	mtx_lock(&td->lctx_hash_lock);
215 	LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
216 		if (lctx->inp == inp)
219 	mtx_unlock(&td->lctx_hash_lock);
225  * Removes the listen_ctx structure for inp from the hash and returns it.
/* Returns NULL if no entry for inp exists (return path not visible in
 * this excerpt).  Uses the _SAFE iterator since it removes mid-walk. */
227 static struct listen_ctx *
228 listen_hash_del(struct adapter *sc, struct inpcb *inp)
230 	struct tom_data *td = sc->tom_softc;
231 	int bucket = listen_hashfn(inp, td->listen_mask);
232 	struct listen_ctx *lctx, *l;
234 	mtx_lock(&td->lctx_hash_lock);
235 	LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
236 		if (lctx->inp == inp) {
237 			LIST_REMOVE(lctx, link);
242 	mtx_unlock(&td->lctx_hash_lock);
248  * Releases a hold on the lctx.  Must be called with the listening socket's inp
249  * locked.  The inp may be freed by this function and it returns NULL to
/* On the final reference, free_lctx() runs and may also free the inp;
 * NULL is returned in that case so the caller does not touch it. */
252 static struct inpcb *
253 release_lctx(struct adapter *sc, struct listen_ctx *lctx)
255 	struct inpcb *inp = lctx->inp;
258 	INP_WLOCK_ASSERT(inp);
259 	if (refcount_release(&lctx->refcount))
260 		inp_freed = free_lctx(sc, lctx);
262 	return (inp_freed ? NULL : inp);
/*
 * send_reset_synqe: abort an embryonic (synq) connection by sending the
 * hardware a work request containing a FLOWC WR followed by an
 * ABORT_REQ CPL.  No-op if an abort is already in flight
 * (TPF_ABORT_SHUTDOWN).  Called with the listening inp write-locked.
 */
266 send_reset_synqe(struct toedev *tod, struct synq_entry *synqe)
268 	struct adapter *sc = tod->tod_softc;
269 	struct mbuf *m = synqe->syn;
270 	struct ifnet *ifp = m->m_pkthdr.rcvif;
271 	struct port_info *pi = ifp->if_softc;
272 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
274 	struct fw_flowc_wr *flowc;
275 	struct cpl_abort_req *req;
276 	int txqid, rxqid, flowclen;
277 	struct sge_wrq *ofld_txq;
278 	struct sge_ofld_rxq *ofld_rxq;
279 	const int nparams = 4;
280 	unsigned int pfvf = G_FW_VIID_PFN(pi->viid) << S_FW_VIID_PFN;
282 	INP_WLOCK_ASSERT(synqe->lctx->inp);
284 	CTR4(KTR_CXGBE, "%s: synqe %p, tid %d%s",
285 	    __func__, synqe, synqe->tid,
286 	    synqe_flag(synqe, TPF_ABORT_SHUTDOWN) ?
287 	    " (abort already in progress)" : "");
288 	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN))
289 		return;	/* abort already in progress */
290 	synqe_set_flag(synqe, TPF_ABORT_SHUTDOWN);
/* Tx/rx queue ids were stashed in the SYN mbuf by save_qids_in_mbuf. */
292 	get_qids_from_mbuf(m, &txqid, &rxqid);
293 	ofld_txq = &sc->sge.ofld_txq[txqid];
294 	ofld_rxq = &sc->sge.ofld_rxq[rxqid];
296 	/* The wrqe will have two WRs - a flowc followed by an abort_req */
297 	flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
/* flowc is padded to a full EQ descriptor so the abort_req starts on
 * a descriptor boundary. */
299 	wr = alloc_wrqe(roundup(flowclen, EQ_ESIZE) + sizeof(*req), ofld_txq);
302 		panic("%s: allocation failure.", __func__);
305 	req = (void *)((caddr_t)flowc + roundup(flowclen, EQ_ESIZE));
307 	/* First the flowc ... */
308 	memset(flowc, 0, wr->wr_len);
309 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
310 	    V_FW_FLOWC_WR_NPARAMS(nparams));
311 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
312 	    V_FW_WR_FLOWID(synqe->tid));
313 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
314 	flowc->mnemval[0].val = htobe32(pfvf);
315 	flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
316 	flowc->mnemval[1].val = htobe32(pi->tx_chan);
317 	flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
318 	flowc->mnemval[2].val = htobe32(pi->tx_chan);
319 	flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
320 	flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
321 	synqe_set_flag(synqe, TPF_FLOWC_WR_SENT);
323 	/* ... then ABORT request */
324 	INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
325 	req->rsvd0 = 0;	/* don't have a snd_nxt */
326 	req->rsvd1 = 1;	/* no data sent yet */
327 	req->cmd = CPL_ABORT_SEND_RST;
/* Send via the L2T entry so the WR waits for ARP resolution if needed. */
329 	t4_l2t_send(sc, wr, e);
/*
 * create_server: send a CPL_PASS_OPEN_REQ to the hardware to start a
 * hardware listener for the lctx's local address/port.  SYNs matching
 * it will be delivered to the lctx's offload rx queue (SYN_RSS_QUEUE).
 * Returns non-zero on allocation failure (per the caller's check).
 */
333 create_server(struct adapter *sc, struct listen_ctx *lctx)
336 	struct cpl_pass_open_req *req;
337 	struct in_conninfo *inc = &lctx->inp->inp_inc;
339 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
341 		log(LOG_ERR, "%s: allocation failure", __func__);
347 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
/* inc_lport/inc_laddr are already in network byte order. */
348 	req->local_port = inc->inc_lport;
350 	req->local_ip = inc->inc_laddr.s_addr;
352 	req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
353 	req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
354 	    F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));
/*
 * destroy_server: send a CPL_CLOSE_LISTSRV_REQ to stop the hardware
 * listener for this lctx.  The reply (do_close_server_rpl) performs the
 * final lctx release.  Panics on WR allocation failure.
 */
361 destroy_server(struct adapter *sc, struct listen_ctx *lctx)
364 	struct cpl_close_listsvr_req *req;
366 	wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
369 		panic("%s: allocation failure.", __func__);
374 	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
/* Direct the reply to this listener's offload rx queue. */
376 	req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
377 	req->rsvd = htobe16(0);
384  * Start a listening server by sending a passive open request to HW.
386  * Can't take adapter lock here and access to sc->flags, sc->open_device_map,
387  * sc->offload_map, if_capenable are all race prone.
/*
 * TOE callback invoked when a socket enters LISTEN.  Allocates an lctx,
 * inserts it in the listen hash, and asks the chip to create a hardware
 * listener.  IPv4 only.  Called with the inp write-locked.
 */
390 t4_listen_start(struct toedev *tod, struct tcpcb *tp)
392 	struct adapter *sc = tod->tod_softc;
393 	struct port_info *pi;
394 	struct inpcb *inp = tp->t_inpcb;
395 	struct listen_ctx *lctx;
398 	INP_WLOCK_ASSERT(inp);
/* No IPv6 support in this path. */
400 	if ((inp->inp_vflag & INP_IPV4) == 0)
406 		log(LOG_ERR, "%s: listen request ignored, %s is busy",
407 		    __func__, device_get_nameunit(sc->dev));
411 	KASSERT(sc->flags & TOM_INIT_DONE,
412 	    ("%s: TOM not initialized", __func__));
415 	if ((sc->open_device_map & sc->offload_map) == 0)
416 		goto done;	/* no port that's UP with IFCAP_TOE enabled */
419 	 * Find a running port with IFCAP_TOE4.  We'll use the first such port's
420 	 * queues to send the passive open and receive the reply to it.
422 	 * XXX: need a way to mark a port in use by offload.  if_cxgbe should
423 	 * then reject any attempt to bring down such a port (and maybe reject
424 	 * attempts to disable IFCAP_TOE on that port too?).
426 	for_each_port(sc, i) {
427 		if (isset(&sc->open_device_map, i) &&
428 		    sc->port[i]->ifp->if_capenable & IFCAP_TOE4)
431 	KASSERT(i < sc->params.nports,
432 	    ("%s: no running port with TOE capability enabled.", __func__));
435 	if (listen_hash_find(sc, inp) != NULL)
436 		goto done;	/* already setup */
438 	lctx = alloc_lctx(sc, inp, pi);
441 		    "%s: listen request ignored, %s couldn't allocate lctx\n",
442 		    __func__, device_get_nameunit(sc->dev));
445 	listen_hash_add(sc, lctx);
447 	CTR5(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p", __func__,
448 	    lctx->stid, tcpstates[tp->t_state], lctx, inp);
450 	if (create_server(sc, lctx) != 0) {
451 		log(LOG_ERR, "%s: %s failed to create hw listener.\n", __func__,
452 		    device_get_nameunit(sc->dev));
/* Undo the hash insert and drop our lctx ref; the host stack still
 * holds a reference on the inp so it cannot have been freed. */
453 		(void) listen_hash_del(sc, inp);
454 		inp = release_lctx(sc, lctx);
455 		/* can't be freed, host stack has a reference */
456 		KASSERT(inp != NULL, ("%s: inp freed", __func__));
/* Cleared by do_pass_open_rpl when the chip acknowledges the open. */
459 	lctx->flags |= LCTX_RPL_PENDING;
/*
 * TOE callback invoked when a listening socket stops listening.
 * Removes the lctx from the hash, resets any embryonic connections
 * still on the synq, and asks the chip to close the hardware listener.
 * Returns ENOENT if there was no hardware listener, EINPROGRESS if the
 * PASS_OPEN reply is still outstanding (cleanup deferred to the reply
 * handler).  Called with the inp write-locked.
 */
468 t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
470 	struct listen_ctx *lctx;
471 	struct adapter *sc = tod->tod_softc;
472 	struct inpcb *inp = tp->t_inpcb;
473 	struct synq_entry *synqe;
475 	INP_WLOCK_ASSERT(inp);
477 	lctx = listen_hash_del(sc, inp);
479 		return (ENOENT);	/* no hardware listener for this inp */
481 	CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
485 	 * If the reply to the PASS_OPEN is still pending we'll wait for it to
486 	 * arrive and clean up when it does.
488 	if (lctx->flags & LCTX_RPL_PENDING) {
489 		KASSERT(TAILQ_EMPTY(&lctx->synq),
490 		    ("%s: synq not empty.", __func__));
491 		return (EINPROGRESS);
495 	 * The host stack will abort all the connections on the listening
496 	 * socket's so_comp.  It doesn't know about the connections on the synq
497 	 * so we need to take care of those.
499 	TAILQ_FOREACH(synqe, &lctx->synq, link)
500 		send_reset_synqe(tod, synqe);
502 	destroy_server(sc, lctx);
/* hold_synqe: take an additional reference on a synq entry. */
507 hold_synqe(struct synq_entry *synqe)
510 	refcount_acquire(&synqe->refcnt);
/*
 * release_synqe: drop a reference on a synq entry; on the last ref,
 * free it if it was separately malloc'd (TPF_SYNQE_NEEDFREE) rather
 * than carved out of the SYN mbuf's trailing space (see mbuf_to_synqe).
 */
514 release_synqe(struct synq_entry *synqe)
517 	if (refcount_release(&synqe->refcnt)) {
518 		int needfree = synqe_flag(synqe, TPF_SYNQE_NEEDFREE);
522 			free(synqe, M_CXGBE);
/*
 * Syncache callback: an entry backed by this synqe was added to the
 * host syncache.  Presumably takes a synqe reference for the syncache
 * (the hold is not visible in this excerpt) -- paired with
 * t4_syncache_removed below.
 */
527 t4_syncache_added(struct toedev *tod __unused, void *arg)
529 	struct synq_entry *synqe = arg;
/*
 * Syncache callback: the syncache entry was removed; drop the reference
 * the syncache held on the synqe.
 */
535 t4_syncache_removed(struct toedev *tod __unused, void *arg)
537 	struct synq_entry *synqe = arg;
539 	release_synqe(synqe);
543 extern void tcp_dooptions(struct tcpopt *, u_char *, int, int);
/*
 * Syncache callback: send the SYN|ACK for this embryonic connection.
 * The pre-built PASS_ACCEPT_RPL work request was stashed in synqe->wr
 * by do_pass_accept_req; atomically claim it so the reply is sent at
 * most once, parse the SYN's TCP options, record the peer's ISS and
 * timestamp for later use by synqe_to_protohdrs, and hand the WR to
 * the L2T layer.  'm' carries the SYN headers and is freed here.
 */
546 t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
548 	struct adapter *sc = tod->tod_softc;
549 	struct synq_entry *synqe = arg;
553 	struct ip *ip = mtod(m, struct ip *);
554 	struct tcphdr *th = (void *)(ip + 1);
/* Claim the WR; a NULL result means a reply was already sent. */
556 	wr = (struct wrqe *)atomic_readandclear_ptr(&synqe->wr);
560 	bzero(&to, sizeof(to));
561 	tcp_dooptions(&to, (void *)(th + 1), (th->th_off << 2) - sizeof(*th),
564 	/* save these for later */
565 	synqe->iss = be32toh(th->th_seq);
566 	synqe->ts = to.to_tsval;
568 	e = &sc->l2t->l2tab[synqe->l2e_idx];
569 	t4_l2t_send(sc, wr, e);
571 	m_freem(m);	/* don't need this any more */
/*
 * CPL handler: reply to our CPL_PASS_OPEN_REQ (create_server).  Clears
 * LCTX_RPL_PENDING and then deals with the possible interleavings of
 * listener close vs. open success/failure:
 *  - listener already dropped + open failed: just release the lctx;
 *  - listener already dropped + open succeeded: send the close request
 *    now, cleanup finishes in do_close_server_rpl;
 *  - open failed: remove from hash and release;
 *  - otherwise the hardware listener is live.
 */
576 do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
579 	struct adapter *sc = iq->adapter;
580 	const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
581 	int stid = GET_TID(cpl);
582 	unsigned int status = cpl->status;
583 	struct listen_ctx *lctx = lookup_stid(sc, stid);
584 	struct inpcb *inp = lctx->inp;
586 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
589 	KASSERT(opcode == CPL_PASS_OPEN_RPL,
590 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
591 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
592 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
596 	CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
597 	    __func__, stid, status, lctx->flags);
599 	lctx->flags &= ~LCTX_RPL_PENDING;
601 	if (status != CPL_ERR_NONE)
602 		log(LOG_ERR, "listener with stid %u failed: %d", stid, status);
606 	 * If the inp has been dropped (listening socket closed) then
607 	 * listen_stop must have run and taken the inp out of the hash.
609 	if (inp->inp_flags & INP_DROPPED) {
610 		KASSERT(listen_hash_del(sc, inp) == NULL,
611 		    ("%s: inp %p still in listen hash", __func__, inp));
/* Dropped listener and the hardware open failed too: nothing left to
 * close; just drop the lctx reference. */
615 	if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
616 		if (release_lctx(sc, lctx) != NULL)
622 	 * Listening socket stopped listening earlier and now the chip tells us
623 	 * it has started the hardware listener.  Stop it; the lctx will be
624 	 * released in do_close_server_rpl.
626 	if (inp->inp_flags & INP_DROPPED) {
627 		destroy_server(sc, lctx);
633 	 * Failed to start hardware listener.  Take inp out of the hash and
634 	 * release our reference on it.  An error message has been logged
637 	if (status != CPL_ERR_NONE) {
638 		listen_hash_del(sc, inp);
639 		if (release_lctx(sc, lctx) != NULL)
644 	/* hardware listener open for business */
/*
 * CPL handler: reply to our CPL_CLOSE_LISTSRV_REQ (destroy_server).
 * On success, drop the final lctx reference taken for the hardware
 * listener; on failure just log (the lctx is left alone).
 */
651 do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
654 	struct adapter *sc = iq->adapter;
655 	const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
656 	int stid = GET_TID(cpl);
657 	unsigned int status = cpl->status;
658 	struct listen_ctx *lctx = lookup_stid(sc, stid);
659 	struct inpcb *inp = lctx->inp;
661 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
664 	KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
665 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
666 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
667 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
669 	CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);
671 	if (status != CPL_ERR_NONE) {
672 		log(LOG_ERR, "%s: failed (%u) to close listener for stid %u",
673 		    __func__, status, stid);
678 	inp = release_lctx(sc, lctx);
/*
 * done_with_synqe: final teardown for an embryonic connection -- unlink
 * it from the lctx synq (dropping the synq's lctx reference), remove
 * and release its hardware tid, and drop the synq-list reference on the
 * synqe.  Called with the listening inp write-locked; the inp lock is
 * released by the time this returns (per the callers' comments).
 */
686 done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
688 	struct listen_ctx *lctx = synqe->lctx;
689 	struct inpcb *inp = lctx->inp;
690 	struct port_info *pi = synqe->syn->m_pkthdr.rcvif->if_softc;
691 	struct l2t_entry *e = &sc->l2t->l2tab[synqe->l2e_idx];
693 	INP_WLOCK_ASSERT(inp);
695 	TAILQ_REMOVE(&lctx->synq, synqe, link);
696 	inp = release_lctx(sc, lctx);
699 	remove_tid(sc, synqe->tid);
700 	release_tid(sc, synqe->tid, &sc->sge.ctrlq[pi->port_id]);
702 	release_synqe(synqe);	/* removed from synq list */
/*
 * CPL handler: peer-initiated (or chip-initiated) abort of an embryonic
 * connection.  Negative advice is ignored.  If we had already started
 * an abort ourselves (TPF_ABORT_SHUTDOWN) the reply to that abort does
 * the cleanup; otherwise tear the synqe down here.  Either way the chip
 * is owed a CPL_ABORT_RPL, sent with CPL_ABORT_NO_RST.
 */
706 do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
709 	struct adapter *sc = iq->adapter;
710 	const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
711 	unsigned int tid = GET_TID(cpl);
712 	struct synq_entry *synqe = lookup_tid(sc, tid);
713 	struct listen_ctx *lctx = synqe->lctx;
714 	struct inpcb *inp = lctx->inp;
716 	struct sge_wrq *ofld_txq;
718 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
721 	KASSERT(opcode == CPL_ABORT_REQ_RSS,
722 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
723 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
724 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
726 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
727 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
729 	if (cpl->status == CPL_ERR_RTX_NEG_ADVICE ||
730 	    cpl->status == CPL_ERR_PERSIST_NEG_ADVICE)
731 		return (0);	/* Ignore negative advice */
735 	get_qids_from_mbuf(synqe->syn, &txqid, NULL);
736 	ofld_txq = &sc->sge.ofld_txq[txqid];
739 	 * If we'd initiated an abort earlier the reply to it is responsible for
740 	 * cleaning up resources.  Otherwise we tear everything down right here
741 	 * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
743 	if (synqe_flag(synqe, TPF_ABORT_SHUTDOWN)) {
748 		done_with_synqe(sc, synqe);
749 	/* inp lock released by done_with_synqe */
751 	send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
/*
 * CPL handler: reply to an abort we sent for a synqe (see
 * send_reset_synqe).  TPF_ABORT_SHUTDOWN must already be set; the reply
 * completes the teardown via done_with_synqe.
 */
756 do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
759 	struct adapter *sc = iq->adapter;
760 	const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
761 	unsigned int tid = GET_TID(cpl);
762 	struct synq_entry *synqe = lookup_tid(sc, tid);
763 	struct listen_ctx *lctx = synqe->lctx;
764 	struct inpcb *inp = lctx->inp;
766 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
769 	KASSERT(opcode == CPL_ABORT_RPL_RSS,
770 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
771 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
772 	KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));
774 	CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
775 	    __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);
778 	KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
779 	    ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
780 	    __func__, synqe, synqe->flags));
782 	done_with_synqe(sc, synqe);
783 	/* inp lock released by done_with_synqe */
/*
 * TOE callback: the new connection has been accepted into a socket.
 * Retrieve the toepcb pointer that do_pass_establish stashed after the
 * CPL_PASS_ESTABLISH copy in the SYN mbuf, attach it to the socket,
 * mark the tcpcb established, and repoint the tid at the toep.
 * Called with tcbinfo locked (guards a race with accept()) and the inp
 * write-locked.
 */
789 t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
791 	struct adapter *sc = tod->tod_softc;
792 	struct synq_entry *synqe = arg;
794 	struct inpcb *inp = sotoinpcb(so);
796 	struct cpl_pass_establish *cpl = mtod(synqe->syn, void *);
797 	struct toepcb *toep = *(struct toepcb **)(cpl + 1);
799 	INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */
800 	INP_WLOCK_ASSERT(inp);
801 	KASSERT(synqe_flag(synqe, TPF_SYNQE),
802 	    ("%s: %p not a synq_entry?", __func__, arg));
804 	offload_socket(so, toep);
805 	make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt);
806 	toepcb_set_flag(toep, TPF_CPL_PENDING);
/* tid now resolves to the toepcb instead of the synqe. */
807 	update_tid(sc, synqe->tid, toep);
/*
 * save_qids_in_mbuf: pick an offload tx and rx queue for this
 * connection (randomized across the port's ranges) and stash both ids
 * in the mbuf's flowid: txqid in the high 16 bits, rxqid in the low 16.
 * Paired with get_qids_from_mbuf.
 */
811 save_qids_in_mbuf(struct mbuf *m, struct port_info *pi)
813 	uint32_t txqid, rxqid;
815 	txqid = (arc4random() % pi->nofldtxq) + pi->first_ofld_txq;
816 	rxqid = (arc4random() % pi->nofldrxq) + pi->first_ofld_rxq;
818 	m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff);
/*
 * get_qids_from_mbuf: recover the queue ids stashed by
 * save_qids_in_mbuf.  Either output pointer may be NULL (callers pass
 * NULL when they only need one id).
 */
822 get_qids_from_mbuf(struct mbuf *m, int *txqid, int *rxqid)
826 		*txqid = m->m_pkthdr.flowid >> 16;
828 		*rxqid = m->m_pkthdr.flowid & 0xffff;
832  * Use the trailing space in the mbuf in which the PASS_ACCEPT_REQ arrived to
833  * store some state temporarily.
/*
 * If the mbuf's trailing space cannot hold a synq_entry, fall back to
 * malloc and mark the entry TPF_SYNQE_NEEDFREE so release_synqe frees
 * it.  Returns NULL on allocation failure (per the caller's check).
 */
835 static struct synq_entry *
836 mbuf_to_synqe(struct mbuf *m)
838 	int len = roundup(sizeof (struct synq_entry), 8);
839 	int tspace = M_TRAILINGSPACE(m);
840 	struct synq_entry *synqe = NULL;
843 		synqe = malloc(sizeof(*synqe), M_CXGBE, M_NOWAIT);
/* Carve the synqe out of the end of the mbuf's trailing space. */
847 		synqe = (void *)(m->m_data + m->m_len + tspace - sizeof(*synqe));
850 	synqe_set_flag(synqe, TPF_SYNQE);
852 		synqe_set_flag(synqe, TPF_SYNQE_NEEDFREE);
/*
 * t4opt_to_tcpopt: translate the TCP options the chip parsed out of the
 * SYN (struct tcp_options in the CPL) into the host stack's struct
 * tcpopt for use by the syncache.
 */
858 t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
860 	bzero(to, sizeof(*to));
863 		to->to_flags |= TOF_MSS;
864 		to->to_mss = be16toh(t4opt->mss);
868 		to->to_flags |= TOF_SCALE;
869 		to->to_wscale = t4opt->wsf;
873 		to->to_flags |= TOF_TS;
876 		to->to_flags |= TOF_SACKPERM;
880  * Options2 for passive open.
/*
 * calc_opt2p: build the opt2 word of the CPL_PASS_ACCEPT_RPL --
 * timestamp/window-scale enables (gated on rfc1323), ECN, the tx
 * modulation queue for the port's channel, rx coalescing, and the RSS
 * queue the connection's ingress traffic should land on.  Returned in
 * big-endian, ready to store into the CPL.
 */
883 calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid,
884     const struct tcp_options *tcpopt, struct tcphdr *th)
887 	struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid];
889 	if (V_tcp_do_rfc1323) {
891 			opt2 |= F_TSTAMPS_EN;
895 			opt2 |= F_WND_SCALE_EN;
898 	if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR))
901 	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
902 	opt2 |= F_RX_COALESCE_VALID | V_RX_COALESCE(M_RX_COALESCE);
903 	opt2 |= F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id);
905 	return htobe32(opt2);
908 /* XXX: duplication. */
/*
 * tcp_fields_to_host: byte-swap the multi-byte TCP header fields from
 * network to host order, mirroring what tcp_input does before handing
 * a header to the syncache.
 */
910 tcp_fields_to_host(struct tcphdr *th)
913 	th->th_seq = ntohl(th->th_seq);
914 	th->th_ack = ntohl(th->th_ack);
915 	th->th_win = ntohs(th->th_win);
916 	th->th_urp = ntohs(th->th_urp);
/*
 * pass_accept_req_to_protohdrs: pull the Ethernet/IP/TCP headers that
 * follow the CPL_PASS_ACCEPT_REQ out of the mbuf and fill in an
 * in_conninfo (addresses/ports, network byte order) and a host-order
 * tcphdr for the syncache.  Header lengths come from the CPL's hdr_len
 * field.
 */
920 pass_accept_req_to_protohdrs(const struct mbuf *m, struct in_conninfo *inc,
923 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
924 	const struct ether_header *eh;
925 	unsigned int hlen = be32toh(cpl->hdr_len);
927 	const struct tcphdr *tcp;
929 	eh = (const void *)(cpl + 1);
930 	ip = (const void *)((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
931 	tcp = (const void *)((uintptr_t)ip + G_IP_HDR_LEN(hlen));
934 	bzero(inc, sizeof(*inc));
935 	inc->inc_faddr = ip->ip_src;
936 	inc->inc_laddr = ip->ip_dst;
937 	inc->inc_fport = tcp->th_sport;
938 	inc->inc_lport = tcp->th_dport;
940 		inc->inc_flags |= INC_ISIPV6;
944 	bcopy(tcp, th, sizeof(*th));
945 	tcp_fields_to_host(th);		/* just like tcp_input */
/*
 * Bail out of do_pass_accept_req, recording the rejecting source line
 * as the reason (handy when debugging why a SYN wasn't offloaded).
 */
949 #define REJECT_PASS_ACCEPT()	do { \
950 	reject_reason = __LINE__; \
955  * The context associated with a tid entry via insert_tid could be a synq_entry
956  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
/* The flags fields must overlap exactly for that discrimination to work. */
958 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
961  * Incoming SYN on a listening socket.
963  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
/*
 * CPL handler for CPL_PASS_ACCEPT_REQ.  Decides whether the embryonic
 * connection should be offloaded.  On the offload path it builds a
 * synqe and a PASS_ACCEPT_RPL work request, inserts the tid, links the
 * synqe on the lctx synq, and feeds the connection to the host syncache
 * (which calls back into t4_syncache_respond to actually send the
 * SYN|ACK).  On any rejection it releases the tid and re-injects the
 * SYN into the host stack via ifp->if_input so the connection can
 * proceed un-offloaded.  Returns 0 on offload, else the rejecting
 * __LINE__ (see REJECT_PASS_ACCEPT).
 */
967 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
970 	struct adapter *sc = iq->adapter;
972 	const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
973 	struct cpl_pass_accept_rpl *rpl;
975 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
976 	unsigned int tid = GET_TID(cpl);
977 	struct listen_ctx *lctx = lookup_stid(sc, stid);
980 	struct in_conninfo inc;
983 	struct port_info *pi;
984 	struct ifnet *ifp, *ifp_vlan = NULL;
985 	struct l2t_entry *e = NULL;
987 	struct sockaddr_in nam;
988 	int rscale, mtu_idx, rx_credits, rxqid;
989 	struct synq_entry *synqe = NULL;
993 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
996 	KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
997 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
998 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1000 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
/* Extract the SYN's protocol headers and TCP options. */
1003 	pass_accept_req_to_protohdrs(m, &inc, &th);
1004 	t4opt_to_tcpopt(&cpl->tcpopt, &to);
1006 	pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))];
1008 	m->m_pkthdr.rcvif = ifp;
1012 	 * Don't offload if the interface that received the SYN doesn't have
1013 	 * IFCAP_TOE enabled.
1015 	if ((ifp->if_capenable & IFCAP_TOE4) == 0)
1016 		REJECT_PASS_ACCEPT();
1018 	/* Don't offload IPv6 connections.  XXX: add IPv6 support */
1019 	if (inc.inc_flags & INC_ISIPV6)
1020 		REJECT_PASS_ACCEPT();
1023 	 * Don't offload if the SYN had a VLAN tag and the vid doesn't match
1024 	 * anything on this interface.
1026 	vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
1028 		ifp_vlan = VLAN_DEVAT(ifp, vid);
1029 		if (ifp_vlan == NULL)
1030 			REJECT_PASS_ACCEPT();
1034 	 * Don't offload if the peer requested a TCP option that's not known to
1037 	if (cpl->tcpopt.unknown)
1038 		REJECT_PASS_ACCEPT();
1041 	 * Don't offload if the outgoing interface for the route back to the
1042 	 * peer is not the same as the interface that received the SYN.
1043 	 * XXX: too restrictive.
1045 	nam.sin_len = sizeof(nam);
1046 	nam.sin_family = AF_INET;
1047 	nam.sin_addr = inc.inc_faddr;
1048 	rt = rtalloc1((struct sockaddr *)&nam, 0, 0);
1050 		REJECT_PASS_ACCEPT();
1052 		struct sockaddr *nexthop;
/* Resolve the L2 next hop and get an L2T entry for it. */
1055 		nexthop = rt->rt_flags & RTF_GATEWAY ? rt->rt_gateway :
1056 		    (struct sockaddr *)&nam;
1057 		if (rt->rt_ifp == ifp ||
1058 		    (ifp_vlan != NULL && rt->rt_ifp == ifp_vlan))
1059 			e = t4_l2t_get(pi, rt->rt_ifp, nexthop);
1062 			REJECT_PASS_ACCEPT();	/* no l2te, or ifp mismatch */
1065 	synqe = mbuf_to_synqe(m);
1067 		REJECT_PASS_ACCEPT();
/* Pre-build the reply WR; it is sent later from t4_syncache_respond. */
1069 	wr = alloc_wrqe(sizeof(*rpl), &sc->sge.ctrlq[pi->port_id]);
1071 		REJECT_PASS_ACCEPT();
1074 	INP_INFO_WLOCK(&V_tcbinfo);	/* for 4-tuple check, syncache_add */
1076 	/* Don't offload if the 4-tuple is already in use */
1077 	if (toe_4tuple_check(&inc, &th, ifp) != 0) {
1078 		INP_INFO_WUNLOCK(&V_tcbinfo);
1080 		REJECT_PASS_ACCEPT();
1083 	inp = lctx->inp;		/* listening socket, not owned by TOE */
1086 	/* Don't offload if the listening socket has closed */
1087 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1089 		 * The listening socket has closed.  The reply from the TOE to
1090 		 * our CPL_CLOSE_LISTSRV_REQ will ultimately release all
1091 		 * resources tied to this listen context.
1094 		INP_INFO_WUNLOCK(&V_tcbinfo);
1096 		REJECT_PASS_ACCEPT();
1098 	so = inp->inp_socket;
/* Negotiated parameters for opt0/opt2 of the PASS_ACCEPT_RPL. */
1100 	mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss));
1101 	rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0;
1102 	SOCKBUF_LOCK(&so->so_rcv);
1103 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1104 	rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ);
1105 	SOCKBUF_UNLOCK(&so->so_rcv);
1107 	save_qids_in_mbuf(m, pi);
1108 	get_qids_from_mbuf(m, NULL, &rxqid);
1110 	INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
1111 	rpl->opt0 = calc_opt0(so, pi, e, mtu_idx, rscale, rx_credits,
1113 	rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th);
1119 	refcount_init(&synqe->refcnt, 1);	/* 1 so that it is held for the
1120 						   duration of this function */
1121 	synqe->l2e_idx = e->idx;
1122 	synqe->rcv_bufsize = rx_credits;
1123 	atomic_store_rel_ptr(&synqe->wr, (uintptr_t)wr);
1125 	insert_tid(sc, tid, synqe);
1126 	TAILQ_INSERT_TAIL(&lctx->synq, synqe, link);
1127 	hold_synqe(synqe);	/* hold for the duration it's in the synq */
1128 	hold_lctx(lctx);	/* A synqe on the list has a ref on its lctx */
1131 	 * If all goes well t4_syncache_respond will get called during
1132 	 * syncache_add.  Also note that syncache_add releases both pcbinfo and
1135 	toe_syncache_add(&inc, &to, &th, inp, tod, synqe);
1136 	INP_UNLOCK_ASSERT(inp);	/* ok to assert, we have a ref on the inp */
1137 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
1140 	 * If we replied during syncache_add (synqe->wr has been consumed),
1141 	 * good.  Otherwise, set it to 0 so that further syncache_respond
1142 	 * attempts by the kernel will be ignored.
1144 	 * The extra hold on the synqe makes sure that it is still around, even
1145 	 * if the listener has been dropped and the synqe was aborted and the
1146 	 * reply to the abort has removed and released the synqe from the synq
1149 	if (atomic_cmpset_ptr(&synqe->wr, (uintptr_t)wr, 0)) {
/* WR was never sent: the syncache did not (or could not) respond. */
1152 		if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1153 			/* listener closed.  synqe must have been aborted. */
1154 			KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
1155 			    ("%s: listener %p closed but synqe %p not aborted",
1156 			    __func__, inp, synqe));
1159 			    "%s: stid %u, tid %u, lctx %p, synqe %p, ABORTED",
1160 			    __func__, stid, tid, lctx, synqe);
1163 			release_synqe(synqe);	/* about to exit function */
1168 		 * synqe aborted before TOM replied to PASS_ACCEPT_REQ.  But
1169 		 * that can only happen if the listener was closed and we just
1172 		KASSERT(!synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
1173 		    ("%s: synqe %p aborted, but listener %p not dropped.",
1174 		    __func__, synqe, inp));
1176 		/* Yank the synqe out of the lctx synq. */
1177 		TAILQ_REMOVE(&lctx->synq, synqe, link);
1178 		release_synqe(synqe);	/* removed from synq list */
1179 		inp = release_lctx(sc, lctx);
1184 		 * syncache may or may not have a hold on the synqe, which may
1185 		 * or may not be stashed in the original SYN mbuf passed to us.
1186 		 * Just copy it over instead of dealing with all possibilities.
1188 		m = m_dup(synqe->syn, M_DONTWAIT);
1190 			m->m_pkthdr.rcvif = ifp;
1192 		release_synqe(synqe);	/* about to exit function */
1194 		REJECT_PASS_ACCEPT();
1196 	release_synqe(synqe);	/* about to exit function */
1197 	CTR5(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p, synqe %p, SYNACK",
1198 	    __func__, stid, tid, lctx, synqe);
1201 	CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
/* Rejection path: give the tid back to the hardware ... */
1206 	release_tid(sc, tid, lctx->ctrlq);
1208 	if (__predict_true(m != NULL)) {
/* ... and hand the SYN to the host stack, with the CPL stripped and
 * checksums marked already-verified. */
1209 		m_adj(m, sizeof(*cpl));
1210 		m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
1211 		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1212 		m->m_pkthdr.csum_data = 0xffff;
1213 		ifp->if_input(ifp, m);
1216 	return (reject_reason);
/*
 * synqe_to_protohdrs: fabricate the protocol headers syncache_expand
 * expects -- start from the saved SYN and rewrite it to look like the
 * peer's ACK of our SYN|ACK (ACK flag, th_ack = our iss + 1, th_seq
 * from the CPL's rcv_isn, and the echoed timestamp if negotiated).
 */
1220 synqe_to_protohdrs(struct synq_entry *synqe,
1221     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
1222     struct tcphdr *th, struct tcpopt *to)
1224 	uint16_t tcp_opt = be16toh(cpl->tcp_opt);
1226 	/* start off with the original SYN */
1227 	pass_accept_req_to_protohdrs(synqe->syn, inc, th);
1229 	/* modify parts to make it look like the ACK to our SYN|ACK */
1230 	th->th_flags = TH_ACK;
1231 	th->th_ack = synqe->iss + 1;
1232 	th->th_seq = be32toh(cpl->rcv_isn);
1233 	bzero(to, sizeof(*to));
1234 	if (G_TCPOPT_TSTAMP(tcp_opt)) {
1235 		to->to_flags |= TOF_TS;
1236 		to->to_tsecr = synqe->ts;
/*
 * CPL handler for CPL_PASS_ESTABLISH: the 3-way handshake for an
 * offloaded embryonic connection completed in hardware.  Allocates a
 * toepcb, fabricates the ACK headers, and drives syncache_expand so the
 * host stack creates the socket (t4_offload_socket finishes the
 * attach).  If the listener was dropped meanwhile, or toepcb allocation
 * fails, the connection is aborted instead.  Takes the tcbinfo lock for
 * syncache_expand; drops the synq reference on the synqe before
 * returning.
 */
1241 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
1244 	struct adapter *sc = iq->adapter;
1245 	struct port_info *pi;
1247 	const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
1248 #if defined(KTR) || defined(INVARIANTS)
1249 	unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
1251 	unsigned int tid = GET_TID(cpl);
1252 	struct synq_entry *synqe = lookup_tid(sc, tid);
1253 	struct listen_ctx *lctx = synqe->lctx;
1254 	struct inpcb *inp = lctx->inp;
1258 	struct in_conninfo inc;
1259 	struct toepcb *toep;
1262 	unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
1265 	KASSERT(opcode == CPL_PASS_ESTABLISH,
1266 	    ("%s: unexpected opcode 0x%x", __func__, opcode));
1267 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
1268 	KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
1269 	KASSERT(synqe_flag(synqe, TPF_SYNQE),
1270 	    ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
1272 	INP_INFO_WLOCK(&V_tcbinfo);	/* for syncache_expand */
1276 	    "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
1277 	    __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
1279 	if (__predict_false(inp->inp_flags & INP_DROPPED)) {
1281 		 * The listening socket has closed.  The TOM must have aborted
1282 		 * all the embryonic connections (including this one) that were
1283 		 * on the lctx's synq.  do_abort_rpl for the tid is responsible
1286 		KASSERT(synqe_flag(synqe, TPF_ABORT_SHUTDOWN),
1287 		    ("%s: listen socket dropped but tid %u not aborted.",
1291 		INP_INFO_WUNLOCK(&V_tcbinfo);
1295 	ifp = synqe->syn->m_pkthdr.rcvif;
1297 	KASSERT(pi->adapter == sc,
1298 	    ("%s: pi %p, sc %p mismatch", __func__, pi, sc));
/* The CPL must arrive on the rx queue chosen at accept time. */
1300 	get_qids_from_mbuf(synqe->syn, &txqid, &rxqid);
1301 	KASSERT(rxqid == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
1302 	    ("%s: CPL arrived on unexpected rxq.  %d %d", __func__, rxqid,
1303 	    (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
1305 	toep = alloc_toepcb(pi, txqid, rxqid, M_NOWAIT);
1308 		/* The reply to this abort will perform final cleanup */
1309 		send_reset_synqe(TOEDEV(ifp), synqe);
1311 		INP_INFO_WUNLOCK(&V_tcbinfo);
1315 	toep->l2te = &sc->l2t->l2tab[synqe->l2e_idx];
1316 	toep->ulp_mode = ULP_MODE_NONE;
1317 	/* opt0 rcv_bufsiz initially, assumes its normal meaning later */
1318 	toep->rx_credits = synqe->rcv_bufsize;
1320 	so = inp->inp_socket;
1321 	KASSERT(so != NULL, ("%s: socket is NULL", __func__));
1323 	/* Come up with something that syncache_expand should be ok with. */
1324 	synqe_to_protohdrs(synqe, cpl, &inc, &th, &to);
1327 	 * No more need for anything in the mbuf that carried the
1328 	 * CPL_PASS_ACCEPT_REQ.  Drop the CPL_PASS_ESTABLISH and toep pointer
1329 	 * there.  XXX: bad form but I don't want to increase the size of synqe.
1332 	KASSERT(sizeof(*cpl) + sizeof(toep) <= m->m_len,
1333 	    ("%s: no room in mbuf %p (m_len %d)", __func__, m, m->m_len));
1334 	bcopy(cpl, mtod(m, void *), sizeof(*cpl));
/* t4_offload_socket reads this toep pointer back out of the mbuf. */
1335 	*(struct toepcb **)(mtod(m, struct cpl_pass_establish *) + 1) = toep;
1337 	if (!toe_syncache_expand(&inc, &to, &th, &so) || so == NULL) {
1342 	/* Done with the synqe */
1343 	TAILQ_REMOVE(&lctx->synq, synqe, link);
1344 	inp = release_lctx(sc, lctx);
1347 	INP_INFO_WUNLOCK(&V_tcbinfo);
1348 	release_synqe(synqe);
/*
 * Register the CPL handlers implemented in this file with the adapter's
 * dispatch table.
 */
1354 t4_init_listen_cpl_handlers(struct adapter *sc)
1357 	t4_register_cpl_handler(sc, CPL_PASS_OPEN_RPL, do_pass_open_rpl);
1358 	t4_register_cpl_handler(sc, CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
1359 	t4_register_cpl_handler(sc, CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
1360 	t4_register_cpl_handler(sc, CPL_PASS_ESTABLISH, do_pass_establish);