1 /**************************************************************************
3 Copyright (c) 2007-2008, Chelsio Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/fcntl.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
41 #include <sys/mutex.h>
42 #include <sys/sockbuf.h>
43 #include <sys/sockopt.h>
44 #include <sys/sockstate.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/protosw.h>
52 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_systm.h>
57 #include <netinet/in_var.h>
60 #include <dev/cxgb/cxgb_osdep.h>
61 #include <dev/cxgb/sys/mbufq.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp_var.h>
65 #include <netinet/tcp_fsm.h>
66 #include <netinet/tcp_offload.h>
67 #include <netinet/tcp_seq.h>
68 #include <netinet/tcp_syncache.h>
69 #include <netinet/tcp_timer.h>
70 #include <net/route.h>
72 #include <dev/cxgb/t3cdev.h>
73 #include <dev/cxgb/common/cxgb_firmware_exports.h>
74 #include <dev/cxgb/common/cxgb_t3_cpl.h>
75 #include <dev/cxgb/common/cxgb_tcb.h>
76 #include <dev/cxgb/common/cxgb_ctl_defs.h>
77 #include <dev/cxgb/cxgb_offload.h>
80 #include <machine/bus.h>
81 #include <dev/cxgb/sys/mvec.h>
82 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
83 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
84 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
85 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
86 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
87 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
89 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
92 * For ULP connections HW may add headers, e.g., for digests, that aren't part
93 * of the messages sent by the host but that are part of the TCP payload and
94 * therefore consume TCP sequence space. Tx connection parameters that
95 * operate in TCP sequence space are affected by the HW additions and need to
96 * compensate for them to accurately track TCP sequence numbers. This array
97 * contains the compensating extra lengths for ULP packets. It is indexed by
98 * a packet's ULP submode.
100 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
104 * This sk_buff holds a fake header-only TCP segment that we use whenever we
105 * need to exploit SW TCP functionality that expects TCP headers, such as
106 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
107 * CPUs without locking.
109 static struct mbuf *tcphdr_mbuf __read_mostly;
113 * Size of WRs in bytes. Note that we assume all devices we are handling have
116 static unsigned int wrlen __read_mostly;
119 * The number of WRs needed for an skb depends on the number of page fragments
120 * in the skb and whether it has any payload in its main body. This maps the
121 * length of the gather list represented by an skb into the # of necessary WRs.
123 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
126 * Max receive window supported by HW in bytes. Only a small part of it can
127 * be set through option0, the rest needs to be set through RX_DATA_ACK.
129 #define MAX_RCV_WND ((1U << 27) - 1)
132 * Min receive window. We want it to be large enough to accommodate receive
133 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
135 #define MIN_RCV_WND (24 * 1024U)
136 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
138 #define VALIDATE_SEQ 0
139 #define VALIDATE_SOCK(so)
142 #define TCP_TIMEWAIT 1
146 extern int tcp_do_autorcvbuf;
147 extern int tcp_do_autosndbuf;
148 extern int tcp_autorcvbuf_max;
149 extern int tcp_autosndbuf_max;
151 static void t3_send_reset(struct toepcb *toep);
152 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
153 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
154 static void handle_syncache_event(int event, void *arg);
/*
 * Debug wrapper around sbappendstream_locked(): walks the sockbuf chain
 * asserting that every mbuf is either plain or an EXT_EXTREF external
 * buffer, and that no m_next pointer carries the 0xffffffff poison value,
 * both before and after the append.  Requires SB_NOCOALESCE on the sockbuf
 * (offload data must not be coalesced by the socket layer).
 * NOTE(review): interior lines of this listing appear elided; comments
 * describe only the visible statements.
 */
157 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
163 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
164 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
165 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
166 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
167 m->m_next, m->m_nextpkt, m->m_flags));
/* Same invariants re-checked on the chain being appended. */
172 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
173 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
174 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
175 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
176 m->m_next, m->m_nextpkt, m->m_flags));
179 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
180 sbappendstream_locked(sb, n);
/* Post-append re-validation of the sockbuf chain. */
184 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
185 m->m_next, m->m_nextpkt, m->m_flags));
/* Returns true if the TOE device is a T3A-revision Chelsio adapter. */
191 is_t3a(const struct toedev *dev)
193 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
/* Debug helper: dump the interesting fields of a toepcb via DPRINTF. */
197 dump_toepcb(struct toepcb *toep)
199 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
200 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
201 toep->tp_mtu_idx, toep->tp_tid);
203 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
204 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
205 toep->tp_mss_clamp, toep->tp_flags);
208 #ifndef RTALLOC2_DEFINED
/*
 * Compatibility shim: route lookup via rtalloc1().
 * NOTE(review): the remainder of the body is not visible in this listing;
 * presumably it unlocks and returns rt — confirm against the full source.
 */
209 static struct rtentry *
210 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
212 struct rtentry *rt = NULL;
214 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
222 * Determine whether to send a CPL message now or defer it. A message is
223 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
224 * For connections in other states the message is sent immediately.
225 * If through_l2t is set the message is subject to ARP processing, otherwise
226 * it is sent directly.
229 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
231 struct tcpcb *tp = toep->tp_tp;
/* In SYN_SENT the TID is unknown: queue under the inpcb write lock. */
233 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
234 inp_wlock(tp->t_inpcb);
235 mbufq_tail(&toep->out_of_order_queue, m); // defer
236 inp_wunlock(tp->t_inpcb);
237 } else if (through_l2t)
238 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
240 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
/*
 * Compute the priority/control value for an offload mbuf.
 * NOTE(review): body not visible in this listing — confirm against source.
 */
243 static inline unsigned int
244 mkprio(unsigned int cntrl, const struct toepcb *toep)
250 * Populate a TID_RELEASE WR. The mbuf must be already properly sized.
253 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
255 struct cpl_tid_release *req;
257 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
258 m->m_pkthdr.len = m->m_len = sizeof(*req);
259 req = mtod(m, struct cpl_tid_release *);
/* Firmware work request header, then the CPL opcode/TID. */
260 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
262 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
/*
 * Build a TX_DATA work request header in m for sending len bytes of
 * payload on the connection bound to so.  On the first data sent
 * (TP_DATASENT not yet set) the WR additionally carries init flags and
 * the send-buffer size hint.  Caller holds the inpcb write lock.
 */
266 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
268 struct tcpcb *tp = so_sototcpcb(so);
269 struct toepcb *toep = tp->t_toe;
270 struct tx_data_wr *req;
273 snd = so_sockbuf_snd(so);
274 inp_wlock_assert(tp->t_inpcb);
276 req = mtod(m, struct tx_data_wr *);
277 m->m_len = sizeof(*req);
278 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
279 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
280 /* len includes the length of any HW ULP additions */
281 req->len = htonl(len);
282 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
283 /* V_TX_ULP_SUBMODE sets both the mode and submode */
284 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
285 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
286 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
288 req->sndseq = htonl(tp->snd_nxt);
/* First WR on this connection: add init flags and sndbuf size hint. */
289 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
290 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
291 V_TX_CPU_IDX(toep->tp_qset));
293 /* Sendbuffer is in units of 32KB.
295 if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
296 req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
298 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
301 toep->tp_flags |= TP_DATASENT;
305 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
/*
 * Push pending send-buffer data to the adapter as offload TX_DATA work
 * requests.  Small tail mbufs (<= IMM_LEN) are sent as immediate data;
 * larger chains are described by a gather list of up to TX_MAX_SEGS
 * pseudo bus_dma segments (virtual addresses, see note below).  Consumes
 * WR credits from toep->tp_wr_avail, requests a completion when asked or
 * when half the credits are outstanding, and records each WR on the
 * pending-WR queue.  Returns the total number of bytes handed to HW.
 * NOTE(review): several lines of the original are elided in this listing
 * (locals, loop braces, #ifdef branches); comments cover visible code only.
 */
308 t3_push_frames(struct socket *so, int req_completion)
310 struct tcpcb *tp = so_sototcpcb(so);
311 struct toepcb *toep = tp->t_toe;
313 struct mbuf *tail, *m0, *last;
316 int state, bytes, count, total_bytes;
317 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
/* Nothing to push before the connection is up or after it is gone. */
320 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
321 DPRINTF("tcp state=%d\n", tp->t_state);
325 state = so_state_get(so);
327 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
328 DPRINTF("disconnecting\n");
333 inp_lock_assert(tp->t_inpcb);
335 snd = so_sockbuf_snd(so);
338 d = TOM_DATA(toep->tp_toedev);
/* Resume from the send pointer if set, else from the head of sb_mb. */
341 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
344 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
345 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
/* Skip an mbuf already fully handed to HW on a previous call. */
347 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
348 KASSERT(tail, ("sbdrop error"));
349 last = tail = tail->m_next;
/* Out of WR credits or no data: nothing to do now. */
352 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
353 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
359 toep->tp_m_last = NULL;
360 while (toep->tp_wr_avail && (tail != NULL)) {
363 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
368 * If the data in tail fits as in-line, then
369 * make an immediate data wr.
371 if (tail->m_len <= IMM_LEN) {
378 make_tx_data_wr(so, m0, bytes, tail);
379 m_append(m0, bytes, mtod(last, caddr_t));
380 KASSERT(!m0->m_next, ("bad append"));
/* Gather-list path: accumulate mbufs while credits and segs allow. */
382 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
383 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
384 bytes += tail->m_len;
388 * technically an abuse to be using this for a VA
389 * but less gross than defining my own structure
390 * or calling pmap_kextract from here :-|
392 segp->ds_addr = (bus_addr_t)tail->m_data;
393 segp->ds_len = tail->m_len;
394 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
395 count, mbuf_wrs[count], tail->m_data, tail->m_len);
399 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
400 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
403 m_set_sgllen(m0, count);
404 make_tx_data_wr(so, m0, bytes, tail);
406 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
/* Advance the send pointer / remember the last mbuf handed to HW. */
409 snd->sb_sndptr = tail;
410 toep->tp_m_last = NULL;
412 toep->tp_m_last = snd->sb_sndptr = last;
415 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
417 snd->sb_sndptroff += bytes;
418 total_bytes += bytes;
419 toep->tp_write_seq += bytes;
420 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
421 " tail=%p sndptr=%p sndptroff=%d",
422 toep->tp_wr_avail, count, mbuf_wrs[count],
423 tail, snd->sb_sndptr, snd->sb_sndptroff);
425 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
426 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
427 total_bytes, toep->tp_m_last, tail->m_data,
430 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
431 " tp_m_last=%p snd_una=0x%08x",
432 total_bytes, toep->tp_m_last, tp->snd_una);
/* Trace the gather list, three segments per CTR record. */
440 while (i < count && m_get_sgllen(m0)) {
441 if ((count - i) >= 3) {
443 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
444 " len=%d pa=0x%zx len=%d",
445 segs[i].ds_addr, segs[i].ds_len,
446 segs[i + 1].ds_addr, segs[i + 1].ds_len,
447 segs[i + 2].ds_addr, segs[i + 2].ds_len);
449 } else if ((count - i) == 2) {
451 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
453 segs[i].ds_addr, segs[i].ds_len,
454 segs[i + 1].ds_addr, segs[i + 1].ds_len);
457 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
458 segs[i].ds_addr, segs[i].ds_len);
466 * remember credits used
468 m0->m_pkthdr.csum_data = mbuf_wrs[count];
469 m0->m_pkthdr.len = bytes;
470 toep->tp_wr_avail -= mbuf_wrs[count];
471 toep->tp_wr_unacked += mbuf_wrs[count];
/* Ask HW for a completion when requested or at half the credit budget. */
473 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
474 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
475 struct work_request_hdr *wr = cplhdr(m0);
477 wr->wr_hi |= htonl(F_WR_COMPL);
478 toep->tp_wr_unacked = 0;
480 KASSERT((m0->m_pkthdr.csum_data > 0) &&
481 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
482 m0->m_pkthdr.csum_data));
/* WR stays on the pending queue until acked; must not be freed by TX. */
483 m0->m_type = MT_DONTFREE;
484 enqueue_wr(toep, m0);
485 DPRINTF("sending offload tx with %d bytes in %d segments\n",
487 l2t_send(cdev, m0, toep->tp_l2t);
490 return (total_bytes);
494 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
495 * under any circumstances. We take the easy way out and always queue the
496 * message to the write_queue. We can optimize the case where the queue is
497 * already empty though the optimization is probably not worth it.
500 close_conn(struct socket *so)
503 struct cpl_close_con_req *req;
505 struct inpcb *inp = so_sotoinpcb(so);
511 tp = so_sototcpcb(so);
/* Flush any pending send data before the FIN, unless still in SYN_SENT. */
515 if (tp->t_state != TCPS_SYN_SENT)
516 t3_push_frames(so, 1);
/* Only ever send one FIN. */
518 if (toep->tp_flags & TP_FIN_SENT) {
525 d = TOM_DATA(toep->tp_toedev);
527 m = m_gethdr_nofail(sizeof(*req));
528 m_set_priority(m, CPL_PRIORITY_DATA);
532 toep->tp_flags |= TP_FIN_SENT;
533 req = mtod(m, struct cpl_close_con_req *);
535 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
536 req->wr.wr_lo = htonl(V_WR_TID(tid));
537 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
539 req->rsvd = htonl(toep->tp_write_seq);
543 * XXX - need to defer shutdown while there is still data in the queue
546 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
547 cxgb_ofld_send(d->cdev, m);
552 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
556 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
558 struct cpl_abort_req *req = cplhdr(m);
560 req->cmd = CPL_ABORT_NO_RST;
561 cxgb_ofld_send(cdev, m);
565 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
566 * permitted to return without sending the message in case we cannot allocate
567 * an mbuf. Returns the number of credits sent.
570 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
573 struct cpl_rx_data_ack *req;
574 struct toepcb *toep = tp->t_toe;
575 struct toedev *tdev = toep->tp_toedev;
577 m = m_gethdr_nofail(sizeof(*req));
579 DPRINTF("returning %u credits to HW\n", credits);
581 req = mtod(m, struct cpl_rx_data_ack *);
582 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
584 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* Combine the caller's delayed-ACK flags with the credit count. */
585 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
586 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
587 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
592 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
593 * This is only used in DDP mode, so we take the opportunity to also set the
594 * DACK mode and flush any Rx credits.
597 t3_send_rx_modulate(struct toepcb *toep)
600 struct cpl_rx_data_ack *req;
602 m = m_gethdr_nofail(sizeof(*req));
604 req = mtod(m, struct cpl_rx_data_ack *);
605 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
607 m->m_pkthdr.len = m->m_len = sizeof(*req);
609 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* Return all credits accumulated since the last window update. */
610 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
612 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
613 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
614 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
615 toep->tp_rcv_wup = toep->tp_copied_seq;
619 * Handle receipt of an urgent pointer.
/*
 * NOTE(review): the entire body is compiled out unless
 * URGENT_DATA_SUPPORTED is defined; the code below still uses Linux-style
 * sk_buff/sock APIs and would not build on FreeBSD as-is.
 */
622 handle_urg_ptr(struct socket *so, uint32_t urg_seq)
624 #ifdef URGENT_DATA_SUPPORTED
625 struct tcpcb *tp = so_sototcpcb(so);
627 urg_seq--; /* initially points past the urgent data, per BSD */
629 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
630 return; /* duplicate pointer */
632 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
633 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
634 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
637 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
638 tom_eat_skb(sk, skb, 0);
640 tp->urg_data = TCP_URG_NOTYET;
641 tp->urg_seq = urg_seq;
646 * Returns true if a socket cannot accept new Rx data.
649 so_no_receive(const struct socket *so)
651 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
655 * Process an urgent data notification.
658 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
660 struct cpl_rx_urg_notify *hdr = cplhdr(m);
661 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* Ignore the notification if the socket can no longer receive. */
665 if (!so_no_receive(so))
666 handle_urg_ptr(so, ntohl(hdr->seq));
672 * Handler for RX_URG_NOTIFY CPL messages.
675 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
677 struct toepcb *toep = (struct toepcb *)ctx;
679 rx_urg_notify(toep, m);
/*
 * Whether the tunable delayed-ACK mode may be applied to this connection.
 * NOTE(review): the leading `toep->tp_ulp_mode ||` short-circuits for any
 * non-zero ULP mode, making the TCPDDP/T3-revision clause unreachable —
 * looks like `tp_ulp_mode == 0` may have been intended; confirm against
 * hardware documentation before changing.
 */
684 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
686 return (toep->tp_ulp_mode ||
687 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
688 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
692 * Set of states for which we should return RX credits.
694 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
697 * Called after some received data has been read. It returns RX credits
698 * to the HW for the amount of data processed.
701 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
703 struct toepcb *toep = tp->t_toe;
706 int dack_mode, must_send, read;
707 u32 thres, credits, dack = 0;
710 so = inp_inpcbtosocket(tp->t_inpcb);
711 rcv = so_sockbuf_rcv(so);
/* Outside ESTABLISHED/FIN_WAIT states just account the copied bytes. */
713 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
714 (tp->t_state == TCPS_FIN_WAIT_2))) {
717 toep->tp_copied_seq += copied;
725 inp_wlock_assert(tp->t_inpcb);
727 toep->tp_copied_seq += copied;
/* Infer bytes drained from the receive sockbuf since the last call. */
729 read = toep->tp_enqueued_bytes - rcv->sb_cc;
730 toep->tp_copied_seq += read;
732 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
733 toep->tp_enqueued_bytes = rcv->sb_cc;
/* Clamp runaway credit counts (indicates an accounting bug). */
736 if (credits > rcv->sb_mbmax) {
737 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
738 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
739 credits = rcv->sb_mbmax;
744 * XXX this won't accurately reflect credit return - we need
745 * to look at the difference between the amount that has been
746 * put in the recv sockbuf and what is there now
749 if (__predict_false(!credits))
752 dev = toep->tp_toedev;
753 thres = TOM_TUNABLE(dev, rx_credit_thres);
755 if (__predict_false(thres == 0))
/* Update the delayed-ACK mode if the tunable changed and enough data moved. */
758 if (is_delack_mode_valid(dev, toep)) {
759 dack_mode = TOM_TUNABLE(dev, delack);
760 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
761 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
763 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
764 dack = F_RX_DACK_CHANGE |
765 V_RX_DACK_MODE(dack_mode);
768 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
771 * For coalescing to work effectively ensure the receive window has
772 * at least 16KB left.
774 must_send = credits + 16384 >= tp->rcv_wnd;
776 if (must_send || credits >= thres)
777 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
/* toe_usrreqs disconnect hook: initiate an offloaded connection close. */
781 cxgb_toe_disconnect(struct tcpcb *tp)
785 DPRINTF("cxgb_toe_disconnect\n");
787 so = inp_inpcbtosocket(tp->t_inpcb);
/* toe_usrreqs reset hook: abort the connection and detach it from TOE. */
793 cxgb_toe_reset(struct tcpcb *tp)
795 struct toepcb *toep = tp->t_toe;
802 tp->t_flags &= ~TF_TOE;
/* toe_usrreqs send hook: push pending send-buffer data to the adapter. */
809 cxgb_toe_send(struct tcpcb *tp)
813 DPRINTF("cxgb_toe_send\n");
814 dump_toepcb(tp->t_toe);
816 so = inp_inpcbtosocket(tp->t_inpcb);
817 t3_push_frames(so, 1);
/* toe_usrreqs rcvd hook: return RX credits after the app consumed data. */
822 cxgb_toe_rcvd(struct tcpcb *tp)
825 inp_wlock_assert(tp->t_inpcb);
827 t3_cleanup_rbuf(tp, 0);
/* toe_usrreqs detach hook: release offload state and clear TF_TOE. */
833 cxgb_toe_detach(struct tcpcb *tp)
838 * XXX how do we handle teardown in the SYN_SENT state?
841 inp_lock_assert(tp->t_inpcb);
842 inp_wlock_assert(tp->t_inpcb);
849 tp->t_flags &= ~TF_TOE;
/*
 * TOE user-request vector installed into tp->t_tu by install_offload_ops();
 * maps the stack's generic offload operations onto the cxgb handlers above.
 * Fix: the original listed .tu_detach twice; C's last-initializer-wins rule
 * made the duplicate harmless but it is flagged by -Winitializer-overrides.
 */
854 static struct toe_usrreqs cxgb_toe_usrreqs = {
855 .tu_disconnect = cxgb_toe_disconnect,
856 .tu_reset = cxgb_toe_reset,
857 .tu_send = cxgb_toe_send,
858 .tu_rcvd = cxgb_toe_rcvd,
859 .tu_detach = cxgb_toe_detach,
861 .tu_syncache_event = handle_syncache_event,
/*
 * Fill m with a CPL_SET_TCB_FIELD request updating `word` of the TID's TCB
 * to (old & ~mask) | val, optionally suppressing the reply, and hand it to
 * send_or_defer() (deferred while in SYN_SENT).
 */
866 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
867 uint64_t mask, uint64_t val, int no_reply)
869 struct cpl_set_tcb_field *req;
871 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
872 toep->tp_tid, word, mask, val);
874 req = mtod(m, struct cpl_set_tcb_field *);
875 m->m_pkthdr.len = m->m_len = sizeof(*req);
876 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
878 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
879 req->reply = V_NO_REPLY(no_reply);
881 req->word = htons(word);
882 req->mask = htobe64(mask);
883 req->val = htobe64(val);
885 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
886 send_or_defer(toep, m, 0);
/*
 * Allocate an mbuf and issue a SET_TCB_FIELD update for this connection,
 * with the reply suppressed.  Skipped once the connection is closed or an
 * abort/shutdown is in progress.
 * Fix: corrected the typo in the log message ("seting" -> "setting").
 */
890 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
893 struct tcpcb *tp = toep->tp_tp;
898 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
899 printf("not setting field\n");
903 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
905 __set_tcb_field(toep, m, word, mask, val, 1);
909 * Set one of the t_flags bits in the TCB.
912 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
915 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
919 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
922 t3_set_nagle(struct toepcb *toep)
924 struct tcpcb *tp = toep->tp_tp;
926 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
930 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
933 t3_set_keepalive(struct toepcb *toep, int on_off)
936 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
/* Toggle HW receive coalescing for this connection via the TCB t_flags. */
940 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
942 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
/* Toggle MSS-based delayed-ACK behaviour via the TCB t_flags. */
946 t3_set_dack_mss(struct toepcb *toep, int on_off)
949 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
953 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
956 t3_set_tos(struct toepcb *toep)
958 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
960 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
966 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
967 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
968 * set the PSH bit in the last segment, which would trigger delivery.]
969 * We work around the issue by setting a DDP buffer in a partial placed state,
970 * which guarantees that TP will schedule a timer.
972 #define TP_DDP_TIMER_WORKAROUND_MASK\
973 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
974 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
975 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
976 #define TP_DDP_TIMER_WORKAROUND_VAL\
977 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
978 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
/*
 * Enable or disable DDP for the connection.  Disabling additionally applies
 * the TP timer workaround described above (partial-placed DDP buffer) so TP
 * still schedules delivery of freelist data.
 */
982 t3_enable_ddp(struct toepcb *toep, int on)
986 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
989 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
991 TP_DDP_TIMER_WORKAROUND_MASK,
993 TP_DDP_TIMER_WORKAROUND_VAL);
/* Program the DDP tag/color for buffer buf_idx (0 or 1) in the TCB. */
998 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1000 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1001 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
/*
 * Program offset and length of DDP buffer buf_idx in the TCB.  BUF0 and
 * BUF1 live at different TCB words with different field layouts, hence the
 * two branches (an intervening `if`/`else` appears elided in this listing).
 * NOTE(review): the BUF1 mask uses M_TCB_RX_DDP_BUF1_LEN << 32 while the
 * value shifts len by 32 — verify the mask shift against the TCB layout.
 */
1006 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1010 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1011 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1012 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1013 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1014 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1016 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1017 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1018 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1019 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1020 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
/*
 * Select a congestion-control flavor by name.  Compiled out (no-op) unless
 * CONGESTION_CONTROL_SUPPORTED is defined.
 */
1024 t3_set_cong_control(struct socket *so, const char *name)
1026 #ifdef CONGESTION_CONTROL_SUPPORTED
1029 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1030 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1033 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
/*
 * Request a snapshot of the connection's TCB via CPL_GET_TCB.  Deferred to
 * the out-of-order queue while in SYN_SENT (TID not yet known), otherwise
 * sent immediately.
 */
1040 t3_get_tcb(struct toepcb *toep)
1042 struct cpl_get_tcb *req;
1043 struct tcpcb *tp = toep->tp_tp;
1044 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1049 inp_lock_assert(tp->t_inpcb);
1051 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1052 req = mtod(m, struct cpl_get_tcb *);
1053 m->m_pkthdr.len = m->m_len = sizeof(*req);
1054 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1056 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
/* Deliver the reply to this connection's queue set. */
1057 req->cpuno = htons(toep->tp_qset);
1059 if (tp->t_state == TCPS_SYN_SENT)
1060 mbufq_tail(&toep->out_of_order_queue, m); // defer
1062 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/* Register the toepcb under its TID in the driver's TID table. */
1067 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1072 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1076 * find_best_mtu - find the entry in the MTU table closest to an MTU
1078 * @mtu: the target MTU
1080 * Returns the index of the value in the MTU table that is closest to but
1081 * does not exceed the target MTU.
1084 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
/* Linear scan; mtus[] is sorted ascending. */
1088 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
/*
 * Pick an MTU-table index and set tp->t_maxseg for path MTU `pmtu`.
 * The 40-byte constant is the IPv4 + TCP header overhead subtracted from
 * the MTU to obtain the MSS.  An #if/#else on TCP advertised-MSS handling
 * appears elided between the two index computations in this listing.
 */
1094 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1099 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1102 tp->t_maxseg = pmtu - 40;
1103 if (tp->t_maxseg < td->mtus[0] - 40)
1104 tp->t_maxseg = td->mtus[0] - 40;
1105 idx = find_best_mtu(td, tp->t_maxseg + 40);
1107 tp->t_maxseg = td->mtus[idx] - 40;
1109 idx = find_best_mtu(td, pmtu);
/* Release an active-open TID and drop the reference it held on the toepcb. */
1115 free_atid(struct t3cdev *cdev, unsigned int tid)
1117 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1120 toepcb_release(toep);
1124 * Release resources held by an offload connection (TID, L2T entry, etc.)
1127 t3_release_offload_resources(struct toepcb *toep)
1129 struct tcpcb *tp = toep->tp_tp;
1130 struct toedev *tdev = toep->tp_toedev;
1131 struct t3cdev *cdev;
1133 unsigned int tid = toep->tp_tid;
1134 struct sockbuf *rcv;
1136 CTR0(KTR_TOM, "t3_release_offload_resources");
1141 cdev = TOEP_T3C_DEV(toep);
1146 t3_release_ddp_resources(toep);
1148 #ifdef CTRL_SKB_CACHE
1149 kfree_skb(CTRL_SKB_CACHE(tp));
1150 CTRL_SKB_CACHE(tp) = NULL;
/* Discard any work requests still awaiting completion. */
1153 if (toep->tp_wr_avail != toep->tp_wr_max) {
1154 purge_wr_queue(toep);
1155 reset_wr_list(toep);
1159 l2t_release(L2DATA(cdev), toep->tp_l2t);
1160 toep->tp_l2t = NULL;
1164 inp_wlock_assert(tp->t_inpcb);
1165 so = inp_inpcbtosocket(tp->t_inpcb);
1166 rcv = so_sockbuf_rcv(so);
1168 * cancel any offloaded reads
1174 tp->t_flags &= ~TF_TOE;
/* Abort any user DDP posting in flight and wake the reader. */
1175 if (toep->tp_ddp_state.user_ddp_pending) {
1176 t3_cancel_ubuf(toep, rcv);
1177 toep->tp_ddp_state.user_ddp_pending = 0;
1179 so_sorwakeup_locked(so);
/* SYN_SENT connections hold an atid rather than a full TID. */
1183 if (toep->tp_state == TCPS_SYN_SENT) {
1184 free_atid(cdev, tid);
1186 __skb_queue_purge(&tp->out_of_order_queue);
1188 } else { // we have TID
1189 cxgb_remove_tid(cdev, toep, tid);
1190 toepcb_release(toep);
1193 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
/*
 * Switch the socket over to offloaded operation: install the TOE socket
 * ops, mark the tcpcb TF_TOE, and point t_tu at the cxgb usrreqs vector.
 */
1198 install_offload_ops(struct socket *so)
1200 struct tcpcb *tp = so_sototcpcb(so);
1202 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1204 t3_install_socket_ops(so);
1205 tp->t_flags |= TF_TOE;
1206 tp->t_tu = &cxgb_toe_usrreqs;
1210 * Determine the receive window scaling factor given a target max
/* Returns the smallest wscale (max 14) whose scaled window covers `space`. */
1214 select_rcv_wscale(int space)
1218 if (space > MAX_RCV_WND)
1219 space = MAX_RCV_WND;
1222 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1228 * Determine the receive window size for a socket.
1230 static unsigned long
1231 select_rcv_wnd(struct toedev *dev, struct socket *so)
1233 struct tom_data *d = TOM_DATA(dev);
1235 unsigned int max_rcv_wnd;
1236 struct sockbuf *rcv;
1238 rcv = so_sockbuf_rcv(so);
/* Auto-sized sockets may grow to the tunable max; else use sb_hiwat. */
1240 if (tcp_do_autorcvbuf)
1241 wnd = tcp_autorcvbuf_max;
1243 wnd = rcv->sb_hiwat;
1248 * For receive coalescing to work effectively we need a receive window
1249 * that can accommodate a coalesced segment.
1251 if (wnd < MIN_RCV_WND)
/* Pre-T3C silicon caps the window lower than later revisions. */
1255 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1256 (uint32_t)d->rx_page_size * 23 :
1259 return min(wnd, max_rcv_wnd);
1263 * Assign offload parameters to some socket fields. This code is used by
1264 * both active and passive opens.
1267 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1268 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1270 struct tcpcb *tp = so_sototcpcb(so);
1271 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1272 struct sockbuf *snd, *rcv;
1275 SOCK_LOCK_ASSERT(so);
1278 snd = so_sockbuf_snd(so);
1279 rcv = so_sockbuf_rcv(so);
1281 log(LOG_INFO, "initializing offload socket\n");
1283 * We either need to fix push frames to work with sbcompress
1284 * or we need to add this
1286 snd->sb_flags |= SB_NOCOALESCE;
1287 rcv->sb_flags |= SB_NOCOALESCE;
1291 toep->tp_toedev = dev;
/* Seed the WR credit budget from the device tunable. */
1295 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1296 toep->tp_wr_unacked = 0;
1297 toep->tp_delack_mode = 0;
1299 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1304 tp->rcv_wnd = select_rcv_wnd(dev, so);
/* Enable DDP only when tunable-on, not opted out, and window is big enough. */
1306 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1307 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1308 toep->tp_qset_idx = 0;
1310 reset_wr_list(toep);
1311 DPRINTF("initialization done\n");
1315 * The next two functions calculate the option 0 value for a socket.
1317 static inline unsigned int
1318 calc_opt0h(struct socket *so, int mtu_idx)
1320 struct tcpcb *tp = so_sototcpcb(so);
1321 int wscale = select_rcv_wscale(tp->rcv_wnd);
/* High word: Nagle, keepalive, TCAM bypass, window scale, MTU index. */
1323 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1324 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1325 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
/* Low word of option 0: TOS, ULP mode, and receive buffer size in KB. */
1328 static inline unsigned int
1329 calc_opt0l(struct socket *so, int ulp_mode)
1331 struct tcpcb *tp = so_sototcpcb(so);
1334 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1335 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1337 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
/* Option 2: congestion-control flavor from the device tunable, if set. */
1341 static inline unsigned int
1342 calc_opt2(const struct socket *so, struct toedev *dev)
1346 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1348 return (V_FLAVORS_VALID(flv_valid) |
1349 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
/*
 * Sum the WR credits (stashed in csum_data by t3_push_frames) of all
 * work requests still queued awaiting hardware acknowledgement.
 */
1354 count_pending_wrs(const struct toepcb *toep)
1356 const struct mbuf *m;
1359 wr_queue_walk(toep, m)
1360 n += m->m_pkthdr.csum_data;
1366 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
/*
 * Build a CPL_ACT_OPEN_REQ in m for an active open on atid, filling in the
 * 4-tuple, option 0 (high/low) and option 2 words.
 * NOTE(review): both inp_4tuple_get() and the explicit inp field copies
 * appear below — an elided #if/#else likely selects one of them; confirm
 * against the full source.
 */
1370 mk_act_open_req(struct socket *so, struct mbuf *m,
1371 unsigned int atid, const struct l2t_entry *e)
1373 struct cpl_act_open_req *req;
1374 struct inpcb *inp = so_sotoinpcb(so);
1375 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1376 struct toepcb *toep = tp->t_toe;
1377 struct toedev *tdev = toep->tp_toedev;
1379 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1381 req = mtod(m, struct cpl_act_open_req *);
1382 m->m_pkthdr.len = m->m_len = sizeof(*req);
1384 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1386 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1387 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1389 req->local_port = inp->inp_lport;
1390 req->peer_port = inp->inp_fport;
1391 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1392 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1394 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1395 V_TX_CHANNEL(e->smt_idx));
1396 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1398 req->opt2 = htonl(calc_opt2(so, tdev));
1403 * Convert an ACT_OPEN_RPL status to an errno.
/* Maps hardware active-open failure status codes onto errno values. */
1406 act_open_rpl_status_to_errno(int status)
1409 case CPL_ERR_CONN_RESET:
1410 return (ECONNREFUSED);
1411 case CPL_ERR_ARP_MISS:
1412 return (EHOSTUNREACH);
1413 case CPL_ERR_CONN_TIMEDOUT:
1415 case CPL_ERR_TCAM_FULL:
/* 4-tuple already in use by another (offloaded) connection. */
1417 case CPL_ERR_CONN_EXIST:
1418 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1419 return (EADDRINUSE);
/*
 * fail_act_open - tear down a failed active open: release the offload
 * resources, drop the inpcb write lock (caller must hold it on entry),
 * and report errno to the stack via tcp_offload_drop().
 */
1426 fail_act_open(struct toepcb *toep, int errno)
1428 struct tcpcb *tp = toep->tp_tp;
1430 t3_release_offload_resources(toep);
1432 inp_wunlock(tp->t_inpcb);
1433 tcp_offload_drop(tp, errno);
1437 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1442 * Handle active open failures.
1445 active_open_failed(struct toepcb *toep, struct mbuf *m)
1447 struct cpl_act_open_rpl *rpl = cplhdr(m);
/* Nothing to do if the tcpcb has already been detached. */
1450 if (toep->tp_tp == NULL)
1453 inp = toep->tp_tp->t_inpcb;
1457 * Don't handle connection retry for now
/* NOTE(review): the inet_connection_sock / sk_reset_timer code below is
 * a Linux remnant; presumably compiled out by conditionals elided from
 * this view — confirm before touching. */
1460 struct inet_connection_sock *icsk = inet_csk(sk);
1462 if (rpl->status == CPL_ERR_CONN_EXIST &&
1463 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1464 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1465 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1472 * drops the inpcb lock
1474 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1479 INP_INFO_WUNLOCK(&tcbinfo);
1485 * Return whether a failed active open has allocated a TID
/* True unless the failure happened before the HW assigned a TID. */
1488 act_open_has_tid(int status)
1490 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1491 status != CPL_ERR_ARP_MISS;
1495 * Process an ACT_OPEN_RPL CPL message.
1498 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1500 struct toepcb *toep = (struct toepcb *)ctx;
1501 struct cpl_act_open_rpl *rpl = cplhdr(m);
/* Queue the TID for release if the HW allocated one (skip on T3A). */
1503 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1504 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1506 active_open_failed(toep, m);
1511 * Handle an ARP failure for an active open. XXX purge ofo queue
1513 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1514 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1515 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1516 * free the atid. Hmm.
1520 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1522 struct toepcb *toep = m_get_toep(m);
1523 struct tcpcb *tp = toep->tp_tp;
1524 struct inpcb *inp = tp->t_inpcb;
/* Only fail the connect while it is still in a SYN state. */
1528 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1530 * drops the inpcb lock
/* (fix) pass the toepcb, not the socket: fail_act_open() takes a
 * struct toepcb * (see its definition above); `so` was a type
 * mismatch. */
1532 fail_act_open(toep, EHOSTUNREACH);
1533 printf("freeing %p\n", m);
1542 * Send an active open request.
/*
 * t3_connect - initiate an offloaded active open: allocate a toepcb and
 * an ATID, resolve the L2 entry, build the CPL_ACT_OPEN_REQ and hand it
 * to the L2T layer for transmission.
 */
1545 t3_connect(struct toedev *tdev, struct socket *so,
1546 struct rtentry *rt, struct sockaddr *nam)
1549 struct l2t_entry *e;
1550 struct tom_data *d = TOM_DATA(tdev);
1551 struct inpcb *inp = so_sotoinpcb(so);
1552 struct tcpcb *tp = intotcpcb(inp);
1553 struct toepcb *toep; /* allocated by init_offload_socket */
1557 toep = toepcb_alloc();
/* Reserve an active-open TID; bail out of the connect if none left. */
1561 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1564 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1568 inp_wlock_assert(inp);
1569 m = m_gethdr(MT_DATA, M_WAITOK);
1572 m->m_toe.mt_toepcb = tp->t_toe;
/* If ARP resolution fails the handler tears the connect down. */
1573 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1577 init_offload_socket(so, tdev, atid, e, rt, toep);
1579 install_offload_ops(so);
1581 mk_act_open_req(so, m, atid, e);
1586 m_set_toep(m, tp->t_toe);
1588 toep->tp_state = TCPS_SYN_SENT;
1589 l2t_send(d->cdev, (struct mbuf *)m, e);
1591 if (toep->tp_ulp_mode)
1592 t3_enable_ddp(toep, 0);
/* Error path: undo the ATID reservation. */
1596 printf("failing connect - free atid\n");
1598 free_atid(d->cdev, atid);
1600 printf("return ENOMEM\n");
1605 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1606 * not send multiple ABORT_REQs for the same connection and also that we do
1607 * not try to send a message after the connection has closed. Returns 1 if
1608 * an ABORT_REQ wasn't generated after all, 0 otherwise.
1611 t3_send_reset(struct toepcb *toep)
1614 struct cpl_abort_req *req;
1615 unsigned int tid = toep->tp_tid;
1616 int mode = CPL_ABORT_SEND_RST;
1617 struct tcpcb *tp = toep->tp_tp;
1618 struct toedev *tdev = toep->tp_toedev;
1619 struct socket *so = NULL;
1621 struct sockbuf *snd;
1624 inp_wlock_assert(tp->t_inpcb);
1625 so = toeptoso(toep);
/* Bail if an abort is already in flight or the connection is shut down. */
1628 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1631 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1633 snd = so_sockbuf_snd(so);
1634 /* Purge the send queue so we don't send anything after an abort. */
/* On T3A a post-close abort must be flagged so HW orders it correctly. */
1637 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1638 mode |= CPL_ABORT_POST_CLOSE_REQ;
1640 m = m_gethdr_nofail(sizeof(*req));
1641 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1642 set_arp_failure_handler(m, abort_arp_failure);
1644 req = mtod(m, struct cpl_abort_req *);
1645 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1646 req->wr.wr_lo = htonl(V_WR_TID(tid));
1647 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1648 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1649 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
/* Defer the abort while a SYN is outstanding; otherwise send it now. */
1651 if (tp && (tp->t_state == TCPS_SYN_SENT))
1652 mbufq_tail(&toep->out_of_order_queue, m); // defer
1654 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
/*
 * t3_ip_ctloutput - IP-level socket option handler for offloaded
 * connections; only IP_TOS is supported, and the new TOS is pushed to
 * the hardware TCB via t3_set_tos().
 */
1658 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1663 if (sopt->sopt_name == IP_OPTIONS)
1664 return (ENOPROTOOPT);
1666 if (sopt->sopt_name != IP_TOS)
1667 return (EOPNOTSUPP);
1669 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
/* High-precedence TOS values require superuser privilege. */
1674 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1677 inp = so_sotoinpcb(so);
/* NOTE(review): the two stores below (helper vs. direct field write)
 * look like alternate version-dependent paths; confirm against the
 * preprocessor conditionals elided from this view. */
1679 inp_ip_tos_set(inp, optval);
1681 inp->inp_ip_tos = optval;
1683 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
/*
 * t3_tcp_ctloutput - TCP-level socket option handler for offloaded
 * connections: supports setting TCP_CONGESTION (by algorithm name) and
 * TCP_NODELAY; GET is not implemented for either.
 */
1690 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1695 if (sopt->sopt_name != TCP_CONGESTION &&
1696 sopt->sopt_name != TCP_NODELAY)
1697 return (EOPNOTSUPP);
1699 if (sopt->sopt_name == TCP_CONGESTION) {
1700 char name[TCP_CA_NAME_MAX];
1701 int optlen = sopt->sopt_valsize;
1704 if (sopt->sopt_dir == SOPT_GET) {
1705 KASSERT(0, ("unimplemented"))
1706 return (EOPNOTSUPP);
/* Copy the user-supplied algorithm name, NUL-bounded. */
1712 err = copyinstr(sopt->sopt_val, name,
1713 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1719 tp = so_sototcpcb(so);
1721 * XXX I need to revisit this
1723 if ((err = t3_set_cong_control(so, name)) == 0) {
1724 #ifdef CONGESTION_CONTROL_SUPPORTED
1725 tp->t_cong_control = strdup(name, M_CXGB);
/* TCP_NODELAY path: only SET is supported. */
1734 if (sopt->sopt_dir == SOPT_GET)
1735 return (EOPNOTSUPP);
1737 err = sooptcopyin(sopt, &optval, sizeof optval,
1743 inp = so_sotoinpcb(so);
1745 tp = inp_inpcbtotcpcb(inp);
1747 oldval = tp->t_flags;
1749 tp->t_flags |= TF_NODELAY;
1751 tp->t_flags &= ~TF_NODELAY;
/* Push the Nagle change to hardware only if the flag actually changed. */
1754 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1755 t3_set_nagle(tp->t_toe);
/*
 * t3_ctloutput - socket option entry point for offloaded connections:
 * dispatch to the IP- or TCP-level handler, falling back to the stock
 * tcp_ctloutput() for anything the offload path does not handle.
 */
1763 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1767 if (sopt->sopt_level != IPPROTO_TCP)
1768 err = t3_ip_ctloutput(so, sopt);
1770 err = t3_tcp_ctloutput(so, sopt);
/* EOPNOTSUPP means "not ours" — let the generic handler try. */
1772 if (err != EOPNOTSUPP)
1775 return (tcp_ctloutput(so, sopt));
1779 * Returns true if we need to explicitly request RST when we receive new data
1780 * on an RX-closed connection.
/* Predicate consulted by handle_excess_rx() below. */
1783 need_rst_on_excess_rx(const struct toepcb *toep)
1789 * Handles Rx data that arrives in a state where the socket isn't accepting
/* Requests a RST once (unless an abort is already pending/shut down). */
1793 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1796 if (need_rst_on_excess_rx(toep) &&
1797 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1798 t3_send_reset(toep);
1803 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1804 * by getting the DDP offset from the TCB.
1807 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1809 struct ddp_state *q = &toep->tp_ddp_state;
1810 struct ddp_buf_state *bsp;
1811 struct cpl_get_tcb_rpl *hdr;
1812 unsigned int ddp_offset;
1815 struct sockbuf *rcv;
1822 so = inp_inpcbtosocket(tp->t_inpcb);
1824 inp_wlock_assert(tp->t_inpcb);
1825 rcv = so_sockbuf_rcv(so);
1828 /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1829 * We really need a cookie in order to dispatch the RPLs.
1833 /* It is a possible that a previous CPL already invalidated UBUF DDP
1834 * and moved the cur_buf idx and hence no further processing of this
1835 * skb is required. However, the app might be sleeping on
1836 * !q->get_tcb_count and we need to wake it up.
1838 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1839 int state = so_state_get(so);
1842 if (__predict_true((state & SS_NOFDREF) == 0))
1843 so_sorwakeup_locked(so);
1845 sockbuf_unlock(rcv);
/* Extract the DDP offset of the current buffer from the raw TCB words
 * returned after the CPL header (buffer 0 and 1 live in different TCB
 * fields). */
1850 bsp = &q->buf_state[q->cur_buf];
1852 tcb = (__be64 *)(hdr + 1);
1853 if (q->cur_buf == 0) {
1854 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1855 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1857 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1858 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1860 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
/* The delta between the TCB's offset and our bookkeeping is the amount
 * of data DMA'd since we last looked; repurpose m_len to carry it. */
1861 m->m_cur_offset = bsp->cur_offset;
1862 bsp->cur_offset = ddp_offset;
1863 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1866 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1867 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1868 KASSERT(ddp_offset >= m->m_cur_offset,
1869 ("ddp_offset=%u less than cur_offset=%u",
1870 ddp_offset, m->m_cur_offset));
1874 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1876 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1877 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1879 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1880 rcv_nxt = t >> S_TCB_RCV_NXT;
1881 rcv_nxt &= M_TCB_RCV_NXT;
1883 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1884 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1885 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1887 T3_TRACE2(TIDTB(sk),
1888 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1889 ddp_flags, rcv_nxt - rx_hdr_offset);
1891 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1892 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1894 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1895 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1897 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1898 q->buf_state[0].flags, q->buf_state[1].flags);
/* Discard any data that arrived after the socket stopped receiving. */
1902 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1903 handle_excess_rx(toep, m);
1908 if ((int)m->m_pkthdr.len < 0) {
1909 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1912 if (bsp->flags & DDP_BF_NOCOPY) {
1915 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
/* NOTE(review): the sk/printk lines below are Linux remnants;
 * presumably guarded by conditionals elided from this view. */
1917 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1918 printk("!cancel_ubuf");
1919 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1922 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1923 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1925 } else if (bsp->flags & DDP_BF_NOFLIP) {
1927 m->m_ddp_flags = 1; /* always a kernel buffer */
1929 /* now HW buffer carries a user buffer */
1930 bsp->flags &= ~DDP_BF_NOFLIP;
1931 bsp->flags |= DDP_BF_NOCOPY;
1933 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1934 * any new data in which case we're done. If in addition the
1935 * offset is 0, then there wasn't a completion for the kbuf
1936 * and we need to decrement the posted count.
1938 if (m->m_pkthdr.len == 0) {
1939 if (ddp_offset == 0) {
1941 bsp->flags |= DDP_BF_NODATA;
1943 sockbuf_unlock(rcv);
1948 sockbuf_unlock(rcv);
1950 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1951 * but it got here way late and nobody cares anymore.
/* Hand the (possibly zero-length) DDP completion to the socket. */
1957 m->m_ddp_gl = (unsigned char *)bsp->gl;
1958 m->m_flags |= M_DDP;
1959 m->m_seq = tp->rcv_nxt;
1960 tp->rcv_nxt += m->m_pkthdr.len;
1961 tp->t_rcvtime = ticks;
1962 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1963 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1964 if (m->m_pkthdr.len == 0) {
1965 q->user_ddp_pending = 0;
1970 state = so_state_get(so);
1971 if (__predict_true((state & SS_NOFDREF) == 0))
1972 so_sorwakeup_locked(so);
1974 sockbuf_unlock(rcv);
1978 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1979 * in that case they are similar to DDP completions.
1982 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1984 struct toepcb *toep = (struct toepcb *)ctx;
1986 /* OK if socket doesn't exist */
1988 printf("null toep in do_get_tcb_rpl\n");
1989 return (CPL_RET_BUF_DONE);
/* Process the reply as a DDP completion under the inpcb write lock. */
1992 inp_wlock(toep->tp_tp->t_inpcb);
1993 tcb_rpl_as_ddp_complete(toep, m);
1994 inp_wunlock(toep->tp_tp->t_inpcb);
/*
 * handle_ddp_data - a CPL_RX_DATA arrived for a connection in DDP mode,
 * meaning some data bypassed the normal DDP path; account for whatever
 * the HW already placed in the current DDP buffer (hdr->seq tells us how
 * far it got) before the caller processes the in-band payload.
 */
2000 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2002 struct tcpcb *tp = toep->tp_tp;
2004 struct ddp_state *q;
2005 struct ddp_buf_state *bsp;
2006 struct cpl_rx_data *hdr = cplhdr(m);
2007 unsigned int rcv_nxt = ntohl(hdr->seq);
2008 struct sockbuf *rcv;
/* Nothing was placed by DDP — nothing to account for. */
2010 if (tp->rcv_nxt == rcv_nxt)
2013 inp_wlock_assert(tp->t_inpcb);
2014 so = inp_inpcbtosocket(tp->t_inpcb);
2015 rcv = so_sockbuf_rcv(so);
2018 q = &toep->tp_ddp_state;
2019 bsp = &q->buf_state[q->cur_buf];
2020 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2021 rcv_nxt, tp->rcv_nxt));
/* m_len is repurposed to carry the amount of DDP-placed data. */
2022 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2023 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2024 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2025 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2028 if ((int)m->m_pkthdr.len < 0) {
2029 t3_ddp_error(so, "handle_ddp_data: neg len");
2032 m->m_ddp_gl = (unsigned char *)bsp->gl;
2033 m->m_flags |= M_DDP;
2034 m->m_cur_offset = bsp->cur_offset;
2035 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2036 if (bsp->flags & DDP_BF_NOCOPY)
2037 bsp->flags &= ~DDP_BF_NOCOPY;
2039 m->m_seq = tp->rcv_nxt;
2040 tp->rcv_nxt = rcv_nxt;
2041 bsp->cur_offset += m->m_pkthdr.len;
2042 if (!(bsp->flags & DDP_BF_NOFLIP))
2045 * For now, don't re-enable DDP after a connection fell out of DDP
2048 q->ubuf_ddp_ready = 0;
2049 sockbuf_unlock(rcv);
2053 * Process new data received for a connection.
2056 new_rx_data(struct toepcb *toep, struct mbuf *m)
2058 struct cpl_rx_data *hdr = cplhdr(m);
2059 struct tcpcb *tp = toep->tp_tp;
2061 struct sockbuf *rcv;
2063 int len = be16toh(hdr->len);
2065 inp_wlock(tp->t_inpcb);
2067 so = inp_inpcbtosocket(tp->t_inpcb);
/* Socket no longer accepting data — request RST and drop the payload. */
2069 if (__predict_false(so_no_receive(so))) {
2070 handle_excess_rx(toep, m);
2071 inp_wunlock(tp->t_inpcb);
/* Account for any data the HW already DDP-placed before this CPL. */
2076 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2077 handle_ddp_data(toep, m);
2079 m->m_seq = ntohl(hdr->seq);
2080 m->m_ulp_mode = 0; /* for iSCSI */
/* Out-of-sequence CPL — log and drop rather than corrupt the stream. */
2083 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2085 "%s: TID %u: Bad sequence number %u, expected %u\n",
2086 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2089 inp_wunlock(tp->t_inpcb);
/* Strip the CPL header; only payload goes to the socket buffer. */
2093 m_adj(m, sizeof(*hdr));
2095 #ifdef URGENT_DATA_SUPPORTED
2097 * We don't handle urgent data yet
2099 if (__predict_false(hdr->urg))
2100 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2101 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2102 tp->urg_seq - tp->rcv_nxt < skb->len))
2103 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
/* Track delayed-ACK mode changes reported by the HW. */
2106 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2107 toep->tp_delack_mode = hdr->dack_mode;
2108 toep->tp_delack_seq = tp->rcv_nxt;
2110 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2111 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
/* Trim to the length the CPL header advertised. */
2113 if (len < m->m_pkthdr.len)
2114 m->m_pkthdr.len = m->m_len = len;
2116 tp->rcv_nxt += m->m_pkthdr.len;
2117 tp->t_rcvtime = ticks;
2118 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2120 "new_rx_data: seq 0x%x len %u",
2121 m->m_seq, m->m_pkthdr.len);
2122 inp_wunlock(tp->t_inpcb);
2123 rcv = so_sockbuf_rcv(so);
2127 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2130 inp_wunlock(tp->t_inpcb);
2135 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2138 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2140 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2141 so, rcv->sb_cc, rcv->sb_mbmax));
2145 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2146 rcv->sb_cc, rcv->sb_mbcnt);
/* Wake readers unless the socket has no file descriptor reference. */
2148 state = so_state_get(so);
2149 if (__predict_true((state & SS_NOFDREF) == 0))
2150 so_sorwakeup_locked(so);
2152 sockbuf_unlock(rcv);
2156 * Handler for RX_DATA CPL messages.
2159 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2161 struct toepcb *toep = (struct toepcb *)ctx;
2163 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2165 new_rx_data(toep, m);
/*
 * new_rx_data_ddp - process an RX_DATA_DDP CPL: data was DMA'd directly
 * into one of the two DDP buffers; compute where it landed from the
 * ddp_report word, update sequence/accounting state and wake the reader.
 */
2171 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2174 struct ddp_state *q;
2175 struct ddp_buf_state *bsp;
2176 struct cpl_rx_data_ddp *hdr;
2178 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2180 unsigned int delack_mode;
2181 struct sockbuf *rcv;
2184 inp_wlock(tp->t_inpcb);
2185 so = inp_inpcbtosocket(tp->t_inpcb);
/* Socket no longer accepting data — request RST and drop it. */
2187 if (__predict_false(so_no_receive(so))) {
2189 handle_excess_rx(toep, m);
2190 inp_wunlock(tp->t_inpcb);
2194 q = &toep->tp_ddp_state;
2196 ddp_report = ntohl(hdr->u.ddp_report);
2197 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2198 bsp = &q->buf_state[buf_idx];
2201 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2202 "hdr seq 0x%x len %u",
2203 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2206 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2207 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2209 ddp_len = ntohs(hdr->len);
2210 rcv_nxt = ntohl(hdr->seq) + ddp_len;
/* Track delayed-ACK mode changes reported by the HW. */
2212 delack_mode = G_DDP_DACK_MODE(ddp_report);
2213 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2214 toep->tp_delack_mode = delack_mode;
2215 toep->tp_delack_seq = tp->rcv_nxt;
2218 m->m_seq = tp->rcv_nxt;
2219 tp->rcv_nxt = rcv_nxt;
2221 tp->t_rcvtime = ticks;
2223 * Store the length in m->m_len. We are changing the meaning of
2224 * m->m_len here, we need to be very careful that nothing from now on
2225 * interprets ->len of this packet the usual way.
2227 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2228 inp_wunlock(tp->t_inpcb);
2230 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2231 m->m_len, rcv_nxt, m->m_seq);
2233 * Figure out where the new data was placed in the buffer and store it
2234 * in when. Assumes the buffer offset starts at 0, consumer needs to
2235 * account for page pod's pg_offset.
2237 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2238 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2240 rcv = so_sockbuf_rcv(so);
2243 m->m_ddp_gl = (unsigned char *)bsp->gl;
2244 m->m_flags |= M_DDP;
2245 bsp->cur_offset = end_offset;
2246 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2249 * Length is only meaningful for kbuf
2251 if (!(bsp->flags & DDP_BF_NOCOPY))
2252 KASSERT(m->m_len <= bsp->gl->dgl_length,
2253 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2254 m->m_len, bsp->gl->dgl_length));
2256 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2257 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2259 * Bit 0 of flags stores whether the DDP buffer is completed.
2260 * Note that other parts of the code depend on this being in bit 0.
2262 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2263 panic("spurious ddp completion");
2265 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2266 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2267 q->cur_buf ^= 1; /* flip buffers */
2270 if (bsp->flags & DDP_BF_NOCOPY) {
2271 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2272 bsp->flags &= ~DDP_BF_NOCOPY;
2275 if (ddp_report & F_DDP_PSH)
2276 m->m_ddp_flags |= DDP_BF_PSH;
2278 m->m_ddp_flags |= DDP_BF_NODATA;
/* NOTE(review): skb/tcp_hdr lines below are Linux remnants; presumably
 * guarded by conditionals elided from this view. */
2281 skb_reset_transport_header(skb);
2282 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
/* Wake the reader on PSH, a completed user buffer, or a kernel buffer. */
2286 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2287 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2288 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2289 so_sorwakeup_locked(so);
2291 sockbuf_unlock(rcv);
2294 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2295 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2296 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2300 * Handler for RX_DATA_DDP CPL messages.
2303 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2305 struct toepcb *toep = ctx;
2306 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
/* Reject replies carrying any of the DDP error bits. */
2310 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2311 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2312 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2313 return (CPL_RET_BUF_DONE);
/* NOTE(review): the skb line below is a Linux remnant; presumably
 * guarded by conditionals elided from this view. */
2316 skb->h.th = tcphdr_skb->h.th;
2318 new_rx_data_ddp(toep, m);
/*
 * process_ddp_complete - handle an RX_DDP_COMPLETE CPL: a DDP buffer
 * filled up; compute how much landed since our last bookkeeping point,
 * advance rcv_nxt, flip buffers if allowed and wake the reader.
 */
2323 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2325 struct tcpcb *tp = toep->tp_tp;
2327 struct ddp_state *q;
2328 struct ddp_buf_state *bsp;
2329 struct cpl_rx_ddp_complete *hdr;
2330 unsigned int ddp_report, buf_idx, when, delack_mode;
2332 struct sockbuf *rcv;
/* (fix) acquire the inpcb write lock exactly once: a second
 * inp_wlock() followed here, but the INP lock is not recursive and
 * sibling handlers (new_rx_data, new_rx_data_ddp) lock only once
 * before inp_inpcbtosocket(). */
2334 inp_wlock(tp->t_inpcb);
2335 so = inp_inpcbtosocket(tp->t_inpcb);
2338 if (__predict_false(so_no_receive(so))) {
2339 struct inpcb *inp = so_sotoinpcb(so);
2341 handle_excess_rx(toep, m);
2345 q = &toep->tp_ddp_state;
2347 ddp_report = ntohl(hdr->ddp_report);
2348 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2349 m->m_pkthdr.csum_data = tp->rcv_nxt;
2351 rcv = so_sockbuf_rcv(so);
/* Data placed since the last checkpoint = HW offset - our offset. */
2354 bsp = &q->buf_state[buf_idx];
2355 when = bsp->cur_offset;
2356 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2357 tp->rcv_nxt += m->m_len;
2358 tp->t_rcvtime = ticks;
/* Track delayed-ACK mode changes reported by the HW. */
2360 delack_mode = G_DDP_DACK_MODE(ddp_report);
2361 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2362 toep->tp_delack_mode = delack_mode;
2363 toep->tp_delack_seq = tp->rcv_nxt;
2366 skb_reset_transport_header(skb);
2367 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2369 inp_wunlock(tp->t_inpcb);
2371 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2373 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2374 "ddp_report 0x%x offset %u, len %u",
2375 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2376 G_DDP_OFFSET(ddp_report), m->m_len);
2378 m->m_cur_offset = bsp->cur_offset;
2379 bsp->cur_offset += m->m_len;
2381 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2382 q->cur_buf ^= 1; /* flip buffers */
2383 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2388 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2389 "ddp_report %u offset %u",
2390 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2391 G_DDP_OFFSET(ddp_report));
2393 m->m_ddp_gl = (unsigned char *)bsp->gl;
2394 m->m_flags |= M_DDP;
2395 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2396 if (bsp->flags & DDP_BF_NOCOPY)
2397 bsp->flags &= ~DDP_BF_NOCOPY;
2399 m->m_ddp_flags |= DDP_BF_NODATA;
2402 if ((so_state_get(so) & SS_NOFDREF) == 0)
2403 so_sorwakeup_locked(so);
2405 sockbuf_unlock(rcv);
2409 * Handler for RX_DDP_COMPLETE CPL messages.
2412 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2414 struct toepcb *toep = ctx;
/* NOTE(review): the skb line below is a Linux remnant; presumably
 * guarded by conditionals elided from this view. */
2418 skb->h.th = tcphdr_skb->h.th;
2420 process_ddp_complete(toep, m);
2425 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2426 * socket state before calling tcp_time_wait to comply with its expectations.
2429 enter_timewait(struct tcpcb *tp)
/* NOTE(review): asserting the lock held here and then acquiring it again
 * below would recurse on a non-recursive lock; one of the two lines is
 * presumably under conditionals elided from this view — confirm. */
2432 inp_wlock_assert(tp->t_inpcb);
2434 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2435 * process peer_close because we don't want to carry the peer FIN in
2436 * the socket's receive queue and if we increment rcv_nxt without
2437 * having the FIN in the receive queue we'll confuse facilities such
2440 inp_wlock(tp->t_inpcb);
2443 tp->ts_recent_age = 0; /* defeat recycling */
2444 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2445 inp_wunlock(tp->t_inpcb);
2446 tcp_offload_twstart(tp);
2450 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2451 * function deals with the data that may be reported along with the FIN.
2452 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2453 * perform normal FIN-related processing. In the latter case 1 indicates that
2454 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2458 handle_peer_close_data(struct socket *so, struct mbuf *m)
2460 struct tcpcb *tp = so_sototcpcb(so);
2461 struct toepcb *toep = tp->t_toe;
2462 struct ddp_state *q;
2463 struct ddp_buf_state *bsp;
2464 struct cpl_peer_close *req = cplhdr(m);
2465 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2466 struct sockbuf *rcv;
2468 if (tp->rcv_nxt == rcv_nxt) /* no data */
2471 CTR0(KTR_TOM, "handle_peer_close_data");
2472 if (__predict_false(so_no_receive(so))) {
2473 handle_excess_rx(toep, m);
2476 * Although we discard the data we want to process the FIN so
2477 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2478 * PEER_CLOSE without data. In particular this PEER_CLOSE
2479 * may be what will close the connection. We return 1 because
2480 * handle_excess_rx() already freed the packet.
2485 inp_wlock_assert(tp->t_inpcb);
2486 q = &toep->tp_ddp_state;
2487 rcv = so_sockbuf_rcv(so);
/* Account for data the HW placed in the current DDP buffer before FIN. */
2490 bsp = &q->buf_state[q->cur_buf];
2491 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2492 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2493 m->m_ddp_gl = (unsigned char *)bsp->gl;
2494 m->m_flags |= M_DDP;
2495 m->m_cur_offset = bsp->cur_offset;
2497 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2498 m->m_seq = tp->rcv_nxt;
2499 tp->rcv_nxt = rcv_nxt;
2500 bsp->cur_offset += m->m_pkthdr.len;
2501 if (!(bsp->flags & DDP_BF_NOFLIP))
/* NOTE(review): skb/tcp_hdr lines below are Linux remnants; presumably
 * guarded by conditionals elided from this view. */
2504 skb_reset_transport_header(skb);
2505 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2507 tp->t_rcvtime = ticks;
2509 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2510 so_sorwakeup_locked(so);
2512 sockbuf_unlock(rcv);
2518 * Handle a peer FIN.
2521 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2524 struct tcpcb *tp = toep->tp_tp;
2528 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
/* A pending abort supersedes the FIN (except on T3A). */
2529 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2530 printf("abort_pending set\n");
2534 inp_wlock(tp->t_inpcb);
2535 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* In DDP mode the PEER_CLOSE may carry an implicit DDP completion. */
2536 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2537 keep = handle_peer_close_data(so, m);
2539 inp_wunlock(tp->t_inpcb);
2543 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2545 "waking up waiters for cantrcvmore on %p ", so);
2549 * If connection is half-synchronized
2550 * (ie NEEDSYN flag on) then delay ACK,
2551 * so it may be piggybacked when SYN is sent.
2552 * Otherwise, since we received a FIN then no
2553 * more input can be expected, send ACK now.
2555 if (tp->t_flags & TF_NEEDSYN)
2556 tp->t_flags |= TF_DELACK;
2558 tp->t_flags |= TF_ACKNOW;
/* Standard FIN-driven state transitions. */
2562 switch (tp->t_state) {
2563 case TCPS_SYN_RECEIVED:
2564 tp->t_starttime = ticks;
2566 case TCPS_ESTABLISHED:
2567 tp->t_state = TCPS_CLOSE_WAIT;
2569 case TCPS_FIN_WAIT_1:
2570 tp->t_state = TCPS_CLOSING;
2572 case TCPS_FIN_WAIT_2:
2574 * If we've sent an abort_req we must have sent it too late,
2575 * HW will send us a reply telling us so, and this peer_close
2576 * is really the last message for this connection and needs to
2577 * be treated as an abort_rpl, i.e., transition the connection
2578 * to TCP_CLOSE (note that the host stack does this at the
2579 * time of generating the RST but we must wait for HW).
2580 * Otherwise we enter TIME_WAIT.
2582 t3_release_offload_resources(toep);
2583 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2586 action = TCP_TIMEWAIT;
2591 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2592 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2594 inp_wunlock(tp->t_inpcb);
/* Perform the deferred action outside the inpcb lock. */
2596 if (action == TCP_TIMEWAIT) {
2598 } else if (action == TCP_DROP) {
2599 tcp_offload_drop(tp, 0);
2600 } else if (action == TCP_CLOSE) {
2601 tcp_offload_close(tp);
2605 /* Do not send POLL_HUP for half duplex close. */
/* NOTE(review): the sk_wake_async block below is a Linux remnant;
 * presumably guarded by conditionals elided from this view. */
2606 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2607 sk->sk_state == TCP_CLOSE)
2608 sk_wake_async(so, 1, POLL_HUP);
2610 sk_wake_async(so, 1, POLL_IN);
2619 * Handler for PEER_CLOSE CPL messages.
2622 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2624 struct toepcb *toep = (struct toepcb *)ctx;
2628 do_peer_fin(toep, m);
/*
 * process_close_con_rpl - handle the HW's reply to our CLOSE_CON_REQ:
 * record the acked FIN sequence and drive the local-close side of the
 * TCP state machine (CLOSING -> TIME_WAIT, LAST_ACK -> close,
 * FIN_WAIT_1 -> FIN_WAIT_2).
 */
2633 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2635 struct cpl_close_con_rpl *rpl = cplhdr(m);
2636 struct tcpcb *tp = toep->tp_tp;
2639 struct sockbuf *rcv;
2641 inp_wlock(tp->t_inpcb);
2642 so = inp_inpcbtosocket(tp->t_inpcb);
2644 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
/* A pending abort supersedes the close reply (except on T3A). */
2646 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2647 inp_wunlock(tp->t_inpcb);
2651 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2652 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2654 switch (tp->t_state) {
2655 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2656 t3_release_offload_resources(toep);
2657 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2661 action = TCP_TIMEWAIT;
2666 * In this state we don't care about pending abort_rpl.
2667 * If we've sent abort_req it was post-close and was sent too
2668 * late, this close_con_rpl is the actual last message.
2670 t3_release_offload_resources(toep);
2673 case TCPS_FIN_WAIT_1:
2675 * If we can't receive any more
2676 * data, then closing user can proceed.
2677 * Starting the timer is contrary to the
2678 * specification, but if we don't get a FIN
2679 * we'll hang forever.
2682 * we should release the tp also, and use a
2686 rcv = so_sockbuf_rcv(so);
2690 if (rcv->sb_state & SBS_CANTRCVMORE) {
2694 soisdisconnected(so);
2695 timeout = (tcp_fast_finwait2_recycle) ?
2696 tcp_finwait2_timeout : tcp_maxidle;
2697 tcp_timer_activate(tp, TT_2MSL, timeout);
2699 tp->t_state = TCPS_FIN_WAIT_2;
/* SO_LINGER with zero timeout: abort instead of lingering. */
2700 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2701 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2708 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2709 toep->tp_toedev->tod_name, toep->tp_tid,
2712 inp_wunlock(tp->t_inpcb);
/* Perform the deferred action outside the inpcb lock. */
2715 if (action == TCP_TIMEWAIT) {
2717 } else if (action == TCP_DROP) {
2718 tcp_offload_drop(tp, 0);
2719 } else if (action == TCP_CLOSE) {
2720 tcp_offload_close(tp);
2727 * Handler for CLOSE_CON_RPL CPL messages.
2730 do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2733 struct toepcb *toep = (struct toepcb *)ctx;
2735 process_close_con_rpl(toep, m);
2740 * Process abort replies. We only process these messages if we anticipate
2741 * them as the coordination between SW and HW in this area is somewhat lacking
2742 * and sometimes we get ABORT_RPLs after we are done with the connection that
2743 * originated the ABORT_REQ.
2746 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2748 struct tcpcb *tp = toep->tp_tp;
2753 T3_TRACE1(TIDTB(sk),
2754 "process_abort_rpl: GTS rpl pending %d",
2755 sock_flag(sk, ABORT_RPL_PENDING));
2758 inp_wlock(tp->t_inpcb);
2759 so = inp_inpcbtosocket(tp->t_inpcb);
2761 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2763 * XXX panic on tcpdrop
/* On T3 (non-A) two RPLs arrive; wait for the second before cleanup. */
2765 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2766 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2768 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2769 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2770 !is_t3a(toep->tp_toedev)) {
2771 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2772 panic("TP_ABORT_REQ_RCVD set");
2773 t3_release_offload_resources(toep);
2778 inp_wunlock(tp->t_inpcb);
2781 tcp_offload_close(tp);
/* NOTE(review): sampled listing; some original source lines are elided here. */
2787 * Handle an ABORT_RPL_RSS CPL message.
2790 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2792 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2793 struct toepcb *toep;
2796 * Ignore replies to post-close aborts indicating that the abort was
2797 * requested too late. These connections are terminated when we get
2798 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2799 * arrives the TID is either no longer used or it has been recycled.
2801 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2807 toep = (struct toepcb *)ctx;
2810 * Sometimes we've already closed the socket, e.g., a post-close
2811 * abort races with ABORT_REQ_RSS, the latter frees the socket
2812 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2813 * but FW turns the ABORT_REQ into a regular one and so we get
2814 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
/* Orphaned toepcb (tcpcb already detached): tear down HW state directly. */
2819 if (toep->tp_tp == NULL) {
2820 log(LOG_NOTICE, "removing tid for abort\n");
2821 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2823 l2t_release(L2DATA(cdev), toep->tp_l2t);
/* Drops the reference taken when the TID was installed. */
2825 toepcb_release(toep);
2829 log(LOG_NOTICE, "toep=%p\n", toep);
2830 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2833 process_abort_rpl(toep, m);
2834 toepcb_release(toep);
/* NOTE(review): sampled listing; some original source lines are elided here. */
2839 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2840 * indicate whether RST should be sent in response.
2843 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2845 struct tcpcb *tp = so_sototcpcb(so);
2847 switch (abort_reason) {
2848 case CPL_ERR_BAD_SYN:
2850 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2852 case CPL_ERR_CONN_RESET:
2853 // XXX need to handle SYN_RECV due to crossed SYNs
/* A reset in CLOSE_WAIT means the peer is gone mid-conversation: EPIPE. */
2854 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2855 case CPL_ERR_XMIT_TIMEDOUT:
2856 case CPL_ERR_PERSIST_TIMEDOUT:
2857 case CPL_ERR_FINWAIT2_TIMEDOUT:
2858 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2860 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
/* NOTE(review): the ETIMEDOUT return and the default arm are elided here. */
/*
 * Populate an ABORT_RPL work request in-place in mbuf m.
 * NOTE(review): sampled listing; the store of `cmd` into the reply
 * (rpl->cmd) appears to be elided from this excerpt.
 */
2869 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2871 struct cpl_abort_rpl *rpl = cplhdr(m);
2873 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2874 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2875 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2877 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
/*
 * Deferred-context half of send_abort_rpl(): runs once an mbuf can be
 * allocated with m_gethdr_nofail(); the original request mbuf m carries
 * the rst_status stashed in req->status.
 */
2882 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2884 struct mbuf *reply_mbuf;
2885 struct cpl_abort_req_rss *req = cplhdr(m);
2887 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
/*
 * NOTE(review): priority and length are set on the request mbuf `m`,
 * not on `reply_mbuf` which is actually sent. set_abort_rpl_wr() does
 * set reply_mbuf's length, but the priority looks misapplied — confirm
 * whether these two lines should target reply_mbuf.
 */
2888 m_set_priority(m, CPL_PRIORITY_DATA);
2889 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2890 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2891 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2896 * Returns whether an ABORT_REQ_RSS message is a negative advice.
2899 is_neg_adv_abort(unsigned int status)
2901 return status == CPL_ERR_RTX_NEG_ADVICE ||
2902 status == CPL_ERR_PERSIST_NEG_ADVICE;
/*
 * Send an ABORT_RPL for the given ABORT_REQ. If no mbuf is available
 * right now, stash rst_status in the request and defer the reply.
 * NOTE(review): sampled listing; some original source lines are elided.
 */
2906 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2908 struct mbuf *reply_mbuf;
2909 struct cpl_abort_req_rss *req = cplhdr(m);
2911 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2914 /* Defer the reply. Stick rst_status into req->cmd. */
2915 req->status = rst_status;
2916 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2920 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2921 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2925 * XXX need to sync with ARP as for SYN_RECV connections we can send
2926 * these messages while ARP is pending. For other connection states
2927 * it's not a problem.
2929 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Remove a SYN_RECV child from its parent's SYN queue.
 * NOTE(review): body is legacy Linux-derived code guarded by
 * CXGB_UNIMPLEMENTED(); it is not functional on FreeBSD.
 */
2934 cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2936 CXGB_UNIMPLEMENTED();
2938 struct request_sock *req = child->sk_user_data;
2940 inet_csk_reqsk_queue_removed(parent, req);
2941 synq_remove(tcp_sk(child));
2943 child->sk_user_data = NULL;
/* NOTE(review): sampled listing; some original source lines are elided here. */
2949 * Performs the actual work to abort a SYN_RECV connection.
2952 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2954 struct tcpcb *parenttp = so_sototcpcb(parent);
2955 struct tcpcb *childtp = so_sototcpcb(child);
2958 * If the server is still open we clean up the child connection,
2959 * otherwise the server already did the clean up as it was purging
2960 * its SYN queue and the skb was just sitting in its backlog.
2962 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2963 cleanup_syn_rcv_conn(child, parent);
/* Tear down the child's offload state under its own inpcb lock. */
2964 inp_wlock(childtp->t_inpcb);
2965 t3_release_offload_resources(childtp->t_toe);
2966 inp_wunlock(childtp->t_inpcb);
2967 tcp_offload_close(childtp);
/* NOTE(review): sampled listing; some original source lines are elided here. */
2973 * Handle abort requests for a SYN_RECV connection. These need extra work
2974 * because the socket is on its parent's SYN queue.
2977 abort_syn_rcv(struct socket *so, struct mbuf *m)
/* Legacy Linux-derived path; not functional on FreeBSD (see guard below). */
2979 CXGB_UNIMPLEMENTED();
2981 struct socket *parent;
2982 struct toedev *tdev = toep->tp_toedev;
2983 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2984 struct socket *oreq = so->so_incomp;
2985 struct t3c_tid_entry *t3c_stid;
2989 return -1; /* somehow we are not on the SYN queue */
/* Map the server TID back to the listening socket that owns this child. */
2991 t = &(T3C_DATA(cdev))->tid_maps;
2992 t3c_stid = lookup_stid(t, oreq->ts_recent);
2993 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2996 do_abort_syn_rcv(so, parent);
2997 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
/* NOTE(review): sampled listing; some original source lines are elided here. */
3004 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
3005 * request except that we need to reply to it.
3008 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3010 int rst_status = CPL_ABORT_NO_RST;
3011 const struct cpl_abort_req_rss *req = cplhdr(m);
3012 struct tcpcb *tp = toep->tp_tp;
3016 inp_wlock(tp->t_inpcb);
3017 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* First ABORT_REQ for this connection: remember it and mark shutdown. */
3018 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3019 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3024 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3026 * Three cases to consider:
3027 * a) We haven't sent an abort_req; close the connection.
3028 * b) We have sent a post-close abort_req that will get to TP too late
3029 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3030 * be ignored and the connection should be closed now.
3031 * c) We have sent a regular abort_req that will get to TP too late.
3032 * That will generate an abort_rpl with status 0, wait for it.
3034 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3035 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
/* Surface the HW abort reason to the socket as a FreeBSD errno. */
3038 error = abort_status_to_errno(so, req->status,
3040 so_error_set(so, error);
3042 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3045 * SYN_RECV needs special processing. If abort_syn_rcv()
3046 * returns 0 is has taken care of the abort.
3048 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3051 t3_release_offload_resources(toep);
3054 inp_wunlock(tp->t_inpcb);
3057 tcp_offload_close(tp);
/* Always acknowledge the ABORT_REQ, even when we otherwise ignore it. */
3059 send_abort_rpl(m, tdev, rst_status);
3062 inp_wunlock(tp->t_inpcb);
/* NOTE(review): sampled listing; some original source lines are elided here. */
3066 * Handle an ABORT_REQ_RSS CPL message.
3069 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3071 const struct cpl_abort_req_rss *req = cplhdr(m);
3072 struct toepcb *toep = (struct toepcb *)ctx;
/* Negative advice (retransmit/persist trouble) is informational only. */
3074 if (is_neg_adv_abort(req->status)) {
3079 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
/* Embryonic (SYN_RCVD) connection: no socket yet, tear down HW state here. */
3081 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3082 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3083 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3085 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3087 l2t_release(L2DATA(cdev), toep->tp_l2t);
/* Detach the toepcb from the tcpcb so the stack stops treating it as TOE. */
3092 toep->tp_tp->t_toe = NULL;
3093 toep->tp_tp->t_flags &= ~TF_TOE;
3096 * XXX need to call syncache_chkrst - but we don't
3097 * have a way of doing that yet
3099 toepcb_release(toep);
3100 log(LOG_ERR, "abort for unestablished connection :-(\n");
3103 if (toep->tp_tp == NULL) {
3104 log(LOG_NOTICE, "disconnected toepcb\n");
3105 /* should be freed momentarily */
3111 process_abort_req(toep, m, toep->tp_toedev);
3112 toepcb_release(toep);
/*
 * Abort a passive-open (SYN_RECV) child and, on T3, send a reject
 * CPL_PASS_ACCEPT_RPL reusing the supplied mbuf.
 * NOTE(review): sampled listing; some original source lines are elided.
 */
3117 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3119 struct toedev *tdev = TOE_DEV(parent);
3121 do_abort_syn_rcv(child, parent);
3122 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3123 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3125 rpl->opt0h = htonl(F_TCAM_BYPASS);
3126 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3127 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
/*
 * ARP resolution failed for a passive-open reply: locate the parent
 * listener through the STID map and abort the embryonic connection.
 * NOTE(review): guarded by CXGB_UNIMPLEMENTED(); legacy Linux-derived path.
 */
3133 handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3135 CXGB_UNIMPLEMENTED();
3138 struct t3cdev *cdev;
3139 struct socket *parent;
3140 struct socket *oreq;
3141 struct t3c_tid_entry *t3c_stid;
3143 struct tcpcb *otp, *tp = so_sototcpcb(so);
3144 struct toepcb *toep = tp->t_toe;
3147 * If the connection is being aborted due to the parent listening
3148 * socket going away there's nothing to do, the ABORT_REQ will close
3151 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3156 oreq = so->so_incomp;
3157 otp = so_sototcpcb(oreq);
3160 t = &(T3C_DATA(cdev))->tid_maps;
3161 t3c_stid = lookup_stid(t, otp->ts_recent);
3162 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3165 pass_open_abort(so, parent, m);
/* NOTE(review): sampled listing; some original source lines are elided here. */
3171 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3172 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3176 pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3180 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3181 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3183 handle_pass_open_arp_failure(m_get_socket(m), m);
/* NOTE(review): sampled listing; the `rpl->opt2` initializer and mbuf length
 * setup appear to be elided from this excerpt. */
3187 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3190 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3192 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3193 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3194 unsigned int tid = GET_TID(req);
3196 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3197 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3198 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3199 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
/* TCAM bypass + reject status tells HW to drop the embryonic connection. */
3200 rpl->opt0h = htonl(F_TCAM_BYPASS);
3201 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3203 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
/* NOTE(review): sampled listing; some original source lines are elided here. */
3207 * Send a deferred reject to an accept request.
3210 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3212 struct mbuf *reply_mbuf;
/* m_gethdr_nofail() cannot fail, so no allocation check is needed. */
3214 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3215 mk_pass_accept_rpl(reply_mbuf, m);
3216 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Syncache callback: drop the extra toepcb reference taken when the
 * embryonic connection was added to the syncache.
 * NOTE(review): sampled listing; switch braces and case labels are
 * partially elided from this excerpt.
 */
3221 handle_syncache_event(int event, void *arg)
3223 struct toepcb *toep = arg;
3226 case TOE_SC_ENTRY_PRESENT:
3228 * entry already exists - free toepcb
3231 printf("syncache entry present\n");
3232 toepcb_release(toep);
3236 * The syncache has given up on this entry
3237 * either it timed out, or it was evicted
3238 * we need to explicitly release the tid
3240 printf("syncache entry dropped\n");
3241 toepcb_release(toep);
3244 log(LOG_ERR, "unknown syncache event %d\n", event);
/*
 * Build a synthetic SYN (tcphdr + tcpopt + in_conninfo) from a
 * CPL_PASS_ACCEPT_REQ and enter it into the FreeBSD syncache so the
 * stack can later expand it into a full socket.
 * NOTE(review): sampled listing; some original source lines are elided.
 */
3250 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3252 struct in_conninfo inc;
3256 int mss, wsf, sack, ts;
3257 uint32_t rcv_isn = ntohl(req->rcv_isn);
3259 bzero(&to, sizeof(struct tcpopt));
3260 inp = so_sotoinpcb(lso);
3263 * Fill out information for entering us into the syncache
3265 bzero(&inc, sizeof(inc));
/* Ports/addrs come straight from the CPL in network byte order. */
3266 inc.inc_fport = th.th_sport = req->peer_port;
3267 inc.inc_lport = th.th_dport = req->local_port;
3268 th.th_seq = req->rcv_isn;
3269 th.th_flags = TH_SYN;
/* HW consumed the SYN, so our receive-side sequence state starts at ISN+1. */
3271 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3274 inc.inc_faddr.s_addr = req->peer_ip;
3275 inc.inc_laddr.s_addr = req->local_ip;
3277 DPRINTF("syncache add of %d:%d %d:%d\n",
3278 ntohl(req->local_ip), ntohs(req->local_port),
3279 ntohl(req->peer_ip), ntohs(req->peer_port));
/* Translate the HW-reported TCP options into a struct tcpopt. */
3281 mss = req->tcp_options.mss;
3282 wsf = req->tcp_options.wsf;
3283 ts = req->tcp_options.tstamp;
3284 sack = req->tcp_options.sack;
3287 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3288 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
/* NOTE(review): sampled listing; a significant number of original source
 * lines (error paths, labels, braces) are elided from this excerpt. */
3293 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3294 * lock held. Note that the sock here is a listening socket that is not owned
3298 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3299 struct listen_ctx *lctx)
3302 struct l2t_entry *e;
3304 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3305 struct cpl_pass_accept_rpl *rpl;
3306 struct cpl_pass_accept_req *req = cplhdr(m);
3307 unsigned int tid = GET_TID(req);
3308 struct tom_data *d = TOM_DATA(tdev);
3309 struct t3cdev *cdev = d->cdev;
3310 struct tcpcb *tp = so_sototcpcb(so);
3311 struct toepcb *newtoep;
3312 struct rtentry *dst;
3313 struct sockaddr_in nam;
3314 struct t3c_data *td = T3C_DATA(cdev);
/* If no mbuf is available now, defer the reject or queue a TID release. */
3316 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3317 if (__predict_false(reply_mbuf == NULL)) {
3318 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3319 t3_defer_reply(m, tdev, reject_pass_request);
3321 cxgb_queue_tid_release(cdev, tid);
3324 DPRINTF("failed to get reply_mbuf\n");
3329 if (tp->t_state != TCPS_LISTEN) {
3330 DPRINTF("socket not in listen state\n");
/* Map the destination MAC/VLAN back to the ingress interface. */
3335 tim.mac_addr = req->dst_mac;
3336 tim.vlan_tag = ntohs(req->vlan_tag);
3337 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3338 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3344 * XXX do route lookup to confirm that we're still listening on this
/* NOTE(review): the following route-validation code is Linux-derived and
 * appears to sit inside a disabled block; rt_flags is forced below. */
3347 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3348 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3350 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3351 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3352 dst_release(skb->dst); // done with the input route, release it
3355 if ((rt_flags & RTF_LOCAL) == 0)
3361 rt_flags = RTF_LOCAL;
3362 if ((rt_flags & RTF_LOCAL) == 0)
3366 * Calculate values and add to syncache
3369 newtoep = toepcb_alloc();
3370 if (newtoep == NULL)
3373 bzero(&nam, sizeof(struct sockaddr_in));
3375 nam.sin_len = sizeof(struct sockaddr_in);
3376 nam.sin_family = AF_INET;
3377 nam.sin_addr.s_addr =req->peer_ip;
3378 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3381 printf("failed to find route\n");
3384 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3385 (struct sockaddr *)&nam);
3387 DPRINTF("failed to get l2t\n");
3390 * Point to our listen socket until accept
3392 newtoep->tp_tp = tp;
3393 newtoep->tp_flags = TP_SYN_RCVD;
3394 newtoep->tp_tid = tid;
3395 newtoep->tp_toedev = tdev;
3396 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3398 cxgb_insert_tid(cdev, d->client, newtoep, tid);
/* Keep the embryonic connection on the listener's SYN queue. */
3400 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
/* Enable DDP only when tuned on, not opted out, and the window is large enough. */
3403 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3404 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3406 if (newtoep->tp_ulp_mode) {
3407 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3409 if (ddp_mbuf == NULL)
3410 newtoep->tp_ulp_mode = 0;
3413 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3414 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3415 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3417 * XXX workaround for lack of syncache drop
3419 toepcb_hold(newtoep);
3420 syncache_add_accept_req(req, so, newtoep);
/* Build the accept reply WR and hand it to the L2T layer for transmit. */
3422 rpl = cplhdr(reply_mbuf);
3423 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3424 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3426 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3427 rpl->opt2 = htonl(calc_opt2(so, tdev));
3428 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3429 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3431 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3432 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3433 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3434 CPL_PASS_OPEN_ACCEPT);
3436 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3438 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3440 l2t_send(cdev, reply_mbuf, e);
/* When DDP is active, prime the TCB's RX DDP flags with the timer workaround. */
3442 if (newtoep->tp_ulp_mode) {
3443 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3445 TP_DDP_TIMER_WORKAROUND_MASK,
3447 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3449 DPRINTF("no DDP\n");
/* Failure path: send a reject (T3) or a TID release, then count the failure. */
3453 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3454 mk_pass_accept_rpl(reply_mbuf, m);
3456 mk_tid_release(reply_mbuf, newtoep, tid);
3457 cxgb_ofld_send(cdev, reply_mbuf);
3461 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
/* NOTE(review): sampled listing; some original source lines are elided here.
 * The printk/unlikely calls below are Linux-derived remnants — presumably
 * inside a disabled block; confirm against the full source. */
3468 * Handle a CPL_PASS_ACCEPT_REQ message.
3471 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3473 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3474 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3475 struct tom_data *d = listen_ctx->tom_data;
3478 struct cpl_pass_accept_req *req = cplhdr(m);
3479 unsigned int tid = GET_TID(req);
3480 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3482 if (unlikely(!lsk)) {
3483 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3485 (unsigned long)((union listen_entry *)ctx -
3487 return CPL_RET_BUF_DONE;
/* Sanity-check the TID the HW handed us before touching the TID table. */
3489 if (unlikely(tid >= t->ntids)) {
3490 printk(KERN_ERR "%s: passive open TID %u too large\n",
3492 return CPL_RET_BUF_DONE;
3495 * For T3A the current user of the TID may have closed but its last
3496 * message(s) may have been backlogged so the TID appears to be still
3497 * in use. Just take the TID away, the connection can close at its
3498 * own leisure. For T3B this situation is a bug.
3500 if (!valid_new_tid(t, tid) &&
3501 cdev->type != T3A) {
3502 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3504 return CPL_RET_BUF_DONE;
3508 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3513 * Called when a connection is established to translate the TCP options
3514 * reported by HW to FreeBSD's native format.
3517 assign_rxopt(struct socket *so, unsigned int opt)
3519 struct tcpcb *tp = so_sototcpcb(so);
3520 struct toepcb *toep = tp->t_toe;
3521 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3523 inp_wlock_assert(tp->t_inpcb);
3525 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3526 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3527 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3528 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3529 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3530 (TF_RCVD_SCALE|TF_REQ_SCALE))
3531 tp->rcv_scale = tp->request_r_scale;
/* NOTE(review): sampled listing; some original source lines are elided here. */
3535 * Completes some final bits of initialization for just established connections
3536 * and changes their state to TCP_ESTABLISHED.
3538 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3541 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3543 struct tcpcb *tp = so_sototcpcb(so);
3544 struct toepcb *toep = tp->t_toe;
/* Seed every send-side sequence variable from the HW-reported ISN. */
3546 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3547 assign_rxopt(so, opt);
/* Route socket-option changes through the TOE-aware ctloutput. */
3554 so->so_proto->pr_ctloutput = t3_ctloutput;
3558 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3561 * XXX not clear what rcv_wup maps to
3564 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3565 * pass through opt0.
3567 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3568 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3574 * no clean interface for marking ARP up to date
3576 dst_confirm(sk->sk_dst_cache);
3578 tp->t_starttime = ticks;
3579 tp->t_state = TCPS_ESTABLISHED;
/*
 * Build a synthetic ACK (tcphdr + tcpopt + in_conninfo) from a
 * CPL_PASS_ESTABLISH and expand the matching syncache entry into a
 * full socket. Returns the result of tcp_offload_syncache_expand().
 * NOTE(review): sampled listing; some original source lines are elided.
 */
3584 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3587 struct in_conninfo inc;
3590 int mss, wsf, sack, ts;
3591 struct mbuf *m = NULL;
3592 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3596 #error "no MAC support"
3599 opt = ntohs(req->tcp_opt);
3601 bzero(&to, sizeof(struct tcpopt));
3604 * Fill out information for entering us into the syncache
3606 bzero(&inc, sizeof(inc));
3607 inc.inc_fport = th.th_sport = req->peer_port;
3608 inc.inc_lport = th.th_dport = req->local_port;
3609 th.th_seq = req->rcv_isn;
3610 th.th_flags = TH_ACK;
3613 inc.inc_faddr.s_addr = req->peer_ip;
3614 inc.inc_laddr.s_addr = req->local_ip;
/* Decode the HW's packed tcp_opt field into individual option values. */
3616 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3617 wsf = G_TCPOPT_WSCALE_OK(opt);
3618 ts = G_TCPOPT_TSTAMP(opt);
3619 sack = G_TCPOPT_SACK(opt);
3622 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3623 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3625 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3626 ntohl(req->local_ip), ntohs(req->local_port),
3627 ntohl(req->peer_ip), ntohs(req->peer_port),
3628 mss, wsf, ts, sack);
3629 return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
/* NOTE(review): sampled listing; a significant number of original source
 * lines (error paths, #ifdef'd Linux remnants, braces) are elided here. */
3634 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3635 * if we are in TCP_SYN_RECV due to crossed SYNs
3638 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3640 struct cpl_pass_establish *req = cplhdr(m);
3641 struct toepcb *toep = (struct toepcb *)ctx;
3642 struct tcpcb *tp = toep->tp_tp;
3643 struct socket *so, *lso;
3644 struct t3c_data *td = T3C_DATA(cdev);
3645 struct sockbuf *snd, *rcv;
3647 // Complete socket initialization now that we have the SND_ISN
3649 struct toedev *tdev;
3652 tdev = toep->tp_toedev;
3654 inp_wlock(tp->t_inpcb);
3658 * XXX need to add reference while we're manipulating
/* tp currently points at the LISTEN socket's tcpcb; see syncache expand below. */
3660 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3662 inp_wunlock(tp->t_inpcb);
/* Take the embryonic connection off the listener's SYN queue. */
3665 LIST_REMOVE(toep, synq_entry);
3668 if (!syncache_expand_establish_req(req, &so, toep)) {
3672 CXGB_UNIMPLEMENTED();
3676 * Couldn't create the socket
3678 CXGB_UNIMPLEMENTED();
/* From here on, tp/so refer to the newly created child connection. */
3681 tp = so_sototcpcb(so);
3682 inp_wlock(tp->t_inpcb);
3684 snd = so_sockbuf_snd(so);
3685 rcv = so_sockbuf_rcv(so);
/* TOE delivers fully-formed buffers; coalescing would hurt DDP. */
3687 snd->sb_flags |= SB_NOCOALESCE;
3688 rcv->sb_flags |= SB_NOCOALESCE;
3693 reset_wr_list(toep);
3694 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3695 tp->rcv_nxt = toep->tp_copied_seq;
3696 install_offload_ops(so);
/* Initialize work-request accounting and the TX queue-set binding. */
3698 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3699 toep->tp_wr_unacked = 0;
3700 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3701 toep->tp_qset_idx = 0;
3702 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3705 * XXX Cancel any keep alive timer
3708 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3711 * XXX workaround for lack of syncache drop
3713 toepcb_release(toep);
3714 inp_wunlock(tp->t_inpcb);
3716 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3717 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3720 * XXX not sure how these checks map to us
/* NOTE(review): the sk_* code below is Linux-derived, presumably in a
 * disabled block — it does not compile against FreeBSD sockets. */
3722 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3723 sk->sk_state_change(sk);
3724 sk_wake_async(so, 0, POLL_OUT);
3727 * The state for the new connection is now up to date.
3728 * Next check if we should add the connection to the parent's
3729 * accept queue. When the parent closes it resets connections
3730 * on its SYN queue, so check if we are being reset. If so we
3731 * don't need to do anything more, the coming ABORT_RPL will
3732 * destroy this socket. Otherwise move the connection to the
3735 * Note that we reset the synq before closing the server so if
3736 * we are not being reset the stid is still open.
3738 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
/* NOTE(review): sampled listing; the declaration of the loop variable `m`
 * and the function braces are elided from this excerpt. */
3749 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3750 * and send them to the TOE.
3753 fixup_and_send_ofo(struct toepcb *toep)
3756 struct toedev *tdev = toep->tp_toedev;
3757 struct tcpcb *tp = toep->tp_tp;
3758 unsigned int tid = toep->tp_tid;
3760 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3762 inp_wlock_assert(tp->t_inpcb);
/* Drain every deferred CPL, patching in the now-known TID before sending. */
3763 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3765 * A variety of messages can be waiting but the fields we'll
3766 * be touching are common to all so any message type will do.
3768 struct cpl_close_con_req *p = cplhdr(m);
3770 p->wr.wr_lo = htonl(V_WR_TID(tid));
3771 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3772 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
/* NOTE(review): sampled listing; some original source lines are elided here. */
3777 * Updates socket state from an active establish CPL message. Runs with the
3781 socket_act_establish(struct socket *so, struct mbuf *m)
3783 struct cpl_act_establish *req = cplhdr(m);
3784 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3785 struct tcpcb *tp = so_sototcpcb(so);
3786 struct toepcb *toep = tp->t_toe;
3788 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3789 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3790 toep->tp_tid, tp->t_state);
/* Seed receive-side sequence state from the HW-reported ISN. */
3792 tp->ts_recent_age = ticks;
3793 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3794 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3796 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3799 * Now that we finally have a TID send any CPL messages that we had to
3800 * defer for lack of a TID.
3802 if (mbufq_len(&toep->out_of_order_queue))
3803 fixup_and_send_ofo(toep);
3805 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3807 * XXX does this even make sense?
3814 * XXX assume no write requests permitted while socket connection is
3818 * Currently the send queue must be empty at this point because the
3819 * socket layer does not send anything before a connection is
3820 * established. To be future proof though we handle the possibility
3821 * that there are pending buffers to send (either TX_DATA or
3822 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3823 * buffers according to the just learned write_seq, and then we send
3824 * them on their way.
/* NOTE(review): Linux-derived remnants below; presumably inside a
 * disabled block — fixup_pending_writeq_buffers(sk) has no FreeBSD sk. */
3826 fixup_pending_writeq_buffers(sk);
3827 if (t3_push_frames(so, 1))
3828 sk->sk_write_space(sk);
3831 toep->tp_state = tp->t_state;
3832 tcpstat.tcps_connects++;
/* NOTE(review): sampled listing; some original source lines are elided —
 * inp_wlock() appears twice below, presumably from different branches of
 * the original control flow. */
3837 * Process a CPL_ACT_ESTABLISH message.
3840 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3842 struct cpl_act_establish *req = cplhdr(m);
3843 unsigned int tid = GET_TID(req);
3844 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3845 struct toepcb *toep = (struct toepcb *)ctx;
3846 struct tcpcb *tp = toep->tp_tp;
3848 struct toedev *tdev;
/* The active-open TID is no longer needed once the real TID is assigned. */
3852 free_atid(cdev, atid);
3855 inp_wlock(tp->t_inpcb);
3860 so = inp_inpcbtosocket(tp->t_inpcb);
3861 tdev = toep->tp_toedev; /* blow up here if link was down */
3863 inp_wlock(tp->t_inpcb);
3866 * It's OK if the TID is currently in use, the owning socket may have
3867 * backlogged its last CPL message(s). Just take it away.
/* Register the connection under its HW TID and finish establishment. */
3871 so_insert_tid(d, toep, tid);
3872 free_atid(cdev, atid);
3873 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3875 socket_act_establish(so, m);
3876 inp_wunlock(tp->t_inpcb);
3877 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3878 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
/* NOTE(review): sampled listing; loop braces and several original source
 * lines are elided from this excerpt. */
3884 * Process an acknowledgment of WR completion. Advance snd_una and send the
3885 * next batch of work requests from the write queue.
3888 wr_ack(struct toepcb *toep, struct mbuf *m)
3890 struct tcpcb *tp = toep->tp_tp;
3891 struct cpl_wr_ack *hdr = cplhdr(m);
3893 unsigned int credits = ntohs(hdr->credits);
3894 u32 snd_una = ntohl(hdr->snd_una);
3896 struct sockbuf *snd;
3898 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3900 inp_wlock(tp->t_inpcb);
3901 so = inp_inpcbtosocket(tp->t_inpcb);
/* Return the acknowledged credits and clamp the unacked counter. */
3903 toep->tp_wr_avail += credits;
3904 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3905 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
/* Walk the pending-WR list, retiring fully acknowledged requests. */
3908 struct mbuf *p = peek_wr(toep);
3910 if (__predict_false(!p)) {
3911 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3912 "nothing pending, state %u wr_avail=%u\n",
3913 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3917 "wr_ack: p->credits=%d p->bytes=%d",
3918 p->m_pkthdr.csum_data, p->m_pkthdr.len);
3919 KASSERT(p->m_pkthdr.csum_data != 0,
3920 ("empty request still on list"));
/* Partial ack of the head WR: consume the credits and stop. */
3922 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3925 struct tx_data_wr *w = cplhdr(p);
3927 "TID %u got %u WR credits, need %u, len %u, "
3928 "main body %u, frags %u, seq # %u, ACK una %u,"
3929 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3930 toep->tp_tid, credits, p->csum, p->len,
3931 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3932 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3933 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3935 p->m_pkthdr.csum_data -= credits;
3939 credits -= p->m_pkthdr.csum_data;
3940 bytes += p->m_pkthdr.len;
3942 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3943 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3950 check_wr_invariants(tp);
/* A regressing snd_una indicates lost coordination with the HW. */
3953 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3955 struct tom_data *d = TOM_DATA(TOE_DEV(so));
/* NOTE(review): comma after LOG_ERR appears lost in this excerpt —
 * `log(LOG_ERR, "%s: ...` is the expected form; confirm against source. */
3957 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3958 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3959 toep->tp_tid, tp->snd_una);
3964 if (tp->snd_una != snd_una) {
3965 tp->snd_una = snd_una;
3966 tp->ts_recent_age = ticks;
3969 * Keep ARP entry "minty fresh"
3971 dst_confirm(sk->sk_dst_cache);
3973 if (tp->snd_una == tp->snd_nxt)
3974 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
/* Drop the acknowledged bytes from the send buffer and wake writers. */
3977 snd = so_sockbuf_snd(so);
3979 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3980 snd = so_sockbuf_snd(so);
3982 sbdrop_locked(snd, bytes);
3983 so_sowwakeup_locked(so);
3986 if (snd->sb_sndptroff < snd->sb_cc)
3987 t3_push_frames(so, 0);
3990 inp_wunlock(tp->t_inpcb);
/* NOTE(review): sampled listing; the call to wr_ack() and the return are
 * elided from this excerpt. */
3995 * Handler for TX_DATA_ACK CPL messages.
3998 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4000 struct toepcb *toep = (struct toepcb *)ctx;
/* NOTE(review): sampled listing; the body (freeing the mbuf) is elided. */
4009 * Handler for TRACE_PKT CPL messages. Just sink these packets.
4012 do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4019 * Reset a connection that is on a listener's SYN queue or accept queue,
4020 * i.e., one that has not had a struct socket associated with it.
4021 * Must be called from process context.
4023 * Modeled after code in inet_csk_listen_stop().
4026 t3_reset_listen_child(struct socket *child)
4028 struct tcpcb *tp = so_sototcpcb(child);
4030 t3_send_reset(tp->t_toe);
/*
 * Per-child callback used by t3_disconnect_acceptq(): if the connection
 * is offloaded, reset it under its inpcb lock.
 * NOTE(review): sampled listing; closing brace/return are elided.
 */
4035 t3_child_disconnect(struct socket *so, void *arg)
4037 struct tcpcb *tp = so_sototcpcb(so);
4039 if (tp->t_flags & TF_TOE) {
4040 inp_wlock(tp->t_inpcb);
4041 t3_reset_listen_child(so);
4042 inp_wunlock(tp->t_inpcb);
/* NOTE(review): sampled listing; the matching so_lock() preceding the
 * so_unlock() below appears to be elided from this excerpt. */
4047 * Disconnect offloaded established but not yet accepted connections sitting
4048 * on a server's accept_queue. We just send an ABORT_REQ at this point and
4049 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4052 t3_disconnect_acceptq(struct socket *listen_so)
4056 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4057 so_unlock(listen_so);
/* NOTE(review): sampled listing; the matching so_lock() preceding the
 * so_unlock() below appears to be elided from this excerpt. */
4061 * Reset offloaded connections sitting on a server's syn queue. As above
4062 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4066 t3_reset_synq(struct listen_ctx *lctx)
4068 struct toepcb *toep;
/* Drain the SYN queue, resetting and releasing each embryonic connection. */
4071 while (!LIST_EMPTY(&lctx->synq_head)) {
4072 toep = LIST_FIRST(&lctx->synq_head);
4073 LIST_REMOVE(toep, synq_entry);
4075 t3_send_reset(toep);
4076 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4077 toepcb_release(toep);
4079 so_unlock(lctx->lso);
/*
 * Write the page pods for a DDP buffer into adapter memory via
 * ULP_MEM_WRITE work requests, one pod per request. The final
 * NUM_SENTINEL_PPODS pods are written as invalid sentinels.
 * NOTE(review): sampled listing; some original source lines are elided.
 */
4084 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4085 unsigned int nppods, unsigned int tag, unsigned int maxoff,
4086 unsigned int pg_off, unsigned int color)
4088 unsigned int i, j, pidx;
4091 struct ulp_mem_io *req;
4092 unsigned int tid = toep->tp_tid;
4093 const struct tom_data *td = TOM_DATA(toep->tp_toedev);
/* Pods live in the adapter's DDP region starting at ddp_llimit. */
4094 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4096 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4097 gl, nppods, tag, maxoff, pg_off, color);
4099 for (i = 0; i < nppods; ++i) {
4100 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4101 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4102 req = mtod(m, struct ulp_mem_io *);
4103 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4104 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4106 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4107 V_ULPTX_CMD(ULP_MEM_WRITE));
4108 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4109 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
/* The page pod payload immediately follows the ULP_MEM_WRITE header. */
4111 p = (struct pagepod *)(req + 1);
4112 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4113 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4114 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4115 V_PPOD_COLOR(color));
4116 p->pp_max_offset = htonl(maxoff);
4117 p->pp_page_offset = htonl(pg_off);
/* Each pod carries 5 page addresses, overlapping its neighbor by one. */
4119 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4120 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4121 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4123 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
4124 send_or_defer(toep, m, 0);
4125 ppod_addr += PPOD_SIZE;
4131 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4134 mk_cpl_barrier_ulp(struct cpl_barrier *b)
/* The ULP_TXPKT header is written in place, overlaying the start of *b. */
4136 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4138 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
/* Length is expressed in flits (8-byte units). */
4139 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4140 b->opcode = CPL_BARRIER;
4144 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4147 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
/* The ULP_TXPKT header overlays the start of *req.  (A redundant second
 * assignment of txpkt, identical to the initializer, was removed.) */
4149 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4152 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
/* Length in flits (8-byte units). */
4153 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4154 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
/* Direct the GET_TCB reply to the given queue/CPU number. */
4155 req->cpuno = htons(cpuno);
4159 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4162 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4163 unsigned int word, uint64_t mask, uint64_t val)
/* The ULP_TXPKT header overlays the start of *req. */
4165 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
/* NOTE(review): trace format string lacks a closing ')' — cosmetic. */
4167 CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4168 tid, word, mask, val);
4170 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
/* Length in flits (8-byte units). */
4171 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4172 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
/* Suppress the SET_TCB_FIELD completion reply. */
4173 req->reply = V_NO_REPLY(1);
4175 req->word = htons(word);
4176 req->mask = htobe64(mask);
4177 req->val = htobe64(val);
4181 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4184 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4185 unsigned int tid, unsigned int credits)
/* The ULP_TXPKT header overlays the start of *ack. */
4187 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4189 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
/* Length in flits (8-byte units). */
4190 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4191 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
/* Return rx credits and select the delayed-ACK mode from the per-device
 * "delack" tunable. */
4192 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4193 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4194 V_RX_CREDITS(credits));
/*
 * t3_cancel_ddpbuf - cancel the HW DDP buffer at bufidx via a compound WR
 * consisting of: barrier, two SET_TCB_FIELD updates of the DDP flags,
 * GET_TCB (to learn how much data landed), and a trailing barrier.
 * Caller must hold the receive sockbuf lock (asserted below).
 */
4198 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4202 struct work_request_hdr *wr;
4203 struct cpl_barrier *lock;
4204 struct cpl_set_tcb_field *req;
4205 struct cpl_get_tcb *getreq;
4206 struct ddp_state *p = &toep->tp_ddp_state;
4209 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
/* Size the compound WR: header + SET_TCB_FIELDs + two barriers (+ GET_TCB
 * on an elided continuation line). */
4211 wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4213 m = m_gethdr_nofail(wrlen);
4214 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4215 wr = mtod(m, struct work_request_hdr *);
4218 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4219 m->m_pkthdr.len = m->m_len = wrlen;
/* Leading barrier keeps the following CPLs ordered. */
4221 lock = (struct cpl_barrier *)(wr + 1);
4222 mk_cpl_barrier_ulp(lock);
4224 req = (struct cpl_set_tcb_field *)(lock + 1);
4226 CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4228 /* Hmmm, not sure if this is actually a good thing: reactivating
4229 * the other buffer might be an issue if it has been completed
4230 * already. However, that is unlikely, since the fact that the UBUF
4231 * is not completed indicates that there is no outstanding data.
4234 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4235 V_TF_DDP_ACTIVE_BUF(1) |
4236 V_TF_DDP_BUF0_VALID(1),
4237 V_TF_DDP_ACTIVE_BUF(1));
/* NOTE(review): req is not advanced between this and the previous
 * SET_TCB_FIELD — a pointer increment appears to be elided here. */
4239 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4240 V_TF_DDP_ACTIVE_BUF(1) |
4241 V_TF_DDP_BUF1_VALID(1), 0);
/* Read back the TCB, replying on this connection's queue set. */
4243 getreq = (struct cpl_get_tcb *)(req + 1);
4244 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
/* Trailing barrier closes the atomic sequence. */
4246 mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4248 /* Keep track of the number of outstanding CPL_GET_TCB requests
/* NOTE(review): TIDTB(so) references "so", not declared in this function
 * (trace macro is presumably compiled out) — confirm. */
4253 T3_TRACE1(TIDTB(so),
4254 "t3_cancel_ddpbuf: bufidx %u", bufidx);
4256 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4260 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4261 * @sk: the socket associated with the buffers
4262 * @bufidx: index of HW DDP buffer (0 or 1)
4263 * @tag0: new tag for HW buffer 0
4264 * @tag1: new tag for HW buffer 1
4265 * @len: new length for HW buf @bufidx
4267 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4268 * buffer by changing the buffer tag and length and setting the valid and
4269 * active flag accordingly. The caller must ensure the new buffer is at
4270 * least as big as the existing one. Since we typically reprogram both HW
4271 * buffers this function sets both tags for convenience. Read the TCB to
4272 * determine how much data was written into the buffer before the overlay
4276 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4277 unsigned int tag1, unsigned int len)
4281 struct work_request_hdr *wr;
4282 struct cpl_get_tcb *getreq;
4283 struct cpl_set_tcb_field *req;
4284 struct ddp_state *p = &toep->tp_ddp_state;
/* Fixed: the trace message previously said "t3_setup_ppods" — a
 * copy-paste slip that made KTR output misleading. */
4286 CTR4(KTR_TCB, "t3_overlay_ddpbuf(bufidx=%u tag0=%u tag1=%u len=%u)",
4287 bufidx, tag0, tag1, len);
/* Caller must hold the receive sockbuf lock. */
4289 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4291 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4292 m = m_gethdr_nofail(wrlen);
4293 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4294 wr = mtod(m, struct work_request_hdr *);
4295 m->m_pkthdr.len = m->m_len = wrlen;
4299 /* Set the ATOMIC flag to make sure that TP processes the following
4300 * CPLs in an atomic manner and no wire segments can be interleaved.
4302 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4303 req = (struct cpl_set_tcb_field *)(wr + 1);
/* NOTE(review): the mask operand below shifts V_TCB_RX_DDP_BUF1_TAG(M_...)
 * by 32 without a (uint64_t) cast while the value operand casts; if the
 * macro expands to a 32-bit expression the shift is UB — confirm the
 * macro's width in the TCB header. */
4304 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4305 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4306 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4307 V_TCB_RX_DDP_BUF0_TAG(tag0) |
4308 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
/* NOTE(review): req is not advanced between the ULP messages below —
 * pointer increments appear to be elided in this listing; verify. */
4311 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4312 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4313 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4315 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316 V_TF_DDP_PUSH_DISABLE_0(1) |
4317 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318 V_TF_DDP_PUSH_DISABLE_0(0) |
4319 V_TF_DDP_BUF0_VALID(1));
4321 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4322 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4323 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4325 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4326 V_TF_DDP_PUSH_DISABLE_1(1) |
4327 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4328 V_TF_DDP_PUSH_DISABLE_1(0) |
4329 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
/* Read back the TCB so the caller can learn how much data had landed. */
4332 getreq = (struct cpl_get_tcb *)(req + 1);
4333 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4335 /* Keep track of the number of outstanding CPL_GET_TCB requests
/* NOTE(review): T3_TRACE4 references "sk", undeclared here (Linux
 * leftover); presumably compiled out — confirm. */
4340 T3_TRACE4(TIDTB(sk),
4341 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4343 bufidx, tag0, tag1, len);
4345 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4349 * Sends a compound WR containing all the CPL messages needed to program the
4350 * two HW DDP buffers, namely optionally setting up the length and offset of
4351 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4354 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4355 unsigned int len1, unsigned int offset1,
4356 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4360 struct work_request_hdr *wr;
4361 struct cpl_set_tcb_field *req;
/* NOTE(review): %08x with the 64-bit halves of ddp_flags relies on the
 * varargs widths matching — verify the CTR argument conventions. */
4363 CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4364 len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
/* Caller must hold the receive sockbuf lock. */
4367 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
/* Size the WR for only the pieces that will actually be emitted. */
4369 wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4370 (len1 ? sizeof(*req) : 0) +
4371 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4372 m = m_gethdr_nofail(wrlen);
4373 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4374 wr = mtod(m, struct work_request_hdr *);
4377 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4378 m->m_pkthdr.len = m->m_len = wrlen;
4380 req = (struct cpl_set_tcb_field *)(wr + 1);
4381 if (len0) { /* program buffer 0 offset and length */
4382 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4383 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4384 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4385 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4386 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4389 if (len1) { /* program buffer 1 offset and length */
/* NOTE(review): mask operand shifts V_TCB_RX_DDP_BUF1_LEN(M_...) by 32
 * without a (uint64_t) cast while the value operand casts; if the macro
 * yields a 32-bit expression that shift is UB — confirm macro width. */
4390 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4391 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4392 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4393 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4394 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
/* Apply the caller's DDP flag update (value argument on an elided line). */
4398 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
/* Optionally piggyback an RX_DATA_ACK returning the rx credits accrued
 * since the last window update, then record the new ack point. */
4402 mk_rx_data_ack_ulp(toep,
4403 (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4404 toep->tp_copied_seq - toep->tp_rcv_wup);
4405 toep->tp_rcv_wup = toep->tp_copied_seq;
/* NOTE(review): T3_TRACE5 references "sk", undeclared here (Linux
 * leftover); presumably compiled out — confirm. */
4409 T3_TRACE5(TIDTB(sk),
4410 "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4412 len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4416 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * t3_init_wr_tab - populate the mbuf_wrs[] lookup table: for an SGL with i
 * entries, how many work requests of capacity wr_len are needed.
 * (wr_len units are presumably flits — TODO confirm against caller.)
 */
4420 t3_init_wr_tab(unsigned int wr_len)
/* Idempotent: a non-zero first entry means an earlier call filled it in. */
4424 if (mbuf_wrs[1]) /* already initialized */
4427 for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
/* An SGL with i entries occupies ceil(3*i/2) units. */
4428 int sgl_len = (3 * i) / 2 + (i & 1);
/* One WR if it fits, otherwise the first WR holds wr_len and each
 * subsequent WR holds wr_len - 1 (one slot lost to chaining). */
4431 mbuf_wrs[i] = sgl_len <= wr_len ?
4432 1 : 1 + (sgl_len - 2) / (wr_len - 1);
/*
 * t3_init_cpl_io - one-time CPL layer initialization: registers the handler
 * for every CPL message type this module services.
 */
4439 t3_init_cpl_io(void)
/* NOTE(review): alloc_skb/skb_put/GFP_KERNEL and the sk_buff fields below
 * are Linux APIs, not FreeBSD ones — this looks like leftover Linux code;
 * confirm it is conditionally compiled out or dead. */
4442 tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4445 "Chelsio TCP offload: can't allocate sk_buff\n");
4448 skb_put(tcphdr_skb, sizeof(struct tcphdr));
4449 tcphdr_skb->h.raw = tcphdr_skb->data;
4450 memset(tcphdr_skb->data, 0, tcphdr_skb->len);
/* Wire each CPL opcode to its handler in this file. */
4453 t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4454 t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4455 t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4456 t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4457 t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4458 t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4459 t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4460 t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4461 t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4462 t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4463 t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4464 t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4465 t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4466 t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4467 t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);