1 /**************************************************************************
3 Copyright (c) 2007-2008, Chelsio Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/fcntl.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
41 #include <sys/mutex.h>
42 #include <sys/sockstate.h>
43 #include <sys/sockopt.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/sockbuf.h>
47 #include <sys/sysctl.h>
48 #include <sys/syslog.h>
49 #include <sys/protosw.h>
51 #include <sys/vimage.h>
54 #include <net/route.h>
56 #include <netinet/in.h>
57 #include <netinet/in_pcb.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/in_var.h>
62 #include <dev/cxgb/cxgb_osdep.h>
63 #include <dev/cxgb/sys/mbufq.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp_var.h>
67 #include <netinet/tcp_fsm.h>
68 #include <netinet/tcp_offload.h>
69 #include <netinet/tcp_seq.h>
70 #include <netinet/tcp_syncache.h>
71 #include <netinet/tcp_timer.h>
72 #include <net/route.h>
74 #include <dev/cxgb/t3cdev.h>
75 #include <dev/cxgb/common/cxgb_firmware_exports.h>
76 #include <dev/cxgb/common/cxgb_t3_cpl.h>
77 #include <dev/cxgb/common/cxgb_tcb.h>
78 #include <dev/cxgb/common/cxgb_ctl_defs.h>
79 #include <dev/cxgb/cxgb_offload.h>
82 #include <machine/bus.h>
83 #include <dev/cxgb/sys/mvec.h>
84 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
85 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
86 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
87 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
88 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
89 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
91 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
94 * For ULP connections HW may add headers, e.g., for digests, that aren't part
95 * of the messages sent by the host but that are part of the TCP payload and
96 * therefore consume TCP sequence space. Tx connection parameters that
97 * operate in TCP sequence space are affected by the HW additions and need to
98 * compensate for them to accurately track TCP sequence numbers. This array
99 * contains the compensating extra lengths for ULP packets. It is indexed by
100 * a packet's ULP submode.
102 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
106 * This sk_buff holds a fake header-only TCP segment that we use whenever we
107 * need to exploit SW TCP functionality that expects TCP headers, such as
108 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
109 * CPUs without locking.
111 static struct mbuf *tcphdr_mbuf __read_mostly;
115 * Size of WRs in bytes. Note that we assume all devices we are handling have
118 static unsigned int wrlen __read_mostly;
121 * The number of WRs needed for an skb depends on the number of page fragments
122 * in the skb and whether it has any payload in its main body. This maps the
123 * length of the gather list represented by an skb into the # of necessary WRs.
125 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
128 * Max receive window supported by HW in bytes. Only a small part of it can
129 * be set through option0, the rest needs to be set through RX_DATA_ACK.
131 #define MAX_RCV_WND ((1U << 27) - 1)
134 * Min receive window. We want it to be large enough to accommodate receive
135 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
137 #define MIN_RCV_WND (24 * 1024U)
138 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
140 #define VALIDATE_SEQ 0
141 #define VALIDATE_SOCK(so)
144 #define TCP_TIMEWAIT 1
148 extern int tcp_do_autorcvbuf;
149 extern int tcp_do_autosndbuf;
150 extern int tcp_autorcvbuf_max;
151 extern int tcp_autosndbuf_max;
153 static void t3_send_reset(struct toepcb *toep);
154 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
155 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
156 static void handle_syncache_event(int event, void *arg);
/*
 * Append mbuf chain `n` to a socket buffer after heavy sanity checking.
 * The KASSERTs walk the existing chain(s) via `m` (declaration/loop
 * headers not visible in this partial listing) verifying that only
 * EXT_EXTREF external storage is present and that m_next was not
 * poisoned with the 0xffffffff trash pattern; the actual append is
 * sbappendstream_locked().  Requires SB_NOCOALESCE on the sockbuf so
 * sbcompress() does not merge offload-owned mbufs.
 */
159 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
165 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
166 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
167 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
168 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
169 m->m_next, m->m_nextpkt, m->m_flags));
/* Same invariants re-checked on a second chain walk (loop header not shown). */
174 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
175 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
176 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
177 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
178 m->m_next, m->m_nextpkt, m->m_flags));
181 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
182 sbappendstream_locked(sb, n);
/* Post-append re-validation of the chain (loop header not shown). */
186 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
187 m->m_next, m->m_nextpkt, m->m_flags));
/* True iff the TOE device is a T3A-revision Chelsio part. */
193 is_t3a(const struct toedev *dev)
195 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
/* Debug-only dump of the interesting fields of a toepcb (DPRINTF is
 * compiled out in production builds). */
199 dump_toepcb(struct toepcb *toep)
201 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
202 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
203 toep->tp_mtu_idx, toep->tp_tid);
205 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
206 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
207 toep->tp_mss_clamp, toep->tp_flags);
/*
 * Compatibility wrapper around rtalloc1() used when the platform does
 * not provide rtalloc2 itself.  The success path (rest of the function,
 * including the unlock/return) is not visible in this partial listing.
 */
210 #ifndef RTALLOC2_DEFINED
211 static struct rtentry *
212 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
214 struct rtentry *rt = NULL;
216 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
224 * Determine whether to send a CPL message now or defer it. A message is
225 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
226 * For connections in other states the message is sent immediately.
227 * If through_l2t is set the message is subject to ARP processing, otherwise
228 * it is sent directly.
231 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
233 struct tcpcb *tp = toep->tp_tp;
/* Deferred messages are parked on out_of_order_queue under the inpcb
 * write lock and replayed once the TID is known. */
235 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
236 inp_wlock(tp->t_inpcb);
237 mbufq_tail(&toep->out_of_order_queue, m); // defer
238 inp_wunlock(tp->t_inpcb);
239 } else if (through_l2t)
240 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
242 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
/* Build the queue-priority cookie for an outbound CPL mbuf from the
 * control class and the connection's queue set (body not visible in
 * this partial listing). */
245 static inline unsigned int
246 mkprio(unsigned int cntrl, const struct toepcb *toep)
252 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
255 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
257 struct cpl_tid_release *req;
259 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
260 m->m_pkthdr.len = m->m_len = sizeof(*req);
261 req = mtod(m, struct cpl_tid_release *);
262 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
264 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
/*
 * Fill a TX_DATA work request header at the front of mbuf `m` covering
 * `len` bytes of payload (len already includes HW ULP additions).  On
 * the first data WR of a connection (TP_DATASENT clear) also programs
 * the ack-page count, CPU index and the 32KB-granular send buffer size.
 * Caller holds the inpcb lock.
 */
268 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
270 struct tcpcb *tp = so_sototcpcb(so);
271 struct toepcb *toep = tp->t_toe;
272 struct tx_data_wr *req;
275 inp_lock_assert(tp->t_inpcb);
276 snd = so_sockbuf_snd(so);
278 req = mtod(m, struct tx_data_wr *);
279 m->m_len = sizeof(*req);
280 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
281 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
282 /* len includes the length of any HW ULP additions */
283 req->len = htonl(len);
284 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
285 /* V_TX_ULP_SUBMODE sets both the mode and submode */
286 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
287 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
288 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
290 req->sndseq = htonl(tp->snd_nxt);
291 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
292 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
293 V_TX_CPU_IDX(toep->tp_qset));
295 /* Sendbuffer is in units of 32KB.
297 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
298 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
300 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
303 toep->tp_flags |= TP_DATASENT;
307 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
/*
 * Push pending send-buffer data to the adapter as TX_DATA work requests.
 * Walks the send sockbuf from sb_sndptr, emitting either an immediate-
 * data WR (payload <= IMM_LEN copied inline) or a gather-list WR of up
 * to TX_MAX_SEGS segments, bounded by the available WR credits
 * (tp_wr_avail / mbuf_wrs[]).  Credits consumed are remembered in
 * m_pkthdr.csum_data for completion accounting; a completion is
 * requested (F_WR_COMPL) when asked for or when half the WR budget is
 * unacked.  Returns the total number of payload bytes queued.
 * Caller holds the inpcb lock.  NOTE(review): several lines (braces,
 * else arms, some declarations) are missing from this partial listing.
 */
310 t3_push_frames(struct socket *so, int req_completion)
312 struct tcpcb *tp = so_sototcpcb(so);
313 struct toepcb *toep = tp->t_toe;
315 struct mbuf *tail, *m0, *last;
318 int state, bytes, count, total_bytes;
319 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
/* Nothing to do before the connection is established or after close. */
322 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
323 DPRINTF("tcp state=%d\n", tp->t_state);
327 state = so_state_get(so);
329 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
330 DPRINTF("disconnecting\n");
335 inp_lock_assert(tp->t_inpcb);
337 snd = so_sockbuf_snd(so);
340 d = TOM_DATA(toep->tp_toedev);
343 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
346 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
347 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
/* Skip the mbuf that was only partially consumed by the last push. */
349 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
350 KASSERT(tail, ("sbdrop error"));
351 last = tail = tail->m_next;
354 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
355 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
361 toep->tp_m_last = NULL;
362 while (toep->tp_wr_avail && (tail != NULL)) {
365 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
370 * If the data in tail fits as in-line, then
371 * make an immediate data wr.
373 if (tail->m_len <= IMM_LEN) {
380 make_tx_data_wr(so, m0, bytes, tail);
381 m_append(m0, bytes, mtod(last, caddr_t));
382 KASSERT(!m0->m_next, ("bad append"));
/* Gather-list path: accumulate segments while credits allow. */
384 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
385 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
386 bytes += tail->m_len;
390 * technically an abuse to be using this for a VA
391 * but less gross than defining my own structure
392 * or calling pmap_kextract from here :-|
394 segp->ds_addr = (bus_addr_t)tail->m_data;
395 segp->ds_len = tail->m_len;
396 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
397 count, mbuf_wrs[count], tail->m_data, tail->m_len);
401 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
402 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
405 m_set_sgllen(m0, count);
406 make_tx_data_wr(so, m0, bytes, tail);
408 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
/* Advance the sockbuf send pointer / remember partial-mbuf state. */
411 snd->sb_sndptr = tail;
412 toep->tp_m_last = NULL;
414 toep->tp_m_last = snd->sb_sndptr = last;
417 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
419 snd->sb_sndptroff += bytes;
420 total_bytes += bytes;
421 toep->tp_write_seq += bytes;
422 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
423 " tail=%p sndptr=%p sndptroff=%d",
424 toep->tp_wr_avail, count, mbuf_wrs[count],
425 tail, snd->sb_sndptr, snd->sb_sndptroff);
427 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
428 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
429 total_bytes, toep->tp_m_last, tail->m_data,
432 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
433 " tp_m_last=%p snd_una=0x%08x",
434 total_bytes, toep->tp_m_last, tp->snd_una);
/* Trace the gather list three segments at a time. */
442 while (i < count && m_get_sgllen(m0)) {
443 if ((count - i) >= 3) {
445 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
446 " len=%d pa=0x%zx len=%d",
447 segs[i].ds_addr, segs[i].ds_len,
448 segs[i + 1].ds_addr, segs[i + 1].ds_len,
449 segs[i + 2].ds_addr, segs[i + 2].ds_len);
451 } else if ((count - i) == 2) {
453 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
455 segs[i].ds_addr, segs[i].ds_len,
456 segs[i + 1].ds_addr, segs[i + 1].ds_len);
459 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
460 segs[i].ds_addr, segs[i].ds_len);
468 * remember credits used
470 m0->m_pkthdr.csum_data = mbuf_wrs[count];
471 m0->m_pkthdr.len = bytes;
472 toep->tp_wr_avail -= mbuf_wrs[count];
473 toep->tp_wr_unacked += mbuf_wrs[count];
475 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
476 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
477 struct work_request_hdr *wr = cplhdr(m0);
479 wr->wr_hi |= htonl(F_WR_COMPL);
480 toep->tp_wr_unacked = 0;
482 KASSERT((m0->m_pkthdr.csum_data > 0) &&
483 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
484 m0->m_pkthdr.csum_data));
485 m0->m_type = MT_DONTFREE;
486 enqueue_wr(toep, m0);
487 DPRINTF("sending offload tx with %d bytes in %d segments\n",
489 l2t_send(cdev, m0, toep->tp_l2t);
492 return (total_bytes);
496 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
497 * under any circumstances. We take the easy way out and always queue the
498 * message to the write_queue. We can optimize the case where the queue is
499 * already empty though the optimization is probably not worth it.
502 close_conn(struct socket *so)
505 struct cpl_close_con_req *req;
507 struct inpcb *inp = so_sotoinpcb(so);
514 tp = so_sototcpcb(so);
/* Flush any queued payload first so the FIN sequences after the data. */
517 if (tp->t_state != TCPS_SYN_SENT)
518 t3_push_frames(so, 1);
/* Idempotent: only one CLOSE_CON_REQ is ever emitted per connection. */
520 if (toep->tp_flags & TP_FIN_SENT) {
527 d = TOM_DATA(toep->tp_toedev);
529 m = m_gethdr_nofail(sizeof(*req));
530 m_set_priority(m, CPL_PRIORITY_DATA);
534 toep->tp_flags |= TP_FIN_SENT;
535 req = mtod(m, struct cpl_close_con_req *);
537 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
538 req->wr.wr_lo = htonl(V_WR_TID(tid));
539 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
543 * XXX - need to defer shutdown while there is still data in the queue
546 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
547 cxgb_ofld_send(d->cdev, m);
552 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
/* (the abort must still reach the chip to free HW state even though the
 * peer can no longer be told). */
556 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
558 struct cpl_abort_req *req = cplhdr(m);
560 req->cmd = CPL_ABORT_NO_RST;
561 cxgb_ofld_send(cdev, m);
565 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
566 * permitted to return without sending the message in case we cannot allocate
567 * an sk_buff. Returns the number of credits sent.
570 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
573 struct cpl_rx_data_ack *req;
574 struct toepcb *toep = tp->t_toe;
575 struct toedev *tdev = toep->tp_toedev;
/* NOTE(review): m_gethdr_nofail is used unconditionally here, so the
 * nofail==0 "may skip" path described above is not visible in this view. */
577 m = m_gethdr_nofail(sizeof(*req));
579 DPRINTF("returning %u credits to HW\n", credits);
581 req = mtod(m, struct cpl_rx_data_ack *);
582 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
584 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
585 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
586 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
587 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
592 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
593 * This is only used in DDP mode, so we take the opportunity to also set the
594 * DACK mode and flush any Rx credits.
597 t3_send_rx_modulate(struct toepcb *toep)
600 struct cpl_rx_data_ack *req;
602 m = m_gethdr_nofail(sizeof(*req));
604 req = mtod(m, struct cpl_rx_data_ack *);
605 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
607 m->m_pkthdr.len = m->m_len = sizeof(*req);
609 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
610 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
612 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
613 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
614 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/* All outstanding credits have now been returned to the HW. */
615 toep->tp_rcv_wup = toep->tp_copied_seq;
619 * Handle receipt of an urgent pointer.
/* NOTE(review): the body is entirely compiled out — URGENT_DATA_SUPPORTED
 * is never defined here, and the guarded code is unported Linux TCP code
 * (sk_buff, sock_flag, tom_eat_skb).  Effectively a no-op stub. */
622 handle_urg_ptr(struct socket *so, uint32_t urg_seq)
624 #ifdef URGENT_DATA_SUPPORTED
625 struct tcpcb *tp = so_sototcpcb(so);
627 urg_seq--; /* initially points past the urgent data, per BSD */
629 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
630 return; /* duplicate pointer */
632 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
633 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
634 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
637 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
638 tom_eat_skb(sk, skb, 0);
640 tp->urg_data = TCP_URG_NOTYET;
641 tp->urg_seq = urg_seq;
646 * Returns true if a socket cannot accept new Rx data.
649 so_no_receive(const struct socket *so)
651 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
655 * Process an urgent data notification.
/* Drops the notification silently if the socket is (dis)connecting. */
658 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
660 struct cpl_rx_urg_notify *hdr = cplhdr(m);
661 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
665 if (!so_no_receive(so))
666 handle_urg_ptr(so, ntohl(hdr->seq));
672 * Handler for RX_URG_NOTIFY CPL messages.
/* `ctx` is the toepcb registered for this TID at connection setup. */
675 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
677 struct toepcb *toep = (struct toepcb *)ctx;
679 rx_urg_notify(toep, m);
/*
 * Whether the delayed-ACK mode may be changed for this connection.
 * NOTE(review): the second clause is logically redundant — if
 * tp_ulp_mode == ULP_MODE_TCPDDP (nonzero) the first operand of the ||
 * is already true.  The intent may have been `tp_ulp_mode == 0 || ...`;
 * confirm against the reference driver before changing.
 */
684 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
686 return (toep->tp_ulp_mode ||
687 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
688 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
692 * Set of states for which we should return RX credits.
694 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
697 * Called after some received data has been read. It returns RX credits
698 * to the HW for the amount of data processed.
701 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
703 struct toepcb *toep = tp->t_toe;
706 int dack_mode, must_send, read;
707 u32 thres, credits, dack = 0;
710 so = inp_inpcbtosocket(tp->t_inpcb);
711 rcv = so_sockbuf_rcv(so);
/* Outside ESTABLISHED/FIN_WAIT_{1,2} just account, don't return credits. */
713 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
714 (tp->t_state == TCPS_FIN_WAIT_2))) {
717 toep->tp_copied_seq += copied;
724 inp_lock_assert(tp->t_inpcb);
728 toep->tp_copied_seq += copied;
/* Infer bytes drained from the sockbuf since the last call. */
730 read = toep->tp_enqueued_bytes - rcv->sb_cc;
731 toep->tp_copied_seq += read;
733 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
734 toep->tp_enqueued_bytes = rcv->sb_cc;
/* Clamp runaway credit counts (indicates an accounting bug upstream). */
737 if (credits > rcv->sb_mbmax) {
738 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
739 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
740 credits = rcv->sb_mbmax;
745 * XXX this won't accurately reflect credit return - we need
746 * to look at the difference between the amount that has been
747 * put in the recv sockbuf and what is there now
750 if (__predict_false(!credits))
753 dev = toep->tp_toedev;
754 thres = TOM_TUNABLE(dev, rx_credit_thres);
756 if (__predict_false(thres == 0))
/* Piggy-back a delayed-ACK mode change on the credit return if the
 * tunable changed and enough data has flowed to make it worthwhile. */
759 if (is_delack_mode_valid(dev, toep)) {
760 dack_mode = TOM_TUNABLE(dev, delack);
761 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
762 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
764 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
765 dack = F_RX_DACK_CHANGE |
766 V_RX_DACK_MODE(dack_mode);
769 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
772 * For coalescing to work effectively ensure the receive window has
773 * at least 16KB left.
775 must_send = credits + 16384 >= tp->rcv_wnd;
777 if (must_send || credits >= thres)
778 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
/* toe_usrreqs disconnect hook: initiate an offloaded close. */
782 cxgb_toe_disconnect(struct tcpcb *tp)
786 DPRINTF("cxgb_toe_disconnect\n");
788 so = inp_inpcbtosocket(tp->t_inpcb);
/* toe_usrreqs reset hook: abort the offloaded connection and detach
 * the stack from the TOE (clears TF_TOE). */
794 cxgb_toe_reset(struct tcpcb *tp)
796 struct toepcb *toep = tp->t_toe;
803 tp->t_flags &= ~TF_TOE;
/* toe_usrreqs send hook: push queued sockbuf data to the adapter. */
810 cxgb_toe_send(struct tcpcb *tp)
814 DPRINTF("cxgb_toe_send\n");
815 dump_toepcb(tp->t_toe);
817 so = inp_inpcbtosocket(tp->t_inpcb);
818 t3_push_frames(so, 1);
/* toe_usrreqs rcvd hook: return freed receive-window credits to HW. */
823 cxgb_toe_rcvd(struct tcpcb *tp)
826 inp_lock_assert(tp->t_inpcb);
828 t3_cleanup_rbuf(tp, 0);
/* toe_usrreqs detach hook: sever the tcpcb from the TOE. */
834 cxgb_toe_detach(struct tcpcb *tp)
839 * XXX how do we handle teardown in the SYN_SENT state?
842 inp_lock_assert(tp->t_inpcb);
849 tp->t_flags &= ~TF_TOE;
/*
 * Dispatch table that hooks this TOE into the generic TCP offload layer.
 * Fix: `.tu_detach` was initialized twice (identical copy/paste lines);
 * with designated initializers the later entry silently overrides the
 * earlier one, so this was harmless but a latent defect — the duplicate
 * is removed.
 */
854 static struct toe_usrreqs cxgb_toe_usrreqs = {
855 .tu_disconnect = cxgb_toe_disconnect,
856 .tu_reset = cxgb_toe_reset,
857 .tu_send = cxgb_toe_send,
858 .tu_rcvd = cxgb_toe_rcvd,
859 .tu_detach = cxgb_toe_detach,
861 .tu_syncache_event = handle_syncache_event,
/*
 * Build and dispatch a SET_TCB_FIELD CPL updating `word` of the
 * connection's TCB: new value = (old & ~mask) | val.  `no_reply`
 * suppresses the firmware's acknowledgement CPL.  Sent via
 * send_or_defer() so SYN_SENT connections queue it until the TID
 * is assigned.
 */
866 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
867 uint64_t mask, uint64_t val, int no_reply)
869 struct cpl_set_tcb_field *req;
871 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
872 toep->tp_tid, word, mask, val);
874 req = mtod(m, struct cpl_set_tcb_field *);
875 m->m_pkthdr.len = m->m_len = sizeof(*req);
876 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
878 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
879 req->reply = V_NO_REPLY(no_reply);
881 req->word = htons(word);
882 req->mask = htobe64(mask);
883 req->val = htobe64(val);
885 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
886 send_or_defer(toep, m, 0);
/*
 * Convenience wrapper around __set_tcb_field(): allocates the CPL mbuf
 * and requests no firmware reply.  Skipped (with a console note) when
 * the connection is already closed or being aborted, since the TID may
 * no longer be valid.
 * Fix: corrected the "not seting field" typo in the diagnostic message.
 */
890 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
893 struct tcpcb *tp = toep->tp_tp;
898 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
899 printf("not setting field\n");
903 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
905 __set_tcb_field(toep, m, word, mask, val, 1);
909 * Set one of the t_flags bits in the TCB.
/* `val` must be 0 or 1; it is shifted into position `bit_pos`. */
912 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
915 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
919 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
922 t3_set_nagle(struct toepcb *toep)
924 struct tcpcb *tp = toep->tp_tp;
926 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
930 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
933 t3_set_keepalive(struct toepcb *toep, int on_off)
936 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
/* Toggle HW receive coalescing for the connection. */
940 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
942 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
/* Toggle MSS-based delayed-ACK generation for the connection. */
946 t3_set_dack_mss(struct toepcb *toep, int on_off)
949 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
953 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
956 t3_set_tos(struct toepcb *toep)
958 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
/* (call continues with the V_TCB_TOS value argument — not shown). */
960 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
966 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
967 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
968 * set the PSH bit in the last segment, which would trigger delivery.]
969 * We work around the issue by setting a DDP buffer in a partial placed state,
970 * which guarantees that TP will schedule a timer.
972 #define TP_DDP_TIMER_WORKAROUND_MASK\
973 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
974 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
975 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
976 #define TP_DDP_TIMER_WORKAROUND_VAL\
977 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
978 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
/* Enable/disable DDP for the connection.  When disabling, also apply
 * the TP timer workaround described above so buffered RX data is still
 * pushed to the host. */
982 t3_enable_ddp(struct toepcb *toep, int on)
986 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
989 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
991 TP_DDP_TIMER_WORKAROUND_MASK,
993 TP_DDP_TIMER_WORKAROUND_VAL);
/* Program the DDP tag/color for buffer `buf_idx` (0 or 1); BUF1's TCB
 * word follows BUF0's, hence the `+ buf_idx` addressing. */
998 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1000 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1001 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
/*
 * Program offset and length of DDP buffer `buf_idx`.  For BUF1 the
 * length field lives in the upper half of the 64-bit TCB word, hence
 * the << 32 shifts.
 * NOTE(review): `M_TCB_RX_DDP_BUF1_LEN << 32` shifts the mask before
 * any widening — if that macro expands to a 32-bit int this is
 * undefined behavior (shift >= width); compare the value arm, which
 * casts to uint64_t first.  Verify the macro's type and add a cast if
 * needed.
 */
1006 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1010 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1011 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1012 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1013 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1014 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1016 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1017 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1018 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1019 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1020 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
/* Select a HW congestion-control flavor by name.  Compiled out unless
 * CONGESTION_CONTROL_SUPPORTED is defined; remainder of the body is
 * not visible in this partial listing. */
1024 t3_set_cong_control(struct socket *so, const char *name)
1026 #ifdef CONGESTION_CONTROL_SUPPORTED
1029 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1030 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1033 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
/*
 * Request a snapshot of the connection's TCB from the adapter via a
 * GET_TCB CPL (reply is steered to the connection's queue set).
 * Deferred onto out_of_order_queue while in SYN_SENT, like other CPLs.
 */
1040 t3_get_tcb(struct toepcb *toep)
1042 struct cpl_get_tcb *req;
1043 struct tcpcb *tp = toep->tp_tp;
1044 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1049 inp_lock_assert(tp->t_inpcb);
1050 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1051 req = mtod(m, struct cpl_get_tcb *);
1052 m->m_pkthdr.len = m->m_len = sizeof(*req);
1053 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1055 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1056 req->cpuno = htons(toep->tp_qset);
1058 if (tp->t_state == TCPS_SYN_SENT)
1059 mbufq_tail(&toep->out_of_order_queue, m); // defer
1061 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/* Register the toepcb under its hardware TID in the TID table so
 * incoming CPLs can be demultiplexed to it. */
1066 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1071 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1075 * find_best_mtu - find the entry in the MTU table closest to an MTU
1077 * @mtu: the target MTU
1079 * Returns the index of the value in the MTU table that is closest to but
1080 * does not exceed the target MTU.
1083 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
/* Linear scan; the table is sorted ascending and small. */
1087 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
/*
 * Pick the HW MTU-table index for the connection given the path MTU and
 * set tp->t_maxseg accordingly (MTU - 40 for fixed IPv4+TCP headers,
 * never below the smallest table entry).  NOTE(review): `so` is used at
 * line 1098 but is not a visible parameter — its declaration is among
 * the lines missing from this partial listing.
 */
1093 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1098 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1101 tp->t_maxseg = pmtu - 40;
1102 if (tp->t_maxseg < td->mtus[0] - 40)
1103 tp->t_maxseg = td->mtus[0] - 40;
1104 idx = find_best_mtu(td, tp->t_maxseg + 40);
1106 tp->t_maxseg = td->mtus[idx] - 40;
1108 idx = find_best_mtu(td, pmtu);
/* Release an active-open TID and drop the toepcb reference that the
 * TID table held. */
1114 free_atid(struct t3cdev *cdev, unsigned int tid)
1116 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1119 toepcb_release(toep);
1123 * Release resources held by an offload connection (TID, L2T entry, etc.)
1126 t3_release_offload_resources(struct toepcb *toep)
1128 struct tcpcb *tp = toep->tp_tp;
1129 struct toedev *tdev = toep->tp_toedev;
1130 struct t3cdev *cdev;
1132 unsigned int tid = toep->tp_tid;
1133 struct sockbuf *rcv;
1135 CTR0(KTR_TOM, "t3_release_offload_resources");
1140 cdev = TOEP_T3C_DEV(toep);
1145 t3_release_ddp_resources(toep);
1147 #ifdef CTRL_SKB_CACHE
1148 kfree_skb(CTRL_SKB_CACHE(tp));
1149 CTRL_SKB_CACHE(tp) = NULL;
/* Drop any work requests still awaiting completion credits. */
1152 if (toep->tp_wr_avail != toep->tp_wr_max) {
1153 purge_wr_queue(toep);
1154 reset_wr_list(toep);
1158 l2t_release(L2DATA(cdev), toep->tp_l2t);
1159 toep->tp_l2t = NULL;
1163 inp_lock_assert(tp->t_inpcb);
1164 so = inp_inpcbtosocket(tp->t_inpcb);
1165 rcv = so_sockbuf_rcv(so);
1167 * cancel any offloaded reads
1172 tp->t_flags &= ~TF_TOE;
1173 if (toep->tp_ddp_state.user_ddp_pending) {
1174 t3_cancel_ubuf(toep, rcv);
1175 toep->tp_ddp_state.user_ddp_pending = 0;
1177 so_sorwakeup_locked(so);
/* SYN_SENT connections hold an atid, established ones a real TID. */
1181 if (toep->tp_state == TCPS_SYN_SENT) {
1182 free_atid(cdev, tid);
1184 __skb_queue_purge(&tp->out_of_order_queue);
1186 } else { // we have TID
1187 cxgb_remove_tid(cdev, toep, tid);
1188 toepcb_release(toep);
1191 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
/* Switch the socket over to offloaded operation: install the TOE
 * socket ops, mark the tcpcb TF_TOE and point it at our usrreqs. */
1196 install_offload_ops(struct socket *so)
1198 struct tcpcb *tp = so_sototcpcb(so);
1200 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1202 t3_install_socket_ops(so);
1203 tp->t_flags |= TF_TOE;
1204 tp->t_tu = &cxgb_toe_usrreqs;
1208 * Determine the receive window scaling factor given a target max
/* receive window (`space`); capped at MAX_RCV_WND and at the protocol
 * limit of 14, and only nonzero when RFC 1323 is enabled. */
1212 select_rcv_wscale(int space)
1216 if (space > MAX_RCV_WND)
1217 space = MAX_RCV_WND;
1219 if (V_tcp_do_rfc1323)
1220 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1226 * Determine the receive window size for a socket.
1228 static unsigned long
1229 select_rcv_wnd(struct toedev *dev, struct socket *so)
1231 struct tom_data *d = TOM_DATA(dev);
1233 unsigned int max_rcv_wnd;
1234 struct sockbuf *rcv;
1236 rcv = so_sockbuf_rcv(so);
/* Start from either the autosize ceiling or the fixed sockbuf limit. */
1238 if (V_tcp_do_autorcvbuf)
1239 wnd = V_tcp_autorcvbuf_max;
1241 wnd = rcv->sb_hiwat;
1246 * For receive coalescing to work effectively we need a receive window
1247 * that can accommodate a coalesced segment.
1249 if (wnd < MIN_RCV_WND)
/* HW cap depends on chip revision (pre-T3C parts are more limited). */
1253 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1254 (uint32_t)d->rx_page_size * 23 :
1257 return min(wnd, max_rcv_wnd);
1261 * Assign offload parameters to some socket fields. This code is used by
1262 * both active and passive opens.
1265 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1266 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1268 struct tcpcb *tp = so_sototcpcb(so);
1269 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1270 struct sockbuf *snd, *rcv;
1273 SOCK_LOCK_ASSERT(so);
1276 snd = so_sockbuf_snd(so);
1277 rcv = so_sockbuf_rcv(so);
1279 log(LOG_INFO, "initializing offload socket\n");
1281 * We either need to fix push frames to work with sbcompress
1282 * or we need to add this
1284 snd->sb_flags |= SB_NOCOALESCE;
1285 rcv->sb_flags |= SB_NOCOALESCE;
1289 toep->tp_toedev = dev;
/* Per-connection work-request credit budget and delack defaults. */
1293 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1294 toep->tp_wr_unacked = 0;
1295 toep->tp_delack_mode = 0;
1297 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1302 tp->rcv_wnd = select_rcv_wnd(dev, so);
/* DDP only if enabled, not opted out, and the window is big enough. */
1304 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1305 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1306 toep->tp_qset_idx = 0;
1308 reset_wr_list(toep);
1309 DPRINTF("initialization done\n");
1313 * The next two functions calculate the option 0 value for a socket.
1315 static inline unsigned int
1316 calc_opt0h(struct socket *so, int mtu_idx)
1318 struct tcpcb *tp = so_sototcpcb(so);
1319 int wscale = select_rcv_wscale(tp->rcv_wnd);
/* High word: Nagle, keepalive, TCAM bypass, window scale, MSS index. */
1321 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1322 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1323 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
/* Low word of option 0: TOS, ULP mode and initial receive buffer size
 * (in 1KB units, clamped to the field's maximum). */
1326 static inline unsigned int
1327 calc_opt0l(struct socket *so, int ulp_mode)
1329 struct tcpcb *tp = so_sototcpcb(so);
1332 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1333 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1335 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
/* Option 2: congestion-control flavor, taken from the per-device
 * tunable when one is configured (-1 means "use default"). */
1339 static inline unsigned int
1340 calc_opt2(const struct socket *so, struct toedev *dev)
1344 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1346 return (V_FLAVORS_VALID(flv_valid) |
1347 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
/* Sum the WR credits of all work requests still on the unacked list
 * (each entry's credit count is stashed in m_pkthdr.csum_data). */
1352 count_pending_wrs(const struct toepcb *toep)
1354 const struct mbuf *m;
1357 wr_queue_walk(toep, m)
1358 n += m->m_pkthdr.csum_data;
1364 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
/*
 * Build a CPL_ACT_OPEN_REQ (active-open request) in mbuf `m` for
 * active-TID `atid` using L2T entry `e`.
 * NOTE(review): the 4-tuple appears to be filled twice — once via
 * inp_4tuple_get() and once by copying inpcb fields directly; in the
 * full file these are almost certainly alternative #if arms whose
 * preprocessor lines are missing from this partial listing.
 */
1368 mk_act_open_req(struct socket *so, struct mbuf *m,
1369 unsigned int atid, const struct l2t_entry *e)
1371 struct cpl_act_open_req *req;
1372 struct inpcb *inp = so_sotoinpcb(so);
1373 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1374 struct toepcb *toep = tp->t_toe;
1375 struct toedev *tdev = toep->tp_toedev;
1377 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1379 req = mtod(m, struct cpl_act_open_req *);
1380 m->m_pkthdr.len = m->m_len = sizeof(*req);
1382 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1384 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1385 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1387 req->local_port = inp->inp_lport;
1388 req->peer_port = inp->inp_fport;
1389 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1390 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1392 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1393 V_TX_CHANNEL(e->smt_idx));
1394 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1396 req->opt2 = htonl(calc_opt2(so, tdev));
1401 * Convert an ACT_OPEN_RPL status to an errno.
/*
 * Map a CPL_ACT_OPEN_RPL status code to a FreeBSD errno for delivery
 * to the connecting socket.
 */
1404 act_open_rpl_status_to_errno(int status)
1407 case CPL_ERR_CONN_RESET:
1408 return (ECONNREFUSED);
1409 case CPL_ERR_ARP_MISS:
1410 return (EHOSTUNREACH);
1411 case CPL_ERR_CONN_TIMEDOUT:
1413 case CPL_ERR_TCAM_FULL:
/* 4-tuple collision: the hardware already has this connection. */
1415 case CPL_ERR_CONN_EXIST:
1416 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1417 return (EADDRINUSE);
/*
 * Abort a failed active open: release the connection's offload
 * resources and drop the TCP connection with the given errno.
 * Caller must hold the inpcb write lock; this routine drops it
 * (tcp_offload_drop is reached after inp_wunlock).
 */
1424 fail_act_open(struct toepcb *toep, int errno)
1426 struct tcpcb *tp = toep->tp_tp;
1428 t3_release_offload_resources(toep);
1430 inp_wunlock(tp->t_inpcb);
1431 tcp_offload_drop(tp, errno);
/* NOTE(review): TCP_INC_STATS_BH is a Linux-ism; presumably compiled
 * out under an elided #if 0 — confirm against the full source. */
1435 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1440 * Handle active open failures.
/*
 * Handle a failed active open reported by CPL_ACT_OPEN_RPL: convert
 * the CPL status to an errno and tear the connection down via
 * fail_act_open() (which drops the inpcb lock).
 */
1443 active_open_failed(struct toepcb *toep, struct mbuf *m)
1445 struct cpl_act_open_rpl *rpl = cplhdr(m);
/* Connection may already be gone; nothing to fail in that case. */
1448 if (toep->tp_tp == NULL)
1451 inp = toep->tp_tp->t_inpcb;
1454 * Don't handle connection retry for now
/*
 * NOTE(review): the icsk/sk code below is Linux connection-retry
 * logic, presumably under an elided #if 0 — confirm before editing.
 */
1457 struct inet_connection_sock *icsk = inet_csk(sk);
1459 if (rpl->status == CPL_ERR_CONN_EXIST &&
1460 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1461 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1462 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1469 * drops the inpcb lock
1471 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1479 * Return whether a failed active open has allocated a TID
/*
 * Return non-zero if a failed active open with this status still holds
 * a hardware TID that must be released.  TCAM-full, connection-exists
 * and ARP-miss failures never allocated a TID.
 */
1482 act_open_has_tid(int status)
1484 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1485 status != CPL_ERR_ARP_MISS;
1489 * Process an ACT_OPEN_RPL CPL message.
/*
 * CPL handler for ACT_OPEN_RPL.  On non-T3A parts, queue the TID for
 * release if the failed open had allocated one, then run the common
 * active-open failure path.
 */
1492 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1494 struct toepcb *toep = (struct toepcb *)ctx;
1495 struct cpl_act_open_rpl *rpl = cplhdr(m);
1497 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1498 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1500 active_open_failed(toep, m);
1505 * Handle an ARP failure for an active open. XXX purge ofo queue
1507 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1508 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1509 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1510 * free the atid. Hmm.
/*
 * ARP-resolution failure callback for an active-open request mbuf:
 * if the connection is still in a SYN state, fail the open with
 * EHOSTUNREACH (fail_act_open drops the inpcb lock).
 */
1514 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1516 struct toepcb *toep = m_get_toep(m);
1517 struct tcpcb *tp = toep->tp_tp;
1518 struct inpcb *inp = tp->t_inpcb;
1522 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1524 * drops the inpcb lock
/* NOTE(review): fail_act_open() is declared to take a toepcb, but a
 * socket-like `so` is passed here — verify against the full source. */
1526 fail_act_open(so, EHOSTUNREACH);
1527 printf("freeing %p\n", m);
1535 * Send an active open request.
/*
 * Start an offloaded active open (connect) on socket `so` via route
 * `rt`: allocate a toepcb and ATID, resolve the L2T entry, initialize
 * the offload state, build the CPL_ACT_OPEN_REQ, and hand it to the
 * L2T layer for transmission.  Error paths (elided goto targets) free
 * the ATID and report ENOMEM.
 */
1538 t3_connect(struct toedev *tdev, struct socket *so,
1539 struct rtentry *rt, struct sockaddr *nam)
1542 struct l2t_entry *e;
1543 struct tom_data *d = TOM_DATA(tdev);
1544 struct inpcb *inp = so_sotoinpcb(so);
1545 struct tcpcb *tp = intotcpcb(inp);
1546 struct toepcb *toep; /* allocated by init_offload_socket */
1550 toep = toepcb_alloc();
/* Allocate an active-open TID; negative return means none available. */
1554 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1557 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1561 inp_lock_assert(inp);
1562 m = m_gethdr(MT_DATA, M_WAITOK);
1565 m->m_toe.mt_toepcb = tp->t_toe;
/* If ARP for the request fails, the open is failed asynchronously. */
1566 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1570 init_offload_socket(so, tdev, atid, e, rt, toep);
1572 install_offload_ops(so);
1574 mk_act_open_req(so, m, atid, e);
1579 m_set_toep(m, tp->t_toe);
1581 toep->tp_state = TCPS_SYN_SENT;
1582 l2t_send(d->cdev, (struct mbuf *)m, e);
/* Pre-arm DDP if the socket was set up for ULP_MODE_TCPDDP. */
1584 if (toep->tp_ulp_mode)
1585 t3_enable_ddp(toep, 0);
1589 printf("failing connect - free atid\n");
1591 free_atid(d->cdev, atid);
1593 printf("return ENOMEM\n");
1598 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1599 * not send multiple ABORT_REQs for the same connection and also that we do
1600 * not try to send a message after the connection has closed. Returns 1 if
1601 * an ABORT_REQ wasn't generated after all, 0 otherwise.
/*
 * Send a CPL_ABORT_REQ (RST) for the connection.  Guards against
 * sending more than one abort (TP_ABORT_SHUTDOWN) and purges the send
 * queue so no data follows the abort.  If still in SYN_SENT, the abort
 * is deferred on the out-of-order queue instead of being sent now.
 * Caller must hold the inpcb lock.
 */
1604 t3_send_reset(struct toepcb *toep)
1607 struct cpl_abort_req *req;
1608 unsigned int tid = toep->tp_tid;
1609 int mode = CPL_ABORT_SEND_RST;
1610 struct tcpcb *tp = toep->tp_tp;
1611 struct toedev *tdev = toep->tp_toedev;
1612 struct socket *so = NULL;
1614 struct sockbuf *snd;
1617 inp_lock_assert(tp->t_inpcb);
1618 so = inp_inpcbtosocket(tp->t_inpcb);
/* Already aborting/shut down: nothing more to send. */
1621 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1624 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1626 snd = so_sockbuf_snd(so);
1627 /* Purge the send queue so we don't send anything after an abort. */
/* T3A quirk: a post-close abort needs an extra mode flag. */
1630 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1631 mode |= CPL_ABORT_POST_CLOSE_REQ;
1633 m = m_gethdr_nofail(sizeof(*req));
1634 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1635 set_arp_failure_handler(m, abort_arp_failure);
1637 req = mtod(m, struct cpl_abort_req *);
1638 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1639 req->wr.wr_lo = htonl(V_WR_TID(tid));
1640 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1641 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
/* rsvd1 tells hardware whether any data has ever been sent. */
1642 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1644 if (tp && (tp->t_state == TCPS_SYN_SENT))
1645 mbufq_tail(&toep->out_of_order_queue, m); // defer
1647 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
/*
 * IP-level socket-option handler for offloaded connections.  Only
 * IP_TOS is supported (IP_OPTIONS is explicitly refused); the new TOS
 * is stored in the inpcb and pushed to the hardware via t3_set_tos().
 */
1651 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1656 if (sopt->sopt_name == IP_OPTIONS)
1657 return (ENOPROTOOPT);
1659 if (sopt->sopt_name != IP_TOS)
1660 return (EOPNOTSUPP);
1662 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
/* High-precedence TOS values require privilege. */
1667 if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1670 inp = so_sotoinpcb(so);
1672 inp_ip_tos_set(inp, optval);
1674 inp->inp_ip_tos = optval;
/* Propagate the new TOS into the connection's TCB on the card. */
1676 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
/*
 * TCP-level socket-option handler for offloaded connections.  Handles
 * TCP_CONGESTION (set only; copies the algorithm name in and programs
 * it via t3_set_cong_control) and TCP_NODELAY (updates TF_NODELAY and
 * pushes the Nagle setting to hardware when it changes).
 */
1683 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1688 if (sopt->sopt_name != TCP_CONGESTION &&
1689 sopt->sopt_name != TCP_NODELAY)
1690 return (EOPNOTSUPP);
1692 if (sopt->sopt_name == TCP_CONGESTION) {
1693 char name[TCP_CA_NAME_MAX];
1694 int optlen = sopt->sopt_valsize;
/* Getting the congestion algorithm is not implemented. */
1697 if (sopt->sopt_dir == SOPT_GET) {
1698 KASSERT(0, ("unimplemented"));
1699 return (EOPNOTSUPP);
1705 err = copyinstr(sopt->sopt_val, name,
1706 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1712 tp = so_sototcpcb(so);
1714 * XXX I need to revisit this
1716 if ((err = t3_set_cong_control(so, name)) == 0) {
1717 #ifdef CONGESTION_CONTROL_SUPPORTED
1718 tp->t_cong_control = strdup(name, M_CXGB);
/* TCP_NODELAY path below; GET is unsupported here too. */
1727 if (sopt->sopt_dir == SOPT_GET)
1728 return (EOPNOTSUPP);
1730 err = sooptcopyin(sopt, &optval, sizeof optval,
1736 inp = so_sotoinpcb(so);
1737 tp = inp_inpcbtotcpcb(inp);
1741 oldval = tp->t_flags;
1743 tp->t_flags |= TF_NODELAY;
1745 tp->t_flags &= ~TF_NODELAY;
/* Only poke the hardware when the Nagle setting actually changed. */
1749 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1750 t3_set_nagle(tp->t_toe);
/*
 * Socket-option entry point for offloaded connections: dispatch to the
 * IP- or TCP-level handler, and fall back to the stock tcp_ctloutput()
 * for anything those handlers report as unsupported.
 */
1758 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1762 if (sopt->sopt_level != IPPROTO_TCP)
1763 err = t3_ip_ctloutput(so, sopt);
1765 err = t3_tcp_ctloutput(so, sopt);
1767 if (err != EOPNOTSUPP)
1770 return (tcp_ctloutput(so, sopt));
1774 * Returns true if we need to explicitly request RST when we receive new data
1775 * on an RX-closed connection.
1778 need_rst_on_excess_rx(const struct toepcb *toep)
1784 * Handles Rx data that arrives in a state where the socket isn't accepting
/*
 * Handle data that arrives while the socket can no longer receive:
 * send a reset if required (and not already aborting), discarding
 * the payload.
 */
1788 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1791 if (need_rst_on_excess_rx(toep) &&
1792 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1793 t3_send_reset(toep);
1798 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1799 * by getting the DDP offset from the TCB.
/*
 * Treat a CPL_GET_TCB_RPL as a DDP completion: extract the current DDP
 * buffer offset (and related state) from the returned TCB image, turn
 * the delta since the last completion into an mbuf-described data
 * range, flip/adjust the DDP buffer state, advance rcv_nxt, and wake
 * the receiver.  Called with the inpcb lock held.
 */
1802 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1804 struct ddp_state *q = &toep->tp_ddp_state;
1805 struct ddp_buf_state *bsp;
1806 struct cpl_get_tcb_rpl *hdr;
1807 unsigned int ddp_offset;
1810 struct sockbuf *rcv;
1817 so = inp_inpcbtosocket(tp->t_inpcb);
1819 inp_lock_assert(tp->t_inpcb);
1820 rcv = so_sockbuf_rcv(so);
1823 /* Note that we only account for CPL_GET_TCB issued by the DDP code.
1824 * We really need a cookie in order to dispatch the RPLs.
1828 /* It is possible that a previous CPL already invalidated UBUF DDP
1829 * and moved the cur_buf idx and hence no further processing of this
1830 * skb is required. However, the app might be sleeping on
1831 * !q->get_tcb_count and we need to wake it up.
1833 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1834 int state = so_state_get(so);
1837 if (__predict_true((state & SS_NOFDREF) == 0))
1838 so_sorwakeup_locked(so);
1840 sockbuf_unlock(rcv);
1845 bsp = &q->buf_state[q->cur_buf];
/* The TCB image follows the CPL header; fields are big-endian 64-bit. */
1847 tcb = (__be64 *)(hdr + 1);
/* Buffer 0's offset sits in the high half of its word, buffer 1's in
 * the low half — hence the differing shifts. */
1848 if (q->cur_buf == 0) {
1849 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1850 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1852 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1853 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1855 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
/* New data spans [old cur_offset, ddp_offset) in the DDP buffer. */
1856 m->m_cur_offset = bsp->cur_offset;
1857 bsp->cur_offset = ddp_offset;
1858 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1861 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1862 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1863 KASSERT(ddp_offset >= m->m_cur_offset,
1864 ("ddp_offset=%u less than cur_offset=%u",
1865 ddp_offset, m->m_cur_offset));
/* Debug-only extraction of additional TCB fields for tracing. */
1869 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1871 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1872 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1874 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1875 rcv_nxt = t >> S_TCB_RCV_NXT;
1876 rcv_nxt &= M_TCB_RCV_NXT;
1878 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1879 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1880 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1882 T3_TRACE2(TIDTB(sk),
1883 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1884 ddp_flags, rcv_nxt - rx_hdr_offset);
1886 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1887 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf;
1889 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1890 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1892 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1893 q->buf_state[0].flags, q->buf_state[1].flags);
/* Socket can no longer receive: discard via the excess-rx path. */
1897 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1898 handle_excess_rx(toep, m);
1903 if ((int)m->m_pkthdr.len < 0) {
1904 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
/* User-buffer (NOCOPY) completion: this RPL cancels the UBUF DDP. */
1907 if (bsp->flags & DDP_BF_NOCOPY) {
1910 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
/* NOTE(review): sk/printk below are Linux-isms, presumably under an
 * elided #if 0 — confirm against the full source. */
1912 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1913 printk("!cancel_ubuf");
1914 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1917 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1918 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1920 } else if (bsp->flags & DDP_BF_NOFLIP) {
1922 m->m_ddp_flags = 1; /* always a kernel buffer */
1924 /* now HW buffer carries a user buffer */
1925 bsp->flags &= ~DDP_BF_NOFLIP;
1926 bsp->flags |= DDP_BF_NOCOPY;
1928 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1929 * any new data in which case we're done. If in addition the
1930 * offset is 0, then there wasn't a completion for the kbuf
1931 * and we need to decrement the posted count.
1933 if (m->m_pkthdr.len == 0) {
1934 if (ddp_offset == 0) {
1936 bsp->flags |= DDP_BF_NODATA;
1938 sockbuf_unlock(rcv);
1943 sockbuf_unlock(rcv);
1945 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1946 * but it got here way late and nobody cares anymore.
/* Deliver the completed range: tag the mbuf as DDP and advance the
 * sequence space by the newly-placed bytes. */
1952 m->m_ddp_gl = (unsigned char *)bsp->gl;
1953 m->m_flags |= M_DDP;
1954 m->m_seq = tp->rcv_nxt;
1955 tp->rcv_nxt += m->m_pkthdr.len;
1956 tp->t_rcvtime = ticks;
1957 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1958 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1959 if (m->m_pkthdr.len == 0) {
1960 q->user_ddp_pending = 0;
1965 state = so_state_get(so);
1966 if (__predict_true((state & SS_NOFDREF) == 0))
1967 so_sorwakeup_locked(so);
1969 sockbuf_unlock(rcv);
1973 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1974 * in that case they are similar to DDP completions.
/*
 * CPL handler for GET_TCB_RPL.  These replies are generated by the DDP
 * code, so process them as DDP completions under the inpcb write lock.
 * A missing toepcb is tolerated (socket may already be gone).
 */
1977 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1979 struct toepcb *toep = (struct toepcb *)ctx;
1981 /* OK if socket doesn't exist */
1983 printf("null toep in do_get_tcb_rpl\n");
1984 return (CPL_RET_BUF_DONE);
1987 inp_wlock(toep->tp_tp->t_inpcb);
1988 tcb_rpl_as_ddp_complete(toep, m);
1989 inp_wunlock(toep->tp_tp->t_inpcb);
/*
 * Account for data that was DMA'd directly into a DDP buffer before a
 * plain CPL_RX_DATA arrived: compute the DDP-placed byte range from
 * the CPL's sequence number, attach the DDP buffer to the mbuf, and
 * advance rcv_nxt/buffer offsets.  No-op if rcv_nxt already matches.
 * Called with the inpcb lock held.
 */
1995 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1997 struct tcpcb *tp = toep->tp_tp;
1999 struct ddp_state *q;
2000 struct ddp_buf_state *bsp;
2001 struct cpl_rx_data *hdr = cplhdr(m);
2002 unsigned int rcv_nxt = ntohl(hdr->seq);
2003 struct sockbuf *rcv;
/* Nothing was placed by DDP ahead of this CPL. */
2005 if (tp->rcv_nxt == rcv_nxt)
2008 inp_lock_assert(tp->t_inpcb);
2009 so = inp_inpcbtosocket(tp->t_inpcb);
2010 rcv = so_sockbuf_rcv(so);
2013 q = &toep->tp_ddp_state;
2014 bsp = &q->buf_state[q->cur_buf];
/* NOTE(review): "0x08%x" in the panic string looks like a typo for
 * "0x%08x" — runtime string, left untouched here. */
2015 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2016 rcv_nxt, tp->rcv_nxt));
2017 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2018 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2019 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2020 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2023 if ((int)m->m_pkthdr.len < 0) {
2024 t3_ddp_error(so, "handle_ddp_data: neg len");
/* Tag the mbuf with the DDP gather list and placement offset. */
2027 m->m_ddp_gl = (unsigned char *)bsp->gl;
2028 m->m_flags |= M_DDP;
2029 m->m_cur_offset = bsp->cur_offset;
2030 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2031 if (bsp->flags & DDP_BF_NOCOPY)
2032 bsp->flags &= ~DDP_BF_NOCOPY;
2034 m->m_seq = tp->rcv_nxt;
2035 tp->rcv_nxt = rcv_nxt;
2036 bsp->cur_offset += m->m_pkthdr.len;
2037 if (!(bsp->flags & DDP_BF_NOFLIP))
2040 * For now, don't re-enable DDP after a connection fell out of  DDP
2043 q->ubuf_ddp_ready = 0;
2044 sockbuf_unlock(rcv);
2048 * Process new data received for a connection.
/*
 * Process a CPL_RX_DATA message: validate the sequence number, strip
 * the CPL header, fold in any DDP-placed bytes, update delayed-ACK
 * state and rcv_nxt, append the payload to the socket's receive
 * buffer, and wake the receiver.
 */
2051 new_rx_data(struct toepcb *toep, struct mbuf *m)
2053 struct cpl_rx_data *hdr = cplhdr(m);
2054 struct tcpcb *tp = toep->tp_tp;
2056 struct sockbuf *rcv;
2058 int len = be16toh(hdr->len);
2060 inp_wlock(tp->t_inpcb);
2062 so = inp_inpcbtosocket(tp->t_inpcb);
/* Socket can't receive anymore: discard and possibly reset. */
2064 if (__predict_false(so_no_receive(so))) {
2065 handle_excess_rx(toep, m);
2066 inp_wunlock(tp->t_inpcb);
/* Account for any data the hardware already DMA'd via DDP. */
2071 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2072 handle_ddp_data(toep, m);
2074 m->m_seq = ntohl(hdr->seq);
2075 m->m_ulp_mode = 0; /* for iSCSI */
/* Out-of-sequence CPL: log and drop. */
2078 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2080 "%s: TID %u: Bad sequence number %u, expected %u\n",
2081 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2084 inp_wunlock(tp->t_inpcb);
2088 m_adj(m, sizeof(*hdr));
2090 #ifdef URGENT_DATA_SUPPORTED
2092 * We don't handle urgent data yet
2094 if (__predict_false(hdr->urg))
2095 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2096 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2097 tp->urg_seq - tp->rcv_nxt < skb->len))
2098 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
/* Track hardware delayed-ACK mode changes. */
2101 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2102 toep->tp_delack_mode = hdr->dack_mode;
2103 toep->tp_delack_seq = tp->rcv_nxt;
2105 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2106 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
/* Trust the CPL's length field over the mbuf accounting. */
2108 if (len < m->m_pkthdr.len)
2109 m->m_pkthdr.len = m->m_len = len;
2111 tp->rcv_nxt += m->m_pkthdr.len;
2112 tp->t_rcvtime = ticks;
2113 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2115 "new_rx_data: seq 0x%x len %u",
2116 m->m_seq, m->m_pkthdr.len);
2117 inp_wunlock(tp->t_inpcb);
2118 rcv = so_sockbuf_rcv(so);
2122 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2128 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2131 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2133 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2134 so, rcv->sb_cc, rcv->sb_mbmax));
2138 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2139 rcv->sb_cc, rcv->sb_mbcnt);
/* Only wake readers while a file descriptor still references us. */
2141 state = so_state_get(so);
2142 if (__predict_true((state & SS_NOFDREF) == 0))
2143 so_sorwakeup_locked(so);
2145 sockbuf_unlock(rcv);
2149 * Handler for RX_DATA CPL messages.
/*
 * CPL handler for RX_DATA: thin dispatch to new_rx_data() for the
 * toepcb carried in ctx.
 */
2152 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2154 struct toepcb *toep = (struct toepcb *)ctx;
2156 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2158 new_rx_data(toep, m);
/*
 * Process a CPL_RX_DATA_DDP message: data already DMA'd into a DDP
 * buffer.  Decode the ddp_report to find which buffer and offset were
 * written, advance rcv_nxt, attach the DDP gather list to the mbuf,
 * flip buffers on completion, and wake the receiver.
 */
2164 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2167 struct ddp_state *q;
2168 struct ddp_buf_state *bsp;
2169 struct cpl_rx_data_ddp *hdr;
2171 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2173 unsigned int delack_mode;
2174 struct sockbuf *rcv;
2177 inp_wlock(tp->t_inpcb);
2178 so = inp_inpcbtosocket(tp->t_inpcb);
2180 if (__predict_false(so_no_receive(so))) {
2182 handle_excess_rx(toep, m);
2183 inp_wunlock(tp->t_inpcb);
2187 q = &toep->tp_ddp_state;
2189 ddp_report = ntohl(hdr->u.ddp_report);
/* Bit S_DDP_BUF_IDX selects which of the two DDP buffers was used. */
2190 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2191 bsp = &q->buf_state[buf_idx];
2194 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2195 "hdr seq 0x%x len %u",
2196 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2199 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2200 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2202 ddp_len = ntohs(hdr->len);
2203 rcv_nxt = ntohl(hdr->seq) + ddp_len;
/* Track hardware delayed-ACK mode changes. */
2205 delack_mode = G_DDP_DACK_MODE(ddp_report);
2206 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2207 toep->tp_delack_mode = delack_mode;
2208 toep->tp_delack_seq = tp->rcv_nxt;
2211 m->m_seq = tp->rcv_nxt;
2212 tp->rcv_nxt = rcv_nxt;
2214 tp->t_rcvtime = ticks;
2216 * Store the length in m->m_len. We are changing the meaning of
2217 * m->m_len here, we need to be very careful that nothing from now on
2218 * interprets ->len of this packet the usual way.
2220 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2221 inp_wunlock(tp->t_inpcb);
2223 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2224 m->m_len, rcv_nxt, m->m_seq;
2226 * Figure out where the new data was placed in the buffer and store it
2227 * in when. Assumes the buffer offset starts at 0, consumer needs to
2228 * account for page pod's pg_offset.
2230 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2231 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2233 rcv = so_sockbuf_rcv(so);
2236 m->m_ddp_gl = (unsigned char *)bsp->gl;
2237 m->m_flags |= M_DDP;
2238 bsp->cur_offset = end_offset;
2239 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2242 * Length is only meaningful for kbuf
2244 if (!(bsp->flags & DDP_BF_NOCOPY))
2245 KASSERT(m->m_len <= bsp->gl->dgl_length,
2246 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2247 m->m_len, bsp->gl->dgl_length));
2249 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2250 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2252 * Bit 0 of flags stores whether the DDP buffer is completed.
2253 * Note that other parts of the code depend on this being in bit 0.
2255 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2256 panic("spurious ddp completion");
2258 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2259 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2260 q->cur_buf ^= 1; /* flip buffers */
2263 if (bsp->flags & DDP_BF_NOCOPY) {
2264 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2265 bsp->flags &= ~DDP_BF_NOCOPY;
2268 if (ddp_report & F_DDP_PSH)
2269 m->m_ddp_flags |= DDP_BF_PSH;
2271 m->m_ddp_flags |= DDP_BF_NODATA;
/* NOTE(review): skb/tcp_hdr lines are Linux-isms, presumably under an
 * elided #if 0 — confirm against the full source. */
2274 skb_reset_transport_header(skb);
2275 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
/* Wake the reader on PSH, a completed user buffer, or a kernel buffer. */
2279 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2280 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2281 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2282 so_sorwakeup_locked(so);
2284 sockbuf_unlock(rcv);
2287 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2288 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2289 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2293 * Handler for RX_DATA_DDP CPL messages.
/*
 * CPL handler for RX_DATA_DDP: report any DDP error bits from the
 * ddpvld_status word, otherwise hand the message to new_rx_data_ddp().
 */
2296 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2298 struct toepcb *toep = ctx;
2299 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2303 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2304 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2305 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2306 return (CPL_RET_BUF_DONE);
/* NOTE(review): Linux-ism, presumably under an elided #if 0. */
2309 skb->h.th = tcphdr_skb->h.th;
2311 new_rx_data_ddp(toep, m);
/*
 * Process a CPL_RX_DDP_COMPLETE: a DDP buffer has been filled.  Decode
 * the ddp_report, turn the newly-placed range into an mbuf, advance
 * rcv_nxt, flip buffers unless NOFLIP, and wake the receiver.
 */
2316 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2318 struct tcpcb *tp = toep->tp_tp;
2320 struct ddp_state *q;
2321 struct ddp_buf_state *bsp;
2322 struct cpl_rx_ddp_complete *hdr;
2323 unsigned int ddp_report, buf_idx, when, delack_mode;
2325 struct sockbuf *rcv;
2327 inp_wlock(tp->t_inpcb);
2328 so = inp_inpcbtosocket(tp->t_inpcb);
2330 if (__predict_false(so_no_receive(so))) {
2331 struct inpcb *inp = so_sotoinpcb(so);
2333 handle_excess_rx(toep, m);
2337 q = &toep->tp_ddp_state;
2339 ddp_report = ntohl(hdr->ddp_report);
2340 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
/* Stash pre-update rcv_nxt; csum_data is repurposed for this. */
2341 m->m_pkthdr.csum_data = tp->rcv_nxt;
2343 rcv = so_sockbuf_rcv(so);
2346 bsp = &q->buf_state[buf_idx];
/* New data spans [cur_offset, G_DDP_OFFSET(report)). */
2347 when = bsp->cur_offset;
2348 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2349 tp->rcv_nxt += m->m_len;
2350 tp->t_rcvtime = ticks;
2352 delack_mode = G_DDP_DACK_MODE(ddp_report);
2353 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2354 toep->tp_delack_mode = delack_mode;
2355 toep->tp_delack_seq = tp->rcv_nxt;
/* NOTE(review): Linux-isms, presumably under an elided #if 0. */
2358 skb_reset_transport_header(skb);
2359 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2361 inp_wunlock(tp->t_inpcb);
2363 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2365 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2366 "ddp_report 0x%x offset %u, len %u",
2367 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2368 G_DDP_OFFSET(ddp_report), m->m_len);
2370 m->m_cur_offset = bsp->cur_offset;
2371 bsp->cur_offset += m->m_len;
2373 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2374 q->cur_buf ^= 1; /* flip buffers */
2375 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2380 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2381 "ddp_report %u offset %u",
2382 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2383 G_DDP_OFFSET(ddp_report));
/* Attach the gather list; bit 0 marks a completed buffer. */
2385 m->m_ddp_gl = (unsigned char *)bsp->gl;
2386 m->m_flags |= M_DDP;
2387 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2388 if (bsp->flags & DDP_BF_NOCOPY)
2389 bsp->flags &= ~DDP_BF_NOCOPY;
2391 m->m_ddp_flags |= DDP_BF_NODATA;
2394 if ((so_state_get(so) & SS_NOFDREF) == 0)
2395 so_sorwakeup_locked(so);
2397 sockbuf_unlock(rcv);
2401 * Handler for RX_DDP_COMPLETE CPL messages.
/*
 * CPL handler for RX_DDP_COMPLETE: thin dispatch to
 * process_ddp_complete() for the toepcb carried in ctx.
 */
2404 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2406 struct toepcb *toep = ctx;
/* NOTE(review): Linux-ism, presumably under an elided #if 0. */
2410 skb->h.th = tcphdr_skb->h.th;
2412 process_ddp_complete(toep, m);
2417 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2418 * socket state before calling tcp_time_wait to comply with its expectations.
/*
 * Move the connection into TIME_WAIT.  Adjusts tcpcb fields that
 * tcp_offload_twstart()/tcp_time_wait expect: rcv_nxt is bumped for
 * the peer FIN (deferred until now — see comment), and timestamp/RTT
 * state is cleared to defeat recycling and metric updates.
 */
2421 enter_timewait(struct tcpcb *tp)
2424 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2425 * process peer_close because we don't want to carry the peer FIN in
2426 * the socket's receive queue and if we increment rcv_nxt without
2427 * having the FIN in the receive queue we'll confuse facilities such
2430 inp_wlock(tp->t_inpcb);
2433 tp->ts_recent_age = 0; /* defeat recycling */
2434 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2435 inp_wunlock(tp->t_inpcb);
2436 tcp_offload_twstart(tp);
2440 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2441 * function deals with the data that may be reported along with the FIN.
2442 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2443 * perform normal FIN-related processing. In the latter case 1 indicates that
2444 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
/*
 * Handle DDP data implicitly reported by a PEER_CLOSE (acts as an
 * RX_DDP_COMPLETE for bytes placed before the FIN).  Returns -1 when
 * no further PEER_CLOSE processing is needed, otherwise >= 0 (1 means
 * the mbuf was consumed here and must not be freed by the caller).
 */
2448 handle_peer_close_data(struct socket *so, struct mbuf *m)
2450 struct tcpcb *tp = so_sototcpcb(so);
2451 struct toepcb *toep = tp->t_toe;
2452 struct ddp_state *q;
2453 struct ddp_buf_state *bsp;
2454 struct cpl_peer_close *req = cplhdr(m);
2455 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2456 struct sockbuf *rcv;
2458 if (tp->rcv_nxt == rcv_nxt) /* no data */
2461 CTR0(KTR_TOM, "handle_peer_close_data");
2462 if (__predict_false(so_no_receive(so))) {
2463 handle_excess_rx(toep, m);
2466 * Although we discard the data we want to process the FIN so
2467 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2468 * PEER_CLOSE without data. In particular this PEER_CLOSE
2469 * may be what will close the connection. We return 1 because
2470 * handle_excess_rx() already freed the packet.
2475 inp_lock_assert(tp->t_inpcb);
2476 q = &toep->tp_ddp_state;
2477 rcv = so_sockbuf_rcv(so);
2480 bsp = &q->buf_state[q->cur_buf];
/* Describe the DDP-placed range [tp->rcv_nxt, rcv_nxt). */
2481 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2482 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2483 m->m_ddp_gl = (unsigned char *)bsp->gl;
2484 m->m_flags |= M_DDP;
2485 m->m_cur_offset = bsp->cur_offset;
2487 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2488 m->m_seq = tp->rcv_nxt;
2489 tp->rcv_nxt = rcv_nxt;
2490 bsp->cur_offset += m->m_pkthdr.len;
2491 if (!(bsp->flags & DDP_BF_NOFLIP))
/* NOTE(review): Linux-isms, presumably under an elided #if 0. */
2494 skb_reset_transport_header(skb);
2495 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2497 tp->t_rcvtime = ticks;
2499 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2500 so_sorwakeup_locked(so);
2502 sockbuf_unlock(rcv);
2508 * Handle a peer FIN.
/*
 * Process a peer FIN (CPL_PEER_CLOSE): absorb any implicit DDP data,
 * mark the socket can't-receive, set delayed/immediate ACK, and run
 * the FIN state transitions (ESTABLISHED->CLOSE_WAIT, FIN_WAIT_1->
 * CLOSING, FIN_WAIT_2->TIME_WAIT or close depending on a pending
 * abort).  The chosen action is executed after dropping the inpcb lock.
 */
2511 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2514 struct tcpcb *tp = toep->tp_tp;
2518 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
/* Non-T3A with an abort outstanding: the FIN is moot. */
2519 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2520 printf("abort_pending set\n");
2524 inp_wlock(tp->t_inpcb);
2525 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2526 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2527 keep = handle_peer_close_data(so, m);
2529 inp_wunlock(tp->t_inpcb);
2533 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2535 "waking up waiters for cantrcvmore on %p ", so);
2539 * If connection is half-synchronized
2540 * (ie NEEDSYN flag on) then delay ACK,
2541 * so it may be piggybacked when SYN is sent.
2542 * Otherwise, since we received a FIN then no
2543 * more input can be expected, send ACK now.
2545 if (tp->t_flags & TF_NEEDSYN)
2546 tp->t_flags |= TF_DELACK;
2548 tp->t_flags |= TF_ACKNOW;
2552 switch (tp->t_state) {
2553 case TCPS_SYN_RECEIVED:
2554 tp->t_starttime = ticks;
/* FALLTHROUGH to ESTABLISHED handling. */
2556 case TCPS_ESTABLISHED:
2557 tp->t_state = TCPS_CLOSE_WAIT;
2559 case TCPS_FIN_WAIT_1:
2560 tp->t_state = TCPS_CLOSING;
2562 case TCPS_FIN_WAIT_2:
2564 * If we've sent an abort_req we must have sent it too late,
2565 * HW will send us a reply telling us so, and this peer_close
2566 * is really the last message for this connection and needs to
2567 * be treated as an abort_rpl, i.e., transition the connection
2568 * to TCP_CLOSE (note that the host stack does this at the
2569 * time of generating the RST but we must wait for HW).
2570 * Otherwise we enter TIME_WAIT.
2572 t3_release_offload_resources(toep);
2573 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2576 action = TCP_TIMEWAIT;
2581 "%s: TID %u received PEER_CLOSE in bad state %d\n",
2582 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2584 inp_wunlock(tp->t_inpcb);
/* Execute the deferred action without the inpcb lock held. */
2586 if (action == TCP_TIMEWAIT) {
2588 } else if (action == TCP_DROP) {
2589 tcp_offload_drop(tp, 0);
2590 } else if (action == TCP_CLOSE) {
2591 tcp_offload_close(tp);
/* NOTE(review): sk_wake_async/POLL_* below are Linux-isms, presumably
 * under an elided #if 0 — confirm against the full source. */
2595 /* Do not send POLL_HUP for half duplex close. */
2596 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2597 sk->sk_state == TCP_CLOSE)
2598 sk_wake_async(so, 1, POLL_HUP);
2600 sk_wake_async(so, 1, POLL_IN);
2609 * Handler for PEER_CLOSE CPL messages.
/*
 * CPL handler for PEER_CLOSE: thin dispatch to do_peer_fin() for the
 * toepcb carried in ctx.
 */
2612 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2614 struct toepcb *toep = (struct toepcb *)ctx;
2618 do_peer_fin(toep, m);
/*
 * Process a CPL_CLOSE_CON_RPL: our FIN has been ACKed.  Update snd_una
 * (excluding the FIN), then run the close state machine: CLOSING ->
 * TIME_WAIT/close, LAST_ACK -> close, FIN_WAIT_1 -> FIN_WAIT_2 (with
 * the fast-finwait2 recycle timer if the peer can send no more).  The
 * chosen action is executed after dropping the inpcb lock.
 */
2623 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2625 struct cpl_close_con_rpl *rpl = cplhdr(m);
2626 struct tcpcb *tp = toep->tp_tp;
2629 struct sockbuf *rcv;
2631 inp_wlock(tp->t_inpcb);
2632 so = inp_inpcbtosocket(tp->t_inpcb);
2634 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
/* Non-T3A with an abort outstanding: ignore this reply. */
2636 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2637 inp_wunlock(tp->t_inpcb);
2641 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2642 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2644 switch (tp->t_state) {
2645 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2646 t3_release_offload_resources(toep);
2647 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2651 action = TCP_TIMEWAIT;
2656 * In this state we don't care about pending abort_rpl.
2657 * If we've sent abort_req it was post-close and was sent too
2658 * late, this close_con_rpl is the actual last message.
2660 t3_release_offload_resources(toep);
2663 case TCPS_FIN_WAIT_1:
2665 * If we can't receive any more
2666 * data, then closing user can proceed.
2667 * Starting the timer is contrary to the
2668 * specification, but if we don't get a FIN
2669 * we'll hang forever.
2672 * we should release the tp also, and use a
2676 rcv = so_sockbuf_rcv(so);
2680 if (rcv->sb_state & SBS_CANTRCVMORE) {
2684 soisdisconnected(so);
/* Bound FIN_WAIT_2 lifetime so we don't hang without a peer FIN. */
2685 timeout = (tcp_fast_finwait2_recycle) ?
2686 tcp_finwait2_timeout : tcp_maxidle;
2687 tcp_timer_activate(tp, TT_2MSL, timeout);
2689 tp->t_state = TCPS_FIN_WAIT_2;
2690 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2691 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2698 "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2699 toep->tp_toedev->tod_name, toep->tp_tid,
/* Execute the deferred action without the inpcb lock held. */
2702 inp_wunlock(tp->t_inpcb);
2705 if (action == TCP_TIMEWAIT) {
2707 } else if (action == TCP_DROP) {
2708 tcp_offload_drop(tp, 0);
2709 } else if (action == TCP_CLOSE) {
2710 tcp_offload_close(tp);
2717 * Handler for CLOSE_CON_RPL CPL messages.
/*
 * CPL handler for CLOSE_CON_RPL: thin dispatch to
 * process_close_con_rpl() for the toepcb carried in ctx.
 */
2720 do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
2723 struct toepcb *toep = (struct toepcb *)ctx;
2725 process_close_con_rpl(toep, m);
2730 * Process abort replies. We only process these messages if we anticipate
2731 * them as the coordination between SW and HW in this area is somewhat lacking
2732 * and sometimes we get ABORT_RPLs after we are done with the connection that
2733 * originated the ABORT_REQ.
/*
 * Process an expected ABORT_RPL: clear the pending/received abort
 * flags and, unless a crossed abort on T3A requires waiting, release
 * the offload resources and close the connection.  Unexpected RPLs
 * (no TP_ABORT_RPL_PENDING) are ignored — see the comment above the
 * CPL handler.
 */
2736 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2738 struct tcpcb *tp = toep->tp_tp;
2743 T3_TRACE1(TIDTB(sk),
2744 "process_abort_rpl: GTS rpl pending %d",
2745 sock_flag(sk, ABORT_RPL_PENDING));
2748 inp_wlock(tp->t_inpcb);
2749 so = inp_inpcbtosocket(tp->t_inpcb);
2751 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2753 * XXX panic on tcpdrop
/* Non-T3A hardware sends two RPLs; record the first and wait. */
2755 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2756 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2758 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2759 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2760 !is_t3a(toep->tp_toedev)) {
2761 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2762 panic("TP_ABORT_REQ_RCVD set");
2763 t3_release_offload_resources(toep);
2768 inp_wunlock(tp->t_inpcb);
2771 tcp_offload_close(tp);
2777 * Handle an ABORT_RPL_RSS CPL message.
2780 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2782 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2783 struct toepcb *toep;
2786 * Ignore replies to post-close aborts indicating that the abort was
2787 * requested too late. These connections are terminated when we get
2788 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2789 * arrives the TID is either no longer used or it has been recycled.
2791 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2797 toep = (struct toepcb *)ctx;
2800 * Sometimes we've already closed the socket, e.g., a post-close
2801 * abort races with ABORT_REQ_RSS, the latter frees the socket
2802 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2803 * but FW turns the ABORT_REQ into a regular one and so we get
2804 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
/*
 * Orphaned toepcb (tp_tp already cleared): just recycle the TID, drop the
 * L2T reference and release the toepcb — there is no tcpcb left to notify.
 */
2809 if (toep->tp_tp == NULL) {
2810 log(LOG_NOTICE, "removing tid for abort\n");
2811 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2813 l2t_release(L2DATA(cdev), toep->tp_l2t);
2815 toepcb_release(toep);
2819 log(LOG_NOTICE, "toep=%p\n", toep);
2820 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
/* Normal path: process the reply, then drop our toepcb reference. */
2823 process_abort_rpl(toep, m);
2824 toepcb_release(toep);
2829 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2830 * indicate whether RST should be sent in response.
2833 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2835 struct tcpcb *tp = so_sototcpcb(so);
2837 switch (abort_reason) {
/*
 * NOTE(review): NET_INC_STATS_BH is a Linux leftover — presumably a no-op
 * here; the BAD_SYN case falls through into CONN_RESET handling.
 */
2838 case CPL_ERR_BAD_SYN:
2840 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2842 case CPL_ERR_CONN_RESET:
2843 // XXX need to handle SYN_RECV due to crossed SYNs
2844 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
/*
 * All HW-detected timeouts map to one result; the return value and the
 * *need_rst handling for these cases are on elided lines — confirm against
 * the full source (likely ETIMEDOUT).
 */
2845 case CPL_ERR_XMIT_TIMEDOUT:
2846 case CPL_ERR_PERSIST_TIMEDOUT:
2847 case CPL_ERR_FINWAIT2_TIMEDOUT:
2848 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2850 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
/*
 * Populate the firmware work-request header and CPL opcode/TID for an
 * ABORT_RPL carried in mbuf m, and size the mbuf to exactly one reply.
 * NOTE(review): the 'cmd' parameter is not consumed in the lines visible
 * here — presumably stored into the reply on an elided line; confirm.
 */
2859 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2861 struct cpl_abort_rpl *rpl = cplhdr(m);
2863 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2864 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2865 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2867 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
/*
 * Send a previously deferred ABORT_RPL.  'm' is the original ABORT_REQ_RSS
 * mbuf (its status field was overloaded to carry rst_status by
 * send_abort_rpl()); a fresh reply mbuf is allocated and sent.
 *
 * FIX(review): the priority and length were being set on the request mbuf
 * 'm' instead of the newly allocated 'reply_mbuf'.  That left the reply with
 * no priority (cf. send_abort_rpl(), which tags reply_mbuf) and scribbled a
 * bogus length into the request.  Both now target reply_mbuf; the length
 * assignment is technically redundant with set_abort_rpl_wr() but kept for
 * clarity.
 */
2872 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2874 struct mbuf *reply_mbuf;
2875 struct cpl_abort_req_rss *req = cplhdr(m);
2877 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2878 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2879 reply_mbuf->m_len = reply_mbuf->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2880 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2881 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2886 * Returns whether an ABORT_REQ_RSS message is a negative advice.
/*
 * Negative-advice aborts (retransmission / persist advice from HW) do not
 * terminate the connection and are filtered out by do_abort_req().
 */
2889 is_neg_adv_abort(unsigned int status)
2891 return status == CPL_ERR_RTX_NEG_ADVICE ||
2892 status == CPL_ERR_PERSIST_NEG_ADVICE;
/*
 * Reply to an ABORT_REQ_RSS.  If no mbuf can be allocated right now the
 * reply is deferred: rst_status is parked in req->status and the original
 * request is queued for send_deferred_abort_rpl().
 */
2896 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2898 struct mbuf *reply_mbuf;
2899 struct cpl_abort_req_rss *req = cplhdr(m);
2901 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2904 /* Defer the reply. Stick rst_status into req->cmd. */
2905 req->status = rst_status;
2906 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2910 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2911 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2915 * XXX need to sync with ARP as for SYN_RECV connections we can send
2916 * these messages while ARP is pending. For other connection states
2917 * it's not a problem.
2919 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * NOTE(review): stubbed out — CXGB_UNIMPLEMENTED() fires before the dead
 * Linux-derived code below (request_sock / synq manipulation), which is
 * presumably never compiled in on FreeBSD; candidate for removal.
 */
2924 cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2926 CXGB_UNIMPLEMENTED();
2928 struct request_sock *req = child->sk_user_data;
2930 inet_csk_reqsk_queue_removed(parent, req);
2931 synq_remove(tcp_sk(child));
2933 child->sk_user_data = NULL;
2939 * Performs the actual work to abort a SYN_RECV connection.
2942 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2944 struct tcpcb *parenttp = so_sototcpcb(parent);
2945 struct tcpcb *childtp = so_sototcpcb(child);
2948 * If the server is still open we clean up the child connection,
2949 * otherwise the server already did the clean up as it was purging
2950 * its SYN queue and the skb was just sitting in its backlog.
/*
 * Cleanup order: detach from the listener, then release the child's offload
 * resources under its inpcb lock, then close it at the TCP layer.
 */
2952 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2953 cleanup_syn_rcv_conn(child, parent);
2954 inp_wlock(childtp->t_inpcb);
2955 t3_release_offload_resources(childtp->t_toe);
2956 inp_wunlock(childtp->t_inpcb);
2957 tcp_offload_close(childtp);
2963 * Handle abort requests for a SYN_RECV connection. These need extra work
2964 * because the socket is on its parent's SYN queue.
/*
 * NOTE(review): stubbed out via CXGB_UNIMPLEMENTED(); the remainder is dead
 * Linux-derived code (so_incomp abused as the original open request,
 * ts_recent as the stid).  Callers only rely on the return value.
 */
2967 abort_syn_rcv(struct socket *so, struct mbuf *m)
2969 CXGB_UNIMPLEMENTED();
2971 struct socket *parent;
2972 struct toedev *tdev = toep->tp_toedev;
2973 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2974 struct socket *oreq = so->so_incomp;
2975 struct t3c_tid_entry *t3c_stid;
2979 return -1; /* somehow we are not on the SYN queue */
2981 t = &(T3C_DATA(cdev))->tid_maps;
2982 t3c_stid = lookup_stid(t, oreq->ts_recent);
2983 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2986 do_abort_syn_rcv(so, parent);
2987 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
2994 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
2995 * request except that we need to reply to it.
2998 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3000 int rst_status = CPL_ABORT_NO_RST;
3001 const struct cpl_abort_req_rss *req = cplhdr(m);
3002 struct tcpcb *tp = toep->tp_tp;
3006 inp_wlock(tp->t_inpcb);
3007 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* First ABORT_REQ for this connection: record it and mark shutdown. */
3008 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3009 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3014 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3016 * Three cases to consider:
3017 * a) We haven't sent an abort_req; close the connection.
3018 * b) We have sent a post-close abort_req that will get to TP too late
3019 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3020 * be ignored and the connection should be closed now.
3021 * c) We have sent a regular abort_req that will get to TP too late.
3022 * That will generate an abort_rpl with status 0, wait for it.
3024 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3025 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
/* Translate the HW abort reason into so_error for the application. */
3028 error = abort_status_to_errno(so, req->status,
3030 so_error_set(so, error);
3032 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3035 * SYN_RECV needs special processing. If abort_syn_rcv()
3036 * returns 0 is has taken care of the abort.
3038 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3041 t3_release_offload_resources(toep);
3044 inp_wunlock(tp->t_inpcb);
3047 tcp_offload_close(tp);
/* Always acknowledge the ABORT_REQ, even when we otherwise ignore it. */
3049 send_abort_rpl(m, tdev, rst_status);
3052 inp_wunlock(tp->t_inpcb);
3056 * Handle an ABORT_REQ_RSS CPL message.
3059 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3061 const struct cpl_abort_req_rss *req = cplhdr(m);
3062 struct toepcb *toep = (struct toepcb *)ctx;
/* Negative advice does not abort the connection; drop it early. */
3064 if (is_neg_adv_abort(req->status)) {
3069 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
/*
 * Connection aborted while still in SYN_RCVD (embryonic): recycle the TID,
 * reply without RST, drop the L2T entry and detach the toepcb from the
 * tcpcb before releasing it.
 */
3071 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3072 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3073 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3075 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3077 l2t_release(L2DATA(cdev), toep->tp_l2t);
3082 toep->tp_tp->t_toe = NULL;
3083 toep->tp_tp->t_flags &= ~TF_TOE;
3086 * XXX need to call syncache_chkrst - but we don't
3087 * have a way of doing that yet
3089 toepcb_release(toep);
3090 log(LOG_ERR, "abort for unestablished connection :-(\n");
/* Orphaned toepcb: nothing to do, it is about to be freed. */
3093 if (toep->tp_tp == NULL) {
3094 log(LOG_NOTICE, "disconnected toepcb\n");
3095 /* should be freed momentarily */
3101 process_abort_req(toep, m, toep->tp_toedev);
3102 toepcb_release(toep);
/*
 * Abort a passive-open that failed before establishment: tear down the
 * SYN_RECV child, then (on T3 hardware) recycle the original ACCEPT_RPL
 * mbuf as a TCAM-bypass reject so the HW entry is cleaned up too.
 */
3107 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3109 struct toedev *tdev = TOE_DEV(parent);
3111 do_abort_syn_rcv(child, parent);
3112 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3113 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3115 rpl->opt0h = htonl(F_TCAM_BYPASS);
3116 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3117 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
/*
 * NOTE(review): stubbed out via CXGB_UNIMPLEMENTED(); the code after it is
 * dead Linux-derived logic (so_incomp / ts_recent used to find the listener)
 * kept for reference.  If an ABORT_RPL is already pending the abort path
 * will close the connection and nothing needs to be done here.
 */
3123 handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3125 CXGB_UNIMPLEMENTED();
3128 struct t3cdev *cdev;
3129 struct socket *parent;
3130 struct socket *oreq;
3131 struct t3c_tid_entry *t3c_stid;
3133 struct tcpcb *otp, *tp = so_sototcpcb(so);
3134 struct toepcb *toep = tp->t_toe;
3137 * If the connection is being aborted due to the parent listening
3138 * socket going away there's nothing to do, the ABORT_REQ will close
3141 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
3146 oreq = so->so_incomp;
3147 otp = so_sototcpcb(oreq);
3150 t = &(T3C_DATA(cdev))->tid_maps;
3151 t3c_stid = lookup_stid(t, otp->ts_recent);
3152 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3155 pass_open_abort(so, parent, m);
3161 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3162 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
/*
 * NOTE(review): the TCP_INC_STATS_BH/BLOG_SKB_CB lines are Linux residue —
 * presumably compiled out; only the handle_pass_open_arp_failure() call is
 * live on FreeBSD.
 */
3166 pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
3170 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3171 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3173 handle_pass_open_arp_failure(m_get_socket(m), m);
3177 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3180 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3182 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3183 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3184 unsigned int tid = GET_TID(req);
3186 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3187 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3188 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3189 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
/* TCAM bypass + REJECT status tells the HW to drop the embryonic entry. */
3190 rpl->opt0h = htonl(F_TCAM_BYPASS);
3191 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3193 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3197 * Send a deferred reject to an accept request.
/* Deferred-work callback paired with t3_defer_reply(); cannot fail to
 * allocate thanks to m_gethdr_nofail(). */
3200 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3202 struct mbuf *reply_mbuf;
3204 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3205 mk_pass_accept_rpl(reply_mbuf, m);
3206 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Callback invoked by the syncache layer; in every visible case the extra
 * toepcb reference taken before syncache_add is dropped here.
 */
3211 handle_syncache_event(int event, void *arg)
3213 struct toepcb *toep = arg;
3216 case TOE_SC_ENTRY_PRESENT:
3218 * entry already exists - free toepcb
3221 printf("syncache entry present\n");
3222 toepcb_release(toep);
3226 * The syncache has given up on this entry
3227 * either it timed out, or it was evicted
3228 * we need to explicitly release the tid
3230 printf("syncache entry dropped\n");
3231 toepcb_release(toep);
3234 log(LOG_ERR, "unknown syncache event %d\n", event);
/*
 * Build an in_conninfo/tcpopt/tcphdr triple from a CPL_PASS_ACCEPT_REQ and
 * enter the embryonic connection into the FreeBSD syncache on behalf of
 * listener 'lso', associating it with 'toep'.
 */
3240 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3242 struct in_conninfo inc;
3246 int mss, wsf, sack, ts;
3247 uint32_t rcv_isn = ntohl(req->rcv_isn);
3249 bzero(&to, sizeof(struct tcpopt));
3250 inp = so_sotoinpcb(lso);
3253 * Fill out information for entering us into the syncache
/* Ports/addresses are copied in wire (network) byte order as-is. */
3255 inc.inc_fport = th.th_sport = req->peer_port;
3256 inc.inc_lport = th.th_dport = req->local_port;
3257 th.th_seq = req->rcv_isn;
3258 th.th_flags = TH_SYN;
/* All receive-side sequence trackers start one past the peer's ISN. */
3260 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3265 inc.inc_faddr.s_addr = req->peer_ip;
3266 inc.inc_laddr.s_addr = req->local_ip;
3268 DPRINTF("syncache add of %d:%d %d:%d\n",
3269 ntohl(req->local_ip), ntohs(req->local_port),
3270 ntohl(req->peer_ip), ntohs(req->peer_port));
3272 mss = req->tcp_options.mss;
3273 wsf = req->tcp_options.wsf;
3274 ts = req->tcp_options.tstamp;
3275 sack = req->tcp_options.sack;
3278 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3284 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3285 * lock held. Note that the sock here is a listening socket that is not owned
/*
 * NOTE(review): long multi-stage handler with many lines elided in this
 * view (error labels, reject path plumbing).  Visible stages: allocate a
 * reply mbuf (defer/recycle TID on failure), validate listener state and
 * ingress interface, allocate and initialize a toepcb, insert the TID,
 * queue onto the listener's synq, optionally enable DDP, enter the
 * syncache, then send the ACCEPT_RPL via the L2T entry.
 */
3289 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3290 struct listen_ctx *lctx)
3293 struct l2t_entry *e;
3295 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3296 struct cpl_pass_accept_rpl *rpl;
3297 struct cpl_pass_accept_req *req = cplhdr(m);
3298 unsigned int tid = GET_TID(req);
3299 struct tom_data *d = TOM_DATA(tdev);
3300 struct t3cdev *cdev = d->cdev;
3301 struct tcpcb *tp = so_sototcpcb(so);
3302 struct toepcb *newtoep;
3303 struct rtentry *dst;
3304 struct sockaddr_in nam;
3305 struct t3c_data *td = T3C_DATA(cdev);
3307 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
/* No mbuf: on T3 defer a reject reply, otherwise just recycle the TID. */
3308 if (__predict_false(reply_mbuf == NULL)) {
3309 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3310 t3_defer_reply(m, tdev, reject_pass_request);
3312 cxgb_queue_tid_release(cdev, tid);
3315 DPRINTF("failed to get reply_mbuf\n");
3320 if (tp->t_state != TCPS_LISTEN) {
3321 DPRINTF("socket not in listen state\n");
3326 tim.mac_addr = req->dst_mac;
3327 tim.vlan_tag = ntohs(req->vlan_tag);
3328 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3329 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3335 * XXX do route lookup to confirm that we're still listening on this
/* Dead Linux-derived route check follows; rt_flags is forced to RTF_LOCAL
 * below, so the second test can never reject — flagged, left as-is. */
3338 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3339 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3341 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3342 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3343 dst_release(skb->dst); // done with the input route, release it
3346 if ((rt_flags & RTF_LOCAL) == 0)
3352 rt_flags = RTF_LOCAL;
3353 if ((rt_flags & RTF_LOCAL) == 0)
3357 * Calculate values and add to syncache
3360 newtoep = toepcb_alloc();
3361 if (newtoep == NULL)
3364 bzero(&nam, sizeof(struct sockaddr_in));
3366 nam.sin_len = sizeof(struct sockaddr_in);
3367 nam.sin_family = AF_INET;
3368 nam.sin_addr.s_addr =req->peer_ip;
3369 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3372 printf("failed to find route\n");
3375 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3376 (struct sockaddr *)&nam);
3378 DPRINTF("failed to get l2t\n");
3381 * Point to our listen socket until accept
3383 newtoep->tp_tp = tp;
3384 newtoep->tp_flags = TP_SYN_RCVD;
3385 newtoep->tp_tid = tid;
3386 newtoep->tp_toedev = tdev;
3387 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3389 cxgb_insert_tid(cdev, d->client, newtoep, tid);
3391 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
/* DDP only if tunable enabled, socket allows it, and window is big enough. */
3394 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3395 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3397 if (newtoep->tp_ulp_mode) {
3398 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3400 if (ddp_mbuf == NULL)
3401 newtoep->tp_ulp_mode = 0;
3404 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3405 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3406 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3408 * XXX workaround for lack of syncache drop
/* Extra reference dropped later by handle_syncache_event(). */
3410 toepcb_hold(newtoep);
3411 syncache_add_accept_req(req, so, newtoep);
3413 rpl = cplhdr(reply_mbuf);
3414 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3415 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3417 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3418 rpl->opt2 = htonl(calc_opt2(so, tdev));
3419 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3420 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3422 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3423 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3424 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3425 CPL_PASS_OPEN_ACCEPT);
3427 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3429 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3431 l2t_send(cdev, reply_mbuf, e);
3433 if (newtoep->tp_ulp_mode) {
3434 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3436 TP_DDP_TIMER_WORKAROUND_MASK,
3438 TP_DDP_TIMER_WORKAROUND_VAL, 1);
3440 printf("not offloading\n");
/* Error path (labels elided): reject on T3, else release the TID. */
3446 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3447 mk_pass_accept_rpl(reply_mbuf, m);
3449 mk_tid_release(reply_mbuf, newtoep, tid);
3450 cxgb_ofld_send(cdev, reply_mbuf);
3454 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3461 * Handle a CPL_PASS_ACCEPT_REQ message.
/*
 * NOTE(review): the validation code below is Linux residue (unlikely/printk,
 * and 'lsk' is never declared here) — presumably inside an elided #ifdef;
 * only the final process_pass_accept_req() call is live on FreeBSD.
 */
3464 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3466 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3467 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3468 struct tom_data *d = listen_ctx->tom_data;
3471 struct cpl_pass_accept_req *req = cplhdr(m);
3472 unsigned int tid = GET_TID(req);
3473 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3475 if (unlikely(!lsk)) {
3476 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3478 (unsigned long)((union listen_entry *)ctx -
3480 return CPL_RET_BUF_DONE;
3482 if (unlikely(tid >= t->ntids)) {
3483 printk(KERN_ERR "%s: passive open TID %u too large\n",
3485 return CPL_RET_BUF_DONE;
3488 * For T3A the current user of the TID may have closed but its last
3489 * message(s) may have been backlogged so the TID appears to be still
3490 * in use. Just take the TID away, the connection can close at its
3491 * own leisure. For T3B this situation is a bug.
3493 if (!valid_new_tid(t, tid) &&
3494 cdev->type != T3A) {
3495 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3497 return CPL_RET_BUF_DONE;
3501 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3506 * Called when a connection is established to translate the TCP options
3507 * reported by HW to FreeBSD's native format.
3510 assign_rxopt(struct socket *so, unsigned int opt)
3512 struct tcpcb *tp = so_sototcpcb(so);
3513 struct toepcb *toep = tp->t_toe;
3514 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3516 inp_lock_assert(tp->t_inpcb);
/* MTU from the HW-negotiated index minus 40 — presumably the fixed
 * IPv4 (20) + TCP (20) header overhead; confirm. */
3518 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3519 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3520 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3521 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
/* Window scaling takes effect only if both sides agreed to it. */
3522 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3523 (TF_RCVD_SCALE|TF_REQ_SCALE))
3524 tp->rcv_scale = tp->request_r_scale;
3528 * Completes some final bits of initialization for just established connections
3529 * and changes their state to TCP_ESTABLISHED.
3531 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3534 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3536 struct tcpcb *tp = so_sototcpcb(so);
3537 struct toepcb *toep = tp->t_toe;
/* Seed every send-side sequence tracker from the post-SYN ISN. */
3539 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3540 assign_rxopt(so, opt);
/* Route socket option processing through the TOE from now on. */
3547 so->so_proto->pr_ctloutput = t3_ctloutput;
3551 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3554 * XXX not clear what rcv_wup maps to
3557 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3558 * pass through opt0.
3560 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3561 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3567 * no clean interface for marking ARP up to date
3569 dst_confirm(sk->sk_dst_cache);
3571 tp->t_starttime = ticks;
3572 tp->t_state = TCPS_ESTABLISHED;
/*
 * Rebuild the syncache lookup key (in_conninfo/tcpopt/tcphdr) from a
 * CPL_PASS_ESTABLISH and expand the syncache entry into a full socket,
 * returned through *so.  Mirrors syncache_add_accept_req() on the add side.
 */
3577 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3580 struct in_conninfo inc;
3583 int mss, wsf, sack, ts;
3584 struct mbuf *m = NULL;
3585 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3589 #error "no MAC support"
3592 opt = ntohs(req->tcp_opt);
3594 bzero(&to, sizeof(struct tcpopt));
3597 * Fill out information for entering us into the syncache
3599 inc.inc_fport = th.th_sport = req->peer_port;
3600 inc.inc_lport = th.th_dport = req->local_port;
3601 th.th_seq = req->rcv_isn;
3602 th.th_flags = TH_ACK;
3606 inc.inc_faddr.s_addr = req->peer_ip;
3607 inc.inc_laddr.s_addr = req->local_ip;
/* Decode the HW-reported option word (same 40-byte header deduction as
 * assign_rxopt()). */
3609 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3610 wsf = G_TCPOPT_WSCALE_OK(opt);
3611 ts = G_TCPOPT_TSTAMP(opt);
3612 sack = G_TCPOPT_SACK(opt);
3615 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3616 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3618 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3619 ntohl(req->local_ip), ntohs(req->local_port),
3620 ntohl(req->peer_ip), ntohs(req->peer_port),
3621 mss, wsf, ts, sack);
3622 return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3627 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3628 * if we are in TCP_SYN_RECV due to crossed SYNs
/*
 * NOTE(review): completes a passively opened connection — removes the toepcb
 * from the listener's synq, expands the syncache entry into a real socket,
 * installs offload state/ops on it, and marks it established.  Several error
 * paths and the trailing Linux-derived accept-queue code are elided or dead
 * in this view.
 */
3631 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3633 struct cpl_pass_establish *req = cplhdr(m);
3634 struct toepcb *toep = (struct toepcb *)ctx;
3635 struct tcpcb *tp = toep->tp_tp;
3636 struct socket *so, *lso;
3637 struct t3c_data *td = T3C_DATA(cdev);
3638 struct sockbuf *snd, *rcv;
3640 // Complete socket initialization now that we have the SND_ISN
3642 struct toedev *tdev;
3645 tdev = toep->tp_toedev;
3647 inp_wlock(tp->t_inpcb);
3651 * XXX need to add reference while we're manipulating
3653 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3655 inp_wunlock(tp->t_inpcb);
/* Off the listener's SYN queue; the toepcb now belongs to the new socket. */
3658 LIST_REMOVE(toep, synq_entry);
3661 if (!syncache_expand_establish_req(req, &so, toep)) {
3665 CXGB_UNIMPLEMENTED();
3669 * Couldn't create the socket
3671 CXGB_UNIMPLEMENTED();
/* From here on operate on the newly created connection's tcpcb. */
3674 tp = so_sototcpcb(so);
3675 inp_wlock(tp->t_inpcb);
3677 snd = so_sockbuf_snd(so);
3678 rcv = so_sockbuf_rcv(so);
3680 snd->sb_flags |= SB_NOCOALESCE;
3681 rcv->sb_flags |= SB_NOCOALESCE;
3686 reset_wr_list(toep);
3687 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3688 tp->rcv_nxt = toep->tp_copied_seq;
3689 install_offload_ops(so);
3691 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3692 toep->tp_wr_unacked = 0;
3693 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3694 toep->tp_qset_idx = 0;
3695 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3698 * XXX Cancel any keep alive timer
3701 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3704 * XXX workaround for lack of syncache drop
/* Drop the extra reference taken before syncache_add. */
3706 toepcb_release(toep);
3707 inp_wunlock(tp->t_inpcb);
3709 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3710 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3713 * XXX not sure how these checks map to us
3715 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3716 sk->sk_state_change(sk);
3717 sk_wake_async(so, 0, POLL_OUT);
3720 * The state for the new connection is now up to date.
3721 * Next check if we should add the connection to the parent's
3722 * accept queue. When the parent closes it resets connections
3723 * on its SYN queue, so check if we are being reset. If so we
3724 * don't need to do anything more, the coming ABORT_RPL will
3725 * destroy this socket. Otherwise move the connection to the
3728 * Note that we reset the synq before closing the server so if
3729 * we are not being reset the stid is still open.
3731 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3742 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3743 * and send them to the TOE.
3746 fixup_and_send_ofo(struct toepcb *toep)
3749 struct toedev *tdev = toep->tp_toedev;
3750 struct tcpcb *tp = toep->tp_tp;
3751 unsigned int tid = toep->tp_tid;
3753 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3755 inp_lock_assert(tp->t_inpcb);
/* Drain the deferred-CPL queue, patching each message's WR TID and
 * opcode/TID now that a real TID is known. */
3756 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3758 * A variety of messages can be waiting but the fields we'll
3759 * be touching are common to all so any message type will do.
3761 struct cpl_close_con_req *p = cplhdr(m);
3763 p->wr.wr_lo = htonl(V_WR_TID(tid));
3764 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3765 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3770 * Updates socket state from an active establish CPL message. Runs with the
3774 socket_act_establish(struct socket *so, struct mbuf *m)
3776 struct cpl_act_establish *req = cplhdr(m);
3777 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3778 struct tcpcb *tp = so_sototcpcb(so);
3779 struct toepcb *toep = tp->t_toe;
3781 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3782 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3783 toep->tp_tid, tp->t_state);
3785 tp->ts_recent_age = ticks;
/* Seed receive-side sequence state from the peer's (post-SYN) ISN. */
3786 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3787 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3789 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3792 * Now that we finally have a TID send any CPL messages that we had to
3793 * defer for lack of a TID.
3795 if (mbufq_len(&toep->out_of_order_queue))
3796 fixup_and_send_ofo(toep);
3798 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3800 * XXX does this even make sense?
3807 * XXX assume no write requests permitted while socket connection is
3811 * Currently the send queue must be empty at this point because the
3812 * socket layer does not send anything before a connection is
3813 * established. To be future proof though we handle the possibility
3814 * that there are pending buffers to send (either TX_DATA or
3815 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3816 * buffers according to the just learned write_seq, and then we send
3817 * them on their way.
/* NOTE(review): the two calls below are Linux residue (fixup_pending_
 * writeq_buffers/sk_write_space) — presumably inside an elided #ifdef. */
3819 fixup_pending_writeq_buffers(sk);
3820 if (t3_push_frames(so, 1))
3821 sk->sk_write_space(sk);
3824 toep->tp_state = tp->t_state;
3825 V_tcpstat.tcps_connects++;
3830 * Process a CPL_ACT_ESTABLISH message.
/*
 * Active open completed: swap the temporary ATID for the real TID, record
 * the HW queue set, and finish socket establishment under the inpcb lock.
 */
3833 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3835 struct cpl_act_establish *req = cplhdr(m);
3836 unsigned int tid = GET_TID(req);
3837 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3838 struct toepcb *toep = (struct toepcb *)ctx;
3839 struct tcpcb *tp = toep->tp_tp;
3841 struct toedev *tdev;
3845 free_atid(cdev, atid);
3848 inp_wlock(tp->t_inpcb);
3853 so = inp_inpcbtosocket(tp->t_inpcb);
3854 tdev = toep->tp_toedev; /* blow up here if link was down */
3858 * It's OK if the TID is currently in use, the owning socket may have
3859 * backlogged its last CPL message(s). Just take it away.
3863 so_insert_tid(d, toep, tid);
3864 free_atid(cdev, atid);
3865 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3867 socket_act_establish(so, m);
3868 inp_wunlock(tp->t_inpcb);
3869 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3870 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3876 * Process an acknowledgment of WR completion. Advance snd_una and send the
3877 * next batch of work requests from the write queue.
/*
 * FIX(review): the log() call for the unexpected-sequence case below was
 * written as `log(LOG_ERR "%s: ..."` — missing the comma after the priority
 * argument, so the format string concatenated onto the LOG_ERR macro.  That
 * cannot compile if this (apparently conditionally compiled) path is ever
 * enabled; the comma is now in place.  No other code is changed.
 */
3880 wr_ack(struct toepcb *toep, struct mbuf *m)
3882 struct tcpcb *tp = toep->tp_tp;
3883 struct cpl_wr_ack *hdr = cplhdr(m);
3885 unsigned int credits = ntohs(hdr->credits);
3886 u32 snd_una = ntohl(hdr->snd_una);
3888 struct sockbuf *snd;
3890 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3892 inp_wlock(tp->t_inpcb);
3893 so = inp_inpcbtosocket(tp->t_inpcb);
/* Return the acked WR credits and clamp the unacked count accordingly. */
3894 toep->tp_wr_avail += credits;
3895 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3896 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
/* Walk the pending-WR list, retiring fully acked requests. */
3899 struct mbuf *p = peek_wr(toep);
3901 if (__predict_false(!p)) {
3902 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3903 "nothing pending, state %u wr_avail=%u\n",
3904 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3908 "wr_ack: p->credits=%d p->bytes=%d",
3909 p->m_pkthdr.csum_data, p->m_pkthdr.len);
3910 KASSERT(p->m_pkthdr.csum_data != 0,
3911 ("empty request still on list"));
/* Partial ack of the head WR: consume its credits and stop. */
3913 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3916 struct tx_data_wr *w = cplhdr(p);
3918 "TID %u got %u WR credits, need %u, len %u, "
3919 "main body %u, frags %u, seq # %u, ACK una %u,"
3920 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3921 toep->tp_tid, credits, p->csum, p->len,
3922 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3923 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3924 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3926 p->m_pkthdr.csum_data -= credits;
3930 credits -= p->m_pkthdr.csum_data;
3931 bytes += p->m_pkthdr.len;
3933 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3934 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3941 check_wr_invariants(tp);
/* A snd_una that moves backwards indicates a HW/driver bug. */
3944 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3946 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3948 log(LOG_ERR, "%s: unexpected sequence # %u in WR_ACK "
3949 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3950 toep->tp_tid, tp->snd_una);
3955 if (tp->snd_una != snd_una) {
3956 tp->snd_una = snd_una;
3957 tp->ts_recent_age = ticks;
3960 * Keep ARP entry "minty fresh"
3962 dst_confirm(sk->sk_dst_cache);
/* All data acked: nothing left in flight for the TX-idle heuristic. */
3964 if (tp->snd_una == tp->snd_nxt)
3965 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3968 snd = so_sockbuf_snd(so);
3970 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3971 snd = so_sockbuf_snd(so);
3973 sbdrop_locked(snd, bytes);
3974 so_sowwakeup_locked(so);
/* More unsent data buffered: try to push another batch of WRs. */
3977 if (snd->sb_sndptroff < snd->sb_cc)
3978 t3_push_frames(so, 0);
3981 inp_wunlock(tp->t_inpcb);
3986 * Handler for TX_DATA_ACK CPL messages.
/* Dispatch wrapper: recovers the toepcb from ctx (body partly elided —
 * presumably calls wr_ack(); confirm against the full source). */
3989 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3991 struct toepcb *toep = (struct toepcb *)ctx;
4000 * Handler for TRACE_PKT CPL messages. Just sink these packets.
/* Body elided in this view; per the comment above it only frees the mbuf. */
4003 do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4010 * Reset a connection that is on a listener's SYN queue or accept queue,
4011 * i.e., one that has not had a struct socket associated with it.
4012 * Must be called from process context.
4014 * Modeled after code in inet_csk_listen_stop().
4017 t3_reset_listen_child(struct socket *child)
4019 struct tcpcb *tp = so_sototcpcb(child);
/* Fire an ABORT_REQ at the HW for the child's offload state. */
4021 t3_send_reset(tp->t_toe);
/*
 * Per-socket callback for so_listeners_apply_all(): reset only offloaded
 * (TF_TOE) children, under their inpcb write lock.
 */
4026 t3_child_disconnect(struct socket *so, void *arg)
4028 struct tcpcb *tp = so_sototcpcb(so);
4030 if (tp->t_flags & TF_TOE) {
4031 inp_wlock(tp->t_inpcb);
4032 t3_reset_listen_child(so);
4033 inp_wunlock(tp->t_inpcb);
4038 * Disconnect offloaded established but not yet accepted connections sitting
4039 * on a server's accept_queue. We just send an ABORT_REQ at this point and
4040 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4043 t3_disconnect_acceptq(struct socket *listen_so)
/* NOTE(review): the matching so_lock() is elided in this view. */
4047 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4048 so_unlock(listen_so);
4052 * Reset offloaded connections sitting on a server's syn queue. As above
4053 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4057 t3_reset_synq(struct listen_ctx *lctx)
4059 struct toepcb *toep;
/* Drain the synq: reset, recycle the TID, and drop the synq reference
 * for each embryonic connection. */
4062 while (!LIST_EMPTY(&lctx->synq_head)) {
4063 toep = LIST_FIRST(&lctx->synq_head);
4064 LIST_REMOVE(toep, synq_entry);
4066 t3_send_reset(toep);
4067 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4068 toepcb_release(toep);
4070 so_unlock(lctx->lso);
/*
 * Write 'nppods' DDP page pods into adapter memory, one ULP_MEM_WRITE work
 * request per pod, starting at the address derived from 'tag'.  The last
 * NUM_SENTINEL_PPODS pods are written invalid as terminators.
 */
4075 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4076 unsigned int nppods, unsigned int tag, unsigned int maxoff,
4077 unsigned int pg_off, unsigned int color)
4079 unsigned int i, j, pidx;
4082 struct ulp_mem_io *req;
4083 unsigned int tid = toep->tp_tid;
4084 const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4085 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4087 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4088 gl, nppods, tag, maxoff, pg_off, color);
4090 for (i = 0; i < nppods; ++i) {
4091 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4092 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4093 req = mtod(m, struct ulp_mem_io *);
4094 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4095 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
/* Adapter memory is addressed in 32-byte units, hence the >> 5. */
4097 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4098 V_ULPTX_CMD(ULP_MEM_WRITE));
4099 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4100 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4102 p = (struct pagepod *)(req + 1);
/* NOTE(review): __predict_false on the common (non-sentinel) branch looks
 * inverted — presumably a stale hint; behavior is unaffected. */
4103 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4104 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4105 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4106 V_PPOD_COLOR(color));
4107 p->pp_max_offset = htonl(maxoff);
4108 p->pp_page_offset = htonl(pg_off);
/* 5 page addresses per pod, overlapping by one with the next pod. */
4110 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4111 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4112 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4114 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
4115 send_or_defer(toep, m, 0);
4116 ppod_addr += PPOD_SIZE;
/*
 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
 * The ULP_TXPKT header overlays the start of the CPL message buffer.
 */
mk_cpl_barrier_ulp(struct cpl_barrier *b)
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	/* Message length in 8-byte flits. */
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
	b->opcode = CPL_BARRIER;
4135 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4138 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4140 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4142 txpkt = (struct ulp_txpkt *)req;
4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4145 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4146 req->cpuno = htons(cpuno);
/*
 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
 * Updates the bits selected by 'mask' in TCB word 'word' of connection
 * 'tid' to the corresponding bits of 'val'.
 */
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
    unsigned int word, uint64_t mask, uint64_t val)
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    tid, word, mask, val);

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	/* Message length in 8-byte flits. */
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
	/* Do not request a SET_TCB_RPL completion for this write. */
	req->reply = V_NO_REPLY(1);
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);
/*
 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
 * Returns 'credits' bytes of receive window credit for connection 'tid'
 * and refreshes the delayed-ACK mode from the per-device 'delack' tunable.
 */
mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
    unsigned int tid, unsigned int credits)
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	/* Message length in 8-byte flits. */
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
	OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
	ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
	    V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
	    V_RX_CREDITS(credits));
/*
 * Cancel a hardware DDP buffer: build a single compound work request that
 * fences with CPL_BARRIER, rewrites the TCB RX_DDP_FLAGS to deactivate the
 * buffer, reads the TCB back (CPL_GET_TCB) so completion can be observed,
 * and fences again.
 *
 * NOTE(review): 'm' (struct mbuf *) and 'wrlen' are used below but their
 * declarations are not visible in this excerpt; the wrlen expression is
 * also truncated here.
 */
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

	/* Caller must hold the receive socket-buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);

	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	/* Leading barrier keeps the following CPLs ordered. */
	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/*
	 * Hmmm, not sure if this is actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already.  However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no outstanding data.
	 */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_ACTIVE_BUF(1) |
	    V_TF_DDP_BUF0_VALID(1),
	    V_TF_DDP_ACTIVE_BUF(1));
	/* NOTE(review): code selecting between / advancing 'req' for these
	 * two SET_TCB_FIELD alternatives (likely keyed on bufidx) is not
	 * visible in this excerpt — confirm against the full source. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_ACTIVE_BUF(1) |
	    V_TF_DDP_BUF1_VALID(1), 0);

	/* Read the TCB back on this connection's queue set. */
	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Trailing barrier closes the ordered sequence. */
	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 * (continuation, presumably via 'p', is not visible here). */

	T3_TRACE1(TIDTB(so),
	    "t3_cancel_ddpbuf: bufidx %u", bufidx);

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
 * @toep: the offloaded connection owning the buffers
 * @bufidx: index of HW DDP buffer (0 or 1)
 * @tag0: new tag for HW buffer 0
 * @tag1: new tag for HW buffer 1
 * @len: new length for HW buf @bufidx
 *
 * Sends a compound WR to overlay a new DDP buffer on top of an existing
 * buffer by changing the buffer tag and length and setting the valid and
 * active flag accordingly.  The caller must ensure the new buffer is at
 * least as big as the existing one.  Since we typically reprogram both HW
 * buffers this function sets both tags for convenience.  Read the TCB to
 * determine how much data was written into the buffer before the overlay
 * took effect.
 *
 * NOTE(review): 'm' (struct mbuf *) and 'wrlen' are used below but their
 * declarations are not visible in this excerpt.
 */
t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
    unsigned int tag1, unsigned int len)
	struct work_request_hdr *wr;
	struct cpl_get_tcb *getreq;
	struct cpl_set_tcb_field *req;
	struct ddp_state *p = &toep->tp_ddp_state;

	/* NOTE(review): trace label is wrong — should read
	 * "t3_overlay_ddpbuf", not "t3_setup_ppods". */
	CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
	    bufidx, tag0, tag1, len);

	/* Caller must hold the receive socket-buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);

	wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	m->m_pkthdr.len = m->m_len = wrlen;

	/*
	 * Set the ATOMIC flag to make sure that TP processes the following
	 * CPLs in an atomic manner and no wire segments can be interleaved.
	 */
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
	req = (struct cpl_set_tcb_field *)(wr + 1);
	/* Reprogram both buffer tags in one 64-bit TCB write. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
	    V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
	    V_TCB_RX_DDP_BUF0_TAG(tag0) |
	    V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
	/* NOTE(review): code advancing 'req' and branching on bufidx between
	 * the buffer-0 and buffer-1 alternatives below is not visible in
	 * this excerpt — confirm against the full source. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
	    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
	    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_PUSH_DISABLE_0(1) |
	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
	    V_TF_DDP_PUSH_DISABLE_0(0) |
	    V_TF_DDP_BUF0_VALID(1));
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
	    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
	    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
	    V_TF_DDP_PUSH_DISABLE_1(1) |
	    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
	    V_TF_DDP_PUSH_DISABLE_1(0) |
	    V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));

	/* Read the TCB back so the caller can see pre-overlay progress. */
	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	/* Keep track of the number of outstanding CPL_GET_TCB requests
	 * (continuation, presumably via 'p', is not visible here). */

	/* NOTE(review): 'sk' looks like a stale Linux-ism; the T3_TRACE
	 * macros are presumably compiled out — verify. */
	T3_TRACE4(TIDTB(sk),
	    "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
	    bufidx, tag0, tag1, len);

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 *
 * NOTE(review): 'm' (struct mbuf *) and 'wrlen' are used below but their
 * declarations are not visible in this excerpt.
 */
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
    unsigned int len1, unsigned int offset1,
    uint64_t ddp_flags, uint64_t flag_mask, int modulate)
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

	/* Caller must hold the receive socket-buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);

	/* Size the WR for only the pieces we will actually emit. */
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
	    (len1 ? sizeof(*req) : 0) +
	    (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);
	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {	/* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
	if (len1) {	/* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
		    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);

	/* Apply the caller-supplied DDP flag updates under flag_mask.
	 * NOTE(review): the value argument (presumably ddp_flags) and the
	 * intervening 'req' advances / "if (modulate)" guard are not visible
	 * in this excerpt. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
		mk_rx_data_ack_ulp(toep,
		    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
		    toep->tp_copied_seq - toep->tp_rcv_wup);
		/* Credits returned: advance the receive-window update mark. */
		toep->tp_rcv_wup = toep->tp_copied_seq;

	/* NOTE(review): 'sk' looks like a stale Linux-ism; the T3_TRACE
	 * macros are presumably compiled out — verify. */
	T3_TRACE5(TIDTB(sk),
	    "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
	    len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Populate the mbuf_wrs[] lookup table: entry i holds the number of work
 * requests needed to send an mbuf chain of i DMA segments, given a WR
 * payload capacity of 'wr_len' flits.  Idempotent: returns early if the
 * table was already filled in.
 *
 * NOTE(review): the declaration of 'i' and the early-return statement are
 * not visible in this excerpt.
 */
t3_init_wr_tab(unsigned int wr_len)

	if (mbuf_wrs[1]) /* already initialized */

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* SGL flits for i segments: ceil(3*i/2). */
		int sgl_len = (3 * i) / 2 + (i & 1);

		/* One WR if the SGL fits, otherwise spill the remainder
		 * across continuation WRs of (wr_len - 1) flits each. */
		mbuf_wrs[i] = sgl_len <= wr_len ?
		    1 : 1 + (sgl_len - 2) / (wr_len - 1);
/*
 * Module initialization: register this file's handler functions for each
 * CPL opcode so offload events from the T3 adapter are dispatched here.
 *
 * NOTE(review): the alloc_skb/skb_put/h.raw block below is Linux sk_buff
 * code that cannot compile in a FreeBSD kernel; it is presumably guarded
 * by an #ifdef that is not visible in this excerpt — confirm against the
 * full source.
 */
t3_init_cpl_io(void)
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	    "Chelsio TCP offload: can't allocate sk_buff\n");
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);

	/* Connection setup, teardown, data path and diagnostics handlers. */
	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);