1 /**************************************************************************
3 Copyright (c) 2007-2008, Chelsio Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/fcntl.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
41 #include <sys/mutex.h>
42 #include <sys/sockstate.h>
43 #include <sys/sockopt.h>
44 #include <sys/socket.h>
45 #include <sys/sockbuf.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/protosw.h>
51 #if __FreeBSD_version >= 800044
52 #include <sys/vimage.h>
54 #define V_tcp_do_autosndbuf tcp_do_autosndbuf
55 #define V_tcp_autosndbuf_max tcp_autosndbuf_max
56 #define V_tcp_do_rfc1323 tcp_do_rfc1323
57 #define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
58 #define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
59 #define V_tcpstat tcpstat
63 #include <net/route.h>
65 #include <netinet/in.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/in_systm.h>
68 #include <netinet/in_var.h>
71 #include <cxgb_osdep.h>
72 #include <sys/mbufq.h>
74 #include <netinet/ip.h>
75 #include <netinet/tcp_var.h>
76 #include <netinet/tcp_fsm.h>
77 #include <netinet/tcp_offload.h>
78 #include <netinet/tcp_seq.h>
79 #include <netinet/tcp_syncache.h>
80 #include <netinet/tcp_timer.h>
81 #include <net/route.h>
84 #include <common/cxgb_firmware_exports.h>
85 #include <common/cxgb_t3_cpl.h>
86 #include <common/cxgb_tcb.h>
87 #include <common/cxgb_ctl_defs.h>
88 #include <cxgb_offload.h>
91 #include <machine/bus.h>
93 #include <ulp/toecore/cxgb_toedev.h>
94 #include <ulp/tom/cxgb_l2t.h>
95 #include <ulp/tom/cxgb_defs.h>
96 #include <ulp/tom/cxgb_tom.h>
97 #include <ulp/tom/cxgb_t3_ddp.h>
98 #include <ulp/tom/cxgb_toepcb.h>
99 #include <ulp/tom/cxgb_tcp.h>
100 #include <ulp/tom/cxgb_tcp_offload.h>
103 * For ULP connections HW may add headers, e.g., for digests, that aren't part
104 * of the messages sent by the host but that are part of the TCP payload and
105 * therefore consume TCP sequence space. Tx connection parameters that
106 * operate in TCP sequence space are affected by the HW additions and need to
107 * compensate for them to accurately track TCP sequence numbers. This array
108 * contains the compensating extra lengths for ULP packets. It is indexed by
109 * a packet's ULP submode.
111 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
115 * This sk_buff holds a fake header-only TCP segment that we use whenever we
116 * need to exploit SW TCP functionality that expects TCP headers, such as
117 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
118 * CPUs without locking.
120 static struct mbuf *tcphdr_mbuf __read_mostly;
124 * Size of WRs in bytes. Note that we assume all devices we are handling have
127 static unsigned int wrlen __read_mostly;
130 * The number of WRs needed for an skb depends on the number of page fragments
131 * in the skb and whether it has any payload in its main body. This maps the
132 * length of the gather list represented by an skb into the # of necessary WRs.
134 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
137 * Max receive window supported by HW in bytes. Only a small part of it can
138 * be set through option0, the rest needs to be set through RX_DATA_ACK.
140 #define MAX_RCV_WND ((1U << 27) - 1)
143 * Min receive window. We want it to be large enough to accommodate receive
144 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
146 #define MIN_RCV_WND (24 * 1024U)
147 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
149 #define VALIDATE_SEQ 0
150 #define VALIDATE_SOCK(so)
153 #define TCP_TIMEWAIT 1
157 extern int tcp_do_autorcvbuf;
158 extern int tcp_do_autosndbuf;
159 extern int tcp_autorcvbuf_max;
160 extern int tcp_autosndbuf_max;
162 static void t3_send_reset(struct toepcb *toep);
163 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
164 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
165 static void handle_syncache_event(int event, void *arg);
/*
 * Debug wrapper around sbappendstream_locked(): walks the chains before and
 * after the append asserting that each mbuf either carries no external
 * storage or uses EXT_EXTREF, and that m_next was not left poisoned.
 * Requires the sockbuf to have SB_NOCOALESCE set (offload sockbufs set it
 * in init_offload_socket()).  Caller must hold the sockbuf lock.
 */
168 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
174 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
175 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
176 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
177 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
178 m->m_next, m->m_nextpkt, m->m_flags));
/* Same invariants re-checked on the chain being appended. */
183 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
184 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
185 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
186 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
187 m->m_next, m->m_nextpkt, m->m_flags));
190 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
191 sbappendstream_locked(sb, n);
/* Post-append sanity pass over the resulting sockbuf chain. */
195 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
196 m->m_next, m->m_nextpkt, m->m_flags));
/* Returns nonzero iff the TOE device is a rev-A T3 (TOE_ID_CHELSIO_T3). */
202 is_t3a(const struct toedev *dev)
204 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
/* Debug helper: DPRINTF the key scheduling/WR fields of an offload PCB. */
208 dump_toepcb(struct toepcb *toep)
210 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
211 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
212 toep->tp_mtu_idx, toep->tp_tid);
214 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
215 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
216 toep->tp_mss_clamp, toep->tp_flags);
219 #ifndef RTALLOC2_DEFINED
/*
 * Compatibility shim: route lookup via rtalloc1() for kernels that do not
 * already provide rtalloc2().
 */
220 static struct rtentry *
221 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
223 struct rtentry *rt = NULL;
225 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
233 * Determine whether to send a CPL message now or defer it. A message is
234 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
235 * For connections in other states the message is sent immediately.
236 * If through_l2t is set the message is subject to ARP processing, otherwise
237 * it is sent directly.
240 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
242 struct tcpcb *tp = toep->tp_tp;
244 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
/* Queue under the inpcb lock; drained once the TID is known. */
245 inp_wlock(tp->t_inpcb);
246 mbufq_tail(&toep->out_of_order_queue, m); // defer
247 inp_wunlock(tp->t_inpcb);
248 } else if (through_l2t)
249 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
251 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
/*
 * Build the priority value passed to m_set_priority() for CPL mbufs.
 * NOTE(review): body not visible in this listing; presumably combines the
 * CPL priority class with the connection's queue set — confirm in full source.
 */
254 static inline unsigned int
255 mkprio(unsigned int cntrl, const struct toepcb *toep)
261 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
264 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
266 struct cpl_tid_release *req;
/* Setup-priority message; fills the WR header and the CPL opcode/TID. */
268 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
269 m->m_pkthdr.len = m->m_len = sizeof(*req);
270 req = mtod(m, struct cpl_tid_release *);
271 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
273 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
/*
 * Fill in a TX_DATA work request header at the front of mbuf m for a send of
 * 'len' payload bytes (len includes any HW ULP additions).  On the first send
 * of a connection the WR also carries init parameters (ack pages, CPU index,
 * send-buffer size in 32KB units).  Caller holds the inpcb lock.
 */
277 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
279 INIT_VNET_INET(so->so_vnet);
280 struct tcpcb *tp = so_sototcpcb(so);
281 struct toepcb *toep = tp->t_toe;
282 struct tx_data_wr *req;
285 inp_lock_assert(tp->t_inpcb);
286 snd = so_sockbuf_snd(so);
288 req = mtod(m, struct tx_data_wr *);
289 m->m_len = sizeof(*req);
290 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
291 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
292 /* len includes the length of any HW ULP additions */
293 req->len = htonl(len);
294 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
295 /* V_TX_ULP_SUBMODE sets both the mode and submode */
296 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
297 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
298 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
300 req->sndseq = htonl(tp->snd_nxt);
/* First data on this connection: attach one-time init parameters. */
301 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
302 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
303 V_TX_CPU_IDX(toep->tp_qset));
305 /* Sendbuffer is in units of 32KB.
307 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
308 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
310 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
313 toep->tp_flags |= TP_DATASENT;
317 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
/*
 * Push pending send-socket-buffer data to the adapter as TX_DATA work
 * requests, consuming WR credits (toep->tp_wr_avail).  Small mbufs
 * (<= IMM_LEN) are copied inline into the WR; larger chains are described by
 * a gather list of up to TX_MAX_SEGS segments.  Tracks the sockbuf send
 * pointer/offset, accounts credits in m_pkthdr.csum_data, requests a WR
 * completion when asked or when half the credits are outstanding, and hands
 * the WR to the L2T for transmit.  Returns the number of payload bytes
 * pushed.  Caller holds the inpcb lock.
 */
320 t3_push_frames(struct socket *so, int req_completion)
322 struct tcpcb *tp = so_sototcpcb(so);
323 struct toepcb *toep = tp->t_toe;
325 struct mbuf *tail, *m0, *last;
328 int state, bytes, count, total_bytes;
329 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
/* Nothing can be sent before the connection is established or after close. */
332 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
333 DPRINTF("tcp state=%d\n", tp->t_state);
337 state = so_state_get(so);
339 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
340 DPRINTF("disconnecting\n");
345 inp_lock_assert(tp->t_inpcb);
347 snd = so_sockbuf_snd(so);
350 d = TOM_DATA(toep->tp_toedev);
353 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
356 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
357 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
359 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
360 KASSERT(tail, ("sbdrop error"));
361 last = tail = tail->m_next;
364 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
365 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
371 toep->tp_m_last = NULL;
/* Main loop: one WR per iteration while credits and data remain. */
372 while (toep->tp_wr_avail && (tail != NULL)) {
375 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
380 * If the data in tail fits as in-line, then
381 * make an immediate data wr.
383 if (tail->m_len <= IMM_LEN) {
390 make_tx_data_wr(so, m0, bytes, tail);
391 m_append(m0, bytes, mtod(last, caddr_t));
392 KASSERT(!m0->m_next, ("bad append"));
/* Gather-list path: accumulate segments while WR credits allow. */
394 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
395 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
396 bytes += tail->m_len;
400 * technically an abuse to be using this for a VA
401 * but less gross than defining my own structure
402 * or calling pmap_kextract from here :-|
404 segp->ds_addr = (bus_addr_t)tail->m_data;
405 segp->ds_len = tail->m_len;
406 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
407 count, mbuf_wrs[count], tail->m_data, tail->m_len);
411 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
412 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
415 m_set_sgllen(m0, count);
416 make_tx_data_wr(so, m0, bytes, tail);
418 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
421 snd->sb_sndptr = tail;
422 toep->tp_m_last = NULL;
424 toep->tp_m_last = snd->sb_sndptr = last;
427 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
429 snd->sb_sndptroff += bytes;
430 total_bytes += bytes;
431 toep->tp_write_seq += bytes;
432 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
433 " tail=%p sndptr=%p sndptroff=%d",
434 toep->tp_wr_avail, count, mbuf_wrs[count],
435 tail, snd->sb_sndptr, snd->sb_sndptroff);
437 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
438 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
439 total_bytes, toep->tp_m_last, tail->m_data,
442 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
443 " tp_m_last=%p snd_una=0x%08x",
444 total_bytes, toep->tp_m_last, tp->snd_una);
/* Trace the gather list three segments at a time. */
452 while (i < count && m_get_sgllen(m0)) {
453 if ((count - i) >= 3) {
455 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456 " len=%d pa=0x%zx len=%d",
457 segs[i].ds_addr, segs[i].ds_len,
458 segs[i + 1].ds_addr, segs[i + 1].ds_len,
459 segs[i + 2].ds_addr, segs[i + 2].ds_len);
461 } else if ((count - i) == 2) {
463 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
465 segs[i].ds_addr, segs[i].ds_len,
466 segs[i + 1].ds_addr, segs[i + 1].ds_len);
469 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
470 segs[i].ds_addr, segs[i].ds_len);
478 * remember credits used
480 m0->m_pkthdr.csum_data = mbuf_wrs[count];
481 m0->m_pkthdr.len = bytes;
482 toep->tp_wr_avail -= mbuf_wrs[count];
483 toep->tp_wr_unacked += mbuf_wrs[count];
/* Request a completion on demand or once half the credits are in flight. */
485 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
486 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
487 struct work_request_hdr *wr = cplhdr(m0);
489 wr->wr_hi |= htonl(F_WR_COMPL);
490 toep->tp_wr_unacked = 0;
492 KASSERT((m0->m_pkthdr.csum_data > 0) &&
493 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
494 m0->m_pkthdr.csum_data));
495 m0->m_type = MT_DONTFREE;
496 enqueue_wr(toep, m0);
497 DPRINTF("sending offload tx with %d bytes in %d segments\n",
499 l2t_send(cdev, m0, toep->tp_l2t);
502 return (total_bytes);
506 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
507 * under any circumstances. We take the easy way out and always queue the
508 * message to the write_queue. We can optimize the case where the queue is
509 * already empty though the optimization is probably not worth it.
512 close_conn(struct socket *so)
515 struct cpl_close_con_req *req;
517 struct inpcb *inp = so_sotoinpcb(so);
524 tp = so_sototcpcb(so);
/* Flush any pending send data first (with a completion request). */
527 if (tp->t_state != TCPS_SYN_SENT)
528 t3_push_frames(so, 1);
/* Only one FIN may be sent per connection. */
530 if (toep->tp_flags & TP_FIN_SENT) {
537 d = TOM_DATA(toep->tp_toedev);
539 m = m_gethdr_nofail(sizeof(*req));
540 m_set_priority(m, CPL_PRIORITY_DATA);
544 toep->tp_flags |= TP_FIN_SENT;
545 req = mtod(m, struct cpl_close_con_req *);
547 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
548 req->wr.wr_lo = htonl(V_WR_TID(tid));
549 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
553 * XXX - need to defer shutdown while there is still data in the queue
556 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
557 cxgb_ofld_send(d->cdev, m);
562 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no-RST variant
 *
 * Rewrites the queued abort so the firmware drops the connection without
 * emitting a RST (the peer is unreachable anyway), then sends it directly.
566 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
568 struct cpl_abort_req *req = cplhdr(m);
570 req->cmd = CPL_ABORT_NO_RST;
571 cxgb_ofld_send(cdev, m);
575 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
576 * permitted to return without sending the message in case we cannot allocate
577 * an mbuf. Returns the number of credits sent.
580 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
583 struct cpl_rx_data_ack *req;
584 struct toepcb *toep = tp->t_toe;
585 struct toedev *tdev = toep->tp_toedev;
587 m = m_gethdr_nofail(sizeof(*req));
589 DPRINTF("returning %u credits to HW\n", credits);
591 req = mtod(m, struct cpl_rx_data_ack *);
592 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
594 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* dack carries any delayed-ACK mode change flags to merge with the credits. */
595 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
596 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
597 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
602 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
603 * This is only used in DDP mode, so we take the opportunity to also set the
604 * DACK mode and flush any Rx credits.
607 t3_send_rx_modulate(struct toepcb *toep)
610 struct cpl_rx_data_ack *req;
612 m = m_gethdr_nofail(sizeof(*req));
614 req = mtod(m, struct cpl_rx_data_ack *);
615 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
617 m->m_pkthdr.len = m->m_len = sizeof(*req);
619 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* Return all credits accumulated since the last window update. */
620 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
622 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
623 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
624 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
625 toep->tp_rcv_wup = toep->tp_copied_seq;
629 * Handle receipt of an urgent pointer.
 *
 * NOTE(review): the entire body is compiled out unless URGENT_DATA_SUPPORTED
 * is defined, and the code inside is unported Linux sk_buff logic (sk,
 * sock_flag, skb_peek) that would not compile on FreeBSD as-is.
632 handle_urg_ptr(struct socket *so, uint32_t urg_seq)
634 #ifdef URGENT_DATA_SUPPORTED
635 struct tcpcb *tp = so_sototcpcb(so);
637 urg_seq--; /* initially points past the urgent data, per BSD */
639 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
640 return; /* duplicate pointer */
642 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
643 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
644 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
647 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
648 tom_eat_skb(sk, skb, 0);
650 tp->urg_data = TCP_URG_NOTYET;
651 tp->urg_seq = urg_seq;
656 * Returns true if a socket cannot accept new Rx data.
659 so_no_receive(const struct socket *so)
661 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
665 * Process an urgent data notification.
 *
 * Drops the notification if the socket is no longer receiving; otherwise
 * forwards the (network-order) urgent sequence to handle_urg_ptr().
668 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
670 struct cpl_rx_urg_notify *hdr = cplhdr(m);
671 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
675 if (!so_no_receive(so))
676 handle_urg_ptr(so, ntohl(hdr->seq));
682 * Handler for RX_URG_NOTIFY CPL messages.
 *
 * ctx is the connection's toepcb, registered with the CPL dispatch table.
685 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
687 struct toepcb *toep = (struct toepcb *)ctx;
689 rx_urg_notify(toep, m);
/*
 * Whether the tunable delayed-ACK mode may be applied to this connection.
 * NOTE(review): with short-circuit evaluation the second clause can only be
 * reached when tp_ulp_mode == 0, so `tp_ulp_mode == ULP_MODE_TCPDDP && ...`
 * is dead unless ULP_MODE_TCPDDP is 0 — confirm the intended condition
 * (likely the T3-revision gate should apply to the DDP case).
 */
694 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
696 return (toep->tp_ulp_mode ||
697 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
698 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
702 * Set of states for which we should return RX credits.
704 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
707 * Called after some received data has been read. It returns RX credits
708 * to the HW for the amount of data processed.
711 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
713 struct toepcb *toep = tp->t_toe;
716 int dack_mode, must_send, read;
717 u32 thres, credits, dack = 0;
720 so = inp_inpcbtosocket(tp->t_inpcb);
721 rcv = so_sockbuf_rcv(so);
/* Only return credits in ESTABLISHED/FIN_WAIT_1/FIN_WAIT_2. */
723 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
724 (tp->t_state == TCPS_FIN_WAIT_2))) {
727 toep->tp_copied_seq += copied;
734 inp_lock_assert(tp->t_inpcb);
738 toep->tp_copied_seq += copied;
/* Infer bytes consumed from the drop in enqueued-byte accounting. */
740 read = toep->tp_enqueued_bytes - rcv->sb_cc;
741 toep->tp_copied_seq += read;
743 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
744 toep->tp_enqueued_bytes = rcv->sb_cc;
/* Clamp runaway credit counts (indicates an accounting bug). */
747 if (credits > rcv->sb_mbmax) {
748 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
749 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
750 credits = rcv->sb_mbmax;
755 * XXX this won't accurately reflect credit return - we need
756 * to look at the difference between the amount that has been
757 * put in the recv sockbuf and what is there now
760 if (__predict_false(!credits))
763 dev = toep->tp_toedev;
764 thres = TOM_TUNABLE(dev, rx_credit_thres);
766 if (__predict_false(thres == 0))
/* Piggy-back a delayed-ACK mode change when the tunable differs. */
769 if (is_delack_mode_valid(dev, toep)) {
770 dack_mode = TOM_TUNABLE(dev, delack);
771 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
772 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
774 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
775 dack = F_RX_DACK_CHANGE |
776 V_RX_DACK_MODE(dack_mode);
779 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
782 * For coalescing to work effectively ensure the receive window has
783 * at least 16KB left.
785 must_send = credits + 16384 >= tp->rcv_wnd;
787 if (must_send || credits >= thres)
788 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
/* toe_usrreqs disconnect hook: initiate an offloaded close for tp's socket. */
792 cxgb_toe_disconnect(struct tcpcb *tp)
796 DPRINTF("cxgb_toe_disconnect\n");
798 so = inp_inpcbtosocket(tp->t_inpcb);
/* toe_usrreqs reset hook: abort the offloaded connection and clear TF_TOE. */
804 cxgb_toe_reset(struct tcpcb *tp)
806 struct toepcb *toep = tp->t_toe;
813 tp->t_flags &= ~TF_TOE;
/* toe_usrreqs send hook: push pending sockbuf data, requesting completion. */
820 cxgb_toe_send(struct tcpcb *tp)
824 DPRINTF("cxgb_toe_send\n");
825 dump_toepcb(tp->t_toe);
827 so = inp_inpcbtosocket(tp->t_inpcb);
828 t3_push_frames(so, 1);
/* toe_usrreqs rcvd hook: return RX credits after the app consumed data. */
833 cxgb_toe_rcvd(struct tcpcb *tp)
836 inp_lock_assert(tp->t_inpcb);
838 t3_cleanup_rbuf(tp, 0);
/* toe_usrreqs detach hook: tear down offload state and clear TF_TOE. */
844 cxgb_toe_detach(struct tcpcb *tp)
849 * XXX how do we handle teardown in the SYN_SENT state?
852 inp_lock_assert(tp->t_inpcb);
859 tp->t_flags &= ~TF_TOE;
/*
 * TOE user-request dispatch table wired into the tcpcb via install_offload_ops().
 * Fix: `.tu_detach` was listed twice (a duplicate designated initializer —
 * harmless at runtime since the later one overrides, but it draws
 * -Woverride-init warnings and hides real mistakes); the duplicate is removed.
 */
864 static struct toe_usrreqs cxgb_toe_usrreqs = {
865 .tu_disconnect = cxgb_toe_disconnect,
866 .tu_reset = cxgb_toe_reset,
867 .tu_send = cxgb_toe_send,
868 .tu_rcvd = cxgb_toe_rcvd,
869 .tu_detach = cxgb_toe_detach,
871 .tu_syncache_event = handle_syncache_event,
876 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
877 uint64_t mask, uint64_t val, int no_reply)
879 struct cpl_set_tcb_field *req;
881 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
882 toep->tp_tid, word, mask, val);
884 req = mtod(m, struct cpl_set_tcb_field *);
885 m->m_pkthdr.len = m->m_len = sizeof(*req);
886 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
888 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
889 req->reply = V_NO_REPLY(no_reply);
891 req->word = htons(word);
892 req->mask = htobe64(mask);
893 req->val = htobe64(val);
895 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
896 send_or_defer(toep, m, 0);
/*
 * Allocate an mbuf and issue a no-reply SET_TCB_FIELD for this connection.
 * Skipped once the connection is closed or being aborted.
 * NOTE(review): the log string below misspells "setting" — runtime string,
 * left untouched here.
 */
900 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
903 struct tcpcb *tp = toep->tp_tp;
908 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
909 printf("not seting field\n");
913 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
915 __set_tcb_field(toep, m, word, mask, val, 1);
919 * Set one of the t_flags bits in the TCB.
922 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
925 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
929 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 * HW Nagle is enabled iff the socket does not have TF_NODELAY set.
932 t3_set_nagle(struct toepcb *toep)
934 struct tcpcb *tp = toep->tp_tp;
936 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
940 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
943 t3_set_keepalive(struct toepcb *toep, int on_off)
946 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
/* Toggle HW receive coalescing for this connection via the TCB t_flags. */
950 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
952 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
956 t3_set_dack_mss(struct toepcb *toep, int on_off)
959 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
963 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 * The TOS value is read from the connection's inpcb.
966 t3_set_tos(struct toepcb *toep)
968 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
970 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
976 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
977 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
978 * set the PSH bit in the last segment, which would trigger delivery.]
979 * We work around the issue by setting a DDP buffer in a partial placed state,
980 * which guarantees that TP will schedule a timer.
982 #define TP_DDP_TIMER_WORKAROUND_MASK\
983 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
984 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
985 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
986 #define TP_DDP_TIMER_WORKAROUND_VAL\
987 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
988 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
/*
 * Enable or disable DDP on a connection.  When disabling, also apply the
 * TP timer workaround above so receive data is still pushed to the host.
 */
992 t3_enable_ddp(struct toepcb *toep, int on)
996 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
999 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
1001 TP_DDP_TIMER_WORKAROUND_MASK,
1003 TP_DDP_TIMER_WORKAROUND_VAL);
1008 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1010 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1011 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1016 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1020 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1021 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1022 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1023 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1024 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1026 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1027 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1028 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1029 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1030 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
/*
 * Select a congestion-control algorithm by name.  Compiled out unless
 * CONGESTION_CONTROL_SUPPORTED is defined; looks the name up in t3_cong_ops.
 */
1034 t3_set_cong_control(struct socket *so, const char *name)
1036 #ifdef CONGESTION_CONTROL_SUPPORTED
1039 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1040 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1043 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
/*
 * Request a snapshot of this connection's TCB via CPL_GET_TCB, directing the
 * reply to the connection's queue set.  The request is deferred while in
 * SYN_SENT (no TID yet).  Caller holds the inpcb lock.
 */
1050 t3_get_tcb(struct toepcb *toep)
1052 struct cpl_get_tcb *req;
1053 struct tcpcb *tp = toep->tp_tp;
1054 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1059 inp_lock_assert(tp->t_inpcb);
1060 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1061 req = mtod(m, struct cpl_get_tcb *);
1062 m->m_pkthdr.len = m->m_len = sizeof(*req);
1063 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1065 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1066 req->cpuno = htons(toep->tp_qset);
1068 if (tp->t_state == TCPS_SYN_SENT)
1069 mbufq_tail(&toep->out_of_order_queue, m); // defer
1071 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/* Register toep under its TID in the adapter's TID table. */
1076 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1081 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1085 * find_best_mtu - find the entry in the MTU table closest to an MTU
1087 * @mtu: the target MTU
1089 * Returns the index of the value in the MTU table that is closest to but
1090 * does not exceed the target MTU.
1093 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
/* Linear scan; the MTU table is sorted ascending. */
1097 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
/*
 * Choose the HW MTU-table index for a path MTU and set tp->t_maxseg
 * accordingly (the constant 40 is the IPv4 + TCP header overhead).
 * Returns the selected MTU-table index.
 */
1103 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1108 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1111 tp->t_maxseg = pmtu - 40;
1112 if (tp->t_maxseg < td->mtus[0] - 40)
1113 tp->t_maxseg = td->mtus[0] - 40;
1114 idx = find_best_mtu(td, tp->t_maxseg + 40);
1116 tp->t_maxseg = td->mtus[idx] - 40;
1118 idx = find_best_mtu(td, pmtu);
/* Release an active-open TID and drop the toepcb reference it held. */
1124 free_atid(struct t3cdev *cdev, unsigned int tid)
1126 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1129 toepcb_release(toep);
1133 * Release resources held by an offload connection (TID, L2T entry, etc.)
1136 t3_release_offload_resources(struct toepcb *toep)
1138 struct tcpcb *tp = toep->tp_tp;
1139 struct toedev *tdev = toep->tp_toedev;
1140 struct t3cdev *cdev;
1142 unsigned int tid = toep->tp_tid;
1143 struct sockbuf *rcv;
1145 CTR0(KTR_TOM, "t3_release_offload_resources");
1150 cdev = TOEP_T3C_DEV(toep);
1155 t3_release_ddp_resources(toep);
1157 #ifdef CTRL_SKB_CACHE
1158 kfree_skb(CTRL_SKB_CACHE(tp));
1159 CTRL_SKB_CACHE(tp) = NULL;
/* Discard any work requests still awaiting HW acknowledgment. */
1162 if (toep->tp_wr_avail != toep->tp_wr_max) {
1163 purge_wr_queue(toep);
1164 reset_wr_list(toep);
1168 l2t_release(L2DATA(cdev), toep->tp_l2t);
1169 toep->tp_l2t = NULL;
1173 inp_lock_assert(tp->t_inpcb);
1174 so = inp_inpcbtosocket(tp->t_inpcb);
1175 rcv = so_sockbuf_rcv(so);
1177 * cancel any offloaded reads
1182 tp->t_flags &= ~TF_TOE;
1183 if (toep->tp_ddp_state.user_ddp_pending) {
1184 t3_cancel_ubuf(toep, rcv);
1185 toep->tp_ddp_state.user_ddp_pending = 0;
1187 so_sorwakeup_locked(so);
/* SYN_SENT connections still hold an atid rather than a real TID. */
1191 if (toep->tp_state == TCPS_SYN_SENT) {
1192 free_atid(cdev, tid);
1194 __skb_queue_purge(&tp->out_of_order_queue);
1196 } else { // we have TID
1197 cxgb_remove_tid(cdev, toep, tid);
1198 toepcb_release(toep);
1201 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
/*
 * Point an established socket at the TOE: install offload socket ops,
 * mark the tcpcb TF_TOE, and wire in the cxgb toe_usrreqs table.
 */
1206 install_offload_ops(struct socket *so)
1208 struct tcpcb *tp = so_sototcpcb(so);
1210 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1212 t3_install_socket_ops(so);
1213 tp->t_flags |= TF_TOE;
1214 tp->t_tu = &cxgb_toe_usrreqs;
1218 * Determine the receive window scaling factor given a target max
 * receive space.  Capped at MAX_RCV_WND; scaling only applied when
 * RFC 1323 is enabled (max wscale 14).
1222 select_rcv_wscale(int space)
1224 INIT_VNET_INET(so->so_vnet);
1227 if (space > MAX_RCV_WND)
1228 space = MAX_RCV_WND;
1230 if (V_tcp_do_rfc1323)
1231 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1237 * Determine the receive window size for a socket.
1239 static unsigned long
1240 select_rcv_wnd(struct toedev *dev, struct socket *so)
1242 INIT_VNET_INET(so->so_vnet);
1243 struct tom_data *d = TOM_DATA(dev);
1245 unsigned int max_rcv_wnd;
1246 struct sockbuf *rcv;
1248 rcv = so_sockbuf_rcv(so);
/* Auto-sizing sockets get the autorcvbuf maximum, others their hiwat. */
1250 if (V_tcp_do_autorcvbuf)
1251 wnd = V_tcp_autorcvbuf_max;
1253 wnd = rcv->sb_hiwat;
1258 * For receive coalescing to work effectively we need a receive window
1259 * that can accommodate a coalesced segment.
1261 if (wnd < MIN_RCV_WND)
/* Pre-T3C parts cap the window relative to the RX page size. */
1265 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1266 (uint32_t)d->rx_page_size * 23 :
1269 return min(wnd, max_rcv_wnd);
1273 * Assign offload parameters to some socket fields. This code is used by
1274 * both active and passive opens.
1277 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1278 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1280 struct tcpcb *tp = so_sototcpcb(so);
1281 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1282 struct sockbuf *snd, *rcv;
1285 SOCK_LOCK_ASSERT(so);
1288 snd = so_sockbuf_snd(so);
1289 rcv = so_sockbuf_rcv(so);
1291 log(LOG_INFO, "initializing offload socket\n");
1293 * We either need to fix push frames to work with sbcompress
1294 * or we need to add this
1296 snd->sb_flags |= SB_NOCOALESCE;
1297 rcv->sb_flags |= SB_NOCOALESCE;
1301 toep->tp_toedev = dev;
/* Start with the full per-connection WR credit allowance. */
1305 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1306 toep->tp_wr_unacked = 0;
1307 toep->tp_delack_mode = 0;
1309 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1314 tp->rcv_wnd = select_rcv_wnd(dev, so);
/* DDP only when enabled, not opted out, and the window is big enough. */
1316 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1317 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1318 toep->tp_qset_idx = 0;
1320 reset_wr_list(toep);
1321 DPRINTF("initialization done\n");
1325 * The next two functions calculate the option 0 value for a socket.
 * High word: Nagle, keepalive, TCAM bypass, window scale, MSS index.
1327 static inline unsigned int
1328 calc_opt0h(struct socket *so, int mtu_idx)
1330 struct tcpcb *tp = so_sototcpcb(so);
1331 int wscale = select_rcv_wscale(tp->rcv_wnd);
1333 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1334 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1335 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
/* Option-0 low word: TOS, ULP mode, receive buffer size in KB (capped). */
1338 static inline unsigned int
1339 calc_opt0l(struct socket *so, int ulp_mode)
1341 struct tcpcb *tp = so_sototcpcb(so);
1344 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1345 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1347 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
/* Option-2 word: congestion-control flavor, taken from the cong_alg tunable. */
1351 static inline unsigned int
1352 calc_opt2(const struct socket *so, struct toedev *dev)
1356 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1358 return (V_FLAVORS_VALID(flv_valid) |
1359 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
/*
 * Sum the WR credits (stored in m_pkthdr.csum_data by t3_push_frames)
 * of all work requests still queued awaiting HW acknowledgment.
 */
1364 count_pending_wrs(const struct toepcb *toep)
1366 const struct mbuf *m;
1369 wr_queue_walk(toep, m)
1370 n += m->m_pkthdr.csum_data;
1376 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1380 mk_act_open_req(struct socket *so, struct mbuf *m,
1381 unsigned int atid, const struct l2t_entry *e)
1383 struct cpl_act_open_req *req;
1384 struct inpcb *inp = so_sotoinpcb(so);
1385 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1386 struct toepcb *toep = tp->t_toe;
1387 struct toedev *tdev = toep->tp_toedev;
1389 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1391 req = mtod(m, struct cpl_act_open_req *);
1392 m->m_pkthdr.len = m->m_len = sizeof(*req);
1394 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1396 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1397 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1399 req->local_port = inp->inp_lport;
1400 req->peer_port = inp->inp_fport;
1401 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1402 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1404 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1405 V_TX_CHANNEL(e->smt_idx));
1406 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1408 req->opt2 = htonl(calc_opt2(so, tdev));
/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		/* The hardware found our 4-tuple already in its TCAM. */
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}
1436 fail_act_open(struct toepcb *toep, int errno)
1438 struct tcpcb *tp = toep->tp_tp;
1440 t3_release_offload_resources(toep);
1442 inp_wunlock(tp->t_inpcb);
1443 tcp_offload_drop(tp, errno);
1447 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1452 * Handle active open failures.
1455 active_open_failed(struct toepcb *toep, struct mbuf *m)
1457 struct cpl_act_open_rpl *rpl = cplhdr(m);
1460 if (toep->tp_tp == NULL)
1463 inp = toep->tp_tp->t_inpcb;
1466 * Don't handle connection retry for now
1469 struct inet_connection_sock *icsk = inet_csk(sk);
1471 if (rpl->status == CPL_ERR_CONN_EXIST &&
1472 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1473 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1474 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1481 * drops the inpcb lock
1483 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1491 * Return whether a failed active open has allocated a TID
1494 act_open_has_tid(int status)
1496 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1497 status != CPL_ERR_ARP_MISS;
1501 * Process an ACT_OPEN_RPL CPL message.
1504 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1506 struct toepcb *toep = (struct toepcb *)ctx;
1507 struct cpl_act_open_rpl *rpl = cplhdr(m);
1509 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1510 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1512 active_open_failed(toep, m);
1517 * Handle an ARP failure for an active open. XXX purge ofo queue
1519 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1520 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1521 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1522 * free the atid. Hmm.
1526 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1528 struct toepcb *toep = m_get_toep(m);
1529 struct tcpcb *tp = toep->tp_tp;
1530 struct inpcb *inp = tp->t_inpcb;
1534 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1536 * drops the inpcb lock
1538 fail_act_open(so, EHOSTUNREACH);
1539 printf("freeing %p\n", m);
1547 * Send an active open request.
/*
 * NOTE(review): this extract is garbled -- the function braces, the
 * declarations of `m' and `atid', the error checks after toepcb_alloc()/
 * t3_l2t_get(), and the error-path labels appear to be missing.  Diff
 * against the complete cxgb_cpl_io.c before editing.
 */
1550 t3_connect(struct toedev *tdev, struct socket *so,
1551 struct rtentry *rt, struct sockaddr *nam)
1554 struct l2t_entry *e;
1555 struct tom_data *d = TOM_DATA(tdev);
1556 struct inpcb *inp = so_sotoinpcb(so);
1557 struct tcpcb *tp = intotcpcb(inp);
1558 struct toepcb *toep; /* allocated by init_offload_socket */
/* Allocate the offload control block and a hardware ATID for the SYN. */
1562 toep = toepcb_alloc();
1566 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
/* Resolve an L2 table entry for the next hop. */
1569 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1573 inp_lock_assert(inp);
/* M_WAITOK: may sleep, cannot return NULL. */
1574 m = m_gethdr(MT_DATA, M_WAITOK);
1577 m->m_toe.mt_toepcb = tp->t_toe;
1578 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
/* Bind the socket to the TOE device, then build and send the open request. */
1582 init_offload_socket(so, tdev, atid, e, rt, toep);
1584 install_offload_ops(so);
1586 mk_act_open_req(so, m, atid, e);
1591 m_set_toep(m, tp->t_toe);
1593 toep->tp_state = TCPS_SYN_SENT;
1594 l2t_send(d->cdev, (struct mbuf *)m, e);
/* tp_ulp_mode was set to ULP_MODE_TCPDDP (or 0) by init_offload_socket. */
1596 if (toep->tp_ulp_mode)
1597 t3_enable_ddp(toep, 0);
/* Error path: release the ATID and fail the connect. */
1601 printf("failing connect - free atid\n");
1603 free_atid(d->cdev, atid);
1605 printf("return ENOMEM\n");
1610 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1611 * not send multiple ABORT_REQs for the same connection and also that we do
1612 * not try to send a message after the connection has closed. Returns 1 if
1613 * an ABORT_REQ wasn't generated after all, 0 otherwise.
/*
 * NOTE(review): garbled extract -- braces, the `struct mbuf *m' declaration,
 * the NULL-tp guards, the send-queue flush, the req->cmd assignment and the
 * else branch of the final conditional appear to be missing.  Compare with
 * the full source before editing.
 */
1616 t3_send_reset(struct toepcb *toep)
1619 struct cpl_abort_req *req;
1620 unsigned int tid = toep->tp_tid;
1621 int mode = CPL_ABORT_SEND_RST;
1622 struct tcpcb *tp = toep->tp_tp;
1623 struct toedev *tdev = toep->tp_toedev;
1624 struct socket *so = NULL;
1626 struct sockbuf *snd;
1629 inp_lock_assert(tp->t_inpcb);
1630 so = inp_inpcbtosocket(tp->t_inpcb);
/* Do not abort twice, and do not abort after shutdown. */
1633 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1636 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1638 snd = so_sockbuf_snd(so);
1639 /* Purge the send queue so we don't send anything after an abort. */
/* On T3A a post-close abort needs an explicit mode flag. */
1642 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1643 mode |= CPL_ABORT_POST_CLOSE_REQ;
1645 m = m_gethdr_nofail(sizeof(*req));
1646 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1647 set_arp_failure_handler(m, abort_arp_failure);
1649 req = mtod(m, struct cpl_abort_req *);
1650 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1651 req->wr.wr_lo = htonl(V_WR_TID(tid));
1652 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1653 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1654 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
/* While still in SYN_SENT the abort must wait for the open to complete. */
1656 if (tp && (tp->t_state == TCPS_SYN_SENT))
1657 mbufq_tail(&toep->out_of_order_queue, m); // defer
1659 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1663 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1668 if (sopt->sopt_name == IP_OPTIONS)
1669 return (ENOPROTOOPT);
1671 if (sopt->sopt_name != IP_TOS)
1672 return (EOPNOTSUPP);
1674 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1679 if (optval > IPTOS_PREC_CRITIC_ECP)
1682 inp = so_sotoinpcb(so);
1684 inp_ip_tos_set(inp, optval);
1686 inp->inp_ip_tos = optval;
1688 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
/*
 * NOTE(review): garbled extract -- braces, the declarations of err/optval/
 * oldval/copied/tp/inp, several error checks and the lock/unlock calls
 * appear to be missing.  Compare with the full source before editing.
 */
1695 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
/* Only TCP_CONGESTION and TCP_NODELAY are handled here. */
1700 if (sopt->sopt_name != TCP_CONGESTION &&
1701 sopt->sopt_name != TCP_NODELAY)
1702 return (EOPNOTSUPP);
1704 if (sopt->sopt_name == TCP_CONGESTION) {
1705 char name[TCP_CA_NAME_MAX];
1706 int optlen = sopt->sopt_valsize;
1709 if (sopt->sopt_dir == SOPT_GET) {
1710 KASSERT(0, ("unimplemented"));
1711 return (EOPNOTSUPP);
/* Copy the algorithm name in from userland, NUL-terminated. */
1717 err = copyinstr(sopt->sopt_val, name,
1718 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1724 tp = so_sototcpcb(so);
1726 * XXX I need to revisit this
/* Push the congestion-control choice down to the hardware. */
1728 if ((err = t3_set_cong_control(so, name)) == 0) {
1729 #ifdef CONGESTION_CONTROL_SUPPORTED
1730 tp->t_cong_control = strdup(name, M_CXGB);
/* TCP_NODELAY path: toggle TF_NODELAY and notify the TOE on change. */
1739 if (sopt->sopt_dir == SOPT_GET)
1740 return (EOPNOTSUPP);
1742 err = sooptcopyin(sopt, &optval, sizeof optval,
1748 inp = so_sotoinpcb(so);
1750 tp = inp_inpcbtotcpcb(inp);
1752 oldval = tp->t_flags;
1754 tp->t_flags |= TF_NODELAY;
1756 tp->t_flags &= ~TF_NODELAY;
1760 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1761 t3_set_nagle(tp->t_toe);
1769 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1773 if (sopt->sopt_level != IPPROTO_TCP)
1774 err = t3_ip_ctloutput(so, sopt);
1776 err = t3_tcp_ctloutput(so, sopt);
1778 if (err != EOPNOTSUPP)
1781 return (tcp_ctloutput(so, sopt));
/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	/* NOTE(review): body reconstructed; always request RST for now. */
	return (1);
}
1795 * Handles Rx data that arrives in a state where the socket isn't accepting
1799 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1802 if (need_rst_on_excess_rx(toep) &&
1803 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1804 t3_send_reset(toep);
1809 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1810 * by getting the DDP offset from the TCB.
/*
 * NOTE(review): heavily garbled extract -- braces, the declarations of
 * tp/so/t/tcb/state, several lock/unlock calls, early returns and #ifdef
 * T3_TRACE guards appear to be missing, and the sk/printk lines are Linux
 * residue.  Compare with the full source before editing.
 */
1813 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1815 struct ddp_state *q = &toep->tp_ddp_state;
1816 struct ddp_buf_state *bsp;
1817 struct cpl_get_tcb_rpl *hdr;
1818 unsigned int ddp_offset;
1821 struct sockbuf *rcv;
1828 so = inp_inpcbtosocket(tp->t_inpcb);
1830 inp_lock_assert(tp->t_inpcb);
1831 rcv = so_sockbuf_rcv(so);
1834 /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1835 * We really need a cookie in order to dispatch the RPLs.
1839 /* It is a possible that a previous CPL already invalidated UBUF DDP
1840 * and moved the cur_buf idx and hence no further processing of this
1841 * skb is required. However, the app might be sleeping on
1842 * !q->get_tcb_count and we need to wake it up.
1844 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1845 int state = so_state_get(so);
1848 if (__predict_true((state & SS_NOFDREF) == 0))
1849 so_sorwakeup_locked(so);
1851 sockbuf_unlock(rcv);
/* Extract the current DDP offset for the active buffer from the raw TCB. */
1856 bsp = &q->buf_state[q->cur_buf];
1858 tcb = (__be64 *)(hdr + 1);
1859 if (q->cur_buf == 0) {
1860 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1861 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1863 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1864 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1866 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
/* The amount newly placed is the offset delta since the last completion. */
1867 m->m_cur_offset = bsp->cur_offset;
1868 bsp->cur_offset = ddp_offset;
1869 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1872 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1873 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1874 KASSERT(ddp_offset >= m->m_cur_offset,
1875 ("ddp_offset=%u less than cur_offset=%u",
1876 ddp_offset, m->m_cur_offset));
/* Debug-only: decode DDP flags, rcv_nxt and header offset from the TCB. */
1880 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1882 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1883 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1885 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1886 rcv_nxt = t >> S_TCB_RCV_NXT;
1887 rcv_nxt &= M_TCB_RCV_NXT;
1889 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1890 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1891 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1893 T3_TRACE2(TIDTB(sk),
1894 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1895 ddp_flags, rcv_nxt - rx_hdr_offset);
1897 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1898 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1900 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1901 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1903 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1904 q->buf_state[0].flags, q->buf_state[1].flags);
/* Data arrived for a socket that can no longer receive: reset and bail. */
1908 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1909 handle_excess_rx(toep, m);
1914 if ((int)m->m_pkthdr.len < 0) {
1915 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1918 if (bsp->flags & DDP_BF_NOCOPY) {
1921 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1923 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1924 printk("!cancel_ubuf");
1925 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1928 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1929 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1931 } else if (bsp->flags & DDP_BF_NOFLIP) {
1933 m->m_ddp_flags = 1; /* always a kernel buffer */
1935 /* now HW buffer carries a user buffer */
1936 bsp->flags &= ~DDP_BF_NOFLIP;
1937 bsp->flags |= DDP_BF_NOCOPY;
1939 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1940 * any new data in which case we're done. If in addition the
1941 * offset is 0, then there wasn't a completion for the kbuf
1942 * and we need to decrement the posted count.
1944 if (m->m_pkthdr.len == 0) {
1945 if (ddp_offset == 0) {
1947 bsp->flags |= DDP_BF_NODATA;
1949 sockbuf_unlock(rcv);
1954 sockbuf_unlock(rcv);
1956 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1957 * but it got here way late and nobody cares anymore.
/* Hand the placed data to the socket as a DDP-tagged mbuf. */
1963 m->m_ddp_gl = (unsigned char *)bsp->gl;
1964 m->m_flags |= M_DDP;
1965 m->m_seq = tp->rcv_nxt;
1966 tp->rcv_nxt += m->m_pkthdr.len;
1967 tp->t_rcvtime = ticks;
1968 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1969 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1970 if (m->m_pkthdr.len == 0) {
1971 q->user_ddp_pending = 0;
/* Wake any reader unless the file descriptor is already gone. */
1976 state = so_state_get(so);
1977 if (__predict_true((state & SS_NOFDREF) == 0))
1978 so_sorwakeup_locked(so);
1980 sockbuf_unlock(rcv);
1984 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1985 * in that case they are similar to DDP completions.
1988 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1990 struct toepcb *toep = (struct toepcb *)ctx;
1992 /* OK if socket doesn't exist */
1994 printf("null toep in do_get_tcb_rpl\n");
1995 return (CPL_RET_BUF_DONE);
1998 inp_wlock(toep->tp_tp->t_inpcb);
1999 tcb_rpl_as_ddp_complete(toep, m);
2000 inp_wunlock(toep->tp_tp->t_inpcb);
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so'
 * declaration, the early return when no new data arrived, the sockbuf
 * lock acquisition and the buffer-flip statement appear to be missing.
 * Compare with the full source before editing.
 */
2006 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2008 struct tcpcb *tp = toep->tp_tp;
2010 struct ddp_state *q;
2011 struct ddp_buf_state *bsp;
2012 struct cpl_rx_data *hdr = cplhdr(m);
2013 unsigned int rcv_nxt = ntohl(hdr->seq);
2014 struct sockbuf *rcv;
/* Nothing was placed beyond what we already consumed. */
2016 if (tp->rcv_nxt == rcv_nxt)
2019 inp_lock_assert(tp->t_inpcb);
2020 so = inp_inpcbtosocket(tp->t_inpcb);
2021 rcv = so_sockbuf_rcv(so);
2024 q = &toep->tp_ddp_state;
2025 bsp = &q->buf_state[q->cur_buf];
2026 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2027 rcv_nxt, tp->rcv_nxt));
/* Bytes DMA'd directly into the DDP buffer since our last rcv_nxt. */
2028 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2029 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2030 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2031 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2034 if ((int)m->m_pkthdr.len < 0) {
2035 t3_ddp_error(so, "handle_ddp_data: neg len");
/* Tag the mbuf as DDP-placed data belonging to the current buffer. */
2038 m->m_ddp_gl = (unsigned char *)bsp->gl;
2039 m->m_flags |= M_DDP;
2040 m->m_cur_offset = bsp->cur_offset;
2041 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2042 if (bsp->flags & DDP_BF_NOCOPY)
2043 bsp->flags &= ~DDP_BF_NOCOPY;
2045 m->m_seq = tp->rcv_nxt;
2046 tp->rcv_nxt = rcv_nxt;
2047 bsp->cur_offset += m->m_pkthdr.len;
2048 if (!(bsp->flags & DDP_BF_NOFLIP))
2051 * For now, don't re-enable DDP after a connection fell out of DDP
2054 q->ubuf_ddp_ready = 0;
2055 sockbuf_unlock(rcv);
2059 * Process new data received for a connection.
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `state' declarations, the mbuf free on bad sequence numbers, the
 * sockbuf lock acquisition and sbappendstream call appear to be missing.
 * Compare with the full source before editing.
 */
2062 new_rx_data(struct toepcb *toep, struct mbuf *m)
2064 struct cpl_rx_data *hdr = cplhdr(m);
2065 struct tcpcb *tp = toep->tp_tp;
2067 struct sockbuf *rcv;
2069 int len = be16toh(hdr->len);
2071 inp_wlock(tp->t_inpcb);
2073 so = inp_inpcbtosocket(tp->t_inpcb);
/* Socket can no longer receive: reset/consume and bail. */
2075 if (__predict_false(so_no_receive(so))) {
2076 handle_excess_rx(toep, m);
2077 inp_wunlock(tp->t_inpcb);
/* In DDP mode this RX_DATA may carry DDP bookkeeping too. */
2082 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2083 handle_ddp_data(toep, m);
2085 m->m_seq = ntohl(hdr->seq);
2086 m->m_ulp_mode = 0; /* for iSCSI */
2089 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2091 "%s: TID %u: Bad sequence number %u, expected %u\n",
2092 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2095 inp_wunlock(tp->t_inpcb);
/* Strip the CPL header; only payload goes to the socket buffer. */
2099 m_adj(m, sizeof(*hdr));
2101 #ifdef URGENT_DATA_SUPPORTED
2103 * We don't handle urgent data yet
2105 if (__predict_false(hdr->urg))
2106 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2107 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2108 tp->urg_seq - tp->rcv_nxt < skb->len))
2109 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
/* Track hardware delayed-ACK mode changes. */
2112 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2113 toep->tp_delack_mode = hdr->dack_mode;
2114 toep->tp_delack_seq = tp->rcv_nxt;
2116 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2117 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2119 if (len < m->m_pkthdr.len)
2120 m->m_pkthdr.len = m->m_len = len;
2122 tp->rcv_nxt += m->m_pkthdr.len;
2123 tp->t_rcvtime = ticks;
2124 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2126 "new_rx_data: seq 0x%x len %u",
2127 m->m_seq, m->m_pkthdr.len);
2128 inp_wunlock(tp->t_inpcb);
2129 rcv = so_sockbuf_rcv(so);
2133 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2139 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2142 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2144 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2145 so, rcv->sb_cc, rcv->sb_mbmax));
2149 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2150 rcv->sb_cc, rcv->sb_mbcnt);
/* Wake any reader unless the file descriptor is already gone. */
2152 state = so_state_get(so);
2153 if (__predict_true((state & SS_NOFDREF) == 0))
2154 so_sorwakeup_locked(so);
2156 sockbuf_unlock(rcv);
2160 * Handler for RX_DATA CPL messages.
2163 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2165 struct toepcb *toep = (struct toepcb *)ctx;
2167 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2169 new_rx_data(toep, m);
/*
 * NOTE(review): garbled extract -- braces, the tp/so/hdr initializations,
 * the sockbuf lock acquisition/append, and several #ifdef T3_TRACE guards
 * appear to be missing; the skb/tcp_hdr lines are Linux residue.  Compare
 * with the full source before editing.
 */
2175 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2178 struct ddp_state *q;
2179 struct ddp_buf_state *bsp;
2180 struct cpl_rx_data_ddp *hdr;
2182 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2184 unsigned int delack_mode;
2185 struct sockbuf *rcv;
2188 inp_wlock(tp->t_inpcb);
2189 so = inp_inpcbtosocket(tp->t_inpcb);
2191 if (__predict_false(so_no_receive(so))) {
2193 handle_excess_rx(toep, m);
2194 inp_wunlock(tp->t_inpcb);
/* The DDP report tells us which buffer the hardware placed into. */
2198 q = &toep->tp_ddp_state;
2200 ddp_report = ntohl(hdr->u.ddp_report);
2201 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2202 bsp = &q->buf_state[buf_idx];
2205 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2206 "hdr seq 0x%x len %u",
2207 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2210 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2211 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2213 ddp_len = ntohs(hdr->len);
2214 rcv_nxt = ntohl(hdr->seq) + ddp_len;
/* Track hardware delayed-ACK mode changes. */
2216 delack_mode = G_DDP_DACK_MODE(ddp_report);
2217 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2218 toep->tp_delack_mode = delack_mode;
2219 toep->tp_delack_seq = tp->rcv_nxt;
2222 m->m_seq = tp->rcv_nxt;
2223 tp->rcv_nxt = rcv_nxt;
2225 tp->t_rcvtime = ticks;
2227 * Store the length in m->m_len. We are changing the meaning of
2228 * m->m_len here, we need to be very careful that nothing from now on
2229 * interprets ->len of this packet the usual way.
2231 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2232 inp_wunlock(tp->t_inpcb);
2234 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2235 m->m_len, rcv_nxt, m->m_seq);
2237 * Figure out where the new data was placed in the buffer and store it
2238 * in when. Assumes the buffer offset starts at 0, consumer needs to
2239 * account for page pod's pg_offset.
2241 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2242 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2244 rcv = so_sockbuf_rcv(so);
2247 m->m_ddp_gl = (unsigned char *)bsp->gl;
2248 m->m_flags |= M_DDP;
2249 bsp->cur_offset = end_offset;
2250 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2253 * Length is only meaningful for kbuf
2255 if (!(bsp->flags & DDP_BF_NOCOPY))
2256 KASSERT(m->m_len <= bsp->gl->dgl_length,
2257 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2258 m->m_len, bsp->gl->dgl_length));
2260 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2261 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2263 * Bit 0 of flags stores whether the DDP buffer is completed.
2264 * Note that other parts of the code depend on this being in bit 0.
2266 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2267 panic("spurious ddp completion");
2269 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2270 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2271 q->cur_buf ^= 1; /* flip buffers */
2274 if (bsp->flags & DDP_BF_NOCOPY) {
2275 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2276 bsp->flags &= ~DDP_BF_NOCOPY;
2279 if (ddp_report & F_DDP_PSH)
2280 m->m_ddp_flags |= DDP_BF_PSH;
2282 m->m_ddp_flags |= DDP_BF_NODATA;
/* Linux residue; must remain compiled out on FreeBSD. */
2285 skb_reset_transport_header(skb);
2286 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
/* Wake the reader for PSH, completed user buffers, or kernel buffers. */
2290 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2291 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2292 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2293 so_sorwakeup_locked(so);
2295 sockbuf_unlock(rcv);
2298 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2299 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2300 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2304 * Handler for RX_DATA_DDP CPL messages.
2307 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2309 struct toepcb *toep = ctx;
2310 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2314 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2315 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2316 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2317 return (CPL_RET_BUF_DONE);
2320 skb->h.th = tcphdr_skb->h.th;
2322 new_rx_data_ddp(toep, m);
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `hdr' initializations, the unlock on the no-receive path, the kbuf
 * repost logic and several #ifdef T3_TRACE guards appear to be missing;
 * the skb/tcp_hdr lines are Linux residue.  Compare with the full source
 * before editing.
 */
2327 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2329 struct tcpcb *tp = toep->tp_tp;
2331 struct ddp_state *q;
2332 struct ddp_buf_state *bsp;
2333 struct cpl_rx_ddp_complete *hdr;
2334 unsigned int ddp_report, buf_idx, when, delack_mode;
2336 struct sockbuf *rcv;
2338 inp_wlock(tp->t_inpcb);
2339 so = inp_inpcbtosocket(tp->t_inpcb);
2341 if (__predict_false(so_no_receive(so))) {
2342 struct inpcb *inp = so_sotoinpcb(so);
2344 handle_excess_rx(toep, m);
/* Identify which of the two DDP buffers completed. */
2348 q = &toep->tp_ddp_state;
2350 ddp_report = ntohl(hdr->ddp_report);
2351 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2352 m->m_pkthdr.csum_data = tp->rcv_nxt;
2354 rcv = so_sockbuf_rcv(so);
2357 bsp = &q->buf_state[buf_idx];
2358 when = bsp->cur_offset;
/* Bytes placed since the previous completion for this buffer. */
2359 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2360 tp->rcv_nxt += m->m_len;
2361 tp->t_rcvtime = ticks;
2363 delack_mode = G_DDP_DACK_MODE(ddp_report);
2364 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2365 toep->tp_delack_mode = delack_mode;
2366 toep->tp_delack_seq = tp->rcv_nxt;
/* Linux residue; must remain compiled out on FreeBSD. */
2369 skb_reset_transport_header(skb);
2370 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2372 inp_wunlock(tp->t_inpcb);
2374 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2376 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2377 "ddp_report 0x%x offset %u, len %u",
2378 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2379 G_DDP_OFFSET(ddp_report), m->m_len);
2381 m->m_cur_offset = bsp->cur_offset;
2382 bsp->cur_offset += m->m_len;
2384 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2385 q->cur_buf ^= 1; /* flip buffers */
2386 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2391 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2392 "ddp_report %u offset %u",
2393 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2394 G_DDP_OFFSET(ddp_report));
/* Tag the mbuf with the completed buffer's page list and flags. */
2396 m->m_ddp_gl = (unsigned char *)bsp->gl;
2397 m->m_flags |= M_DDP;
2398 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2399 if (bsp->flags & DDP_BF_NOCOPY)
2400 bsp->flags &= ~DDP_BF_NOCOPY;
2402 m->m_ddp_flags |= DDP_BF_NODATA;
2405 if ((so_state_get(so) & SS_NOFDREF) == 0)
2406 so_sorwakeup_locked(so);
2408 sockbuf_unlock(rcv);
/*
 * Handler for RX_DDP_COMPLETE CPL messages.  The mbuf is consumed by
 * process_ddp_complete().
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

#if 0
	/* Linux residue; must remain compiled out on FreeBSD. */
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}
2428 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2429 * socket state before calling tcp_time_wait to comply with its expectations.
2432 enter_timewait(struct tcpcb *tp)
2435 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2436 * process peer_close because we don't want to carry the peer FIN in
2437 * the socket's receive queue and if we increment rcv_nxt without
2438 * having the FIN in the receive queue we'll confuse facilities such
2441 inp_wlock(tp->t_inpcb);
2444 tp->ts_recent_age = 0; /* defeat recycling */
2445 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2446 inp_wunlock(tp->t_inpcb);
2447 tcp_offload_twstart(tp);
2451 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2452 * function deals with the data that may be reported along with the FIN.
2453 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2454 * perform normal FIN-related processing. In the latter case 1 indicates that
2455 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
/*
 * NOTE(review): garbled extract -- braces, several return statements, the
 * buffer-flip statement and the #ifdef around the Linux skb lines appear
 * to be missing.  Compare with the full source before editing.
 */
2459 handle_peer_close_data(struct socket *so, struct mbuf *m)
2461 struct tcpcb *tp = so_sototcpcb(so);
2462 struct toepcb *toep = tp->t_toe;
2463 struct ddp_state *q;
2464 struct ddp_buf_state *bsp;
2465 struct cpl_peer_close *req = cplhdr(m);
2466 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2467 struct sockbuf *rcv;
2469 if (tp->rcv_nxt == rcv_nxt) /* no data */
2472 CTR0(KTR_TOM, "handle_peer_close_data");
2473 if (__predict_false(so_no_receive(so))) {
2474 handle_excess_rx(toep, m);
2477 * Although we discard the data we want to process the FIN so
2478 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2479 * PEER_CLOSE without data. In particular this PEER_CLOSE
2480 * may be what will close the connection. We return 1 because
2481 * handle_excess_rx() already freed the packet.
/* Account the trailing data as an implicit DDP completion. */
2486 inp_lock_assert(tp->t_inpcb);
2487 q = &toep->tp_ddp_state;
2488 rcv = so_sockbuf_rcv(so);
2491 bsp = &q->buf_state[q->cur_buf];
2492 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2493 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2494 m->m_ddp_gl = (unsigned char *)bsp->gl;
2495 m->m_flags |= M_DDP;
2496 m->m_cur_offset = bsp->cur_offset;
2498 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2499 m->m_seq = tp->rcv_nxt;
2500 tp->rcv_nxt = rcv_nxt;
2501 bsp->cur_offset += m->m_pkthdr.len;
2502 if (!(bsp->flags & DDP_BF_NOFLIP))
/* Linux residue; must remain compiled out on FreeBSD. */
2505 skb_reset_transport_header(skb);
2506 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2508 tp->t_rcvtime = ticks;
2510 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2511 so_sorwakeup_locked(so);
2513 sockbuf_unlock(rcv);
2519 * Handle a peer FIN.
/*
 * NOTE(review): garbled extract -- braces, the `keep'/`action' declarations,
 * socantrcvmore(), several break statements and the mbuf free at the end
 * appear to be missing; the sk_wake_async lines are Linux residue.
 * Compare with the full source before editing.
 */
2522 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2525 struct tcpcb *tp = toep->tp_tp;
2529 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
/* A pending abort supersedes the FIN on T3B+. */
2530 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2531 printf("abort_pending set\n");
2535 inp_wlock(tp->t_inpcb);
2536 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* In DDP mode the FIN may carry an implicit RX_DDP_COMPLETE. */
2537 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2538 keep = handle_peer_close_data(so, m);
2540 inp_wunlock(tp->t_inpcb);
2544 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2546 "waking up waiters for cantrcvmore on %p ", so);
2550 * If connection is half-synchronized
2551 * (ie NEEDSYN flag on) then delay ACK,
2552 * so it may be piggybacked when SYN is sent.
2553 * Otherwise, since we received a FIN then no
2554 * more input can be expected, send ACK now.
2556 if (tp->t_flags & TF_NEEDSYN)
2557 tp->t_flags |= TF_DELACK;
2559 tp->t_flags |= TF_ACKNOW;
/* Standard TCP FIN state transitions, driven by the hardware event. */
2563 switch (tp->t_state) {
2564 case TCPS_SYN_RECEIVED:
2565 tp->t_starttime = ticks;
2567 case TCPS_ESTABLISHED:
2568 tp->t_state = TCPS_CLOSE_WAIT;
2570 case TCPS_FIN_WAIT_1:
2571 tp->t_state = TCPS_CLOSING;
2573 case TCPS_FIN_WAIT_2:
2575 * If we've sent an abort_req we must have sent it too late,
2576 * HW will send us a reply telling us so, and this peer_close
2577 * is really the last message for this connection and needs to
2578 * be treated as an abort_rpl, i.e., transition the connection
2579 * to TCP_CLOSE (note that the host stack does this at the
2580 * time of generating the RST but we must wait for HW).
2581 * Otherwise we enter TIME_WAIT.
2583 t3_release_offload_resources(toep);
2584 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2587 action = TCP_TIMEWAIT;
2592 "%s: TID %u: received PEER_CLOSE in bad state %d\n",
2593 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2595 inp_wunlock(tp->t_inpcb);
/* Perform the chosen action outside the inpcb lock. */
2597 if (action == TCP_TIMEWAIT) {
2599 } else if (action == TCP_DROP) {
2600 tcp_offload_drop(tp, 0);
2601 } else if (action == TCP_CLOSE) {
2602 tcp_offload_close(tp);
/* Linux residue; must remain compiled out on FreeBSD. */
2606 /* Do not send POLL_HUP for half duplex close. */
2607 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2608 sk->sk_state == TCP_CLOSE)
2609 sk_wake_async(so, 1, POLL_HUP);
2611 sk_wake_async(so, 1, POLL_IN);
/*
 * Handler for PEER_CLOSE CPL messages.  The mbuf is consumed by
 * do_peer_fin().
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	do_peer_fin(toep, m);
	return (0);
}
/*
 * NOTE(review): garbled extract -- braces, the `action' declaration,
 * several break statements, the CLOSE_WAIT/LAST_ACK case labels and the
 * mbuf free at the end appear to be missing.  Compare with the full
 * source before editing.
 */
2634 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2636 struct cpl_close_con_rpl *rpl = cplhdr(m);
2637 struct tcpcb *tp = toep->tp_tp;
2640 struct sockbuf *rcv;
2642 inp_wlock(tp->t_inpcb);
2643 so = inp_inpcbtosocket(tp->t_inpcb);
/* Hardware acknowledged our FIN; snd_nxt excludes the FIN itself. */
2645 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2647 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2648 inp_wunlock(tp->t_inpcb);
2652 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2653 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2655 switch (tp->t_state) {
2656 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2657 t3_release_offload_resources(toep);
2658 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2662 action = TCP_TIMEWAIT;
2667 * In this state we don't care about pending abort_rpl.
2668 * If we've sent abort_req it was post-close and was sent too
2669 * late, this close_con_rpl is the actual last message.
2671 t3_release_offload_resources(toep);
2674 case TCPS_FIN_WAIT_1:
2676 * If we can't receive any more
2677 * data, then closing user can proceed.
2678 * Starting the timer is contrary to the
2679 * specification, but if we don't get a FIN
2680 * we'll hang forever.
2683 * we should release the tp also, and use a
/* Arm the FIN_WAIT_2 timer if the receive side is already closed. */
2687 rcv = so_sockbuf_rcv(so);
2691 if (rcv->sb_state & SBS_CANTRCVMORE) {
2695 soisdisconnected(so);
2696 timeout = (tcp_fast_finwait2_recycle) ?
2697 tcp_finwait2_timeout : tcp_maxidle;
2698 tcp_timer_activate(tp, TT_2MSL, timeout);
2700 tp->t_state = TCPS_FIN_WAIT_2;
2701 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2702 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2709 "%s: TID %u: received CLOSE_CON_RPL in bad state %d\n",
2710 toep->tp_toedev->tod_name, toep->tp_tid,
2713 inp_wunlock(tp->t_inpcb);
/* Perform the chosen action outside the inpcb lock. */
2716 if (action == TCP_TIMEWAIT) {
2718 } else if (action == TCP_DROP) {
2719 tcp_offload_drop(tp, 0);
2720 } else if (action == TCP_CLOSE) {
2721 tcp_offload_close(tp);
/*
 * Handler for CLOSE_CON_RPL CPL messages.  The mbuf is consumed by
 * process_close_con_rpl().
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	process_close_con_rpl(toep, m);
	return (0);
}
2741 * Process abort replies. We only process these messages if we anticipate
2742 * them as the coordination between SW and HW in this area is somewhat lacking
2743 * and sometimes we get ABORT_RPLs after we are done with the connection that
2744 * originated the ABORT_REQ.
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `needclose' declarations, the #ifdef T3_TRACE guard and the mbuf free
 * appear to be missing.  Compare with the full source before editing.
 */
2747 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2749 struct tcpcb *tp = toep->tp_tp;
2754 T3_TRACE1(TIDTB(sk),
2755 "process_abort_rpl: GTS rpl pending %d",
2756 sock_flag(sk, ABORT_RPL_PENDING));
2759 inp_wlock(tp->t_inpcb);
2760 so = inp_inpcbtosocket(tp->t_inpcb);
/* Only act if we actually have an outstanding ABORT_REQ. */
2762 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2764 * XXX panic on tcpdrop
/* T3B+ sends two RPLs; wait for the second before releasing. */
2766 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2767 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2769 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2770 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2771 !is_t3a(toep->tp_toedev)) {
2772 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2773 panic("TP_ABORT_REQ_RCVD set");
2774 t3_release_offload_resources(toep);
2779 inp_wunlock(tp->t_inpcb);
2782 tcp_offload_close(tp);
2788 * Handle an ABORT_RPL_RSS CPL message.
2791 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2793 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2794 struct toepcb *toep;
2797 * Ignore replies to post-close aborts indicating that the abort was
2798 * requested too late. These connections are terminated when we get
2799 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2800 * arrives the TID is either no longer used or it has been recycled.
2802 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2808 toep = (struct toepcb *)ctx;
2811 * Sometimes we've already closed the socket, e.g., a post-close
2812 * abort races with ABORT_REQ_RSS, the latter frees the socket
2813 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2814 * but FW turns the ABORT_REQ into a regular one and so we get
2815 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
/* Orphaned toepcb (socket already gone): just free TID and L2T entry. */
2820 if (toep->tp_tp == NULL) {
2821 log(LOG_NOTICE, "removing tid for abort\n");
2822 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2824 l2t_release(L2DATA(cdev), toep->tp_l2t);
2826 toepcb_release(toep);
/* Normal path: hand off to process_abort_rpl() and drop our reference. */
2830 log(LOG_NOTICE, "toep=%p\n", toep);
2831 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2834 process_abort_rpl(toep, m);
2835 toepcb_release(toep);
2840 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2841 * indicate whether RST should be sent in response.
/*
 * Returns an errno value for so_error.  NOTE(review): the lines that set
 * *need_rst and the default case are elided from this extract — confirm
 * them against the full source.
 */
2844 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2846 struct tcpcb *tp = so_sototcpcb(so);
2848 switch (abort_reason) {
2849 case CPL_ERR_BAD_SYN:
2851 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2853 case CPL_ERR_CONN_RESET:
2854 // XXX need to handle SYN_RECV due to crossed SYNs
/* EPIPE if the peer already half-closed, otherwise a plain reset. */
2855 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2856 case CPL_ERR_XMIT_TIMEDOUT:
2857 case CPL_ERR_PERSIST_TIMEDOUT:
2858 case CPL_ERR_FINWAIT2_TIMEDOUT:
2859 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2861 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
/*
 * Populate mbuf m with a CPL_ABORT_RPL work request for the given tid and
 * set the mbuf length accordingly.  NOTE(review): the use of the `cmd`
 * parameter (presumably rpl->cmd = cmd) is on a line elided from this
 * extract — confirm against the full source.
 */
2870 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2872 struct cpl_abort_rpl *rpl = cplhdr(m);
2874 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2875 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2876 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2878 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
/*
 * Deferred-work callback: build and send the ABORT_RPL that could not be
 * sent from the original handler (see t3_defer_reply() in send_abort_rpl()).
 * The rst_status was stashed in req->status by the caller.
 */
2883 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2885 struct mbuf *reply_mbuf;
2886 struct cpl_abort_req_rss *req = cplhdr(m);
2888 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
/*
 * NOTE(review): the two lines below set the priority and length on the
 * incoming request mbuf `m`, not on reply_mbuf.  That looks wrong — the
 * reply's length is (re)set inside set_abort_rpl_wr(), but its priority
 * is never set.  Presumably both lines should target reply_mbuf; confirm
 * before changing, since the function's tail is elided here.
 */
2889 m_set_priority(m, CPL_PRIORITY_DATA);
2890 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2891 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2892 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2897 * Returns whether an ABORT_REQ_RSS message is a negative advice.
/* Negative advice (retransmit/persist hints) must not tear the connection down. */
2900 is_neg_adv_abort(unsigned int status)
2902 return status == CPL_ERR_RTX_NEG_ADVICE ||
2903 status == CPL_ERR_PERSIST_NEG_ADVICE;
/*
 * Send an ABORT_RPL for the ABORT_REQ carried in m.  If no mbuf can be
 * allocated right now the reply is deferred to process context via
 * t3_defer_reply() (the allocation-failure branch structure is partially
 * elided from this extract).
 */
2907 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2909 struct mbuf *reply_mbuf;
2910 struct cpl_abort_req_rss *req = cplhdr(m);
2912 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2915 /* Defer the reply. Stick rst_status into req->cmd. */
2916 req->status = rst_status;
2917 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2921 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2922 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2926 * XXX need to sync with ARP as for SYN_RECV connections we can send
2927 * these messages while ARP is pending. For other connection states
2928 * it's not a problem.
2930 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Remove a SYN_RECV child connection from its listener's SYN queue.
 * NOTE(review): the body starts with CXGB_UNIMPLEMENTED(); the Linux-style
 * code below (request_sock, inet_csk_*, sk_user_data) appears to be dead
 * leftovers from the Linux driver, presumably inside an #ifdef/#if 0 block
 * elided from this extract — confirm before relying on it.
 */
2935 cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2937 CXGB_UNIMPLEMENTED();
2939 struct request_sock *req = child->sk_user_data;
2941 inet_csk_reqsk_queue_removed(parent, req);
2942 synq_remove(tcp_sk(child));
2944 child->sk_user_data = NULL;
2950 * Performs the actual work to abort a SYN_RECV connection.
2953 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2955 struct tcpcb *parenttp = so_sototcpcb(parent);
2956 struct tcpcb *childtp = so_sototcpcb(child);
2959 * If the server is still open we clean up the child connection,
2960 * otherwise the server already did the clean up as it was purging
2961 * its SYN queue and the skb was just sitting in its backlog.
2963 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2964 cleanup_syn_rcv_conn(child, parent);
/* Release HW resources and close the embryonic connection under its inpcb lock. */
2965 inp_wlock(childtp->t_inpcb);
2966 t3_release_offload_resources(childtp->t_toe);
2967 inp_wunlock(childtp->t_inpcb);
2968 tcp_offload_close(childtp);
2974 * Handle abort requests for a SYN_RECV connection. These need extra work
2975 * because the socket is on its parent's SYN queue.
/*
 * NOTE(review): begins with CXGB_UNIMPLEMENTED(); the code below uses
 * Linux-era fields (so_incomp as the open request, ts_recent as a stashed
 * STID) and is presumably dead or transitional — confirm which lines are
 * actually compiled, as several are elided from this extract.
 */
2978 abort_syn_rcv(struct socket *so, struct mbuf *m)
2980 CXGB_UNIMPLEMENTED();
2982 struct socket *parent;
2983 struct toedev *tdev = toep->tp_toedev;
2984 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2985 struct socket *oreq = so->so_incomp;
2986 struct t3c_tid_entry *t3c_stid;
2990 return -1; /* somehow we are not on the SYN queue */
/* Look up the listening socket via the server TID stashed in the open request. */
2992 t = &(T3C_DATA(cdev))->tid_maps;
2993 t3c_stid = lookup_stid(t, oreq->ts_recent);
2994 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2997 do_abort_syn_rcv(so, parent);
2998 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
3005 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
3006 * request except that we need to reply to it.
3009 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3011 int rst_status = CPL_ABORT_NO_RST;
3012 const struct cpl_abort_req_rss *req = cplhdr(m);
3013 struct tcpcb *tp = toep->tp_tp;
3017 inp_wlock(tp->t_inpcb);
3018 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* First ABORT_REQ for this connection: remember it and shut the connection down. */
3019 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3020 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
/* Second/duplicate request (T3A can deliver two): clear the marker and proceed. */
3025 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3027 * Three cases to consider:
3028 * a) We haven't sent an abort_req; close the connection.
3029 * b) We have sent a post-close abort_req that will get to TP too late
3030 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3031 * be ignored and the connection should be closed now.
3032 * c) We have sent a regular abort_req that will get to TP too late.
3033 * That will generate an abort_rpl with status 0, wait for it.
3035 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3036 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
/* Translate HW abort reason into an errno for the socket. */
3039 error = abort_status_to_errno(so, req->status,
3041 so_error_set(so, error);
3043 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3046 * SYN_RECV needs special processing. If abort_syn_rcv()
3047 * returns 0 is has taken care of the abort.
3049 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3052 t3_release_offload_resources(toep);
3055 inp_wunlock(tp->t_inpcb);
3058 tcp_offload_close(tp);
/* Always acknowledge the ABORT_REQ, regardless of which case applied. */
3060 send_abort_rpl(m, tdev, rst_status);
3063 inp_wunlock(tp->t_inpcb);
3067 * Handle an ABORT_REQ_RSS CPL message.
3070 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3072 const struct cpl_abort_req_rss *req = cplhdr(m);
3073 struct toepcb *toep = (struct toepcb *)ctx;
/* Negative advice never aborts the connection; presumably handled/dropped here. */
3075 if (is_neg_adv_abort(req->status)) {
3080 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
/*
 * Embryonic (SYN_RCVD) connection that has not seen an abort yet:
 * release the TID/L2T now and detach the toepcb from the tcpcb.
 */
3082 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3083 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3084 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3086 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3088 l2t_release(L2DATA(cdev), toep->tp_l2t);
3093 toep->tp_tp->t_toe = NULL;
3094 toep->tp_tp->t_flags &= ~TF_TOE;
3097 * XXX need to call syncache_chkrst - but we don't
3098 * have a way of doing that yet
3100 toepcb_release(toep);
3101 log(LOG_ERR, "abort for unestablished connection :-(\n");
/* toepcb already detached from its tcpcb: nothing to do, it is being freed. */
3104 if (toep->tp_tp == NULL) {
3105 log(LOG_NOTICE, "disconnected toepcb\n");
3106 /* should be freed momentarily */
/* Established connection: full abort processing, then drop our reference. */
3112 process_abort_req(toep, m, toep->tp_toedev);
3113 toepcb_release(toep);
/*
 * Abort a passive-open (SYN_RECV) child and, on T3 hardware, reuse the
 * original CPL_PASS_ACCEPT_RPL mbuf to send a reject back to the chip.
 */
3118 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3120 struct toedev *tdev = TOE_DEV(parent);
3122 do_abort_syn_rcv(child, parent);
3123 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3124 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
/* Rewrite the reply in place into a reject and send it. */
3126 rpl->opt0h = htonl(F_TCAM_BYPASS);
3127 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3128 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
/*
 * ARP resolution failed while answering a passive open: abort the embryonic
 * connection.  NOTE(review): begins with CXGB_UNIMPLEMENTED(); the code
 * below (so_incomp / ts_recent STID lookup) mirrors abort_syn_rcv() and is
 * presumably dead or transitional — confirm against the full source.
 */
3134 handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3136 CXGB_UNIMPLEMENTED();
3139 struct t3cdev *cdev;
3140 struct socket *parent;
3141 struct socket *oreq;
3142 struct t3c_tid_entry *t3c_stid;
3144 struct tcpcb *otp, *tp = so_sototcpcb(so);
3145 struct toepcb *toep = tp->t_toe;
3148 * If the connection is being aborted due to the parent listening
3149 * socket going away there's nothing to do, the ABORT_REQ will close
3152 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
/* Find the listener through the STID stashed in the original open request. */
3157 oreq = so->so_incomp;
3158 otp = so_sototcpcb(oreq);
3161 t = &(T3C_DATA(cdev))->tid_maps;
3162 t3c_stid = lookup_stid(t, otp->ts_recent);
3163 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3166 pass_open_abort(so, parent, m);
3172 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3173 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3177 pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
/* Linux-era stats/bookkeeping below; the FreeBSD path is the handle_... call. */
3181 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3182 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3184 handle_pass_open_arp_failure(m_get_socket(m), m);
3188 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3191 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3193 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3194 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3195 unsigned int tid = GET_TID(req);
3197 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3198 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3199 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3200 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
/* F_TCAM_BYPASS + REJECT status tells the chip to refuse the connection. */
3201 rpl->opt0h = htonl(F_TCAM_BYPASS);
3202 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3204 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3208 * Send a deferred reject to an accept request.
/* Deferred-work callback paired with t3_defer_reply() in process_pass_accept_req(). */
3211 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3213 struct mbuf *reply_mbuf;
3215 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3216 mk_pass_accept_rpl(reply_mbuf, m);
3217 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Callback invoked by the TCP syncache for offloaded embryonic connections;
 * arg is the toepcb registered at syncache_add time.  Both visible events
 * drop the extra reference taken in process_pass_accept_req().
 */
3222 handle_syncache_event(int event, void *arg)
3224 struct toepcb *toep = arg;
3227 case TOE_SC_ENTRY_PRESENT:
3229 * entry already exists - free toepcb
3232 printf("syncache entry present\n");
3233 toepcb_release(toep);
3237 * The syncache has given up on this entry
3238 * either it timed out, or it was evicted
3239 * we need to explicitly release the tid
3241 printf("syncache entry dropped\n");
3242 toepcb_release(toep);
3245 log(LOG_ERR, "unknown syncache event %d\n", event);
/*
 * Build a synthetic SYN (tcphdr + tcpopt + in_conninfo) from the fields of a
 * CPL_PASS_ACCEPT_REQ and enter it into the host TCP syncache, registering
 * toep so syncache events come back via handle_syncache_event().
 */
3251 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3253 struct in_conninfo inc;
3257 int mss, wsf, sack, ts;
3258 uint32_t rcv_isn = ntohl(req->rcv_isn);
3260 bzero(&to, sizeof(struct tcpopt));
3261 inp = so_sotoinpcb(lso);
3264 * Fill out information for entering us into the syncache
3266 bzero(&inc, sizeof(inc));
/* Ports/ISN are copied in network byte order straight from the CPL. */
3267 inc.inc_fport = th.th_sport = req->peer_port;
3268 inc.inc_lport = th.th_dport = req->local_port;
3269 th.th_seq = req->rcv_isn;
3270 th.th_flags = TH_SYN;
/* Seed all receive-side sequence tracking at ISN+1. */
3272 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3277 inc.inc_faddr.s_addr = req->peer_ip;
3278 inc.inc_laddr.s_addr = req->local_ip;
3280 DPRINTF("syncache add of %d:%d %d:%d\n",
3281 ntohl(req->local_ip), ntohs(req->local_port),
3282 ntohl(req->peer_ip), ntohs(req->peer_port));
/* TCP options as parsed by the hardware from the incoming SYN. */
3284 mss = req->tcp_options.mss;
3285 wsf = req->tcp_options.wsf;
3286 ts = req->tcp_options.tstamp;
3287 sack = req->tcp_options.sack;
3290 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3291 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3296 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3297 * lock held. Note that the sock here is a listening socket that is not owned
3301 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3302 struct listen_ctx *lctx)
3305 struct l2t_entry *e;
3307 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3308 struct cpl_pass_accept_rpl *rpl;
3309 struct cpl_pass_accept_req *req = cplhdr(m);
3310 unsigned int tid = GET_TID(req);
3311 struct tom_data *d = TOM_DATA(tdev);
3312 struct t3cdev *cdev = d->cdev;
3313 struct tcpcb *tp = so_sototcpcb(so);
3314 struct toepcb *newtoep;
3315 struct rtentry *dst;
3316 struct sockaddr_in nam;
3317 struct t3c_data *td = T3C_DATA(cdev);
/*
 * Can't allocate a reply now: defer the reject (T3) or queue the TID for
 * release, rather than leaving the chip waiting.
 */
3319 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3320 if (__predict_false(reply_mbuf == NULL)) {
3321 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3322 t3_defer_reply(m, tdev, reject_pass_request);
3324 cxgb_queue_tid_release(cdev, tid);
3327 DPRINTF("failed to get reply_mbuf\n");
3332 if (tp->t_state != TCPS_LISTEN) {
3333 DPRINTF("socket not in listen state\n");
/* Map the destination MAC/VLAN from the CPL back to an ifnet. */
3338 tim.mac_addr = req->dst_mac;
3339 tim.vlan_tag = ntohs(req->vlan_tag);
3340 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3341 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3347 * XXX do route lookup to confirm that we're still listening on this
/* Linux-era route validation below; the FreeBSD path forces RTF_LOCAL at 3364. */
3350 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3351 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3353 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3354 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3355 dst_release(skb->dst); // done with the input route, release it
3358 if ((rt_flags & RTF_LOCAL) == 0)
3364 rt_flags = RTF_LOCAL;
3365 if ((rt_flags & RTF_LOCAL) == 0)
3369 * Calculate values and add to syncache
3372 newtoep = toepcb_alloc();
3373 if (newtoep == NULL)
/* Route back to the peer to obtain an L2T (ARP) entry for replies. */
3376 bzero(&nam, sizeof(struct sockaddr_in));
3378 nam.sin_len = sizeof(struct sockaddr_in);
3379 nam.sin_family = AF_INET;
3380 nam.sin_addr.s_addr =req->peer_ip;
3381 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3384 printf("failed to find route\n");
3387 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3388 (struct sockaddr *)&nam);
3390 DPRINTF("failed to get l2t\n");
3393 * Point to our listen socket until accept
3395 newtoep->tp_tp = tp;
3396 newtoep->tp_flags = TP_SYN_RCVD;
3397 newtoep->tp_tid = tid;
3398 newtoep->tp_toedev = tdev;
3399 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3401 cxgb_insert_tid(cdev, d->client, newtoep, tid);
/* Track the embryonic connection on the listener's SYN queue. */
3403 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
/* Enable DDP only if tuned on, not opted out, and the window is big enough. */
3406 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3407 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3409 if (newtoep->tp_ulp_mode) {
3410 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3412 if (ddp_mbuf == NULL)
3413 newtoep->tp_ulp_mode = 0;
3416 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3417 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3418 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3420 * XXX workaround for lack of syncache drop
/* Extra reference dropped later by handle_syncache_event()/do_pass_establish(). */
3422 toepcb_hold(newtoep);
3423 syncache_add_accept_req(req, so, newtoep);
/* Build and send the accepting CPL_PASS_ACCEPT_RPL. */
3425 rpl = cplhdr(reply_mbuf);
3426 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3427 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3429 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3430 rpl->opt2 = htonl(calc_opt2(so, tdev));
3431 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3432 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3434 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3435 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3436 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3437 CPL_PASS_OPEN_ACCEPT);
3439 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3441 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3443 l2t_send(cdev, reply_mbuf, e);
/* Program the TCB DDP workaround flags when DDP is in use. */
3445 if (newtoep->tp_ulp_mode) {
3446 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3448 TP_DDP_TIMER_WORKAROUND_MASK,
3450 TP_DDP_TIMER_WORKAROUND_VAL, 1);
/* Error paths below: reject the open (T3) or release the TID, then send. */
3452 printf("not offloading\n");
3458 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3459 mk_pass_accept_rpl(reply_mbuf, m);
3461 mk_tid_release(reply_mbuf, newtoep, tid);
3462 cxgb_ofld_send(cdev, reply_mbuf);
3466 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3473 * Handle a CPL_PASS_ACCEPT_REQ message.
3476 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3478 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3479 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3480 struct tom_data *d = listen_ctx->tom_data;
3483 struct cpl_pass_accept_req *req = cplhdr(m);
3484 unsigned int tid = GET_TID(req);
3485 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
/* Linux-era validation below (unknown STID / TID out of range); kept as-is. */
3487 if (unlikely(!lsk)) {
3488 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3490 (unsigned long)((union listen_entry *)ctx -
3492 return CPL_RET_BUF_DONE;
3494 if (unlikely(tid >= t->ntids)) {
3495 printk(KERN_ERR "%s: passive open TID %u too large\n",
3497 return CPL_RET_BUF_DONE;
3500 * For T3A the current user of the TID may have closed but its last
3501 * message(s) may have been backlogged so the TID appears to be still
3502 * in use. Just take the TID away, the connection can close at its
3503 * own leisure. For T3B this situation is a bug.
3505 if (!valid_new_tid(t, tid) &&
3506 cdev->type != T3A) {
3507 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3509 return CPL_RET_BUF_DONE;
/* All checks passed: do the real work with the listen socket. */
3513 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3518 * Called when a connection is established to translate the TCP options
3519 * reported by HW to FreeBSD's native format.
3522 assign_rxopt(struct socket *so, unsigned int opt)
3524 struct tcpcb *tp = so_sototcpcb(so);
3525 struct toepcb *toep = tp->t_toe;
3526 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3528 inp_lock_assert(tp->t_inpcb);
/* MTU index from HW, minus 40 (presumably fixed IP+TCP header overhead). */
3530 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3531 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3532 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3533 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
/* Window scaling takes effect only when both sides asked for it. */
3534 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3535 (TF_RCVD_SCALE|TF_REQ_SCALE))
3536 tp->rcv_scale = tp->request_r_scale;
3540 * Completes some final bits of initialization for just established connections
3541 * and changes their state to TCP_ESTABLISHED.
3543 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3546 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3548 struct tcpcb *tp = so_sototcpcb(so);
3549 struct toepcb *toep = tp->t_toe;
/* Seed every send-side sequence variable from the post-SYN ISN. */
3551 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3552 assign_rxopt(so, opt);
/* Route socket-option calls through the TOE-aware ctloutput. */
3559 so->so_proto->pr_ctloutput = t3_ctloutput;
3563 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3566 * XXX not clear what rcv_wup maps to
3569 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3570 * pass through opt0.
3572 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3573 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3579 * no clean interface for marking ARP up to date
3581 dst_confirm(sk->sk_dst_cache);
3583 tp->t_starttime = ticks;
3584 tp->t_state = TCPS_ESTABLISHED;
/*
 * Rebuild the synthetic ACK (tcphdr + options + conninfo) matching the
 * earlier syncache_add_accept_req() entry, and expand the syncache entry
 * into a full socket via tcp_offload_syncache_expand().
 */
3589 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3592 struct in_conninfo inc;
3595 int mss, wsf, sack, ts;
3596 struct mbuf *m = NULL;
3597 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3601 #error "no MAC support"
3604 opt = ntohs(req->tcp_opt);
3606 bzero(&to, sizeof(struct tcpopt));
3609 * Fill out information for entering us into the syncache
3611 bzero(&inc, sizeof(inc));
3612 inc.inc_fport = th.th_sport = req->peer_port;
3613 inc.inc_lport = th.th_dport = req->local_port;
3614 th.th_seq = req->rcv_isn;
/* This time the synthetic segment is the handshake-completing ACK. */
3615 th.th_flags = TH_ACK;
3619 inc.inc_faddr.s_addr = req->peer_ip;
3620 inc.inc_laddr.s_addr = req->local_ip;
/* Decode the negotiated options from the HW-reported tcp_opt word. */
3622 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3623 wsf = G_TCPOPT_WSCALE_OK(opt);
3624 ts = G_TCPOPT_TSTAMP(opt);
3625 sack = G_TCPOPT_SACK(opt);
3628 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3629 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3631 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3632 ntohl(req->local_ip), ntohs(req->local_port),
3633 ntohl(req->peer_ip), ntohs(req->peer_port),
3634 mss, wsf, ts, sack);
3635 return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3640 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3641 * if we are in TCP_SYN_RECV due to crossed SYNs
3644 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3646 struct cpl_pass_establish *req = cplhdr(m);
3647 struct toepcb *toep = (struct toepcb *)ctx;
3648 struct tcpcb *tp = toep->tp_tp;
3649 struct socket *so, *lso;
3650 struct t3c_data *td = T3C_DATA(cdev);
3651 struct sockbuf *snd, *rcv;
3653 // Complete socket initialization now that we have the SND_ISN
3655 struct toedev *tdev;
3658 tdev = toep->tp_toedev;
3660 inp_wlock(tp->t_inpcb);
3664 * XXX need to add reference while we're manipulating
3666 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3668 inp_wunlock(tp->t_inpcb);
/* Take the embryonic connection off the listener's SYN queue. */
3671 LIST_REMOVE(toep, synq_entry);
/* Expand the syncache entry into a real socket; so is rewritten on success. */
3674 if (!syncache_expand_establish_req(req, &so, toep)) {
3678 CXGB_UNIMPLEMENTED();
3682 * Couldn't create the socket
3684 CXGB_UNIMPLEMENTED();
/* From here on tp/so refer to the NEW (child) connection. */
3687 tp = so_sototcpcb(so);
3688 inp_wlock(tp->t_inpcb);
3690 snd = so_sockbuf_snd(so);
3691 rcv = so_sockbuf_rcv(so);
/* DDP requires uncoalesced sockbuf data. */
3693 snd->sb_flags |= SB_NOCOALESCE;
3694 rcv->sb_flags |= SB_NOCOALESCE;
3699 reset_wr_list(toep);
3700 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3701 tp->rcv_nxt = toep->tp_copied_seq;
3702 install_offload_ops(so);
/* Initialize per-connection work-request credit accounting. */
3704 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3705 toep->tp_wr_unacked = 0;
3706 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3707 toep->tp_qset_idx = 0;
3708 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3711 * XXX Cancel any keep alive timer
3714 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3717 * XXX workaround for lack of syncache drop
/* Drop the extra reference taken in process_pass_accept_req(). */
3719 toepcb_release(toep);
3720 inp_wunlock(tp->t_inpcb);
3722 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3723 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3726 * XXX not sure how these checks map to us
/* Linux-era wakeup/accept-queue code below; presumably dead on FreeBSD. */
3728 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3729 sk->sk_state_change(sk);
3730 sk_wake_async(so, 0, POLL_OUT);
3733 * The state for the new connection is now up to date.
3734 * Next check if we should add the connection to the parent's
3735 * accept queue. When the parent closes it resets connections
3736 * on its SYN queue, so check if we are being reset. If so we
3737 * don't need to do anything more, the coming ABORT_RPL will
3738 * destroy this socket. Otherwise move the connection to the
3741 * Note that we reset the synq before closing the server so if
3742 * we are not being reset the stid is still open.
3744 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3755 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3756 * and send them to the TOE.
3759 fixup_and_send_ofo(struct toepcb *toep)
3762 struct toedev *tdev = toep->tp_toedev;
3763 struct tcpcb *tp = toep->tp_tp;
3764 unsigned int tid = toep->tp_tid;
3766 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3768 inp_lock_assert(tp->t_inpcb);
/* Drain the deferred-CPL queue, patching each message with the real TID. */
3769 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3771 * A variety of messages can be waiting but the fields we'll
3772 * be touching are common to all so any message type will do.
3774 struct cpl_close_con_req *p = cplhdr(m);
3776 p->wr.wr_lo = htonl(V_WR_TID(tid));
3777 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3778 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3783 * Updates socket state from an active establish CPL message. Runs with the
3787 socket_act_establish(struct socket *so, struct mbuf *m)
3789 INIT_VNET_INET(so->so_vnet);
3790 struct cpl_act_establish *req = cplhdr(m);
3791 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3792 struct tcpcb *tp = so_sototcpcb(so);
3793 struct toepcb *toep = tp->t_toe;
3795 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3796 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3797 toep->tp_tid, tp->t_state);
/* Seed receive-side state from the HW-reported ISN. */
3799 tp->ts_recent_age = ticks;
3800 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3801 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3803 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3806 * Now that we finally have a TID send any CPL messages that we had to
3807 * defer for lack of a TID.
3809 if (mbufq_len(&toep->out_of_order_queue))
3810 fixup_and_send_ofo(toep);
3812 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3814 * XXX does this even make sense?
3821 * XXX assume no write requests permitted while socket connection is
3825 * Currently the send queue must be empty at this point because the
3826 * socket layer does not send anything before a connection is
3827 * established. To be future proof though we handle the possibility
3828 * that there are pending buffers to send (either TX_DATA or
3829 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3830 * buffers according to the just learned write_seq, and then we send
3831 * them on their way.
/* Linux-era flush path below; the FreeBSD accounting is the two lines after. */
3833 fixup_pending_writeq_buffers(sk);
3834 if (t3_push_frames(so, 1))
3835 sk->sk_write_space(sk);
3838 toep->tp_state = tp->t_state;
3839 V_tcpstat.tcps_connects++;
3844 * Process a CPL_ACT_ESTABLISH message.
3847 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3849 struct cpl_act_establish *req = cplhdr(m);
3850 unsigned int tid = GET_TID(req);
3851 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3852 struct toepcb *toep = (struct toepcb *)ctx;
3853 struct tcpcb *tp = toep->tp_tp;
3855 struct toedev *tdev;
/* Error path (elided condition): the ATID must be released either way. */
3859 free_atid(cdev, atid);
3862 inp_wlock(tp->t_inpcb);
3867 so = inp_inpcbtosocket(tp->t_inpcb);
3868 tdev = toep->tp_toedev; /* blow up here if link was down */
3872 * It's OK if the TID is currently in use, the owning socket may have
3873 * backlogged its last CPL message(s). Just take it away.
/* Swap the temporary ATID for the real TID now assigned by HW. */
3877 so_insert_tid(d, toep, tid);
3878 free_atid(cdev, atid);
3879 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3881 socket_act_establish(so, m);
3882 inp_wunlock(tp->t_inpcb);
3883 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3884 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3890 * Process an acknowledgment of WR completion. Advance snd_una and send the
3891 * next batch of work requests from the write queue.
3894 wr_ack(struct toepcb *toep, struct mbuf *m)
3896 struct tcpcb *tp = toep->tp_tp;
3897 struct cpl_wr_ack *hdr = cplhdr(m);
3899 unsigned int credits = ntohs(hdr->credits);
3900 u32 snd_una = ntohl(hdr->snd_una);
3902 struct sockbuf *snd;
3904 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3906 inp_wlock(tp->t_inpcb);
3907 so = inp_inpcbtosocket(tp->t_inpcb);
/* Return the acknowledged credits; clamp unacked so the invariant holds. */
3908 toep->tp_wr_avail += credits;
3909 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3910 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
/* Walk the pending-WR list, consuming credits one work request at a time. */
3913 struct mbuf *p = peek_wr(toep);
3915 if (__predict_false(!p)) {
3916 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3917 "nothing pending, state %u wr_avail=%u\n",
3918 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3922 "wr_ack: p->credits=%d p->bytes=%d",
3923 p->m_pkthdr.csum_data, p->m_pkthdr.len);
/* csum_data is overloaded to carry the WR's credit count. */
3924 KASSERT(p->m_pkthdr.csum_data != 0,
3925 ("empty request still on list"));
/* Partial ack of the head WR: record remaining credits and stop. */
3927 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3930 struct tx_data_wr *w = cplhdr(p);
3932 "TID %u got %u WR credits, need %u, len %u, "
3933 "main body %u, frags %u, seq # %u, ACK una %u,"
3934 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3935 toep->tp_tid, credits, p->csum, p->len,
3936 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3937 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3938 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3940 p->m_pkthdr.csum_data -= credits;
/* Head WR fully acked: consume its credits and count its payload bytes. */
3944 credits -= p->m_pkthdr.csum_data;
3945 bytes += p->m_pkthdr.len;
3947 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3948 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3955 check_wr_invariants(tp);
/* HW snd_una must never run behind our own; log and bail if it does. */
3958 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3960 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3962 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3963 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3964 toep->tp_tid, tp->snd_una);
3969 if (tp->snd_una != snd_una) {
3970 tp->snd_una = snd_una;
3971 tp->ts_recent_age = ticks;
3974 * Keep ARP entry "minty fresh"
3976 dst_confirm(sk->sk_dst_cache);
/* Everything sent is now acked: no longer waiting for TX to idle. */
3978 if (tp->snd_una == tp->snd_nxt)
3979 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3982 snd = so_sockbuf_snd(so);
3984 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3985 snd = so_sockbuf_snd(so);
/* Drop acked bytes from the send buffer and wake any writer. */
3987 sbdrop_locked(snd, bytes);
3988 so_sowwakeup_locked(so);
/* More data buffered beyond the send pointer: push another batch. */
3991 if (snd->sb_sndptroff < snd->sb_cc)
3992 t3_push_frames(so, 0);
3995 inp_wunlock(tp->t_inpcb);
4000 * Handler for TX_DATA_ACK CPL messages.
/* CPL dispatch wrapper; the body (delegating to wr_ack()) is elided here. */
4003 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4005 struct toepcb *toep = (struct toepcb *)ctx;
4014 * Handler for TRACE_PKT CPL messages. Just sink these packets.
/* Body elided in this extract; per the comment it only frees the mbuf. */
4017 do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4024 * Reset a connection that is on a listener's SYN queue or accept queue,
4025 * i.e., one that has not had a struct socket associated with it.
4026 * Must be called from process context.
4028 * Modeled after code in inet_csk_listen_stop().
4031 t3_reset_listen_child(struct socket *child)
4033 struct tcpcb *tp = so_sototcpcb(child);
/* Fire an ABORT_REQ at the hardware for this embryonic connection. */
4035 t3_send_reset(tp->t_toe);
/*
 * Per-socket callback for so_listeners_apply_all(): reset the child if it
 * is an offloaded (TF_TOE) connection; arg is unused here.
 */
4040 t3_child_disconnect(struct socket *so, void *arg)
4042 struct tcpcb *tp = so_sototcpcb(so);
4044 if (tp->t_flags & TF_TOE) {
4045 inp_wlock(tp->t_inpcb);
4046 t3_reset_listen_child(so);
4047 inp_wunlock(tp->t_inpcb);
4052 * Disconnect offloaded established but not yet accepted connections sitting
4053 * on a server's accept_queue. We just send an ABORT_REQ at this point and
4054 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4057 t3_disconnect_acceptq(struct socket *listen_so)
/* Apply t3_child_disconnect() to every queued child (lock taken in elided line). */
4061 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4062 so_unlock(listen_so);
4067 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4071 t3_reset_synq(struct listen_ctx *lctx)
4073 struct toepcb *toep;
4076 while (!LIST_EMPTY(&lctx->synq_head)) {
4077 toep = LIST_FIRST(&lctx->synq_head);
4078 LIST_REMOVE(toep, synq_entry);
4080 t3_send_reset(toep);
4081 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4082 toepcb_release(toep);
4084 so_unlock(lctx->lso);
/*
 * Write the DDP page pods for a gather list into adapter memory, one
 * ULP_MEM_WRITE work request per pod.  `tag`/`color` identify the DDP
 * buffer; the last NUM_SENTINEL_PPODS pods are written invalid.
 */
4089 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4090 unsigned int nppods, unsigned int tag, unsigned int maxoff,
4091 unsigned int pg_off, unsigned int color)
4093 unsigned int i, j, pidx;
4096 struct ulp_mem_io *req;
4097 unsigned int tid = toep->tp_tid;
4098 const struct tom_data *td = TOM_DATA(toep->tp_toedev);
/* Byte address of the first pod inside the adapter's DDP region. */
4099 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4101 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4102 gl, nppods, tag, maxoff, pg_off, color);
4104 for (i = 0; i < nppods; ++i) {
4105 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4106 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4107 req = mtod(m, struct ulp_mem_io *);
4108 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4109 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
/* Memory-write command addressed in 32-byte units (hence >> 5). */
4111 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4112 V_ULPTX_CMD(ULP_MEM_WRITE));
4113 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4114 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
/* The pod payload immediately follows the request header. */
4116 p = (struct pagepod *)(req + 1);
4117 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4118 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4119 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4120 V_PPOD_COLOR(color));
4121 p->pp_max_offset = htonl(maxoff);
4122 p->pp_page_offset = htonl(pg_off);
/* Each pod holds 5 page addresses; consecutive pods overlap by one page. */
4124 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4125 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4126 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4128 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
4129 send_or_defer(toep, m, 0);
4130 ppod_addr += PPOD_SIZE;
4136 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4139 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4141 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4145 b->opcode = CPL_BARRIER;
4149 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4152 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4154 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156 txpkt = (struct ulp_txpkt *)req;
4157 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4158 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4159 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4160 req->cpuno = htons(cpuno);
/*
 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
 * Writes the bits selected by `mask' from `val' into 64-bit TCB word
 * `word' of connection `tid'; no reply is requested from the chip.
 */
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
		      unsigned int word, uint64_t mask, uint64_t val)
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    tid, word, mask, val);

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
	req->reply = V_NO_REPLY(1);
	/* NOTE(review): one line is elided here in this extract —
	 * presumably `req->cpu_idx = 0;'; confirm against full source. */
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);
4186 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4189 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4190 unsigned int tid, unsigned int credits)
4192 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4194 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4195 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4196 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4197 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4198 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4199 V_RX_CREDITS(credits));
/*
 * Cancel HW DDP buffer `bufidx' (0 or 1) for this connection with one
 * compound work request: a CPL_BARRIER, a CPL_SET_TCB_FIELD clearing the
 * buffer's VALID flag (and switching the active buffer), a CPL_GET_TCB to
 * read back how much data landed in the buffer, and a trailing barrier.
 * NOTE(review): the declarations of locals `wrlen' and `m' are on lines
 * elided from this extract.
 */
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

	/* Caller must hold the receive-side socket buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	/* WR header + one SET_TCB_FIELD + two BARRIERs + a GET_TCB; the
	 * continuation line of this expression is elided in this extract. */
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	/* Leading barrier keeps TP from reordering around the TCB update. */
	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no oustanding data.
	 */
	/* Buffer 0 case: invalidate buffer 0, make buffer 1 active.
	 * NOTE(review): the if/else selecting between the two calls below
	 * on `bufidx' is on lines elided from this extract. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			     V_TF_DDP_ACTIVE_BUF(1) |
			     V_TF_DDP_BUF0_VALID(1),
			     V_TF_DDP_ACTIVE_BUF(1));
	/* Buffer 1 case: invalidate buffer 1, make buffer 0 active. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			     V_TF_DDP_ACTIVE_BUF(1) |
			     V_TF_DDP_BUF1_VALID(1), 0);

	/* Read the TCB back so the caller learns the final DDP state. */
	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of oustanding CPL_GET_TCB requests
	 * (the remainder of this comment and the `p->get_tcb_count'
	 * increment are on lines elided from this extract). */
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4265 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4266 * @sk: the socket associated with the buffers
4267 * @bufidx: index of HW DDP buffer (0 or 1)
4268 * @tag0: new tag for HW buffer 0
4269 * @tag1: new tag for HW buffer 1
4270 * @len: new length for HW buf @bufidx
4272 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4273 * buffer by changing the buffer tag and length and setting the valid and
4274 * active flag accordingly. The caller must ensure the new buffer is at
4275 * least as big as the existing one. Since we typically reprogram both HW
4276 * buffers this function sets both tags for convenience. Read the TCB to
4277 * determine how made data was written into the buffer before the overlay
4281 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4282 unsigned int tag1, unsigned int len)
4286 struct work_request_hdr *wr;
4287 struct cpl_get_tcb *getreq;
4288 struct cpl_set_tcb_field *req;
4289 struct ddp_state *p = &toep->tp_ddp_state;
4291 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4292 bufidx, tag0, tag1, len);
4294 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4296 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4297 m = m_gethdr_nofail(wrlen);
4298 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4299 wr = mtod(m, struct work_request_hdr *);
4300 m->m_pkthdr.len = m->m_len = wrlen;
4304 /* Set the ATOMIC flag to make sure that TP processes the following
4305 * CPLs in an atomic manner and no wire segments can be interleaved.
4307 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4308 req = (struct cpl_set_tcb_field *)(wr + 1);
4309 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4310 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4311 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4312 V_TCB_RX_DDP_BUF0_TAG(tag0) |
4313 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4316 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4317 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4318 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4320 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4321 V_TF_DDP_PUSH_DISABLE_0(1) |
4322 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4323 V_TF_DDP_PUSH_DISABLE_0(0) |
4324 V_TF_DDP_BUF0_VALID(1));
4326 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4327 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4328 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4330 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4331 V_TF_DDP_PUSH_DISABLE_1(1) |
4332 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4333 V_TF_DDP_PUSH_DISABLE_1(0) |
4334 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4337 getreq = (struct cpl_get_tcb *)(req + 1);
4338 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4340 /* Keep track of the number of oustanding CPL_GET_TCB requests
4345 T3_TRACE4(TIDTB(sk),
4346 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4348 bufidx, tag0, tag1, len);
4350 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 * NOTE(review): the declarations of locals `wrlen' and `m', the `req++'
 * advances, and several closing braces are on lines elided from this extract.
 */
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		 unsigned int len1, unsigned int offset1,
		 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

	/* Caller must hold the receive-side socket buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	/* Size the WR for only the messages actually being sent. */
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
	    (len1 ? sizeof(*req) : 0) +
	    (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));

	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
		    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);

	/* Update the DDP flags word; `flag_mask' selects which bits change.
	 * (The `ddp_flags' value argument's line is elided in this extract.) */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
	/* When `modulate', also return accumulated RX credits and advance
	 * the receive-window update mark. */
	mk_rx_data_ack_ulp(toep,
	    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
	    toep->tp_copied_seq - toep->tp_rcv_wup);
	toep->tp_rcv_wup = toep->tp_copied_seq;

	T3_TRACE5(TIDTB(sk),
	    "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
	    len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Initialize the mbuf_wrs[] lookup table, which maps a scatter/gather
 * element count to the number of work requests needed to carry it, given
 * a per-WR capacity of `wr_len'.  Idempotent: returns early once filled.
 */
t3_init_wr_tab(unsigned int wr_len)
	if (mbuf_wrs[1])	/* already initialized */

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* Flits needed for i SGL entries: 3 per pair, 1 extra if odd. */
		int sgl_len = (3 * i) / 2 + (i & 1);

		/* NOTE(review): lines elided here in this extract —
		 * presumably an adjustment of sgl_len for the WR header;
		 * confirm against the full source. */
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
/*
 * One-time module initialization: register a handler for each CPL opcode
 * this TOE module consumes.  The tcphdr_skb setup below uses Linux APIs
 * (alloc_skb/skb_put) — NOTE(review): presumably guarded by an elided
 * preprocessor conditional in the full source; confirm.
 */
t3_init_cpl_io(void)
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
		"Chelsio TCP offload: can't allocate sk_buff\n");
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);

	/* Register the CPL dispatch table: one handler per opcode. */
	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);