1 /**************************************************************************
3 Copyright (c) 2007-2008, Chelsio Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Chelsio Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/fcntl.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
41 #include <sys/mutex.h>
42 #include <sys/sockstate.h>
43 #include <sys/sockopt.h>
44 #include <sys/socket.h>
45 #include <sys/sockbuf.h>
46 #include <sys/sysctl.h>
47 #include <sys/syslog.h>
48 #include <sys/protosw.h>
51 #if __FreeBSD_version >= 800044
52 #include <sys/vimage.h>
54 #define V_tcp_do_autosndbuf tcp_do_autosndbuf
55 #define V_tcp_autosndbuf_max tcp_autosndbuf_max
56 #define V_tcp_do_rfc1323 tcp_do_rfc1323
57 #define V_tcp_do_autorcvbuf tcp_do_autorcvbuf
58 #define V_tcp_autorcvbuf_max tcp_autorcvbuf_max
59 #define V_tcpstat tcpstat
63 #include <net/route.h>
65 #include <netinet/in.h>
66 #include <netinet/in_pcb.h>
67 #include <netinet/in_systm.h>
68 #include <netinet/in_var.h>
71 #include <cxgb_osdep.h>
72 #include <sys/mbufq.h>
74 #include <netinet/ip.h>
75 #include <netinet/tcp_var.h>
76 #include <netinet/tcp_fsm.h>
77 #include <netinet/tcp_offload.h>
78 #include <netinet/tcp_seq.h>
79 #include <netinet/tcp_syncache.h>
80 #include <netinet/tcp_timer.h>
81 #include <net/route.h>
84 #include <common/cxgb_firmware_exports.h>
85 #include <common/cxgb_t3_cpl.h>
86 #include <common/cxgb_tcb.h>
87 #include <common/cxgb_ctl_defs.h>
88 #include <cxgb_offload.h>
91 #include <machine/bus.h>
93 #include <ulp/toecore/cxgb_toedev.h>
94 #include <ulp/tom/cxgb_l2t.h>
95 #include <ulp/tom/cxgb_defs.h>
96 #include <ulp/tom/cxgb_tom.h>
97 #include <ulp/tom/cxgb_t3_ddp.h>
98 #include <ulp/tom/cxgb_toepcb.h>
99 #include <ulp/tom/cxgb_tcp.h>
100 #include <ulp/tom/cxgb_tcp_offload.h>
103 * For ULP connections HW may add headers, e.g., for digests, that aren't part
104 * of the messages sent by the host but that are part of the TCP payload and
105 * therefore consume TCP sequence space. Tx connection parameters that
106 * operate in TCP sequence space are affected by the HW additions and need to
107 * compensate for them to accurately track TCP sequence numbers. This array
108 * contains the compensating extra lengths for ULP packets. It is indexed by
109 * a packet's ULP submode.
111 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
115 * This sk_buff holds a fake header-only TCP segment that we use whenever we
116 * need to exploit SW TCP functionality that expects TCP headers, such as
117 * tcp_create_openreq_child(). It's a RO buffer that may be used by multiple
118 * CPUs without locking.
120 static struct mbuf *tcphdr_mbuf __read_mostly;
124 * Size of WRs in bytes. Note that we assume all devices we are handling have
127 static unsigned int wrlen __read_mostly;
130 * The number of WRs needed for an skb depends on the number of page fragments
131 * in the skb and whether it has any payload in its main body. This maps the
132 * length of the gather list represented by an skb into the # of necessary WRs.
134 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
137 * Max receive window supported by HW in bytes. Only a small part of it can
138 * be set through option0, the rest needs to be set through RX_DATA_ACK.
140 #define MAX_RCV_WND ((1U << 27) - 1)
143 * Min receive window. We want it to be large enough to accommodate receive
144 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
146 #define MIN_RCV_WND (24 * 1024U)
147 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
149 #define VALIDATE_SEQ 0
150 #define VALIDATE_SOCK(so)
153 #define TCP_TIMEWAIT 1
157 extern int tcp_do_autorcvbuf;
158 extern int tcp_do_autosndbuf;
159 extern int tcp_autorcvbuf_max;
160 extern int tcp_autosndbuf_max;
162 static void t3_send_reset(struct toepcb *toep);
163 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
164 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
165 static void handle_syncache_event(int event, void *arg);
/*
 * Debug wrapper around sbappendstream_locked(): walks the chains before and
 * after the append asserting that each mbuf either carries no external
 * storage or uses EXT_EXTREF, and that m_next was not left poisoned.
 * Requires the sockbuf to have SB_NOCOALESCE set (offload sockbufs set it
 * in init_offload_socket()).  Caller must hold the sockbuf lock.
 */
168 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
174 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
175 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
176 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
177 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
178 m->m_next, m->m_nextpkt, m->m_flags));
/* Same invariants re-checked on the chain being appended. */
183 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
184 !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
185 !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
186 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
187 m->m_next, m->m_nextpkt, m->m_flags));
190 KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
191 sbappendstream_locked(sb, n);
/* Post-append sanity pass over the resulting sockbuf chain. */
195 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
196 m->m_next, m->m_nextpkt, m->m_flags));
/* Returns nonzero iff the TOE device is a rev-A T3 (TOE_ID_CHELSIO_T3). */
202 is_t3a(const struct toedev *dev)
204 return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
/* Debug helper: DPRINTF the key scheduling/WR fields of an offload PCB. */
208 dump_toepcb(struct toepcb *toep)
210 DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
211 toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
212 toep->tp_mtu_idx, toep->tp_tid);
214 DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
215 toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
216 toep->tp_mss_clamp, toep->tp_flags);
219 #ifndef RTALLOC2_DEFINED
/*
 * Compatibility shim: route lookup via rtalloc1() for kernels that do not
 * already provide rtalloc2().
 */
220 static struct rtentry *
221 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
223 struct rtentry *rt = NULL;
225 if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
233 * Determine whether to send a CPL message now or defer it. A message is
234 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
235 * For connections in other states the message is sent immediately.
236 * If through_l2t is set the message is subject to ARP processing, otherwise
237 * it is sent directly.
240 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
242 struct tcpcb *tp = toep->tp_tp;
244 if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
/* Queue under the inpcb lock; drained once the TID is known. */
245 inp_wlock(tp->t_inpcb);
246 mbufq_tail(&toep->out_of_order_queue, m); // defer
247 inp_wunlock(tp->t_inpcb);
248 } else if (through_l2t)
249 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t); // send through L2T
251 cxgb_ofld_send(TOEP_T3C_DEV(toep), m); // send directly
/*
 * Build the priority value passed to m_set_priority() for CPL mbufs.
 * NOTE(review): body not visible in this listing; presumably combines the
 * CPL priority class with the connection's queue set — confirm in full source.
 */
254 static inline unsigned int
255 mkprio(unsigned int cntrl, const struct toepcb *toep)
261 * Populate a TID_RELEASE WR. The mbuf must already be properly sized.
264 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
266 struct cpl_tid_release *req;
/* Setup-priority message; fills the WR header and the CPL opcode/TID. */
268 m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
269 m->m_pkthdr.len = m->m_len = sizeof(*req);
270 req = mtod(m, struct cpl_tid_release *);
271 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
273 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
/*
 * Fill in a TX_DATA work request header at the front of mbuf m for a send of
 * 'len' payload bytes (len includes any HW ULP additions).  On the first send
 * of a connection the WR also carries init parameters (ack pages, CPU index,
 * send-buffer size in 32KB units).  Caller holds the inpcb lock.
 */
277 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
279 INIT_VNET_INET(so->so_vnet);
280 struct tcpcb *tp = so_sototcpcb(so);
281 struct toepcb *toep = tp->t_toe;
282 struct tx_data_wr *req;
285 inp_lock_assert(tp->t_inpcb);
286 snd = so_sockbuf_snd(so);
288 req = mtod(m, struct tx_data_wr *);
289 m->m_len = sizeof(*req);
290 req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
291 req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
292 /* len includes the length of any HW ULP additions */
293 req->len = htonl(len);
294 req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
295 /* V_TX_ULP_SUBMODE sets both the mode and submode */
296 req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
297 V_TX_URG(/* skb_urgent(skb) */ 0 ) |
298 V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
300 req->sndseq = htonl(tp->snd_nxt);
/* First data on this connection: attach one-time init parameters. */
301 if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
302 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
303 V_TX_CPU_IDX(toep->tp_qset));
305 /* Sendbuffer is in units of 32KB.
307 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
308 req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
310 req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
313 toep->tp_flags |= TP_DATASENT;
317 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
/*
 * Push pending send-socket-buffer data to the adapter as TX_DATA work
 * requests, consuming WR credits (toep->tp_wr_avail).  Small mbufs
 * (<= IMM_LEN) are copied inline into the WR; larger chains are described by
 * a gather list of up to TX_MAX_SEGS segments.  Tracks the sockbuf send
 * pointer/offset, accounts credits in m_pkthdr.csum_data, requests a WR
 * completion when asked or when half the credits are outstanding, and hands
 * the WR to the L2T for transmit.  Returns the number of payload bytes
 * pushed.  Caller holds the inpcb lock.
 */
320 t3_push_frames(struct socket *so, int req_completion)
322 struct tcpcb *tp = so_sototcpcb(so);
323 struct toepcb *toep = tp->t_toe;
325 struct mbuf *tail, *m0, *last;
328 int state, bytes, count, total_bytes;
329 bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
/* Nothing can be sent before the connection is established or after close. */
332 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
333 DPRINTF("tcp state=%d\n", tp->t_state);
337 state = so_state_get(so);
339 if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
340 DPRINTF("disconnecting\n");
345 inp_lock_assert(tp->t_inpcb);
347 snd = so_sockbuf_snd(so);
350 d = TOM_DATA(toep->tp_toedev);
353 last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
356 DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
357 toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
359 if (last && toep->tp_m_last == last && snd->sb_sndptroff != 0) {
360 KASSERT(tail, ("sbdrop error"));
361 last = tail = tail->m_next;
364 if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
365 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
371 toep->tp_m_last = NULL;
/* Main loop: one WR per iteration while credits and data remain. */
372 while (toep->tp_wr_avail && (tail != NULL)) {
375 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
380 * If the data in tail fits as in-line, then
381 * make an immediate data wr.
383 if (tail->m_len <= IMM_LEN) {
390 make_tx_data_wr(so, m0, bytes, tail);
391 m_append(m0, bytes, mtod(last, caddr_t));
392 KASSERT(!m0->m_next, ("bad append"));
/* Gather-list path: accumulate segments while WR credits allow. */
394 while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
395 && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
396 bytes += tail->m_len;
400 * technically an abuse to be using this for a VA
401 * but less gross than defining my own structure
402 * or calling pmap_kextract from here :-|
404 segp->ds_addr = (bus_addr_t)tail->m_data;
405 segp->ds_len = tail->m_len;
406 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
407 count, mbuf_wrs[count], tail->m_data, tail->m_len);
411 DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
412 toep->tp_wr_avail, count, mbuf_wrs[count], tail);
415 m_set_sgllen(m0, count);
416 make_tx_data_wr(so, m0, bytes, tail);
418 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
421 snd->sb_sndptr = tail;
422 toep->tp_m_last = NULL;
424 toep->tp_m_last = snd->sb_sndptr = last;
427 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
429 snd->sb_sndptroff += bytes;
430 total_bytes += bytes;
431 toep->tp_write_seq += bytes;
432 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
433 " tail=%p sndptr=%p sndptroff=%d",
434 toep->tp_wr_avail, count, mbuf_wrs[count],
435 tail, snd->sb_sndptr, snd->sb_sndptroff);
437 CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
438 " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
439 total_bytes, toep->tp_m_last, tail->m_data,
442 CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
443 " tp_m_last=%p snd_una=0x%08x",
444 total_bytes, toep->tp_m_last, tp->snd_una);
/* Trace the gather list three segments at a time. */
452 while (i < count && m_get_sgllen(m0)) {
453 if ((count - i) >= 3) {
455 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
456 " len=%d pa=0x%zx len=%d",
457 segs[i].ds_addr, segs[i].ds_len,
458 segs[i + 1].ds_addr, segs[i + 1].ds_len,
459 segs[i + 2].ds_addr, segs[i + 2].ds_len);
461 } else if ((count - i) == 2) {
463 "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
465 segs[i].ds_addr, segs[i].ds_len,
466 segs[i + 1].ds_addr, segs[i + 1].ds_len);
469 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
470 segs[i].ds_addr, segs[i].ds_len);
478 * remember credits used
480 m0->m_pkthdr.csum_data = mbuf_wrs[count];
481 m0->m_pkthdr.len = bytes;
482 toep->tp_wr_avail -= mbuf_wrs[count];
483 toep->tp_wr_unacked += mbuf_wrs[count];
/* Request a completion on demand or once half the credits are in flight. */
485 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
486 toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
487 struct work_request_hdr *wr = cplhdr(m0);
489 wr->wr_hi |= htonl(F_WR_COMPL);
490 toep->tp_wr_unacked = 0;
492 KASSERT((m0->m_pkthdr.csum_data > 0) &&
493 (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
494 m0->m_pkthdr.csum_data));
495 m0->m_type = MT_DONTFREE;
496 enqueue_wr(toep, m0);
497 DPRINTF("sending offload tx with %d bytes in %d segments\n",
499 l2t_send(cdev, m0, toep->tp_l2t);
502 return (total_bytes);
506 * Close a connection by sending a CPL_CLOSE_CON_REQ message. Cannot fail
507 * under any circumstances. We take the easy way out and always queue the
508 * message to the write_queue. We can optimize the case where the queue is
509 * already empty though the optimization is probably not worth it.
512 close_conn(struct socket *so)
515 struct cpl_close_con_req *req;
517 struct inpcb *inp = so_sotoinpcb(so);
524 tp = so_sototcpcb(so);
/* Flush any pending send data first (with a completion request). */
527 if (tp->t_state != TCPS_SYN_SENT)
528 t3_push_frames(so, 1);
/* Only one FIN may be sent per connection. */
530 if (toep->tp_flags & TP_FIN_SENT) {
537 d = TOM_DATA(toep->tp_toedev);
539 m = m_gethdr_nofail(sizeof(*req));
540 m_set_priority(m, CPL_PRIORITY_DATA);
544 toep->tp_flags |= TP_FIN_SENT;
545 req = mtod(m, struct cpl_close_con_req *);
547 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
548 req->wr.wr_lo = htonl(V_WR_TID(tid));
549 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
553 * XXX - need to defer shutdown while there is still data in the queue
556 CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
557 cxgb_ofld_send(d->cdev, m);
562 * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no-RST variant
 *
 * Rewrites the queued abort so the firmware drops the connection without
 * emitting a RST (the peer is unreachable anyway), then sends it directly.
566 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
568 struct cpl_abort_req *req = cplhdr(m);
570 req->cmd = CPL_ABORT_NO_RST;
571 cxgb_ofld_send(cdev, m);
575 * Send RX credits through an RX_DATA_ACK CPL message. If nofail is 0 we are
576 * permitted to return without sending the message in case we cannot allocate
577 * an mbuf. Returns the number of credits sent.
580 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
583 struct cpl_rx_data_ack *req;
584 struct toepcb *toep = tp->t_toe;
585 struct toedev *tdev = toep->tp_toedev;
587 m = m_gethdr_nofail(sizeof(*req));
589 DPRINTF("returning %u credits to HW\n", credits);
591 req = mtod(m, struct cpl_rx_data_ack *);
592 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
594 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* dack carries any delayed-ACK mode change flags to merge with the credits. */
595 req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
596 m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
597 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
602 * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
603 * This is only used in DDP mode, so we take the opportunity to also set the
604 * DACK mode and flush any Rx credits.
607 t3_send_rx_modulate(struct toepcb *toep)
610 struct cpl_rx_data_ack *req;
612 m = m_gethdr_nofail(sizeof(*req));
614 req = mtod(m, struct cpl_rx_data_ack *);
615 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
617 m->m_pkthdr.len = m->m_len = sizeof(*req);
619 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
/* Return all credits accumulated since the last window update. */
620 req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
622 V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
623 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
624 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
625 toep->tp_rcv_wup = toep->tp_copied_seq;
629 * Handle receipt of an urgent pointer.
 *
 * NOTE(review): the entire body is compiled out unless URGENT_DATA_SUPPORTED
 * is defined, and the code inside is unported Linux sk_buff logic (sk,
 * sock_flag, skb_peek) that would not compile on FreeBSD as-is.
632 handle_urg_ptr(struct socket *so, uint32_t urg_seq)
634 #ifdef URGENT_DATA_SUPPORTED
635 struct tcpcb *tp = so_sototcpcb(so);
637 urg_seq--; /* initially points past the urgent data, per BSD */
639 if (tp->urg_data && !after(urg_seq, tp->urg_seq))
640 return; /* duplicate pointer */
642 if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
643 !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
644 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
647 if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
648 tom_eat_skb(sk, skb, 0);
650 tp->urg_data = TCP_URG_NOTYET;
651 tp->urg_seq = urg_seq;
656 * Returns true if a socket cannot accept new Rx data.
659 so_no_receive(const struct socket *so)
661 return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
665 * Process an urgent data notification.
 *
 * Drops the notification if the socket is no longer receiving; otherwise
 * forwards the (network-order) urgent sequence to handle_urg_ptr().
668 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
670 struct cpl_rx_urg_notify *hdr = cplhdr(m);
671 struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
675 if (!so_no_receive(so))
676 handle_urg_ptr(so, ntohl(hdr->seq));
682 * Handler for RX_URG_NOTIFY CPL messages.
 *
 * ctx is the connection's toepcb, registered with the CPL dispatch table.
685 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
687 struct toepcb *toep = (struct toepcb *)ctx;
689 rx_urg_notify(toep, m);
/*
 * Whether the tunable delayed-ACK mode may be applied to this connection.
 * NOTE(review): with short-circuit evaluation the second clause can only be
 * reached when tp_ulp_mode == 0, so `tp_ulp_mode == ULP_MODE_TCPDDP && ...`
 * is dead unless ULP_MODE_TCPDDP is 0 — confirm the intended condition
 * (likely the T3-revision gate should apply to the DDP case).
 */
694 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
696 return (toep->tp_ulp_mode ||
697 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
698 dev->tod_ttid >= TOE_ID_CHELSIO_T3));
702 * Set of states for which we should return RX credits.
704 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
707 * Called after some received data has been read. It returns RX credits
708 * to the HW for the amount of data processed.
711 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
713 struct toepcb *toep = tp->t_toe;
716 int dack_mode, must_send, read;
717 u32 thres, credits, dack = 0;
720 so = inp_inpcbtosocket(tp->t_inpcb);
721 rcv = so_sockbuf_rcv(so);
/* Only return credits in ESTABLISHED/FIN_WAIT_1/FIN_WAIT_2. */
723 if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
724 (tp->t_state == TCPS_FIN_WAIT_2))) {
727 toep->tp_copied_seq += copied;
734 inp_lock_assert(tp->t_inpcb);
738 toep->tp_copied_seq += copied;
/* Infer bytes consumed from the drop in enqueued-byte accounting. */
740 read = toep->tp_enqueued_bytes - rcv->sb_cc;
741 toep->tp_copied_seq += read;
743 credits = toep->tp_copied_seq - toep->tp_rcv_wup;
744 toep->tp_enqueued_bytes = rcv->sb_cc;
/* Clamp runaway credit counts (indicates an accounting bug). */
747 if (credits > rcv->sb_mbmax) {
748 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
749 toep->tp_copied_seq, toep->tp_rcv_wup, credits);
750 credits = rcv->sb_mbmax;
755 * XXX this won't accurately reflect credit return - we need
756 * to look at the difference between the amount that has been
757 * put in the recv sockbuf and what is there now
760 if (__predict_false(!credits))
763 dev = toep->tp_toedev;
764 thres = TOM_TUNABLE(dev, rx_credit_thres);
766 if (__predict_false(thres == 0))
/* Piggy-back a delayed-ACK mode change when the tunable differs. */
769 if (is_delack_mode_valid(dev, toep)) {
770 dack_mode = TOM_TUNABLE(dev, delack);
771 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
772 u32 r = tp->rcv_nxt - toep->tp_delack_seq;
774 if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
775 dack = F_RX_DACK_CHANGE |
776 V_RX_DACK_MODE(dack_mode);
779 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
782 * For coalescing to work effectively ensure the receive window has
783 * at least 16KB left.
785 must_send = credits + 16384 >= tp->rcv_wnd;
787 if (must_send || credits >= thres)
788 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
/* toe_usrreqs disconnect hook: initiate an offloaded close for tp's socket. */
792 cxgb_toe_disconnect(struct tcpcb *tp)
796 DPRINTF("cxgb_toe_disconnect\n");
798 so = inp_inpcbtosocket(tp->t_inpcb);
/* toe_usrreqs reset hook: abort the offloaded connection and clear TF_TOE. */
804 cxgb_toe_reset(struct tcpcb *tp)
806 struct toepcb *toep = tp->t_toe;
813 tp->t_flags &= ~TF_TOE;
/* toe_usrreqs send hook: push pending sockbuf data, requesting completion. */
820 cxgb_toe_send(struct tcpcb *tp)
824 DPRINTF("cxgb_toe_send\n");
825 dump_toepcb(tp->t_toe);
827 so = inp_inpcbtosocket(tp->t_inpcb);
828 t3_push_frames(so, 1);
/* toe_usrreqs rcvd hook: return RX credits after the app consumed data. */
833 cxgb_toe_rcvd(struct tcpcb *tp)
836 inp_lock_assert(tp->t_inpcb);
838 t3_cleanup_rbuf(tp, 0);
/* toe_usrreqs detach hook: tear down offload state and clear TF_TOE. */
844 cxgb_toe_detach(struct tcpcb *tp)
849 * XXX how do we handle teardown in the SYN_SENT state?
852 inp_lock_assert(tp->t_inpcb);
859 tp->t_flags &= ~TF_TOE;
/*
 * TOE user-request dispatch table wired into the tcpcb via install_offload_ops().
 * Fix: `.tu_detach` was listed twice (a duplicate designated initializer —
 * harmless at runtime since the later one overrides, but it draws
 * -Woverride-init warnings and hides real mistakes); the duplicate is removed.
 */
864 static struct toe_usrreqs cxgb_toe_usrreqs = {
865 .tu_disconnect = cxgb_toe_disconnect,
866 .tu_reset = cxgb_toe_reset,
867 .tu_send = cxgb_toe_send,
868 .tu_rcvd = cxgb_toe_rcvd,
869 .tu_detach = cxgb_toe_detach,
871 .tu_syncache_event = handle_syncache_event,
876 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
877 uint64_t mask, uint64_t val, int no_reply)
879 struct cpl_set_tcb_field *req;
881 CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
882 toep->tp_tid, word, mask, val);
884 req = mtod(m, struct cpl_set_tcb_field *);
885 m->m_pkthdr.len = m->m_len = sizeof(*req);
886 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
888 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
889 req->reply = V_NO_REPLY(no_reply);
891 req->word = htons(word);
892 req->mask = htobe64(mask);
893 req->val = htobe64(val);
895 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
896 send_or_defer(toep, m, 0);
/*
 * Allocate an mbuf and issue a no-reply SET_TCB_FIELD for this connection.
 * Skipped once the connection is closed or being aborted.
 * NOTE(review): the log string below misspells "setting" — runtime string,
 * left untouched here.
 */
900 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
903 struct tcpcb *tp = toep->tp_tp;
908 if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
909 printf("not seting field\n");
913 m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
915 __set_tcb_field(toep, m, word, mask, val, 1);
919 * Set one of the t_flags bits in the TCB.
922 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
925 t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
929 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 * HW Nagle is enabled iff the socket does not have TF_NODELAY set.
932 t3_set_nagle(struct toepcb *toep)
934 struct tcpcb *tp = toep->tp_tp;
936 set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
940 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
943 t3_set_keepalive(struct toepcb *toep, int on_off)
946 set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
/* Toggle HW receive coalescing for this connection via the TCB t_flags. */
950 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
952 set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
956 t3_set_dack_mss(struct toepcb *toep, int on_off)
959 set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
963 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 * The TOS value is read from the connection's inpcb.
966 t3_set_tos(struct toepcb *toep)
968 int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
970 t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
976 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
977 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
978 * set the PSH bit in the last segment, which would trigger delivery.]
979 * We work around the issue by setting a DDP buffer in a partial placed state,
980 * which guarantees that TP will schedule a timer.
982 #define TP_DDP_TIMER_WORKAROUND_MASK\
983 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
984 ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
985 V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
986 #define TP_DDP_TIMER_WORKAROUND_VAL\
987 (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
988 ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
/*
 * Enable or disable DDP on a connection.  When disabling, also apply the
 * TP timer workaround above so receive data is still pushed to the host.
 */
992 t3_enable_ddp(struct toepcb *toep, int on)
996 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
999 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
1001 TP_DDP_TIMER_WORKAROUND_MASK,
1003 TP_DDP_TIMER_WORKAROUND_VAL);
1008 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
1010 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1011 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1016 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1020 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1021 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1022 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1023 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1024 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1026 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1027 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1028 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1029 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1030 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
/*
 * Select a congestion-control algorithm by name.  Compiled out unless
 * CONGESTION_CONTROL_SUPPORTED is defined; looks the name up in t3_cong_ops.
 */
1034 t3_set_cong_control(struct socket *so, const char *name)
1036 #ifdef CONGESTION_CONTROL_SUPPORTED
1039 for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
1040 if (!strcmp(name, t3_cong_ops[cong_algo].name))
1043 if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
/*
 * Request a snapshot of this connection's TCB via CPL_GET_TCB, directing the
 * reply to the connection's queue set.  The request is deferred while in
 * SYN_SENT (no TID yet).  Caller holds the inpcb lock.
 */
1050 t3_get_tcb(struct toepcb *toep)
1052 struct cpl_get_tcb *req;
1053 struct tcpcb *tp = toep->tp_tp;
1054 struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1059 inp_lock_assert(tp->t_inpcb);
1060 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1061 req = mtod(m, struct cpl_get_tcb *);
1062 m->m_pkthdr.len = m->m_len = sizeof(*req);
1063 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1065 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1066 req->cpuno = htons(toep->tp_qset);
1068 if (tp->t_state == TCPS_SYN_SENT)
1069 mbufq_tail(&toep->out_of_order_queue, m); // defer
1071 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/* Register toep under its TID in the adapter's TID table. */
1076 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1081 cxgb_insert_tid(d->cdev, d->client, toep, tid);
1085 * find_best_mtu - find the entry in the MTU table closest to an MTU
1087 * @mtu: the target MTU
1089 * Returns the index of the value in the MTU table that is closest to but
1090 * does not exceed the target MTU.
1093 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
/* Linear scan; the MTU table is sorted ascending. */
1097 while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
/*
 * Choose the HW MTU-table index for a path MTU and set tp->t_maxseg
 * accordingly (the constant 40 is the IPv4 + TCP header overhead).
 * Returns the selected MTU-table index.
 */
1103 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1108 struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1111 tp->t_maxseg = pmtu - 40;
1112 if (tp->t_maxseg < td->mtus[0] - 40)
1113 tp->t_maxseg = td->mtus[0] - 40;
1114 idx = find_best_mtu(td, tp->t_maxseg + 40);
1116 tp->t_maxseg = td->mtus[idx] - 40;
1118 idx = find_best_mtu(td, pmtu);
/* Release an active-open TID and drop the toepcb reference it held. */
1124 free_atid(struct t3cdev *cdev, unsigned int tid)
1126 struct toepcb *toep = cxgb_free_atid(cdev, tid);
1129 toepcb_release(toep);
1133 * Release resources held by an offload connection (TID, L2T entry, etc.)
1136 t3_release_offload_resources(struct toepcb *toep)
1138 struct tcpcb *tp = toep->tp_tp;
1139 struct toedev *tdev = toep->tp_toedev;
1140 struct t3cdev *cdev;
1142 unsigned int tid = toep->tp_tid;
1143 struct sockbuf *rcv;
1145 CTR0(KTR_TOM, "t3_release_offload_resources");
1150 cdev = TOEP_T3C_DEV(toep);
1155 t3_release_ddp_resources(toep);
1157 #ifdef CTRL_SKB_CACHE
1158 kfree_skb(CTRL_SKB_CACHE(tp));
1159 CTRL_SKB_CACHE(tp) = NULL;
/* Discard any work requests still awaiting HW acknowledgment. */
1162 if (toep->tp_wr_avail != toep->tp_wr_max) {
1163 purge_wr_queue(toep);
1164 reset_wr_list(toep);
1168 l2t_release(L2DATA(cdev), toep->tp_l2t);
1169 toep->tp_l2t = NULL;
1173 inp_lock_assert(tp->t_inpcb);
1174 so = inp_inpcbtosocket(tp->t_inpcb);
1175 rcv = so_sockbuf_rcv(so);
1177 * cancel any offloaded reads
1182 tp->t_flags &= ~TF_TOE;
1183 if (toep->tp_ddp_state.user_ddp_pending) {
1184 t3_cancel_ubuf(toep, rcv);
1185 toep->tp_ddp_state.user_ddp_pending = 0;
1187 so_sorwakeup_locked(so);
/* SYN_SENT connections still hold an atid rather than a real TID. */
1191 if (toep->tp_state == TCPS_SYN_SENT) {
1192 free_atid(cdev, tid);
1194 __skb_queue_purge(&tp->out_of_order_queue);
1196 } else { // we have TID
1197 cxgb_remove_tid(cdev, toep, tid);
1198 toepcb_release(toep);
1201 log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
/*
 * Point an established socket at the TOE: install offload socket ops,
 * mark the tcpcb TF_TOE, and wire in the cxgb toe_usrreqs table.
 */
1206 install_offload_ops(struct socket *so)
1208 struct tcpcb *tp = so_sototcpcb(so);
1210 KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1212 t3_install_socket_ops(so);
1213 tp->t_flags |= TF_TOE;
1214 tp->t_tu = &cxgb_toe_usrreqs;
1218 * Determine the receive window scaling factor given a target max
 * receive space.  Capped at MAX_RCV_WND; scaling only applied when
 * RFC 1323 is enabled (max wscale 14).
1222 select_rcv_wscale(int space)
1224 INIT_VNET_INET(so->so_vnet);
1227 if (space > MAX_RCV_WND)
1228 space = MAX_RCV_WND;
1230 if (V_tcp_do_rfc1323)
1231 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1237 * Determine the receive window size for a socket.
1239 static unsigned long
1240 select_rcv_wnd(struct toedev *dev, struct socket *so)
1242 INIT_VNET_INET(so->so_vnet);
1243 struct tom_data *d = TOM_DATA(dev);
1245 unsigned int max_rcv_wnd;
1246 struct sockbuf *rcv;
1248 rcv = so_sockbuf_rcv(so);
/* Auto-sizing sockets get the autorcvbuf maximum, others their hiwat. */
1250 if (V_tcp_do_autorcvbuf)
1251 wnd = V_tcp_autorcvbuf_max;
1253 wnd = rcv->sb_hiwat;
1258 * For receive coalescing to work effectively we need a receive window
1259 * that can accommodate a coalesced segment.
1261 if (wnd < MIN_RCV_WND)
/* Pre-T3C parts cap the window relative to the RX page size. */
1265 max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1266 (uint32_t)d->rx_page_size * 23 :
1269 return min(wnd, max_rcv_wnd);
1273 * Assign offload parameters to some socket fields. This code is used by
1274 * both active and passive opens.
1277 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1278 struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1280 struct tcpcb *tp = so_sototcpcb(so);
1281 struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1282 struct sockbuf *snd, *rcv;
1285 SOCK_LOCK_ASSERT(so);
1288 snd = so_sockbuf_snd(so);
1289 rcv = so_sockbuf_rcv(so);
1291 log(LOG_INFO, "initializing offload socket\n");
1293 * We either need to fix push frames to work with sbcompress
1294 * or we need to add this
1296 snd->sb_flags |= SB_NOCOALESCE;
1297 rcv->sb_flags |= SB_NOCOALESCE;
1301 toep->tp_toedev = dev;
/* Start with the full per-connection WR credit allowance. */
1305 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1306 toep->tp_wr_unacked = 0;
1307 toep->tp_delack_mode = 0;
1309 toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1314 tp->rcv_wnd = select_rcv_wnd(dev, so);
/* DDP only when enabled, not opted out, and the window is big enough. */
1316 toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1317 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1318 toep->tp_qset_idx = 0;
1320 reset_wr_list(toep);
1321 DPRINTF("initialization done\n");
1325 * The next two functions calculate the option 0 value for a socket.
 * High word: Nagle, keepalive, TCAM bypass, window scale, MSS index.
1327 static inline unsigned int
1328 calc_opt0h(struct socket *so, int mtu_idx)
1330 struct tcpcb *tp = so_sototcpcb(so);
1331 int wscale = select_rcv_wscale(tp->rcv_wnd);
1333 return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1334 V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1335 V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
/* Option-0 low word: TOS, ULP mode, receive buffer size in KB (capped). */
1338 static inline unsigned int
1339 calc_opt0l(struct socket *so, int ulp_mode)
1341 struct tcpcb *tp = so_sototcpcb(so);
1344 val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1345 V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1347 DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
/* Option-2 word: congestion-control flavor, taken from the cong_alg tunable. */
1351 static inline unsigned int
1352 calc_opt2(const struct socket *so, struct toedev *dev)
1356 flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1358 return (V_FLAVORS_VALID(flv_valid) |
1359 V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
/*
 * Sum the WR credits (stored in m_pkthdr.csum_data by t3_push_frames)
 * of all work requests still queued awaiting HW acknowledgment.
 */
1364 count_pending_wrs(const struct toepcb *toep)
1366 const struct mbuf *m;
1369 wr_queue_walk(toep, m)
1370 n += m->m_pkthdr.csum_data;
1376 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1380 mk_act_open_req(struct socket *so, struct mbuf *m,
1381 unsigned int atid, const struct l2t_entry *e)
1383 struct cpl_act_open_req *req;
1384 struct inpcb *inp = so_sotoinpcb(so);
1385 struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1386 struct toepcb *toep = tp->t_toe;
1387 struct toedev *tdev = toep->tp_toedev;
1389 m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1391 req = mtod(m, struct cpl_act_open_req *);
1392 m->m_pkthdr.len = m->m_len = sizeof(*req);
1394 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1396 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1397 inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1399 req->local_port = inp->inp_lport;
1400 req->peer_port = inp->inp_fport;
1401 memcpy(&req->local_ip, &inp->inp_laddr, 4);
1402 memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1404 req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1405 V_TX_CHANNEL(e->smt_idx));
1406 req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1408 req->opt2 = htonl(calc_opt2(so, tdev));
/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		/* The hardware found our 4-tuple already in its TCAM. */
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}
1436 fail_act_open(struct toepcb *toep, int errno)
1438 struct tcpcb *tp = toep->tp_tp;
1440 t3_release_offload_resources(toep);
1442 inp_wunlock(tp->t_inpcb);
1443 tcp_offload_drop(tp, errno);
1447 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1452 * Handle active open failures.
1455 active_open_failed(struct toepcb *toep, struct mbuf *m)
1457 struct cpl_act_open_rpl *rpl = cplhdr(m);
1460 if (toep->tp_tp == NULL)
1463 inp = toep->tp_tp->t_inpcb;
1466 * Don't handle connection retry for now
1469 struct inet_connection_sock *icsk = inet_csk(sk);
1471 if (rpl->status == CPL_ERR_CONN_EXIST &&
1472 icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1473 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1474 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1481 * drops the inpcb lock
1483 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1491 * Return whether a failed active open has allocated a TID
1494 act_open_has_tid(int status)
1496 return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1497 status != CPL_ERR_ARP_MISS;
1501 * Process an ACT_OPEN_RPL CPL message.
1504 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1506 struct toepcb *toep = (struct toepcb *)ctx;
1507 struct cpl_act_open_rpl *rpl = cplhdr(m);
1509 if (cdev->type != T3A && act_open_has_tid(rpl->status))
1510 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1512 active_open_failed(toep, m);
1517 * Handle an ARP failure for an active open. XXX purge ofo queue
1519 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1520 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1521 * check SOCK_DEAD or sk->sk_sock. Or maybe generate the error here but don't
1522 * free the atid. Hmm.
1526 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1528 struct toepcb *toep = m_get_toep(m);
1529 struct tcpcb *tp = toep->tp_tp;
1530 struct inpcb *inp = tp->t_inpcb;
1534 if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1536 * drops the inpcb lock
1538 fail_act_open(so, EHOSTUNREACH);
1539 printf("freeing %p\n", m);
1547 * Send an active open request.
/*
 * NOTE(review): this extract is garbled -- the function braces, the
 * declarations of `m' and `atid', the error checks after toepcb_alloc()/
 * t3_l2t_get(), and the error-path labels appear to be missing.  Diff
 * against the complete cxgb_cpl_io.c before editing.
 */
1550 t3_connect(struct toedev *tdev, struct socket *so,
1551 struct rtentry *rt, struct sockaddr *nam)
1554 struct l2t_entry *e;
1555 struct tom_data *d = TOM_DATA(tdev);
1556 struct inpcb *inp = so_sotoinpcb(so);
1557 struct tcpcb *tp = intotcpcb(inp);
1558 struct toepcb *toep; /* allocated by init_offload_socket */
/* Allocate the offload control block and a hardware ATID for the SYN. */
1562 toep = toepcb_alloc();
1566 if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
/* Resolve an L2 table entry for the next hop. */
1569 e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1573 inp_lock_assert(inp);
/* M_WAITOK: may sleep, cannot return NULL. */
1574 m = m_gethdr(MT_DATA, M_WAITOK);
1577 m->m_toe.mt_toepcb = tp->t_toe;
1578 set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
/* Bind the socket to the TOE device, then build and send the open request. */
1582 init_offload_socket(so, tdev, atid, e, rt, toep);
1584 install_offload_ops(so);
1586 mk_act_open_req(so, m, atid, e);
1591 m_set_toep(m, tp->t_toe);
1593 toep->tp_state = TCPS_SYN_SENT;
1594 l2t_send(d->cdev, (struct mbuf *)m, e);
/* tp_ulp_mode was set to ULP_MODE_TCPDDP (or 0) by init_offload_socket. */
1596 if (toep->tp_ulp_mode)
1597 t3_enable_ddp(toep, 0);
/* Error path: release the ATID and fail the connect. */
1601 printf("failing connect - free atid\n");
1603 free_atid(d->cdev, atid);
1605 printf("return ENOMEM\n");
1610 * Send an ABORT_REQ message. Cannot fail. This routine makes sure we do
1611 * not send multiple ABORT_REQs for the same connection and also that we do
1612 * not try to send a message after the connection has closed. Returns 1 if
1613 * an ABORT_REQ wasn't generated after all, 0 otherwise.
/*
 * NOTE(review): garbled extract -- braces, the `struct mbuf *m' declaration,
 * the NULL-tp guards, the send-queue flush, the req->cmd assignment and the
 * else branch of the final conditional appear to be missing.  Compare with
 * the full source before editing.
 */
1616 t3_send_reset(struct toepcb *toep)
1619 struct cpl_abort_req *req;
1620 unsigned int tid = toep->tp_tid;
1621 int mode = CPL_ABORT_SEND_RST;
1622 struct tcpcb *tp = toep->tp_tp;
1623 struct toedev *tdev = toep->tp_toedev;
1624 struct socket *so = NULL;
1626 struct sockbuf *snd;
1629 inp_lock_assert(tp->t_inpcb);
1630 so = inp_inpcbtosocket(tp->t_inpcb);
/* Do not abort twice, and do not abort after shutdown. */
1633 if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1636 toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1638 snd = so_sockbuf_snd(so);
1639 /* Purge the send queue so we don't send anything after an abort. */
/* On T3A a post-close abort needs an explicit mode flag. */
1642 if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1643 mode |= CPL_ABORT_POST_CLOSE_REQ;
1645 m = m_gethdr_nofail(sizeof(*req));
1646 m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1647 set_arp_failure_handler(m, abort_arp_failure);
1649 req = mtod(m, struct cpl_abort_req *);
1650 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1651 req->wr.wr_lo = htonl(V_WR_TID(tid));
1652 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1653 req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1654 req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
/* While still in SYN_SENT the abort must wait for the open to complete. */
1656 if (tp && (tp->t_state == TCPS_SYN_SENT))
1657 mbufq_tail(&toep->out_of_order_queue, m); // defer
1659 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1663 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1668 if (sopt->sopt_name == IP_OPTIONS)
1669 return (ENOPROTOOPT);
1671 if (sopt->sopt_name != IP_TOS)
1672 return (EOPNOTSUPP);
1674 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1679 if (optval > IPTOS_PREC_CRITIC_ECP)
1682 inp = so_sotoinpcb(so);
1684 inp_ip_tos_set(inp, optval);
1686 inp->inp_ip_tos = optval;
1688 t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
/*
 * NOTE(review): garbled extract -- braces, the declarations of err/optval/
 * oldval/copied/tp/inp, several error checks and the lock/unlock calls
 * appear to be missing.  Compare with the full source before editing.
 */
1695 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
/* Only TCP_CONGESTION and TCP_NODELAY are handled here. */
1700 if (sopt->sopt_name != TCP_CONGESTION &&
1701 sopt->sopt_name != TCP_NODELAY)
1702 return (EOPNOTSUPP);
1704 if (sopt->sopt_name == TCP_CONGESTION) {
1705 char name[TCP_CA_NAME_MAX];
1706 int optlen = sopt->sopt_valsize;
1709 if (sopt->sopt_dir == SOPT_GET) {
1710 KASSERT(0, ("unimplemented"));
1711 return (EOPNOTSUPP);
/* Copy the algorithm name in from userland, NUL-terminated. */
1717 err = copyinstr(sopt->sopt_val, name,
1718 min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1724 tp = so_sototcpcb(so);
1726 * XXX I need to revisit this
/* Push the congestion-control choice down to the hardware. */
1728 if ((err = t3_set_cong_control(so, name)) == 0) {
1729 #ifdef CONGESTION_CONTROL_SUPPORTED
1730 tp->t_cong_control = strdup(name, M_CXGB);
/* TCP_NODELAY path: toggle TF_NODELAY and notify the TOE on change. */
1739 if (sopt->sopt_dir == SOPT_GET)
1740 return (EOPNOTSUPP);
1742 err = sooptcopyin(sopt, &optval, sizeof optval,
1748 inp = so_sotoinpcb(so);
1750 tp = inp_inpcbtotcpcb(inp);
1752 oldval = tp->t_flags;
1754 tp->t_flags |= TF_NODELAY;
1756 tp->t_flags &= ~TF_NODELAY;
1760 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1761 t3_set_nagle(tp->t_toe);
1769 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1773 if (sopt->sopt_level != IPPROTO_TCP)
1774 err = t3_ip_ctloutput(so, sopt);
1776 err = t3_tcp_ctloutput(so, sopt);
1778 if (err != EOPNOTSUPP)
1781 return (tcp_ctloutput(so, sopt));
/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	/* NOTE(review): body reconstructed; always request RST for now. */
	return (1);
}
1795 * Handles Rx data that arrives in a state where the socket isn't accepting
1799 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1802 if (need_rst_on_excess_rx(toep) &&
1803 !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1804 t3_send_reset(toep);
1809 * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1810 * by getting the DDP offset from the TCB.
/*
 * NOTE(review): heavily garbled extract -- braces, the declarations of
 * tp/so/t/tcb/state, several lock/unlock calls, early returns and #ifdef
 * T3_TRACE guards appear to be missing, and the sk/printk lines are Linux
 * residue.  Compare with the full source before editing.
 */
1813 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1815 struct ddp_state *q = &toep->tp_ddp_state;
1816 struct ddp_buf_state *bsp;
1817 struct cpl_get_tcb_rpl *hdr;
1818 unsigned int ddp_offset;
1821 struct sockbuf *rcv;
1828 so = inp_inpcbtosocket(tp->t_inpcb);
1830 inp_lock_assert(tp->t_inpcb);
1831 rcv = so_sockbuf_rcv(so);
1834 /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1835 * We really need a cookie in order to dispatch the RPLs.
1839 /* It is a possible that a previous CPL already invalidated UBUF DDP
1840 * and moved the cur_buf idx and hence no further processing of this
1841 * skb is required. However, the app might be sleeping on
1842 * !q->get_tcb_count and we need to wake it up.
1844 if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1845 int state = so_state_get(so);
1848 if (__predict_true((state & SS_NOFDREF) == 0))
1849 so_sorwakeup_locked(so);
1851 sockbuf_unlock(rcv);
/* Extract the current DDP offset for the active buffer from the raw TCB. */
1856 bsp = &q->buf_state[q->cur_buf];
1858 tcb = (__be64 *)(hdr + 1);
1859 if (q->cur_buf == 0) {
1860 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1861 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1863 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1864 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1866 ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
/* The amount newly placed is the offset delta since the last completion. */
1867 m->m_cur_offset = bsp->cur_offset;
1868 bsp->cur_offset = ddp_offset;
1869 m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1872 "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1873 q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1874 KASSERT(ddp_offset >= m->m_cur_offset,
1875 ("ddp_offset=%u less than cur_offset=%u",
1876 ddp_offset, m->m_cur_offset));
/* Debug-only: decode DDP flags, rcv_nxt and header offset from the TCB. */
1880 unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1882 t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1883 ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1885 t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1886 rcv_nxt = t >> S_TCB_RCV_NXT;
1887 rcv_nxt &= M_TCB_RCV_NXT;
1889 t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1890 rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1891 rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1893 T3_TRACE2(TIDTB(sk),
1894 "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1895 ddp_flags, rcv_nxt - rx_hdr_offset);
1897 "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1898 tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1900 "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1901 rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1903 "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1904 q->buf_state[0].flags, q->buf_state[1].flags);
/* Data arrived for a socket that can no longer receive: reset and bail. */
1908 if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1909 handle_excess_rx(toep, m);
1914 if ((int)m->m_pkthdr.len < 0) {
1915 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1918 if (bsp->flags & DDP_BF_NOCOPY) {
1921 "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1923 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1924 printk("!cancel_ubuf");
1925 t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1928 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1929 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1931 } else if (bsp->flags & DDP_BF_NOFLIP) {
1933 m->m_ddp_flags = 1; /* always a kernel buffer */
1935 /* now HW buffer carries a user buffer */
1936 bsp->flags &= ~DDP_BF_NOFLIP;
1937 bsp->flags |= DDP_BF_NOCOPY;
1939 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1940 * any new data in which case we're done. If in addition the
1941 * offset is 0, then there wasn't a completion for the kbuf
1942 * and we need to decrement the posted count.
1944 if (m->m_pkthdr.len == 0) {
1945 if (ddp_offset == 0) {
1947 bsp->flags |= DDP_BF_NODATA;
1949 sockbuf_unlock(rcv);
1954 sockbuf_unlock(rcv);
1956 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1957 * but it got here way late and nobody cares anymore.
/* Hand the placed data to the socket as a DDP-tagged mbuf. */
1963 m->m_ddp_gl = (unsigned char *)bsp->gl;
1964 m->m_flags |= M_DDP;
1965 m->m_seq = tp->rcv_nxt;
1966 tp->rcv_nxt += m->m_pkthdr.len;
1967 tp->t_rcvtime = ticks;
1968 CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1969 m->m_seq, q->cur_buf, m->m_pkthdr.len);
1970 if (m->m_pkthdr.len == 0) {
1971 q->user_ddp_pending = 0;
/* Wake any reader unless the file descriptor is already gone. */
1976 state = so_state_get(so);
1977 if (__predict_true((state & SS_NOFDREF) == 0))
1978 so_sorwakeup_locked(so);
1980 sockbuf_unlock(rcv);
1984 * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
1985 * in that case they are similar to DDP completions.
1988 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1990 struct toepcb *toep = (struct toepcb *)ctx;
1992 /* OK if socket doesn't exist */
1994 printf("null toep in do_get_tcb_rpl\n");
1995 return (CPL_RET_BUF_DONE);
1998 inp_wlock(toep->tp_tp->t_inpcb);
1999 tcb_rpl_as_ddp_complete(toep, m);
2000 inp_wunlock(toep->tp_tp->t_inpcb);
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so'
 * declaration, the early return when no new data arrived, the sockbuf
 * lock acquisition and the buffer-flip statement appear to be missing.
 * Compare with the full source before editing.
 */
2006 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2008 struct tcpcb *tp = toep->tp_tp;
2010 struct ddp_state *q;
2011 struct ddp_buf_state *bsp;
2012 struct cpl_rx_data *hdr = cplhdr(m);
2013 unsigned int rcv_nxt = ntohl(hdr->seq);
2014 struct sockbuf *rcv;
/* Nothing was placed beyond what we already consumed. */
2016 if (tp->rcv_nxt == rcv_nxt)
2019 inp_lock_assert(tp->t_inpcb);
2020 so = inp_inpcbtosocket(tp->t_inpcb);
2021 rcv = so_sockbuf_rcv(so);
2024 q = &toep->tp_ddp_state;
2025 bsp = &q->buf_state[q->cur_buf];
2026 KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2027 rcv_nxt, tp->rcv_nxt));
/* Bytes DMA'd directly into the DDP buffer since our last rcv_nxt. */
2028 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2029 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2030 CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2031 rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2034 if ((int)m->m_pkthdr.len < 0) {
2035 t3_ddp_error(so, "handle_ddp_data: neg len");
/* Tag the mbuf as DDP-placed data belonging to the current buffer. */
2038 m->m_ddp_gl = (unsigned char *)bsp->gl;
2039 m->m_flags |= M_DDP;
2040 m->m_cur_offset = bsp->cur_offset;
2041 m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2042 if (bsp->flags & DDP_BF_NOCOPY)
2043 bsp->flags &= ~DDP_BF_NOCOPY;
2045 m->m_seq = tp->rcv_nxt;
2046 tp->rcv_nxt = rcv_nxt;
2047 bsp->cur_offset += m->m_pkthdr.len;
2048 if (!(bsp->flags & DDP_BF_NOFLIP))
2051 * For now, don't re-enable DDP after a connection fell out of DDP
2054 q->ubuf_ddp_ready = 0;
2055 sockbuf_unlock(rcv);
2059 * Process new data received for a connection.
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `state' declarations, the mbuf free on bad sequence numbers, the
 * sockbuf lock acquisition and sbappendstream call appear to be missing.
 * Compare with the full source before editing.
 */
2062 new_rx_data(struct toepcb *toep, struct mbuf *m)
2064 struct cpl_rx_data *hdr = cplhdr(m);
2065 struct tcpcb *tp = toep->tp_tp;
2067 struct sockbuf *rcv;
2069 int len = be16toh(hdr->len);
2071 inp_wlock(tp->t_inpcb);
2073 so = inp_inpcbtosocket(tp->t_inpcb);
/* Socket can no longer receive: reset/consume and bail. */
2075 if (__predict_false(so_no_receive(so))) {
2076 handle_excess_rx(toep, m);
2077 inp_wunlock(tp->t_inpcb);
/* In DDP mode this RX_DATA may carry DDP bookkeeping too. */
2082 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2083 handle_ddp_data(toep, m);
2085 m->m_seq = ntohl(hdr->seq);
2086 m->m_ulp_mode = 0; /* for iSCSI */
2089 if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2091 "%s: TID %u: Bad sequence number %u, expected %u\n",
2092 toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2095 inp_wunlock(tp->t_inpcb);
/* Strip the CPL header; only payload goes to the socket buffer. */
2099 m_adj(m, sizeof(*hdr));
2101 #ifdef URGENT_DATA_SUPPORTED
2103 * We don't handle urgent data yet
2105 if (__predict_false(hdr->urg))
2106 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2107 if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2108 tp->urg_seq - tp->rcv_nxt < skb->len))
2109 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
/* Track hardware delayed-ACK mode changes. */
2112 if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2113 toep->tp_delack_mode = hdr->dack_mode;
2114 toep->tp_delack_seq = tp->rcv_nxt;
2116 CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2117 m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2119 if (len < m->m_pkthdr.len)
2120 m->m_pkthdr.len = m->m_len = len;
2122 tp->rcv_nxt += m->m_pkthdr.len;
2123 tp->t_rcvtime = ticks;
2124 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2126 "new_rx_data: seq 0x%x len %u",
2127 m->m_seq, m->m_pkthdr.len);
2128 inp_wunlock(tp->t_inpcb);
2129 rcv = so_sockbuf_rcv(so);
2133 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2139 * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2142 KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2144 ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2145 so, rcv->sb_cc, rcv->sb_mbmax));
2149 CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2150 rcv->sb_cc, rcv->sb_mbcnt);
/* Wake any reader unless the file descriptor is already gone. */
2152 state = so_state_get(so);
2153 if (__predict_true((state & SS_NOFDREF) == 0))
2154 so_sorwakeup_locked(so);
2156 sockbuf_unlock(rcv);
2160 * Handler for RX_DATA CPL messages.
2163 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2165 struct toepcb *toep = (struct toepcb *)ctx;
2167 DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2169 new_rx_data(toep, m);
/*
 * NOTE(review): garbled extract -- braces, the tp/so/hdr initializations,
 * the sockbuf lock acquisition/append, and several #ifdef T3_TRACE guards
 * appear to be missing; the skb/tcp_hdr lines are Linux residue.  Compare
 * with the full source before editing.
 */
2175 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2178 struct ddp_state *q;
2179 struct ddp_buf_state *bsp;
2180 struct cpl_rx_data_ddp *hdr;
2182 unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2184 unsigned int delack_mode;
2185 struct sockbuf *rcv;
2188 inp_wlock(tp->t_inpcb);
2189 so = inp_inpcbtosocket(tp->t_inpcb);
2191 if (__predict_false(so_no_receive(so))) {
2193 handle_excess_rx(toep, m);
2194 inp_wunlock(tp->t_inpcb);
/* The DDP report tells us which buffer the hardware placed into. */
2198 q = &toep->tp_ddp_state;
2200 ddp_report = ntohl(hdr->u.ddp_report);
2201 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2202 bsp = &q->buf_state[buf_idx];
2205 "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2206 "hdr seq 0x%x len %u",
2207 tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2210 "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2211 G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2213 ddp_len = ntohs(hdr->len);
2214 rcv_nxt = ntohl(hdr->seq) + ddp_len;
/* Track hardware delayed-ACK mode changes. */
2216 delack_mode = G_DDP_DACK_MODE(ddp_report);
2217 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2218 toep->tp_delack_mode = delack_mode;
2219 toep->tp_delack_seq = tp->rcv_nxt;
2222 m->m_seq = tp->rcv_nxt;
2223 tp->rcv_nxt = rcv_nxt;
2225 tp->t_rcvtime = ticks;
2227 * Store the length in m->m_len. We are changing the meaning of
2228 * m->m_len here, we need to be very careful that nothing from now on
2229 * interprets ->len of this packet the usual way.
2231 m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2232 inp_wunlock(tp->t_inpcb);
2234 "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2235 m->m_len, rcv_nxt, m->m_seq);
2237 * Figure out where the new data was placed in the buffer and store it
2238 * in when. Assumes the buffer offset starts at 0, consumer needs to
2239 * account for page pod's pg_offset.
2241 end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2242 m->m_cur_offset = end_offset - m->m_pkthdr.len;
2244 rcv = so_sockbuf_rcv(so);
2247 m->m_ddp_gl = (unsigned char *)bsp->gl;
2248 m->m_flags |= M_DDP;
2249 bsp->cur_offset = end_offset;
2250 toep->tp_enqueued_bytes += m->m_pkthdr.len;
2253 * Length is only meaningful for kbuf
2255 if (!(bsp->flags & DDP_BF_NOCOPY))
2256 KASSERT(m->m_len <= bsp->gl->dgl_length,
2257 ("length received exceeds ddp pages: len=%d dgl_length=%d",
2258 m->m_len, bsp->gl->dgl_length));
2260 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2261 KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2263 * Bit 0 of flags stores whether the DDP buffer is completed.
2264 * Note that other parts of the code depend on this being in bit 0.
2266 if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2267 panic("spurious ddp completion");
2269 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2270 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2271 q->cur_buf ^= 1; /* flip buffers */
2274 if (bsp->flags & DDP_BF_NOCOPY) {
2275 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2276 bsp->flags &= ~DDP_BF_NOCOPY;
2279 if (ddp_report & F_DDP_PSH)
2280 m->m_ddp_flags |= DDP_BF_PSH;
2282 m->m_ddp_flags |= DDP_BF_NODATA;
/* Linux residue; must remain compiled out on FreeBSD. */
2285 skb_reset_transport_header(skb);
2286 tcp_hdr(skb)->fin = 0; /* changes original hdr->ddp_report */
/* Wake the reader for PSH, completed user buffers, or kernel buffers. */
2290 if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2291 (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2292 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2293 so_sorwakeup_locked(so);
2295 sockbuf_unlock(rcv);
2298 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2299 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2300 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2304 * Handler for RX_DATA_DDP CPL messages.
2307 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2309 struct toepcb *toep = ctx;
2310 const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2314 if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2315 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2316 GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2317 return (CPL_RET_BUF_DONE);
2320 skb->h.th = tcphdr_skb->h.th;
2322 new_rx_data_ddp(toep, m);
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `hdr' initializations, the unlock on the no-receive path, the kbuf
 * repost logic and several #ifdef T3_TRACE guards appear to be missing;
 * the skb/tcp_hdr lines are Linux residue.  Compare with the full source
 * before editing.
 */
2327 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2329 struct tcpcb *tp = toep->tp_tp;
2331 struct ddp_state *q;
2332 struct ddp_buf_state *bsp;
2333 struct cpl_rx_ddp_complete *hdr;
2334 unsigned int ddp_report, buf_idx, when, delack_mode;
2336 struct sockbuf *rcv;
2338 inp_wlock(tp->t_inpcb);
2339 so = inp_inpcbtosocket(tp->t_inpcb);
2341 if (__predict_false(so_no_receive(so))) {
2342 struct inpcb *inp = so_sotoinpcb(so);
2344 handle_excess_rx(toep, m);
/* Identify which of the two DDP buffers completed. */
2348 q = &toep->tp_ddp_state;
2350 ddp_report = ntohl(hdr->ddp_report);
2351 buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2352 m->m_pkthdr.csum_data = tp->rcv_nxt;
2354 rcv = so_sockbuf_rcv(so);
2357 bsp = &q->buf_state[buf_idx];
2358 when = bsp->cur_offset;
/* Bytes placed since the previous completion for this buffer. */
2359 m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2360 tp->rcv_nxt += m->m_len;
2361 tp->t_rcvtime = ticks;
2363 delack_mode = G_DDP_DACK_MODE(ddp_report);
2364 if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2365 toep->tp_delack_mode = delack_mode;
2366 toep->tp_delack_seq = tp->rcv_nxt;
/* Linux residue; must remain compiled out on FreeBSD. */
2369 skb_reset_transport_header(skb);
2370 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2372 inp_wunlock(tp->t_inpcb);
2374 KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2376 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2377 "ddp_report 0x%x offset %u, len %u",
2378 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2379 G_DDP_OFFSET(ddp_report), m->m_len);
2381 m->m_cur_offset = bsp->cur_offset;
2382 bsp->cur_offset += m->m_len;
2384 if (!(bsp->flags & DDP_BF_NOFLIP)) {
2385 q->cur_buf ^= 1; /* flip buffers */
2386 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2391 "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2392 "ddp_report %u offset %u",
2393 tp->rcv_nxt, bsp->cur_offset, ddp_report,
2394 G_DDP_OFFSET(ddp_report));
/* Tag the mbuf with the completed buffer's page list and flags. */
2396 m->m_ddp_gl = (unsigned char *)bsp->gl;
2397 m->m_flags |= M_DDP;
2398 m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2399 if (bsp->flags & DDP_BF_NOCOPY)
2400 bsp->flags &= ~DDP_BF_NOCOPY;
2402 m->m_ddp_flags |= DDP_BF_NODATA;
2405 if ((so_state_get(so) & SS_NOFDREF) == 0)
2406 so_sorwakeup_locked(so);
2408 sockbuf_unlock(rcv);
/*
 * Handler for RX_DDP_COMPLETE CPL messages.  The mbuf is consumed by
 * process_ddp_complete().
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;

#if 0
	/* Linux residue; must remain compiled out on FreeBSD. */
	skb->h.th = tcphdr_skb->h.th;
#endif
	process_ddp_complete(toep, m);
	return (0);
}
2428 * Move a socket to TIME_WAIT state. We need to make some adjustments to the
2429 * socket state before calling tcp_time_wait to comply with its expectations.
2432 enter_timewait(struct tcpcb *tp)
2435 * Bump rcv_nxt for the peer FIN. We don't do this at the time we
2436 * process peer_close because we don't want to carry the peer FIN in
2437 * the socket's receive queue and if we increment rcv_nxt without
2438 * having the FIN in the receive queue we'll confuse facilities such
2441 inp_wlock(tp->t_inpcb);
2444 tp->ts_recent_age = 0; /* defeat recycling */
2445 tp->t_srtt = 0; /* defeat tcp_update_metrics */
2446 inp_wunlock(tp->t_inpcb);
2447 tcp_offload_twstart(tp);
2451 * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE. This
2452 * function deals with the data that may be reported along with the FIN.
2453 * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2454 * perform normal FIN-related processing. In the latter case 1 indicates that
2455 * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
/*
 * NOTE(review): garbled extract -- braces, several return statements, the
 * buffer-flip statement and the #ifdef around the Linux skb lines appear
 * to be missing.  Compare with the full source before editing.
 */
2459 handle_peer_close_data(struct socket *so, struct mbuf *m)
2461 struct tcpcb *tp = so_sototcpcb(so);
2462 struct toepcb *toep = tp->t_toe;
2463 struct ddp_state *q;
2464 struct ddp_buf_state *bsp;
2465 struct cpl_peer_close *req = cplhdr(m);
2466 unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2467 struct sockbuf *rcv;
2469 if (tp->rcv_nxt == rcv_nxt) /* no data */
2472 CTR0(KTR_TOM, "handle_peer_close_data");
2473 if (__predict_false(so_no_receive(so))) {
2474 handle_excess_rx(toep, m);
2477 * Although we discard the data we want to process the FIN so
2478 * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2479 * PEER_CLOSE without data. In particular this PEER_CLOSE
2480 * may be what will close the connection. We return 1 because
2481 * handle_excess_rx() already freed the packet.
/* Account the trailing data as an implicit DDP completion. */
2486 inp_lock_assert(tp->t_inpcb);
2487 q = &toep->tp_ddp_state;
2488 rcv = so_sockbuf_rcv(so);
2491 bsp = &q->buf_state[q->cur_buf];
2492 m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2493 KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2494 m->m_ddp_gl = (unsigned char *)bsp->gl;
2495 m->m_flags |= M_DDP;
2496 m->m_cur_offset = bsp->cur_offset;
2498 DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2499 m->m_seq = tp->rcv_nxt;
2500 tp->rcv_nxt = rcv_nxt;
2501 bsp->cur_offset += m->m_pkthdr.len;
2502 if (!(bsp->flags & DDP_BF_NOFLIP))
/* Linux residue; must remain compiled out on FreeBSD. */
2505 skb_reset_transport_header(skb);
2506 tcp_hdr(skb)->fin = 0; /* changes valid memory past CPL */
2508 tp->t_rcvtime = ticks;
2510 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2511 so_sorwakeup_locked(so);
2513 sockbuf_unlock(rcv);
2519 * Handle a peer FIN.
/*
 * NOTE(review): garbled extract -- braces, the `keep'/`action' declarations,
 * socantrcvmore(), several break statements and the mbuf free at the end
 * appear to be missing; the sk_wake_async lines are Linux residue.
 * Compare with the full source before editing.
 */
2522 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2525 struct tcpcb *tp = toep->tp_tp;
2529 CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
/* A pending abort supersedes the FIN on T3B+. */
2530 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2531 printf("abort_pending set\n");
2535 inp_wlock(tp->t_inpcb);
2536 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* In DDP mode the FIN may carry an implicit RX_DDP_COMPLETE. */
2537 if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2538 keep = handle_peer_close_data(so, m);
2540 inp_wunlock(tp->t_inpcb);
2544 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2546 "waking up waiters for cantrcvmore on %p ", so);
2550 * If connection is half-synchronized
2551 * (ie NEEDSYN flag on) then delay ACK,
2552 * so it may be piggybacked when SYN is sent.
2553 * Otherwise, since we received a FIN then no
2554 * more input can be expected, send ACK now.
2556 if (tp->t_flags & TF_NEEDSYN)
2557 tp->t_flags |= TF_DELACK;
2559 tp->t_flags |= TF_ACKNOW;
/* Standard TCP FIN state transitions, driven by the hardware event. */
2563 switch (tp->t_state) {
2564 case TCPS_SYN_RECEIVED:
2565 tp->t_starttime = ticks;
2567 case TCPS_ESTABLISHED:
2568 tp->t_state = TCPS_CLOSE_WAIT;
2570 case TCPS_FIN_WAIT_1:
2571 tp->t_state = TCPS_CLOSING;
2573 case TCPS_FIN_WAIT_2:
2575 * If we've sent an abort_req we must have sent it too late,
2576 * HW will send us a reply telling us so, and this peer_close
2577 * is really the last message for this connection and needs to
2578 * be treated as an abort_rpl, i.e., transition the connection
2579 * to TCP_CLOSE (note that the host stack does this at the
2580 * time of generating the RST but we must wait for HW).
2581 * Otherwise we enter TIME_WAIT.
2583 t3_release_offload_resources(toep);
2584 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2587 action = TCP_TIMEWAIT;
2592 "%s: TID %u: received PEER_CLOSE in bad state %d\n",
2593 toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2595 inp_wunlock(tp->t_inpcb);
/* Perform the chosen action outside the inpcb lock. */
2597 if (action == TCP_TIMEWAIT) {
2599 } else if (action == TCP_DROP) {
2600 tcp_offload_drop(tp, 0);
2601 } else if (action == TCP_CLOSE) {
2602 tcp_offload_close(tp);
/* Linux residue; must remain compiled out on FreeBSD. */
2606 /* Do not send POLL_HUP for half duplex close. */
2607 if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2608 sk->sk_state == TCP_CLOSE)
2609 sk_wake_async(so, 1, POLL_HUP);
2611 sk_wake_async(so, 1, POLL_IN);
/*
 * Handler for PEER_CLOSE CPL messages.  The mbuf is consumed by
 * do_peer_fin().
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	do_peer_fin(toep, m);
	return (0);
}
/*
 * NOTE(review): garbled extract -- braces, the `action' declaration,
 * several break statements, the CLOSE_WAIT/LAST_ACK case labels and the
 * mbuf free at the end appear to be missing.  Compare with the full
 * source before editing.
 */
2634 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2636 struct cpl_close_con_rpl *rpl = cplhdr(m);
2637 struct tcpcb *tp = toep->tp_tp;
2640 struct sockbuf *rcv;
2642 inp_wlock(tp->t_inpcb);
2643 so = inp_inpcbtosocket(tp->t_inpcb);
/* Hardware acknowledged our FIN; snd_nxt excludes the FIN itself. */
2645 tp->snd_una = ntohl(rpl->snd_nxt) - 1; /* exclude FIN */
2647 if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2648 inp_wunlock(tp->t_inpcb);
2652 CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2653 tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2655 switch (tp->t_state) {
2656 case TCPS_CLOSING: /* see FIN_WAIT2 case in do_peer_fin */
2657 t3_release_offload_resources(toep);
2658 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2662 action = TCP_TIMEWAIT;
2667 * In this state we don't care about pending abort_rpl.
2668 * If we've sent abort_req it was post-close and was sent too
2669 * late, this close_con_rpl is the actual last message.
2671 t3_release_offload_resources(toep);
2674 case TCPS_FIN_WAIT_1:
2676 * If we can't receive any more
2677 * data, then closing user can proceed.
2678 * Starting the timer is contrary to the
2679 * specification, but if we don't get a FIN
2680 * we'll hang forever.
2683 * we should release the tp also, and use a
/* Arm the FIN_WAIT_2 timer if the receive side is already closed. */
2687 rcv = so_sockbuf_rcv(so);
2691 if (rcv->sb_state & SBS_CANTRCVMORE) {
2695 soisdisconnected(so);
2696 timeout = (tcp_fast_finwait2_recycle) ?
2697 tcp_finwait2_timeout : tcp_maxidle;
2698 tcp_timer_activate(tp, TT_2MSL, timeout);
2700 tp->t_state = TCPS_FIN_WAIT_2;
2701 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2702 (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2709 "%s: TID %u: received CLOSE_CON_RPL in bad state %d\n",
2710 toep->tp_toedev->tod_name, toep->tp_tid,
2713 inp_wunlock(tp->t_inpcb);
/* Perform the chosen action outside the inpcb lock. */
2716 if (action == TCP_TIMEWAIT) {
2718 } else if (action == TCP_DROP) {
2719 tcp_offload_drop(tp, 0);
2720 } else if (action == TCP_CLOSE) {
2721 tcp_offload_close(tp);
/*
 * Handler for CLOSE_CON_RPL CPL messages.  The mbuf is consumed by
 * process_close_con_rpl().
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	process_close_con_rpl(toep, m);
	return (0);
}
2741 * Process abort replies. We only process these messages if we anticipate
2742 * them as the coordination between SW and HW in this area is somewhat lacking
2743 * and sometimes we get ABORT_RPLs after we are done with the connection that
2744 * originated the ABORT_REQ.
/*
 * NOTE(review): garbled extract -- braces, the `struct socket *so' and
 * `needclose' declarations, the #ifdef T3_TRACE guard and the mbuf free
 * appear to be missing.  Compare with the full source before editing.
 */
2747 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2749 struct tcpcb *tp = toep->tp_tp;
2754 T3_TRACE1(TIDTB(sk),
2755 "process_abort_rpl: GTS rpl pending %d",
2756 sock_flag(sk, ABORT_RPL_PENDING));
2759 inp_wlock(tp->t_inpcb);
2760 so = inp_inpcbtosocket(tp->t_inpcb);
/* Only act if we actually have an outstanding ABORT_REQ. */
2762 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2764 * XXX panic on tcpdrop
/* T3B+ sends two RPLs; wait for the second before releasing. */
2766 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2767 toep->tp_flags |= TP_ABORT_RPL_RCVD;
2769 toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2770 if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2771 !is_t3a(toep->tp_toedev)) {
2772 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2773 panic("TP_ABORT_REQ_RCVD set");
2774 t3_release_offload_resources(toep);
2779 inp_wunlock(tp->t_inpcb);
2782 tcp_offload_close(tp);
2788 * Handle an ABORT_RPL_RSS CPL message.
2791 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2793 struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2794 struct toepcb *toep;
2797 * Ignore replies to post-close aborts indicating that the abort was
2798 * requested too late. These connections are terminated when we get
2799 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2800 * arrives the TID is either no longer used or it has been recycled.
2802 if (rpl->status == CPL_ERR_ABORT_FAILED) {
2808 toep = (struct toepcb *)ctx;
2811 * Sometimes we've already closed the socket, e.g., a post-close
2812 * abort races with ABORT_REQ_RSS, the latter frees the socket
2813 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2814 * but FW turns the ABORT_REQ into a regular one and so we get
2815 * ABORT_RPL_RSS with status 0 and no socket. Only on T3A.
/* Orphaned toepcb (socket already gone): just free TID and L2T entry. */
2820 if (toep->tp_tp == NULL) {
2821 log(LOG_NOTICE, "removing tid for abort\n");
2822 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2824 l2t_release(L2DATA(cdev), toep->tp_l2t);
2826 toepcb_release(toep);
/* Normal path: hand off to process_abort_rpl() and drop our reference. */
2830 log(LOG_NOTICE, "toep=%p\n", toep);
2831 log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2834 process_abort_rpl(toep, m);
2835 toepcb_release(toep);
2840 * Convert the status code of an ABORT_REQ into a FreeBSD error code. Also
2841 * indicate whether RST should be sent in response.
/*
 * Returns an errno value for so_error.  NOTE(review): the lines that set
 * *need_rst and the default case are elided from this extract — confirm
 * them against the full source.
 */
2844 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2846 struct tcpcb *tp = so_sototcpcb(so);
2848 switch (abort_reason) {
2849 case CPL_ERR_BAD_SYN:
2851 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); // fall through
2853 case CPL_ERR_CONN_RESET:
2854 // XXX need to handle SYN_RECV due to crossed SYNs
/* EPIPE if the peer already half-closed, otherwise a plain reset. */
2855 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2856 case CPL_ERR_XMIT_TIMEDOUT:
2857 case CPL_ERR_PERSIST_TIMEDOUT:
2858 case CPL_ERR_FINWAIT2_TIMEDOUT:
2859 case CPL_ERR_KEEPALIVE_TIMEDOUT:
2861 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
/*
 * Populate mbuf m with a CPL_ABORT_RPL work request for the given tid and
 * set the mbuf length accordingly.  NOTE(review): the use of the `cmd`
 * parameter (presumably rpl->cmd = cmd) is on a line elided from this
 * extract — confirm against the full source.
 */
2870 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2872 struct cpl_abort_rpl *rpl = cplhdr(m);
2874 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2875 rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2876 m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2878 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
/*
 * Deferred-work callback: build and send the ABORT_RPL that could not be
 * sent from the original handler (see t3_defer_reply() in send_abort_rpl()).
 * The rst_status was stashed in req->status by the caller.
 */
2883 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2885 struct mbuf *reply_mbuf;
2886 struct cpl_abort_req_rss *req = cplhdr(m);
2888 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
/*
 * NOTE(review): the two lines below set the priority and length on the
 * incoming request mbuf `m`, not on reply_mbuf.  That looks wrong — the
 * reply's length is (re)set inside set_abort_rpl_wr(), but its priority
 * is never set.  Presumably both lines should target reply_mbuf; confirm
 * before changing, since the function's tail is elided here.
 */
2889 m_set_priority(m, CPL_PRIORITY_DATA);
2890 m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2891 set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2892 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2897 * Returns whether an ABORT_REQ_RSS message is a negative advice.
/* Negative advice (retransmit/persist hints) must not tear the connection down. */
2900 is_neg_adv_abort(unsigned int status)
2902 return status == CPL_ERR_RTX_NEG_ADVICE ||
2903 status == CPL_ERR_PERSIST_NEG_ADVICE;
/*
 * Send an ABORT_RPL for the ABORT_REQ carried in m.  If no mbuf can be
 * allocated right now the reply is deferred to process context via
 * t3_defer_reply() (the allocation-failure branch structure is partially
 * elided from this extract).
 */
2907 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2909 struct mbuf *reply_mbuf;
2910 struct cpl_abort_req_rss *req = cplhdr(m);
2912 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2915 /* Defer the reply. Stick rst_status into req->cmd. */
2916 req->status = rst_status;
2917 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2921 m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2922 set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2926 * XXX need to sync with ARP as for SYN_RECV connections we can send
2927 * these messages while ARP is pending. For other connection states
2928 * it's not a problem.
2930 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Remove a SYN_RECV child connection from its listener's SYN queue.
 * NOTE(review): the body starts with CXGB_UNIMPLEMENTED(); the Linux-style
 * code below (request_sock, inet_csk_*, sk_user_data) appears to be dead
 * leftovers from the Linux driver, presumably inside an #ifdef/#if 0 block
 * elided from this extract — confirm before relying on it.
 */
2935 cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
2937 CXGB_UNIMPLEMENTED();
2939 struct request_sock *req = child->sk_user_data;
2941 inet_csk_reqsk_queue_removed(parent, req);
2942 synq_remove(tcp_sk(child));
2944 child->sk_user_data = NULL;
2950 * Performs the actual work to abort a SYN_RECV connection.
2953 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2955 struct tcpcb *parenttp = so_sototcpcb(parent);
2956 struct tcpcb *childtp = so_sototcpcb(child);
2959 * If the server is still open we clean up the child connection,
2960 * otherwise the server already did the clean up as it was purging
2961 * its SYN queue and the skb was just sitting in its backlog.
2963 if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2964 cleanup_syn_rcv_conn(child, parent);
/* Release HW resources and close the embryonic connection under its inpcb lock. */
2965 inp_wlock(childtp->t_inpcb);
2966 t3_release_offload_resources(childtp->t_toe);
2967 inp_wunlock(childtp->t_inpcb);
2968 tcp_offload_close(childtp);
2974 * Handle abort requests for a SYN_RECV connection. These need extra work
2975 * because the socket is on its parent's SYN queue.
/*
 * NOTE(review): begins with CXGB_UNIMPLEMENTED(); the code below uses
 * Linux-era fields (so_incomp as the open request, ts_recent as a stashed
 * STID) and is presumably dead or transitional — confirm which lines are
 * actually compiled, as several are elided from this extract.
 */
2978 abort_syn_rcv(struct socket *so, struct mbuf *m)
2980 CXGB_UNIMPLEMENTED();
2982 struct socket *parent;
2983 struct toedev *tdev = toep->tp_toedev;
2984 struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
2985 struct socket *oreq = so->so_incomp;
2986 struct t3c_tid_entry *t3c_stid;
2990 return -1; /* somehow we are not on the SYN queue */
/* Look up the listening socket via the server TID stashed in the open request. */
2992 t = &(T3C_DATA(cdev))->tid_maps;
2993 t3c_stid = lookup_stid(t, oreq->ts_recent);
2994 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
2997 do_abort_syn_rcv(so, parent);
2998 send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
3005 * Process abort requests. If we are waiting for an ABORT_RPL we ignore this
3006 * request except that we need to reply to it.
3009 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3011 int rst_status = CPL_ABORT_NO_RST;
3012 const struct cpl_abort_req_rss *req = cplhdr(m);
3013 struct tcpcb *tp = toep->tp_tp;
3017 inp_wlock(tp->t_inpcb);
3018 so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
/* First ABORT_REQ for this connection: remember it and shut the connection down. */
3019 if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3020 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
/* Second/duplicate request (T3A can deliver two): clear the marker and proceed. */
3025 toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3027 * Three cases to consider:
3028 * a) We haven't sent an abort_req; close the connection.
3029 * b) We have sent a post-close abort_req that will get to TP too late
3030 * and will generate a CPL_ERR_ABORT_FAILED reply. The reply will
3031 * be ignored and the connection should be closed now.
3032 * c) We have sent a regular abort_req that will get to TP too late.
3033 * That will generate an abort_rpl with status 0, wait for it.
3035 if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3036 (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
/* Translate HW abort reason into an errno for the socket. */
3039 error = abort_status_to_errno(so, req->status,
3041 so_error_set(so, error);
3043 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3046 * SYN_RECV needs special processing. If abort_syn_rcv()
3047 * returns 0 is has taken care of the abort.
3049 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3052 t3_release_offload_resources(toep);
3055 inp_wunlock(tp->t_inpcb);
3058 tcp_offload_close(tp);
/* Always acknowledge the ABORT_REQ, regardless of which case applied. */
3060 send_abort_rpl(m, tdev, rst_status);
3063 inp_wunlock(tp->t_inpcb);
3067 * Handle an ABORT_REQ_RSS CPL message.
3070 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3072 const struct cpl_abort_req_rss *req = cplhdr(m);
3073 struct toepcb *toep = (struct toepcb *)ctx;
/* Negative advice never aborts the connection; presumably handled/dropped here. */
3075 if (is_neg_adv_abort(req->status)) {
3080 log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
/*
 * Embryonic (SYN_RCVD) connection that has not seen an abort yet:
 * release the TID/L2T now and detach the toepcb from the tcpcb.
 */
3082 if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3083 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3084 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3086 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3088 l2t_release(L2DATA(cdev), toep->tp_l2t);
3093 toep->tp_tp->t_toe = NULL;
3094 toep->tp_tp->t_flags &= ~TF_TOE;
3097 * XXX need to call syncache_chkrst - but we don't
3098 * have a way of doing that yet
3100 toepcb_release(toep);
3101 log(LOG_ERR, "abort for unestablished connection :-(\n");
/* toepcb already detached from its tcpcb: nothing to do, it is being freed. */
3104 if (toep->tp_tp == NULL) {
3105 log(LOG_NOTICE, "disconnected toepcb\n");
3106 /* should be freed momentarily */
/* Established connection: full abort processing, then drop our reference. */
3112 process_abort_req(toep, m, toep->tp_toedev);
3113 toepcb_release(toep);
/*
 * Abort a passive-open (SYN_RECV) child and, on T3 hardware, reuse the
 * original CPL_PASS_ACCEPT_RPL mbuf to send a reject back to the chip.
 */
3118 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3120 struct toedev *tdev = TOE_DEV(parent);
3122 do_abort_syn_rcv(child, parent);
3123 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3124 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
/* Rewrite the reply in place into a reject and send it. */
3126 rpl->opt0h = htonl(F_TCAM_BYPASS);
3127 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3128 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
/*
 * ARP resolution failed while answering a passive open: abort the embryonic
 * connection.  NOTE(review): begins with CXGB_UNIMPLEMENTED(); the code
 * below (so_incomp / ts_recent STID lookup) mirrors abort_syn_rcv() and is
 * presumably dead or transitional — confirm against the full source.
 */
3134 handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
3136 CXGB_UNIMPLEMENTED();
3139 struct t3cdev *cdev;
3140 struct socket *parent;
3141 struct socket *oreq;
3142 struct t3c_tid_entry *t3c_stid;
3144 struct tcpcb *otp, *tp = so_sototcpcb(so);
3145 struct toepcb *toep = tp->t_toe;
3148 * If the connection is being aborted due to the parent listening
3149 * socket going away there's nothing to do, the ABORT_REQ will close
3152 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
/* Find the listener through the STID stashed in the original open request. */
3157 oreq = so->so_incomp;
3158 otp = so_sototcpcb(oreq);
3161 t = &(T3C_DATA(cdev))->tid_maps;
3162 t3c_stid = lookup_stid(t, otp->ts_recent);
3163 parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;
3166 pass_open_abort(so, parent, m);
3172 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL. This is treated similarly
3173 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
3177 pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
/* Linux-era stats/bookkeeping below; the FreeBSD path is the handle_... call. */
3181 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3182 BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
3184 handle_pass_open_arp_failure(m_get_socket(m), m);
3188 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3191 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3193 struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3194 struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3195 unsigned int tid = GET_TID(req);
3197 m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3198 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3199 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3200 rpl->peer_ip = req->peer_ip; // req->peer_ip not overwritten yet
/* F_TCAM_BYPASS + REJECT status tells the chip to refuse the connection. */
3201 rpl->opt0h = htonl(F_TCAM_BYPASS);
3202 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3204 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3208 * Send a deferred reject to an accept request.
/* Deferred-work callback paired with t3_defer_reply() in process_pass_accept_req(). */
3211 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3213 struct mbuf *reply_mbuf;
3215 reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3216 mk_pass_accept_rpl(reply_mbuf, m);
3217 cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
/*
 * Callback invoked by the TCP syncache for offloaded embryonic connections;
 * arg is the toepcb registered at syncache_add time.  Both visible events
 * drop the extra reference taken in process_pass_accept_req().
 */
3222 handle_syncache_event(int event, void *arg)
3224 struct toepcb *toep = arg;
3227 case TOE_SC_ENTRY_PRESENT:
3229 * entry already exists - free toepcb
3232 printf("syncache entry present\n");
3233 toepcb_release(toep);
3237 * The syncache has given up on this entry
3238 * either it timed out, or it was evicted
3239 * we need to explicitly release the tid
3241 printf("syncache entry dropped\n");
3242 toepcb_release(toep);
3245 log(LOG_ERR, "unknown syncache event %d\n", event);
/*
 * Build a synthetic SYN (tcphdr + tcpopt + in_conninfo) from the fields of a
 * CPL_PASS_ACCEPT_REQ and enter it into the host TCP syncache, registering
 * toep so syncache events come back via handle_syncache_event().
 */
3251 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3253 struct in_conninfo inc;
3257 int mss, wsf, sack, ts;
3258 uint32_t rcv_isn = ntohl(req->rcv_isn);
3260 bzero(&to, sizeof(struct tcpopt));
3261 inp = so_sotoinpcb(lso);
3264 * Fill out information for entering us into the syncache
3266 bzero(&inc, sizeof(inc));
/* Ports/ISN are copied in network byte order straight from the CPL. */
3267 inc.inc_fport = th.th_sport = req->peer_port;
3268 inc.inc_lport = th.th_dport = req->local_port;
3269 th.th_seq = req->rcv_isn;
3270 th.th_flags = TH_SYN;
/* Seed all receive-side sequence tracking at ISN+1. */
3272 toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3277 inc.inc_faddr.s_addr = req->peer_ip;
3278 inc.inc_laddr.s_addr = req->local_ip;
3280 DPRINTF("syncache add of %d:%d %d:%d\n",
3281 ntohl(req->local_ip), ntohs(req->local_port),
3282 ntohl(req->peer_ip), ntohs(req->peer_port));
/* TCP options as parsed by the hardware from the incoming SYN. */
3284 mss = req->tcp_options.mss;
3285 wsf = req->tcp_options.wsf;
3286 ts = req->tcp_options.tstamp;
3287 sack = req->tcp_options.sack;
3290 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3291 tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3296 * Process a CPL_PASS_ACCEPT_REQ message. Does the part that needs the socket
3297 * lock held. Note that the sock here is a listening socket that is not owned
3301 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3302 struct listen_ctx *lctx)
3305 struct l2t_entry *e;
3307 struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3308 struct cpl_pass_accept_rpl *rpl;
3309 struct cpl_pass_accept_req *req = cplhdr(m);
3310 unsigned int tid = GET_TID(req);
3311 struct tom_data *d = TOM_DATA(tdev);
3312 struct t3cdev *cdev = d->cdev;
3313 struct tcpcb *tp = so_sototcpcb(so);
3314 struct toepcb *newtoep;
3315 struct rtentry *dst;
3316 struct sockaddr_in nam;
3317 struct t3c_data *td = T3C_DATA(cdev);
/*
 * Can't allocate a reply now: defer the reject (T3) or queue the TID for
 * release, rather than leaving the chip waiting.
 */
3319 reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3320 if (__predict_false(reply_mbuf == NULL)) {
3321 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3322 t3_defer_reply(m, tdev, reject_pass_request);
3324 cxgb_queue_tid_release(cdev, tid);
3327 DPRINTF("failed to get reply_mbuf\n");
3332 if (tp->t_state != TCPS_LISTEN) {
3333 DPRINTF("socket not in listen state\n");
/* Map the destination MAC/VLAN from the CPL back to an ifnet. */
3338 tim.mac_addr = req->dst_mac;
3339 tim.vlan_tag = ntohs(req->vlan_tag);
3340 if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3341 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3347 * XXX do route lookup to confirm that we're still listening on this
/* Linux-era route validation below; the FreeBSD path forces RTF_LOCAL at 3364. */
3350 if (ip_route_input(skb, req->local_ip, req->peer_ip,
3351 G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3353 rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3354 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3355 dst_release(skb->dst); // done with the input route, release it
3358 if ((rt_flags & RTF_LOCAL) == 0)
3364 rt_flags = RTF_LOCAL;
3365 if ((rt_flags & RTF_LOCAL) == 0)
3369 * Calculate values and add to syncache
3372 newtoep = toepcb_alloc();
3373 if (newtoep == NULL)
/* Route back to the peer to obtain an L2T (ARP) entry for replies. */
3376 bzero(&nam, sizeof(struct sockaddr_in));
3378 nam.sin_len = sizeof(struct sockaddr_in);
3379 nam.sin_family = AF_INET;
3380 nam.sin_addr.s_addr =req->peer_ip;
3381 dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3384 printf("failed to find route\n");
3387 e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3388 (struct sockaddr *)&nam);
3390 DPRINTF("failed to get l2t\n");
3393 * Point to our listen socket until accept
3395 newtoep->tp_tp = tp;
3396 newtoep->tp_flags = TP_SYN_RCVD;
3397 newtoep->tp_tid = tid;
3398 newtoep->tp_toedev = tdev;
3399 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3401 cxgb_insert_tid(cdev, d->client, newtoep, tid);
/* Track the embryonic connection on the listener's SYN queue. */
3403 LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
/* Enable DDP only if tuned on, not opted out, and the window is big enough. */
3406 newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3407 tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3409 if (newtoep->tp_ulp_mode) {
3410 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3412 if (ddp_mbuf == NULL)
3413 newtoep->tp_ulp_mode = 0;
3416 CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3417 TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3418 set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3420 * XXX workaround for lack of syncache drop
/* Extra reference dropped later by handle_syncache_event()/do_pass_establish(). */
3422 toepcb_hold(newtoep);
3423 syncache_add_accept_req(req, so, newtoep);
/* Build and send the accepting CPL_PASS_ACCEPT_RPL. */
3425 rpl = cplhdr(reply_mbuf);
3426 reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3427 rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3429 OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3430 rpl->opt2 = htonl(calc_opt2(so, tdev));
3431 rpl->rsvd = rpl->opt2; /* workaround for HW bug */
3432 rpl->peer_ip = req->peer_ip; // req->peer_ip is not overwritten
3434 rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3435 V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3436 rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3437 CPL_PASS_OPEN_ACCEPT);
3439 DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3441 m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3443 l2t_send(cdev, reply_mbuf, e);
/* Program the TCB DDP workaround flags when DDP is in use. */
3445 if (newtoep->tp_ulp_mode) {
3446 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3448 TP_DDP_TIMER_WORKAROUND_MASK,
3450 TP_DDP_TIMER_WORKAROUND_VAL, 1);
/* Error paths below: reject the open (T3) or release the TID, then send. */
3452 printf("not offloading\n");
3458 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3459 mk_pass_accept_rpl(reply_mbuf, m);
3461 mk_tid_release(reply_mbuf, newtoep, tid);
3462 cxgb_ofld_send(cdev, reply_mbuf);
3466 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3473 * Handle a CPL_PASS_ACCEPT_REQ message.
3476 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3478 struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3479 struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3480 struct tom_data *d = listen_ctx->tom_data;
3483 struct cpl_pass_accept_req *req = cplhdr(m);
3484 unsigned int tid = GET_TID(req);
3485 struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
/* Linux-era validation below (unknown STID / TID out of range); kept as-is. */
3487 if (unlikely(!lsk)) {
3488 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3490 (unsigned long)((union listen_entry *)ctx -
3492 return CPL_RET_BUF_DONE;
3494 if (unlikely(tid >= t->ntids)) {
3495 printk(KERN_ERR "%s: passive open TID %u too large\n",
3497 return CPL_RET_BUF_DONE;
3500 * For T3A the current user of the TID may have closed but its last
3501 * message(s) may have been backlogged so the TID appears to be still
3502 * in use. Just take the TID away, the connection can close at its
3503 * own leisure. For T3B this situation is a bug.
3505 if (!valid_new_tid(t, tid) &&
3506 cdev->type != T3A) {
3507 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3509 return CPL_RET_BUF_DONE;
/* All checks passed: do the real work with the listen socket. */
3513 process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3518 * Called when a connection is established to translate the TCP options
3519 * reported by HW to FreeBSD's native format.
3522 assign_rxopt(struct socket *so, unsigned int opt)
3524 struct tcpcb *tp = so_sototcpcb(so);
3525 struct toepcb *toep = tp->t_toe;
3526 const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3528 inp_lock_assert(tp->t_inpcb);
/* MTU index from HW, minus 40 (presumably fixed IP+TCP header overhead). */
3530 toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3531 tp->t_flags |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3532 tp->t_flags |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3533 tp->t_flags |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
/* Window scaling takes effect only when both sides asked for it. */
3534 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3535 (TF_RCVD_SCALE|TF_REQ_SCALE))
3536 tp->rcv_scale = tp->request_r_scale;
3540 * Completes some final bits of initialization for just established connections
3541 * and changes their state to TCP_ESTABLISHED.
3543 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3546 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3548 struct tcpcb *tp = so_sototcpcb(so);
3549 struct toepcb *toep = tp->t_toe;
/* Seed every send-side sequence variable from the post-SYN ISN. */
3551 toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3552 assign_rxopt(so, opt);
/* Route socket-option calls through the TOE-aware ctloutput. */
3559 so->so_proto->pr_ctloutput = t3_ctloutput;
3563 inet_sk(sk)->id = tp->write_seq ^ jiffies;
3566 * XXX not clear what rcv_wup maps to
3569 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3570 * pass through opt0.
3572 if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3573 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3579 * no clean interface for marking ARP up to date
3581 dst_confirm(sk->sk_dst_cache);
3583 tp->t_starttime = ticks;
3584 tp->t_state = TCPS_ESTABLISHED;
/*
 * Rebuild the synthetic ACK (tcphdr + options + conninfo) matching the
 * earlier syncache_add_accept_req() entry, and expand the syncache entry
 * into a full socket via tcp_offload_syncache_expand().
 */
3589 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3592 struct in_conninfo inc;
3595 int mss, wsf, sack, ts;
3596 struct mbuf *m = NULL;
3597 const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3601 #error "no MAC support"
3604 opt = ntohs(req->tcp_opt);
3606 bzero(&to, sizeof(struct tcpopt));
3609 * Fill out information for entering us into the syncache
3611 bzero(&inc, sizeof(inc));
3612 inc.inc_fport = th.th_sport = req->peer_port;
3613 inc.inc_lport = th.th_dport = req->local_port;
3614 th.th_seq = req->rcv_isn;
/* This time the synthetic segment is the handshake-completing ACK. */
3615 th.th_flags = TH_ACK;
3619 inc.inc_faddr.s_addr = req->peer_ip;
3620 inc.inc_laddr.s_addr = req->local_ip;
/* Decode the negotiated options from the HW-reported tcp_opt word. */
3622 mss = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3623 wsf = G_TCPOPT_WSCALE_OK(opt);
3624 ts = G_TCPOPT_TSTAMP(opt);
3625 sack = G_TCPOPT_SACK(opt);
3628 to.to_wscale = G_TCPOPT_SND_WSCALE(opt);
3629 to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3631 DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3632 ntohl(req->local_ip), ntohs(req->local_port),
3633 ntohl(req->peer_ip), ntohs(req->peer_port),
3634 mss, wsf, ts, sack);
3635 return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3640 * Process a CPL_PASS_ESTABLISH message. XXX a lot of the locking doesn't work
3641 * if we are in TCP_SYN_RECV due to crossed SYNs
3644 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3646 struct cpl_pass_establish *req = cplhdr(m);
3647 struct toepcb *toep = (struct toepcb *)ctx;
3648 struct tcpcb *tp = toep->tp_tp;
3649 struct socket *so, *lso;
3650 struct t3c_data *td = T3C_DATA(cdev);
3651 struct sockbuf *snd, *rcv;
3653 // Complete socket initialization now that we have the SND_ISN
3655 struct toedev *tdev;
3658 tdev = toep->tp_toedev;
3660 inp_wlock(tp->t_inpcb);
3664 * XXX need to add reference while we're manipulating
3666 so = lso = inp_inpcbtosocket(tp->t_inpcb);
3668 inp_wunlock(tp->t_inpcb);
/* Take the embryonic connection off the listener's SYN queue. */
3671 LIST_REMOVE(toep, synq_entry);
/* Expand the syncache entry into a real socket; so is rewritten on success. */
3674 if (!syncache_expand_establish_req(req, &so, toep)) {
3678 CXGB_UNIMPLEMENTED();
3682 * Couldn't create the socket
3684 CXGB_UNIMPLEMENTED();
/* From here on tp/so refer to the NEW (child) connection. */
3687 tp = so_sototcpcb(so);
3688 inp_wlock(tp->t_inpcb);
3690 snd = so_sockbuf_snd(so);
3691 rcv = so_sockbuf_rcv(so);
/* DDP requires uncoalesced sockbuf data. */
3693 snd->sb_flags |= SB_NOCOALESCE;
3694 rcv->sb_flags |= SB_NOCOALESCE;
3699 reset_wr_list(toep);
3700 tp->rcv_wnd = select_rcv_wnd(tdev, so);
3701 tp->rcv_nxt = toep->tp_copied_seq;
3702 install_offload_ops(so);
/* Initialize per-connection work-request credit accounting. */
3704 toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3705 toep->tp_wr_unacked = 0;
3706 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3707 toep->tp_qset_idx = 0;
3708 toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3711 * XXX Cancel any keep alive timer
3714 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3717 * XXX workaround for lack of syncache drop
/* Drop the extra reference taken in process_pass_accept_req(). */
3719 toepcb_release(toep);
3720 inp_wunlock(tp->t_inpcb);
3722 CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3723 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3726 * XXX not sure how these checks map to us
/* Linux-era wakeup/accept-queue code below; presumably dead on FreeBSD. */
3728 if (unlikely(sk->sk_socket)) { // simultaneous opens only
3729 sk->sk_state_change(sk);
3730 sk_wake_async(so, 0, POLL_OUT);
3733 * The state for the new connection is now up to date.
3734 * Next check if we should add the connection to the parent's
3735 * accept queue. When the parent closes it resets connections
3736 * on its SYN queue, so check if we are being reset. If so we
3737 * don't need to do anything more, the coming ABORT_RPL will
3738 * destroy this socket. Otherwise move the connection to the
3741 * Note that we reset the synq before closing the server so if
3742 * we are not being reset the stid is still open.
3744 if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3755 * Fill in the right TID for CPL messages waiting in the out-of-order queue
3756 * and send them to the TOE.
3759 fixup_and_send_ofo(struct toepcb *toep)
3762 struct toedev *tdev = toep->tp_toedev;
3763 struct tcpcb *tp = toep->tp_tp;
3764 unsigned int tid = toep->tp_tid;
3766 log(LOG_NOTICE, "fixup_and_send_ofo\n");
3768 inp_lock_assert(tp->t_inpcb);
/* Drain the deferred-CPL queue, patching each message with the real TID. */
3769 while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3771 * A variety of messages can be waiting but the fields we'll
3772 * be touching are common to all so any message type will do.
3774 struct cpl_close_con_req *p = cplhdr(m);
3776 p->wr.wr_lo = htonl(V_WR_TID(tid));
3777 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3778 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3783 * Updates socket state from an active establish CPL message. Runs with the
3787 socket_act_establish(struct socket *so, struct mbuf *m)
3789 INIT_VNET_INET(so->so_vnet);
3790 struct cpl_act_establish *req = cplhdr(m);
3791 u32 rcv_isn = ntohl(req->rcv_isn); /* real RCV_ISN + 1 */
3792 struct tcpcb *tp = so_sototcpcb(so);
3793 struct toepcb *toep = tp->t_toe;
3795 if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3796 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3797 toep->tp_tid, tp->t_state);
/* Seed receive-side state from the HW-reported ISN. */
3799 tp->ts_recent_age = ticks;
3800 tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3801 toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3803 make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3806 * Now that we finally have a TID send any CPL messages that we had to
3807 * defer for lack of a TID.
3809 if (mbufq_len(&toep->out_of_order_queue))
3810 fixup_and_send_ofo(toep);
3812 if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3814 * XXX does this even make sense?
3821 * XXX assume no write requests permitted while socket connection is
3825 * Currently the send queue must be empty at this point because the
3826 * socket layer does not send anything before a connection is
3827 * established. To be future proof though we handle the possibility
3828 * that there are pending buffers to send (either TX_DATA or
3829 * CLOSE_CON_REQ). First we need to adjust the sequence number of the
3830 * buffers according to the just learned write_seq, and then we send
3831 * them on their way.
/* Linux-era flush path below; the FreeBSD accounting is the two lines after. */
3833 fixup_pending_writeq_buffers(sk);
3834 if (t3_push_frames(so, 1))
3835 sk->sk_write_space(sk);
3838 toep->tp_state = tp->t_state;
3839 V_tcpstat.tcps_connects++;
3844 * Process a CPL_ACT_ESTABLISH message.
3847 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3849 struct cpl_act_establish *req = cplhdr(m);
3850 unsigned int tid = GET_TID(req);
3851 unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3852 struct toepcb *toep = (struct toepcb *)ctx;
3853 struct tcpcb *tp = toep->tp_tp;
3855 struct toedev *tdev;
/* Error path (elided condition): the ATID must be released either way. */
3859 free_atid(cdev, atid);
3862 inp_wlock(tp->t_inpcb);
3867 so = inp_inpcbtosocket(tp->t_inpcb);
3868 tdev = toep->tp_toedev; /* blow up here if link was down */
3872 * It's OK if the TID is currently in use, the owning socket may have
3873 * backlogged its last CPL message(s). Just take it away.
/* Swap the temporary ATID for the real TID now assigned by HW. */
3877 so_insert_tid(d, toep, tid);
3878 free_atid(cdev, atid);
3879 toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3881 socket_act_establish(so, m);
3882 inp_wunlock(tp->t_inpcb);
3883 CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3884 cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3890 * Process an acknowledgment of WR completion. Advance snd_una and send the
3891 * next batch of work requests from the write queue.
3894 wr_ack(struct toepcb *toep, struct mbuf *m)
3896 struct tcpcb *tp = toep->tp_tp;
3897 struct cpl_wr_ack *hdr = cplhdr(m);
3899 unsigned int credits = ntohs(hdr->credits);
3900 u32 snd_una = ntohl(hdr->snd_una);
3902 struct sockbuf *snd;
3904 CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3906 inp_wlock(tp->t_inpcb);
3907 so = inp_inpcbtosocket(tp->t_inpcb);
/* Return the acknowledged credits; clamp unacked so the invariant holds. */
3908 toep->tp_wr_avail += credits;
3909 if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3910 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
/* Walk the pending-WR list, consuming credits one work request at a time. */
3913 struct mbuf *p = peek_wr(toep);
3915 if (__predict_false(!p)) {
3916 log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3917 "nothing pending, state %u wr_avail=%u\n",
3918 credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3922 "wr_ack: p->credits=%d p->bytes=%d",
3923 p->m_pkthdr.csum_data, p->m_pkthdr.len);
/* csum_data is overloaded to carry the WR's credit count. */
3924 KASSERT(p->m_pkthdr.csum_data != 0,
3925 ("empty request still on list"));
/* Partial ack of the head WR: record remaining credits and stop. */
3927 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3930 struct tx_data_wr *w = cplhdr(p);
3932 "TID %u got %u WR credits, need %u, len %u, "
3933 "main body %u, frags %u, seq # %u, ACK una %u,"
3934 " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3935 toep->tp_tid, credits, p->csum, p->len,
3936 p->len - p->data_len, skb_shinfo(p)->nr_frags,
3937 ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3938 toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3940 p->m_pkthdr.csum_data -= credits;
/* Head WR fully acked: consume its credits and count its payload bytes. */
3944 credits -= p->m_pkthdr.csum_data;
3945 bytes += p->m_pkthdr.len;
3947 "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3948 p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3955 check_wr_invariants(tp);
/* HW snd_una must never run behind our own; log and bail if it does. */
3958 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3960 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3962 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3963 "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3964 toep->tp_tid, tp->snd_una);
3969 if (tp->snd_una != snd_una) {
3970 tp->snd_una = snd_una;
3971 tp->ts_recent_age = ticks;
3974 * Keep ARP entry "minty fresh"
3976 dst_confirm(sk->sk_dst_cache);
/* Everything sent is now acked: no longer waiting for TX to idle. */
3978 if (tp->snd_una == tp->snd_nxt)
3979 toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3982 snd = so_sockbuf_snd(so);
3984 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3985 snd = so_sockbuf_snd(so);
/* Drop acked bytes from the send buffer and wake any writer. */
3987 sbdrop_locked(snd, bytes);
3988 so_sowwakeup_locked(so);
/* More data buffered beyond the send pointer: push another batch. */
3991 if (snd->sb_sndptroff < snd->sb_cc)
3992 t3_push_frames(so, 0);
3995 inp_wunlock(tp->t_inpcb);
4000 * Handler for TX_DATA_ACK CPL messages.
/* CPL dispatch wrapper; the body (delegating to wr_ack()) is elided here. */
4003 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
4005 struct toepcb *toep = (struct toepcb *)ctx;
4014 * Handler for TRACE_PKT CPL messages. Just sink these packets.
/* Body elided in this extract; per the comment it only frees the mbuf. */
4017 do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
4024 * Reset a connection that is on a listener's SYN queue or accept queue,
4025 * i.e., one that has not had a struct socket associated with it.
4026 * Must be called from process context.
4028 * Modeled after code in inet_csk_listen_stop().
4031 t3_reset_listen_child(struct socket *child)
4033 struct tcpcb *tp = so_sototcpcb(child);
/* Fire an ABORT_REQ at the hardware for this embryonic connection. */
4035 t3_send_reset(tp->t_toe);
/*
 * Per-socket callback for so_listeners_apply_all(): reset the child if it
 * is an offloaded (TF_TOE) connection; arg is unused here.
 */
4040 t3_child_disconnect(struct socket *so, void *arg)
4042 struct tcpcb *tp = so_sototcpcb(so);
4044 if (tp->t_flags & TF_TOE) {
4045 inp_wlock(tp->t_inpcb);
4046 t3_reset_listen_child(so);
4047 inp_wunlock(tp->t_inpcb);
4052 * Disconnect offloaded established but not yet accepted connections sitting
4053 * on a server's accept_queue. We just send an ABORT_REQ at this point and
4054 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4057 t3_disconnect_acceptq(struct socket *listen_so)
/* Apply t3_child_disconnect() to every queued child (lock taken in elided line). */
4061 so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4062 so_unlock(listen_so);
4067 * we send ABORT_REQ and finish off when we get ABORT_RPL.
4071 t3_reset_synq(struct listen_ctx *lctx)
4073 struct toepcb *toep;
4076 while (!LIST_EMPTY(&lctx->synq_head)) {
4077 toep = LIST_FIRST(&lctx->synq_head);
4078 LIST_REMOVE(toep, synq_entry);
4080 t3_send_reset(toep);
4081 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4082 toepcb_release(toep);
4084 so_unlock(lctx->lso);
/*
 * Write the DDP page pods for a gather list into adapter memory, one
 * ULP_MEM_WRITE work request per pod.  `tag`/`color` identify the DDP
 * buffer; the last NUM_SENTINEL_PPODS pods are written invalid.
 */
4089 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4090 unsigned int nppods, unsigned int tag, unsigned int maxoff,
4091 unsigned int pg_off, unsigned int color)
4093 unsigned int i, j, pidx;
4096 struct ulp_mem_io *req;
4097 unsigned int tid = toep->tp_tid;
4098 const struct tom_data *td = TOM_DATA(toep->tp_toedev);
/* Byte address of the first pod inside the adapter's DDP region. */
4099 unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4101 CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4102 gl, nppods, tag, maxoff, pg_off, color);
4104 for (i = 0; i < nppods; ++i) {
4105 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4106 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4107 req = mtod(m, struct ulp_mem_io *);
4108 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4109 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
/* Memory-write command addressed in 32-byte units (hence >> 5). */
4111 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4112 V_ULPTX_CMD(ULP_MEM_WRITE));
4113 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4114 V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
/* The pod payload immediately follows the request header. */
4116 p = (struct pagepod *)(req + 1);
4117 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4118 p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4119 p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4120 V_PPOD_COLOR(color));
4121 p->pp_max_offset = htonl(maxoff);
4122 p->pp_page_offset = htonl(pg_off);
/* Each pod holds 5 page addresses; consecutive pods overlap by one page. */
4124 for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4125 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4126 htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4128 p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
4129 send_or_defer(toep, m, 0);
4130 ppod_addr += PPOD_SIZE;
4136 * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4139 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4141 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4143 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4145 b->opcode = CPL_BARRIER;
4149 * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4152 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4154 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156 txpkt = (struct ulp_txpkt *)req;
4157 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4158 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4159 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4160 req->cpuno = htons(cpuno);
/*
 * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
 * Writes the bits selected by `mask' from `val' into 64-bit TCB word
 * `word' of connection `tid'; no reply is requested from the chip.
 */
mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
		      unsigned int word, uint64_t mask, uint64_t val)
	struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;

	CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
	    tid, word, mask, val);

	txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
	txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
	req->reply = V_NO_REPLY(1);
	/* NOTE(review): one line is elided here in this extract —
	 * presumably `req->cpu_idx = 0;'; confirm against full source. */
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);
4186 * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4189 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4190 unsigned int tid, unsigned int credits)
4192 struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4194 txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4195 txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4196 OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4197 ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4198 V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4199 V_RX_CREDITS(credits));
/*
 * Cancel HW DDP buffer `bufidx' (0 or 1) for this connection with one
 * compound work request: a CPL_BARRIER, a CPL_SET_TCB_FIELD clearing the
 * buffer's VALID flag (and switching the active buffer), a CPL_GET_TCB to
 * read back how much data landed in the buffer, and a trailing barrier.
 * NOTE(review): the declarations of locals `wrlen' and `m' are on lines
 * elided from this extract.
 */
t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
	struct work_request_hdr *wr;
	struct cpl_barrier *lock;
	struct cpl_set_tcb_field *req;
	struct cpl_get_tcb *getreq;
	struct ddp_state *p = &toep->tp_ddp_state;

	/* Caller must hold the receive-side socket buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	/* WR header + one SET_TCB_FIELD + two BARRIERs + a GET_TCB; the
	 * continuation line of this expression is elided in this extract. */
	wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	/* Leading barrier keeps TP from reordering around the TCB update. */
	lock = (struct cpl_barrier *)(wr + 1);
	mk_cpl_barrier_ulp(lock);

	req = (struct cpl_set_tcb_field *)(lock + 1);

	CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);

	/* Hmmm, not sure if this actually a good thing: reactivating
	 * the other buffer might be an issue if it has been completed
	 * already. However, that is unlikely, since the fact that the UBUF
	 * is not completed indicates that there is no oustanding data.
	 */
	/* Buffer 0 case: invalidate buffer 0, make buffer 1 active.
	 * NOTE(review): the if/else selecting between the two calls below
	 * on `bufidx' is on lines elided from this extract. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			     V_TF_DDP_ACTIVE_BUF(1) |
			     V_TF_DDP_BUF0_VALID(1),
			     V_TF_DDP_ACTIVE_BUF(1));
	/* Buffer 1 case: invalidate buffer 1, make buffer 0 active. */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
			     V_TF_DDP_ACTIVE_BUF(1) |
			     V_TF_DDP_BUF1_VALID(1), 0);

	/* Read the TCB back so the caller learns the final DDP state. */
	getreq = (struct cpl_get_tcb *)(req + 1);
	mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);

	mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));

	/* Keep track of the number of oustanding CPL_GET_TCB requests
	 * (the remainder of this comment and the `p->get_tcb_count'
	 * increment are on lines elided from this extract). */
	T3_TRACE1(TIDTB(so),
		  "t3_cancel_ddpbuf: bufidx %u", bufidx);
	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4265 * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4266 * @sk: the socket associated with the buffers
4267 * @bufidx: index of HW DDP buffer (0 or 1)
4268 * @tag0: new tag for HW buffer 0
4269 * @tag1: new tag for HW buffer 1
4270 * @len: new length for HW buf @bufidx
4272 * Sends a compound WR to overlay a new DDP buffer on top of an existing
4273 * buffer by changing the buffer tag and length and setting the valid and
4274 * active flag accordingly. The caller must ensure the new buffer is at
4275 * least as big as the existing one. Since we typically reprogram both HW
4276 * buffers this function sets both tags for convenience. Read the TCB to
4277 * determine how made data was written into the buffer before the overlay
4281 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4282 unsigned int tag1, unsigned int len)
4286 struct work_request_hdr *wr;
4287 struct cpl_get_tcb *getreq;
4288 struct cpl_set_tcb_field *req;
4289 struct ddp_state *p = &toep->tp_ddp_state;
4291 CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4292 bufidx, tag0, tag1, len);
4294 SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4296 wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4297 m = m_gethdr_nofail(wrlen);
4298 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4299 wr = mtod(m, struct work_request_hdr *);
4300 m->m_pkthdr.len = m->m_len = wrlen;
4304 /* Set the ATOMIC flag to make sure that TP processes the following
4305 * CPLs in an atomic manner and no wire segments can be interleaved.
4307 wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4308 req = (struct cpl_set_tcb_field *)(wr + 1);
4309 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4310 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4311 V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4312 V_TCB_RX_DDP_BUF0_TAG(tag0) |
4313 V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4316 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4317 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4318 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4320 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4321 V_TF_DDP_PUSH_DISABLE_0(1) |
4322 V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4323 V_TF_DDP_PUSH_DISABLE_0(0) |
4324 V_TF_DDP_BUF0_VALID(1));
4326 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4327 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4328 V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4330 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4331 V_TF_DDP_PUSH_DISABLE_1(1) |
4332 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4333 V_TF_DDP_PUSH_DISABLE_1(0) |
4334 V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4337 getreq = (struct cpl_get_tcb *)(req + 1);
4338 mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4340 /* Keep track of the number of oustanding CPL_GET_TCB requests
4345 T3_TRACE4(TIDTB(sk),
4346 "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4348 bufidx, tag0, tag1, len);
4350 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Sends a compound WR containing all the CPL messages needed to program the
 * two HW DDP buffers, namely optionally setting up the length and offset of
 * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
 * NOTE(review): the declarations of locals `wrlen' and `m', the `req++'
 * advances, and several closing braces are on lines elided from this extract.
 */
t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
		 unsigned int len1, unsigned int offset1,
		 uint64_t ddp_flags, uint64_t flag_mask, int modulate)
	struct work_request_hdr *wr;
	struct cpl_set_tcb_field *req;

	CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
	    len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);

	/* Caller must hold the receive-side socket buffer lock. */
	SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
	/* Size the WR for only the messages actually being sent. */
	wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
	    (len1 ? sizeof(*req) : 0) +
	    (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
	m = m_gethdr_nofail(wrlen);
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
	wr = mtod(m, struct work_request_hdr *);

	wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
	m->m_pkthdr.len = m->m_len = wrlen;

	req = (struct cpl_set_tcb_field *)(wr + 1);
	if (len0) {                  /* program buffer 0 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
		    V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));

	if (len1) {                  /* program buffer 1 offset and length */
		mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
		    V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
		    V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);

	/* Update the DDP flags word; `flag_mask' selects which bits change.
	 * (The `ddp_flags' value argument's line is elided in this extract.) */
	mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
	/* When `modulate', also return accumulated RX credits and advance
	 * the receive-window update mark. */
	mk_rx_data_ack_ulp(toep,
	    (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
	    toep->tp_copied_seq - toep->tp_rcv_wup);
	toep->tp_rcv_wup = toep->tp_copied_seq;

	T3_TRACE5(TIDTB(sk),
	    "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
	    len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,

	cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
/*
 * Initialize the mbuf_wrs[] lookup table, which maps a scatter/gather
 * element count to the number of work requests needed to carry it, given
 * a per-WR capacity of `wr_len'.  Idempotent: returns early once filled.
 */
t3_init_wr_tab(unsigned int wr_len)
	if (mbuf_wrs[1])	/* already initialized */

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		/* Flits needed for i SGL entries: 3 per pair, 1 extra if odd. */
		int sgl_len = (3 * i) / 2 + (i & 1);

		/* NOTE(review): lines elided here in this extract —
		 * presumably an adjustment of sgl_len for the WR header;
		 * confirm against the full source. */
		mbuf_wrs[i] = sgl_len <= wr_len ?
			1 : 1 + (sgl_len - 2) / (wr_len - 1);
/*
 * One-time module initialization: register a handler for each CPL opcode
 * this TOE module consumes.  The tcphdr_skb setup below uses Linux APIs
 * (alloc_skb/skb_put) — NOTE(review): presumably guarded by an elided
 * preprocessor conditional in the full source; confirm.
 */
t3_init_cpl_io(void)
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
		"Chelsio TCP offload: can't allocate sk_buff\n");
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);

	/* Register the CPL dispatch table: one handler per opcode. */
	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);