sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c

   1 /**************************************************************************
   2
   3 Copyright (c) 2007-2008, Chelsio Inc.
   4 All rights reserved.
   5
   6 Redistribution and use in source and binary forms, with or without
   7 modification, are permitted provided that the following conditions are met:
   8
   9  1. Redistributions of source code must retain the above copyright notice,
  10     this list of conditions and the following disclaimer.
  11
  12  2. Neither the name of the Chelsio Corporation nor the names of its
  13     contributors may be used to endorse or promote products derived from
  14     this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 ***************************************************************************/
  29
  30 #include <sys/cdefs.h>
  31 __FBSDID("$FreeBSD$");
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/fcntl.h>
  36 #include <sys/kernel.h>
  37 #include <sys/limits.h>
  38 #include <sys/ktr.h>
  39 #include <sys/lock.h>
  40 #include <sys/mbuf.h>
  41 #include <sys/mutex.h>
  42 #include <sys/sockstate.h>
  43 #include <sys/sockopt.h>
  44 #include <sys/socket.h>
  45 #include <sys/sockbuf.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/syslog.h>
  48 #include <sys/protosw.h>
  49 #include <sys/priv.h>
  50 #include <sys/vimage.h>
  51
  52 #include <net/if.h>
  53 #include <net/route.h>
  54
  55 #include <netinet/in.h>
  56 #include <netinet/in_pcb.h>
  57 #include <netinet/in_systm.h>
  58 #include <netinet/in_var.h>
  59
  60
  61 #include <dev/cxgb/cxgb_osdep.h>
  62 #include <dev/cxgb/sys/mbufq.h>
  63
  64 #include <netinet/ip.h>
  65 #include <netinet/tcp_var.h>
  66 #include <netinet/tcp_fsm.h>
  67 #include <netinet/tcp_offload.h>
  68 #include <netinet/tcp_seq.h>
  69 #include <netinet/tcp_syncache.h>
  70 #include <netinet/tcp_timer.h>
  71 #include <net/route.h>
  72
  73 #include <dev/cxgb/t3cdev.h>
  74 #include <dev/cxgb/common/cxgb_firmware_exports.h>
  75 #include <dev/cxgb/common/cxgb_t3_cpl.h>
  76 #include <dev/cxgb/common/cxgb_tcb.h>
  77 #include <dev/cxgb/common/cxgb_ctl_defs.h>
  78 #include <dev/cxgb/cxgb_offload.h>
  79 #include <vm/vm.h>
  80 #include <vm/pmap.h>
  81 #include <machine/bus.h>
  82 #include <dev/cxgb/sys/mvec.h>
  83 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
  84 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
  85 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
  86 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
  87 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
  88 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
  89
  90 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
  91
/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 *
 * NOTE(review): only compiled when "notyet" is defined.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf chain depends on the number of
 * segments in its gather list and whether it has any payload in its main
 * body.  This maps the length of the gather list into the # of necessary
 * WRs; it is indexed by segment count, 0 through TX_MAX_SEGS inclusive.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
/* The inpcb's IP TOS value shifted right by two bits and masked with M_TOS. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

/*
 * Debug switches.  VALIDATE_SEQ and DEBUG_WR are defined as 0 and
 * VALIDATE_SOCK() expands to nothing.
 */
#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/*
 * NOTE(review): these look like teardown action codes used by code outside
 * this part of the file -- confirm against their users.
 */
#define TCP_TIMEWAIT    1
#define TCP_CLOSE       2
#define TCP_DROP        3

/* Socket buffer auto-sizing variables; declared here, defined elsewhere. */
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

/* Forward declarations. */
static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);
 156
 157 static inline void
 158 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
 159 {
 160         struct mbuf *m;
 161
 162         m = sb->sb_mb;
 163         while (m) {
 164                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 165                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 166                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 167                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 168                         m->m_next, m->m_nextpkt, m->m_flags));
 169                 m = m->m_next;
 170         }
 171         m = n;
 172         while (m) {
 173                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 174                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 175                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 176                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 177                         m->m_next, m->m_nextpkt, m->m_flags));
 178                 m = m->m_next;
 179         }
 180         KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
 181         sbappendstream_locked(sb, n);
 182         m = sb->sb_mb;
 183
 184         while (m) {
 185                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 186                         m->m_next, m->m_nextpkt, m->m_flags));
 187                 m = m->m_next;
 188         }
 189 }
 190
 191 static inline int
 192 is_t3a(const struct toedev *dev)
 193 {
 194         return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
 195 }
 196
 197 static void
 198 dump_toepcb(struct toepcb *toep)
 199 {
 200         DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
 201             toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
 202             toep->tp_mtu_idx, toep->tp_tid);
 203
 204         DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
 205             toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
 206             toep->tp_mss_clamp, toep->tp_flags);
 207 }
 208
 209 #ifndef RTALLOC2_DEFINED
 210 static struct rtentry *
 211 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
 212 {
 213         struct rtentry *rt = NULL;
 214
 215         if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
 216                 RT_UNLOCK(rt);
 217
 218         return (rt);
 219 }
 220 #endif
 221
 222 /*
 223  * Determine whether to send a CPL message now or defer it.  A message is
 224  * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 225  * For connections in other states the message is sent immediately.
 226  * If through_l2t is set the message is subject to ARP processing, otherwise
 227  * it is sent directly.
 228  */
 229 static inline void
 230 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
 231 {
 232         struct tcpcb *tp = toep->tp_tp;
 233
 234         if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
 235                 inp_wlock(tp->t_inpcb);
 236                 mbufq_tail(&toep->out_of_order_queue, m);  // defer
 237                 inp_wunlock(tp->t_inpcb);
 238         } else if (through_l2t)
 239                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
 240         else
 241                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
 242 }
 243
 244 static inline unsigned int
 245 mkprio(unsigned int cntrl, const struct toepcb *toep)
 246 {
 247         return (cntrl);
 248 }
 249
 250 /*
 251  * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 252  */
 253 static inline void
 254 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
 255 {
 256         struct cpl_tid_release *req;
 257
 258         m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 259         m->m_pkthdr.len = m->m_len = sizeof(*req);
 260         req = mtod(m, struct cpl_tid_release *);
 261         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 262         req->wr.wr_lo = 0;
 263         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 264 }
 265
 266 static inline void
 267 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
 268 {
 269         struct tcpcb *tp = so_sototcpcb(so);
 270         struct toepcb *toep = tp->t_toe;
 271         struct tx_data_wr *req;
 272         struct sockbuf *snd;
 273
 274         inp_lock_assert(tp->t_inpcb);
 275         snd = so_sockbuf_snd(so);
 276
 277         req = mtod(m, struct tx_data_wr *);
 278         m->m_len = sizeof(*req);
 279         req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 280         req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
 281         /* len includes the length of any HW ULP additions */
 282         req->len = htonl(len);
 283         req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 284         /* V_TX_ULP_SUBMODE sets both the mode and submode */
 285         req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
 286                            V_TX_URG(/* skb_urgent(skb) */ 0 ) |
 287                            V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
 288                                    (tail ? 0 : 1))));
 289         req->sndseq = htonl(tp->snd_nxt);
 290         if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 291                 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 292                                     V_TX_CPU_IDX(toep->tp_qset));
 293
 294                 /* Sendbuffer is in units of 32KB.
 295                  */
 296                 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
 297                         req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
 298                 else {
 299                         req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 300                 }
 301
 302                 toep->tp_flags |= TP_DATASENT;
 303         }
 304 }
 305
 306 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
 307
 308 int
 309 t3_push_frames(struct socket *so, int req_completion)
 310 {
 311         struct tcpcb *tp = so_sototcpcb(so);
 312         struct toepcb *toep = tp->t_toe;
 313
 314         struct mbuf *tail, *m0, *last;
 315         struct t3cdev *cdev;
 316         struct tom_data *d;
 317         int state, bytes, count, total_bytes;
 318         bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
 319         struct sockbuf *snd;
 320
 321         if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
 322                 DPRINTF("tcp state=%d\n", tp->t_state);
 323                 return (0);
 324         }
 325
 326         state = so_state_get(so);
 327
 328         if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
 329                 DPRINTF("disconnecting\n");
 330
 331                 return (0);
 332         }
 333
 334         inp_lock_assert(tp->t_inpcb);
 335
 336         snd = so_sockbuf_snd(so);
 337         sockbuf_lock(snd);
 338
 339         d = TOM_DATA(toep->tp_toedev);
 340         cdev = d->cdev;
 341
 342         last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 343
 344         total_bytes = 0;
 345         DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
 346             toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
 347
 348         if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
 349                 KASSERT(tail, ("sbdrop error"));
 350                 last = tail = tail->m_next;
 351         }
 352
 353         if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
 354                 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
 355                 sockbuf_unlock(snd);
 356
 357                 return (0);
 358         }
 359
 360         toep->tp_m_last = NULL;
 361         while (toep->tp_wr_avail && (tail != NULL)) {
 362                 count = bytes = 0;
 363                 segp = segs;
 364                 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
 365                         sockbuf_unlock(snd);
 366                         return (0);
 367                 }
 368                 /*
 369                  * If the data in tail fits as in-line, then
 370                  * make an immediate data wr.
 371                  */
 372                 if (tail->m_len <= IMM_LEN) {
 373                         count = 1;
 374                         bytes = tail->m_len;
 375                         last = tail;
 376                         tail = tail->m_next;
 377                         m_set_sgl(m0, NULL);
 378                         m_set_sgllen(m0, 0);
 379                         make_tx_data_wr(so, m0, bytes, tail);
 380                         m_append(m0, bytes, mtod(last, caddr_t));
 381                         KASSERT(!m0->m_next, ("bad append"));
 382                 } else {
 383                         while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
 384                             && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
 385                                 bytes += tail->m_len;
 386                                 last = tail;
 387                                 count++;
 388                                 /*
 389                                  * technically an abuse to be using this for a VA
 390                                  * but less gross than defining my own structure
 391                                  * or calling pmap_kextract from here :-|
 392                                  */
 393                                 segp->ds_addr = (bus_addr_t)tail->m_data;
 394                                 segp->ds_len = tail->m_len;
 395                                 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
 396                                     count, mbuf_wrs[count], tail->m_data, tail->m_len);
 397                                 segp++;
 398                                 tail = tail->m_next;
 399                         }
 400                         DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
 401                             toep->tp_wr_avail, count, mbuf_wrs[count], tail);
 402
 403                         m_set_sgl(m0, segs);
 404                         m_set_sgllen(m0, count);
 405                         make_tx_data_wr(so, m0, bytes, tail);
 406                 }
 407                 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 408
 409                 if (tail) {
 410                         snd->sb_sndptr = tail;
 411                         toep->tp_m_last = NULL;
 412                 } else
 413                         toep->tp_m_last = snd->sb_sndptr = last;
 414
 415
 416                 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
 417
 418                 snd->sb_sndptroff += bytes;
 419                 total_bytes += bytes;
 420                 toep->tp_write_seq += bytes;
 421                 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
 422                     " tail=%p sndptr=%p sndptroff=%d",
 423                     toep->tp_wr_avail, count, mbuf_wrs[count],
 424                     tail, snd->sb_sndptr, snd->sb_sndptroff);
 425                 if (tail)
 426                         CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
 427                             " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
 428                             total_bytes, toep->tp_m_last, tail->m_data,
 429                             tp->snd_una);
 430                 else
 431                         CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
 432                             " tp_m_last=%p snd_una=0x%08x",
 433                             total_bytes, toep->tp_m_last, tp->snd_una);
 434
 435
 436 #ifdef KTR
 437 {
 438                 int i;
 439
 440                 i = 0;
 441                 while (i < count && m_get_sgllen(m0)) {
 442                         if ((count - i) >= 3) {
 443                                 CTR6(KTR_TOM,
 444                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 445                                     " len=%d pa=0x%zx len=%d",
 446                                     segs[i].ds_addr, segs[i].ds_len,
 447                                     segs[i + 1].ds_addr, segs[i + 1].ds_len,
 448                                     segs[i + 2].ds_addr, segs[i + 2].ds_len);
 449                                     i += 3;
 450                         } else if ((count - i) == 2) {
 451                                 CTR4(KTR_TOM,
 452                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 453                                     " len=%d",
 454                                     segs[i].ds_addr, segs[i].ds_len,
 455                                     segs[i + 1].ds_addr, segs[i + 1].ds_len);
 456                                     i += 2;
 457                         } else {
 458                                 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
 459                                     segs[i].ds_addr, segs[i].ds_len);
 460                                 i++;
 461                         }
 462
 463                 }
 464 }
 465 #endif
 466                  /*
 467                  * remember credits used
 468                  */
 469                 m0->m_pkthdr.csum_data = mbuf_wrs[count];
 470                 m0->m_pkthdr.len = bytes;
 471                 toep->tp_wr_avail -= mbuf_wrs[count];
 472                 toep->tp_wr_unacked += mbuf_wrs[count];
 473
 474                 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
 475                     toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 476                         struct work_request_hdr *wr = cplhdr(m0);
 477
 478                         wr->wr_hi |= htonl(F_WR_COMPL);
 479                         toep->tp_wr_unacked = 0;
 480                 }
 481                 KASSERT((m0->m_pkthdr.csum_data > 0) &&
 482                     (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
 483                         m0->m_pkthdr.csum_data));
 484                 m0->m_type = MT_DONTFREE;
 485                 enqueue_wr(toep, m0);
 486                 DPRINTF("sending offload tx with %d bytes in %d segments\n",
 487                     bytes, count);
 488                 l2t_send(cdev, m0, toep->tp_l2t);
 489         }
 490         sockbuf_unlock(snd);
 491         return (total_bytes);
 492 }
 493
 494 /*
 495  * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 496  * under any circumstances.  We take the easy way out and always queue the
 497  * message to the write_queue.  We can optimize the case where the queue is
 498  * already empty though the optimization is probably not worth it.
 499  */
 500 static void
 501 close_conn(struct socket *so)
 502 {
 503         struct mbuf *m;
 504         struct cpl_close_con_req *req;
 505         struct tom_data *d;
 506         struct inpcb *inp = so_sotoinpcb(so);
 507         struct tcpcb *tp;
 508         struct toepcb *toep;
 509         unsigned int tid;
 510
 511
 512         inp_wlock(inp);
 513         tp = so_sototcpcb(so);
 514         toep = tp->t_toe;
 515
 516         if (tp->t_state != TCPS_SYN_SENT)
 517                 t3_push_frames(so, 1);
 518
 519         if (toep->tp_flags & TP_FIN_SENT) {
 520                 inp_wunlock(inp);
 521                 return;
 522         }
 523
 524         tid = toep->tp_tid;
 525
 526         d = TOM_DATA(toep->tp_toedev);
 527
 528         m = m_gethdr_nofail(sizeof(*req));
 529         m_set_priority(m, CPL_PRIORITY_DATA);
 530         m_set_sgl(m, NULL);
 531         m_set_sgllen(m, 0);
 532
 533         toep->tp_flags |= TP_FIN_SENT;
 534         req = mtod(m, struct cpl_close_con_req *);
 535
 536         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 537         req->wr.wr_lo = htonl(V_WR_TID(tid));
 538         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 539         req->rsvd = 0;
 540         inp_wunlock(inp);
 541         /*
 542          * XXX - need to defer shutdown while there is still data in the queue
 543          *
 544          */
 545         CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
 546         cxgb_ofld_send(d->cdev, m);
 547
 548 }
 549
 550 /*
 551  * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 552  * and send it along.
 553  */
 554 static void
 555 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 556 {
 557         struct cpl_abort_req *req = cplhdr(m);
 558
 559         req->cmd = CPL_ABORT_NO_RST;
 560         cxgb_ofld_send(cdev, m);
 561 }
 562
 563 /*
 564  * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 565  * permitted to return without sending the message in case we cannot allocate
 566  * an sk_buff.  Returns the number of credits sent.
 567  */
 568 uint32_t
 569 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
 570 {
 571         struct mbuf *m;
 572         struct cpl_rx_data_ack *req;
 573         struct toepcb *toep = tp->t_toe;
 574         struct toedev *tdev = toep->tp_toedev;
 575
 576         m = m_gethdr_nofail(sizeof(*req));
 577
 578         DPRINTF("returning %u credits to HW\n", credits);
 579
 580         req = mtod(m, struct cpl_rx_data_ack *);
 581         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 582         req->wr.wr_lo = 0;
 583         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 584         req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 585         m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
 586         cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 587         return (credits);
 588 }
 589
 590 /*
 591  * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 592  * This is only used in DDP mode, so we take the opportunity to also set the
 593  * DACK mode and flush any Rx credits.
 594  */
 595 void
 596 t3_send_rx_modulate(struct toepcb *toep)
 597 {
 598         struct mbuf *m;
 599         struct cpl_rx_data_ack *req;
 600
 601         m = m_gethdr_nofail(sizeof(*req));
 602
 603         req = mtod(m, struct cpl_rx_data_ack *);
 604         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 605         req->wr.wr_lo = 0;
 606         m->m_pkthdr.len = m->m_len = sizeof(*req);
 607
 608         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 609         req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 610                                  V_RX_DACK_MODE(1) |
 611                                  V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
 612         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 613         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 614         toep->tp_rcv_wup = toep->tp_copied_seq;
 615 }
 616
/*
 * Handle receipt of an urgent pointer.
 *
 * so      - socket the urgent pointer was received on.
 * urg_seq - sequence number reported for the urgent pointer.
 *
 * The whole body is compiled only when URGENT_DATA_SUPPORTED is defined;
 * otherwise this function does nothing.
 *
 * NOTE(review): the conditional body refers to identifiers that are not
 * declared in this function (sk, sk_buff, skb_peek, tom_eat_skb, ...), so it
 * does not look buildable as-is -- confirm before enabling
 * URGENT_DATA_SUPPORTED.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
        struct tcpcb *tp = so_sototcpcb(so);

        urg_seq--;   /* initially points past the urgent data, per BSD */

        if (tp->urg_data && !after(urg_seq, tp->urg_seq))
                return;                                 /* duplicate pointer */
        sk_send_sigurg(sk);
        /*
         * A previous urgent byte is still unread and not delivered inline:
         * step past it.
         */
        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                tp->copied_seq++;
                if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
                        tom_eat_skb(sk, skb, 0);
        }
        /* Record the new urgent pointer; its data has not arrived yet. */
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = urg_seq;
#endif
}
 643
 644 /*
 645  * Returns true if a socket cannot accept new Rx data.
 646  */
 647 static inline int
 648 so_no_receive(const struct socket *so)
 649 {
 650         return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
 651 }
 652
 653 /*
 654  * Process an urgent data notification.
 655  */
 656 static void
 657 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
 658 {
 659         struct cpl_rx_urg_notify *hdr = cplhdr(m);
 660         struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 661
 662         VALIDATE_SOCK(so);
 663
 664         if (!so_no_receive(so))
 665                 handle_urg_ptr(so, ntohl(hdr->seq));
 666
 667         m_freem(m);
 668 }
 669
 670 /*
 671  * Handler for RX_URG_NOTIFY CPL messages.
 672  */
 673 static int
 674 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 675 {
 676         struct toepcb *toep = (struct toepcb *)ctx;
 677
 678         rx_urg_notify(toep, m);
 679         return (0);
 680 }
 681
 682 static __inline int
 683 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
 684 {
 685         return (toep->tp_ulp_mode ||
 686                 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
 687                     dev->tod_ttid >= TOE_ID_CHELSIO_T3));
 688 }
 689
 690 /*
 691  * Set of states for which we should return RX credits.
 692  */
 693 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
 694
 695 /*
 696  * Called after some received data has been read.  It returns RX credits
 697  * to the HW for the amount of data processed.
 698  */
 699 void
 700 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
 701 {
 702         struct toepcb *toep = tp->t_toe;
 703         struct socket *so;
 704         struct toedev *dev;
 705         int dack_mode, must_send, read;
 706         u32 thres, credits, dack = 0;
 707         struct sockbuf *rcv;
 708
 709         so = inp_inpcbtosocket(tp->t_inpcb);
 710         rcv = so_sockbuf_rcv(so);
 711
 712         if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
 713                 (tp->t_state == TCPS_FIN_WAIT_2))) {
 714                 if (copied) {
 715                         sockbuf_lock(rcv);
 716                         toep->tp_copied_seq += copied;
 717                         sockbuf_unlock(rcv);
 718                 }
 719
 720                 return;
 721         }
 722
 723         inp_lock_assert(tp->t_inpcb);
 724
 725         sockbuf_lock(rcv);
 726         if (copied)
 727                 toep->tp_copied_seq += copied;
 728         else {
 729                 read = toep->tp_enqueued_bytes - rcv->sb_cc;
 730                 toep->tp_copied_seq += read;
 731         }
 732         credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 733         toep->tp_enqueued_bytes = rcv->sb_cc;
 734         sockbuf_unlock(rcv);
 735
 736         if (credits > rcv->sb_mbmax) {
 737                 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
 738                     toep->tp_copied_seq, toep->tp_rcv_wup, credits);
 739             credits = rcv->sb_mbmax;
 740         }
 741
 742
 743         /*
 744          * XXX this won't accurately reflect credit return - we need
 745          * to look at the difference between the amount that has been
 746          * put in the recv sockbuf and what is there now
 747          */
 748
 749         if (__predict_false(!credits))
 750                 return;
 751
 752         dev = toep->tp_toedev;
 753         thres = TOM_TUNABLE(dev, rx_credit_thres);
 754
 755         if (__predict_false(thres == 0))
 756                 return;
 757
 758         if (is_delack_mode_valid(dev, toep)) {
 759                 dack_mode = TOM_TUNABLE(dev, delack);
 760                 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
 761                         u32 r = tp->rcv_nxt - toep->tp_delack_seq;
 762
 763                         if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
 764                                 dack = F_RX_DACK_CHANGE |
 765                                        V_RX_DACK_MODE(dack_mode);
 766                 }
 767         } else
 768                 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 769
 770         /*
 771          * For coalescing to work effectively ensure the receive window has
 772          * at least 16KB left.
 773          */
 774         must_send = credits + 16384 >= tp->rcv_wnd;
 775
 776         if (must_send || credits >= thres)
 777                 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
 778 }
 779
 780 static int
 781 cxgb_toe_disconnect(struct tcpcb *tp)
 782 {
 783         struct socket *so;
 784
 785         DPRINTF("cxgb_toe_disconnect\n");
 786
 787         so = inp_inpcbtosocket(tp->t_inpcb);
 788         close_conn(so);
 789         return (0);
 790 }
 791
 792 static int
 793 cxgb_toe_reset(struct tcpcb *tp)
 794 {
 795         struct toepcb *toep = tp->t_toe;
 796
 797         t3_send_reset(toep);
 798
 799         /*
 800          * unhook from socket
 801          */
 802         tp->t_flags &= ~TF_TOE;
 803         toep->tp_tp = NULL;
 804         tp->t_toe = NULL;
 805         return (0);
 806 }
 807
 808 static int
 809 cxgb_toe_send(struct tcpcb *tp)
 810 {
 811         struct socket *so;
 812
 813         DPRINTF("cxgb_toe_send\n");
 814         dump_toepcb(tp->t_toe);
 815
 816         so = inp_inpcbtosocket(tp->t_inpcb);
 817         t3_push_frames(so, 1);
 818         return (0);
 819 }
 820
 821 static int
 822 cxgb_toe_rcvd(struct tcpcb *tp)
 823 {
 824
 825         inp_lock_assert(tp->t_inpcb);
 826
 827         t3_cleanup_rbuf(tp, 0);
 828
 829         return (0);
 830 }
 831
 832 static void
 833 cxgb_toe_detach(struct tcpcb *tp)
 834 {
 835         struct toepcb *toep;
 836
 837         /*
 838          * XXX how do we handle teardown in the SYN_SENT state?
 839          *
 840          */
 841         inp_lock_assert(tp->t_inpcb);
 842         toep = tp->t_toe;
 843         toep->tp_tp = NULL;
 844
 845         /*
 846          * unhook from socket
 847          */
 848         tp->t_flags &= ~TF_TOE;
 849         tp->t_toe = NULL;
 850 }
 851
 852
 853 static struct toe_usrreqs cxgb_toe_usrreqs = {
 854         .tu_disconnect = cxgb_toe_disconnect,
 855         .tu_reset = cxgb_toe_reset,
 856         .tu_send = cxgb_toe_send,
 857         .tu_rcvd = cxgb_toe_rcvd,
 858         .tu_detach = cxgb_toe_detach,
 859         .tu_detach = cxgb_toe_detach,
 860         .tu_syncache_event = handle_syncache_event,
 861 };
 862
 863
 864 static void
 865 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
 866                             uint64_t mask, uint64_t val, int no_reply)
 867 {
 868         struct cpl_set_tcb_field *req;
 869
 870         CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 871             toep->tp_tid, word, mask, val);
 872
 873         req = mtod(m, struct cpl_set_tcb_field *);
 874         m->m_pkthdr.len = m->m_len = sizeof(*req);
 875         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 876         req->wr.wr_lo = 0;
 877         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
 878         req->reply = V_NO_REPLY(no_reply);
 879         req->cpu_idx = 0;
 880         req->word = htons(word);
 881         req->mask = htobe64(mask);
 882         req->val = htobe64(val);
 883
 884         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 885         send_or_defer(toep, m, 0);
 886 }
 887
 888 static void
 889 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
 890 {
 891         struct mbuf *m;
 892         struct tcpcb *tp = toep->tp_tp;
 893
 894         if (toep == NULL)
 895                 return;
 896
 897         if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
 898                 printf("not seting field\n");
 899                 return;
 900         }
 901
 902         m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
 903
 904         __set_tcb_field(toep, m, word, mask, val, 1);
 905 }
 906
 907 /*
 908  * Set one of the t_flags bits in the TCB.
 909  */
 910 static void
 911 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
 912 {
 913
 914         t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
 915 }
 916
 917 /*
 918  * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 919  */
 920 static void
 921 t3_set_nagle(struct toepcb *toep)
 922 {
 923         struct tcpcb *tp = toep->tp_tp;
 924
 925         set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
 926 }
 927
 928 /*
 929  * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 930  */
 931 void
 932 t3_set_keepalive(struct toepcb *toep, int on_off)
 933 {
 934
 935         set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
 936 }
 937
 938 void
 939 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
 940 {
 941         set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
 942 }
 943
 944 void
 945 t3_set_dack_mss(struct toepcb *toep, int on_off)
 946 {
 947
 948         set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
 949 }
 950
 951 /*
 952  * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 953  */
 954 static void
 955 t3_set_tos(struct toepcb *toep)
 956 {
 957         int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
 958
 959         t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
 960                          V_TCB_TOS(tos));
 961 }
 962
 963
 964 /*
 965  * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 966  * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 967  * set the PSH bit in the last segment, which would trigger delivery.]
 968  * We work around the issue by setting a DDP buffer in a partial placed state,
 969  * which guarantees that TP will schedule a timer.
 970  */
 971 #define TP_DDP_TIMER_WORKAROUND_MASK\
 972     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
 973      ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
 974        V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
 975 #define TP_DDP_TIMER_WORKAROUND_VAL\
 976     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
 977      ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
 978       32))
 979
 980 static void
 981 t3_enable_ddp(struct toepcb *toep, int on)
 982 {
 983         if (on) {
 984
 985                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
 986                                  V_TF_DDP_OFF(0));
 987         } else
 988                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
 989                                  V_TF_DDP_OFF(1) |
 990                                  TP_DDP_TIMER_WORKAROUND_MASK,
 991                                  V_TF_DDP_OFF(1) |
 992                                  TP_DDP_TIMER_WORKAROUND_VAL);
 993
 994 }
 995
 996 void
 997 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
 998 {
 999         t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1000                          V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1001                          tag_color);
1002 }
1003
1004 void
1005 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1006                     unsigned int len)
1007 {
1008         if (buf_idx == 0)
1009                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1010                          V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1011                          V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1012                          V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1013                          V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1014         else
1015                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1016                          V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1017                          V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1018                          V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1019                          V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1020 }
1021
/*
 * Validate a congestion control algorithm name.
 *
 * When CONGESTION_CONTROL_SUPPORTED is defined, 'name' is looked up in
 * t3_cong_ops[] and EINVAL is returned if it is not present.  Without that
 * option every name is accepted.  Returns 0 on success.
 *
 * The error is a positive errno: t3_tcp_ctloutput() hands it straight back
 * to its caller, and t3_ctloutput() compares results against the positive
 * EOPNOTSUPP, so a negated value would not be handled as an error code.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
        int cong_algo;

        for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
                if (!strcmp(name, t3_cong_ops[cong_algo].name))
                        break;

        if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
                return (EINVAL);
#endif
        return (0);
}
1037
1038 int
1039 t3_get_tcb(struct toepcb *toep)
1040 {
1041         struct cpl_get_tcb *req;
1042         struct tcpcb *tp = toep->tp_tp;
1043         struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1044
1045         if (!m)
1046                 return (ENOMEM);
1047
1048         inp_lock_assert(tp->t_inpcb);
1049         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1050         req = mtod(m, struct cpl_get_tcb *);
1051         m->m_pkthdr.len = m->m_len = sizeof(*req);
1052         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1053         req->wr.wr_lo = 0;
1054         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1055         req->cpuno = htons(toep->tp_qset);
1056         req->rsvd = 0;
1057         if (tp->t_state == TCPS_SYN_SENT)
1058                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1059         else
1060                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1061         return 0;
1062 }
1063
1064 static inline void
1065 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1066 {
1067
1068         toepcb_hold(toep);
1069
1070         cxgb_insert_tid(d->cdev, d->client, toep, tid);
1071 }
1072
1073 /**
1074  *      find_best_mtu - find the entry in the MTU table closest to an MTU
1075  *      @d: TOM state
1076  *      @mtu: the target MTU
1077  *
1078  *      Returns the index of the value in the MTU table that is closest to but
1079  *      does not exceed the target MTU.
1080  */
1081 static unsigned int
1082 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1083 {
1084         int i = 0;
1085
1086         while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1087                 ++i;
1088         return (i);
1089 }
1090
1091 static unsigned int
1092 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1093 {
1094         unsigned int idx;
1095
1096 #ifdef notyet
1097         struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1098 #endif
1099         if (tp) {
1100                 tp->t_maxseg = pmtu - 40;
1101                 if (tp->t_maxseg < td->mtus[0] - 40)
1102                         tp->t_maxseg = td->mtus[0] - 40;
1103                 idx = find_best_mtu(td, tp->t_maxseg + 40);
1104
1105                 tp->t_maxseg = td->mtus[idx] - 40;
1106         } else
1107                 idx = find_best_mtu(td, pmtu);
1108
1109         return (idx);
1110 }
1111
1112 static inline void
1113 free_atid(struct t3cdev *cdev, unsigned int tid)
1114 {
1115         struct toepcb *toep = cxgb_free_atid(cdev, tid);
1116
1117         if (toep)
1118                 toepcb_release(toep);
1119 }
1120
1121 /*
1122  * Release resources held by an offload connection (TID, L2T entry, etc.)
1123  */
1124 static void
1125 t3_release_offload_resources(struct toepcb *toep)
1126 {
1127         struct tcpcb *tp = toep->tp_tp;
1128         struct toedev *tdev = toep->tp_toedev;
1129         struct t3cdev *cdev;
1130         struct socket *so;
1131         unsigned int tid = toep->tp_tid;
1132         struct sockbuf *rcv;
1133
1134         CTR0(KTR_TOM, "t3_release_offload_resources");
1135
1136         if (!tdev)
1137                 return;
1138
1139         cdev = TOEP_T3C_DEV(toep);
1140         if (!cdev)
1141                 return;
1142
1143         toep->tp_qset = 0;
1144         t3_release_ddp_resources(toep);
1145
1146 #ifdef CTRL_SKB_CACHE
1147         kfree_skb(CTRL_SKB_CACHE(tp));
1148         CTRL_SKB_CACHE(tp) = NULL;
1149 #endif
1150
1151         if (toep->tp_wr_avail != toep->tp_wr_max) {
1152                 purge_wr_queue(toep);
1153                 reset_wr_list(toep);
1154         }
1155
1156         if (toep->tp_l2t) {
1157                 l2t_release(L2DATA(cdev), toep->tp_l2t);
1158                 toep->tp_l2t = NULL;
1159         }
1160         toep->tp_tp = NULL;
1161         if (tp) {
1162                 inp_lock_assert(tp->t_inpcb);
1163                 so = inp_inpcbtosocket(tp->t_inpcb);
1164                 rcv = so_sockbuf_rcv(so);
1165                 /*
1166                  * cancel any offloaded reads
1167                  *
1168                  */
1169                 sockbuf_lock(rcv);
1170                 tp->t_toe = NULL;
1171                 tp->t_flags &= ~TF_TOE;
1172                 if (toep->tp_ddp_state.user_ddp_pending) {
1173                         t3_cancel_ubuf(toep, rcv);
1174                         toep->tp_ddp_state.user_ddp_pending = 0;
1175                 }
1176                 so_sorwakeup_locked(so);
1177
1178         }
1179
1180         if (toep->tp_state == TCPS_SYN_SENT) {
1181                 free_atid(cdev, tid);
1182 #ifdef notyet
1183                 __skb_queue_purge(&tp->out_of_order_queue);
1184 #endif
1185         } else {                                          // we have TID
1186                 cxgb_remove_tid(cdev, toep, tid);
1187                 toepcb_release(toep);
1188         }
1189 #if 0
1190         log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1191 #endif
1192 }
1193
1194 static void
1195 install_offload_ops(struct socket *so)
1196 {
1197         struct tcpcb *tp = so_sototcpcb(so);
1198
1199         KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1200
1201         t3_install_socket_ops(so);
1202         tp->t_flags |= TF_TOE;
1203         tp->t_tu = &cxgb_toe_usrreqs;
1204 }
1205
1206 /*
1207  * Determine the receive window scaling factor given a target max
1208  * receive window.
1209  */
1210 static __inline int
1211 select_rcv_wscale(int space)
1212 {
1213         int wscale = 0;
1214
1215         if (space > MAX_RCV_WND)
1216                 space = MAX_RCV_WND;
1217
1218         if (V_tcp_do_rfc1323)
1219                 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1220
1221         return (wscale);
1222 }
1223
1224 /*
1225  * Determine the receive window size for a socket.
1226  */
1227 static unsigned long
1228 select_rcv_wnd(struct toedev *dev, struct socket *so)
1229 {
1230         struct tom_data *d = TOM_DATA(dev);
1231         unsigned int wnd;
1232         unsigned int max_rcv_wnd;
1233         struct sockbuf *rcv;
1234
1235         rcv = so_sockbuf_rcv(so);
1236
1237         if (V_tcp_do_autorcvbuf)
1238                 wnd = V_tcp_autorcvbuf_max;
1239         else
1240                 wnd = rcv->sb_hiwat;
1241
1242
1243
1244         /* XXX
1245          * For receive coalescing to work effectively we need a receive window
1246          * that can accomodate a coalesced segment.
1247          */
1248         if (wnd < MIN_RCV_WND)
1249                 wnd = MIN_RCV_WND;
1250
1251         /* PR 5138 */
1252         max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1253                                     (uint32_t)d->rx_page_size * 23 :
1254                                     MAX_RCV_WND);
1255
1256         return min(wnd, max_rcv_wnd);
1257 }
1258
1259 /*
1260  * Assign offload parameters to some socket fields.  This code is used by
1261  * both active and passive opens.
1262  */
1263 static inline void
1264 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1265     struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1266 {
1267         struct tcpcb *tp = so_sototcpcb(so);
1268         struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1269         struct sockbuf *snd, *rcv;
1270
1271 #ifdef notyet
1272         SOCK_LOCK_ASSERT(so);
1273 #endif
1274
1275         snd = so_sockbuf_snd(so);
1276         rcv = so_sockbuf_rcv(so);
1277
1278         log(LOG_INFO, "initializing offload socket\n");
1279         /*
1280          * We either need to fix push frames to work with sbcompress
1281          * or we need to add this
1282          */
1283         snd->sb_flags |= SB_NOCOALESCE;
1284         rcv->sb_flags |= SB_NOCOALESCE;
1285
1286         tp->t_toe = toep;
1287         toep->tp_tp = tp;
1288         toep->tp_toedev = dev;
1289
1290         toep->tp_tid = tid;
1291         toep->tp_l2t = e;
1292         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1293         toep->tp_wr_unacked = 0;
1294         toep->tp_delack_mode = 0;
1295
1296         toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1297         /*
1298          * XXX broken
1299          *
1300          */
1301         tp->rcv_wnd = select_rcv_wnd(dev, so);
1302
1303         toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1304                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1305         toep->tp_qset_idx = 0;
1306
1307         reset_wr_list(toep);
1308         DPRINTF("initialization done\n");
1309 }
1310
1311 /*
1312  * The next two functions calculate the option 0 value for a socket.
1313  */
1314 static inline unsigned int
1315 calc_opt0h(struct socket *so, int mtu_idx)
1316 {
1317         struct tcpcb *tp = so_sototcpcb(so);
1318         int wscale = select_rcv_wscale(tp->rcv_wnd);
1319
1320         return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1321             V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1322             V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1323 }
1324
1325 static inline unsigned int
1326 calc_opt0l(struct socket *so, int ulp_mode)
1327 {
1328         struct tcpcb *tp = so_sototcpcb(so);
1329         unsigned int val;
1330
1331         val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1332                V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1333
1334         DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1335         return (val);
1336 }
1337
1338 static inline unsigned int
1339 calc_opt2(const struct socket *so, struct toedev *dev)
1340 {
1341         int flv_valid;
1342
1343         flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1344
1345         return (V_FLAVORS_VALID(flv_valid) |
1346             V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1347 }
1348
#if DEBUG_WR > 1
/*
 * Debug helper: walk the toepcb's work-request queue and add up the
 * csum_data field of every queued mbuf.
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
        const struct mbuf *m;
        int total;

        total = 0;
        wr_queue_walk(toep, m) {
                total += m->m_pkthdr.csum_data;
        }
        return (total);
}
#endif
1361
1362 #if 0
1363 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1364 #endif
1365
1366 static void
1367 mk_act_open_req(struct socket *so, struct mbuf *m,
1368     unsigned int atid, const struct l2t_entry *e)
1369 {
1370         struct cpl_act_open_req *req;
1371         struct inpcb *inp = so_sotoinpcb(so);
1372         struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1373         struct toepcb *toep = tp->t_toe;
1374         struct toedev *tdev = toep->tp_toedev;
1375
1376         m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1377
1378         req = mtod(m, struct cpl_act_open_req *);
1379         m->m_pkthdr.len = m->m_len = sizeof(*req);
1380
1381         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1382         req->wr.wr_lo = 0;
1383         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1384         inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1385 #if 0
1386         req->local_port = inp->inp_lport;
1387         req->peer_port = inp->inp_fport;
1388         memcpy(&req->local_ip, &inp->inp_laddr, 4);
1389         memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1390 #endif
1391         req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1392                            V_TX_CHANNEL(e->smt_idx));
1393         req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1394         req->params = 0;
1395         req->opt2 = htonl(calc_opt2(so, tdev));
1396 }
1397
1398
/*
 * Convert an ACT_OPEN_RPL status to an errno.  Statuses without a
 * dedicated mapping become EIO.  A "connection exists" status is also
 * logged.
 */
static int
act_open_rpl_status_to_errno(int status)
{
        int error;

        switch (status) {
        case CPL_ERR_CONN_RESET:
                error = ECONNREFUSED;
                break;
        case CPL_ERR_ARP_MISS:
                error = EHOSTUNREACH;
                break;
        case CPL_ERR_CONN_TIMEDOUT:
                error = ETIMEDOUT;
                break;
        case CPL_ERR_TCAM_FULL:
                error = ENOMEM;
                break;
        case CPL_ERR_CONN_EXIST:
                log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
                error = EADDRINUSE;
                break;
        default:
                error = EIO;
                break;
        }
        return (error);
}
1421
1422 static void
1423 fail_act_open(struct toepcb *toep, int errno)
1424 {
1425         struct tcpcb *tp = toep->tp_tp;
1426
1427         t3_release_offload_resources(toep);
1428         if (tp) {
1429                 inp_wunlock(tp->t_inpcb);
1430                 tcp_offload_drop(tp, errno);
1431         }
1432
1433 #ifdef notyet
1434         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1435 #endif
1436 }
1437
1438 /*
1439  * Handle active open failures.
1440  */
1441 static void
1442 active_open_failed(struct toepcb *toep, struct mbuf *m)
1443 {
1444         struct cpl_act_open_rpl *rpl = cplhdr(m);
1445         struct inpcb *inp;
1446
1447         if (toep->tp_tp == NULL)
1448                 goto done;
1449
1450         inp = toep->tp_tp->t_inpcb;
1451
1452 /*
1453  * Don't handle connection retry for now
1454  */
1455 #ifdef notyet
1456         struct inet_connection_sock *icsk = inet_csk(sk);
1457
1458         if (rpl->status == CPL_ERR_CONN_EXIST &&
1459             icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1460                 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1461                 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1462                                jiffies + HZ / 2);
1463         } else
1464 #endif
1465         {
1466                 inp_wlock(inp);
1467                 /*
1468                  * drops the inpcb lock
1469                  */
1470                 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1471         }
1472
1473         done:
1474         m_free(m);
1475 }
1476
1477 /*
1478  * Return whether a failed active open has allocated a TID
1479  */
1480 static inline int
1481 act_open_has_tid(int status)
1482 {
1483         return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1484                status != CPL_ERR_ARP_MISS;
1485 }
1486
1487 /*
1488  * Process an ACT_OPEN_RPL CPL message.
1489  */
1490 static int
1491 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1492 {
1493         struct toepcb *toep = (struct toepcb *)ctx;
1494         struct cpl_act_open_rpl *rpl = cplhdr(m);
1495
1496         if (cdev->type != T3A && act_open_has_tid(rpl->status))
1497                 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1498
1499         active_open_failed(toep, m);
1500         return (0);
1501 }
1502
/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 *
 * If the connection is still in SYN_SENT or SYN_RECEIVED the open is failed
 * with EHOSTUNREACH and the mbuf is freed; otherwise only the inpcb lock
 * taken here is released again.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
        struct toepcb *toep = m_get_toep(m);
        struct tcpcb *tp = toep->tp_tp;
        struct inpcb *inp = tp->t_inpcb;

        inp_wlock(inp);
        if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
                /*
                 * fail_act_open() takes the toepcb (not a socket) and
                 * drops the inpcb lock.
                 */
                fail_act_open(toep, EHOSTUNREACH);
                printf("freeing %p\n", m);

                m_free(m);
        } else
                inp_wunlock(inp);
}
#endif
1533 /*
1534  * Send an active open request.
1535  */
1536 int
1537 t3_connect(struct toedev *tdev, struct socket *so,
1538     struct rtentry *rt, struct sockaddr *nam)
1539 {
1540         struct mbuf *m;
1541         struct l2t_entry *e;
1542         struct tom_data *d = TOM_DATA(tdev);
1543         struct inpcb *inp = so_sotoinpcb(so);
1544         struct tcpcb *tp = intotcpcb(inp);
1545         struct toepcb *toep; /* allocated by init_offload_socket */
1546
1547         int atid;
1548
1549         toep = toepcb_alloc();
1550         if (toep == NULL)
1551                 goto out_err;
1552
1553         if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1554                 goto out_err;
1555
1556         e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1557         if (!e)
1558                 goto free_tid;
1559
1560         inp_lock_assert(inp);
1561         m = m_gethdr(MT_DATA, M_WAITOK);
1562
1563 #if 0
1564         m->m_toe.mt_toepcb = tp->t_toe;
1565         set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1566 #endif
1567         so_lock(so);
1568
1569         init_offload_socket(so, tdev, atid, e, rt, toep);
1570
1571         install_offload_ops(so);
1572
1573         mk_act_open_req(so, m, atid, e);
1574         so_unlock(so);
1575
1576         soisconnecting(so);
1577         toep = tp->t_toe;
1578         m_set_toep(m, tp->t_toe);
1579
1580         toep->tp_state = TCPS_SYN_SENT;
1581         l2t_send(d->cdev, (struct mbuf *)m, e);
1582
1583         if (toep->tp_ulp_mode)
1584                 t3_enable_ddp(toep, 0);
1585         return  (0);
1586
1587 free_tid:
1588         printf("failing connect - free atid\n");
1589
1590         free_atid(d->cdev, atid);
1591 out_err:
1592         printf("return ENOMEM\n");
1593        return (ENOMEM);
1594 }
1595
1596 /*
1597  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1598  * not send multiple ABORT_REQs for the same connection and also that we do
1599  * not try to send a message after the connection has closed.  Returns 1 if
1600  * an ABORT_REQ wasn't generated after all, 0 otherwise.
1601  */
1602 static void
1603 t3_send_reset(struct toepcb *toep)
1604 {
1605
1606         struct cpl_abort_req *req;
1607         unsigned int tid = toep->tp_tid;
1608         int mode = CPL_ABORT_SEND_RST;
1609         struct tcpcb *tp = toep->tp_tp;
1610         struct toedev *tdev = toep->tp_toedev;
1611         struct socket *so = NULL;
1612         struct mbuf *m;
1613         struct sockbuf *snd;
1614
1615         if (tp) {
1616                 inp_lock_assert(tp->t_inpcb);
1617                 so = inp_inpcbtosocket(tp->t_inpcb);
1618         }
1619
1620         if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1621                 tdev == NULL))
1622                 return;
1623         toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1624
1625         snd = so_sockbuf_snd(so);
1626         /* Purge the send queue so we don't send anything after an abort. */
1627         if (so)
1628                 sbflush(snd);
1629         if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1630                 mode |= CPL_ABORT_POST_CLOSE_REQ;
1631
1632         m = m_gethdr_nofail(sizeof(*req));
1633         m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1634         set_arp_failure_handler(m, abort_arp_failure);
1635
1636         req = mtod(m, struct cpl_abort_req *);
1637         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1638         req->wr.wr_lo = htonl(V_WR_TID(tid));
1639         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1640         req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1641         req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1642         req->cmd = mode;
1643         if (tp && (tp->t_state == TCPS_SYN_SENT))
1644                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1645         else
1646                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1647 }
1648
1649 static int
1650 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1651 {
1652         struct inpcb *inp;
1653         int error, optval;
1654
1655         if (sopt->sopt_name == IP_OPTIONS)
1656                 return (ENOPROTOOPT);
1657
1658         if (sopt->sopt_name != IP_TOS)
1659                 return (EOPNOTSUPP);
1660
1661         error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1662
1663         if (error)
1664                 return (error);
1665
1666         if (optval > IPTOS_PREC_CRITIC_ECP)
1667                 return (EINVAL);
1668
1669         inp = so_sotoinpcb(so);
1670         inp_wlock(inp);
1671         inp_ip_tos_set(inp, optval);
1672 #if 0
1673         inp->inp_ip_tos = optval;
1674 #endif
1675         t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1676         inp_wunlock(inp);
1677
1678         return (0);
1679 }
1680
1681 static int
1682 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1683 {
1684         int err = 0;
1685         size_t copied;
1686
1687         if (sopt->sopt_name != TCP_CONGESTION &&
1688             sopt->sopt_name != TCP_NODELAY)
1689                 return (EOPNOTSUPP);
1690
1691         if (sopt->sopt_name == TCP_CONGESTION) {
1692                 char name[TCP_CA_NAME_MAX];
1693                 int optlen = sopt->sopt_valsize;
1694                 struct tcpcb *tp;
1695
1696                 if (sopt->sopt_dir == SOPT_GET) {
1697                         KASSERT(0, ("unimplemented"));
1698                         return (EOPNOTSUPP);
1699                 }
1700
1701                 if (optlen < 1)
1702                         return (EINVAL);
1703
1704                 err = copyinstr(sopt->sopt_val, name,
1705                     min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1706                 if (err)
1707                         return (err);
1708                 if (copied < 1)
1709                         return (EINVAL);
1710
1711                 tp = so_sototcpcb(so);
1712                 /*
1713                  * XXX I need to revisit this
1714                  */
1715                 if ((err = t3_set_cong_control(so, name)) == 0) {
1716 #ifdef CONGESTION_CONTROL_SUPPORTED
1717                         tp->t_cong_control = strdup(name, M_CXGB);
1718 #endif
1719                 } else
1720                         return (err);
1721         } else {
1722                 int optval, oldval;
1723                 struct inpcb *inp;
1724                 struct tcpcb *tp;
1725
1726                 if (sopt->sopt_dir == SOPT_GET)
1727                         return (EOPNOTSUPP);
1728
1729                 err = sooptcopyin(sopt, &optval, sizeof optval,
1730                     sizeof optval);
1731
1732                 if (err)
1733                         return (err);
1734
1735                 inp = so_sotoinpcb(so);
1736                 tp = inp_inpcbtotcpcb(inp);
1737
1738                 inp_wlock(inp);
1739
1740                 oldval = tp->t_flags;
1741                 if (optval)
1742                         tp->t_flags |= TF_NODELAY;
1743                 else
1744                         tp->t_flags &= ~TF_NODELAY;
1745                 inp_wunlock(inp);
1746
1747
1748                 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1749                         t3_set_nagle(tp->t_toe);
1750
1751         }
1752
1753         return (0);
1754 }
1755
1756 int
1757 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1758 {
1759         int err;
1760
1761         if (sopt->sopt_level != IPPROTO_TCP)
1762                 err =  t3_ip_ctloutput(so, sopt);
1763         else
1764                 err = t3_tcp_ctloutput(so, sopt);
1765
1766         if (err != EOPNOTSUPP)
1767                 return (err);
1768
1769         return (tcp_ctloutput(so, sopt));
1770 }
1771
/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.  The toepcb is not consulted: the answer is
 * unconditionally "yes".
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{

        return (1);
}
1781
1782 /*
1783  * Handles Rx data that arrives in a state where the socket isn't accepting
1784  * new data.
1785  */
1786 static void
1787 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1788 {
1789
1790         if (need_rst_on_excess_rx(toep) &&
1791             !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1792                 t3_send_reset(toep);
1793         m_freem(m);
1794 }
1795
1796 /*
1797  * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1798  * by getting the DDP offset from the TCB.
1799  */
1800 static void
1801 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1802 {
1803         struct ddp_state *q = &toep->tp_ddp_state;
1804         struct ddp_buf_state *bsp;
1805         struct cpl_get_tcb_rpl *hdr;
1806         unsigned int ddp_offset;
1807         struct socket *so;
1808         struct tcpcb *tp;
1809         struct sockbuf *rcv;
1810         int state;
1811
1812         uint64_t t;
1813         __be64 *tcb;
1814
1815         tp = toep->tp_tp;
1816         so = inp_inpcbtosocket(tp->t_inpcb);
1817
1818         inp_lock_assert(tp->t_inpcb);
1819         rcv = so_sockbuf_rcv(so);
1820         sockbuf_lock(rcv);
1821
1822         /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1823          * We really need a cookie in order to dispatch the RPLs.
1824          */
1825         q->get_tcb_count--;
1826
1827         /* It is a possible that a previous CPL already invalidated UBUF DDP
1828          * and moved the cur_buf idx and hence no further processing of this
1829          * skb is required. However, the app might be sleeping on
1830          * !q->get_tcb_count and we need to wake it up.
1831          */
1832         if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1833                 int state = so_state_get(so);
1834
1835                 m_freem(m);
1836                 if (__predict_true((state & SS_NOFDREF) == 0))
1837                         so_sorwakeup_locked(so);
1838                 else
1839                         sockbuf_unlock(rcv);
1840
1841                 return;
1842         }
1843
1844         bsp = &q->buf_state[q->cur_buf];
1845         hdr = cplhdr(m);
1846         tcb = (__be64 *)(hdr + 1);
1847         if (q->cur_buf == 0) {
1848                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1849                 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1850         } else {
1851                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1852                 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1853         }
1854         ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1855         m->m_cur_offset = bsp->cur_offset;
1856         bsp->cur_offset = ddp_offset;
1857         m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1858
1859         CTR5(KTR_TOM,
1860             "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1861             q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1862         KASSERT(ddp_offset >= m->m_cur_offset,
1863             ("ddp_offset=%u less than cur_offset=%u",
1864                 ddp_offset, m->m_cur_offset));
1865
1866 #if 0
1867 {
1868         unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1869
1870         t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1871         ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1872
1873         t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1874         rcv_nxt = t >> S_TCB_RCV_NXT;
1875         rcv_nxt &= M_TCB_RCV_NXT;
1876
1877         t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1878         rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1879         rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1880
1881         T3_TRACE2(TIDTB(sk),
1882                   "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1883                   ddp_flags, rcv_nxt - rx_hdr_offset);
1884         T3_TRACE4(TB(q),
1885                   "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1886                   tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1887         T3_TRACE3(TB(q),
1888                   "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1889                   rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1890         T3_TRACE2(TB(q),
1891                   "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1892                  q->buf_state[0].flags, q->buf_state[1].flags);
1893
1894 }
1895 #endif
1896         if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1897                 handle_excess_rx(toep, m);
1898                 return;
1899         }
1900
1901 #ifdef T3_TRACE
1902         if ((int)m->m_pkthdr.len < 0) {
1903                 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1904         }
1905 #endif
1906         if (bsp->flags & DDP_BF_NOCOPY) {
1907 #ifdef T3_TRACE
1908                 T3_TRACE0(TB(q),
1909                           "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1910
1911                 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1912                         printk("!cancel_ubuf");
1913                         t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1914                 }
1915 #endif
1916                 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1917                 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1918                 q->cur_buf ^= 1;
1919         } else if (bsp->flags & DDP_BF_NOFLIP) {
1920
1921                 m->m_ddp_flags = 1;    /* always a kernel buffer */
1922
1923                 /* now HW buffer carries a user buffer */
1924                 bsp->flags &= ~DDP_BF_NOFLIP;
1925                 bsp->flags |= DDP_BF_NOCOPY;
1926
1927                 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1928                  * any new data in which case we're done. If in addition the
1929                  * offset is 0, then there wasn't a completion for the kbuf
1930                  * and we need to decrement the posted count.
1931                  */
1932                 if (m->m_pkthdr.len == 0) {
1933                         if (ddp_offset == 0) {
1934                                 q->kbuf_posted--;
1935                                 bsp->flags |= DDP_BF_NODATA;
1936                         }
1937                         sockbuf_unlock(rcv);
1938                         m_free(m);
1939                         return;
1940                 }
1941         } else {
1942                 sockbuf_unlock(rcv);
1943
1944                 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1945                  * but it got here way late and nobody cares anymore.
1946                  */
1947                 m_free(m);
1948                 return;
1949         }
1950
1951         m->m_ddp_gl = (unsigned char *)bsp->gl;
1952         m->m_flags |= M_DDP;
1953         m->m_seq = tp->rcv_nxt;
1954         tp->rcv_nxt += m->m_pkthdr.len;
1955         tp->t_rcvtime = ticks;
1956         CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1957                   m->m_seq, q->cur_buf, m->m_pkthdr.len);
1958         if (m->m_pkthdr.len == 0) {
1959                 q->user_ddp_pending = 0;
1960                 m_free(m);
1961         } else
1962                 SBAPPEND(rcv, m);
1963
1964         state = so_state_get(so);
1965         if (__predict_true((state & SS_NOFDREF) == 0))
1966                 so_sorwakeup_locked(so);
1967         else
1968                 sockbuf_unlock(rcv);
1969 }
1970
1971 /*
1972  * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1973  * in that case they are similar to DDP completions.
1974  */
1975 static int
1976 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1977 {
1978         struct toepcb *toep = (struct toepcb *)ctx;
1979
1980         /* OK if socket doesn't exist */
1981         if (toep == NULL) {
1982                 printf("null toep in do_get_tcb_rpl\n");
1983                 return (CPL_RET_BUF_DONE);
1984         }
1985
1986         inp_wlock(toep->tp_tp->t_inpcb);
1987         tcb_rpl_as_ddp_complete(toep, m);
1988         inp_wunlock(toep->tp_tp->t_inpcb);
1989
1990         return (0);
1991 }
1992
1993 static void
1994 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1995 {
1996         struct tcpcb *tp = toep->tp_tp;
1997         struct socket *so;
1998         struct ddp_state *q;
1999         struct ddp_buf_state *bsp;
2000         struct cpl_rx_data *hdr = cplhdr(m);
2001         unsigned int rcv_nxt = ntohl(hdr->seq);
2002         struct sockbuf *rcv;
2003
2004         if (tp->rcv_nxt == rcv_nxt)
2005                 return;
2006
2007         inp_lock_assert(tp->t_inpcb);
2008         so  = inp_inpcbtosocket(tp->t_inpcb);
2009         rcv = so_sockbuf_rcv(so);
2010         sockbuf_lock(rcv);
2011
2012         q = &toep->tp_ddp_state;
2013         bsp = &q->buf_state[q->cur_buf];
2014         KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2015                 rcv_nxt, tp->rcv_nxt));
2016         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2017         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2018         CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2019             rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2020
2021 #ifdef T3_TRACE
2022         if ((int)m->m_pkthdr.len < 0) {
2023                 t3_ddp_error(so, "handle_ddp_data: neg len");
2024         }
2025 #endif
2026         m->m_ddp_gl = (unsigned char *)bsp->gl;
2027         m->m_flags |= M_DDP;
2028         m->m_cur_offset = bsp->cur_offset;
2029         m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2030         if (bsp->flags & DDP_BF_NOCOPY)
2031                 bsp->flags &= ~DDP_BF_NOCOPY;
2032
2033         m->m_seq = tp->rcv_nxt;
2034         tp->rcv_nxt = rcv_nxt;
2035         bsp->cur_offset += m->m_pkthdr.len;
2036         if (!(bsp->flags & DDP_BF_NOFLIP))
2037                 q->cur_buf ^= 1;
2038         /*
2039          * For now, don't re-enable DDP after a connection fell out of  DDP
2040          * mode.
2041          */
2042         q->ubuf_ddp_ready = 0;
2043         sockbuf_unlock(rcv);
2044 }
2045
2046 /*
2047  * Process new data received for a connection.
2048  */
2049 static void
2050 new_rx_data(struct toepcb *toep, struct mbuf *m)
2051 {
2052         struct cpl_rx_data *hdr = cplhdr(m);
2053         struct tcpcb *tp = toep->tp_tp;
2054         struct socket *so;
2055         struct sockbuf *rcv;
2056         int state;
2057         int len = be16toh(hdr->len);
2058
2059         inp_wlock(tp->t_inpcb);
2060
2061         so  = inp_inpcbtosocket(tp->t_inpcb);
2062
2063         if (__predict_false(so_no_receive(so))) {
2064                 handle_excess_rx(toep, m);
2065                 inp_wunlock(tp->t_inpcb);
2066                 TRACE_EXIT;
2067                 return;
2068         }
2069
2070         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2071                 handle_ddp_data(toep, m);
2072
2073         m->m_seq = ntohl(hdr->seq);
2074         m->m_ulp_mode = 0;                    /* for iSCSI */
2075
2076 #if VALIDATE_SEQ
2077         if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2078                 log(LOG_ERR,
2079                        "%s: TID %u: Bad sequence number %u, expected %u\n",
2080                     toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2081                        tp->rcv_nxt);
2082                 m_freem(m);
2083                 inp_wunlock(tp->t_inpcb);
2084                 return;
2085         }
2086 #endif
2087         m_adj(m, sizeof(*hdr));
2088
2089 #ifdef URGENT_DATA_SUPPORTED
2090         /*
2091          * We don't handle urgent data yet
2092          */
2093         if (__predict_false(hdr->urg))
2094                 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2095         if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2096                      tp->urg_seq - tp->rcv_nxt < skb->len))
2097                 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2098                                                          tp->rcv_nxt];
2099 #endif
2100         if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2101                 toep->tp_delack_mode = hdr->dack_mode;
2102                 toep->tp_delack_seq = tp->rcv_nxt;
2103         }
2104         CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2105             m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2106
2107         if (len < m->m_pkthdr.len)
2108                 m->m_pkthdr.len = m->m_len = len;
2109
2110         tp->rcv_nxt += m->m_pkthdr.len;
2111         tp->t_rcvtime = ticks;
2112         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2113         CTR2(KTR_TOM,
2114             "new_rx_data: seq 0x%x len %u",
2115             m->m_seq, m->m_pkthdr.len);
2116         inp_wunlock(tp->t_inpcb);
2117         rcv = so_sockbuf_rcv(so);
2118         sockbuf_lock(rcv);
2119 #if 0
2120         if (sb_notify(rcv))
2121                 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2122 #endif
2123         SBAPPEND(rcv, m);
2124
2125 #ifdef notyet
2126         /*
2127          * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2128          *
2129          */
2130         KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2131
2132             ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2133                 so, rcv->sb_cc, rcv->sb_mbmax));
2134 #endif
2135
2136
2137         CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2138             rcv->sb_cc, rcv->sb_mbcnt);
2139
2140         state = so_state_get(so);
2141         if (__predict_true((state & SS_NOFDREF) == 0))
2142                 so_sorwakeup_locked(so);
2143         else
2144                 sockbuf_unlock(rcv);
2145 }
2146
2147 /*
2148  * Handler for RX_DATA CPL messages.
2149  */
2150 static int
2151 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2152 {
2153         struct toepcb *toep = (struct toepcb *)ctx;
2154
2155         DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2156
2157         new_rx_data(toep, m);
2158
2159         return (0);
2160 }
2161
2162 static void
2163 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2164 {
2165         struct tcpcb *tp;
2166         struct ddp_state *q;
2167         struct ddp_buf_state *bsp;
2168         struct cpl_rx_data_ddp *hdr;
2169         struct socket *so;
2170         unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2171         int nomoredata = 0;
2172         unsigned int delack_mode;
2173         struct sockbuf *rcv;
2174
2175         tp = toep->tp_tp;
2176         inp_wlock(tp->t_inpcb);
2177         so = inp_inpcbtosocket(tp->t_inpcb);
2178
2179         if (__predict_false(so_no_receive(so))) {
2180
2181                 handle_excess_rx(toep, m);
2182                 inp_wunlock(tp->t_inpcb);
2183                 return;
2184         }
2185
2186         q = &toep->tp_ddp_state;
2187         hdr = cplhdr(m);
2188         ddp_report = ntohl(hdr->u.ddp_report);
2189         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2190         bsp = &q->buf_state[buf_idx];
2191
2192         CTR4(KTR_TOM,
2193             "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2194             "hdr seq 0x%x len %u",
2195             tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2196             ntohs(hdr->len));
2197         CTR3(KTR_TOM,
2198             "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2199             G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2200
2201         ddp_len = ntohs(hdr->len);
2202         rcv_nxt = ntohl(hdr->seq) + ddp_len;
2203
2204         delack_mode = G_DDP_DACK_MODE(ddp_report);
2205         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2206                 toep->tp_delack_mode = delack_mode;
2207                 toep->tp_delack_seq = tp->rcv_nxt;
2208         }
2209
2210         m->m_seq = tp->rcv_nxt;
2211         tp->rcv_nxt = rcv_nxt;
2212
2213         tp->t_rcvtime = ticks;
2214         /*
2215          * Store the length in m->m_len.  We are changing the meaning of
2216          * m->m_len here, we need to be very careful that nothing from now on
2217          * interprets ->len of this packet the usual way.
2218          */
2219         m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2220         inp_wunlock(tp->t_inpcb);
2221         CTR3(KTR_TOM,
2222             "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2223             m->m_len, rcv_nxt, m->m_seq);
2224         /*
2225          * Figure out where the new data was placed in the buffer and store it
2226          * in when.  Assumes the buffer offset starts at 0, consumer needs to
2227          * account for page pod's pg_offset.
2228          */
2229         end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2230         m->m_cur_offset = end_offset - m->m_pkthdr.len;
2231
2232         rcv = so_sockbuf_rcv(so);
2233         sockbuf_lock(rcv);
2234
2235         m->m_ddp_gl = (unsigned char *)bsp->gl;
2236         m->m_flags |= M_DDP;
2237         bsp->cur_offset = end_offset;
2238         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2239
2240         /*
2241          * Length is only meaningful for kbuf
2242          */
2243         if (!(bsp->flags & DDP_BF_NOCOPY))
2244                 KASSERT(m->m_len <= bsp->gl->dgl_length,
2245                     ("length received exceeds ddp pages: len=%d dgl_length=%d",
2246                         m->m_len, bsp->gl->dgl_length));
2247
2248         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2249         KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2250         /*
2251          * Bit 0 of flags stores whether the DDP buffer is completed.
2252          * Note that other parts of the code depend on this being in bit 0.
2253          */
2254         if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2255                 panic("spurious ddp completion");
2256         } else {
2257                 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2258                 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2259                         q->cur_buf ^= 1;                     /* flip buffers */
2260         }
2261
2262         if (bsp->flags & DDP_BF_NOCOPY) {
2263                 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2264                 bsp->flags &= ~DDP_BF_NOCOPY;
2265         }
2266
2267         if (ddp_report & F_DDP_PSH)
2268                 m->m_ddp_flags |= DDP_BF_PSH;
2269         if (nomoredata)
2270                 m->m_ddp_flags |= DDP_BF_NODATA;
2271
2272 #ifdef notyet
2273         skb_reset_transport_header(skb);
2274         tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2275 #endif
2276         SBAPPEND(rcv, m);
2277
2278         if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2279             (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2280                 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2281                 so_sorwakeup_locked(so);
2282         else
2283                 sockbuf_unlock(rcv);
2284 }
2285
/*
 * Status bits that do_rx_data_ddp() treats as errors when any of them is set
 * in the ddpvld_status field of a CPL_RX_DATA_DDP message.
 */
#define DDP_ERR								\
	(F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR |			\
	 F_DDP_ULIMIT_ERR | F_DDP_PPOD_PARITY_ERR |			\
	 F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |				\
	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR |				\
	 F_DDP_TID_MISMATCH | F_DDP_INVALID_PPOD)
2290
2291 /*
2292  * Handler for RX_DATA_DDP CPL messages.
2293  */
2294 static int
2295 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2296 {
2297         struct toepcb *toep = ctx;
2298         const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2299
2300         VALIDATE_SOCK(so);
2301
2302         if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2303                 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2304                        GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2305                 return (CPL_RET_BUF_DONE);
2306         }
2307 #if 0
2308         skb->h.th = tcphdr_skb->h.th;
2309 #endif
2310         new_rx_data_ddp(toep, m);
2311         return (0);
2312 }
2313
2314 static void
2315 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2316 {
2317         struct tcpcb *tp = toep->tp_tp;
2318         struct socket *so;
2319         struct ddp_state *q;
2320         struct ddp_buf_state *bsp;
2321         struct cpl_rx_ddp_complete *hdr;
2322         unsigned int ddp_report, buf_idx, when, delack_mode;
2323         int nomoredata = 0;
2324         struct sockbuf *rcv;
2325
2326         inp_wlock(tp->t_inpcb);
2327         so = inp_inpcbtosocket(tp->t_inpcb);
2328
2329         if (__predict_false(so_no_receive(so))) {
2330                 struct inpcb *inp = so_sotoinpcb(so);
2331
2332                 handle_excess_rx(toep, m);
2333                 inp_wunlock(inp);
2334                 return;
2335         }
2336         q = &toep->tp_ddp_state;
2337         hdr = cplhdr(m);
2338         ddp_report = ntohl(hdr->ddp_report);
2339         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2340         m->m_pkthdr.csum_data = tp->rcv_nxt;
2341
2342         rcv = so_sockbuf_rcv(so);
2343         sockbuf_lock(rcv);
2344
2345         bsp = &q->buf_state[buf_idx];
2346         when = bsp->cur_offset;
2347         m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2348         tp->rcv_nxt += m->m_len;
2349         tp->t_rcvtime = ticks;
2350
2351         delack_mode = G_DDP_DACK_MODE(ddp_report);
2352         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2353                 toep->tp_delack_mode = delack_mode;
2354                 toep->tp_delack_seq = tp->rcv_nxt;
2355         }
2356 #ifdef notyet
2357         skb_reset_transport_header(skb);
2358         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2359 #endif
2360         inp_wunlock(tp->t_inpcb);
2361
2362         KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2363         CTR5(KTR_TOM,
2364                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2365                   "ddp_report 0x%x offset %u, len %u",
2366                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2367                    G_DDP_OFFSET(ddp_report), m->m_len);
2368
2369         m->m_cur_offset = bsp->cur_offset;
2370         bsp->cur_offset += m->m_len;
2371
2372         if (!(bsp->flags & DDP_BF_NOFLIP)) {
2373                 q->cur_buf ^= 1;                     /* flip buffers */
2374                 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2375                         nomoredata=1;
2376         }
2377
2378         CTR4(KTR_TOM,
2379                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2380                   "ddp_report %u offset %u",
2381                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2382                    G_DDP_OFFSET(ddp_report));
2383
2384         m->m_ddp_gl = (unsigned char *)bsp->gl;
2385         m->m_flags |= M_DDP;
2386         m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2387         if (bsp->flags & DDP_BF_NOCOPY)
2388                 bsp->flags &= ~DDP_BF_NOCOPY;
2389         if (nomoredata)
2390                 m->m_ddp_flags |= DDP_BF_NODATA;
2391
2392         SBAPPEND(rcv, m);
2393         if ((so_state_get(so) & SS_NOFDREF) == 0)
2394                 so_sorwakeup_locked(so);
2395         else
2396                 sockbuf_unlock(rcv);
2397 }
2398
2399 /*
2400  * Handler for RX_DDP_COMPLETE CPL messages.
2401  */
2402 static int
2403 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2404 {
2405         struct toepcb *toep = ctx;
2406
2407         VALIDATE_SOCK(so);
2408 #if 0
2409         skb->h.th = tcphdr_skb->h.th;
2410 #endif
2411         process_ddp_complete(toep, m);
2412         return (0);
2413 }
2414
2415 /*
2416  * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2417  * socket state before calling tcp_time_wait to comply with its expectations.
2418  */
2419 static void
2420 enter_timewait(struct tcpcb *tp)
2421 {
2422         /*
2423          * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2424          * process peer_close because we don't want to carry the peer FIN in
2425          * the socket's receive queue and if we increment rcv_nxt without
2426          * having the FIN in the receive queue we'll confuse facilities such
2427          * as SIOCINQ.
2428          */
2429         inp_wlock(tp->t_inpcb);
2430         tp->rcv_nxt++;
2431
2432         tp->ts_recent_age = 0;       /* defeat recycling */
2433         tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2434         inp_wunlock(tp->t_inpcb);
2435         tcp_offload_twstart(tp);
2436 }
2437
2438 /*
2439  * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2440  * function deals with the data that may be reported along with the FIN.
2441  * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2442  * perform normal FIN-related processing.  In the latter case 1 indicates that
2443  * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2444  * skb can be freed.
2445  */
2446 static int
2447 handle_peer_close_data(struct socket *so, struct mbuf *m)
2448 {
2449         struct tcpcb *tp = so_sototcpcb(so);
2450         struct toepcb *toep = tp->t_toe;
2451         struct ddp_state *q;
2452         struct ddp_buf_state *bsp;
2453         struct cpl_peer_close *req = cplhdr(m);
2454         unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2455         struct sockbuf *rcv;
2456
2457         if (tp->rcv_nxt == rcv_nxt)                     /* no data */
2458                 return (0);
2459
2460         CTR0(KTR_TOM, "handle_peer_close_data");
2461         if (__predict_false(so_no_receive(so))) {
2462                 handle_excess_rx(toep, m);
2463
2464                 /*
2465                  * Although we discard the data we want to process the FIN so
2466                  * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2467                  * PEER_CLOSE without data.  In particular this PEER_CLOSE
2468                  * may be what will close the connection.  We return 1 because
2469                  * handle_excess_rx() already freed the packet.
2470                  */
2471                 return (1);
2472         }
2473
2474         inp_lock_assert(tp->t_inpcb);
2475         q = &toep->tp_ddp_state;
2476         rcv = so_sockbuf_rcv(so);
2477         sockbuf_lock(rcv);
2478
2479         bsp = &q->buf_state[q->cur_buf];
2480         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2481         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2482         m->m_ddp_gl = (unsigned char *)bsp->gl;
2483         m->m_flags |= M_DDP;
2484         m->m_cur_offset = bsp->cur_offset;
2485         m->m_ddp_flags =
2486             DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2487         m->m_seq = tp->rcv_nxt;
2488         tp->rcv_nxt = rcv_nxt;
2489         bsp->cur_offset += m->m_pkthdr.len;
2490         if (!(bsp->flags & DDP_BF_NOFLIP))
2491                 q->cur_buf ^= 1;
2492 #ifdef notyet
2493         skb_reset_transport_header(skb);
2494         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2495 #endif
2496         tp->t_rcvtime = ticks;
2497         SBAPPEND(rcv, m);
2498         if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2499                 so_sorwakeup_locked(so);
2500         else
2501                 sockbuf_unlock(rcv);
2502
2503         return (1);
2504 }
2505
2506 /*
2507  * Handle a peer FIN.
2508  */
2509 static void
2510 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2511 {
2512         struct socket *so;
2513         struct tcpcb *tp = toep->tp_tp;
2514         int keep, action;
2515
2516         action = keep = 0;
2517         CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2518         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2519                 printf("abort_pending set\n");
2520
2521                 goto out;
2522         }
2523         inp_wlock(tp->t_inpcb);
2524         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2525         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2526                 keep = handle_peer_close_data(so, m);
2527                 if (keep < 0) {
2528                         inp_wunlock(tp->t_inpcb);
2529                         return;
2530                 }
2531         }
2532         if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2533                 CTR1(KTR_TOM,
2534                     "waking up waiters for cantrcvmore on %p ", so);
2535                 socantrcvmore(so);
2536
2537                 /*
2538                  * If connection is half-synchronized
2539                  * (ie NEEDSYN flag on) then delay ACK,
2540                  * so it may be piggybacked when SYN is sent.
2541                  * Otherwise, since we received a FIN then no
2542                  * more input can be expected, send ACK now.
2543                  */
2544                 if (tp->t_flags & TF_NEEDSYN)
2545                         tp->t_flags |= TF_DELACK;
2546                 else
2547                         tp->t_flags |= TF_ACKNOW;
2548                 tp->rcv_nxt++;
2549         }
2550
2551         switch (tp->t_state) {
2552         case TCPS_SYN_RECEIVED:
2553             tp->t_starttime = ticks;
2554         /* FALLTHROUGH */
2555         case TCPS_ESTABLISHED:
2556                 tp->t_state = TCPS_CLOSE_WAIT;
2557                 break;
2558         case TCPS_FIN_WAIT_1:
2559                 tp->t_state = TCPS_CLOSING;
2560                 break;
2561         case TCPS_FIN_WAIT_2:
2562                 /*
2563                  * If we've sent an abort_req we must have sent it too late,
2564                  * HW will send us a reply telling us so, and this peer_close
2565                  * is really the last message for this connection and needs to
2566                  * be treated as an abort_rpl, i.e., transition the connection
2567                  * to TCP_CLOSE (note that the host stack does this at the
2568                  * time of generating the RST but we must wait for HW).
2569                  * Otherwise we enter TIME_WAIT.
2570                  */
2571                 t3_release_offload_resources(toep);
2572                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2573                         action = TCP_CLOSE;
2574                 } else {
2575                         action = TCP_TIMEWAIT;
2576                 }
2577                 break;
2578         default:
2579                 log(LOG_ERR,
2580                        "%s: TID %u received PEER_CLOSE in bad state %d\n",
2581                     toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2582         }
2583         inp_wunlock(tp->t_inpcb);
2584
2585         if (action == TCP_TIMEWAIT) {
2586                 enter_timewait(tp);
2587         } else if (action == TCP_DROP) {
2588                 tcp_offload_drop(tp, 0);
2589         } else if (action == TCP_CLOSE) {
2590                 tcp_offload_close(tp);
2591         }
2592
2593 #ifdef notyet
2594         /* Do not send POLL_HUP for half duplex close. */
2595         if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2596             sk->sk_state == TCP_CLOSE)
2597                 sk_wake_async(so, 1, POLL_HUP);
2598         else
2599                 sk_wake_async(so, 1, POLL_IN);
2600 #endif
2601
2602 out:
2603         if (!keep)
2604                 m_free(m);
2605 }
2606
2607 /*
2608  * Handler for PEER_CLOSE CPL messages.
2609  */
2610 static int
2611 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2612 {
2613         struct toepcb *toep = (struct toepcb *)ctx;
2614
2615         VALIDATE_SOCK(so);
2616
2617         do_peer_fin(toep, m);
2618         return (0);
2619 }
2620
2621 static void
2622 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2623 {
2624         struct cpl_close_con_rpl *rpl = cplhdr(m);
2625         struct tcpcb *tp = toep->tp_tp;
2626         struct socket *so;
2627         int action = 0;
2628         struct sockbuf *rcv;
2629
2630         inp_wlock(tp->t_inpcb);
2631         so = inp_inpcbtosocket(tp->t_inpcb);
2632
2633         tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2634
2635         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2636                 inp_wunlock(tp->t_inpcb);
2637                 goto out;
2638         }
2639
2640         CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2641             tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2642
2643         switch (tp->t_state) {
2644         case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2645                 t3_release_offload_resources(toep);
2646                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2647                         action = TCP_CLOSE;
2648
2649                 } else {
2650                         action = TCP_TIMEWAIT;
2651                 }
2652                 break;
2653         case TCPS_LAST_ACK:
2654                 /*
2655                  * In this state we don't care about pending abort_rpl.
2656                  * If we've sent abort_req it was post-close and was sent too
2657                  * late, this close_con_rpl is the actual last message.
2658                  */
2659                 t3_release_offload_resources(toep);
2660                 action = TCP_CLOSE;
2661                 break;
2662         case TCPS_FIN_WAIT_1:
2663                 /*
2664                  * If we can't receive any more
2665                  * data, then closing user can proceed.
2666                  * Starting the timer is contrary to the
2667                  * specification, but if we don't get a FIN
2668                  * we'll hang forever.
2669                  *
2670                  * XXXjl:
2671                  * we should release the tp also, and use a
2672                  * compressed state.
2673                  */
2674                 if (so)
2675                         rcv = so_sockbuf_rcv(so);
2676                 else
2677                         break;
2678
2679                 if (rcv->sb_state & SBS_CANTRCVMORE) {
2680                         int timeout;
2681
2682                         if (so)
2683                                 soisdisconnected(so);
2684                         timeout = (tcp_fast_finwait2_recycle) ?
2685                             tcp_finwait2_timeout : tcp_maxidle;
2686                         tcp_timer_activate(tp, TT_2MSL, timeout);
2687                 }
2688                 tp->t_state = TCPS_FIN_WAIT_2;
2689                 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2690                     (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2691                         action = TCP_DROP;
2692                 }
2693
2694                 break;
2695         default:
2696                 log(LOG_ERR,
2697                        "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2698                        toep->tp_toedev->tod_name, toep->tp_tid,
2699                        tp->t_state);
2700         }
2701         inp_wunlock(tp->t_inpcb);
2702
2703
2704         if (action == TCP_TIMEWAIT) {
2705                 enter_timewait(tp);
2706         } else if (action == TCP_DROP) {
2707                 tcp_offload_drop(tp, 0);
2708         } else if (action == TCP_CLOSE) {
2709                 tcp_offload_close(tp);
2710         }
2711 out:
2712         m_freem(m);
2713 }
2714
/*
 * CPL handler for CLOSE_CON_RPL messages.
 *
 * ctx is the toepcb registered for the TID.  The work, including freeing
 * the mbuf, is done by process_close_con_rpl().  Always returns 0.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{

	process_close_con_rpl((struct toepcb *)ctx, m);
	return (0);
}
2727
2728 /*
2729  * Process abort replies.  We only process these messages if we anticipate
2730  * them as the coordination between SW and HW in this area is somewhat lacking
2731  * and sometimes we get ABORT_RPLs after we are done with the connection that
2732  * originated the ABORT_REQ.
2733  */
2734 static void
2735 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2736 {
2737         struct tcpcb *tp = toep->tp_tp;
2738         struct socket *so;
2739         int needclose = 0;
2740
2741 #ifdef T3_TRACE
2742         T3_TRACE1(TIDTB(sk),
2743                   "process_abort_rpl: GTS rpl pending %d",
2744                   sock_flag(sk, ABORT_RPL_PENDING));
2745 #endif
2746
2747         inp_wlock(tp->t_inpcb);
2748         so = inp_inpcbtosocket(tp->t_inpcb);
2749
2750         if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2751                 /*
2752                  * XXX panic on tcpdrop
2753                  */
2754                 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2755                         toep->tp_flags |= TP_ABORT_RPL_RCVD;
2756                 else {
2757                         toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2758                         if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2759                             !is_t3a(toep->tp_toedev)) {
2760                                 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2761                                         panic("TP_ABORT_REQ_RCVD set");
2762                                 t3_release_offload_resources(toep);
2763                                 needclose = 1;
2764                         }
2765                 }
2766         }
2767         inp_wunlock(tp->t_inpcb);
2768
2769         if (needclose)
2770                 tcp_offload_close(tp);
2771
2772         m_free(m);
2773 }
2774
2775 /*
2776  * Handle an ABORT_RPL_RSS CPL message.
2777  */
2778 static int
2779 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2780 {
2781         struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2782         struct toepcb *toep;
2783
2784         /*
2785          * Ignore replies to post-close aborts indicating that the abort was
2786          * requested too late.  These connections are terminated when we get
2787          * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2788          * arrives the TID is either no longer used or it has been recycled.
2789          */
2790         if (rpl->status == CPL_ERR_ABORT_FAILED) {
2791 discard:
2792                 m_free(m);
2793                 return (0);
2794         }
2795
2796         toep = (struct toepcb *)ctx;
2797
2798         /*
2799          * Sometimes we've already closed the socket, e.g., a post-close
2800          * abort races with ABORT_REQ_RSS, the latter frees the socket
2801          * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2802          * but FW turns the ABORT_REQ into a regular one and so we get
2803          * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2804          */
2805         if (!toep)
2806                 goto discard;
2807
2808         if (toep->tp_tp == NULL) {
2809                 log(LOG_NOTICE, "removing tid for abort\n");
2810                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2811                 if (toep->tp_l2t)
2812                         l2t_release(L2DATA(cdev), toep->tp_l2t);
2813
2814                 toepcb_release(toep);
2815                 goto discard;
2816         }
2817
2818         log(LOG_NOTICE, "toep=%p\n", toep);
2819         log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2820
2821         toepcb_hold(toep);
2822         process_abort_rpl(toep, m);
2823         toepcb_release(toep);
2824         return (0);
2825 }
2826
2827 /*
2828  * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2829  * indicate whether RST should be sent in response.
2830  */
2831 static int
2832 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2833 {
2834         struct tcpcb *tp = so_sototcpcb(so);
2835
2836         switch (abort_reason) {
2837         case CPL_ERR_BAD_SYN:
2838 #if 0
2839                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);      // fall through
2840 #endif
2841         case CPL_ERR_CONN_RESET:
2842                 // XXX need to handle SYN_RECV due to crossed SYNs
2843                 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2844         case CPL_ERR_XMIT_TIMEDOUT:
2845         case CPL_ERR_PERSIST_TIMEDOUT:
2846         case CPL_ERR_FINWAIT2_TIMEDOUT:
2847         case CPL_ERR_KEEPALIVE_TIMEDOUT:
2848 #if 0
2849                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2850 #endif
2851                 return (ETIMEDOUT);
2852         default:
2853                 return (EIO);
2854         }
2855 }
2856
2857 static inline void
2858 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2859 {
2860         struct cpl_abort_rpl *rpl = cplhdr(m);
2861
2862         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2863         rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2864         m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2865
2866         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2867         rpl->cmd = cmd;
2868 }
2869
2870 static void
2871 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2872 {
2873         struct mbuf *reply_mbuf;
2874         struct cpl_abort_req_rss *req = cplhdr(m);
2875
2876         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2877         m_set_priority(m, CPL_PRIORITY_DATA);
2878         m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2879         set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2880         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2881         m_free(m);
2882 }
2883
2884 /*
2885  * Returns whether an ABORT_REQ_RSS message is a negative advice.
2886  */
2887 static inline int
2888 is_neg_adv_abort(unsigned int status)
2889 {
2890         return status == CPL_ERR_RTX_NEG_ADVICE ||
2891             status == CPL_ERR_PERSIST_NEG_ADVICE;
2892 }
2893
2894 static void
2895 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2896 {
2897         struct mbuf  *reply_mbuf;
2898         struct cpl_abort_req_rss *req = cplhdr(m);
2899
2900         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2901
2902         if (!reply_mbuf) {
2903                 /* Defer the reply.  Stick rst_status into req->cmd. */
2904                 req->status = rst_status;
2905                 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2906                 return;
2907         }
2908
2909         m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2910         set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2911         m_free(m);
2912
2913         /*
2914          * XXX need to sync with ARP as for SYN_RECV connections we can send
2915          * these messages while ARP is pending.  For other connection states
2916          * it's not a problem.
2917          */
2918         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2919 }
2920
2921 #ifdef notyet
/*
 * Placeholder: only reports CXGB_UNIMPLEMENTED().  The body kept under
 * "notyet" is the not-yet-ported Linux logic that detaches the child from
 * the parent's request queue.
 */
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct request_sock *child_req = child->sk_user_data;

	inet_csk_reqsk_queue_removed(parent, child_req);
	synq_remove(tcp_sk(child));
	__reqsk_free(child_req);
	child->sk_user_data = NULL;
#endif
}
2935
2936
2937 /*
2938  * Performs the actual work to abort a SYN_RECV connection.
2939  */
2940 static void
2941 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2942 {
2943         struct tcpcb *parenttp = so_sototcpcb(parent);
2944         struct tcpcb *childtp = so_sototcpcb(child);
2945
2946         /*
2947          * If the server is still open we clean up the child connection,
2948          * otherwise the server already did the clean up as it was purging
2949          * its SYN queue and the skb was just sitting in its backlog.
2950          */
2951         if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2952                 cleanup_syn_rcv_conn(child, parent);
2953                 inp_wlock(childtp->t_inpcb);
2954                 t3_release_offload_resources(childtp->t_toe);
2955                 inp_wunlock(childtp->t_inpcb);
2956                 tcp_offload_close(childtp);
2957         }
2958 }
2959 #endif
2960
/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
 *
 * Currently a placeholder: it reports CXGB_UNIMPLEMENTED() and returns 0.
 * The code under "notyet" is the not-yet-ported logic that looks up the
 * listening parent and aborts the child under the parent's lock.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();
#ifdef notyet
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct socket *parent;
	struct tid_info *t;

	/* somehow we are not on the SYN queue */
	if (oreq == NULL)
		return -1;

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	so_unlock(parent);
#endif
	return (0);
}
2991
2992 /*
2993  * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2994  * request except that we need to reply to it.
2995  */
2996 static void
2997 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2998 {
2999         int rst_status = CPL_ABORT_NO_RST;
3000         const struct cpl_abort_req_rss *req = cplhdr(m);
3001         struct tcpcb *tp = toep->tp_tp;
3002         struct socket *so;
3003         int needclose = 0;
3004
3005         inp_wlock(tp->t_inpcb);
3006         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3007         if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3008                 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3009                 m_free(m);
3010                 goto skip;
3011         }
3012
3013         toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3014         /*
3015          * Three cases to consider:
3016          * a) We haven't sent an abort_req; close the connection.
3017          * b) We have sent a post-close abort_req that will get to TP too late
3018          *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3019          *    be ignored and the connection should be closed now.
3020          * c) We have sent a regular abort_req that will get to TP too late.
3021          *    That will generate an abort_rpl with status 0, wait for it.
3022          */
3023         if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3024             (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3025                 int error;
3026
3027                 error = abort_status_to_errno(so, req->status,
3028                     &rst_status);
3029                 so_error_set(so, error);
3030
3031                 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3032                         so_sorwakeup(so);
3033                 /*
3034                  * SYN_RECV needs special processing.  If abort_syn_rcv()
3035                  * returns 0 is has taken care of the abort.
3036                  */
3037                 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3038                         goto skip;
3039
3040                 t3_release_offload_resources(toep);
3041                 needclose = 1;
3042         }
3043         inp_wunlock(tp->t_inpcb);
3044
3045         if (needclose)
3046                 tcp_offload_close(tp);
3047
3048         send_abort_rpl(m, tdev, rst_status);
3049         return;
3050 skip:
3051         inp_wunlock(tp->t_inpcb);
3052 }
3053
3054 /*
3055  * Handle an ABORT_REQ_RSS CPL message.
3056  */
3057 static int
3058 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3059 {
3060         const struct cpl_abort_req_rss *req = cplhdr(m);
3061         struct toepcb *toep = (struct toepcb *)ctx;
3062
3063         if (is_neg_adv_abort(req->status)) {
3064                 m_free(m);
3065                 return (0);
3066         }
3067
3068         log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3069
3070         if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3071                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3072                 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3073
3074                 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3075                 if (toep->tp_l2t)
3076                         l2t_release(L2DATA(cdev), toep->tp_l2t);
3077
3078                 /*
3079                  *  Unhook
3080                  */
3081                 toep->tp_tp->t_toe = NULL;
3082                 toep->tp_tp->t_flags &= ~TF_TOE;
3083                 toep->tp_tp = NULL;
3084                 /*
3085                  * XXX need to call syncache_chkrst - but we don't
3086                  * have a way of doing that yet
3087                  */
3088                 toepcb_release(toep);
3089                 log(LOG_ERR, "abort for unestablished connection :-(\n");
3090                 return (0);
3091         }
3092         if (toep->tp_tp == NULL) {
3093                 log(LOG_NOTICE, "disconnected toepcb\n");
3094                 /* should be freed momentarily */
3095                 return (0);
3096         }
3097
3098
3099         toepcb_hold(toep);
3100         process_abort_req(toep, m, toep->tp_toedev);
3101         toepcb_release(toep);
3102         return (0);
3103 }
3104 #ifdef notyet
3105 static void
3106 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3107 {
3108         struct toedev *tdev = TOE_DEV(parent);
3109
3110         do_abort_syn_rcv(child, parent);
3111         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3112                 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3113
3114                 rpl->opt0h = htonl(F_TCAM_BYPASS);
3115                 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3116                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3117         } else
3118                 m_free(m);
3119 }
3120 #endif
/*
 * Placeholder for ARP-failure handling of a passive open: it only reports
 * CXGB_UNIMPLEMENTED().  The code under "notyet" is the not-yet-ported
 * logic that finds the listening parent and aborts the child under the
 * parent's lock.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	CXGB_UNIMPLEMENTED();

#ifdef notyet
	struct tcpcb *otp, *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct t3c_tid_entry *t3c_stid;
	struct socket *parent;
	struct socket *oreq;
	struct t3cdev *cdev;
	struct tid_info *t;

	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = so_sototcpcb(oreq);

	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	so_lock(parent);
	pass_open_abort(so, parent, m);
	so_unlock(parent);
#endif
}
3158
/*
 * ARP-failure handler for a CPL_PASS_ACCEPT_RPL.  Treated similarly to an
 * ABORT_REQ_RSS in SYN_RECV, since both events have to tear down a SYN_RECV
 * connection.  The socket is taken from the mbuf and the work is delegated
 * to handle_pass_open_arp_failure().
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct socket *so;

#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
	so = m_get_socket(m);
	handle_pass_open_arp_failure(so, m);
}
3174
3175 /*
3176  * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3177  */
3178 static void
3179 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3180 {
3181         struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3182         struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3183         unsigned int tid = GET_TID(req);
3184
3185         m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3186         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3187         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3188         rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3189         rpl->opt0h = htonl(F_TCAM_BYPASS);
3190         rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3191         rpl->opt2 = 0;
3192         rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3193 }
3194
3195 /*
3196  * Send a deferred reject to an accept request.
3197  */
3198 static void
3199 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3200 {
3201         struct mbuf *reply_mbuf;
3202
3203         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3204         mk_pass_accept_rpl(reply_mbuf, m);
3205         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3206         m_free(m);
3207 }
3208
3209 static void
3210 handle_syncache_event(int event, void *arg)
3211 {
3212         struct toepcb *toep = arg;
3213
3214         switch (event) {
3215         case TOE_SC_ENTRY_PRESENT:
3216                 /*
3217                  * entry already exists - free toepcb
3218                  * and l2t
3219                  */
3220                 printf("syncache entry present\n");
3221                 toepcb_release(toep);
3222                 break;
3223         case TOE_SC_DROP:
3224                 /*
3225                  * The syncache has given up on this entry
3226                  * either it timed out, or it was evicted
3227                  * we need to explicitly release the tid
3228                  */
3229                 printf("syncache entry dropped\n");
3230                 toepcb_release(toep);
3231                 break;
3232         default:
3233                 log(LOG_ERR, "unknown syncache event %d\n", event);
3234                 break;
3235         }
3236 }
3237
3238 static void
3239 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3240 {
3241         struct in_conninfo inc;
3242         struct tcpopt to;
3243         struct tcphdr th;
3244         struct inpcb *inp;
3245         int mss, wsf, sack, ts;
3246         uint32_t rcv_isn = ntohl(req->rcv_isn);
3247
3248         bzero(&to, sizeof(struct tcpopt));
3249         inp = so_sotoinpcb(lso);
3250
3251         /*
3252          * Fill out information for entering us into the syncache
3253          */
3254         inc.inc_fport = th.th_sport = req->peer_port;
3255         inc.inc_lport = th.th_dport = req->local_port;
3256         th.th_seq = req->rcv_isn;
3257         th.th_flags = TH_SYN;
3258
3259         toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3260
3261
3262         inc.inc_isipv6 = 0;
3263         inc.inc_len = 0;
3264         inc.inc_faddr.s_addr = req->peer_ip;
3265         inc.inc_laddr.s_addr = req->local_ip;
3266
3267         DPRINTF("syncache add of %d:%d %d:%d\n",
3268             ntohl(req->local_ip), ntohs(req->local_port),
3269             ntohl(req->peer_ip), ntohs(req->peer_port));
3270
3271         mss = req->tcp_options.mss;
3272         wsf = req->tcp_options.wsf;
3273         ts = req->tcp_options.tstamp;
3274         sack = req->tcp_options.sack;
3275         to.to_mss = mss;
3276         to.to_wscale = wsf;
3277         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3278         tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3279 }
3280
3281
3282 /*
3283  * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3284  * lock held.  Note that the sock here is a listening socket that is not owned
3285  * by the TOE.
3286  */
3287 static void
3288 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3289     struct listen_ctx *lctx)
3290 {
3291         int rt_flags;
3292         struct l2t_entry *e;
3293         struct iff_mac tim;
3294         struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3295         struct cpl_pass_accept_rpl *rpl;
3296         struct cpl_pass_accept_req *req = cplhdr(m);
3297         unsigned int tid = GET_TID(req);
3298         struct tom_data *d = TOM_DATA(tdev);
3299         struct t3cdev *cdev = d->cdev;
3300         struct tcpcb *tp = so_sototcpcb(so);
3301         struct toepcb *newtoep;
3302         struct rtentry *dst;
3303         struct sockaddr_in nam;
3304         struct t3c_data *td = T3C_DATA(cdev);
3305
3306         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3307         if (__predict_false(reply_mbuf == NULL)) {
3308                 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3309                         t3_defer_reply(m, tdev, reject_pass_request);
3310                 else {
3311                         cxgb_queue_tid_release(cdev, tid);
3312                         m_free(m);
3313                 }
3314                 DPRINTF("failed to get reply_mbuf\n");
3315
3316                 goto out;
3317         }
3318
3319         if (tp->t_state != TCPS_LISTEN) {
3320                 DPRINTF("socket not in listen state\n");
3321
3322                 goto reject;
3323         }
3324
3325         tim.mac_addr = req->dst_mac;
3326         tim.vlan_tag = ntohs(req->vlan_tag);
3327         if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3328                 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3329                 goto reject;
3330         }
3331
3332 #ifdef notyet
3333         /*
3334          * XXX do route lookup to confirm that we're still listening on this
3335          * address
3336          */
3337         if (ip_route_input(skb, req->local_ip, req->peer_ip,
3338                            G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3339                 goto reject;
3340         rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3341                 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3342         dst_release(skb->dst);  // done with the input route, release it
3343         skb->dst = NULL;
3344
3345         if ((rt_flags & RTF_LOCAL) == 0)
3346                 goto reject;
3347 #endif
3348         /*
3349          * XXX
3350          */
3351         rt_flags = RTF_LOCAL;
3352         if ((rt_flags & RTF_LOCAL) == 0)
3353                 goto reject;
3354
3355         /*
3356          * Calculate values and add to syncache
3357          */
3358
3359         newtoep = toepcb_alloc();
3360         if (newtoep == NULL)
3361                 goto reject;
3362
3363         bzero(&nam, sizeof(struct sockaddr_in));
3364
3365         nam.sin_len = sizeof(struct sockaddr_in);
3366         nam.sin_family = AF_INET;
3367         nam.sin_addr.s_addr =req->peer_ip;
3368         dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3369
3370         if (dst == NULL) {
3371                 printf("failed to find route\n");
3372                 goto reject;
3373         }
3374         e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3375             (struct sockaddr *)&nam);
3376         if (e == NULL) {
3377                 DPRINTF("failed to get l2t\n");
3378         }
3379         /*
3380          * Point to our listen socket until accept
3381          */
3382         newtoep->tp_tp = tp;
3383         newtoep->tp_flags = TP_SYN_RCVD;
3384         newtoep->tp_tid = tid;
3385         newtoep->tp_toedev = tdev;
3386         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3387
3388         cxgb_insert_tid(cdev, d->client, newtoep, tid);
3389         so_lock(so);
3390         LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3391         so_unlock(so);
3392
3393         newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3394                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3395
3396         if (newtoep->tp_ulp_mode) {
3397                 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3398
3399                 if (ddp_mbuf == NULL)
3400                         newtoep->tp_ulp_mode = 0;
3401         }
3402
3403         CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3404             TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3405         set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3406         /*
3407          * XXX workaround for lack of syncache drop
3408          */
3409         toepcb_hold(newtoep);
3410         syncache_add_accept_req(req, so, newtoep);
3411
3412         rpl = cplhdr(reply_mbuf);
3413         reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3414         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3415         rpl->wr.wr_lo = 0;
3416         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3417         rpl->opt2 = htonl(calc_opt2(so, tdev));
3418         rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3419         rpl->peer_ip = req->peer_ip;    // req->peer_ip is not overwritten
3420
3421         rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3422             V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3423         rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3424                                   CPL_PASS_OPEN_ACCEPT);
3425
3426         DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3427
3428         m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3429
3430         l2t_send(cdev, reply_mbuf, e);
3431         m_free(m);
3432         if (newtoep->tp_ulp_mode) {
3433                 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3434                                 V_TF_DDP_OFF(1) |
3435                                 TP_DDP_TIMER_WORKAROUND_MASK,
3436                                 V_TF_DDP_OFF(1) |
3437                     TP_DDP_TIMER_WORKAROUND_VAL, 1);
3438         } else
3439                 printf("not offloading\n");
3440
3441
3442
3443         return;
3444 reject:
3445         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3446                 mk_pass_accept_rpl(reply_mbuf, m);
3447         else
3448                 mk_tid_release(reply_mbuf, newtoep, tid);
3449         cxgb_ofld_send(cdev, reply_mbuf);
3450         m_free(m);
3451 out:
3452 #if 0
3453         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3454 #else
3455         return;
3456 #endif
3457 }
3458
3459 /*
3460  * Handle a CPL_PASS_ACCEPT_REQ message.
3461  */
3462 static int
3463 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3464 {
3465         struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3466         struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3467         struct tom_data *d = listen_ctx->tom_data;
3468
3469 #if VALIDATE_TID
3470         struct cpl_pass_accept_req *req = cplhdr(m);
3471         unsigned int tid = GET_TID(req);
3472         struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3473
3474         if (unlikely(!lsk)) {
3475                 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3476                        cdev->name,
3477                        (unsigned long)((union listen_entry *)ctx -
3478                                         t->stid_tab));
3479                 return CPL_RET_BUF_DONE;
3480         }
3481         if (unlikely(tid >= t->ntids)) {
3482                 printk(KERN_ERR "%s: passive open TID %u too large\n",
3483                        cdev->name, tid);
3484                 return CPL_RET_BUF_DONE;
3485         }
3486         /*
3487          * For T3A the current user of the TID may have closed but its last
3488          * message(s) may have been backlogged so the TID appears to be still
3489          * in use.  Just take the TID away, the connection can close at its
3490          * own leisure.  For T3B this situation is a bug.
3491          */
3492         if (!valid_new_tid(t, tid) &&
3493             cdev->type != T3A) {
3494                 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3495                        cdev->name, tid);
3496                 return CPL_RET_BUF_DONE;
3497         }
3498 #endif
3499
3500         process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3501         return (0);
3502 }
3503
3504 /*
3505  * Called when a connection is established to translate the TCP options
3506  * reported by HW to FreeBSD's native format.
3507  */
3508 static void
3509 assign_rxopt(struct socket *so, unsigned int opt)
3510 {
3511         struct tcpcb *tp = so_sototcpcb(so);
3512         struct toepcb *toep = tp->t_toe;
3513         const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3514
3515         inp_lock_assert(tp->t_inpcb);
3516
3517         toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3518         tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3519         tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3520         tp->t_flags         |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3521         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3522             (TF_RCVD_SCALE|TF_REQ_SCALE))
3523                 tp->rcv_scale = tp->request_r_scale;
3524 }
3525
3526 /*
3527  * Completes some final bits of initialization for just established connections
3528  * and changes their state to TCP_ESTABLISHED.
3529  *
3530  * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3531  */
3532 static void
3533 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3534 {
3535         struct tcpcb *tp = so_sototcpcb(so);
3536         struct toepcb *toep = tp->t_toe;
3537
3538         toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3539         assign_rxopt(so, opt);
3540
3541         /*
3542          *XXXXXXXXXXX
3543          *
3544          */
3545 #ifdef notyet
3546         so->so_proto->pr_ctloutput = t3_ctloutput;
3547 #endif
3548
3549 #if 0
3550         inet_sk(sk)->id = tp->write_seq ^ jiffies;
3551 #endif
3552         /*
3553          * XXX not clear what rcv_wup maps to
3554          */
3555         /*
3556          * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3557          * pass through opt0.
3558          */
3559         if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3560                 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3561
3562         dump_toepcb(toep);
3563
3564 #ifdef notyet
3565 /*
3566  * no clean interface for marking ARP up to date
3567  */
3568         dst_confirm(sk->sk_dst_cache);
3569 #endif
3570         tp->t_starttime = ticks;
3571         tp->t_state = TCPS_ESTABLISHED;
3572         soisconnected(so);
3573 }
3574
3575 static int
3576 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3577 {
3578
3579         struct in_conninfo inc;
3580         struct tcpopt to;
3581         struct tcphdr th;
3582         int mss, wsf, sack, ts;
3583         struct mbuf *m = NULL;
3584         const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3585         unsigned int opt;
3586
3587 #ifdef MAC
3588 #error  "no MAC support"
3589 #endif
3590
3591         opt = ntohs(req->tcp_opt);
3592
3593         bzero(&to, sizeof(struct tcpopt));
3594
3595         /*
3596          * Fill out information for entering us into the syncache
3597          */
3598         inc.inc_fport = th.th_sport = req->peer_port;
3599         inc.inc_lport = th.th_dport = req->local_port;
3600         th.th_seq = req->rcv_isn;
3601         th.th_flags = TH_ACK;
3602
3603         inc.inc_isipv6 = 0;
3604         inc.inc_len = 0;
3605         inc.inc_faddr.s_addr = req->peer_ip;
3606         inc.inc_laddr.s_addr = req->local_ip;
3607
3608         mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3609         wsf  = G_TCPOPT_WSCALE_OK(opt);
3610         ts   = G_TCPOPT_TSTAMP(opt);
3611         sack = G_TCPOPT_SACK(opt);
3612
3613         to.to_mss = mss;
3614         to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3615         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3616
3617         DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3618             ntohl(req->local_ip), ntohs(req->local_port),
3619             ntohl(req->peer_ip), ntohs(req->peer_port),
3620             mss, wsf, ts, sack);
3621         return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3622 }
3623
3624
3625 /*
3626  * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3627  * if we are in TCP_SYN_RECV due to crossed SYNs
3628  */
3629 static int
3630 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3631 {
3632         struct cpl_pass_establish *req = cplhdr(m);
3633         struct toepcb *toep = (struct toepcb *)ctx;
3634         struct tcpcb *tp = toep->tp_tp;
3635         struct socket *so, *lso;
3636         struct t3c_data *td = T3C_DATA(cdev);
3637         struct sockbuf *snd, *rcv;
3638
3639         // Complete socket initialization now that we have the SND_ISN
3640
3641         struct toedev *tdev;
3642
3643
3644         tdev = toep->tp_toedev;
3645
3646         inp_wlock(tp->t_inpcb);
3647
3648         /*
3649          *
3650          * XXX need to add reference while we're manipulating
3651          */
3652         so = lso = inp_inpcbtosocket(tp->t_inpcb);
3653
3654         inp_wunlock(tp->t_inpcb);
3655
3656         so_lock(so);
3657         LIST_REMOVE(toep, synq_entry);
3658         so_unlock(so);
3659
3660         if (!syncache_expand_establish_req(req, &so, toep)) {
3661                 /*
3662                  * No entry
3663                  */
3664                 CXGB_UNIMPLEMENTED();
3665         }
3666         if (so == NULL) {
3667                 /*
3668                  * Couldn't create the socket
3669                  */
3670                 CXGB_UNIMPLEMENTED();
3671         }
3672
3673         tp = so_sototcpcb(so);
3674         inp_wlock(tp->t_inpcb);
3675
3676         snd = so_sockbuf_snd(so);
3677         rcv = so_sockbuf_rcv(so);
3678
3679         snd->sb_flags |= SB_NOCOALESCE;
3680         rcv->sb_flags |= SB_NOCOALESCE;
3681
3682         toep->tp_tp = tp;
3683         toep->tp_flags = 0;
3684         tp->t_toe = toep;
3685         reset_wr_list(toep);
3686         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3687         tp->rcv_nxt = toep->tp_copied_seq;
3688         install_offload_ops(so);
3689
3690         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3691         toep->tp_wr_unacked = 0;
3692         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3693         toep->tp_qset_idx = 0;
3694         toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3695
3696         /*
3697          * XXX Cancel any keep alive timer
3698          */
3699
3700         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3701
3702         /*
3703          * XXX workaround for lack of syncache drop
3704          */
3705         toepcb_release(toep);
3706         inp_wunlock(tp->t_inpcb);
3707
3708         CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3709         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3710 #ifdef notyet
3711         /*
3712          * XXX not sure how these checks map to us
3713          */
3714         if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3715                 sk->sk_state_change(sk);
3716                 sk_wake_async(so, 0, POLL_OUT);
3717         }
3718         /*
3719          * The state for the new connection is now up to date.
3720          * Next check if we should add the connection to the parent's
3721          * accept queue.  When the parent closes it resets connections
3722          * on its SYN queue, so check if we are being reset.  If so we
3723          * don't need to do anything more, the coming ABORT_RPL will
3724          * destroy this socket.  Otherwise move the connection to the
3725          * accept queue.
3726          *
3727          * Note that we reset the synq before closing the server so if
3728          * we are not being reset the stid is still open.
3729          */
3730         if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3731                 __kfree_skb(skb);
3732                 goto unlock;
3733         }
3734 #endif
3735         m_free(m);
3736
3737         return (0);
3738 }
3739
3740 /*
3741  * Fill in the right TID for CPL messages waiting in the out-of-order queue
3742  * and send them to the TOE.
3743  */
3744 static void
3745 fixup_and_send_ofo(struct toepcb *toep)
3746 {
3747         struct mbuf *m;
3748         struct toedev *tdev = toep->tp_toedev;
3749         struct tcpcb *tp = toep->tp_tp;
3750         unsigned int tid = toep->tp_tid;
3751
3752         log(LOG_NOTICE, "fixup_and_send_ofo\n");
3753
3754         inp_lock_assert(tp->t_inpcb);
3755         while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3756                 /*
3757                  * A variety of messages can be waiting but the fields we'll
3758                  * be touching are common to all so any message type will do.
3759                  */
3760                 struct cpl_close_con_req *p = cplhdr(m);
3761
3762                 p->wr.wr_lo = htonl(V_WR_TID(tid));
3763                 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3764                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3765         }
3766 }
3767
3768 /*
3769  * Updates socket state from an active establish CPL message.  Runs with the
3770  * socket lock held.
3771  */
3772 static void
3773 socket_act_establish(struct socket *so, struct mbuf *m)
3774 {
3775         struct cpl_act_establish *req = cplhdr(m);
3776         u32 rcv_isn = ntohl(req->rcv_isn);      /* real RCV_ISN + 1 */
3777         struct tcpcb *tp = so_sototcpcb(so);
3778         struct toepcb *toep = tp->t_toe;
3779
3780         if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3781                 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3782                     toep->tp_tid, tp->t_state);
3783
3784         tp->ts_recent_age = ticks;
3785         tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3786         toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3787
3788         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3789
3790         /*
3791          * Now that we finally have a TID send any CPL messages that we had to
3792          * defer for lack of a TID.
3793          */
3794         if (mbufq_len(&toep->out_of_order_queue))
3795                 fixup_and_send_ofo(toep);
3796
3797         if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3798                 /*
3799                  * XXX does this even make sense?
3800                  */
3801                 so_sorwakeup(so);
3802         }
3803         m_free(m);
3804 #ifdef notyet
3805 /*
3806  * XXX assume no write requests permitted while socket connection is
3807  * incomplete
3808  */
3809         /*
3810          * Currently the send queue must be empty at this point because the
3811          * socket layer does not send anything before a connection is
3812          * established.  To be future proof though we handle the possibility
3813          * that there are pending buffers to send (either TX_DATA or
3814          * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3815          * buffers according to the just learned write_seq, and then we send
3816          * them on their way.
3817          */
3818         fixup_pending_writeq_buffers(sk);
3819         if (t3_push_frames(so, 1))
3820                 sk->sk_write_space(sk);
3821 #endif
3822
3823         toep->tp_state = tp->t_state;
3824         V_tcpstat.tcps_connects++;
3825
3826 }
3827
3828 /*
3829  * Process a CPL_ACT_ESTABLISH message.
3830  */
3831 static int
3832 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3833 {
3834         struct cpl_act_establish *req = cplhdr(m);
3835         unsigned int tid = GET_TID(req);
3836         unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3837         struct toepcb *toep = (struct toepcb *)ctx;
3838         struct tcpcb *tp = toep->tp_tp;
3839         struct socket *so;
3840         struct toedev *tdev;
3841         struct tom_data *d;
3842
3843         if (tp == NULL) {
3844                 free_atid(cdev, atid);
3845                 return (0);
3846         }
3847         inp_wlock(tp->t_inpcb);
3848
3849         /*
3850          * XXX
3851          */
3852         so = inp_inpcbtosocket(tp->t_inpcb);
3853         tdev = toep->tp_toedev; /* blow up here if link was down */
3854         d = TOM_DATA(tdev);
3855
3856         /*
3857          * It's OK if the TID is currently in use, the owning socket may have
3858          * backlogged its last CPL message(s).  Just take it away.
3859          */
3860         toep->tp_tid = tid;
3861         toep->tp_tp = tp;
3862         so_insert_tid(d, toep, tid);
3863         free_atid(cdev, atid);
3864         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3865
3866         socket_act_establish(so, m);
3867         inp_wunlock(tp->t_inpcb);
3868         CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3869         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3870
3871         return (0);
3872 }
3873
3874 /*
3875  * Process an acknowledgment of WR completion.  Advance snd_una and send the
3876  * next batch of work requests from the write queue.
3877  */
3878 static void
3879 wr_ack(struct toepcb *toep, struct mbuf *m)
3880 {
3881         struct tcpcb *tp = toep->tp_tp;
3882         struct cpl_wr_ack *hdr = cplhdr(m);
3883         struct socket *so;
3884         unsigned int credits = ntohs(hdr->credits);
3885         u32 snd_una = ntohl(hdr->snd_una);
3886         int bytes = 0;
3887         struct sockbuf *snd;
3888
3889         CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3890
3891         inp_wlock(tp->t_inpcb);
3892         so = inp_inpcbtosocket(tp->t_inpcb);
3893         toep->tp_wr_avail += credits;
3894         if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3895                 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3896
3897         while (credits) {
3898                 struct mbuf *p = peek_wr(toep);
3899
3900                 if (__predict_false(!p)) {
3901                         log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3902                             "nothing pending, state %u wr_avail=%u\n",
3903                             credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3904                         break;
3905                 }
3906                 CTR2(KTR_TOM,
3907                         "wr_ack: p->credits=%d p->bytes=%d",
3908                     p->m_pkthdr.csum_data, p->m_pkthdr.len);
3909                 KASSERT(p->m_pkthdr.csum_data != 0,
3910                     ("empty request still on list"));
3911
3912                 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3913
3914 #if DEBUG_WR > 1
3915                         struct tx_data_wr *w = cplhdr(p);
3916                         log(LOG_ERR,
3917                                "TID %u got %u WR credits, need %u, len %u, "
3918                                "main body %u, frags %u, seq # %u, ACK una %u,"
3919                                " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3920                                toep->tp_tid, credits, p->csum, p->len,
3921                                p->len - p->data_len, skb_shinfo(p)->nr_frags,
3922                                ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3923                             toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3924 #endif
3925                         p->m_pkthdr.csum_data -= credits;
3926                         break;
3927                 } else {
3928                         dequeue_wr(toep);
3929                         credits -= p->m_pkthdr.csum_data;
3930                         bytes += p->m_pkthdr.len;
3931                         CTR3(KTR_TOM,
3932                             "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3933                             p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3934
3935                         m_free(p);
3936                 }
3937         }
3938
3939 #if DEBUG_WR
3940         check_wr_invariants(tp);
3941 #endif
3942
3943         if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3944 #if VALIDATE_SEQ
3945                 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3946
3947                 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3948                     "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3949                     toep->tp_tid, tp->snd_una);
3950 #endif
3951                 goto out_free;
3952         }
3953
3954         if (tp->snd_una != snd_una) {
3955                 tp->snd_una = snd_una;
3956                 tp->ts_recent_age = ticks;
3957 #ifdef notyet
3958                 /*
3959                  * Keep ARP entry "minty fresh"
3960                  */
3961                 dst_confirm(sk->sk_dst_cache);
3962 #endif
3963                 if (tp->snd_una == tp->snd_nxt)
3964                         toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3965         }
3966
3967         snd = so_sockbuf_snd(so);
3968         if (bytes) {
3969                 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3970                 snd = so_sockbuf_snd(so);
3971                 sockbuf_lock(snd);
3972                 sbdrop_locked(snd, bytes);
3973                 so_sowwakeup_locked(so);
3974         }
3975
3976         if (snd->sb_sndptroff < snd->sb_cc)
3977                 t3_push_frames(so, 0);
3978
3979 out_free:
3980         inp_wunlock(tp->t_inpcb);
3981         m_free(m);
3982 }
3983
3984 /*
3985  * Handler for TX_DATA_ACK CPL messages.
3986  */
3987 static int
3988 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3989 {
3990         struct toepcb *toep = (struct toepcb *)ctx;
3991
3992         VALIDATE_SOCK(so);
3993
3994         wr_ack(toep, m);
3995         return 0;
3996 }
3997
/*
 * CPL handler for TRACE_PKT messages.  Nothing is done with these packets:
 * the mbuf chain is released and 0 is returned.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{

	m_freem(m);
	return (0);
}
4007
4008 /*
4009  * Reset a connection that is on a listener's SYN queue or accept queue,
4010  * i.e., one that has not had a struct socket associated with it.
4011  * Must be called from process context.
4012  *
4013  * Modeled after code in inet_csk_listen_stop().
4014  */
4015 static void
4016 t3_reset_listen_child(struct socket *child)
4017 {
4018         struct tcpcb *tp = so_sototcpcb(child);
4019
4020         t3_send_reset(tp->t_toe);
4021 }
4022
4023
4024 static void
4025 t3_child_disconnect(struct socket *so, void *arg)
4026 {
4027         struct tcpcb *tp = so_sototcpcb(so);
4028
4029         if (tp->t_flags & TF_TOE) {
4030                 inp_wlock(tp->t_inpcb);
4031                 t3_reset_listen_child(so);
4032                 inp_wunlock(tp->t_inpcb);
4033         }
4034 }
4035
4036 /*
4037  * Disconnect offloaded established but not yet accepted connections sitting
4038  * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4039  * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4040  */
4041 void
4042 t3_disconnect_acceptq(struct socket *listen_so)
4043 {
4044
4045         so_lock(listen_so);
4046         so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4047         so_unlock(listen_so);
4048 }
4049
4050 /*
4051  * Reset offloaded connections sitting on a server's syn queue.  As above
4052  * we send ABORT_REQ and finish off when we get ABORT_RPL.
4053  */
4054
4055 void
4056 t3_reset_synq(struct listen_ctx *lctx)
4057 {
4058         struct toepcb *toep;
4059
4060         so_lock(lctx->lso);
4061         while (!LIST_EMPTY(&lctx->synq_head)) {
4062                 toep = LIST_FIRST(&lctx->synq_head);
4063                 LIST_REMOVE(toep, synq_entry);
4064                 toep->tp_tp = NULL;
4065                 t3_send_reset(toep);
4066                 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4067                 toepcb_release(toep);
4068         }
4069         so_unlock(lctx->lso);
4070 }
4071
4072
4073 int
4074 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4075                    unsigned int nppods, unsigned int tag, unsigned int maxoff,
4076                    unsigned int pg_off, unsigned int color)
4077 {
4078         unsigned int i, j, pidx;
4079         struct pagepod *p;
4080         struct mbuf *m;
4081         struct ulp_mem_io *req;
4082         unsigned int tid = toep->tp_tid;
4083         const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4084         unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4085
4086         CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4087             gl, nppods, tag, maxoff, pg_off, color);
4088
4089         for (i = 0; i < nppods; ++i) {
4090                 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4091                 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4092                 req = mtod(m, struct ulp_mem_io *);
4093                 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4094                 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4095                 req->wr.wr_lo = 0;
4096                 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4097                                            V_ULPTX_CMD(ULP_MEM_WRITE));
4098                 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4099                                  V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4100
4101                 p = (struct pagepod *)(req + 1);
4102                 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4103                         p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4104                         p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4105                                                   V_PPOD_COLOR(color));
4106                         p->pp_max_offset = htonl(maxoff);
4107                         p->pp_page_offset = htonl(pg_off);
4108                         p->pp_rsvd = 0;
4109                         for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4110                                 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4111                                     htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4112                 } else
4113                         p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4114                 send_or_defer(toep, m, 0);
4115                 ppod_addr += PPOD_SIZE;
4116         }
4117         return (0);
4118 }
4119
4120 /*
4121  * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4122  */
4123 static inline void
4124 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4125 {
4126         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4127
4128         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4129         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4130         b->opcode = CPL_BARRIER;
4131 }
4132
4133 /*
4134  * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4135  */
4136 static inline void
4137 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4138 {
4139         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4140
4141         txpkt = (struct ulp_txpkt *)req;
4142         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4143         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4144         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4145         req->cpuno = htons(cpuno);
4146 }
4147
4148 /*
4149  * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4150  */
4151 static inline void
4152 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4153                      unsigned int word, uint64_t mask, uint64_t val)
4154 {
4155         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4156
4157         CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4158             tid, word, mask, val);
4159
4160         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4161         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4162         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4163         req->reply = V_NO_REPLY(1);
4164         req->cpu_idx = 0;
4165         req->word = htons(word);
4166         req->mask = htobe64(mask);
4167         req->val = htobe64(val);
4168 }
4169
4170 /*
4171  * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4172  */
4173 static void
4174 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4175     unsigned int tid, unsigned int credits)
4176 {
4177         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4178
4179         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4180         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4181         OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4182         ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4183             V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4184                                  V_RX_CREDITS(credits));
4185 }
4186
4187 void
4188 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4189 {
4190         unsigned int wrlen;
4191         struct mbuf *m;
4192         struct work_request_hdr *wr;
4193         struct cpl_barrier *lock;
4194         struct cpl_set_tcb_field *req;
4195         struct cpl_get_tcb *getreq;
4196         struct ddp_state *p = &toep->tp_ddp_state;
4197
4198 #if 0
4199         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4200 #endif
4201         wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4202                 sizeof(*getreq);
4203         m = m_gethdr_nofail(wrlen);
4204         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4205         wr = mtod(m, struct work_request_hdr *);
4206         bzero(wr, wrlen);
4207
4208         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4209         m->m_pkthdr.len = m->m_len = wrlen;
4210
4211         lock = (struct cpl_barrier *)(wr + 1);
4212         mk_cpl_barrier_ulp(lock);
4213
4214         req = (struct cpl_set_tcb_field *)(lock + 1);
4215
4216         CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4217
4218         /* Hmmm, not sure if this actually a good thing: reactivating
4219          * the other buffer might be an issue if it has been completed
4220          * already. However, that is unlikely, since the fact that the UBUF
4221          * is not completed indicates that there is no oustanding data.
4222          */
4223         if (bufidx == 0)
4224                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4225                                      V_TF_DDP_ACTIVE_BUF(1) |
4226                                      V_TF_DDP_BUF0_VALID(1),
4227                                      V_TF_DDP_ACTIVE_BUF(1));
4228         else
4229                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4230                                      V_TF_DDP_ACTIVE_BUF(1) |
4231                                      V_TF_DDP_BUF1_VALID(1), 0);
4232
4233         getreq = (struct cpl_get_tcb *)(req + 1);
4234         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4235
4236         mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4237
4238         /* Keep track of the number of oustanding CPL_GET_TCB requests
4239          */
4240         p->get_tcb_count++;
4241
4242 #ifdef T3_TRACE
4243         T3_TRACE1(TIDTB(so),
4244                   "t3_cancel_ddpbuf: bufidx %u", bufidx);
4245 #endif
4246         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4247 }
4248
4249 /**
4250  * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4251  * @sk: the socket associated with the buffers
4252  * @bufidx: index of HW DDP buffer (0 or 1)
4253  * @tag0: new tag for HW buffer 0
4254  * @tag1: new tag for HW buffer 1
4255  * @len: new length for HW buf @bufidx
4256  *
4257  * Sends a compound WR to overlay a new DDP buffer on top of an existing
4258  * buffer by changing the buffer tag and length and setting the valid and
4259  * active flag accordingly.  The caller must ensure the new buffer is at
4260  * least as big as the existing one.  Since we typically reprogram both HW
4261  * buffers this function sets both tags for convenience. Read the TCB to
4262  * determine how made data was written into the buffer before the overlay
4263  * took place.
4264  */
4265 void
4266 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4267                        unsigned int tag1, unsigned int len)
4268 {
4269         unsigned int wrlen;
4270         struct mbuf *m;
4271         struct work_request_hdr *wr;
4272         struct cpl_get_tcb *getreq;
4273         struct cpl_set_tcb_field *req;
4274         struct ddp_state *p = &toep->tp_ddp_state;
4275
4276         CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4277             bufidx, tag0, tag1, len);
4278 #if 0
4279         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4280 #endif
4281         wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4282         m = m_gethdr_nofail(wrlen);
4283         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4284         wr = mtod(m, struct work_request_hdr *);
4285         m->m_pkthdr.len = m->m_len = wrlen;
4286         bzero(wr, wrlen);
4287
4288
4289         /* Set the ATOMIC flag to make sure that TP processes the following
4290          * CPLs in an atomic manner and no wire segments can be interleaved.
4291          */
4292         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4293         req = (struct cpl_set_tcb_field *)(wr + 1);
4294         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4295                              V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4296                              V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4297                              V_TCB_RX_DDP_BUF0_TAG(tag0) |
4298                              V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4299         req++;
4300         if (bufidx == 0) {
4301                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4302                             V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4303                             V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4304                 req++;
4305                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4306                             V_TF_DDP_PUSH_DISABLE_0(1) |
4307                             V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4308                             V_TF_DDP_PUSH_DISABLE_0(0) |
4309                             V_TF_DDP_BUF0_VALID(1));
4310         } else {
4311                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4312                             V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4313                             V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4314                 req++;
4315                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316                             V_TF_DDP_PUSH_DISABLE_1(1) |
4317                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318                             V_TF_DDP_PUSH_DISABLE_1(0) |
4319                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4320         }
4321
4322         getreq = (struct cpl_get_tcb *)(req + 1);
4323         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4324
4325         /* Keep track of the number of outstanding CPL_GET_TCB requests
4326          */
4327         p->get_tcb_count++;
4328
4329 #ifdef T3_TRACE
4330         T3_TRACE4(TIDTB(sk),
4331                   "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4332                   "len %d",
4333                   bufidx, tag0, tag1, len);
4334 #endif
4335         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4336 }
4337
4338 /*
4339  * Sends a compound WR containing all the CPL messages needed to program the
4340  * two HW DDP buffers, namely optionally setting up the length and offset of
4341  * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4342  */
4343 void
4344 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4345                       unsigned int len1, unsigned int offset1,
4346                       uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4347 {
4348         unsigned int wrlen;
4349         struct mbuf *m;
4350         struct work_request_hdr *wr;
4351         struct cpl_set_tcb_field *req;
4352
4353         CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4354             len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4355
4356 #if 0
4357         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4358 #endif
4359         wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4360                 (len1 ? sizeof(*req) : 0) +
4361                 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4362         m = m_gethdr_nofail(wrlen);
4363         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4364         wr = mtod(m, struct work_request_hdr *);
4365         bzero(wr, wrlen);
4366
4367         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4368         m->m_pkthdr.len = m->m_len = wrlen;
4369
4370         req = (struct cpl_set_tcb_field *)(wr + 1);
4371         if (len0) {                  /* program buffer 0 offset and length */
4372                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4373                         V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4374                         V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4375                         V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4376                         V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4377                 req++;
4378         }
4379         if (len1) {                  /* program buffer 1 offset and length */
4380                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4381                         V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4382                         V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4383                         V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4384                         V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4385                 req++;
4386         }
4387
4388         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4389                              ddp_flags);
4390
4391         if (modulate) {
4392                 mk_rx_data_ack_ulp(toep,
4393                     (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4394                     toep->tp_copied_seq - toep->tp_rcv_wup);
4395                 toep->tp_rcv_wup = toep->tp_copied_seq;
4396         }
4397
4398 #ifdef T3_TRACE
4399         T3_TRACE5(TIDTB(sk),
4400                   "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4401                   "modulate %d",
4402                   len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4403                   modulate);
4404 #endif
4405
4406         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4407 }
4408
4409 void
4410 t3_init_wr_tab(unsigned int wr_len)
4411 {
4412         int i;
4413
4414         if (mbuf_wrs[1])     /* already initialized */
4415                 return;
4416
4417         for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4418                 int sgl_len = (3 * i) / 2 + (i & 1);
4419
4420                 sgl_len += 3;
4421                 mbuf_wrs[i] = sgl_len <= wr_len ?
4422                         1 : 1 + (sgl_len - 2) / (wr_len - 1);
4423         }
4424
4425         wrlen = wr_len * 8;
4426 }
4427
/*
 * Register the CPL message handlers used by this file.
 *
 * Each call below passes t3tom_register_cpl_handler() one CPL opcode and the
 * do_*() routine to associate with it.
 *
 * Returns 0.  The only other return value (-1) is inside the "notyet" block
 * and is therefore unreachable unless that symbol is defined.
 */
4428 int
4429 t3_init_cpl_io(void)
4430 {
             /*
              * Compiled only when "notyet" is defined.  This block allocates
              * and zeroes tcphdr_skb through sk_buff routines (alloc_skb,
              * skb_put).
              */
4431 #ifdef notyet
4432         tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4433         if (!tcphdr_skb) {
4434                 log(LOG_ERR,
4435                        "Chelsio TCP offload: can't allocate sk_buff\n");
4436                 return -1;
4437         }
4438         skb_put(tcphdr_skb, sizeof(struct tcphdr));
4439         tcphdr_skb->h.raw = tcphdr_skb->data;
4440         memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4441 #endif
4442
             /* One registration per CPL opcode handled here. */
4443         t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4444         t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4445         t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4446         t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4447         t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4448         t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4449         t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4450         t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4451         t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4452         t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4453         t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4454         t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4455         t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4456         t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4457         t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4458         return (0);
4459 }
4460