sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c

   1 /**************************************************************************
   2
   3 Copyright (c) 2007-2008, Chelsio Inc.
   4 All rights reserved.
   5
   6 Redistribution and use in source and binary forms, with or without
   7 modification, are permitted provided that the following conditions are met:
   8
   9  1. Redistributions of source code must retain the above copyright notice,
  10     this list of conditions and the following disclaimer.
  11
  12  2. Neither the name of the Chelsio Corporation nor the names of its
  13     contributors may be used to endorse or promote products derived from
  14     this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 ***************************************************************************/
  29
  30 #include <sys/cdefs.h>
  31 __FBSDID("$FreeBSD$");
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/fcntl.h>
  36 #include <sys/kernel.h>
  37 #include <sys/limits.h>
  38 #include <sys/ktr.h>
  39 #include <sys/lock.h>
  40 #include <sys/mbuf.h>
  41 #include <sys/mutex.h>
  42 #include <sys/sockbuf.h>
  43 #include <sys/sockopt.h>
  44 #include <sys/sockstate.h>
  45 #include <sys/socket.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/syslog.h>
  48 #include <sys/protosw.h>
  49 #include <sys/priv.h>
  50
  51 #include <net/if.h>
  52 #include <net/route.h>
  53
  54 #include <netinet/in.h>
  55 #include <netinet/in_pcb.h>
  56 #include <netinet/in_systm.h>
  57 #include <netinet/in_var.h>
  58
  59
  60 #include <dev/cxgb/cxgb_osdep.h>
  61 #include <dev/cxgb/sys/mbufq.h>
  62
  63 #include <netinet/ip.h>
  64 #include <netinet/tcp_var.h>
  65 #include <netinet/tcp_fsm.h>
  66 #include <netinet/tcp_offload.h>
  67 #include <netinet/tcp_seq.h>
  68 #include <netinet/tcp_syncache.h>
  69 #include <netinet/tcp_timer.h>
  70 #include <net/route.h>
  71
  72 #include <dev/cxgb/t3cdev.h>
  73 #include <dev/cxgb/common/cxgb_firmware_exports.h>
  74 #include <dev/cxgb/common/cxgb_t3_cpl.h>
  75 #include <dev/cxgb/common/cxgb_tcb.h>
  76 #include <dev/cxgb/common/cxgb_ctl_defs.h>
  77 #include <dev/cxgb/cxgb_offload.h>
  78 #include <vm/vm.h>
  79 #include <vm/pmap.h>
  80 #include <machine/bus.h>
  81 #include <dev/cxgb/sys/mvec.h>
  82 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
  83 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
  84 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
  85 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
  86 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
  87 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
  88
  89 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
  90
  91 /*
  92  * For ULP connections HW may add headers, e.g., for digests, that aren't part
  93  * of the messages sent by the host but that are part of the TCP payload and
  94  * therefore consume TCP sequence space.  Tx connection parameters that
  95  * operate in TCP sequence space are affected by the HW additions and need to
  96  * compensate for them to accurately track TCP sequence numbers. This array
  97  * contains the compensating extra lengths for ULP packets.  It is indexed by
  98  * a packet's ULP submode.
  99  */
 100 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
 101
 102 #ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
 109 static struct mbuf *tcphdr_mbuf __read_mostly;
 110 #endif
 111
 112 /*
 113  * Size of WRs in bytes.  Note that we assume all devices we are handling have
 114  * the same WR size.
 115  */
 116 static unsigned int wrlen __read_mostly;
 117
/*
 * The number of WRs needed to send an mbuf chain depends on how many
 * gather-list segments it occupies.  This table maps the length of the
 * gather list (number of segments) to the # of necessary WRs.
 */
 123 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
 124
 125 /*
 126  * Max receive window supported by HW in bytes.  Only a small part of it can
 127  * be set through option0, the rest needs to be set through RX_DATA_ACK.
 128  */
 129 #define MAX_RCV_WND ((1U << 27) - 1)
 130
 131 /*
 132  * Min receive window.  We want it to be large enough to accommodate receive
 133  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 134  */
 135 #define MIN_RCV_WND (24 * 1024U)
 136 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 137
 138 #define VALIDATE_SEQ 0
 139 #define VALIDATE_SOCK(so)
 140 #define DEBUG_WR 0
 141
 142 #define TCP_TIMEWAIT    1
 143 #define TCP_CLOSE       2
 144 #define TCP_DROP        3
 145
 146 extern int tcp_do_autorcvbuf;
 147 extern int tcp_do_autosndbuf;
 148 extern int tcp_autorcvbuf_max;
 149 extern int tcp_autosndbuf_max;
 150
 151 static void t3_send_reset(struct toepcb *toep);
 152 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
 153 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
 154 static void handle_syncache_event(int event, void *arg);
 155
 156 static inline void
 157 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
 158 {
 159         struct mbuf *m;
 160
 161         m = sb->sb_mb;
 162         while (m) {
 163                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 164                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 165                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 166                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 167                         m->m_next, m->m_nextpkt, m->m_flags));
 168                 m = m->m_next;
 169         }
 170         m = n;
 171         while (m) {
 172                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 173                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 174                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 175                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 176                         m->m_next, m->m_nextpkt, m->m_flags));
 177                 m = m->m_next;
 178         }
 179         KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
 180         sbappendstream_locked(sb, n);
 181         m = sb->sb_mb;
 182
 183         while (m) {
 184                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 185                         m->m_next, m->m_nextpkt, m->m_flags));
 186                 m = m->m_next;
 187         }
 188 }
 189
 190 static inline int
 191 is_t3a(const struct toedev *dev)
 192 {
 193         return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
 194 }
 195
 196 static void
 197 dump_toepcb(struct toepcb *toep)
 198 {
 199         DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
 200             toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
 201             toep->tp_mtu_idx, toep->tp_tid);
 202
 203         DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
 204             toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
 205             toep->tp_mss_clamp, toep->tp_flags);
 206 }
 207
 208 #ifndef RTALLOC2_DEFINED
 209 static struct rtentry *
 210 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
 211 {
 212         struct rtentry *rt = NULL;
 213
 214         if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
 215                 RT_UNLOCK(rt);
 216
 217         return (rt);
 218 }
 219 #endif
 220
 221 /*
 222  * Determine whether to send a CPL message now or defer it.  A message is
 223  * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 224  * For connections in other states the message is sent immediately.
 225  * If through_l2t is set the message is subject to ARP processing, otherwise
 226  * it is sent directly.
 227  */
 228 static inline void
 229 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
 230 {
 231         struct tcpcb *tp = toep->tp_tp;
 232
 233         if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
 234                 inp_wlock(tp->t_inpcb);
 235                 mbufq_tail(&toep->out_of_order_queue, m);  // defer
 236                 inp_wunlock(tp->t_inpcb);
 237         } else if (through_l2t)
 238                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
 239         else
 240                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
 241 }
 242
 243 static inline unsigned int
 244 mkprio(unsigned int cntrl, const struct toepcb *toep)
 245 {
 246         return (cntrl);
 247 }
 248
 249 /*
 250  * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 251  */
 252 static inline void
 253 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
 254 {
 255         struct cpl_tid_release *req;
 256
 257         m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 258         m->m_pkthdr.len = m->m_len = sizeof(*req);
 259         req = mtod(m, struct cpl_tid_release *);
 260         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 261         req->wr.wr_lo = 0;
 262         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 263 }
 264
 265 static inline void
 266 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
 267 {
 268         struct tcpcb *tp = so_sototcpcb(so);
 269         struct toepcb *toep = tp->t_toe;
 270         struct tx_data_wr *req;
 271         struct sockbuf *snd;
 272
 273         snd = so_sockbuf_snd(so);
 274         inp_wlock_assert(tp->t_inpcb);
 275
 276         req = mtod(m, struct tx_data_wr *);
 277         m->m_len = sizeof(*req);
 278         req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 279         req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
 280         /* len includes the length of any HW ULP additions */
 281         req->len = htonl(len);
 282         req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 283         /* V_TX_ULP_SUBMODE sets both the mode and submode */
 284         req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
 285                            V_TX_URG(/* skb_urgent(skb) */ 0 ) |
 286                            V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
 287                                    (tail ? 0 : 1))));
 288         req->sndseq = htonl(tp->snd_nxt);
 289         if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 290                 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 291                                     V_TX_CPU_IDX(toep->tp_qset));
 292
 293                 /* Sendbuffer is in units of 32KB.
 294                  */
 295                 if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
 296                         req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
 297                 else {
 298                         req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 299                 }
 300
 301                 toep->tp_flags |= TP_DATASENT;
 302         }
 303 }
 304
 305 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
 306
 307 int
 308 t3_push_frames(struct socket *so, int req_completion)
 309 {
 310         struct tcpcb *tp = so_sototcpcb(so);
 311         struct toepcb *toep = tp->t_toe;
 312
 313         struct mbuf *tail, *m0, *last;
 314         struct t3cdev *cdev;
 315         struct tom_data *d;
 316         int state, bytes, count, total_bytes;
 317         bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
 318         struct sockbuf *snd;
 319
 320         if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
 321                 DPRINTF("tcp state=%d\n", tp->t_state);
 322                 return (0);
 323         }
 324
 325         state = so_state_get(so);
 326
 327         if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
 328                 DPRINTF("disconnecting\n");
 329
 330                 return (0);
 331         }
 332
 333         inp_lock_assert(tp->t_inpcb);
 334
 335         snd = so_sockbuf_snd(so);
 336         sockbuf_lock(snd);
 337
 338         d = TOM_DATA(toep->tp_toedev);
 339         cdev = d->cdev;
 340
 341         last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 342
 343         total_bytes = 0;
 344         DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
 345             toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
 346
 347         if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
 348                 KASSERT(tail, ("sbdrop error"));
 349                 last = tail = tail->m_next;
 350         }
 351
 352         if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
 353                 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
 354                 sockbuf_unlock(snd);
 355
 356                 return (0);
 357         }
 358
 359         toep->tp_m_last = NULL;
 360         while (toep->tp_wr_avail && (tail != NULL)) {
 361                 count = bytes = 0;
 362                 segp = segs;
 363                 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
 364                         sockbuf_unlock(snd);
 365                         return (0);
 366                 }
 367                 /*
 368                  * If the data in tail fits as in-line, then
 369                  * make an immediate data wr.
 370                  */
 371                 if (tail->m_len <= IMM_LEN) {
 372                         count = 1;
 373                         bytes = tail->m_len;
 374                         last = tail;
 375                         tail = tail->m_next;
 376                         m_set_sgl(m0, NULL);
 377                         m_set_sgllen(m0, 0);
 378                         make_tx_data_wr(so, m0, bytes, tail);
 379                         m_append(m0, bytes, mtod(last, caddr_t));
 380                         KASSERT(!m0->m_next, ("bad append"));
 381                 } else {
 382                         while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
 383                             && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
 384                                 bytes += tail->m_len;
 385                                 last = tail;
 386                                 count++;
 387                                 /*
 388                                  * technically an abuse to be using this for a VA
 389                                  * but less gross than defining my own structure
 390                                  * or calling pmap_kextract from here :-|
 391                                  */
 392                                 segp->ds_addr = (bus_addr_t)tail->m_data;
 393                                 segp->ds_len = tail->m_len;
 394                                 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
 395                                     count, mbuf_wrs[count], tail->m_data, tail->m_len);
 396                                 segp++;
 397                                 tail = tail->m_next;
 398                         }
 399                         DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
 400                             toep->tp_wr_avail, count, mbuf_wrs[count], tail);
 401
 402                         m_set_sgl(m0, segs);
 403                         m_set_sgllen(m0, count);
 404                         make_tx_data_wr(so, m0, bytes, tail);
 405                 }
 406                 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 407
 408                 if (tail) {
 409                         snd->sb_sndptr = tail;
 410                         toep->tp_m_last = NULL;
 411                 } else
 412                         toep->tp_m_last = snd->sb_sndptr = last;
 413
 414
 415                 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
 416
 417                 snd->sb_sndptroff += bytes;
 418                 total_bytes += bytes;
 419                 toep->tp_write_seq += bytes;
 420                 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
 421                     " tail=%p sndptr=%p sndptroff=%d",
 422                     toep->tp_wr_avail, count, mbuf_wrs[count],
 423                     tail, snd->sb_sndptr, snd->sb_sndptroff);
 424                 if (tail)
 425                         CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
 426                             " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
 427                             total_bytes, toep->tp_m_last, tail->m_data,
 428                             tp->snd_una);
 429                 else
 430                         CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
 431                             " tp_m_last=%p snd_una=0x%08x",
 432                             total_bytes, toep->tp_m_last, tp->snd_una);
 433
 434
 435 #ifdef KTR
 436 {
 437                 int i;
 438
 439                 i = 0;
 440                 while (i < count && m_get_sgllen(m0)) {
 441                         if ((count - i) >= 3) {
 442                                 CTR6(KTR_TOM,
 443                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 444                                     " len=%d pa=0x%zx len=%d",
 445                                     segs[i].ds_addr, segs[i].ds_len,
 446                                     segs[i + 1].ds_addr, segs[i + 1].ds_len,
 447                                     segs[i + 2].ds_addr, segs[i + 2].ds_len);
 448                                     i += 3;
 449                         } else if ((count - i) == 2) {
 450                                 CTR4(KTR_TOM,
 451                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 452                                     " len=%d",
 453                                     segs[i].ds_addr, segs[i].ds_len,
 454                                     segs[i + 1].ds_addr, segs[i + 1].ds_len);
 455                                     i += 2;
 456                         } else {
 457                                 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
 458                                     segs[i].ds_addr, segs[i].ds_len);
 459                                 i++;
 460                         }
 461
 462                 }
 463 }
 464 #endif
 465                  /*
 466                  * remember credits used
 467                  */
 468                 m0->m_pkthdr.csum_data = mbuf_wrs[count];
 469                 m0->m_pkthdr.len = bytes;
 470                 toep->tp_wr_avail -= mbuf_wrs[count];
 471                 toep->tp_wr_unacked += mbuf_wrs[count];
 472
 473                 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
 474                     toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 475                         struct work_request_hdr *wr = cplhdr(m0);
 476
 477                         wr->wr_hi |= htonl(F_WR_COMPL);
 478                         toep->tp_wr_unacked = 0;
 479                 }
 480                 KASSERT((m0->m_pkthdr.csum_data > 0) &&
 481                     (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
 482                         m0->m_pkthdr.csum_data));
 483                 m0->m_type = MT_DONTFREE;
 484                 enqueue_wr(toep, m0);
 485                 DPRINTF("sending offload tx with %d bytes in %d segments\n",
 486                     bytes, count);
 487                 l2t_send(cdev, m0, toep->tp_l2t);
 488         }
 489         sockbuf_unlock(snd);
 490         return (total_bytes);
 491 }
 492
 493 /*
 494  * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 495  * under any circumstances.  We take the easy way out and always queue the
 496  * message to the write_queue.  We can optimize the case where the queue is
 497  * already empty though the optimization is probably not worth it.
 498  */
 499 static void
 500 close_conn(struct socket *so)
 501 {
 502         struct mbuf *m;
 503         struct cpl_close_con_req *req;
 504         struct tom_data *d;
 505         struct inpcb *inp = so_sotoinpcb(so);
 506         struct tcpcb *tp;
 507         struct toepcb *toep;
 508         unsigned int tid;
 509
 510         inp_wlock(inp);
 511         tp = so_sototcpcb(so);
 512
 513         toep = tp->t_toe;
 514
 515         if (tp->t_state != TCPS_SYN_SENT)
 516                 t3_push_frames(so, 1);
 517
 518         if (toep->tp_flags & TP_FIN_SENT) {
 519                 inp_wunlock(inp);
 520                 return;
 521         }
 522
 523         tid = toep->tp_tid;
 524
 525         d = TOM_DATA(toep->tp_toedev);
 526
 527         m = m_gethdr_nofail(sizeof(*req));
 528         m_set_priority(m, CPL_PRIORITY_DATA);
 529         m_set_sgl(m, NULL);
 530         m_set_sgllen(m, 0);
 531
 532         toep->tp_flags |= TP_FIN_SENT;
 533         req = mtod(m, struct cpl_close_con_req *);
 534
 535         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 536         req->wr.wr_lo = htonl(V_WR_TID(tid));
 537         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 538
 539         req->rsvd = htonl(toep->tp_write_seq);
 540         inp_wunlock(inp);
 541
 542         /*
 543          * XXX - need to defer shutdown while there is still data in the queue
 544          *
 545          */
 546         CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
 547         cxgb_ofld_send(d->cdev, m);
 548
 549 }
 550
 551 /*
 552  * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 553  * and send it along.
 554  */
 555 static void
 556 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 557 {
 558         struct cpl_abort_req *req = cplhdr(m);
 559
 560         req->cmd = CPL_ABORT_NO_RST;
 561         cxgb_ofld_send(cdev, m);
 562 }
 563
 564 /*
 565  * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 566  * permitted to return without sending the message in case we cannot allocate
 567  * an sk_buff.  Returns the number of credits sent.
 568  */
 569 uint32_t
 570 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
 571 {
 572         struct mbuf *m;
 573         struct cpl_rx_data_ack *req;
 574         struct toepcb *toep = tp->t_toe;
 575         struct toedev *tdev = toep->tp_toedev;
 576
 577         m = m_gethdr_nofail(sizeof(*req));
 578
 579         DPRINTF("returning %u credits to HW\n", credits);
 580
 581         req = mtod(m, struct cpl_rx_data_ack *);
 582         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 583         req->wr.wr_lo = 0;
 584         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 585         req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 586         m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
 587         cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 588         return (credits);
 589 }
 590
 591 /*
 592  * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 593  * This is only used in DDP mode, so we take the opportunity to also set the
 594  * DACK mode and flush any Rx credits.
 595  */
 596 void
 597 t3_send_rx_modulate(struct toepcb *toep)
 598 {
 599         struct mbuf *m;
 600         struct cpl_rx_data_ack *req;
 601
 602         m = m_gethdr_nofail(sizeof(*req));
 603
 604         req = mtod(m, struct cpl_rx_data_ack *);
 605         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 606         req->wr.wr_lo = 0;
 607         m->m_pkthdr.len = m->m_len = sizeof(*req);
 608
 609         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 610         req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 611                                  V_RX_DACK_MODE(1) |
 612                                  V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
 613         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 614         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 615         toep->tp_rcv_wup = toep->tp_copied_seq;
 616 }
 617
/*
 * Handle receipt of an urgent pointer.
 *
 * so:      socket the urgent pointer was received on
 * urg_seq: sequence number taken from the urgent notification
 *
 * Unless URGENT_DATA_SUPPORTED is defined the body is empty and the call
 * does nothing.
 *
 * NOTE(review): the guarded code uses names that are not declared in this
 * function (sk, struct sk_buff, sock_flag(), tom_eat_skb(), ...), so it
 * looks like it has not been ported yet -- confirm before defining
 * URGENT_DATA_SUPPORTED.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
        struct tcpcb *tp = so_sototcpcb(so);

        urg_seq--;   /* initially points past the urgent data, per BSD */

        if (tp->urg_data && !after(urg_seq, tp->urg_seq))
                return;                                 /* duplicate pointer */
        sk_send_sigurg(sk);
        /*
         * The previously recorded urgent byte is the next one to be copied
         * and is not delivered inline: step over it, and drop the head
         * buffer if that consumed it entirely.
         */
        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                tp->copied_seq++;
                if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
                        tom_eat_skb(sk, skb, 0);
        }
        /* Record the new urgent pointer. */
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = urg_seq;
#endif
}
 644
 645 /*
 646  * Returns true if a socket cannot accept new Rx data.
 647  */
 648 static inline int
 649 so_no_receive(const struct socket *so)
 650 {
 651         return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
 652 }
 653
 654 /*
 655  * Process an urgent data notification.
 656  */
 657 static void
 658 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
 659 {
 660         struct cpl_rx_urg_notify *hdr = cplhdr(m);
 661         struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 662
 663         VALIDATE_SOCK(so);
 664
 665         if (!so_no_receive(so))
 666                 handle_urg_ptr(so, ntohl(hdr->seq));
 667
 668         m_freem(m);
 669 }
 670
 671 /*
 672  * Handler for RX_URG_NOTIFY CPL messages.
 673  */
 674 static int
 675 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 676 {
 677         struct toepcb *toep = (struct toepcb *)ctx;
 678
 679         rx_urg_notify(toep, m);
 680         return (0);
 681 }
 682
 683 static __inline int
 684 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
 685 {
 686         return (toep->tp_ulp_mode ||
 687                 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
 688                     dev->tod_ttid >= TOE_ID_CHELSIO_T3));
 689 }
 690
 691 /*
 692  * Set of states for which we should return RX credits.
 693  */
 694 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
 695
 696 /*
 697  * Called after some received data has been read.  It returns RX credits
 698  * to the HW for the amount of data processed.
 699  */
 700 void
 701 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
 702 {
 703         struct toepcb *toep = tp->t_toe;
 704         struct socket *so;
 705         struct toedev *dev;
 706         int dack_mode, must_send, read;
 707         u32 thres, credits, dack = 0;
 708         struct sockbuf *rcv;
 709
 710         so = inp_inpcbtosocket(tp->t_inpcb);
 711         rcv = so_sockbuf_rcv(so);
 712
 713         if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
 714                 (tp->t_state == TCPS_FIN_WAIT_2))) {
 715                 if (copied) {
 716                         sockbuf_lock(rcv);
 717                         toep->tp_copied_seq += copied;
 718                         sockbuf_unlock(rcv);
 719                 }
 720
 721                 return;
 722         }
 723
 724         sockbuf_lock(rcv);
 725         inp_wlock_assert(tp->t_inpcb);
 726         if (copied)
 727                 toep->tp_copied_seq += copied;
 728         else {
 729                 read = toep->tp_enqueued_bytes - rcv->sb_cc;
 730                 toep->tp_copied_seq += read;
 731         }
 732         credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 733         toep->tp_enqueued_bytes = rcv->sb_cc;
 734         sockbuf_unlock(rcv);
 735
 736         if (credits > rcv->sb_mbmax) {
 737                 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
 738                     toep->tp_copied_seq, toep->tp_rcv_wup, credits);
 739             credits = rcv->sb_mbmax;
 740         }
 741
 742
 743         /*
 744          * XXX this won't accurately reflect credit return - we need
 745          * to look at the difference between the amount that has been
 746          * put in the recv sockbuf and what is there now
 747          */
 748
 749         if (__predict_false(!credits))
 750                 return;
 751
 752         dev = toep->tp_toedev;
 753         thres = TOM_TUNABLE(dev, rx_credit_thres);
 754
 755         if (__predict_false(thres == 0))
 756                 return;
 757
 758         if (is_delack_mode_valid(dev, toep)) {
 759                 dack_mode = TOM_TUNABLE(dev, delack);
 760                 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
 761                         u32 r = tp->rcv_nxt - toep->tp_delack_seq;
 762
 763                         if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
 764                                 dack = F_RX_DACK_CHANGE |
 765                                        V_RX_DACK_MODE(dack_mode);
 766                 }
 767         } else
 768                 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 769
 770         /*
 771          * For coalescing to work effectively ensure the receive window has
 772          * at least 16KB left.
 773          */
 774         must_send = credits + 16384 >= tp->rcv_wnd;
 775
 776         if (must_send || credits >= thres)
 777                 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
 778 }
 779
 780 static int
 781 cxgb_toe_disconnect(struct tcpcb *tp)
 782 {
 783         struct socket *so;
 784
 785         DPRINTF("cxgb_toe_disconnect\n");
 786
 787         so = inp_inpcbtosocket(tp->t_inpcb);
 788         close_conn(so);
 789         return (0);
 790 }
 791
 792 static int
 793 cxgb_toe_reset(struct tcpcb *tp)
 794 {
 795         struct toepcb *toep = tp->t_toe;
 796
 797         t3_send_reset(toep);
 798
 799         /*
 800          * unhook from socket
 801          */
 802         tp->t_flags &= ~TF_TOE;
 803         toep->tp_tp = NULL;
 804         tp->t_toe = NULL;
 805         return (0);
 806 }
 807
 808 static int
 809 cxgb_toe_send(struct tcpcb *tp)
 810 {
 811         struct socket *so;
 812
 813         DPRINTF("cxgb_toe_send\n");
 814         dump_toepcb(tp->t_toe);
 815
 816         so = inp_inpcbtosocket(tp->t_inpcb);
 817         t3_push_frames(so, 1);
 818         return (0);
 819 }
 820
 821 static int
 822 cxgb_toe_rcvd(struct tcpcb *tp)
 823 {
 824
 825         inp_wlock_assert(tp->t_inpcb);
 826
 827         t3_cleanup_rbuf(tp, 0);
 828
 829         return (0);
 830 }
 831
 832 static void
 833 cxgb_toe_detach(struct tcpcb *tp)
 834 {
 835         struct toepcb *toep;
 836
 837         /*
 838          * XXX how do we handle teardown in the SYN_SENT state?
 839          *
 840          */
 841         inp_lock_assert(tp->t_inpcb);
 842         inp_wlock_assert(tp->t_inpcb);
 843         toep = tp->t_toe;
 844         toep->tp_tp = NULL;
 845
 846         /*
 847          * unhook from socket
 848          */
 849         tp->t_flags &= ~TF_TOE;
 850         tp->t_toe = NULL;
 851 }
 852
 853
 854 static struct toe_usrreqs cxgb_toe_usrreqs = {
 855         .tu_disconnect = cxgb_toe_disconnect,
 856         .tu_reset = cxgb_toe_reset,
 857         .tu_send = cxgb_toe_send,
 858         .tu_rcvd = cxgb_toe_rcvd,
 859         .tu_detach = cxgb_toe_detach,
 860         .tu_detach = cxgb_toe_detach,
 861         .tu_syncache_event = handle_syncache_event,
 862 };
 863
 864
 865 static void
 866 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
 867                             uint64_t mask, uint64_t val, int no_reply)
 868 {
 869         struct cpl_set_tcb_field *req;
 870
 871         CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 872             toep->tp_tid, word, mask, val);
 873
 874         req = mtod(m, struct cpl_set_tcb_field *);
 875         m->m_pkthdr.len = m->m_len = sizeof(*req);
 876         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 877         req->wr.wr_lo = 0;
 878         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
 879         req->reply = V_NO_REPLY(no_reply);
 880         req->cpu_idx = 0;
 881         req->word = htons(word);
 882         req->mask = htobe64(mask);
 883         req->val = htobe64(val);
 884
 885         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 886         send_or_defer(toep, m, 0);
 887 }
 888
 889 static void
 890 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
 891 {
 892         struct mbuf *m;
 893         struct tcpcb *tp = toep->tp_tp;
 894
 895         if (toep == NULL)
 896                 return;
 897
 898         if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
 899                 printf("not seting field\n");
 900                 return;
 901         }
 902
 903         m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
 904
 905         __set_tcb_field(toep, m, word, mask, val, 1);
 906 }
 907
 908 /*
 909  * Set one of the t_flags bits in the TCB.
 910  */
 911 static void
 912 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
 913 {
 914
 915         t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
 916 }
 917
 918 /*
 919  * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 920  */
 921 static void
 922 t3_set_nagle(struct toepcb *toep)
 923 {
 924         struct tcpcb *tp = toep->tp_tp;
 925
 926         set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
 927 }
 928
 929 /*
 930  * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 931  */
 932 void
 933 t3_set_keepalive(struct toepcb *toep, int on_off)
 934 {
 935
 936         set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
 937 }
 938
 939 void
 940 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
 941 {
 942         set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
 943 }
 944
 945 void
 946 t3_set_dack_mss(struct toepcb *toep, int on_off)
 947 {
 948
 949         set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
 950 }
 951
 952 /*
 953  * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 954  */
 955 static void
 956 t3_set_tos(struct toepcb *toep)
 957 {
 958         int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
 959
 960         t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
 961                          V_TCB_TOS(tos));
 962 }
 963
 964
 965 /*
 966  * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 967  * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 968  * set the PSH bit in the last segment, which would trigger delivery.]
 969  * We work around the issue by setting a DDP buffer in a partial placed state,
 970  * which guarantees that TP will schedule a timer.
 971  */
 972 #define TP_DDP_TIMER_WORKAROUND_MASK\
 973     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
 974      ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
 975        V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
 976 #define TP_DDP_TIMER_WORKAROUND_VAL\
 977     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
 978      ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
 979       32))
 980
 981 static void
 982 t3_enable_ddp(struct toepcb *toep, int on)
 983 {
 984         if (on) {
 985
 986                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
 987                                  V_TF_DDP_OFF(0));
 988         } else
 989                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
 990                                  V_TF_DDP_OFF(1) |
 991                                  TP_DDP_TIMER_WORKAROUND_MASK,
 992                                  V_TF_DDP_OFF(1) |
 993                                  TP_DDP_TIMER_WORKAROUND_VAL);
 994
 995 }
 996
 997 void
 998 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
 999 {
1000         t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1001                          V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1002                          tag_color);
1003 }
1004
1005 void
1006 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1007                     unsigned int len)
1008 {
1009         if (buf_idx == 0)
1010                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1011                          V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1012                          V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1013                          V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1014                          V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1015         else
1016                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1017                          V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1018                          V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1019                          V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1020                          V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1021 }
1022
/*
 * Validate a congestion-control algorithm name.
 *
 * When CONGESTION_CONTROL_SUPPORTED is defined the name is looked up in
 * t3_cong_ops[] and EINVAL is returned if no entry matches; otherwise every
 * name is accepted.  Returns 0 on success or a positive errno value, which
 * is what t3_tcp_ctloutput() passes back to its caller.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return (EINVAL);
#endif
	return (0);
}
1038
1039 int
1040 t3_get_tcb(struct toepcb *toep)
1041 {
1042         struct cpl_get_tcb *req;
1043         struct tcpcb *tp = toep->tp_tp;
1044         struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1045
1046         if (!m)
1047                 return (ENOMEM);
1048
1049         inp_lock_assert(tp->t_inpcb);
1050
1051         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1052         req = mtod(m, struct cpl_get_tcb *);
1053         m->m_pkthdr.len = m->m_len = sizeof(*req);
1054         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1055         req->wr.wr_lo = 0;
1056         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1057         req->cpuno = htons(toep->tp_qset);
1058         req->rsvd = 0;
1059         if (tp->t_state == TCPS_SYN_SENT)
1060                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1061         else
1062                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1063         return 0;
1064 }
1065
1066 static inline void
1067 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1068 {
1069
1070         toepcb_hold(toep);
1071
1072         cxgb_insert_tid(d->cdev, d->client, toep, tid);
1073 }
1074
1075 /**
1076  *      find_best_mtu - find the entry in the MTU table closest to an MTU
1077  *      @d: TOM state
1078  *      @mtu: the target MTU
1079  *
1080  *      Returns the index of the value in the MTU table that is closest to but
1081  *      does not exceed the target MTU.
1082  */
1083 static unsigned int
1084 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1085 {
1086         int i = 0;
1087
1088         while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1089                 ++i;
1090         return (i);
1091 }
1092
1093 static unsigned int
1094 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1095 {
1096         unsigned int idx;
1097
1098 #ifdef notyet
1099         struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1100 #endif
1101         if (tp) {
1102                 tp->t_maxseg = pmtu - 40;
1103                 if (tp->t_maxseg < td->mtus[0] - 40)
1104                         tp->t_maxseg = td->mtus[0] - 40;
1105                 idx = find_best_mtu(td, tp->t_maxseg + 40);
1106
1107                 tp->t_maxseg = td->mtus[idx] - 40;
1108         } else
1109                 idx = find_best_mtu(td, pmtu);
1110
1111         return (idx);
1112 }
1113
1114 static inline void
1115 free_atid(struct t3cdev *cdev, unsigned int tid)
1116 {
1117         struct toepcb *toep = cxgb_free_atid(cdev, tid);
1118
1119         if (toep)
1120                 toepcb_release(toep);
1121 }
1122
1123 /*
1124  * Release resources held by an offload connection (TID, L2T entry, etc.)
1125  */
1126 static void
1127 t3_release_offload_resources(struct toepcb *toep)
1128 {
1129         struct tcpcb *tp = toep->tp_tp;
1130         struct toedev *tdev = toep->tp_toedev;
1131         struct t3cdev *cdev;
1132         struct socket *so;
1133         unsigned int tid = toep->tp_tid;
1134         struct sockbuf *rcv;
1135
1136         CTR0(KTR_TOM, "t3_release_offload_resources");
1137
1138         if (!tdev)
1139                 return;
1140
1141         cdev = TOEP_T3C_DEV(toep);
1142         if (!cdev)
1143                 return;
1144
1145         toep->tp_qset = 0;
1146         t3_release_ddp_resources(toep);
1147
1148 #ifdef CTRL_SKB_CACHE
1149         kfree_skb(CTRL_SKB_CACHE(tp));
1150         CTRL_SKB_CACHE(tp) = NULL;
1151 #endif
1152
1153         if (toep->tp_wr_avail != toep->tp_wr_max) {
1154                 purge_wr_queue(toep);
1155                 reset_wr_list(toep);
1156         }
1157
1158         if (toep->tp_l2t) {
1159                 l2t_release(L2DATA(cdev), toep->tp_l2t);
1160                 toep->tp_l2t = NULL;
1161         }
1162         toep->tp_tp = NULL;
1163         if (tp) {
1164                 inp_wlock_assert(tp->t_inpcb);
1165                 so = inp_inpcbtosocket(tp->t_inpcb);
1166                 rcv = so_sockbuf_rcv(so);
1167                 /*
1168                  * cancel any offloaded reads
1169                  *
1170                  */
1171                 sockbuf_lock(rcv);
1172
1173                 tp->t_toe = NULL;
1174                 tp->t_flags &= ~TF_TOE;
1175                 if (toep->tp_ddp_state.user_ddp_pending) {
1176                         t3_cancel_ubuf(toep, rcv);
1177                         toep->tp_ddp_state.user_ddp_pending = 0;
1178                 }
1179                 so_sorwakeup_locked(so);
1180
1181         }
1182
1183         if (toep->tp_state == TCPS_SYN_SENT) {
1184                 free_atid(cdev, tid);
1185 #ifdef notyet
1186                 __skb_queue_purge(&tp->out_of_order_queue);
1187 #endif
1188         } else {                                          // we have TID
1189                 cxgb_remove_tid(cdev, toep, tid);
1190                 toepcb_release(toep);
1191         }
1192 #if 0
1193         log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1194 #endif
1195 }
1196
1197 static void
1198 install_offload_ops(struct socket *so)
1199 {
1200         struct tcpcb *tp = so_sototcpcb(so);
1201
1202         KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1203
1204         t3_install_socket_ops(so);
1205         tp->t_flags |= TF_TOE;
1206         tp->t_tu = &cxgb_toe_usrreqs;
1207 }
1208
1209 /*
1210  * Determine the receive window scaling factor given a target max
1211  * receive window.
1212  */
1213 static __inline int
1214 select_rcv_wscale(int space)
1215 {
1216         int wscale = 0;
1217
1218         if (space > MAX_RCV_WND)
1219                 space = MAX_RCV_WND;
1220
1221         if (tcp_do_rfc1323)
1222                 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1223
1224         return (wscale);
1225 }
1226
1227 /*
1228  * Determine the receive window size for a socket.
1229  */
1230 static unsigned long
1231 select_rcv_wnd(struct toedev *dev, struct socket *so)
1232 {
1233         struct tom_data *d = TOM_DATA(dev);
1234         unsigned int wnd;
1235         unsigned int max_rcv_wnd;
1236         struct sockbuf *rcv;
1237
1238         rcv = so_sockbuf_rcv(so);
1239
1240         if (tcp_do_autorcvbuf)
1241                 wnd = tcp_autorcvbuf_max;
1242         else
1243                 wnd = rcv->sb_hiwat;
1244
1245
1246
1247         /* XXX
1248          * For receive coalescing to work effectively we need a receive window
1249          * that can accomodate a coalesced segment.
1250          */
1251         if (wnd < MIN_RCV_WND)
1252                 wnd = MIN_RCV_WND;
1253
1254         /* PR 5138 */
1255         max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1256                                     (uint32_t)d->rx_page_size * 23 :
1257                                     MAX_RCV_WND);
1258
1259         return min(wnd, max_rcv_wnd);
1260 }
1261
1262 /*
1263  * Assign offload parameters to some socket fields.  This code is used by
1264  * both active and passive opens.
1265  */
1266 static inline void
1267 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1268     struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1269 {
1270         struct tcpcb *tp = so_sototcpcb(so);
1271         struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1272         struct sockbuf *snd, *rcv;
1273
1274 #ifdef notyet
1275         SOCK_LOCK_ASSERT(so);
1276 #endif
1277
1278         snd = so_sockbuf_snd(so);
1279         rcv = so_sockbuf_rcv(so);
1280
1281         log(LOG_INFO, "initializing offload socket\n");
1282         /*
1283          * We either need to fix push frames to work with sbcompress
1284          * or we need to add this
1285          */
1286         snd->sb_flags |= SB_NOCOALESCE;
1287         rcv->sb_flags |= SB_NOCOALESCE;
1288
1289         tp->t_toe = toep;
1290         toep->tp_tp = tp;
1291         toep->tp_toedev = dev;
1292
1293         toep->tp_tid = tid;
1294         toep->tp_l2t = e;
1295         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1296         toep->tp_wr_unacked = 0;
1297         toep->tp_delack_mode = 0;
1298
1299         toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1300         /*
1301          * XXX broken
1302          *
1303          */
1304         tp->rcv_wnd = select_rcv_wnd(dev, so);
1305
1306         toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1307                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1308         toep->tp_qset_idx = 0;
1309
1310         reset_wr_list(toep);
1311         DPRINTF("initialization done\n");
1312 }
1313
1314 /*
1315  * The next two functions calculate the option 0 value for a socket.
1316  */
1317 static inline unsigned int
1318 calc_opt0h(struct socket *so, int mtu_idx)
1319 {
1320         struct tcpcb *tp = so_sototcpcb(so);
1321         int wscale = select_rcv_wscale(tp->rcv_wnd);
1322
1323         return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1324             V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1325             V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1326 }
1327
1328 static inline unsigned int
1329 calc_opt0l(struct socket *so, int ulp_mode)
1330 {
1331         struct tcpcb *tp = so_sototcpcb(so);
1332         unsigned int val;
1333
1334         val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1335                V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1336
1337         DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1338         return (val);
1339 }
1340
1341 static inline unsigned int
1342 calc_opt2(const struct socket *so, struct toedev *dev)
1343 {
1344         int flv_valid;
1345
1346         flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1347
1348         return (V_FLAVORS_VALID(flv_valid) |
1349             V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1350 }
1351
#if DEBUG_WR > 1
/*
 * Debug helper: walk the toepcb's work-request queue and return the sum of
 * each mbuf's m_pkthdr.csum_data.
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
	const struct mbuf *wr;
	int total = 0;

	wr_queue_walk(toep, wr) {
		total += wr->m_pkthdr.csum_data;
	}
	return (total);
}
#endif
1364
1365 #if 0
1366 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1367 #endif
1368
1369 static void
1370 mk_act_open_req(struct socket *so, struct mbuf *m,
1371     unsigned int atid, const struct l2t_entry *e)
1372 {
1373         struct cpl_act_open_req *req;
1374         struct inpcb *inp = so_sotoinpcb(so);
1375         struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1376         struct toepcb *toep = tp->t_toe;
1377         struct toedev *tdev = toep->tp_toedev;
1378
1379         m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1380
1381         req = mtod(m, struct cpl_act_open_req *);
1382         m->m_pkthdr.len = m->m_len = sizeof(*req);
1383
1384         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1385         req->wr.wr_lo = 0;
1386         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1387         inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1388 #if 0
1389         req->local_port = inp->inp_lport;
1390         req->peer_port = inp->inp_fport;
1391         memcpy(&req->local_ip, &inp->inp_laddr, 4);
1392         memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1393 #endif
1394         req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1395                            V_TX_CHANNEL(e->smt_idx));
1396         req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1397         req->params = 0;
1398         req->opt2 = htonl(calc_opt2(so, tdev));
1399 }
1400
1401
1402 /*
1403  * Convert an ACT_OPEN_RPL status to an errno.
1404  */
1405 static int
1406 act_open_rpl_status_to_errno(int status)
1407 {
1408         switch (status) {
1409         case CPL_ERR_CONN_RESET:
1410                 return (ECONNREFUSED);
1411         case CPL_ERR_ARP_MISS:
1412                 return (EHOSTUNREACH);
1413         case CPL_ERR_CONN_TIMEDOUT:
1414                 return (ETIMEDOUT);
1415         case CPL_ERR_TCAM_FULL:
1416                 return (ENOMEM);
1417         case CPL_ERR_CONN_EXIST:
1418                 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1419                 return (EADDRINUSE);
1420         default:
1421                 return (EIO);
1422         }
1423 }
1424
1425 static void
1426 fail_act_open(struct toepcb *toep, int errno)
1427 {
1428         struct tcpcb *tp = toep->tp_tp;
1429
1430         t3_release_offload_resources(toep);
1431         if (tp) {
1432                 inp_wunlock(tp->t_inpcb);
1433                 tcp_offload_drop(tp, errno);
1434         }
1435
1436 #ifdef notyet
1437         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1438 #endif
1439 }
1440
1441 /*
1442  * Handle active open failures.
1443  */
1444 static void
1445 active_open_failed(struct toepcb *toep, struct mbuf *m)
1446 {
1447         struct cpl_act_open_rpl *rpl = cplhdr(m);
1448         struct inpcb *inp;
1449
1450         if (toep->tp_tp == NULL)
1451                 goto done;
1452
1453         inp = toep->tp_tp->t_inpcb;
1454         inp_wlock(inp);
1455
1456 /*
1457  * Don't handle connection retry for now
1458  */
1459 #ifdef notyet
1460         struct inet_connection_sock *icsk = inet_csk(sk);
1461
1462         if (rpl->status == CPL_ERR_CONN_EXIST &&
1463             icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1464                 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1465                 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1466                                jiffies + HZ / 2);
1467         } else
1468 #endif
1469         {
1470                 inp_wlock(inp);
1471                 /*
1472                  * drops the inpcb lock
1473                  */
1474                 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1475         }
1476
1477         inp_wunlock(inp);
1478 done:
1479         INP_INFO_WUNLOCK(&tcbinfo);
1480
1481         m_free(m);
1482 }
1483
1484 /*
1485  * Return whether a failed active open has allocated a TID
1486  */
1487 static inline int
1488 act_open_has_tid(int status)
1489 {
1490         return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1491                status != CPL_ERR_ARP_MISS;
1492 }
1493
1494 /*
1495  * Process an ACT_OPEN_RPL CPL message.
1496  */
1497 static int
1498 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1499 {
1500         struct toepcb *toep = (struct toepcb *)ctx;
1501         struct cpl_act_open_rpl *rpl = cplhdr(m);
1502
1503         if (cdev->type != T3A && act_open_has_tid(rpl->status))
1504                 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1505
1506         active_open_failed(toep, m);
1507         return (0);
1508 }
1509
/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 *
 * The open is failed with EHOSTUNREACH only while the connection is still in
 * SYN_SENT or SYN_RECEIVED.  fail_act_open() takes the toepcb and releases
 * the inpcb lock itself, so the lock is dropped here only on the other path.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;

	inp_wlock(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		/*
		 * drops the inpcb lock
		 */
		fail_act_open(toep, EHOSTUNREACH);
		printf("freeing %p\n", m);

		m_free(m);
	} else
		inp_wunlock(inp);
}
#endif
1541 /*
1542  * Send an active open request.
1543  */
1544 int
1545 t3_connect(struct toedev *tdev, struct socket *so,
1546     struct rtentry *rt, struct sockaddr *nam)
1547 {
1548         struct mbuf *m;
1549         struct l2t_entry *e;
1550         struct tom_data *d = TOM_DATA(tdev);
1551         struct inpcb *inp = so_sotoinpcb(so);
1552         struct tcpcb *tp = intotcpcb(inp);
1553         struct toepcb *toep; /* allocated by init_offload_socket */
1554
1555         int atid;
1556
1557         toep = toepcb_alloc();
1558         if (toep == NULL)
1559                 goto out_err;
1560
1561         if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1562                 goto out_err;
1563
1564         e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1565         if (!e)
1566                 goto free_tid;
1567
1568         inp_wlock_assert(inp);
1569         m = m_gethdr(MT_DATA, M_WAITOK);
1570
1571 #if 0
1572         m->m_toe.mt_toepcb = tp->t_toe;
1573         set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1574 #endif
1575         so_lock(so);
1576
1577         init_offload_socket(so, tdev, atid, e, rt, toep);
1578
1579         install_offload_ops(so);
1580
1581         mk_act_open_req(so, m, atid, e);
1582         so_unlock(so);
1583
1584         soisconnecting(so);
1585         toep = tp->t_toe;
1586         m_set_toep(m, tp->t_toe);
1587
1588         toep->tp_state = TCPS_SYN_SENT;
1589         l2t_send(d->cdev, (struct mbuf *)m, e);
1590
1591         if (toep->tp_ulp_mode)
1592                 t3_enable_ddp(toep, 0);
1593         return  (0);
1594
1595 free_tid:
1596         printf("failing connect - free atid\n");
1597
1598         free_atid(d->cdev, atid);
1599 out_err:
1600         printf("return ENOMEM\n");
1601        return (ENOMEM);
1602 }
1603
1604 /*
1605  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1606  * not send multiple ABORT_REQs for the same connection and also that we do
1607  * not try to send a message after the connection has closed.  Returns 1 if
1608  * an ABORT_REQ wasn't generated after all, 0 otherwise.
1609  */
1610 static void
1611 t3_send_reset(struct toepcb *toep)
1612 {
1613
1614         struct cpl_abort_req *req;
1615         unsigned int tid = toep->tp_tid;
1616         int mode = CPL_ABORT_SEND_RST;
1617         struct tcpcb *tp = toep->tp_tp;
1618         struct toedev *tdev = toep->tp_toedev;
1619         struct socket *so = NULL;
1620         struct mbuf *m;
1621         struct sockbuf *snd;
1622
1623         if (tp) {
1624                 inp_wlock_assert(tp->t_inpcb);
1625                 so = toeptoso(toep);
1626         }
1627
1628         if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1629                 tdev == NULL))
1630                 return;
1631         toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1632
1633         snd = so_sockbuf_snd(so);
1634         /* Purge the send queue so we don't send anything after an abort. */
1635         if (so)
1636                 sbflush(snd);
1637         if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1638                 mode |= CPL_ABORT_POST_CLOSE_REQ;
1639
1640         m = m_gethdr_nofail(sizeof(*req));
1641         m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1642         set_arp_failure_handler(m, abort_arp_failure);
1643
1644         req = mtod(m, struct cpl_abort_req *);
1645         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1646         req->wr.wr_lo = htonl(V_WR_TID(tid));
1647         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1648         req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1649         req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1650         req->cmd = mode;
1651         if (tp && (tp->t_state == TCPS_SYN_SENT))
1652                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1653         else
1654                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1655 }
1656
1657 static int
1658 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1659 {
1660         struct inpcb *inp;
1661         int error, optval;
1662
1663         if (sopt->sopt_name == IP_OPTIONS)
1664                 return (ENOPROTOOPT);
1665
1666         if (sopt->sopt_name != IP_TOS)
1667                 return (EOPNOTSUPP);
1668
1669         error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1670
1671         if (error)
1672                 return (error);
1673
1674         if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1675                 return (EPERM);
1676
1677         inp = so_sotoinpcb(so);
1678         inp_wlock(inp);
1679         inp_ip_tos_set(inp, optval);
1680 #if 0
1681         inp->inp_ip_tos = optval;
1682 #endif
1683         t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1684         inp_wunlock(inp);
1685
1686         return (0);
1687 }
1688
1689 static int
1690 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1691 {
1692         int err = 0;
1693         size_t copied;
1694
1695         if (sopt->sopt_name != TCP_CONGESTION &&
1696             sopt->sopt_name != TCP_NODELAY)
1697                 return (EOPNOTSUPP);
1698
1699         if (sopt->sopt_name == TCP_CONGESTION) {
1700                 char name[TCP_CA_NAME_MAX];
1701                 int optlen = sopt->sopt_valsize;
1702                 struct tcpcb *tp;
1703
1704                 if (sopt->sopt_dir == SOPT_GET) {
1705                         KASSERT(0, ("unimplemented"));
1706                         return (EOPNOTSUPP);
1707                 }
1708
1709                 if (optlen < 1)
1710                         return (EINVAL);
1711
1712                 err = copyinstr(sopt->sopt_val, name,
1713                     min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1714                 if (err)
1715                         return (err);
1716                 if (copied < 1)
1717                         return (EINVAL);
1718
1719                 tp = so_sototcpcb(so);
1720                 /*
1721                  * XXX I need to revisit this
1722                  */
1723                 if ((err = t3_set_cong_control(so, name)) == 0) {
1724 #ifdef CONGESTION_CONTROL_SUPPORTED
1725                         tp->t_cong_control = strdup(name, M_CXGB);
1726 #endif
1727                 } else
1728                         return (err);
1729         } else {
1730                 int optval, oldval;
1731                 struct inpcb *inp;
1732                 struct tcpcb *tp;
1733
1734                 if (sopt->sopt_dir == SOPT_GET)
1735                         return (EOPNOTSUPP);
1736
1737                 err = sooptcopyin(sopt, &optval, sizeof optval,
1738                     sizeof optval);
1739
1740                 if (err)
1741                         return (err);
1742
1743                 inp = so_sotoinpcb(so);
1744                 inp_wlock(inp);
1745                 tp = inp_inpcbtotcpcb(inp);
1746
1747                 oldval = tp->t_flags;
1748                 if (optval)
1749                         tp->t_flags |= TF_NODELAY;
1750                 else
1751                         tp->t_flags &= ~TF_NODELAY;
1752                 inp_wunlock(inp);
1753
1754                 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1755                         t3_set_nagle(tp->t_toe);
1756
1757         }
1758
1759         return (0);
1760 }
1761
1762 int
1763 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1764 {
1765         int err;
1766
1767         if (sopt->sopt_level != IPPROTO_TCP)
1768                 err =  t3_ip_ctloutput(so, sopt);
1769         else
1770                 err = t3_tcp_ctloutput(so, sopt);
1771
1772         if (err != EOPNOTSUPP)
1773                 return (err);
1774
1775         return (tcp_ctloutput(so, sopt));
1776 }
1777
/*
 * Returns true if we need to explicitly request RST when we receive new data
 * on an RX-closed connection.  The toepcb is not consulted: the answer is
 * always 1.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
	(void)toep;

	return (1);
}
1787
1788 /*
1789  * Handles Rx data that arrives in a state where the socket isn't accepting
1790  * new data.
1791  */
1792 static void
1793 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1794 {
1795
1796         if (need_rst_on_excess_rx(toep) &&
1797             !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1798                 t3_send_reset(toep);
1799         m_freem(m);
1800 }
1801
1802 /*
1803  * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1804  * by getting the DDP offset from the TCB.
1805  */
1806 static void
1807 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1808 {
1809         struct ddp_state *q = &toep->tp_ddp_state;
1810         struct ddp_buf_state *bsp;
1811         struct cpl_get_tcb_rpl *hdr;
1812         unsigned int ddp_offset;
1813         struct socket *so;
1814         struct tcpcb *tp;
1815         struct sockbuf *rcv;
1816         int state;
1817
1818         uint64_t t;
1819         __be64 *tcb;
1820
1821         tp = toep->tp_tp;
1822         so = inp_inpcbtosocket(tp->t_inpcb);
1823
1824         inp_wlock_assert(tp->t_inpcb);
1825         rcv = so_sockbuf_rcv(so);
1826         sockbuf_lock(rcv);
1827
1828         /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1829          * We really need a cookie in order to dispatch the RPLs.
1830          */
1831         q->get_tcb_count--;
1832
1833         /* It is a possible that a previous CPL already invalidated UBUF DDP
1834          * and moved the cur_buf idx and hence no further processing of this
1835          * skb is required. However, the app might be sleeping on
1836          * !q->get_tcb_count and we need to wake it up.
1837          */
1838         if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1839                 int state = so_state_get(so);
1840
1841                 m_freem(m);
1842                 if (__predict_true((state & SS_NOFDREF) == 0))
1843                         so_sorwakeup_locked(so);
1844                 else
1845                         sockbuf_unlock(rcv);
1846
1847                 return;
1848         }
1849
1850         bsp = &q->buf_state[q->cur_buf];
1851         hdr = cplhdr(m);
1852         tcb = (__be64 *)(hdr + 1);
1853         if (q->cur_buf == 0) {
1854                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1855                 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1856         } else {
1857                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1858                 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1859         }
1860         ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1861         m->m_cur_offset = bsp->cur_offset;
1862         bsp->cur_offset = ddp_offset;
1863         m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1864
1865         CTR5(KTR_TOM,
1866             "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1867             q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1868         KASSERT(ddp_offset >= m->m_cur_offset,
1869             ("ddp_offset=%u less than cur_offset=%u",
1870                 ddp_offset, m->m_cur_offset));
1871
1872 #if 0
1873 {
1874         unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1875
1876         t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1877         ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1878
1879         t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1880         rcv_nxt = t >> S_TCB_RCV_NXT;
1881         rcv_nxt &= M_TCB_RCV_NXT;
1882
1883         t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1884         rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1885         rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1886
1887         T3_TRACE2(TIDTB(sk),
1888                   "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1889                   ddp_flags, rcv_nxt - rx_hdr_offset);
1890         T3_TRACE4(TB(q),
1891                   "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1892                   tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1893         T3_TRACE3(TB(q),
1894                   "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1895                   rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1896         T3_TRACE2(TB(q),
1897                   "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1898                  q->buf_state[0].flags, q->buf_state[1].flags);
1899
1900 }
1901 #endif
1902         if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1903                 handle_excess_rx(toep, m);
1904                 return;
1905         }
1906
1907 #ifdef T3_TRACE
1908         if ((int)m->m_pkthdr.len < 0) {
1909                 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1910         }
1911 #endif
1912         if (bsp->flags & DDP_BF_NOCOPY) {
1913 #ifdef T3_TRACE
1914                 T3_TRACE0(TB(q),
1915                           "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1916
1917                 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1918                         printk("!cancel_ubuf");
1919                         t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1920                 }
1921 #endif
1922                 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1923                 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1924                 q->cur_buf ^= 1;
1925         } else if (bsp->flags & DDP_BF_NOFLIP) {
1926
1927                 m->m_ddp_flags = 1;    /* always a kernel buffer */
1928
1929                 /* now HW buffer carries a user buffer */
1930                 bsp->flags &= ~DDP_BF_NOFLIP;
1931                 bsp->flags |= DDP_BF_NOCOPY;
1932
1933                 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1934                  * any new data in which case we're done. If in addition the
1935                  * offset is 0, then there wasn't a completion for the kbuf
1936                  * and we need to decrement the posted count.
1937                  */
1938                 if (m->m_pkthdr.len == 0) {
1939                         if (ddp_offset == 0) {
1940                                 q->kbuf_posted--;
1941                                 bsp->flags |= DDP_BF_NODATA;
1942                         }
1943                         sockbuf_unlock(rcv);
1944                         m_free(m);
1945                         return;
1946                 }
1947         } else {
1948                 sockbuf_unlock(rcv);
1949
1950                 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1951                  * but it got here way late and nobody cares anymore.
1952                  */
1953                 m_free(m);
1954                 return;
1955         }
1956
1957         m->m_ddp_gl = (unsigned char *)bsp->gl;
1958         m->m_flags |= M_DDP;
1959         m->m_seq = tp->rcv_nxt;
1960         tp->rcv_nxt += m->m_pkthdr.len;
1961         tp->t_rcvtime = ticks;
1962         CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1963                   m->m_seq, q->cur_buf, m->m_pkthdr.len);
1964         if (m->m_pkthdr.len == 0) {
1965                 q->user_ddp_pending = 0;
1966                 m_free(m);
1967         } else
1968                 SBAPPEND(rcv, m);
1969
1970         state = so_state_get(so);
1971         if (__predict_true((state & SS_NOFDREF) == 0))
1972                 so_sorwakeup_locked(so);
1973         else
1974                 sockbuf_unlock(rcv);
1975 }
1976
1977 /*
1978  * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1979  * in that case they are similar to DDP completions.
1980  */
1981 static int
1982 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1983 {
1984         struct toepcb *toep = (struct toepcb *)ctx;
1985
1986         /* OK if socket doesn't exist */
1987         if (toep == NULL) {
1988                 printf("null toep in do_get_tcb_rpl\n");
1989                 return (CPL_RET_BUF_DONE);
1990         }
1991
1992         inp_wlock(toep->tp_tp->t_inpcb);
1993         tcb_rpl_as_ddp_complete(toep, m);
1994         inp_wunlock(toep->tp_tp->t_inpcb);
1995
1996         return (0);
1997 }
1998
1999 static void
2000 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
2001 {
2002         struct tcpcb *tp = toep->tp_tp;
2003         struct socket *so;
2004         struct ddp_state *q;
2005         struct ddp_buf_state *bsp;
2006         struct cpl_rx_data *hdr = cplhdr(m);
2007         unsigned int rcv_nxt = ntohl(hdr->seq);
2008         struct sockbuf *rcv;
2009
2010         if (tp->rcv_nxt == rcv_nxt)
2011                 return;
2012
2013         inp_wlock_assert(tp->t_inpcb);
2014         so  = inp_inpcbtosocket(tp->t_inpcb);
2015         rcv = so_sockbuf_rcv(so);
2016         sockbuf_lock(rcv);
2017
2018         q = &toep->tp_ddp_state;
2019         bsp = &q->buf_state[q->cur_buf];
2020         KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2021                 rcv_nxt, tp->rcv_nxt));
2022         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2023         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2024         CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2025             rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2026
2027 #ifdef T3_TRACE
2028         if ((int)m->m_pkthdr.len < 0) {
2029                 t3_ddp_error(so, "handle_ddp_data: neg len");
2030         }
2031 #endif
2032         m->m_ddp_gl = (unsigned char *)bsp->gl;
2033         m->m_flags |= M_DDP;
2034         m->m_cur_offset = bsp->cur_offset;
2035         m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2036         if (bsp->flags & DDP_BF_NOCOPY)
2037                 bsp->flags &= ~DDP_BF_NOCOPY;
2038
2039         m->m_seq = tp->rcv_nxt;
2040         tp->rcv_nxt = rcv_nxt;
2041         bsp->cur_offset += m->m_pkthdr.len;
2042         if (!(bsp->flags & DDP_BF_NOFLIP))
2043                 q->cur_buf ^= 1;
2044         /*
2045          * For now, don't re-enable DDP after a connection fell out of  DDP
2046          * mode.
2047          */
2048         q->ubuf_ddp_ready = 0;
2049         sockbuf_unlock(rcv);
2050 }
2051
2052 /*
2053  * Process new data received for a connection.
2054  */
2055 static void
2056 new_rx_data(struct toepcb *toep, struct mbuf *m)
2057 {
2058         struct cpl_rx_data *hdr = cplhdr(m);
2059         struct tcpcb *tp = toep->tp_tp;
2060         struct socket *so;
2061         struct sockbuf *rcv;
2062         int state;
2063         int len = be16toh(hdr->len);
2064
2065         inp_wlock(tp->t_inpcb);
2066
2067         so  = inp_inpcbtosocket(tp->t_inpcb);
2068
2069         if (__predict_false(so_no_receive(so))) {
2070                 handle_excess_rx(toep, m);
2071                 inp_wunlock(tp->t_inpcb);
2072                 TRACE_EXIT;
2073                 return;
2074         }
2075
2076         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2077                 handle_ddp_data(toep, m);
2078
2079         m->m_seq = ntohl(hdr->seq);
2080         m->m_ulp_mode = 0;                    /* for iSCSI */
2081
2082 #if VALIDATE_SEQ
2083         if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2084                 log(LOG_ERR,
2085                        "%s: TID %u: Bad sequence number %u, expected %u\n",
2086                     toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2087                        tp->rcv_nxt);
2088                 m_freem(m);
2089                 inp_wunlock(tp->t_inpcb);
2090                 return;
2091         }
2092 #endif
2093         m_adj(m, sizeof(*hdr));
2094
2095 #ifdef URGENT_DATA_SUPPORTED
2096         /*
2097          * We don't handle urgent data yet
2098          */
2099         if (__predict_false(hdr->urg))
2100                 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2101         if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2102                      tp->urg_seq - tp->rcv_nxt < skb->len))
2103                 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2104                                                          tp->rcv_nxt];
2105 #endif
2106         if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2107                 toep->tp_delack_mode = hdr->dack_mode;
2108                 toep->tp_delack_seq = tp->rcv_nxt;
2109         }
2110         CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2111             m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2112
2113         if (len < m->m_pkthdr.len)
2114                 m->m_pkthdr.len = m->m_len = len;
2115
2116         tp->rcv_nxt += m->m_pkthdr.len;
2117         tp->t_rcvtime = ticks;
2118         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2119         CTR2(KTR_TOM,
2120             "new_rx_data: seq 0x%x len %u",
2121             m->m_seq, m->m_pkthdr.len);
2122         inp_wunlock(tp->t_inpcb);
2123         rcv = so_sockbuf_rcv(so);
2124         sockbuf_lock(rcv);
2125 #if 0
2126         if (sb_notify(rcv))
2127                 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2128 #endif
2129         SBAPPEND(rcv, m);
2130         inp_wunlock(tp->t_inpcb);
2131         sockbuf_lock(rcv);
2132
2133 #ifdef notyet
2134         /*
2135          * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2136          *
2137          */
2138         KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2139
2140             ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2141                 so, rcv->sb_cc, rcv->sb_mbmax));
2142 #endif
2143
2144
2145         CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2146             rcv->sb_cc, rcv->sb_mbcnt);
2147
2148         state = so_state_get(so);
2149         if (__predict_true((state & SS_NOFDREF) == 0))
2150                 so_sorwakeup_locked(so);
2151         else
2152                 sockbuf_unlock(rcv);
2153 }
2154
2155 /*
2156  * Handler for RX_DATA CPL messages.
2157  */
2158 static int
2159 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2160 {
2161         struct toepcb *toep = (struct toepcb *)ctx;
2162
2163         DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2164
2165         new_rx_data(toep, m);
2166
2167         return (0);
2168 }
2169
2170 static void
2171 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2172 {
2173         struct tcpcb *tp;
2174         struct ddp_state *q;
2175         struct ddp_buf_state *bsp;
2176         struct cpl_rx_data_ddp *hdr;
2177         struct socket *so;
2178         unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2179         int nomoredata = 0;
2180         unsigned int delack_mode;
2181         struct sockbuf *rcv;
2182
2183         tp = toep->tp_tp;
2184         inp_wlock(tp->t_inpcb);
2185         so = inp_inpcbtosocket(tp->t_inpcb);
2186
2187         if (__predict_false(so_no_receive(so))) {
2188
2189                 handle_excess_rx(toep, m);
2190                 inp_wunlock(tp->t_inpcb);
2191                 return;
2192         }
2193
2194         q = &toep->tp_ddp_state;
2195         hdr = cplhdr(m);
2196         ddp_report = ntohl(hdr->u.ddp_report);
2197         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2198         bsp = &q->buf_state[buf_idx];
2199
2200         CTR4(KTR_TOM,
2201             "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2202             "hdr seq 0x%x len %u",
2203             tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2204             ntohs(hdr->len));
2205         CTR3(KTR_TOM,
2206             "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2207             G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2208
2209         ddp_len = ntohs(hdr->len);
2210         rcv_nxt = ntohl(hdr->seq) + ddp_len;
2211
2212         delack_mode = G_DDP_DACK_MODE(ddp_report);
2213         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2214                 toep->tp_delack_mode = delack_mode;
2215                 toep->tp_delack_seq = tp->rcv_nxt;
2216         }
2217
2218         m->m_seq = tp->rcv_nxt;
2219         tp->rcv_nxt = rcv_nxt;
2220
2221         tp->t_rcvtime = ticks;
2222         /*
2223          * Store the length in m->m_len.  We are changing the meaning of
2224          * m->m_len here, we need to be very careful that nothing from now on
2225          * interprets ->len of this packet the usual way.
2226          */
2227         m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2228         inp_wunlock(tp->t_inpcb);
2229         CTR3(KTR_TOM,
2230             "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2231             m->m_len, rcv_nxt, m->m_seq);
2232         /*
2233          * Figure out where the new data was placed in the buffer and store it
2234          * in when.  Assumes the buffer offset starts at 0, consumer needs to
2235          * account for page pod's pg_offset.
2236          */
2237         end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2238         m->m_cur_offset = end_offset - m->m_pkthdr.len;
2239
2240         rcv = so_sockbuf_rcv(so);
2241         sockbuf_lock(rcv);
2242
2243         m->m_ddp_gl = (unsigned char *)bsp->gl;
2244         m->m_flags |= M_DDP;
2245         bsp->cur_offset = end_offset;
2246         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2247
2248         /*
2249          * Length is only meaningful for kbuf
2250          */
2251         if (!(bsp->flags & DDP_BF_NOCOPY))
2252                 KASSERT(m->m_len <= bsp->gl->dgl_length,
2253                     ("length received exceeds ddp pages: len=%d dgl_length=%d",
2254                         m->m_len, bsp->gl->dgl_length));
2255
2256         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2257         KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2258         /*
2259          * Bit 0 of flags stores whether the DDP buffer is completed.
2260          * Note that other parts of the code depend on this being in bit 0.
2261          */
2262         if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2263                 panic("spurious ddp completion");
2264         } else {
2265                 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2266                 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2267                         q->cur_buf ^= 1;                     /* flip buffers */
2268         }
2269
2270         if (bsp->flags & DDP_BF_NOCOPY) {
2271                 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2272                 bsp->flags &= ~DDP_BF_NOCOPY;
2273         }
2274
2275         if (ddp_report & F_DDP_PSH)
2276                 m->m_ddp_flags |= DDP_BF_PSH;
2277         if (nomoredata)
2278                 m->m_ddp_flags |= DDP_BF_NODATA;
2279
2280 #ifdef notyet
2281         skb_reset_transport_header(skb);
2282         tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2283 #endif
2284         SBAPPEND(rcv, m);
2285
2286         if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2287             (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2288                 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2289                 so_sorwakeup_locked(so);
2290         else
2291                 sockbuf_unlock(rcv);
2292 }
2293
2294 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2295                  F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2296                  F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2297                  F_DDP_INVALID_PPOD)
2298
2299 /*
2300  * Handler for RX_DATA_DDP CPL messages.
2301  */
2302 static int
2303 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2304 {
2305         struct toepcb *toep = ctx;
2306         const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2307
2308         VALIDATE_SOCK(so);
2309
2310         if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2311                 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2312                        GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2313                 return (CPL_RET_BUF_DONE);
2314         }
2315 #if 0
2316         skb->h.th = tcphdr_skb->h.th;
2317 #endif
2318         new_rx_data_ddp(toep, m);
2319         return (0);
2320 }
2321
2322 static void
2323 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2324 {
2325         struct tcpcb *tp = toep->tp_tp;
2326         struct socket *so;
2327         struct ddp_state *q;
2328         struct ddp_buf_state *bsp;
2329         struct cpl_rx_ddp_complete *hdr;
2330         unsigned int ddp_report, buf_idx, when, delack_mode;
2331         int nomoredata = 0;
2332         struct sockbuf *rcv;
2333
2334         inp_wlock(tp->t_inpcb);
2335         so = inp_inpcbtosocket(tp->t_inpcb);
2336         inp_wlock(tp->t_inpcb);
2337
2338         if (__predict_false(so_no_receive(so))) {
2339                 struct inpcb *inp = so_sotoinpcb(so);
2340
2341                 handle_excess_rx(toep, m);
2342                 inp_wunlock(inp);
2343                 return;
2344         }
2345         q = &toep->tp_ddp_state;
2346         hdr = cplhdr(m);
2347         ddp_report = ntohl(hdr->ddp_report);
2348         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2349         m->m_pkthdr.csum_data = tp->rcv_nxt;
2350
2351         rcv = so_sockbuf_rcv(so);
2352         sockbuf_lock(rcv);
2353
2354         bsp = &q->buf_state[buf_idx];
2355         when = bsp->cur_offset;
2356         m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2357         tp->rcv_nxt += m->m_len;
2358         tp->t_rcvtime = ticks;
2359
2360         delack_mode = G_DDP_DACK_MODE(ddp_report);
2361         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2362                 toep->tp_delack_mode = delack_mode;
2363                 toep->tp_delack_seq = tp->rcv_nxt;
2364         }
2365 #ifdef notyet
2366         skb_reset_transport_header(skb);
2367         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2368 #endif
2369         inp_wunlock(tp->t_inpcb);
2370
2371         KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2372         CTR5(KTR_TOM,
2373                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2374                   "ddp_report 0x%x offset %u, len %u",
2375                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2376                    G_DDP_OFFSET(ddp_report), m->m_len);
2377
2378         m->m_cur_offset = bsp->cur_offset;
2379         bsp->cur_offset += m->m_len;
2380
2381         if (!(bsp->flags & DDP_BF_NOFLIP)) {
2382                 q->cur_buf ^= 1;                     /* flip buffers */
2383                 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2384                         nomoredata=1;
2385         }
2386
2387         CTR4(KTR_TOM,
2388                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2389                   "ddp_report %u offset %u",
2390                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2391                    G_DDP_OFFSET(ddp_report));
2392
2393         m->m_ddp_gl = (unsigned char *)bsp->gl;
2394         m->m_flags |= M_DDP;
2395         m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2396         if (bsp->flags & DDP_BF_NOCOPY)
2397                 bsp->flags &= ~DDP_BF_NOCOPY;
2398         if (nomoredata)
2399                 m->m_ddp_flags |= DDP_BF_NODATA;
2400
2401         SBAPPEND(rcv, m);
2402         if ((so_state_get(so) & SS_NOFDREF) == 0)
2403                 so_sorwakeup_locked(so);
2404         else
2405                 sockbuf_unlock(rcv);
2406 }
2407
2408 /*
2409  * Handler for RX_DDP_COMPLETE CPL messages.
2410  */
2411 static int
2412 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2413 {
2414         struct toepcb *toep = ctx;
2415
2416         VALIDATE_SOCK(so);
2417 #if 0
2418         skb->h.th = tcphdr_skb->h.th;
2419 #endif
2420         process_ddp_complete(toep, m);
2421         return (0);
2422 }
2423
2424 /*
2425  * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2426  * socket state before calling tcp_time_wait to comply with its expectations.
2427  */
2428 static void
2429 enter_timewait(struct tcpcb *tp)
2430 {
2431
2432         inp_wlock_assert(tp->t_inpcb);
2433         /*
2434          * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2435          * process peer_close because we don't want to carry the peer FIN in
2436          * the socket's receive queue and if we increment rcv_nxt without
2437          * having the FIN in the receive queue we'll confuse facilities such
2438          * as SIOCINQ.
2439          */
2440         inp_wlock(tp->t_inpcb);
2441         tp->rcv_nxt++;
2442
2443         tp->ts_recent_age = 0;       /* defeat recycling */
2444         tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2445         inp_wunlock(tp->t_inpcb);
2446         tcp_offload_twstart(tp);
2447 }
2448
2449 /*
2450  * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2451  * function deals with the data that may be reported along with the FIN.
2452  * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2453  * perform normal FIN-related processing.  In the latter case 1 indicates that
2454  * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2455  * skb can be freed.
2456  */
2457 static int
2458 handle_peer_close_data(struct socket *so, struct mbuf *m)
2459 {
2460         struct tcpcb *tp = so_sototcpcb(so);
2461         struct toepcb *toep = tp->t_toe;
2462         struct ddp_state *q;
2463         struct ddp_buf_state *bsp;
2464         struct cpl_peer_close *req = cplhdr(m);
2465         unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2466         struct sockbuf *rcv;
2467
2468         if (tp->rcv_nxt == rcv_nxt)                     /* no data */
2469                 return (0);
2470
2471         CTR0(KTR_TOM, "handle_peer_close_data");
2472         if (__predict_false(so_no_receive(so))) {
2473                 handle_excess_rx(toep, m);
2474
2475                 /*
2476                  * Although we discard the data we want to process the FIN so
2477                  * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2478                  * PEER_CLOSE without data.  In particular this PEER_CLOSE
2479                  * may be what will close the connection.  We return 1 because
2480                  * handle_excess_rx() already freed the packet.
2481                  */
2482                 return (1);
2483         }
2484
2485         inp_wlock_assert(tp->t_inpcb);
2486         q = &toep->tp_ddp_state;
2487         rcv = so_sockbuf_rcv(so);
2488         sockbuf_lock(rcv);
2489
2490         bsp = &q->buf_state[q->cur_buf];
2491         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2492         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2493         m->m_ddp_gl = (unsigned char *)bsp->gl;
2494         m->m_flags |= M_DDP;
2495         m->m_cur_offset = bsp->cur_offset;
2496         m->m_ddp_flags =
2497             DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2498         m->m_seq = tp->rcv_nxt;
2499         tp->rcv_nxt = rcv_nxt;
2500         bsp->cur_offset += m->m_pkthdr.len;
2501         if (!(bsp->flags & DDP_BF_NOFLIP))
2502                 q->cur_buf ^= 1;
2503 #ifdef notyet
2504         skb_reset_transport_header(skb);
2505         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2506 #endif
2507         tp->t_rcvtime = ticks;
2508         SBAPPEND(rcv, m);
2509         if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2510                 so_sorwakeup_locked(so);
2511         else
2512                 sockbuf_unlock(rcv);
2513
2514         return (1);
2515 }
2516
2517 /*
2518  * Handle a peer FIN.
2519  */
2520 static void
2521 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2522 {
2523         struct socket *so;
2524         struct tcpcb *tp = toep->tp_tp;
2525         int keep, action;
2526
2527         action = keep = 0;
2528         CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2529         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2530                 printf("abort_pending set\n");
2531
2532                 goto out;
2533         }
2534         inp_wlock(tp->t_inpcb);
2535         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2536         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2537                 keep = handle_peer_close_data(so, m);
2538                 if (keep < 0) {
2539                         inp_wunlock(tp->t_inpcb);
2540                         return;
2541                 }
2542         }
2543         if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2544                 CTR1(KTR_TOM,
2545                     "waking up waiters for cantrcvmore on %p ", so);
2546                 socantrcvmore(so);
2547
2548                 /*
2549                  * If connection is half-synchronized
2550                  * (ie NEEDSYN flag on) then delay ACK,
2551                  * so it may be piggybacked when SYN is sent.
2552                  * Otherwise, since we received a FIN then no
2553                  * more input can be expected, send ACK now.
2554                  */
2555                 if (tp->t_flags & TF_NEEDSYN)
2556                         tp->t_flags |= TF_DELACK;
2557                 else
2558                         tp->t_flags |= TF_ACKNOW;
2559                 tp->rcv_nxt++;
2560         }
2561
2562         switch (tp->t_state) {
2563         case TCPS_SYN_RECEIVED:
2564             tp->t_starttime = ticks;
2565         /* FALLTHROUGH */
2566         case TCPS_ESTABLISHED:
2567                 tp->t_state = TCPS_CLOSE_WAIT;
2568                 break;
2569         case TCPS_FIN_WAIT_1:
2570                 tp->t_state = TCPS_CLOSING;
2571                 break;
2572         case TCPS_FIN_WAIT_2:
2573                 /*
2574                  * If we've sent an abort_req we must have sent it too late,
2575                  * HW will send us a reply telling us so, and this peer_close
2576                  * is really the last message for this connection and needs to
2577                  * be treated as an abort_rpl, i.e., transition the connection
2578                  * to TCP_CLOSE (note that the host stack does this at the
2579                  * time of generating the RST but we must wait for HW).
2580                  * Otherwise we enter TIME_WAIT.
2581                  */
2582                 t3_release_offload_resources(toep);
2583                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2584                         action = TCP_CLOSE;
2585                 } else {
2586                         action = TCP_TIMEWAIT;
2587                 }
2588                 break;
2589         default:
2590                 log(LOG_ERR,
2591                        "%s: TID %u received PEER_CLOSE in bad state %d\n",
2592                     toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2593         }
2594         inp_wunlock(tp->t_inpcb);
2595
2596         if (action == TCP_TIMEWAIT) {
2597                 enter_timewait(tp);
2598         } else if (action == TCP_DROP) {
2599                 tcp_offload_drop(tp, 0);
2600         } else if (action == TCP_CLOSE) {
2601                 tcp_offload_close(tp);
2602         }
2603
2604 #ifdef notyet
2605         /* Do not send POLL_HUP for half duplex close. */
2606         if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2607             sk->sk_state == TCP_CLOSE)
2608                 sk_wake_async(so, 1, POLL_HUP);
2609         else
2610                 sk_wake_async(so, 1, POLL_IN);
2611 #endif
2612
2613 out:
2614         if (!keep)
2615                 m_free(m);
2616 }
2617
2618 /*
2619  * Handler for PEER_CLOSE CPL messages.
2620  */
2621 static int
2622 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2623 {
2624         struct toepcb *toep = (struct toepcb *)ctx;
2625
2626         VALIDATE_SOCK(so);
2627
2628         do_peer_fin(toep, m);
2629         return (0);
2630 }
2631
2632 static void
2633 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2634 {
2635         struct cpl_close_con_rpl *rpl = cplhdr(m);
2636         struct tcpcb *tp = toep->tp_tp;
2637         struct socket *so;
2638         int action = 0;
2639         struct sockbuf *rcv;
2640
2641         inp_wlock(tp->t_inpcb);
2642         so = inp_inpcbtosocket(tp->t_inpcb);
2643
2644         tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2645
2646         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2647                 inp_wunlock(tp->t_inpcb);
2648                 goto out;
2649         }
2650
2651         CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2652             tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2653
2654         switch (tp->t_state) {
2655         case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2656                 t3_release_offload_resources(toep);
2657                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2658                         action = TCP_CLOSE;
2659
2660                 } else {
2661                         action = TCP_TIMEWAIT;
2662                 }
2663                 break;
2664         case TCPS_LAST_ACK:
2665                 /*
2666                  * In this state we don't care about pending abort_rpl.
2667                  * If we've sent abort_req it was post-close and was sent too
2668                  * late, this close_con_rpl is the actual last message.
2669                  */
2670                 t3_release_offload_resources(toep);
2671                 action = TCP_CLOSE;
2672                 break;
2673         case TCPS_FIN_WAIT_1:
2674                 /*
2675                  * If we can't receive any more
2676                  * data, then closing user can proceed.
2677                  * Starting the timer is contrary to the
2678                  * specification, but if we don't get a FIN
2679                  * we'll hang forever.
2680                  *
2681                  * XXXjl:
2682                  * we should release the tp also, and use a
2683                  * compressed state.
2684                  */
2685                 if (so)
2686                         rcv = so_sockbuf_rcv(so);
2687                 else
2688                         break;
2689
2690                 if (rcv->sb_state & SBS_CANTRCVMORE) {
2691                         int timeout;
2692
2693                         if (so)
2694                                 soisdisconnected(so);
2695                         timeout = (tcp_fast_finwait2_recycle) ?
2696                             tcp_finwait2_timeout : tcp_maxidle;
2697                         tcp_timer_activate(tp, TT_2MSL, timeout);
2698                 }
2699                 tp->t_state = TCPS_FIN_WAIT_2;
2700                 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2701                     (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2702                         action = TCP_DROP;
2703                 }
2704
2705                 break;
2706         default:
2707                 log(LOG_ERR,
2708                        "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2709                        toep->tp_toedev->tod_name, toep->tp_tid,
2710                        tp->t_state);
2711         }
2712         inp_wunlock(tp->t_inpcb);
2713
2714
2715         if (action == TCP_TIMEWAIT) {
2716                 enter_timewait(tp);
2717         } else if (action == TCP_DROP) {
2718                 tcp_offload_drop(tp, 0);
2719         } else if (action == TCP_CLOSE) {
2720                 tcp_offload_close(tp);
2721         }
2722 out:
2723         m_freem(m);
2724 }
2725
/*
 * CPL dispatch entry for CLOSE_CON_RPL messages.  The handler context is the
 * connection's toepcb; all processing happens in process_close_con_rpl().
 * Always returns 0.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
        process_close_con_rpl((struct toepcb *)ctx, m);

        return (0);
}
2738
2739 /*
2740  * Process abort replies.  We only process these messages if we anticipate
2741  * them as the coordination between SW and HW in this area is somewhat lacking
2742  * and sometimes we get ABORT_RPLs after we are done with the connection that
2743  * originated the ABORT_REQ.
2744  */
2745 static void
2746 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2747 {
2748         struct tcpcb *tp = toep->tp_tp;
2749         struct socket *so;
2750         int needclose = 0;
2751
2752 #ifdef T3_TRACE
2753         T3_TRACE1(TIDTB(sk),
2754                   "process_abort_rpl: GTS rpl pending %d",
2755                   sock_flag(sk, ABORT_RPL_PENDING));
2756 #endif
2757
2758         inp_wlock(tp->t_inpcb);
2759         so = inp_inpcbtosocket(tp->t_inpcb);
2760
2761         if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2762                 /*
2763                  * XXX panic on tcpdrop
2764                  */
2765                 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2766                         toep->tp_flags |= TP_ABORT_RPL_RCVD;
2767                 else {
2768                         toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2769                         if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2770                             !is_t3a(toep->tp_toedev)) {
2771                                 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2772                                         panic("TP_ABORT_REQ_RCVD set");
2773                                 t3_release_offload_resources(toep);
2774                                 needclose = 1;
2775                         }
2776                 }
2777         }
2778         inp_wunlock(tp->t_inpcb);
2779
2780         if (needclose)
2781                 tcp_offload_close(tp);
2782
2783         m_free(m);
2784 }
2785
2786 /*
2787  * Handle an ABORT_RPL_RSS CPL message.
2788  */
2789 static int
2790 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2791 {
2792         struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2793         struct toepcb *toep;
2794
2795         /*
2796          * Ignore replies to post-close aborts indicating that the abort was
2797          * requested too late.  These connections are terminated when we get
2798          * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2799          * arrives the TID is either no longer used or it has been recycled.
2800          */
2801         if (rpl->status == CPL_ERR_ABORT_FAILED) {
2802 discard:
2803                 m_free(m);
2804                 return (0);
2805         }
2806
2807         toep = (struct toepcb *)ctx;
2808
2809         /*
2810          * Sometimes we've already closed the socket, e.g., a post-close
2811          * abort races with ABORT_REQ_RSS, the latter frees the socket
2812          * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2813          * but FW turns the ABORT_REQ into a regular one and so we get
2814          * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2815          */
2816         if (!toep)
2817                 goto discard;
2818
2819         if (toep->tp_tp == NULL) {
2820                 log(LOG_NOTICE, "removing tid for abort\n");
2821                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2822                 if (toep->tp_l2t)
2823                         l2t_release(L2DATA(cdev), toep->tp_l2t);
2824
2825                 toepcb_release(toep);
2826                 goto discard;
2827         }
2828
2829         log(LOG_NOTICE, "toep=%p\n", toep);
2830         log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2831
2832         toepcb_hold(toep);
2833         process_abort_rpl(toep, m);
2834         toepcb_release(toep);
2835         return (0);
2836 }
2837
2838 /*
2839  * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2840  * indicate whether RST should be sent in response.
2841  */
2842 static int
2843 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2844 {
2845         struct tcpcb *tp = so_sototcpcb(so);
2846
2847         switch (abort_reason) {
2848         case CPL_ERR_BAD_SYN:
2849 #if 0
2850                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);      // fall through
2851 #endif
2852         case CPL_ERR_CONN_RESET:
2853                 // XXX need to handle SYN_RECV due to crossed SYNs
2854                 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2855         case CPL_ERR_XMIT_TIMEDOUT:
2856         case CPL_ERR_PERSIST_TIMEDOUT:
2857         case CPL_ERR_FINWAIT2_TIMEDOUT:
2858         case CPL_ERR_KEEPALIVE_TIMEDOUT:
2859 #if 0
2860                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2861 #endif
2862                 return (ETIMEDOUT);
2863         default:
2864                 return (EIO);
2865         }
2866 }
2867
2868 static inline void
2869 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2870 {
2871         struct cpl_abort_rpl *rpl = cplhdr(m);
2872
2873         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2874         rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2875         m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2876
2877         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2878         rpl->cmd = cmd;
2879 }
2880
2881 static void
2882 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2883 {
2884         struct mbuf *reply_mbuf;
2885         struct cpl_abort_req_rss *req = cplhdr(m);
2886
2887         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2888         m_set_priority(m, CPL_PRIORITY_DATA);
2889         m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2890         set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2891         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2892         m_free(m);
2893 }
2894
2895 /*
2896  * Returns whether an ABORT_REQ_RSS message is a negative advice.
2897  */
2898 static inline int
2899 is_neg_adv_abort(unsigned int status)
2900 {
2901         return status == CPL_ERR_RTX_NEG_ADVICE ||
2902             status == CPL_ERR_PERSIST_NEG_ADVICE;
2903 }
2904
2905 static void
2906 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2907 {
2908         struct mbuf  *reply_mbuf;
2909         struct cpl_abort_req_rss *req = cplhdr(m);
2910
2911         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2912
2913         if (!reply_mbuf) {
2914                 /* Defer the reply.  Stick rst_status into req->cmd. */
2915                 req->status = rst_status;
2916                 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2917                 return;
2918         }
2919
2920         m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2921         set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2922         m_free(m);
2923
2924         /*
2925          * XXX need to sync with ARP as for SYN_RECV connections we can send
2926          * these messages while ARP is pending.  For other connection states
2927          * it's not a problem.
2928          */
2929         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2930 }
2931
2932 #ifdef notyet
/*
 * Placeholder for removing a SYN_RECV child socket from its parent.  This
 * function sits inside an `#ifdef notyet' region, and the statements after
 * CXGB_UNIMPLEMENTED() are wrapped in a second `notyet' guard.
 *
 * NOTE(review): the guarded statements use identifiers such as
 * request_sock, sk_user_data and tcp_sk() that are not declared in this
 * function; they look like leftovers from the Linux driver and would need
 * porting before this can be enabled -- confirm.
 */
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
        CXGB_UNIMPLEMENTED();
#ifdef notyet
        struct request_sock *req = child->sk_user_data;

        inet_csk_reqsk_queue_removed(parent, req);
        synq_remove(tcp_sk(child));
        __reqsk_free(req);
        child->sk_user_data = NULL;
#endif
}
2946
2947
/*
 * Performs the actual work to abort a SYN_RECV connection.
 *
 * child:  socket of the embryonic connection being aborted.
 * parent: listening socket the child belongs to.
 *
 * Nothing is done unless the parent is still in LISTEN state.  This
 * function sits inside an `#ifdef notyet' region.
 */
static void
do_abort_syn_rcv(struct socket *child, struct socket *parent)
{
        struct tcpcb *parenttp = so_sototcpcb(parent);
        struct tcpcb *childtp = so_sototcpcb(child);

        /*
         * If the server is still open we clean up the child connection,
         * otherwise the server already did the clean up as it was purging
         * its SYN queue and the skb was just sitting in its backlog.
         */
        if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
                cleanup_syn_rcv_conn(child, parent);
                /* Offload resources are released under the child's inpcb lock. */
                inp_wlock(childtp->t_inpcb);
                t3_release_offload_resources(childtp->t_toe);
                inp_wunlock(childtp->t_inpcb);
                /* The close is issued after the lock has been dropped. */
                tcp_offload_close(childtp);
        }
}
2970 #endif
2971
/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
 *
 * As written the function only invokes CXGB_UNIMPLEMENTED() and returns 0;
 * the real work is inside the `notyet' guard.  process_abort_req() treats a
 * 0 return as "the abort has been taken care of" and does not touch the
 * mbuf afterwards.
 *
 * NOTE(review): the guarded code uses `toep', which is not declared in this
 * function, and reads `ts_recent' through a struct socket pointer; it needs
 * porting before it can be enabled -- confirm.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();
#ifdef notyet
        struct socket *parent;
        struct toedev *tdev = toep->tp_toedev;
        struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
        struct socket *oreq = so->so_incomp;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;

        if (!oreq)
                return -1;        /* somehow we are not on the SYN queue */

        /* Find the listening socket through the server TID table. */
        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, oreq->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        /* Abort the child and reply (no RST) with the parent locked. */
        so_lock(parent);
        do_abort_syn_rcv(so, parent);
        send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
        so_unlock(parent);
#endif
        return (0);
}
3002
3003 /*
3004  * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
3005  * request except that we need to reply to it.
3006  */
3007 static void
3008 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
3009 {
3010         int rst_status = CPL_ABORT_NO_RST;
3011         const struct cpl_abort_req_rss *req = cplhdr(m);
3012         struct tcpcb *tp = toep->tp_tp;
3013         struct socket *so;
3014         int needclose = 0;
3015
3016         inp_wlock(tp->t_inpcb);
3017         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3018         if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3019                 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3020                 m_free(m);
3021                 goto skip;
3022         }
3023
3024         toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3025         /*
3026          * Three cases to consider:
3027          * a) We haven't sent an abort_req; close the connection.
3028          * b) We have sent a post-close abort_req that will get to TP too late
3029          *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3030          *    be ignored and the connection should be closed now.
3031          * c) We have sent a regular abort_req that will get to TP too late.
3032          *    That will generate an abort_rpl with status 0, wait for it.
3033          */
3034         if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3035             (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3036                 int error;
3037
3038                 error = abort_status_to_errno(so, req->status,
3039                     &rst_status);
3040                 so_error_set(so, error);
3041
3042                 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3043                         so_sorwakeup(so);
3044                 /*
3045                  * SYN_RECV needs special processing.  If abort_syn_rcv()
3046                  * returns 0 is has taken care of the abort.
3047                  */
3048                 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3049                         goto skip;
3050
3051                 t3_release_offload_resources(toep);
3052                 needclose = 1;
3053         }
3054         inp_wunlock(tp->t_inpcb);
3055
3056         if (needclose)
3057                 tcp_offload_close(tp);
3058
3059         send_abort_rpl(m, tdev, rst_status);
3060         return;
3061 skip:
3062         inp_wunlock(tp->t_inpcb);
3063 }
3064
3065 /*
3066  * Handle an ABORT_REQ_RSS CPL message.
3067  */
3068 static int
3069 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3070 {
3071         const struct cpl_abort_req_rss *req = cplhdr(m);
3072         struct toepcb *toep = (struct toepcb *)ctx;
3073
3074         if (is_neg_adv_abort(req->status)) {
3075                 m_free(m);
3076                 return (0);
3077         }
3078
3079         log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3080
3081         if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3082                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3083                 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3084
3085                 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3086                 if (toep->tp_l2t)
3087                         l2t_release(L2DATA(cdev), toep->tp_l2t);
3088
3089                 /*
3090                  *  Unhook
3091                  */
3092                 toep->tp_tp->t_toe = NULL;
3093                 toep->tp_tp->t_flags &= ~TF_TOE;
3094                 toep->tp_tp = NULL;
3095                 /*
3096                  * XXX need to call syncache_chkrst - but we don't
3097                  * have a way of doing that yet
3098                  */
3099                 toepcb_release(toep);
3100                 log(LOG_ERR, "abort for unestablished connection :-(\n");
3101                 return (0);
3102         }
3103         if (toep->tp_tp == NULL) {
3104                 log(LOG_NOTICE, "disconnected toepcb\n");
3105                 /* should be freed momentarily */
3106                 return (0);
3107         }
3108
3109
3110         toepcb_hold(toep);
3111         process_abort_req(toep, m, toep->tp_toedev);
3112         toepcb_release(toep);
3113         return (0);
3114 }
3115 #ifdef notyet
/*
 * Abort a passive open: tear down the SYN_RECV child and, on a Chelsio T3
 * device, turn the mbuf into a rejecting CPL_PASS_ACCEPT_RPL and send it;
 * on any other device the mbuf is simply freed.
 *
 * This function sits inside an `#ifdef notyet' region.
 */
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
        struct toedev *tdev = TOE_DEV(parent);

        do_abort_syn_rcv(child, parent);
        if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
                /* Only the two option words of the message are rewritten. */
                struct cpl_pass_accept_rpl *rpl = cplhdr(m);

                rpl->opt0h = htonl(F_TCAM_BYPASS);
                rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
                cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
        } else
                m_free(m);
}
3131 #endif
/*
 * Handle an ARP failure for a passive open on socket so.
 *
 * As written the function only invokes CXGB_UNIMPLEMENTED(); the real work
 * is inside the `notyet' guard.
 *
 * NOTE(review): with the guard disabled nothing here frees or forwards m;
 * whether that matters depends on what CXGB_UNIMPLEMENTED() does -- confirm.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();

#ifdef notyet
        struct t3cdev *cdev;
        struct socket *parent;
        struct socket *oreq;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;
        struct tcpcb *otp, *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;

        /*
         * If the connection is being aborted due to the parent listening
         * socket going away there's nothing to do, the ABORT_REQ will close
         * the connection.
         */
        if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
                m_free(m);
                return;
        }

        oreq = so->so_incomp;
        otp = so_sototcpcb(oreq);

        /* Find the listening socket through the server TID table. */
        cdev = T3C_DEV(so);
        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, otp->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        /* Abort the passive open with the parent locked. */
        so_lock(parent);
        pass_open_abort(so, parent, m);
        so_unlock(parent);
#endif
}
3169
/*
 * ARP-failure callback for a CPL_PASS_ACCEPT_RPL.  An ARP failure is treated
 * much like an ABORT_REQ_RSS in SYN_RECV, since both have to tear down a
 * SYN_RECV connection; the socket is recovered from the mbuf and the work
 * is passed to handle_pass_open_arp_failure().
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
        struct socket *so;

#ifdef notyet
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
        so = m_get_socket(m);
        handle_pass_open_arp_failure(so, m);
}
3185
3186 /*
3187  * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3188  */
3189 static void
3190 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3191 {
3192         struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3193         struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3194         unsigned int tid = GET_TID(req);
3195
3196         m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3197         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3198         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3199         rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3200         rpl->opt0h = htonl(F_TCAM_BYPASS);
3201         rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3202         rpl->opt2 = 0;
3203         rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3204 }
3205
3206 /*
3207  * Send a deferred reject to an accept request.
3208  */
3209 static void
3210 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3211 {
3212         struct mbuf *reply_mbuf;
3213
3214         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3215         mk_pass_accept_rpl(reply_mbuf, m);
3216         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3217         m_free(m);
3218 }
3219
3220 static void
3221 handle_syncache_event(int event, void *arg)
3222 {
3223         struct toepcb *toep = arg;
3224
3225         switch (event) {
3226         case TOE_SC_ENTRY_PRESENT:
3227                 /*
3228                  * entry already exists - free toepcb
3229                  * and l2t
3230                  */
3231                 printf("syncache entry present\n");
3232                 toepcb_release(toep);
3233                 break;
3234         case TOE_SC_DROP:
3235                 /*
3236                  * The syncache has given up on this entry
3237                  * either it timed out, or it was evicted
3238                  * we need to explicitly release the tid
3239                  */
3240                 printf("syncache entry dropped\n");
3241                 toepcb_release(toep);
3242                 break;
3243         default:
3244                 log(LOG_ERR, "unknown syncache event %d\n", event);
3245                 break;
3246         }
3247 }
3248
3249 static void
3250 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3251 {
3252         struct in_conninfo inc;
3253         struct tcpopt to;
3254         struct tcphdr th;
3255         struct inpcb *inp;
3256         int mss, wsf, sack, ts;
3257         uint32_t rcv_isn = ntohl(req->rcv_isn);
3258
3259         bzero(&to, sizeof(struct tcpopt));
3260         inp = so_sotoinpcb(lso);
3261
3262         /*
3263          * Fill out information for entering us into the syncache
3264          */
3265         bzero(&inc, sizeof(inc));
3266         inc.inc_fport = th.th_sport = req->peer_port;
3267         inc.inc_lport = th.th_dport = req->local_port;
3268         th.th_seq = req->rcv_isn;
3269         th.th_flags = TH_SYN;
3270
3271         toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3272
3273         inc.inc_len = 0;
3274         inc.inc_faddr.s_addr = req->peer_ip;
3275         inc.inc_laddr.s_addr = req->local_ip;
3276
3277         DPRINTF("syncache add of %d:%d %d:%d\n",
3278             ntohl(req->local_ip), ntohs(req->local_port),
3279             ntohl(req->peer_ip), ntohs(req->peer_port));
3280
3281         mss = req->tcp_options.mss;
3282         wsf = req->tcp_options.wsf;
3283         ts = req->tcp_options.tstamp;
3284         sack = req->tcp_options.sack;
3285         to.to_mss = mss;
3286         to.to_wscale = wsf;
3287         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3288         tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3289 }
3290
3291
3292 /*
3293  * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3294  * lock held.  Note that the sock here is a listening socket that is not owned
3295  * by the TOE.
3296  */
3297 static void
3298 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3299     struct listen_ctx *lctx)
3300 {
3301         int rt_flags;
3302         struct l2t_entry *e;
3303         struct iff_mac tim;
3304         struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3305         struct cpl_pass_accept_rpl *rpl;
3306         struct cpl_pass_accept_req *req = cplhdr(m);
3307         unsigned int tid = GET_TID(req);
3308         struct tom_data *d = TOM_DATA(tdev);
3309         struct t3cdev *cdev = d->cdev;
3310         struct tcpcb *tp = so_sototcpcb(so);
3311         struct toepcb *newtoep;
3312         struct rtentry *dst;
3313         struct sockaddr_in nam;
3314         struct t3c_data *td = T3C_DATA(cdev);
3315
3316         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3317         if (__predict_false(reply_mbuf == NULL)) {
3318                 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3319                         t3_defer_reply(m, tdev, reject_pass_request);
3320                 else {
3321                         cxgb_queue_tid_release(cdev, tid);
3322                         m_free(m);
3323                 }
3324                 DPRINTF("failed to get reply_mbuf\n");
3325
3326                 goto out;
3327         }
3328
3329         if (tp->t_state != TCPS_LISTEN) {
3330                 DPRINTF("socket not in listen state\n");
3331
3332                 goto reject;
3333         }
3334
3335         tim.mac_addr = req->dst_mac;
3336         tim.vlan_tag = ntohs(req->vlan_tag);
3337         if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3338                 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3339                 goto reject;
3340         }
3341
3342 #ifdef notyet
3343         /*
3344          * XXX do route lookup to confirm that we're still listening on this
3345          * address
3346          */
3347         if (ip_route_input(skb, req->local_ip, req->peer_ip,
3348                            G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3349                 goto reject;
3350         rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3351                 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3352         dst_release(skb->dst);  // done with the input route, release it
3353         skb->dst = NULL;
3354
3355         if ((rt_flags & RTF_LOCAL) == 0)
3356                 goto reject;
3357 #endif
3358         /*
3359          * XXX
3360          */
3361         rt_flags = RTF_LOCAL;
3362         if ((rt_flags & RTF_LOCAL) == 0)
3363                 goto reject;
3364
3365         /*
3366          * Calculate values and add to syncache
3367          */
3368
3369         newtoep = toepcb_alloc();
3370         if (newtoep == NULL)
3371                 goto reject;
3372
3373         bzero(&nam, sizeof(struct sockaddr_in));
3374
3375         nam.sin_len = sizeof(struct sockaddr_in);
3376         nam.sin_family = AF_INET;
3377         nam.sin_addr.s_addr =req->peer_ip;
3378         dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3379
3380         if (dst == NULL) {
3381                 printf("failed to find route\n");
3382                 goto reject;
3383         }
3384         e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3385             (struct sockaddr *)&nam);
3386         if (e == NULL) {
3387                 DPRINTF("failed to get l2t\n");
3388         }
3389         /*
3390          * Point to our listen socket until accept
3391          */
3392         newtoep->tp_tp = tp;
3393         newtoep->tp_flags = TP_SYN_RCVD;
3394         newtoep->tp_tid = tid;
3395         newtoep->tp_toedev = tdev;
3396         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3397
3398         cxgb_insert_tid(cdev, d->client, newtoep, tid);
3399         so_lock(so);
3400         LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3401         so_unlock(so);
3402
3403         newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3404                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3405
3406         if (newtoep->tp_ulp_mode) {
3407                 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3408
3409                 if (ddp_mbuf == NULL)
3410                         newtoep->tp_ulp_mode = 0;
3411         }
3412
3413         CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3414             TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3415         set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3416         /*
3417          * XXX workaround for lack of syncache drop
3418          */
3419         toepcb_hold(newtoep);
3420         syncache_add_accept_req(req, so, newtoep);
3421
3422         rpl = cplhdr(reply_mbuf);
3423         reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3424         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3425         rpl->wr.wr_lo = 0;
3426         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3427         rpl->opt2 = htonl(calc_opt2(so, tdev));
3428         rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3429         rpl->peer_ip = req->peer_ip;    // req->peer_ip is not overwritten
3430
3431         rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3432             V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3433         rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3434                                   CPL_PASS_OPEN_ACCEPT);
3435
3436         DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3437
3438         m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3439
3440         l2t_send(cdev, reply_mbuf, e);
3441         m_free(m);
3442         if (newtoep->tp_ulp_mode) {
3443                 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3444                                 V_TF_DDP_OFF(1) |
3445                                 TP_DDP_TIMER_WORKAROUND_MASK,
3446                                 V_TF_DDP_OFF(1) |
3447                     TP_DDP_TIMER_WORKAROUND_VAL, 1);
3448         } else
3449                 DPRINTF("no DDP\n");
3450
3451         return;
3452 reject:
3453         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3454                 mk_pass_accept_rpl(reply_mbuf, m);
3455         else
3456                 mk_tid_release(reply_mbuf, newtoep, tid);
3457         cxgb_ofld_send(cdev, reply_mbuf);
3458         m_free(m);
3459 out:
3460 #if 0
3461         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3462 #else
3463         return;
3464 #endif
3465 }
3466
/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 *
 * The handler context is the listen context registered for the server TID;
 * the listening socket and the offload device are taken from it and the
 * request is passed to process_pass_accept_req().  Returns 0 on the normal
 * path.
 *
 * NOTE(review): the VALIDATE_TID section uses `lsk', `printk' and
 * `unlikely', none of which are declared in this function; it looks like
 * unported Linux code and would need work before VALIDATE_TID can be turned
 * on -- confirm.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
        struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
        struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
        struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
        struct cpl_pass_accept_req *req = cplhdr(m);
        unsigned int tid = GET_TID(req);
        struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

        /* No listening socket behind the server TID. */
        if (unlikely(!lsk)) {
                printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
                       cdev->name,
                       (unsigned long)((union listen_entry *)ctx -
                                        t->stid_tab));
                return CPL_RET_BUF_DONE;
        }
        /* The new connection's TID is outside the TID table. */
        if (unlikely(tid >= t->ntids)) {
                printk(KERN_ERR "%s: passive open TID %u too large\n",
                       cdev->name, tid);
                return CPL_RET_BUF_DONE;
        }
        /*
         * For T3A the current user of the TID may have closed but its last
         * message(s) may have been backlogged so the TID appears to be still
         * in use.  Just take the TID away, the connection can close at its
         * own leisure.  For T3B this situation is a bug.
         */
        if (!valid_new_tid(t, tid) &&
            cdev->type != T3A) {
                printk(KERN_ERR "%s: passive open uses existing TID %u\n",
                       cdev->name, tid);
                return CPL_RET_BUF_DONE;
        }
#endif

        process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
        return (0);
}
3511
3512 /*
3513  * Called when a connection is established to translate the TCP options
3514  * reported by HW to FreeBSD's native format.
3515  */
3516 static void
3517 assign_rxopt(struct socket *so, unsigned int opt)
3518 {
3519         struct tcpcb *tp = so_sototcpcb(so);
3520         struct toepcb *toep = tp->t_toe;
3521         const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3522
3523         inp_wlock_assert(tp->t_inpcb);
3524
3525         toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3526         tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3527         tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3528         tp->t_flags         |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3529         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3530             (TF_RCVD_SCALE|TF_REQ_SCALE))
3531                 tp->rcv_scale = tp->request_r_scale;
3532 }
3533
3534 /*
3535  * Completes some final bits of initialization for just established connections
3536  * and changes their state to TCP_ESTABLISHED.
3537  *
3538  * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3539  */
3540 static void
3541 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3542 {
3543         struct tcpcb *tp = so_sototcpcb(so);
3544         struct toepcb *toep = tp->t_toe;
3545
3546         toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3547         assign_rxopt(so, opt);
3548
3549         /*
3550          *XXXXXXXXXXX
3551          *
3552          */
3553 #ifdef notyet
3554         so->so_proto->pr_ctloutput = t3_ctloutput;
3555 #endif
3556
3557 #if 0
3558         inet_sk(sk)->id = tp->write_seq ^ jiffies;
3559 #endif
3560         /*
3561          * XXX not clear what rcv_wup maps to
3562          */
3563         /*
3564          * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3565          * pass through opt0.
3566          */
3567         if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3568                 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3569
3570         dump_toepcb(toep);
3571
3572 #ifdef notyet
3573 /*
3574  * no clean interface for marking ARP up to date
3575  */
3576         dst_confirm(sk->sk_dst_cache);
3577 #endif
3578         tp->t_starttime = ticks;
3579         tp->t_state = TCPS_ESTABLISHED;
3580         soisconnected(so);
3581 }
3582
3583 static int
3584 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3585 {
3586
3587         struct in_conninfo inc;
3588         struct tcpopt to;
3589         struct tcphdr th;
3590         int mss, wsf, sack, ts;
3591         struct mbuf *m = NULL;
3592         const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3593         unsigned int opt;
3594
3595 #ifdef MAC
3596 #error  "no MAC support"
3597 #endif
3598
3599         opt = ntohs(req->tcp_opt);
3600
3601         bzero(&to, sizeof(struct tcpopt));
3602
3603         /*
3604          * Fill out information for entering us into the syncache
3605          */
3606         bzero(&inc, sizeof(inc));
3607         inc.inc_fport = th.th_sport = req->peer_port;
3608         inc.inc_lport = th.th_dport = req->local_port;
3609         th.th_seq = req->rcv_isn;
3610         th.th_flags = TH_ACK;
3611
3612         inc.inc_len = 0;
3613         inc.inc_faddr.s_addr = req->peer_ip;
3614         inc.inc_laddr.s_addr = req->local_ip;
3615
3616         mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3617         wsf  = G_TCPOPT_WSCALE_OK(opt);
3618         ts   = G_TCPOPT_TSTAMP(opt);
3619         sack = G_TCPOPT_SACK(opt);
3620
3621         to.to_mss = mss;
3622         to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3623         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3624
3625         DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3626             ntohl(req->local_ip), ntohs(req->local_port),
3627             ntohl(req->peer_ip), ntohs(req->peer_port),
3628             mss, wsf, ts, sack);
3629         return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3630 }
3631
3632
/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 *
 * ctx is the toepcb of the embryonic connection.  The toepcb is taken off
 * the SYN queue, the syncache entry is expanded into a socket, the toepcb
 * and the new tcpcb are tied to each other and the connection is marked
 * established.  The message mbuf is freed here; returns 0.
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
        struct cpl_pass_establish *req = cplhdr(m);
        struct toepcb *toep = (struct toepcb *)ctx;
        struct tcpcb *tp = toep->tp_tp;
        struct socket *so, *lso;        /* NOTE(review): lso is assigned but never read */
        struct t3c_data *td = T3C_DATA(cdev);
        struct sockbuf *snd, *rcv;

        // Complete socket initialization now that we have the SND_ISN

        struct toedev *tdev;


        tdev = toep->tp_toedev;

        /*
         * Look up the socket behind the tcpcb currently recorded in the
         * toepcb (presumably the listener's -- TODO confirm).
         */
        inp_wlock(tp->t_inpcb);

        /*
         *
         * XXX need to add reference while we're manipulating
         */
        so = lso = inp_inpcbtosocket(tp->t_inpcb);

        inp_wunlock(tp->t_inpcb);

        /* Unlink the toepcb from the SYN queue under that socket's lock. */
        so_lock(so);
        LIST_REMOVE(toep, synq_entry);
        so_unlock(so);

        /* `so' is passed by reference and may be replaced by the expansion. */
        if (!syncache_expand_establish_req(req, &so, toep)) {
                /*
                 * No entry
                 */
                CXGB_UNIMPLEMENTED();
        }
        if (so == NULL) {
                /*
                 * Couldn't create the socket
                 */
                CXGB_UNIMPLEMENTED();
        }

        /* From here on tp is the tcpcb of the socket in `so'. */
        tp = so_sototcpcb(so);
        inp_wlock(tp->t_inpcb);

        snd = so_sockbuf_snd(so);
        rcv = so_sockbuf_rcv(so);

        snd->sb_flags |= SB_NOCOALESCE;
        rcv->sb_flags |= SB_NOCOALESCE;

        /* Tie the toepcb and the tcpcb together and reset offload state. */
        toep->tp_tp = tp;
        toep->tp_flags = 0;
        tp->t_toe = toep;
        reset_wr_list(toep);
        tp->rcv_wnd = select_rcv_wnd(tdev, so);
        tp->rcv_nxt = toep->tp_copied_seq;
        install_offload_ops(so);

        /* All work-request credits are available on a new connection. */
        toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
        toep->tp_wr_unacked = 0;
        /* The queue set number is decoded from the mbuf's csum_data word. */
        toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
        toep->tp_qset_idx = 0;
        toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);

        /*
         * XXX Cancel any keep alive timer
         */

        make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));

        /*
         * XXX workaround for lack of syncache drop
         *
         * NOTE(review): toep is still dereferenced below, so this is
         * presumably not the last reference -- confirm.
         */
        toepcb_release(toep);
        inp_wunlock(tp->t_inpcb);

        CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
        cxgb_log_tcb(cdev->adapter, toep->tp_tid);
#ifdef notyet
        /*
         * XXX not sure how these checks map to us
         */
        if (unlikely(sk->sk_socket)) {   // simultaneous opens only
                sk->sk_state_change(sk);
                sk_wake_async(so, 0, POLL_OUT);
        }
        /*
         * The state for the new connection is now up to date.
         * Next check if we should add the connection to the parent's
         * accept queue.  When the parent closes it resets connections
         * on its SYN queue, so check if we are being reset.  If so we
         * don't need to do anything more, the coming ABORT_RPL will
         * destroy this socket.  Otherwise move the connection to the
         * accept queue.
         *
         * Note that we reset the synq before closing the server so if
         * we are not being reset the stid is still open.
         */
        if (unlikely(!tp->forward_skb_hint)) { // removed from synq
                __kfree_skb(skb);
                goto unlock;
        }
#endif
        m_free(m);

        return (0);
}
3747
3748 /*
3749  * Fill in the right TID for CPL messages waiting in the out-of-order queue
3750  * and send them to the TOE.
3751  */
3752 static void
3753 fixup_and_send_ofo(struct toepcb *toep)
3754 {
3755         struct mbuf *m;
3756         struct toedev *tdev = toep->tp_toedev;
3757         struct tcpcb *tp = toep->tp_tp;
3758         unsigned int tid = toep->tp_tid;
3759
3760         log(LOG_NOTICE, "fixup_and_send_ofo\n");
3761
3762         inp_wlock_assert(tp->t_inpcb);
3763         while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3764                 /*
3765                  * A variety of messages can be waiting but the fields we'll
3766                  * be touching are common to all so any message type will do.
3767                  */
3768                 struct cpl_close_con_req *p = cplhdr(m);
3769
3770                 p->wr.wr_lo = htonl(V_WR_TID(tid));
3771                 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3772                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3773         }
3774 }
3775
3776 /*
3777  * Updates socket state from an active establish CPL message.  Runs with the
3778  * socket lock held.
3779  */
3780 static void
3781 socket_act_establish(struct socket *so, struct mbuf *m)
3782 {
3783         struct cpl_act_establish *req = cplhdr(m);
3784         u32 rcv_isn = ntohl(req->rcv_isn);      /* real RCV_ISN + 1 */
3785         struct tcpcb *tp = so_sototcpcb(so);
3786         struct toepcb *toep = tp->t_toe;
3787
3788         if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3789                 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3790                     toep->tp_tid, tp->t_state);
3791
3792         tp->ts_recent_age = ticks;
3793         tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3794         toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3795
3796         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3797
3798         /*
3799          * Now that we finally have a TID send any CPL messages that we had to
3800          * defer for lack of a TID.
3801          */
3802         if (mbufq_len(&toep->out_of_order_queue))
3803                 fixup_and_send_ofo(toep);
3804
3805         if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3806                 /*
3807                  * XXX does this even make sense?
3808                  */
3809                 so_sorwakeup(so);
3810         }
3811         m_free(m);
3812 #ifdef notyet
3813 /*
3814  * XXX assume no write requests permitted while socket connection is
3815  * incomplete
3816  */
3817         /*
3818          * Currently the send queue must be empty at this point because the
3819          * socket layer does not send anything before a connection is
3820          * established.  To be future proof though we handle the possibility
3821          * that there are pending buffers to send (either TX_DATA or
3822          * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3823          * buffers according to the just learned write_seq, and then we send
3824          * them on their way.
3825          */
3826         fixup_pending_writeq_buffers(sk);
3827         if (t3_push_frames(so, 1))
3828                 sk->sk_write_space(sk);
3829 #endif
3830
3831         toep->tp_state = tp->t_state;
3832         tcpstat.tcps_connects++;
3833
3834 }
3835
3836 /*
3837  * Process a CPL_ACT_ESTABLISH message.
3838  */
3839 static int
3840 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3841 {
3842         struct cpl_act_establish *req = cplhdr(m);
3843         unsigned int tid = GET_TID(req);
3844         unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3845         struct toepcb *toep = (struct toepcb *)ctx;
3846         struct tcpcb *tp = toep->tp_tp;
3847         struct socket *so;
3848         struct toedev *tdev;
3849         struct tom_data *d;
3850
3851         if (tp == NULL) {
3852                 free_atid(cdev, atid);
3853                 return (0);
3854         }
3855         inp_wlock(tp->t_inpcb);
3856
3857         /*
3858          * XXX
3859          */
3860         so = inp_inpcbtosocket(tp->t_inpcb);
3861         tdev = toep->tp_toedev; /* blow up here if link was down */
3862         d = TOM_DATA(tdev);
3863         inp_wlock(tp->t_inpcb);
3864
3865         /*
3866          * It's OK if the TID is currently in use, the owning socket may have
3867          * backlogged its last CPL message(s).  Just take it away.
3868          */
3869         toep->tp_tid = tid;
3870         toep->tp_tp = tp;
3871         so_insert_tid(d, toep, tid);
3872         free_atid(cdev, atid);
3873         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3874
3875         socket_act_establish(so, m);
3876         inp_wunlock(tp->t_inpcb);
3877         CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3878         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3879
3880         return (0);
3881 }
3882
/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 *
 * toep: offload state of the connection
 * m:    the CPL_WR_ACK message; it is freed here
 *
 * The returned credits are matched against the pending work requests in
 * order.  Fully acknowledged requests are dequeued and freed and their
 * payload length is dropped from the send buffer.  The inpcb write lock is
 * taken and released inside this function.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
        struct tcpcb *tp = toep->tp_tp;
        struct cpl_wr_ack *hdr = cplhdr(m);
        struct socket *so;
        unsigned int credits = ntohs(hdr->credits);
        u32 snd_una = ntohl(hdr->snd_una);
        int bytes = 0;          /* payload bytes covered by completed WRs */
        struct sockbuf *snd;

        CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);

        inp_wlock(tp->t_inpcb);
        so = inp_inpcbtosocket(tp->t_inpcb);

        /*
         * Return the credits and keep tp_wr_unacked no larger than the
         * number of credits still outstanding.
         */
        toep->tp_wr_avail += credits;
        if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
                toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

        while (credits) {
                struct mbuf *p = peek_wr(toep);

                if (__predict_false(!p)) {
                        log(LOG_ERR, "%u WR_ACK credits for TID %u with "
                            "nothing pending, state %u wr_avail=%u\n",
                            credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
                        break;
                }
                /* csum_data of a queued WR holds the credits it still needs. */
                CTR2(KTR_TOM,
                        "wr_ack: p->credits=%d p->bytes=%d",
                    p->m_pkthdr.csum_data, p->m_pkthdr.len);
                KASSERT(p->m_pkthdr.csum_data != 0,
                    ("empty request still on list"));

                if (__predict_false(credits < p->m_pkthdr.csum_data)) {

                        /*
                         * NOTE(review): the debug block below uses
                         * p->csum, p->len and skb_shinfo(), none of which
                         * are mbuf interfaces -- confirm it builds before
                         * raising DEBUG_WR.
                         */
#if DEBUG_WR > 1
                        struct tx_data_wr *w = cplhdr(p);
                        log(LOG_ERR,
                               "TID %u got %u WR credits, need %u, len %u, "
                               "main body %u, frags %u, seq # %u, ACK una %u,"
                               " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
                               toep->tp_tid, credits, p->csum, p->len,
                               p->len - p->data_len, skb_shinfo(p)->nr_frags,
                               ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
                            toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
                        /*
                         * Partial acknowledgment: note what is still owed
                         * for this WR and stop.
                         */
                        p->m_pkthdr.csum_data -= credits;
                        break;
                } else {
                        /* WR fully acknowledged: retire it. */
                        dequeue_wr(toep);
                        credits -= p->m_pkthdr.csum_data;
                        bytes += p->m_pkthdr.len;
                        CTR3(KTR_TOM,
                            "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
                            p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);

                        m_free(p);
                }
        }

#if DEBUG_WR
        check_wr_invariants(tp);
#endif

        /* An ACK that moves snd_una backwards is ignored. */
        if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
                /*
                 * NOTE(review): the log() call in this disabled block has
                 * no comma after LOG_ERR -- fix before enabling
                 * VALIDATE_SEQ.
                 */
#if VALIDATE_SEQ
                struct tom_data *d = TOM_DATA(TOE_DEV(so));

                log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
                    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
                    toep->tp_tid, tp->snd_una);
#endif
                goto out_free;
        }

        if (tp->snd_una != snd_una) {
                tp->snd_una = snd_una;
                tp->ts_recent_age = ticks;
#ifdef notyet
                /*
                 * Keep ARP entry "minty fresh"
                 */
                dst_confirm(sk->sk_dst_cache);
#endif
                /* Everything sent has been acknowledged. */
                if (tp->snd_una == tp->snd_nxt)
                        toep->tp_flags &= ~TP_TX_WAIT_IDLE;
        }

        snd = so_sockbuf_snd(so);
        if (bytes) {
                CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
                /* NOTE(review): snd was already fetched just above. */
                snd = so_sockbuf_snd(so);
                sockbuf_lock(snd);
                sbdrop_locked(snd, bytes);
                /*
                 * No unlock appears here; so_sowwakeup_locked() presumably
                 * drops the sockbuf lock -- TODO confirm.
                 */
                so_sowwakeup_locked(so);
        }

        /* Data remains beyond the send pointer: push more frames. */
        if (snd->sb_sndptroff < snd->sb_cc)
                t3_push_frames(so, 0);

out_free:
        inp_wunlock(tp->t_inpcb);
        m_free(m);
}
3993
3994 /*
3995  * Handler for TX_DATA_ACK CPL messages.
3996  */
3997 static int
3998 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3999 {
4000         struct toepcb *toep = (struct toepcb *)ctx;
4001
4002         VALIDATE_SOCK(so);
4003
4004         wr_ack(toep, m);
4005         return 0;
4006 }
4007
/*
 * CPL handler for TRACE_PKT messages.  These packets are not processed:
 * the mbuf chain is freed and 0 is returned.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{

        m_freem(m);
        return (0);
}
4017
4018 /*
4019  * Reset a connection that is on a listener's SYN queue or accept queue,
4020  * i.e., one that has not had a struct socket associated with it.
4021  * Must be called from process context.
4022  *
4023  * Modeled after code in inet_csk_listen_stop().
4024  */
4025 static void
4026 t3_reset_listen_child(struct socket *child)
4027 {
4028         struct tcpcb *tp = so_sototcpcb(child);
4029
4030         t3_send_reset(tp->t_toe);
4031 }
4032
4033
4034 static void
4035 t3_child_disconnect(struct socket *so, void *arg)
4036 {
4037         struct tcpcb *tp = so_sototcpcb(so);
4038
4039         if (tp->t_flags & TF_TOE) {
4040                 inp_wlock(tp->t_inpcb);
4041                 t3_reset_listen_child(so);
4042                 inp_wunlock(tp->t_inpcb);
4043         }
4044 }
4045
4046 /*
4047  * Disconnect offloaded established but not yet accepted connections sitting
4048  * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4049  * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4050  */
4051 void
4052 t3_disconnect_acceptq(struct socket *listen_so)
4053 {
4054
4055         so_lock(listen_so);
4056         so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4057         so_unlock(listen_so);
4058 }
4059
4060 /*
4061  * Reset offloaded connections sitting on a server's syn queue.  As above
4062  * we send ABORT_REQ and finish off when we get ABORT_RPL.
4063  */
4064
4065 void
4066 t3_reset_synq(struct listen_ctx *lctx)
4067 {
4068         struct toepcb *toep;
4069
4070         so_lock(lctx->lso);
4071         while (!LIST_EMPTY(&lctx->synq_head)) {
4072                 toep = LIST_FIRST(&lctx->synq_head);
4073                 LIST_REMOVE(toep, synq_entry);
4074                 toep->tp_tp = NULL;
4075                 t3_send_reset(toep);
4076                 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4077                 toepcb_release(toep);
4078         }
4079         so_unlock(lctx->lso);
4080 }
4081
4082
4083 int
4084 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4085                    unsigned int nppods, unsigned int tag, unsigned int maxoff,
4086                    unsigned int pg_off, unsigned int color)
4087 {
4088         unsigned int i, j, pidx;
4089         struct pagepod *p;
4090         struct mbuf *m;
4091         struct ulp_mem_io *req;
4092         unsigned int tid = toep->tp_tid;
4093         const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4094         unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4095
4096         CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4097             gl, nppods, tag, maxoff, pg_off, color);
4098
4099         for (i = 0; i < nppods; ++i) {
4100                 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4101                 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4102                 req = mtod(m, struct ulp_mem_io *);
4103                 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4104                 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4105                 req->wr.wr_lo = 0;
4106                 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4107                                            V_ULPTX_CMD(ULP_MEM_WRITE));
4108                 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4109                                  V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4110
4111                 p = (struct pagepod *)(req + 1);
4112                 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4113                         p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4114                         p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4115                                                   V_PPOD_COLOR(color));
4116                         p->pp_max_offset = htonl(maxoff);
4117                         p->pp_page_offset = htonl(pg_off);
4118                         p->pp_rsvd = 0;
4119                         for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4120                                 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4121                                     htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4122                 } else
4123                         p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4124                 send_or_defer(toep, m, 0);
4125                 ppod_addr += PPOD_SIZE;
4126         }
4127         return (0);
4128 }
4129
4130 /*
4131  * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4132  */
4133 static inline void
4134 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4135 {
4136         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4137
4138         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4139         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4140         b->opcode = CPL_BARRIER;
4141 }
4142
4143 /*
4144  * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4145  */
4146 static inline void
4147 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4148 {
4149         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4150
4151         txpkt = (struct ulp_txpkt *)req;
4152         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4153         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4154         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4155         req->cpuno = htons(cpuno);
4156 }
4157
4158 /*
4159  * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4160  */
4161 static inline void
4162 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4163                      unsigned int word, uint64_t mask, uint64_t val)
4164 {
4165         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4166
4167         CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4168             tid, word, mask, val);
4169
4170         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4171         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4172         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4173         req->reply = V_NO_REPLY(1);
4174         req->cpu_idx = 0;
4175         req->word = htons(word);
4176         req->mask = htobe64(mask);
4177         req->val = htobe64(val);
4178 }
4179
4180 /*
4181  * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4182  */
4183 static void
4184 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4185     unsigned int tid, unsigned int credits)
4186 {
4187         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4188
4189         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4190         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4191         OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4192         ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4193             V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4194                                  V_RX_CREDITS(credits));
4195 }
4196
4197 void
4198 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4199 {
4200         unsigned int wrlen;
4201         struct mbuf *m;
4202         struct work_request_hdr *wr;
4203         struct cpl_barrier *lock;
4204         struct cpl_set_tcb_field *req;
4205         struct cpl_get_tcb *getreq;
4206         struct ddp_state *p = &toep->tp_ddp_state;
4207
4208 #if 0
4209         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4210 #endif
4211         wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4212                 sizeof(*getreq);
4213         m = m_gethdr_nofail(wrlen);
4214         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4215         wr = mtod(m, struct work_request_hdr *);
4216         bzero(wr, wrlen);
4217
4218         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4219         m->m_pkthdr.len = m->m_len = wrlen;
4220
4221         lock = (struct cpl_barrier *)(wr + 1);
4222         mk_cpl_barrier_ulp(lock);
4223
4224         req = (struct cpl_set_tcb_field *)(lock + 1);
4225
4226         CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4227
4228         /* Hmmm, not sure if this actually a good thing: reactivating
4229          * the other buffer might be an issue if it has been completed
4230          * already. However, that is unlikely, since the fact that the UBUF
4231          * is not completed indicates that there is no oustanding data.
4232          */
4233         if (bufidx == 0)
4234                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4235                                      V_TF_DDP_ACTIVE_BUF(1) |
4236                                      V_TF_DDP_BUF0_VALID(1),
4237                                      V_TF_DDP_ACTIVE_BUF(1));
4238         else
4239                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4240                                      V_TF_DDP_ACTIVE_BUF(1) |
4241                                      V_TF_DDP_BUF1_VALID(1), 0);
4242
4243         getreq = (struct cpl_get_tcb *)(req + 1);
4244         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4245
4246         mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4247
4248         /* Keep track of the number of oustanding CPL_GET_TCB requests
4249          */
4250         p->get_tcb_count++;
4251
4252 #ifdef T3_TRACE
4253         T3_TRACE1(TIDTB(so),
4254                   "t3_cancel_ddpbuf: bufidx %u", bufidx);
4255 #endif
4256         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4257 }
4258
4259 /**
4260  * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4261  * @sk: the socket associated with the buffers
4262  * @bufidx: index of HW DDP buffer (0 or 1)
4263  * @tag0: new tag for HW buffer 0
4264  * @tag1: new tag for HW buffer 1
4265  * @len: new length for HW buf @bufidx
4266  *
4267  * Sends a compound WR to overlay a new DDP buffer on top of an existing
4268  * buffer by changing the buffer tag and length and setting the valid and
4269  * active flag accordingly.  The caller must ensure the new buffer is at
4270  * least as big as the existing one.  Since we typically reprogram both HW
4271  * buffers this function sets both tags for convenience. Read the TCB to
4272  * determine how made data was written into the buffer before the overlay
4273  * took place.
4274  */
4275 void
4276 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4277                        unsigned int tag1, unsigned int len)
4278 {
4279         unsigned int wrlen;
4280         struct mbuf *m;
4281         struct work_request_hdr *wr;
4282         struct cpl_get_tcb *getreq;
4283         struct cpl_set_tcb_field *req;
4284         struct ddp_state *p = &toep->tp_ddp_state;
4285
4286         CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4287             bufidx, tag0, tag1, len);
4288 #if 0
4289         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4290 #endif
4291         wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4292         m = m_gethdr_nofail(wrlen);
4293         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4294         wr = mtod(m, struct work_request_hdr *);
4295         m->m_pkthdr.len = m->m_len = wrlen;
4296         bzero(wr, wrlen);
4297
4298
4299         /* Set the ATOMIC flag to make sure that TP processes the following
4300          * CPLs in an atomic manner and no wire segments can be interleaved.
4301          */
4302         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4303         req = (struct cpl_set_tcb_field *)(wr + 1);
4304         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4305                              V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4306                              V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4307                              V_TCB_RX_DDP_BUF0_TAG(tag0) |
4308                              V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4309         req++;
4310         if (bufidx == 0) {
4311                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4312                             V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4313                             V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4314                 req++;
4315                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4316                             V_TF_DDP_PUSH_DISABLE_0(1) |
4317                             V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4318                             V_TF_DDP_PUSH_DISABLE_0(0) |
4319                             V_TF_DDP_BUF0_VALID(1));
4320         } else {
4321                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4322                             V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4323                             V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4324                 req++;
4325                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4326                             V_TF_DDP_PUSH_DISABLE_1(1) |
4327                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4328                             V_TF_DDP_PUSH_DISABLE_1(0) |
4329                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4330         }
4331
4332         getreq = (struct cpl_get_tcb *)(req + 1);
4333         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4334
        /* Keep track of the number of outstanding CPL_GET_TCB requests
4336          */
4337         p->get_tcb_count++;
4338
4339 #ifdef T3_TRACE
4340         T3_TRACE4(TIDTB(sk),
4341                   "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4342                   "len %d",
4343                   bufidx, tag0, tag1, len);
4344 #endif
4345         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4346 }
4347
4348 /*
4349  * Sends a compound WR containing all the CPL messages needed to program the
4350  * two HW DDP buffers, namely optionally setting up the length and offset of
4351  * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4352  */
4353 void
4354 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4355                       unsigned int len1, unsigned int offset1,
4356                       uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4357 {
4358         unsigned int wrlen;
4359         struct mbuf *m;
4360         struct work_request_hdr *wr;
4361         struct cpl_set_tcb_field *req;
4362
4363         CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4364             len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4365
4366 #if 0
4367         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4368 #endif
4369         wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4370                 (len1 ? sizeof(*req) : 0) +
4371                 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4372         m = m_gethdr_nofail(wrlen);
4373         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4374         wr = mtod(m, struct work_request_hdr *);
4375         bzero(wr, wrlen);
4376
4377         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4378         m->m_pkthdr.len = m->m_len = wrlen;
4379
4380         req = (struct cpl_set_tcb_field *)(wr + 1);
4381         if (len0) {                  /* program buffer 0 offset and length */
4382                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4383                         V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4384                         V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4385                         V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4386                         V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4387                 req++;
4388         }
4389         if (len1) {                  /* program buffer 1 offset and length */
4390                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4391                         V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4392                         V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4393                         V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4394                         V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4395                 req++;
4396         }
4397
4398         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4399                              ddp_flags);
4400
4401         if (modulate) {
4402                 mk_rx_data_ack_ulp(toep,
4403                     (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4404                     toep->tp_copied_seq - toep->tp_rcv_wup);
4405                 toep->tp_rcv_wup = toep->tp_copied_seq;
4406         }
4407
4408 #ifdef T3_TRACE
4409         T3_TRACE5(TIDTB(sk),
4410                   "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4411                   "modulate %d",
4412                   len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4413                   modulate);
4414 #endif
4415
4416         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4417 }
4418
4419 void
4420 t3_init_wr_tab(unsigned int wr_len)
4421 {
4422         int i;
4423
4424         if (mbuf_wrs[1])     /* already initialized */
4425                 return;
4426
4427         for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4428                 int sgl_len = (3 * i) / 2 + (i & 1);
4429
4430                 sgl_len += 3;
4431                 mbuf_wrs[i] = sgl_len <= wr_len ?
4432                         1 : 1 + (sgl_len - 2) / (wr_len - 1);
4433         }
4434
4435         wrlen = wr_len * 8;
4436 }
4437
4438 int
4439 t3_init_cpl_io(void)
4440 {
4441 #ifdef notyet
4442         tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4443         if (!tcphdr_skb) {
4444                 log(LOG_ERR,
4445                        "Chelsio TCP offload: can't allocate sk_buff\n");
4446                 return -1;
4447         }
4448         skb_put(tcphdr_skb, sizeof(struct tcphdr));
4449         tcphdr_skb->h.raw = tcphdr_skb->data;
4450         memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4451 #endif
4452
4453         t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4454         t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4455         t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4456         t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4457         t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4458         t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4459         t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4460         t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4461         t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4462         t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4463         t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4464         t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4465         t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4466         t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4467         t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4468         return (0);
4469 }
4470