sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c

   1 /**************************************************************************
   2
   3 Copyright (c) 2007-2008, Chelsio Inc.
   4 All rights reserved.
   5
   6 Redistribution and use in source and binary forms, with or without
   7 modification, are permitted provided that the following conditions are met:
   8
   9  1. Redistributions of source code must retain the above copyright notice,
  10     this list of conditions and the following disclaimer.
  11
  12  2. Neither the name of the Chelsio Corporation nor the names of its
  13     contributors may be used to endorse or promote products derived from
  14     this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 ***************************************************************************/
  29
  30 #include <sys/cdefs.h>
  31 __FBSDID("$FreeBSD$");
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/fcntl.h>
  36 #include <sys/kernel.h>
  37 #include <sys/limits.h>
  38 #include <sys/ktr.h>
  39 #include <sys/lock.h>
  40 #include <sys/mbuf.h>
  41 #include <sys/mutex.h>
  42 #include <sys/sockstate.h>
  43 #include <sys/sockopt.h>
  44 #include <sys/socket.h>
  45 #include <sys/socketvar.h>
  46 #include <sys/sockbuf.h>
  47 #include <sys/sysctl.h>
  48 #include <sys/syslog.h>
  49 #include <sys/protosw.h>
  50 #include <sys/priv.h>
  51 #include <sys/vimage.h>
  52
  53 #include <net/if.h>
  54 #include <net/route.h>
  55
  56 #include <netinet/in.h>
  57 #include <netinet/in_pcb.h>
  58 #include <netinet/in_systm.h>
  59 #include <netinet/in_var.h>
  60
  61
  62 #include <dev/cxgb/cxgb_osdep.h>
  63 #include <dev/cxgb/sys/mbufq.h>
  64
  65 #include <netinet/ip.h>
  66 #include <netinet/tcp_var.h>
  67 #include <netinet/tcp_fsm.h>
  68 #include <netinet/tcp_offload.h>
  69 #include <netinet/tcp_seq.h>
  70 #include <netinet/tcp_syncache.h>
  71 #include <netinet/tcp_timer.h>
  72 #include <net/route.h>
  73
  74 #include <dev/cxgb/t3cdev.h>
  75 #include <dev/cxgb/common/cxgb_firmware_exports.h>
  76 #include <dev/cxgb/common/cxgb_t3_cpl.h>
  77 #include <dev/cxgb/common/cxgb_tcb.h>
  78 #include <dev/cxgb/common/cxgb_ctl_defs.h>
  79 #include <dev/cxgb/cxgb_offload.h>
  80 #include <vm/vm.h>
  81 #include <vm/pmap.h>
  82 #include <machine/bus.h>
  83 #include <dev/cxgb/sys/mvec.h>
  84 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
  85 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
  86 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
  87 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
  88 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
  89 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
  90
  91 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
  92
/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers.  This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This mbuf holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an mbuf chain depends on the number of
 * segments in its gather list.  This table maps the length of the gather
 * list (0 .. TX_MAX_SEGS entries) into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
/* IP TOS of an inpcb, shifted right by two bits and masked with M_TOS. */
#define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)

/* Debug switches.  VALIDATE_SOCK() currently expands to nothing. */
#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

/*
 * NOTE(review): presumably action codes for connection teardown paths that
 * live outside this part of the file - confirm against their users.
 */
#define TCP_TIMEWAIT    1
#define TCP_CLOSE       2
#define TCP_DROP        3

/* Socket buffer auto-sizing tunables, defined outside this file. */
extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

/* Forward declarations for functions defined later in this file. */
static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);
 157
 158 static inline void
 159 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
 160 {
 161         struct mbuf *m;
 162
 163         m = sb->sb_mb;
 164         while (m) {
 165                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 166                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 167                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 168                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 169                         m->m_next, m->m_nextpkt, m->m_flags));
 170                 m = m->m_next;
 171         }
 172         m = n;
 173         while (m) {
 174                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 175                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 176                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 177                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 178                         m->m_next, m->m_nextpkt, m->m_flags));
 179                 m = m->m_next;
 180         }
 181         KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
 182         sbappendstream_locked(sb, n);
 183         m = sb->sb_mb;
 184
 185         while (m) {
 186                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 187                         m->m_next, m->m_nextpkt, m->m_flags));
 188                 m = m->m_next;
 189         }
 190 }
 191
 192 static inline int
 193 is_t3a(const struct toedev *dev)
 194 {
 195         return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
 196 }
 197
 198 static void
 199 dump_toepcb(struct toepcb *toep)
 200 {
 201         DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
 202             toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
 203             toep->tp_mtu_idx, toep->tp_tid);
 204
 205         DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
 206             toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
 207             toep->tp_mss_clamp, toep->tp_flags);
 208 }
 209
 210 #ifndef RTALLOC2_DEFINED
 211 static struct rtentry *
 212 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
 213 {
 214         struct rtentry *rt = NULL;
 215
 216         if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
 217                 RT_UNLOCK(rt);
 218
 219         return (rt);
 220 }
 221 #endif
 222
 223 /*
 224  * Determine whether to send a CPL message now or defer it.  A message is
 225  * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 226  * For connections in other states the message is sent immediately.
 227  * If through_l2t is set the message is subject to ARP processing, otherwise
 228  * it is sent directly.
 229  */
 230 static inline void
 231 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
 232 {
 233         struct tcpcb *tp = toep->tp_tp;
 234
 235         if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
 236                 inp_wlock(tp->t_inpcb);
 237                 mbufq_tail(&toep->out_of_order_queue, m);  // defer
 238                 inp_wunlock(tp->t_inpcb);
 239         } else if (through_l2t)
 240                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
 241         else
 242                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
 243 }
 244
 245 static inline unsigned int
 246 mkprio(unsigned int cntrl, const struct toepcb *toep)
 247 {
 248         return (cntrl);
 249 }
 250
 251 /*
 252  * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 253  */
 254 static inline void
 255 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
 256 {
 257         struct cpl_tid_release *req;
 258
 259         m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 260         m->m_pkthdr.len = m->m_len = sizeof(*req);
 261         req = mtod(m, struct cpl_tid_release *);
 262         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 263         req->wr.wr_lo = 0;
 264         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 265 }
 266
 267 static inline void
 268 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
 269 {
 270         struct tcpcb *tp = so_sototcpcb(so);
 271         struct toepcb *toep = tp->t_toe;
 272         struct tx_data_wr *req;
 273         struct sockbuf *snd;
 274
 275         inp_lock_assert(tp->t_inpcb);
 276         snd = so_sockbuf_snd(so);
 277
 278         req = mtod(m, struct tx_data_wr *);
 279         m->m_len = sizeof(*req);
 280         req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 281         req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
 282         /* len includes the length of any HW ULP additions */
 283         req->len = htonl(len);
 284         req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 285         /* V_TX_ULP_SUBMODE sets both the mode and submode */
 286         req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
 287                            V_TX_URG(/* skb_urgent(skb) */ 0 ) |
 288                            V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
 289                                    (tail ? 0 : 1))));
 290         req->sndseq = htonl(tp->snd_nxt);
 291         if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 292                 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 293                                     V_TX_CPU_IDX(toep->tp_qset));
 294
 295                 /* Sendbuffer is in units of 32KB.
 296                  */
 297                 if (V_tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
 298                         req->param |= htonl(V_TX_SNDBUF(V_tcp_autosndbuf_max >> 15));
 299                 else {
 300                         req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 301                 }
 302
 303                 toep->tp_flags |= TP_DATASENT;
 304         }
 305 }
 306
 307 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
 308
 309 int
 310 t3_push_frames(struct socket *so, int req_completion)
 311 {
 312         struct tcpcb *tp = so_sototcpcb(so);
 313         struct toepcb *toep = tp->t_toe;
 314
 315         struct mbuf *tail, *m0, *last;
 316         struct t3cdev *cdev;
 317         struct tom_data *d;
 318         int state, bytes, count, total_bytes;
 319         bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
 320         struct sockbuf *snd;
 321
 322         if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
 323                 DPRINTF("tcp state=%d\n", tp->t_state);
 324                 return (0);
 325         }
 326
 327         state = so_state_get(so);
 328
 329         if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
 330                 DPRINTF("disconnecting\n");
 331
 332                 return (0);
 333         }
 334
 335         inp_lock_assert(tp->t_inpcb);
 336
 337         snd = so_sockbuf_snd(so);
 338         sockbuf_lock(snd);
 339
 340         d = TOM_DATA(toep->tp_toedev);
 341         cdev = d->cdev;
 342
 343         last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 344
 345         total_bytes = 0;
 346         DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
 347             toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
 348
 349         if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
 350                 KASSERT(tail, ("sbdrop error"));
 351                 last = tail = tail->m_next;
 352         }
 353
 354         if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
 355                 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
 356                 sockbuf_unlock(snd);
 357
 358                 return (0);
 359         }
 360
 361         toep->tp_m_last = NULL;
 362         while (toep->tp_wr_avail && (tail != NULL)) {
 363                 count = bytes = 0;
 364                 segp = segs;
 365                 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
 366                         sockbuf_unlock(snd);
 367                         return (0);
 368                 }
 369                 /*
 370                  * If the data in tail fits as in-line, then
 371                  * make an immediate data wr.
 372                  */
 373                 if (tail->m_len <= IMM_LEN) {
 374                         count = 1;
 375                         bytes = tail->m_len;
 376                         last = tail;
 377                         tail = tail->m_next;
 378                         m_set_sgl(m0, NULL);
 379                         m_set_sgllen(m0, 0);
 380                         make_tx_data_wr(so, m0, bytes, tail);
 381                         m_append(m0, bytes, mtod(last, caddr_t));
 382                         KASSERT(!m0->m_next, ("bad append"));
 383                 } else {
 384                         while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
 385                             && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
 386                                 bytes += tail->m_len;
 387                                 last = tail;
 388                                 count++;
 389                                 /*
 390                                  * technically an abuse to be using this for a VA
 391                                  * but less gross than defining my own structure
 392                                  * or calling pmap_kextract from here :-|
 393                                  */
 394                                 segp->ds_addr = (bus_addr_t)tail->m_data;
 395                                 segp->ds_len = tail->m_len;
 396                                 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
 397                                     count, mbuf_wrs[count], tail->m_data, tail->m_len);
 398                                 segp++;
 399                                 tail = tail->m_next;
 400                         }
 401                         DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
 402                             toep->tp_wr_avail, count, mbuf_wrs[count], tail);
 403
 404                         m_set_sgl(m0, segs);
 405                         m_set_sgllen(m0, count);
 406                         make_tx_data_wr(so, m0, bytes, tail);
 407                 }
 408                 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 409
 410                 if (tail) {
 411                         snd->sb_sndptr = tail;
 412                         toep->tp_m_last = NULL;
 413                 } else
 414                         toep->tp_m_last = snd->sb_sndptr = last;
 415
 416
 417                 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
 418
 419                 snd->sb_sndptroff += bytes;
 420                 total_bytes += bytes;
 421                 toep->tp_write_seq += bytes;
 422                 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d"
 423                     " tail=%p sndptr=%p sndptroff=%d",
 424                     toep->tp_wr_avail, count, mbuf_wrs[count],
 425                     tail, snd->sb_sndptr, snd->sb_sndptroff);
 426                 if (tail)
 427                         CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d"
 428                             " tp_m_last=%p tailbuf=%p snd_una=0x%08x",
 429                             total_bytes, toep->tp_m_last, tail->m_data,
 430                             tp->snd_una);
 431                 else
 432                         CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d"
 433                             " tp_m_last=%p snd_una=0x%08x",
 434                             total_bytes, toep->tp_m_last, tp->snd_una);
 435
 436
 437 #ifdef KTR
 438 {
 439                 int i;
 440
 441                 i = 0;
 442                 while (i < count && m_get_sgllen(m0)) {
 443                         if ((count - i) >= 3) {
 444                                 CTR6(KTR_TOM,
 445                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 446                                     " len=%d pa=0x%zx len=%d",
 447                                     segs[i].ds_addr, segs[i].ds_len,
 448                                     segs[i + 1].ds_addr, segs[i + 1].ds_len,
 449                                     segs[i + 2].ds_addr, segs[i + 2].ds_len);
 450                                     i += 3;
 451                         } else if ((count - i) == 2) {
 452                                 CTR4(KTR_TOM,
 453                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx"
 454                                     " len=%d",
 455                                     segs[i].ds_addr, segs[i].ds_len,
 456                                     segs[i + 1].ds_addr, segs[i + 1].ds_len);
 457                                     i += 2;
 458                         } else {
 459                                 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
 460                                     segs[i].ds_addr, segs[i].ds_len);
 461                                 i++;
 462                         }
 463
 464                 }
 465 }
 466 #endif
 467                  /*
 468                  * remember credits used
 469                  */
 470                 m0->m_pkthdr.csum_data = mbuf_wrs[count];
 471                 m0->m_pkthdr.len = bytes;
 472                 toep->tp_wr_avail -= mbuf_wrs[count];
 473                 toep->tp_wr_unacked += mbuf_wrs[count];
 474
 475                 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
 476                     toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 477                         struct work_request_hdr *wr = cplhdr(m0);
 478
 479                         wr->wr_hi |= htonl(F_WR_COMPL);
 480                         toep->tp_wr_unacked = 0;
 481                 }
 482                 KASSERT((m0->m_pkthdr.csum_data > 0) &&
 483                     (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
 484                         m0->m_pkthdr.csum_data));
 485                 m0->m_type = MT_DONTFREE;
 486                 enqueue_wr(toep, m0);
 487                 DPRINTF("sending offload tx with %d bytes in %d segments\n",
 488                     bytes, count);
 489                 l2t_send(cdev, m0, toep->tp_l2t);
 490         }
 491         sockbuf_unlock(snd);
 492         return (total_bytes);
 493 }
 494
 495 /*
 496  * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 497  * under any circumstances.  We take the easy way out and always queue the
 498  * message to the write_queue.  We can optimize the case where the queue is
 499  * already empty though the optimization is probably not worth it.
 500  */
 501 static void
 502 close_conn(struct socket *so)
 503 {
 504         struct mbuf *m;
 505         struct cpl_close_con_req *req;
 506         struct tom_data *d;
 507         struct inpcb *inp = so_sotoinpcb(so);
 508         struct tcpcb *tp;
 509         struct toepcb *toep;
 510         unsigned int tid;
 511
 512
 513         inp_wlock(inp);
 514         tp = so_sototcpcb(so);
 515         toep = tp->t_toe;
 516
 517         if (tp->t_state != TCPS_SYN_SENT)
 518                 t3_push_frames(so, 1);
 519
 520         if (toep->tp_flags & TP_FIN_SENT) {
 521                 inp_wunlock(inp);
 522                 return;
 523         }
 524
 525         tid = toep->tp_tid;
 526
 527         d = TOM_DATA(toep->tp_toedev);
 528
 529         m = m_gethdr_nofail(sizeof(*req));
 530         m_set_priority(m, CPL_PRIORITY_DATA);
 531         m_set_sgl(m, NULL);
 532         m_set_sgllen(m, 0);
 533
 534         toep->tp_flags |= TP_FIN_SENT;
 535         req = mtod(m, struct cpl_close_con_req *);
 536
 537         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 538         req->wr.wr_lo = htonl(V_WR_TID(tid));
 539         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 540         req->rsvd = 0;
 541         inp_wunlock(inp);
 542         /*
 543          * XXX - need to defer shutdown while there is still data in the queue
 544          *
 545          */
 546         CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
 547         cxgb_ofld_send(d->cdev, m);
 548
 549 }
 550
 551 /*
 552  * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 553  * and send it along.
 554  */
 555 static void
 556 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 557 {
 558         struct cpl_abort_req *req = cplhdr(m);
 559
 560         req->cmd = CPL_ABORT_NO_RST;
 561         cxgb_ofld_send(cdev, m);
 562 }
 563
 564 /*
 565  * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 566  * permitted to return without sending the message in case we cannot allocate
 567  * an sk_buff.  Returns the number of credits sent.
 568  */
 569 uint32_t
 570 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
 571 {
 572         struct mbuf *m;
 573         struct cpl_rx_data_ack *req;
 574         struct toepcb *toep = tp->t_toe;
 575         struct toedev *tdev = toep->tp_toedev;
 576
 577         m = m_gethdr_nofail(sizeof(*req));
 578
 579         DPRINTF("returning %u credits to HW\n", credits);
 580
 581         req = mtod(m, struct cpl_rx_data_ack *);
 582         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 583         req->wr.wr_lo = 0;
 584         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 585         req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 586         m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
 587         cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 588         return (credits);
 589 }
 590
 591 /*
 592  * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 593  * This is only used in DDP mode, so we take the opportunity to also set the
 594  * DACK mode and flush any Rx credits.
 595  */
 596 void
 597 t3_send_rx_modulate(struct toepcb *toep)
 598 {
 599         struct mbuf *m;
 600         struct cpl_rx_data_ack *req;
 601
 602         m = m_gethdr_nofail(sizeof(*req));
 603
 604         req = mtod(m, struct cpl_rx_data_ack *);
 605         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 606         req->wr.wr_lo = 0;
 607         m->m_pkthdr.len = m->m_len = sizeof(*req);
 608
 609         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 610         req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 611                                  V_RX_DACK_MODE(1) |
 612                                  V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
 613         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 614         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 615         toep->tp_rcv_wup = toep->tp_copied_seq;
 616 }
 617
/*
 * Handle receipt of an urgent pointer.
 *
 * The whole body is compiled only when URGENT_DATA_SUPPORTED is defined;
 * without it this function does nothing.
 *
 * NOTE(review): the guarded code still uses identifiers that are not
 * declared in this function (sk, sk_send_sigurg, sock_flag, skb_peek,
 * tom_eat_skb) and tcpcb fields such as urg_data/copied_seq, so it
 * presumably would not build as-is if the macro were defined - confirm
 * before enabling.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
        struct tcpcb *tp = so_sototcpcb(so);

        urg_seq--;   /* initially points past the urgent data, per BSD */

        if (tp->urg_data && !after(urg_seq, tp->urg_seq))
                return;                                 /* duplicate pointer */
        sk_send_sigurg(sk);
        /*
         * A previous, still unread urgent byte sits at the read position and
         * is not delivered in-line: step over it.
         */
        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                tp->copied_seq++;
                if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
                        tom_eat_skb(sk, skb, 0);
        }
        /* Record the new urgent pointer; its data has not arrived yet. */
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = urg_seq;
#endif
}
 644
 645 /*
 646  * Returns true if a socket cannot accept new Rx data.
 647  */
 648 static inline int
 649 so_no_receive(const struct socket *so)
 650 {
 651         return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
 652 }
 653
 654 /*
 655  * Process an urgent data notification.
 656  */
 657 static void
 658 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
 659 {
 660         struct cpl_rx_urg_notify *hdr = cplhdr(m);
 661         struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 662
 663         VALIDATE_SOCK(so);
 664
 665         if (!so_no_receive(so))
 666                 handle_urg_ptr(so, ntohl(hdr->seq));
 667
 668         m_freem(m);
 669 }
 670
 671 /*
 672  * Handler for RX_URG_NOTIFY CPL messages.
 673  */
 674 static int
 675 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 676 {
 677         struct toepcb *toep = (struct toepcb *)ctx;
 678
 679         rx_urg_notify(toep, m);
 680         return (0);
 681 }
 682
/*
 * Decide whether the device's configured delayed-ACK mode may be applied to
 * this connection.  The caller in t3_cleanup_rbuf() uses the tunable mode
 * when this returns non-zero and forces DACK mode 1 otherwise.
 *
 * NOTE(review): as written the result is just "tp_ulp_mode != 0".  The
 * right-hand side of || is evaluated only when tp_ulp_mode is zero, and it
 * then requires tp_ulp_mode == ULP_MODE_TCPDDP, so it can only be true if
 * ULP_MODE_TCPDDP is itself 0.  The first operand was presumably meant to
 * be negated (plain TCP, or DDP on T3 and later) - confirm against the
 * hardware/firmware documentation before changing behaviour.
 */
static __inline int
is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
{
        return (toep->tp_ulp_mode ||
                (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
                    dev->tod_ttid >= TOE_ID_CHELSIO_T3));
}
 690
 691 /*
 692  * Set of states for which we should return RX credits.
 693  */
 694 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
 695
 696 /*
 697  * Called after some received data has been read.  It returns RX credits
 698  * to the HW for the amount of data processed.
 699  */
 700 void
 701 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
 702 {
 703         struct toepcb *toep = tp->t_toe;
 704         struct socket *so;
 705         struct toedev *dev;
 706         int dack_mode, must_send, read;
 707         u32 thres, credits, dack = 0;
 708         struct sockbuf *rcv;
 709
 710         so = inp_inpcbtosocket(tp->t_inpcb);
 711         rcv = so_sockbuf_rcv(so);
 712
 713         if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
 714                 (tp->t_state == TCPS_FIN_WAIT_2))) {
 715                 if (copied) {
 716                         sockbuf_lock(rcv);
 717                         toep->tp_copied_seq += copied;
 718                         sockbuf_unlock(rcv);
 719                 }
 720
 721                 return;
 722         }
 723
 724         inp_lock_assert(tp->t_inpcb);
 725
 726         sockbuf_lock(rcv);
 727         if (copied)
 728                 toep->tp_copied_seq += copied;
 729         else {
 730                 read = toep->tp_enqueued_bytes - rcv->sb_cc;
 731                 toep->tp_copied_seq += read;
 732         }
 733         credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 734         toep->tp_enqueued_bytes = rcv->sb_cc;
 735         sockbuf_unlock(rcv);
 736
 737         if (credits > rcv->sb_mbmax) {
 738                 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
 739                     toep->tp_copied_seq, toep->tp_rcv_wup, credits);
 740             credits = rcv->sb_mbmax;
 741         }
 742
 743
 744         /*
 745          * XXX this won't accurately reflect credit return - we need
 746          * to look at the difference between the amount that has been
 747          * put in the recv sockbuf and what is there now
 748          */
 749
 750         if (__predict_false(!credits))
 751                 return;
 752
 753         dev = toep->tp_toedev;
 754         thres = TOM_TUNABLE(dev, rx_credit_thres);
 755
 756         if (__predict_false(thres == 0))
 757                 return;
 758
 759         if (is_delack_mode_valid(dev, toep)) {
 760                 dack_mode = TOM_TUNABLE(dev, delack);
 761                 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
 762                         u32 r = tp->rcv_nxt - toep->tp_delack_seq;
 763
 764                         if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
 765                                 dack = F_RX_DACK_CHANGE |
 766                                        V_RX_DACK_MODE(dack_mode);
 767                 }
 768         } else
 769                 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 770
 771         /*
 772          * For coalescing to work effectively ensure the receive window has
 773          * at least 16KB left.
 774          */
 775         must_send = credits + 16384 >= tp->rcv_wnd;
 776
 777         if (must_send || credits >= thres)
 778                 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
 779 }
 780
 781 static int
 782 cxgb_toe_disconnect(struct tcpcb *tp)
 783 {
 784         struct socket *so;
 785
 786         DPRINTF("cxgb_toe_disconnect\n");
 787
 788         so = inp_inpcbtosocket(tp->t_inpcb);
 789         close_conn(so);
 790         return (0);
 791 }
 792
 793 static int
 794 cxgb_toe_reset(struct tcpcb *tp)
 795 {
 796         struct toepcb *toep = tp->t_toe;
 797
 798         t3_send_reset(toep);
 799
 800         /*
 801          * unhook from socket
 802          */
 803         tp->t_flags &= ~TF_TOE;
 804         toep->tp_tp = NULL;
 805         tp->t_toe = NULL;
 806         return (0);
 807 }
 808
 809 static int
 810 cxgb_toe_send(struct tcpcb *tp)
 811 {
 812         struct socket *so;
 813
 814         DPRINTF("cxgb_toe_send\n");
 815         dump_toepcb(tp->t_toe);
 816
 817         so = inp_inpcbtosocket(tp->t_inpcb);
 818         t3_push_frames(so, 1);
 819         return (0);
 820 }
 821
 822 static int
 823 cxgb_toe_rcvd(struct tcpcb *tp)
 824 {
 825
 826         inp_lock_assert(tp->t_inpcb);
 827
 828         t3_cleanup_rbuf(tp, 0);
 829
 830         return (0);
 831 }
 832
 833 static void
 834 cxgb_toe_detach(struct tcpcb *tp)
 835 {
 836         struct toepcb *toep;
 837
 838         /*
 839          * XXX how do we handle teardown in the SYN_SENT state?
 840          *
 841          */
 842         inp_lock_assert(tp->t_inpcb);
 843         toep = tp->t_toe;
 844         toep->tp_tp = NULL;
 845
 846         /*
 847          * unhook from socket
 848          */
 849         tp->t_flags &= ~TF_TOE;
 850         tp->t_toe = NULL;
 851 }
 852
 853
 854 static struct toe_usrreqs cxgb_toe_usrreqs = {
 855         .tu_disconnect = cxgb_toe_disconnect,
 856         .tu_reset = cxgb_toe_reset,
 857         .tu_send = cxgb_toe_send,
 858         .tu_rcvd = cxgb_toe_rcvd,
 859         .tu_detach = cxgb_toe_detach,
 860         .tu_detach = cxgb_toe_detach,
 861         .tu_syncache_event = handle_syncache_event,
 862 };
 863
 864
 865 static void
 866 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
 867                             uint64_t mask, uint64_t val, int no_reply)
 868 {
 869         struct cpl_set_tcb_field *req;
 870
 871         CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 872             toep->tp_tid, word, mask, val);
 873
 874         req = mtod(m, struct cpl_set_tcb_field *);
 875         m->m_pkthdr.len = m->m_len = sizeof(*req);
 876         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 877         req->wr.wr_lo = 0;
 878         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
 879         req->reply = V_NO_REPLY(no_reply);
 880         req->cpu_idx = 0;
 881         req->word = htons(word);
 882         req->mask = htobe64(mask);
 883         req->val = htobe64(val);
 884
 885         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 886         send_or_defer(toep, m, 0);
 887 }
 888
 889 static void
 890 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
 891 {
 892         struct mbuf *m;
 893         struct tcpcb *tp = toep->tp_tp;
 894
 895         if (toep == NULL)
 896                 return;
 897
 898         if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
 899                 printf("not seting field\n");
 900                 return;
 901         }
 902
 903         m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
 904
 905         __set_tcb_field(toep, m, word, mask, val, 1);
 906 }
 907
 908 /*
 909  * Set one of the t_flags bits in the TCB.
 910  */
 911 static void
 912 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
 913 {
 914
 915         t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
 916 }
 917
 918 /*
 919  * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 920  */
 921 static void
 922 t3_set_nagle(struct toepcb *toep)
 923 {
 924         struct tcpcb *tp = toep->tp_tp;
 925
 926         set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
 927 }
 928
 929 /*
 930  * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 931  */
 932 void
 933 t3_set_keepalive(struct toepcb *toep, int on_off)
 934 {
 935
 936         set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
 937 }
 938
 939 void
 940 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
 941 {
 942         set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
 943 }
 944
 945 void
 946 t3_set_dack_mss(struct toepcb *toep, int on_off)
 947 {
 948
 949         set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
 950 }
 951
 952 /*
 953  * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 954  */
 955 static void
 956 t3_set_tos(struct toepcb *toep)
 957 {
 958         int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
 959
 960         t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
 961                          V_TCB_TOS(tos));
 962 }
 963
 964
 965 /*
 966  * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 967  * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 968  * set the PSH bit in the last segment, which would trigger delivery.]
 969  * We work around the issue by setting a DDP buffer in a partial placed state,
 970  * which guarantees that TP will schedule a timer.
 971  */
 972 #define TP_DDP_TIMER_WORKAROUND_MASK\
 973     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
 974      ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
 975        V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
 976 #define TP_DDP_TIMER_WORKAROUND_VAL\
 977     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
 978      ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
 979       32))
 980
 981 static void
 982 t3_enable_ddp(struct toepcb *toep, int on)
 983 {
 984         if (on) {
 985
 986                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
 987                                  V_TF_DDP_OFF(0));
 988         } else
 989                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
 990                                  V_TF_DDP_OFF(1) |
 991                                  TP_DDP_TIMER_WORKAROUND_MASK,
 992                                  V_TF_DDP_OFF(1) |
 993                                  TP_DDP_TIMER_WORKAROUND_VAL);
 994
 995 }
 996
 997 void
 998 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
 999 {
1000         t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
1001                          V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
1002                          tag_color);
1003 }
1004
1005 void
1006 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
1007                     unsigned int len)
1008 {
1009         if (buf_idx == 0)
1010                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
1011                          V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
1012                          V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
1013                          V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1014                          V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1015         else
1016                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1017                          V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1018                          V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1019                          V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1020                          V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1021 }
1022
/*
 * Validate a congestion control algorithm name.
 *
 * When CONGESTION_CONTROL_SUPPORTED is defined, the name is looked up in
 * t3_cong_ops and EINVAL is returned if it is not found.  Otherwise every
 * name is accepted.  The socket argument is currently unused.
 *
 * Returns 0 on success or a positive errno value, matching the other
 * error returns in this file (the lookup failure used to return -EINVAL).
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
        int cong_algo;

        for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
                if (!strcmp(name, t3_cong_ops[cong_algo].name))
                        break;

        if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
                return (EINVAL);
#endif
        return (0);
}
1038
1039 int
1040 t3_get_tcb(struct toepcb *toep)
1041 {
1042         struct cpl_get_tcb *req;
1043         struct tcpcb *tp = toep->tp_tp;
1044         struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1045
1046         if (!m)
1047                 return (ENOMEM);
1048
1049         inp_lock_assert(tp->t_inpcb);
1050         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1051         req = mtod(m, struct cpl_get_tcb *);
1052         m->m_pkthdr.len = m->m_len = sizeof(*req);
1053         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1054         req->wr.wr_lo = 0;
1055         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1056         req->cpuno = htons(toep->tp_qset);
1057         req->rsvd = 0;
1058         if (tp->t_state == TCPS_SYN_SENT)
1059                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1060         else
1061                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1062         return 0;
1063 }
1064
1065 static inline void
1066 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1067 {
1068
1069         toepcb_hold(toep);
1070
1071         cxgb_insert_tid(d->cdev, d->client, toep, tid);
1072 }
1073
1074 /**
1075  *      find_best_mtu - find the entry in the MTU table closest to an MTU
1076  *      @d: TOM state
1077  *      @mtu: the target MTU
1078  *
1079  *      Returns the index of the value in the MTU table that is closest to but
1080  *      does not exceed the target MTU.
1081  */
1082 static unsigned int
1083 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1084 {
1085         int i = 0;
1086
1087         while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1088                 ++i;
1089         return (i);
1090 }
1091
1092 static unsigned int
1093 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1094 {
1095         unsigned int idx;
1096
1097 #ifdef notyet
1098         struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1099 #endif
1100         if (tp) {
1101                 tp->t_maxseg = pmtu - 40;
1102                 if (tp->t_maxseg < td->mtus[0] - 40)
1103                         tp->t_maxseg = td->mtus[0] - 40;
1104                 idx = find_best_mtu(td, tp->t_maxseg + 40);
1105
1106                 tp->t_maxseg = td->mtus[idx] - 40;
1107         } else
1108                 idx = find_best_mtu(td, pmtu);
1109
1110         return (idx);
1111 }
1112
1113 static inline void
1114 free_atid(struct t3cdev *cdev, unsigned int tid)
1115 {
1116         struct toepcb *toep = cxgb_free_atid(cdev, tid);
1117
1118         if (toep)
1119                 toepcb_release(toep);
1120 }
1121
1122 /*
1123  * Release resources held by an offload connection (TID, L2T entry, etc.)
1124  */
1125 static void
1126 t3_release_offload_resources(struct toepcb *toep)
1127 {
1128         struct tcpcb *tp = toep->tp_tp;
1129         struct toedev *tdev = toep->tp_toedev;
1130         struct t3cdev *cdev;
1131         struct socket *so;
1132         unsigned int tid = toep->tp_tid;
1133         struct sockbuf *rcv;
1134
1135         CTR0(KTR_TOM, "t3_release_offload_resources");
1136
1137         if (!tdev)
1138                 return;
1139
1140         cdev = TOEP_T3C_DEV(toep);
1141         if (!cdev)
1142                 return;
1143
1144         toep->tp_qset = 0;
1145         t3_release_ddp_resources(toep);
1146
1147 #ifdef CTRL_SKB_CACHE
1148         kfree_skb(CTRL_SKB_CACHE(tp));
1149         CTRL_SKB_CACHE(tp) = NULL;
1150 #endif
1151
1152         if (toep->tp_wr_avail != toep->tp_wr_max) {
1153                 purge_wr_queue(toep);
1154                 reset_wr_list(toep);
1155         }
1156
1157         if (toep->tp_l2t) {
1158                 l2t_release(L2DATA(cdev), toep->tp_l2t);
1159                 toep->tp_l2t = NULL;
1160         }
1161         toep->tp_tp = NULL;
1162         if (tp) {
1163                 inp_lock_assert(tp->t_inpcb);
1164                 so = inp_inpcbtosocket(tp->t_inpcb);
1165                 rcv = so_sockbuf_rcv(so);
1166                 /*
1167                  * cancel any offloaded reads
1168                  *
1169                  */
1170                 sockbuf_lock(rcv);
1171                 tp->t_toe = NULL;
1172                 tp->t_flags &= ~TF_TOE;
1173                 if (toep->tp_ddp_state.user_ddp_pending) {
1174                         t3_cancel_ubuf(toep, rcv);
1175                         toep->tp_ddp_state.user_ddp_pending = 0;
1176                 }
1177                 so_sorwakeup_locked(so);
1178
1179         }
1180
1181         if (toep->tp_state == TCPS_SYN_SENT) {
1182                 free_atid(cdev, tid);
1183 #ifdef notyet
1184                 __skb_queue_purge(&tp->out_of_order_queue);
1185 #endif
1186         } else {                                          // we have TID
1187                 cxgb_remove_tid(cdev, toep, tid);
1188                 toepcb_release(toep);
1189         }
1190 #if 0
1191         log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1192 #endif
1193 }
1194
1195 static void
1196 install_offload_ops(struct socket *so)
1197 {
1198         struct tcpcb *tp = so_sototcpcb(so);
1199
1200         KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1201
1202         t3_install_socket_ops(so);
1203         tp->t_flags |= TF_TOE;
1204         tp->t_tu = &cxgb_toe_usrreqs;
1205 }
1206
1207 /*
1208  * Determine the receive window scaling factor given a target max
1209  * receive window.
1210  */
1211 static __inline int
1212 select_rcv_wscale(int space)
1213 {
1214         int wscale = 0;
1215
1216         if (space > MAX_RCV_WND)
1217                 space = MAX_RCV_WND;
1218
1219         if (V_tcp_do_rfc1323)
1220                 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1221
1222         return (wscale);
1223 }
1224
1225 /*
1226  * Determine the receive window size for a socket.
1227  */
1228 static unsigned long
1229 select_rcv_wnd(struct toedev *dev, struct socket *so)
1230 {
1231         struct tom_data *d = TOM_DATA(dev);
1232         unsigned int wnd;
1233         unsigned int max_rcv_wnd;
1234         struct sockbuf *rcv;
1235
1236         rcv = so_sockbuf_rcv(so);
1237
1238         if (V_tcp_do_autorcvbuf)
1239                 wnd = V_tcp_autorcvbuf_max;
1240         else
1241                 wnd = rcv->sb_hiwat;
1242
1243
1244
1245         /* XXX
1246          * For receive coalescing to work effectively we need a receive window
1247          * that can accomodate a coalesced segment.
1248          */
1249         if (wnd < MIN_RCV_WND)
1250                 wnd = MIN_RCV_WND;
1251
1252         /* PR 5138 */
1253         max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1254                                     (uint32_t)d->rx_page_size * 23 :
1255                                     MAX_RCV_WND);
1256
1257         return min(wnd, max_rcv_wnd);
1258 }
1259
1260 /*
1261  * Assign offload parameters to some socket fields.  This code is used by
1262  * both active and passive opens.
1263  */
1264 static inline void
1265 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1266     struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1267 {
1268         struct tcpcb *tp = so_sototcpcb(so);
1269         struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1270         struct sockbuf *snd, *rcv;
1271
1272 #ifdef notyet
1273         SOCK_LOCK_ASSERT(so);
1274 #endif
1275
1276         snd = so_sockbuf_snd(so);
1277         rcv = so_sockbuf_rcv(so);
1278
1279         log(LOG_INFO, "initializing offload socket\n");
1280         /*
1281          * We either need to fix push frames to work with sbcompress
1282          * or we need to add this
1283          */
1284         snd->sb_flags |= SB_NOCOALESCE;
1285         rcv->sb_flags |= SB_NOCOALESCE;
1286
1287         tp->t_toe = toep;
1288         toep->tp_tp = tp;
1289         toep->tp_toedev = dev;
1290
1291         toep->tp_tid = tid;
1292         toep->tp_l2t = e;
1293         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1294         toep->tp_wr_unacked = 0;
1295         toep->tp_delack_mode = 0;
1296
1297         toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1298         /*
1299          * XXX broken
1300          *
1301          */
1302         tp->rcv_wnd = select_rcv_wnd(dev, so);
1303
1304         toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1305                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1306         toep->tp_qset_idx = 0;
1307
1308         reset_wr_list(toep);
1309         DPRINTF("initialization done\n");
1310 }
1311
1312 /*
1313  * The next two functions calculate the option 0 value for a socket.
1314  */
1315 static inline unsigned int
1316 calc_opt0h(struct socket *so, int mtu_idx)
1317 {
1318         struct tcpcb *tp = so_sototcpcb(so);
1319         int wscale = select_rcv_wscale(tp->rcv_wnd);
1320
1321         return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1322             V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1323             V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1324 }
1325
1326 static inline unsigned int
1327 calc_opt0l(struct socket *so, int ulp_mode)
1328 {
1329         struct tcpcb *tp = so_sototcpcb(so);
1330         unsigned int val;
1331
1332         val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1333                V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1334
1335         DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1336         return (val);
1337 }
1338
1339 static inline unsigned int
1340 calc_opt2(const struct socket *so, struct toedev *dev)
1341 {
1342         int flv_valid;
1343
1344         flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1345
1346         return (V_FLAVORS_VALID(flv_valid) |
1347             V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1348 }
1349
#if DEBUG_WR > 1
/*
 * Debug helper: walk the toepcb's work-request queue and sum the
 * m_pkthdr.csum_data field of every queued mbuf.
 * NOTE(review): csum_data presumably carries each WR's credit count --
 * confirm against the code that enqueues WRs.
 */
static int
count_pending_wrs(const struct toepcb *toep)
{
        const struct mbuf *wr;
        int total = 0;

        wr_queue_walk(toep, wr) {
                total += wr->m_pkthdr.csum_data;
        }
        return (total);
}
#endif
1362
1363 #if 0
1364 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1365 #endif
1366
1367 static void
1368 mk_act_open_req(struct socket *so, struct mbuf *m,
1369     unsigned int atid, const struct l2t_entry *e)
1370 {
1371         struct cpl_act_open_req *req;
1372         struct inpcb *inp = so_sotoinpcb(so);
1373         struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1374         struct toepcb *toep = tp->t_toe;
1375         struct toedev *tdev = toep->tp_toedev;
1376
1377         m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1378
1379         req = mtod(m, struct cpl_act_open_req *);
1380         m->m_pkthdr.len = m->m_len = sizeof(*req);
1381
1382         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1383         req->wr.wr_lo = 0;
1384         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1385         inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1386 #if 0
1387         req->local_port = inp->inp_lport;
1388         req->peer_port = inp->inp_fport;
1389         memcpy(&req->local_ip, &inp->inp_laddr, 4);
1390         memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1391 #endif
1392         req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1393                            V_TX_CHANNEL(e->smt_idx));
1394         req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1395         req->params = 0;
1396         req->opt2 = htonl(calc_opt2(so, tdev));
1397 }
1398
1399
1400 /*
1401  * Convert an ACT_OPEN_RPL status to an errno.
1402  */
1403 static int
1404 act_open_rpl_status_to_errno(int status)
1405 {
1406         switch (status) {
1407         case CPL_ERR_CONN_RESET:
1408                 return (ECONNREFUSED);
1409         case CPL_ERR_ARP_MISS:
1410                 return (EHOSTUNREACH);
1411         case CPL_ERR_CONN_TIMEDOUT:
1412                 return (ETIMEDOUT);
1413         case CPL_ERR_TCAM_FULL:
1414                 return (ENOMEM);
1415         case CPL_ERR_CONN_EXIST:
1416                 log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
1417                 return (EADDRINUSE);
1418         default:
1419                 return (EIO);
1420         }
1421 }
1422
1423 static void
1424 fail_act_open(struct toepcb *toep, int errno)
1425 {
1426         struct tcpcb *tp = toep->tp_tp;
1427
1428         t3_release_offload_resources(toep);
1429         if (tp) {
1430                 inp_wunlock(tp->t_inpcb);
1431                 tcp_offload_drop(tp, errno);
1432         }
1433
1434 #ifdef notyet
1435         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1436 #endif
1437 }
1438
1439 /*
1440  * Handle active open failures.
1441  */
1442 static void
1443 active_open_failed(struct toepcb *toep, struct mbuf *m)
1444 {
1445         struct cpl_act_open_rpl *rpl = cplhdr(m);
1446         struct inpcb *inp;
1447
1448         if (toep->tp_tp == NULL)
1449                 goto done;
1450
1451         inp = toep->tp_tp->t_inpcb;
1452
1453 /*
1454  * Don't handle connection retry for now
1455  */
1456 #ifdef notyet
1457         struct inet_connection_sock *icsk = inet_csk(sk);
1458
1459         if (rpl->status == CPL_ERR_CONN_EXIST &&
1460             icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1461                 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1462                 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1463                                jiffies + HZ / 2);
1464         } else
1465 #endif
1466         {
1467                 inp_wlock(inp);
1468                 /*
1469                  * drops the inpcb lock
1470                  */
1471                 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1472         }
1473
1474         done:
1475         m_free(m);
1476 }
1477
1478 /*
1479  * Return whether a failed active open has allocated a TID
1480  */
1481 static inline int
1482 act_open_has_tid(int status)
1483 {
1484         return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1485                status != CPL_ERR_ARP_MISS;
1486 }
1487
1488 /*
1489  * Process an ACT_OPEN_RPL CPL message.
1490  */
1491 static int
1492 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1493 {
1494         struct toepcb *toep = (struct toepcb *)ctx;
1495         struct cpl_act_open_rpl *rpl = cplhdr(m);
1496
1497         if (cdev->type != T3A && act_open_has_tid(rpl->status))
1498                 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1499
1500         active_open_failed(toep, m);
1501         return (0);
1502 }
1503
/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 *
 * Currently compiled out (notyet).  The open is failed with EHOSTUNREACH
 * only while the connection is still in SYN_SENT or SYN_RECEIVED; in that
 * case the mbuf is freed here.  fail_act_open() takes the toepcb, so the
 * toepcb is what gets passed (an uninitialised socket pointer was passed
 * before).
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
        struct toepcb *toep = m_get_toep(m);
        struct tcpcb *tp = toep->tp_tp;
        struct inpcb *inp = tp->t_inpcb;

        inp_wlock(inp);
        if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
                /*
                 * drops the inpcb lock
                 */
                fail_act_open(toep, EHOSTUNREACH);
                printf("freeing %p\n", m);

                m_free(m);
        } else
                inp_wunlock(inp);
}
#endif
1534 /*
1535  * Send an active open request.
1536  */
1537 int
1538 t3_connect(struct toedev *tdev, struct socket *so,
1539     struct rtentry *rt, struct sockaddr *nam)
1540 {
1541         struct mbuf *m;
1542         struct l2t_entry *e;
1543         struct tom_data *d = TOM_DATA(tdev);
1544         struct inpcb *inp = so_sotoinpcb(so);
1545         struct tcpcb *tp = intotcpcb(inp);
1546         struct toepcb *toep; /* allocated by init_offload_socket */
1547
1548         int atid;
1549
1550         toep = toepcb_alloc();
1551         if (toep == NULL)
1552                 goto out_err;
1553
1554         if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1555                 goto out_err;
1556
1557         e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1558         if (!e)
1559                 goto free_tid;
1560
1561         inp_lock_assert(inp);
1562         m = m_gethdr(MT_DATA, M_WAITOK);
1563
1564 #if 0
1565         m->m_toe.mt_toepcb = tp->t_toe;
1566         set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1567 #endif
1568         so_lock(so);
1569
1570         init_offload_socket(so, tdev, atid, e, rt, toep);
1571
1572         install_offload_ops(so);
1573
1574         mk_act_open_req(so, m, atid, e);
1575         so_unlock(so);
1576
1577         soisconnecting(so);
1578         toep = tp->t_toe;
1579         m_set_toep(m, tp->t_toe);
1580
1581         toep->tp_state = TCPS_SYN_SENT;
1582         l2t_send(d->cdev, (struct mbuf *)m, e);
1583
1584         if (toep->tp_ulp_mode)
1585                 t3_enable_ddp(toep, 0);
1586         return  (0);
1587
1588 free_tid:
1589         printf("failing connect - free atid\n");
1590
1591         free_atid(d->cdev, atid);
1592 out_err:
1593         printf("return ENOMEM\n");
1594        return (ENOMEM);
1595 }
1596
1597 /*
1598  * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
1599  * not send multiple ABORT_REQs for the same connection and also that we do
1600  * not try to send a message after the connection has closed.  Returns 1 if
1601  * an ABORT_REQ wasn't generated after all, 0 otherwise.
1602  */
1603 static void
1604 t3_send_reset(struct toepcb *toep)
1605 {
1606
1607         struct cpl_abort_req *req;
1608         unsigned int tid = toep->tp_tid;
1609         int mode = CPL_ABORT_SEND_RST;
1610         struct tcpcb *tp = toep->tp_tp;
1611         struct toedev *tdev = toep->tp_toedev;
1612         struct socket *so = NULL;
1613         struct mbuf *m;
1614         struct sockbuf *snd;
1615
1616         if (tp) {
1617                 inp_lock_assert(tp->t_inpcb);
1618                 so = inp_inpcbtosocket(tp->t_inpcb);
1619         }
1620
1621         if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1622                 tdev == NULL))
1623                 return;
1624         toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1625
1626         snd = so_sockbuf_snd(so);
1627         /* Purge the send queue so we don't send anything after an abort. */
1628         if (so)
1629                 sbflush(snd);
1630         if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1631                 mode |= CPL_ABORT_POST_CLOSE_REQ;
1632
1633         m = m_gethdr_nofail(sizeof(*req));
1634         m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1635         set_arp_failure_handler(m, abort_arp_failure);
1636
1637         req = mtod(m, struct cpl_abort_req *);
1638         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1639         req->wr.wr_lo = htonl(V_WR_TID(tid));
1640         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1641         req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1642         req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1643         req->cmd = mode;
1644         if (tp && (tp->t_state == TCPS_SYN_SENT))
1645                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1646         else
1647                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1648 }
1649
1650 static int
1651 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1652 {
1653         struct inpcb *inp;
1654         int error, optval;
1655
1656         if (sopt->sopt_name == IP_OPTIONS)
1657                 return (ENOPROTOOPT);
1658
1659         if (sopt->sopt_name != IP_TOS)
1660                 return (EOPNOTSUPP);
1661
1662         error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1663
1664         if (error)
1665                 return (error);
1666
1667         if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1668                 return (EPERM);
1669
1670         inp = so_sotoinpcb(so);
1671         inp_wlock(inp);
1672         inp_ip_tos_set(inp, optval);
1673 #if 0
1674         inp->inp_ip_tos = optval;
1675 #endif
1676         t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1677         inp_wunlock(inp);
1678
1679         return (0);
1680 }
1681
1682 static int
1683 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1684 {
1685         int err = 0;
1686         size_t copied;
1687
1688         if (sopt->sopt_name != TCP_CONGESTION &&
1689             sopt->sopt_name != TCP_NODELAY)
1690                 return (EOPNOTSUPP);
1691
1692         if (sopt->sopt_name == TCP_CONGESTION) {
1693                 char name[TCP_CA_NAME_MAX];
1694                 int optlen = sopt->sopt_valsize;
1695                 struct tcpcb *tp;
1696
1697                 if (sopt->sopt_dir == SOPT_GET) {
1698                         KASSERT(0, ("unimplemented"));
1699                         return (EOPNOTSUPP);
1700                 }
1701
1702                 if (optlen < 1)
1703                         return (EINVAL);
1704
1705                 err = copyinstr(sopt->sopt_val, name,
1706                     min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1707                 if (err)
1708                         return (err);
1709                 if (copied < 1)
1710                         return (EINVAL);
1711
1712                 tp = so_sototcpcb(so);
1713                 /*
1714                  * XXX I need to revisit this
1715                  */
1716                 if ((err = t3_set_cong_control(so, name)) == 0) {
1717 #ifdef CONGESTION_CONTROL_SUPPORTED
1718                         tp->t_cong_control = strdup(name, M_CXGB);
1719 #endif
1720                 } else
1721                         return (err);
1722         } else {
1723                 int optval, oldval;
1724                 struct inpcb *inp;
1725                 struct tcpcb *tp;
1726
1727                 if (sopt->sopt_dir == SOPT_GET)
1728                         return (EOPNOTSUPP);
1729
1730                 err = sooptcopyin(sopt, &optval, sizeof optval,
1731                     sizeof optval);
1732
1733                 if (err)
1734                         return (err);
1735
1736                 inp = so_sotoinpcb(so);
1737                 tp = inp_inpcbtotcpcb(inp);
1738
1739                 inp_wlock(inp);
1740
1741                 oldval = tp->t_flags;
1742                 if (optval)
1743                         tp->t_flags |= TF_NODELAY;
1744                 else
1745                         tp->t_flags &= ~TF_NODELAY;
1746                 inp_wunlock(inp);
1747
1748
1749                 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1750                         t3_set_nagle(tp->t_toe);
1751
1752         }
1753
1754         return (0);
1755 }
1756
1757 int
1758 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1759 {
1760         int err;
1761
1762         if (sopt->sopt_level != IPPROTO_TCP)
1763                 err =  t3_ip_ctloutput(so, sopt);
1764         else
1765                 err = t3_tcp_ctloutput(so, sopt);
1766
1767         if (err != EOPNOTSUPP)
1768                 return (err);
1769
1770         return (tcp_ctloutput(so, sopt));
1771 }
1772
/*
 * Returns true if we need to explicitly request RST when we receive new
 * data on an RX-closed connection.  The answer is currently always yes;
 * the toepcb is not consulted.
 */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
        (void)toep;

        return (1);
}
1782
1783 /*
1784  * Handles Rx data that arrives in a state where the socket isn't accepting
1785  * new data.
1786  */
1787 static void
1788 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1789 {
1790
1791         if (need_rst_on_excess_rx(toep) &&
1792             !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1793                 t3_send_reset(toep);
1794         m_freem(m);
1795 }
1796
1797 /*
1798  * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1799  * by getting the DDP offset from the TCB.
1800  */
1801 static void
1802 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1803 {
1804         struct ddp_state *q = &toep->tp_ddp_state;
1805         struct ddp_buf_state *bsp;
1806         struct cpl_get_tcb_rpl *hdr;
1807         unsigned int ddp_offset;
1808         struct socket *so;
1809         struct tcpcb *tp;
1810         struct sockbuf *rcv;
1811         int state;
1812
1813         uint64_t t;
1814         __be64 *tcb;
1815
1816         tp = toep->tp_tp;
1817         so = inp_inpcbtosocket(tp->t_inpcb);
1818
1819         inp_lock_assert(tp->t_inpcb);
1820         rcv = so_sockbuf_rcv(so);
1821         sockbuf_lock(rcv);
1822
1823         /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1824          * We really need a cookie in order to dispatch the RPLs.
1825          */
1826         q->get_tcb_count--;
1827
1828         /* It is a possible that a previous CPL already invalidated UBUF DDP
1829          * and moved the cur_buf idx and hence no further processing of this
1830          * skb is required. However, the app might be sleeping on
1831          * !q->get_tcb_count and we need to wake it up.
1832          */
1833         if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1834                 int state = so_state_get(so);
1835
1836                 m_freem(m);
1837                 if (__predict_true((state & SS_NOFDREF) == 0))
1838                         so_sorwakeup_locked(so);
1839                 else
1840                         sockbuf_unlock(rcv);
1841
1842                 return;
1843         }
1844
1845         bsp = &q->buf_state[q->cur_buf];
1846         hdr = cplhdr(m);
1847         tcb = (__be64 *)(hdr + 1);
1848         if (q->cur_buf == 0) {
1849                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1850                 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1851         } else {
1852                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1853                 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1854         }
1855         ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1856         m->m_cur_offset = bsp->cur_offset;
1857         bsp->cur_offset = ddp_offset;
1858         m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1859
1860         CTR5(KTR_TOM,
1861             "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1862             q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1863         KASSERT(ddp_offset >= m->m_cur_offset,
1864             ("ddp_offset=%u less than cur_offset=%u",
1865                 ddp_offset, m->m_cur_offset));
1866
1867 #if 0
1868 {
1869         unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1870
1871         t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1872         ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1873
1874         t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1875         rcv_nxt = t >> S_TCB_RCV_NXT;
1876         rcv_nxt &= M_TCB_RCV_NXT;
1877
1878         t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1879         rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1880         rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1881
1882         T3_TRACE2(TIDTB(sk),
1883                   "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1884                   ddp_flags, rcv_nxt - rx_hdr_offset);
1885         T3_TRACE4(TB(q),
1886                   "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1887                   tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1888         T3_TRACE3(TB(q),
1889                   "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1890                   rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1891         T3_TRACE2(TB(q),
1892                   "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1893                  q->buf_state[0].flags, q->buf_state[1].flags);
1894
1895 }
1896 #endif
1897         if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1898                 handle_excess_rx(toep, m);
1899                 return;
1900         }
1901
1902 #ifdef T3_TRACE
1903         if ((int)m->m_pkthdr.len < 0) {
1904                 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1905         }
1906 #endif
1907         if (bsp->flags & DDP_BF_NOCOPY) {
1908 #ifdef T3_TRACE
1909                 T3_TRACE0(TB(q),
1910                           "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1911
1912                 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1913                         printk("!cancel_ubuf");
1914                         t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1915                 }
1916 #endif
1917                 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1918                 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1919                 q->cur_buf ^= 1;
1920         } else if (bsp->flags & DDP_BF_NOFLIP) {
1921
1922                 m->m_ddp_flags = 1;    /* always a kernel buffer */
1923
1924                 /* now HW buffer carries a user buffer */
1925                 bsp->flags &= ~DDP_BF_NOFLIP;
1926                 bsp->flags |= DDP_BF_NOCOPY;
1927
1928                 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1929                  * any new data in which case we're done. If in addition the
1930                  * offset is 0, then there wasn't a completion for the kbuf
1931                  * and we need to decrement the posted count.
1932                  */
1933                 if (m->m_pkthdr.len == 0) {
1934                         if (ddp_offset == 0) {
1935                                 q->kbuf_posted--;
1936                                 bsp->flags |= DDP_BF_NODATA;
1937                         }
1938                         sockbuf_unlock(rcv);
1939                         m_free(m);
1940                         return;
1941                 }
1942         } else {
1943                 sockbuf_unlock(rcv);
1944
1945                 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1946                  * but it got here way late and nobody cares anymore.
1947                  */
1948                 m_free(m);
1949                 return;
1950         }
1951
1952         m->m_ddp_gl = (unsigned char *)bsp->gl;
1953         m->m_flags |= M_DDP;
1954         m->m_seq = tp->rcv_nxt;
1955         tp->rcv_nxt += m->m_pkthdr.len;
1956         tp->t_rcvtime = ticks;
1957         CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1958                   m->m_seq, q->cur_buf, m->m_pkthdr.len);
1959         if (m->m_pkthdr.len == 0) {
1960                 q->user_ddp_pending = 0;
1961                 m_free(m);
1962         } else
1963                 SBAPPEND(rcv, m);
1964
1965         state = so_state_get(so);
1966         if (__predict_true((state & SS_NOFDREF) == 0))
1967                 so_sorwakeup_locked(so);
1968         else
1969                 sockbuf_unlock(rcv);
1970 }
1971
1972 /*
1973  * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1974  * in that case they are similar to DDP completions.
1975  */
1976 static int
1977 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1978 {
1979         struct toepcb *toep = (struct toepcb *)ctx;
1980
1981         /* OK if socket doesn't exist */
1982         if (toep == NULL) {
1983                 printf("null toep in do_get_tcb_rpl\n");
1984                 return (CPL_RET_BUF_DONE);
1985         }
1986
1987         inp_wlock(toep->tp_tp->t_inpcb);
1988         tcb_rpl_as_ddp_complete(toep, m);
1989         inp_wunlock(toep->tp_tp->t_inpcb);
1990
1991         return (0);
1992 }
1993
1994 static void
1995 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1996 {
1997         struct tcpcb *tp = toep->tp_tp;
1998         struct socket *so;
1999         struct ddp_state *q;
2000         struct ddp_buf_state *bsp;
2001         struct cpl_rx_data *hdr = cplhdr(m);
2002         unsigned int rcv_nxt = ntohl(hdr->seq);
2003         struct sockbuf *rcv;
2004
2005         if (tp->rcv_nxt == rcv_nxt)
2006                 return;
2007
2008         inp_lock_assert(tp->t_inpcb);
2009         so  = inp_inpcbtosocket(tp->t_inpcb);
2010         rcv = so_sockbuf_rcv(so);
2011         sockbuf_lock(rcv);
2012
2013         q = &toep->tp_ddp_state;
2014         bsp = &q->buf_state[q->cur_buf];
2015         KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
2016                 rcv_nxt, tp->rcv_nxt));
2017         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2018         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2019         CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
2020             rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
2021
2022 #ifdef T3_TRACE
2023         if ((int)m->m_pkthdr.len < 0) {
2024                 t3_ddp_error(so, "handle_ddp_data: neg len");
2025         }
2026 #endif
2027         m->m_ddp_gl = (unsigned char *)bsp->gl;
2028         m->m_flags |= M_DDP;
2029         m->m_cur_offset = bsp->cur_offset;
2030         m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2031         if (bsp->flags & DDP_BF_NOCOPY)
2032                 bsp->flags &= ~DDP_BF_NOCOPY;
2033
2034         m->m_seq = tp->rcv_nxt;
2035         tp->rcv_nxt = rcv_nxt;
2036         bsp->cur_offset += m->m_pkthdr.len;
2037         if (!(bsp->flags & DDP_BF_NOFLIP))
2038                 q->cur_buf ^= 1;
2039         /*
2040          * For now, don't re-enable DDP after a connection fell out of  DDP
2041          * mode.
2042          */
2043         q->ubuf_ddp_ready = 0;
2044         sockbuf_unlock(rcv);
2045 }
2046
2047 /*
2048  * Process new data received for a connection.
2049  */
2050 static void
2051 new_rx_data(struct toepcb *toep, struct mbuf *m)
2052 {
2053         struct cpl_rx_data *hdr = cplhdr(m);
2054         struct tcpcb *tp = toep->tp_tp;
2055         struct socket *so;
2056         struct sockbuf *rcv;
2057         int state;
2058         int len = be16toh(hdr->len);
2059
2060         inp_wlock(tp->t_inpcb);
2061
2062         so  = inp_inpcbtosocket(tp->t_inpcb);
2063
2064         if (__predict_false(so_no_receive(so))) {
2065                 handle_excess_rx(toep, m);
2066                 inp_wunlock(tp->t_inpcb);
2067                 TRACE_EXIT;
2068                 return;
2069         }
2070
2071         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2072                 handle_ddp_data(toep, m);
2073
2074         m->m_seq = ntohl(hdr->seq);
2075         m->m_ulp_mode = 0;                    /* for iSCSI */
2076
2077 #if VALIDATE_SEQ
2078         if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2079                 log(LOG_ERR,
2080                        "%s: TID %u: Bad sequence number %u, expected %u\n",
2081                     toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2082                        tp->rcv_nxt);
2083                 m_freem(m);
2084                 inp_wunlock(tp->t_inpcb);
2085                 return;
2086         }
2087 #endif
2088         m_adj(m, sizeof(*hdr));
2089
2090 #ifdef URGENT_DATA_SUPPORTED
2091         /*
2092          * We don't handle urgent data yet
2093          */
2094         if (__predict_false(hdr->urg))
2095                 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2096         if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2097                      tp->urg_seq - tp->rcv_nxt < skb->len))
2098                 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2099                                                          tp->rcv_nxt];
2100 #endif
2101         if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2102                 toep->tp_delack_mode = hdr->dack_mode;
2103                 toep->tp_delack_seq = tp->rcv_nxt;
2104         }
2105         CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2106             m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2107
2108         if (len < m->m_pkthdr.len)
2109                 m->m_pkthdr.len = m->m_len = len;
2110
2111         tp->rcv_nxt += m->m_pkthdr.len;
2112         tp->t_rcvtime = ticks;
2113         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2114         CTR2(KTR_TOM,
2115             "new_rx_data: seq 0x%x len %u",
2116             m->m_seq, m->m_pkthdr.len);
2117         inp_wunlock(tp->t_inpcb);
2118         rcv = so_sockbuf_rcv(so);
2119         sockbuf_lock(rcv);
2120 #if 0
2121         if (sb_notify(rcv))
2122                 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2123 #endif
2124         SBAPPEND(rcv, m);
2125
2126 #ifdef notyet
2127         /*
2128          * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2129          *
2130          */
2131         KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2132
2133             ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2134                 so, rcv->sb_cc, rcv->sb_mbmax));
2135 #endif
2136
2137
2138         CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2139             rcv->sb_cc, rcv->sb_mbcnt);
2140
2141         state = so_state_get(so);
2142         if (__predict_true((state & SS_NOFDREF) == 0))
2143                 so_sorwakeup_locked(so);
2144         else
2145                 sockbuf_unlock(rcv);
2146 }
2147
2148 /*
2149  * Handler for RX_DATA CPL messages.
2150  */
2151 static int
2152 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2153 {
2154         struct toepcb *toep = (struct toepcb *)ctx;
2155
2156         DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2157
2158         new_rx_data(toep, m);
2159
2160         return (0);
2161 }
2162
2163 static void
2164 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2165 {
2166         struct tcpcb *tp;
2167         struct ddp_state *q;
2168         struct ddp_buf_state *bsp;
2169         struct cpl_rx_data_ddp *hdr;
2170         struct socket *so;
2171         unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2172         int nomoredata = 0;
2173         unsigned int delack_mode;
2174         struct sockbuf *rcv;
2175
2176         tp = toep->tp_tp;
2177         inp_wlock(tp->t_inpcb);
2178         so = inp_inpcbtosocket(tp->t_inpcb);
2179
2180         if (__predict_false(so_no_receive(so))) {
2181
2182                 handle_excess_rx(toep, m);
2183                 inp_wunlock(tp->t_inpcb);
2184                 return;
2185         }
2186
2187         q = &toep->tp_ddp_state;
2188         hdr = cplhdr(m);
2189         ddp_report = ntohl(hdr->u.ddp_report);
2190         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2191         bsp = &q->buf_state[buf_idx];
2192
2193         CTR4(KTR_TOM,
2194             "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2195             "hdr seq 0x%x len %u",
2196             tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2197             ntohs(hdr->len));
2198         CTR3(KTR_TOM,
2199             "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2200             G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2201
2202         ddp_len = ntohs(hdr->len);
2203         rcv_nxt = ntohl(hdr->seq) + ddp_len;
2204
2205         delack_mode = G_DDP_DACK_MODE(ddp_report);
2206         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2207                 toep->tp_delack_mode = delack_mode;
2208                 toep->tp_delack_seq = tp->rcv_nxt;
2209         }
2210
2211         m->m_seq = tp->rcv_nxt;
2212         tp->rcv_nxt = rcv_nxt;
2213
2214         tp->t_rcvtime = ticks;
2215         /*
2216          * Store the length in m->m_len.  We are changing the meaning of
2217          * m->m_len here, we need to be very careful that nothing from now on
2218          * interprets ->len of this packet the usual way.
2219          */
2220         m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2221         inp_wunlock(tp->t_inpcb);
2222         CTR3(KTR_TOM,
2223             "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2224             m->m_len, rcv_nxt, m->m_seq);
2225         /*
2226          * Figure out where the new data was placed in the buffer and store it
2227          * in when.  Assumes the buffer offset starts at 0, consumer needs to
2228          * account for page pod's pg_offset.
2229          */
2230         end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2231         m->m_cur_offset = end_offset - m->m_pkthdr.len;
2232
2233         rcv = so_sockbuf_rcv(so);
2234         sockbuf_lock(rcv);
2235
2236         m->m_ddp_gl = (unsigned char *)bsp->gl;
2237         m->m_flags |= M_DDP;
2238         bsp->cur_offset = end_offset;
2239         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2240
2241         /*
2242          * Length is only meaningful for kbuf
2243          */
2244         if (!(bsp->flags & DDP_BF_NOCOPY))
2245                 KASSERT(m->m_len <= bsp->gl->dgl_length,
2246                     ("length received exceeds ddp pages: len=%d dgl_length=%d",
2247                         m->m_len, bsp->gl->dgl_length));
2248
2249         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2250         KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2251         /*
2252          * Bit 0 of flags stores whether the DDP buffer is completed.
2253          * Note that other parts of the code depend on this being in bit 0.
2254          */
2255         if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2256                 panic("spurious ddp completion");
2257         } else {
2258                 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2259                 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2260                         q->cur_buf ^= 1;                     /* flip buffers */
2261         }
2262
2263         if (bsp->flags & DDP_BF_NOCOPY) {
2264                 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2265                 bsp->flags &= ~DDP_BF_NOCOPY;
2266         }
2267
2268         if (ddp_report & F_DDP_PSH)
2269                 m->m_ddp_flags |= DDP_BF_PSH;
2270         if (nomoredata)
2271                 m->m_ddp_flags |= DDP_BF_NODATA;
2272
2273 #ifdef notyet
2274         skb_reset_transport_header(skb);
2275         tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2276 #endif
2277         SBAPPEND(rcv, m);
2278
2279         if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2280             (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2281                 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2282                 so_sorwakeup_locked(so);
2283         else
2284                 sockbuf_unlock(rcv);
2285 }
2286
2287 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
2288                  F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
2289                  F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
2290                  F_DDP_INVALID_PPOD)
2291
2292 /*
2293  * Handler for RX_DATA_DDP CPL messages.
2294  */
2295 static int
2296 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2297 {
2298         struct toepcb *toep = ctx;
2299         const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2300
2301         VALIDATE_SOCK(so);
2302
2303         if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2304                 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2305                        GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2306                 return (CPL_RET_BUF_DONE);
2307         }
2308 #if 0
2309         skb->h.th = tcphdr_skb->h.th;
2310 #endif
2311         new_rx_data_ddp(toep, m);
2312         return (0);
2313 }
2314
2315 static void
2316 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2317 {
2318         struct tcpcb *tp = toep->tp_tp;
2319         struct socket *so;
2320         struct ddp_state *q;
2321         struct ddp_buf_state *bsp;
2322         struct cpl_rx_ddp_complete *hdr;
2323         unsigned int ddp_report, buf_idx, when, delack_mode;
2324         int nomoredata = 0;
2325         struct sockbuf *rcv;
2326
2327         inp_wlock(tp->t_inpcb);
2328         so = inp_inpcbtosocket(tp->t_inpcb);
2329
2330         if (__predict_false(so_no_receive(so))) {
2331                 struct inpcb *inp = so_sotoinpcb(so);
2332
2333                 handle_excess_rx(toep, m);
2334                 inp_wunlock(inp);
2335                 return;
2336         }
2337         q = &toep->tp_ddp_state;
2338         hdr = cplhdr(m);
2339         ddp_report = ntohl(hdr->ddp_report);
2340         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2341         m->m_pkthdr.csum_data = tp->rcv_nxt;
2342
2343         rcv = so_sockbuf_rcv(so);
2344         sockbuf_lock(rcv);
2345
2346         bsp = &q->buf_state[buf_idx];
2347         when = bsp->cur_offset;
2348         m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2349         tp->rcv_nxt += m->m_len;
2350         tp->t_rcvtime = ticks;
2351
2352         delack_mode = G_DDP_DACK_MODE(ddp_report);
2353         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2354                 toep->tp_delack_mode = delack_mode;
2355                 toep->tp_delack_seq = tp->rcv_nxt;
2356         }
2357 #ifdef notyet
2358         skb_reset_transport_header(skb);
2359         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2360 #endif
2361         inp_wunlock(tp->t_inpcb);
2362
2363         KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2364         CTR5(KTR_TOM,
2365                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2366                   "ddp_report 0x%x offset %u, len %u",
2367                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2368                    G_DDP_OFFSET(ddp_report), m->m_len);
2369
2370         m->m_cur_offset = bsp->cur_offset;
2371         bsp->cur_offset += m->m_len;
2372
2373         if (!(bsp->flags & DDP_BF_NOFLIP)) {
2374                 q->cur_buf ^= 1;                     /* flip buffers */
2375                 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2376                         nomoredata=1;
2377         }
2378
2379         CTR4(KTR_TOM,
2380                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2381                   "ddp_report %u offset %u",
2382                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2383                    G_DDP_OFFSET(ddp_report));
2384
2385         m->m_ddp_gl = (unsigned char *)bsp->gl;
2386         m->m_flags |= M_DDP;
2387         m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2388         if (bsp->flags & DDP_BF_NOCOPY)
2389                 bsp->flags &= ~DDP_BF_NOCOPY;
2390         if (nomoredata)
2391                 m->m_ddp_flags |= DDP_BF_NODATA;
2392
2393         SBAPPEND(rcv, m);
2394         if ((so_state_get(so) & SS_NOFDREF) == 0)
2395                 so_sorwakeup_locked(so);
2396         else
2397                 sockbuf_unlock(rcv);
2398 }
2399
2400 /*
2401  * Handler for RX_DDP_COMPLETE CPL messages.
2402  */
2403 static int
2404 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2405 {
2406         struct toepcb *toep = ctx;
2407
2408         VALIDATE_SOCK(so);
2409 #if 0
2410         skb->h.th = tcphdr_skb->h.th;
2411 #endif
2412         process_ddp_complete(toep, m);
2413         return (0);
2414 }
2415
2416 /*
2417  * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2418  * socket state before calling tcp_time_wait to comply with its expectations.
2419  */
2420 static void
2421 enter_timewait(struct tcpcb *tp)
2422 {
2423         /*
2424          * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2425          * process peer_close because we don't want to carry the peer FIN in
2426          * the socket's receive queue and if we increment rcv_nxt without
2427          * having the FIN in the receive queue we'll confuse facilities such
2428          * as SIOCINQ.
2429          */
2430         inp_wlock(tp->t_inpcb);
2431         tp->rcv_nxt++;
2432
2433         tp->ts_recent_age = 0;       /* defeat recycling */
2434         tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2435         inp_wunlock(tp->t_inpcb);
2436         tcp_offload_twstart(tp);
2437 }
2438
2439 /*
2440  * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2441  * function deals with the data that may be reported along with the FIN.
2442  * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2443  * perform normal FIN-related processing.  In the latter case 1 indicates that
2444  * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2445  * skb can be freed.
2446  */
2447 static int
2448 handle_peer_close_data(struct socket *so, struct mbuf *m)
2449 {
2450         struct tcpcb *tp = so_sototcpcb(so);
2451         struct toepcb *toep = tp->t_toe;
2452         struct ddp_state *q;
2453         struct ddp_buf_state *bsp;
2454         struct cpl_peer_close *req = cplhdr(m);
2455         unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2456         struct sockbuf *rcv;
2457
2458         if (tp->rcv_nxt == rcv_nxt)                     /* no data */
2459                 return (0);
2460
2461         CTR0(KTR_TOM, "handle_peer_close_data");
2462         if (__predict_false(so_no_receive(so))) {
2463                 handle_excess_rx(toep, m);
2464
2465                 /*
2466                  * Although we discard the data we want to process the FIN so
2467                  * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2468                  * PEER_CLOSE without data.  In particular this PEER_CLOSE
2469                  * may be what will close the connection.  We return 1 because
2470                  * handle_excess_rx() already freed the packet.
2471                  */
2472                 return (1);
2473         }
2474
2475         inp_lock_assert(tp->t_inpcb);
2476         q = &toep->tp_ddp_state;
2477         rcv = so_sockbuf_rcv(so);
2478         sockbuf_lock(rcv);
2479
2480         bsp = &q->buf_state[q->cur_buf];
2481         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2482         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2483         m->m_ddp_gl = (unsigned char *)bsp->gl;
2484         m->m_flags |= M_DDP;
2485         m->m_cur_offset = bsp->cur_offset;
2486         m->m_ddp_flags =
2487             DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2488         m->m_seq = tp->rcv_nxt;
2489         tp->rcv_nxt = rcv_nxt;
2490         bsp->cur_offset += m->m_pkthdr.len;
2491         if (!(bsp->flags & DDP_BF_NOFLIP))
2492                 q->cur_buf ^= 1;
2493 #ifdef notyet
2494         skb_reset_transport_header(skb);
2495         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2496 #endif
2497         tp->t_rcvtime = ticks;
2498         SBAPPEND(rcv, m);
2499         if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2500                 so_sorwakeup_locked(so);
2501         else
2502                 sockbuf_unlock(rcv);
2503
2504         return (1);
2505 }
2506
2507 /*
2508  * Handle a peer FIN.
2509  */
2510 static void
2511 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2512 {
2513         struct socket *so;
2514         struct tcpcb *tp = toep->tp_tp;
2515         int keep, action;
2516
2517         action = keep = 0;
2518         CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2519         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2520                 printf("abort_pending set\n");
2521
2522                 goto out;
2523         }
2524         inp_wlock(tp->t_inpcb);
2525         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2526         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2527                 keep = handle_peer_close_data(so, m);
2528                 if (keep < 0) {
2529                         inp_wunlock(tp->t_inpcb);
2530                         return;
2531                 }
2532         }
2533         if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2534                 CTR1(KTR_TOM,
2535                     "waking up waiters for cantrcvmore on %p ", so);
2536                 socantrcvmore(so);
2537
2538                 /*
2539                  * If connection is half-synchronized
2540                  * (ie NEEDSYN flag on) then delay ACK,
2541                  * so it may be piggybacked when SYN is sent.
2542                  * Otherwise, since we received a FIN then no
2543                  * more input can be expected, send ACK now.
2544                  */
2545                 if (tp->t_flags & TF_NEEDSYN)
2546                         tp->t_flags |= TF_DELACK;
2547                 else
2548                         tp->t_flags |= TF_ACKNOW;
2549                 tp->rcv_nxt++;
2550         }
2551
2552         switch (tp->t_state) {
2553         case TCPS_SYN_RECEIVED:
2554             tp->t_starttime = ticks;
2555         /* FALLTHROUGH */
2556         case TCPS_ESTABLISHED:
2557                 tp->t_state = TCPS_CLOSE_WAIT;
2558                 break;
2559         case TCPS_FIN_WAIT_1:
2560                 tp->t_state = TCPS_CLOSING;
2561                 break;
2562         case TCPS_FIN_WAIT_2:
2563                 /*
2564                  * If we've sent an abort_req we must have sent it too late,
2565                  * HW will send us a reply telling us so, and this peer_close
2566                  * is really the last message for this connection and needs to
2567                  * be treated as an abort_rpl, i.e., transition the connection
2568                  * to TCP_CLOSE (note that the host stack does this at the
2569                  * time of generating the RST but we must wait for HW).
2570                  * Otherwise we enter TIME_WAIT.
2571                  */
2572                 t3_release_offload_resources(toep);
2573                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2574                         action = TCP_CLOSE;
2575                 } else {
2576                         action = TCP_TIMEWAIT;
2577                 }
2578                 break;
2579         default:
2580                 log(LOG_ERR,
2581                        "%s: TID %u received PEER_CLOSE in bad state %d\n",
2582                     toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2583         }
2584         inp_wunlock(tp->t_inpcb);
2585
2586         if (action == TCP_TIMEWAIT) {
2587                 enter_timewait(tp);
2588         } else if (action == TCP_DROP) {
2589                 tcp_offload_drop(tp, 0);
2590         } else if (action == TCP_CLOSE) {
2591                 tcp_offload_close(tp);
2592         }
2593
2594 #ifdef notyet
2595         /* Do not send POLL_HUP for half duplex close. */
2596         if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2597             sk->sk_state == TCP_CLOSE)
2598                 sk_wake_async(so, 1, POLL_HUP);
2599         else
2600                 sk_wake_async(so, 1, POLL_IN);
2601 #endif
2602
2603 out:
2604         if (!keep)
2605                 m_free(m);
2606 }
2607
2608 /*
2609  * Handler for PEER_CLOSE CPL messages.
2610  */
2611 static int
2612 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2613 {
2614         struct toepcb *toep = (struct toepcb *)ctx;
2615
2616         VALIDATE_SOCK(so);
2617
2618         do_peer_fin(toep, m);
2619         return (0);
2620 }
2621
2622 static void
2623 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2624 {
2625         struct cpl_close_con_rpl *rpl = cplhdr(m);
2626         struct tcpcb *tp = toep->tp_tp;
2627         struct socket *so;
2628         int action = 0;
2629         struct sockbuf *rcv;
2630
2631         inp_wlock(tp->t_inpcb);
2632         so = inp_inpcbtosocket(tp->t_inpcb);
2633
2634         tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2635
2636         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2637                 inp_wunlock(tp->t_inpcb);
2638                 goto out;
2639         }
2640
2641         CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2642             tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2643
2644         switch (tp->t_state) {
2645         case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2646                 t3_release_offload_resources(toep);
2647                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2648                         action = TCP_CLOSE;
2649
2650                 } else {
2651                         action = TCP_TIMEWAIT;
2652                 }
2653                 break;
2654         case TCPS_LAST_ACK:
2655                 /*
2656                  * In this state we don't care about pending abort_rpl.
2657                  * If we've sent abort_req it was post-close and was sent too
2658                  * late, this close_con_rpl is the actual last message.
2659                  */
2660                 t3_release_offload_resources(toep);
2661                 action = TCP_CLOSE;
2662                 break;
2663         case TCPS_FIN_WAIT_1:
2664                 /*
2665                  * If we can't receive any more
2666                  * data, then closing user can proceed.
2667                  * Starting the timer is contrary to the
2668                  * specification, but if we don't get a FIN
2669                  * we'll hang forever.
2670                  *
2671                  * XXXjl:
2672                  * we should release the tp also, and use a
2673                  * compressed state.
2674                  */
2675                 if (so)
2676                         rcv = so_sockbuf_rcv(so);
2677                 else
2678                         break;
2679
2680                 if (rcv->sb_state & SBS_CANTRCVMORE) {
2681                         int timeout;
2682
2683                         if (so)
2684                                 soisdisconnected(so);
2685                         timeout = (tcp_fast_finwait2_recycle) ?
2686                             tcp_finwait2_timeout : tcp_maxidle;
2687                         tcp_timer_activate(tp, TT_2MSL, timeout);
2688                 }
2689                 tp->t_state = TCPS_FIN_WAIT_2;
2690                 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2691                     (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2692                         action = TCP_DROP;
2693                 }
2694
2695                 break;
2696         default:
2697                 log(LOG_ERR,
2698                        "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2699                        toep->tp_toedev->tod_name, toep->tp_tid,
2700                        tp->t_state);
2701         }
2702         inp_wunlock(tp->t_inpcb);
2703
2704
2705         if (action == TCP_TIMEWAIT) {
2706                 enter_timewait(tp);
2707         } else if (action == TCP_DROP) {
2708                 tcp_offload_drop(tp, 0);
2709         } else if (action == TCP_CLOSE) {
2710                 tcp_offload_close(tp);
2711         }
2712 out:
2713         m_freem(m);
2714 }
2715
/*
 * CPL handler for CLOSE_CON_RPL messages.
 *
 * ctx is the connection's toepcb.  The work, including freeing the mbuf,
 * is done by process_close_con_rpl().  Always returns 0.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
        struct toepcb *toep = ctx;

        process_close_con_rpl(toep, m);

        return (0);
}
2728
2729 /*
2730  * Process abort replies.  We only process these messages if we anticipate
2731  * them as the coordination between SW and HW in this area is somewhat lacking
2732  * and sometimes we get ABORT_RPLs after we are done with the connection that
2733  * originated the ABORT_REQ.
2734  */
2735 static void
2736 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2737 {
2738         struct tcpcb *tp = toep->tp_tp;
2739         struct socket *so;
2740         int needclose = 0;
2741
2742 #ifdef T3_TRACE
2743         T3_TRACE1(TIDTB(sk),
2744                   "process_abort_rpl: GTS rpl pending %d",
2745                   sock_flag(sk, ABORT_RPL_PENDING));
2746 #endif
2747
2748         inp_wlock(tp->t_inpcb);
2749         so = inp_inpcbtosocket(tp->t_inpcb);
2750
2751         if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2752                 /*
2753                  * XXX panic on tcpdrop
2754                  */
2755                 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2756                         toep->tp_flags |= TP_ABORT_RPL_RCVD;
2757                 else {
2758                         toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2759                         if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2760                             !is_t3a(toep->tp_toedev)) {
2761                                 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2762                                         panic("TP_ABORT_REQ_RCVD set");
2763                                 t3_release_offload_resources(toep);
2764                                 needclose = 1;
2765                         }
2766                 }
2767         }
2768         inp_wunlock(tp->t_inpcb);
2769
2770         if (needclose)
2771                 tcp_offload_close(tp);
2772
2773         m_free(m);
2774 }
2775
2776 /*
2777  * Handle an ABORT_RPL_RSS CPL message.
2778  */
2779 static int
2780 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2781 {
2782         struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2783         struct toepcb *toep;
2784
2785         /*
2786          * Ignore replies to post-close aborts indicating that the abort was
2787          * requested too late.  These connections are terminated when we get
2788          * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2789          * arrives the TID is either no longer used or it has been recycled.
2790          */
2791         if (rpl->status == CPL_ERR_ABORT_FAILED) {
2792 discard:
2793                 m_free(m);
2794                 return (0);
2795         }
2796
2797         toep = (struct toepcb *)ctx;
2798
2799         /*
2800          * Sometimes we've already closed the socket, e.g., a post-close
2801          * abort races with ABORT_REQ_RSS, the latter frees the socket
2802          * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2803          * but FW turns the ABORT_REQ into a regular one and so we get
2804          * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2805          */
2806         if (!toep)
2807                 goto discard;
2808
2809         if (toep->tp_tp == NULL) {
2810                 log(LOG_NOTICE, "removing tid for abort\n");
2811                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2812                 if (toep->tp_l2t)
2813                         l2t_release(L2DATA(cdev), toep->tp_l2t);
2814
2815                 toepcb_release(toep);
2816                 goto discard;
2817         }
2818
2819         log(LOG_NOTICE, "toep=%p\n", toep);
2820         log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2821
2822         toepcb_hold(toep);
2823         process_abort_rpl(toep, m);
2824         toepcb_release(toep);
2825         return (0);
2826 }
2827
2828 /*
2829  * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2830  * indicate whether RST should be sent in response.
2831  */
2832 static int
2833 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2834 {
2835         struct tcpcb *tp = so_sototcpcb(so);
2836
2837         switch (abort_reason) {
2838         case CPL_ERR_BAD_SYN:
2839 #if 0
2840                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);      // fall through
2841 #endif
2842         case CPL_ERR_CONN_RESET:
2843                 // XXX need to handle SYN_RECV due to crossed SYNs
2844                 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2845         case CPL_ERR_XMIT_TIMEDOUT:
2846         case CPL_ERR_PERSIST_TIMEDOUT:
2847         case CPL_ERR_FINWAIT2_TIMEDOUT:
2848         case CPL_ERR_KEEPALIVE_TIMEDOUT:
2849 #if 0
2850                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2851 #endif
2852                 return (ETIMEDOUT);
2853         default:
2854                 return (EIO);
2855         }
2856 }
2857
2858 static inline void
2859 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2860 {
2861         struct cpl_abort_rpl *rpl = cplhdr(m);
2862
2863         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2864         rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2865         m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2866
2867         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2868         rpl->cmd = cmd;
2869 }
2870
2871 static void
2872 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2873 {
2874         struct mbuf *reply_mbuf;
2875         struct cpl_abort_req_rss *req = cplhdr(m);
2876
2877         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2878         m_set_priority(m, CPL_PRIORITY_DATA);
2879         m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2880         set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2881         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2882         m_free(m);
2883 }
2884
2885 /*
2886  * Returns whether an ABORT_REQ_RSS message is a negative advice.
2887  */
2888 static inline int
2889 is_neg_adv_abort(unsigned int status)
2890 {
2891         return status == CPL_ERR_RTX_NEG_ADVICE ||
2892             status == CPL_ERR_PERSIST_NEG_ADVICE;
2893 }
2894
2895 static void
2896 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2897 {
2898         struct mbuf  *reply_mbuf;
2899         struct cpl_abort_req_rss *req = cplhdr(m);
2900
2901         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2902
2903         if (!reply_mbuf) {
2904                 /* Defer the reply.  Stick rst_status into req->cmd. */
2905                 req->status = rst_status;
2906                 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2907                 return;
2908         }
2909
2910         m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2911         set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2912         m_free(m);
2913
2914         /*
2915          * XXX need to sync with ARP as for SYN_RECV connections we can send
2916          * these messages while ARP is pending.  For other connection states
2917          * it's not a problem.
2918          */
2919         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2920 }
2921
2922 #ifdef notyet
/*
 * Remove a SYN_RECV child from its parent's queue.
 *
 * Not ported yet: the body is the original Linux code, kept disabled for
 * reference, and the function only invokes CXGB_UNIMPLEMENTED().
 */
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
        CXGB_UNIMPLEMENTED();
#if defined(notyet)
        struct request_sock *req = child->sk_user_data;

        inet_csk_reqsk_queue_removed(parent, req);
        synq_remove(tcp_sk(child));
        __reqsk_free(req);
        child->sk_user_data = NULL;
#endif
}
2936
2937
2938 /*
2939  * Performs the actual work to abort a SYN_RECV connection.
2940  */
2941 static void
2942 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2943 {
2944         struct tcpcb *parenttp = so_sototcpcb(parent);
2945         struct tcpcb *childtp = so_sototcpcb(child);
2946
2947         /*
2948          * If the server is still open we clean up the child connection,
2949          * otherwise the server already did the clean up as it was purging
2950          * its SYN queue and the skb was just sitting in its backlog.
2951          */
2952         if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2953                 cleanup_syn_rcv_conn(child, parent);
2954                 inp_wlock(childtp->t_inpcb);
2955                 t3_release_offload_resources(childtp->t_toe);
2956                 inp_wunlock(childtp->t_inpcb);
2957                 tcp_offload_close(childtp);
2958         }
2959 }
2960 #endif
2961
/*
 * Handle an abort request for a SYN_RECV connection.  These need extra work
 * because the socket sits on its parent's SYN queue.
 *
 * Not ported yet: the function invokes CXGB_UNIMPLEMENTED() and returns 0;
 * the intended logic is kept disabled under "notyet".
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();
#if defined(notyet)
        struct socket *parent;
        struct toedev *tdev = toep->tp_toedev;
        struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
        struct socket *oreq = so->so_incomp;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;

        if (!oreq)
                return (-1);      /* somehow we are not on the SYN queue */

        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, oreq->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        so_lock(parent);
        do_abort_syn_rcv(so, parent);
        send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
        so_unlock(parent);
#endif
        return (0);
}
2992
2993 /*
2994  * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2995  * request except that we need to reply to it.
2996  */
2997 static void
2998 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2999 {
3000         int rst_status = CPL_ABORT_NO_RST;
3001         const struct cpl_abort_req_rss *req = cplhdr(m);
3002         struct tcpcb *tp = toep->tp_tp;
3003         struct socket *so;
3004         int needclose = 0;
3005
3006         inp_wlock(tp->t_inpcb);
3007         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3008         if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3009                 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3010                 m_free(m);
3011                 goto skip;
3012         }
3013
3014         toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3015         /*
3016          * Three cases to consider:
3017          * a) We haven't sent an abort_req; close the connection.
3018          * b) We have sent a post-close abort_req that will get to TP too late
3019          *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3020          *    be ignored and the connection should be closed now.
3021          * c) We have sent a regular abort_req that will get to TP too late.
3022          *    That will generate an abort_rpl with status 0, wait for it.
3023          */
3024         if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3025             (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3026                 int error;
3027
3028                 error = abort_status_to_errno(so, req->status,
3029                     &rst_status);
3030                 so_error_set(so, error);
3031
3032                 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3033                         so_sorwakeup(so);
3034                 /*
3035                  * SYN_RECV needs special processing.  If abort_syn_rcv()
3036                  * returns 0 is has taken care of the abort.
3037                  */
3038                 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3039                         goto skip;
3040
3041                 t3_release_offload_resources(toep);
3042                 needclose = 1;
3043         }
3044         inp_wunlock(tp->t_inpcb);
3045
3046         if (needclose)
3047                 tcp_offload_close(tp);
3048
3049         send_abort_rpl(m, tdev, rst_status);
3050         return;
3051 skip:
3052         inp_wunlock(tp->t_inpcb);
3053 }
3054
3055 /*
3056  * Handle an ABORT_REQ_RSS CPL message.
3057  */
3058 static int
3059 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3060 {
3061         const struct cpl_abort_req_rss *req = cplhdr(m);
3062         struct toepcb *toep = (struct toepcb *)ctx;
3063
3064         if (is_neg_adv_abort(req->status)) {
3065                 m_free(m);
3066                 return (0);
3067         }
3068
3069         log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3070
3071         if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3072                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3073                 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3074
3075                 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3076                 if (toep->tp_l2t)
3077                         l2t_release(L2DATA(cdev), toep->tp_l2t);
3078
3079                 /*
3080                  *  Unhook
3081                  */
3082                 toep->tp_tp->t_toe = NULL;
3083                 toep->tp_tp->t_flags &= ~TF_TOE;
3084                 toep->tp_tp = NULL;
3085                 /*
3086                  * XXX need to call syncache_chkrst - but we don't
3087                  * have a way of doing that yet
3088                  */
3089                 toepcb_release(toep);
3090                 log(LOG_ERR, "abort for unestablished connection :-(\n");
3091                 return (0);
3092         }
3093         if (toep->tp_tp == NULL) {
3094                 log(LOG_NOTICE, "disconnected toepcb\n");
3095                 /* should be freed momentarily */
3096                 return (0);
3097         }
3098
3099
3100         toepcb_hold(toep);
3101         process_abort_req(toep, m, toep->tp_toedev);
3102         toepcb_release(toep);
3103         return (0);
3104 }
3105 #ifdef notyet
3106 static void
3107 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3108 {
3109         struct toedev *tdev = TOE_DEV(parent);
3110
3111         do_abort_syn_rcv(child, parent);
3112         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3113                 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3114
3115                 rpl->opt0h = htonl(F_TCAM_BYPASS);
3116                 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3117                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3118         } else
3119                 m_free(m);
3120 }
3121 #endif
/*
 * Tear down a passively opened connection after an ARP failure.
 *
 * Not ported yet: the function only invokes CXGB_UNIMPLEMENTED(); the
 * intended logic is kept disabled under "notyet".
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();

#if defined(notyet)
        struct t3cdev *cdev;
        struct socket *parent;
        struct socket *oreq;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;
        struct tcpcb *otp, *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;

        /*
         * Nothing to do if the connection is already being aborted because
         * the parent listening socket is going away: the ABORT_REQ will
         * close the connection.
         */
        if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
                m_free(m);
                return;
        }

        oreq = so->so_incomp;
        otp = so_sototcpcb(oreq);

        cdev = T3C_DEV(so);
        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, otp->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        so_lock(parent);
        pass_open_abort(so, parent, m);
        so_unlock(parent);
#endif
}
3159
/*
 * ARP-failure handler for a CPL_PASS_ACCEPT_RPL.  The event is treated much
 * like an ABORT_REQ_RSS in SYN_RECV, since both have to tear down a SYN_RECV
 * connection.  The socket is looked up from the mbuf and the work is passed
 * to handle_pass_open_arp_failure().
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
        struct socket *so;

#ifdef notyet
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
        so = m_get_socket(m);
        handle_pass_open_arp_failure(so, m);
}
3175
3176 /*
3177  * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3178  */
3179 static void
3180 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3181 {
3182         struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3183         struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3184         unsigned int tid = GET_TID(req);
3185
3186         m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3187         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3188         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3189         rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3190         rpl->opt0h = htonl(F_TCAM_BYPASS);
3191         rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3192         rpl->opt2 = 0;
3193         rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3194 }
3195
3196 /*
3197  * Send a deferred reject to an accept request.
3198  */
3199 static void
3200 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3201 {
3202         struct mbuf *reply_mbuf;
3203
3204         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3205         mk_pass_accept_rpl(reply_mbuf, m);
3206         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3207         m_free(m);
3208 }
3209
3210 static void
3211 handle_syncache_event(int event, void *arg)
3212 {
3213         struct toepcb *toep = arg;
3214
3215         switch (event) {
3216         case TOE_SC_ENTRY_PRESENT:
3217                 /*
3218                  * entry already exists - free toepcb
3219                  * and l2t
3220                  */
3221                 printf("syncache entry present\n");
3222                 toepcb_release(toep);
3223                 break;
3224         case TOE_SC_DROP:
3225                 /*
3226                  * The syncache has given up on this entry
3227                  * either it timed out, or it was evicted
3228                  * we need to explicitly release the tid
3229                  */
3230                 printf("syncache entry dropped\n");
3231                 toepcb_release(toep);
3232                 break;
3233         default:
3234                 log(LOG_ERR, "unknown syncache event %d\n", event);
3235                 break;
3236         }
3237 }
3238
3239 static void
3240 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3241 {
3242         struct in_conninfo inc;
3243         struct tcpopt to;
3244         struct tcphdr th;
3245         struct inpcb *inp;
3246         int mss, wsf, sack, ts;
3247         uint32_t rcv_isn = ntohl(req->rcv_isn);
3248
3249         bzero(&to, sizeof(struct tcpopt));
3250         inp = so_sotoinpcb(lso);
3251
3252         /*
3253          * Fill out information for entering us into the syncache
3254          */
3255         inc.inc_fport = th.th_sport = req->peer_port;
3256         inc.inc_lport = th.th_dport = req->local_port;
3257         th.th_seq = req->rcv_isn;
3258         th.th_flags = TH_SYN;
3259
3260         toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3261
3262
3263         inc.inc_isipv6 = 0;
3264         inc.inc_len = 0;
3265         inc.inc_faddr.s_addr = req->peer_ip;
3266         inc.inc_laddr.s_addr = req->local_ip;
3267
3268         DPRINTF("syncache add of %d:%d %d:%d\n",
3269             ntohl(req->local_ip), ntohs(req->local_port),
3270             ntohl(req->peer_ip), ntohs(req->peer_port));
3271
3272         mss = req->tcp_options.mss;
3273         wsf = req->tcp_options.wsf;
3274         ts = req->tcp_options.tstamp;
3275         sack = req->tcp_options.sack;
3276         to.to_mss = mss;
3277         to.to_wscale = wsf;
3278         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3279         tcp_offload_syncache_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3280 }
3281
3282
3283 /*
3284  * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3285  * lock held.  Note that the sock here is a listening socket that is not owned
3286  * by the TOE.
3287  */
3288 static void
3289 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3290     struct listen_ctx *lctx)
3291 {
3292         int rt_flags;
3293         struct l2t_entry *e;
3294         struct iff_mac tim;
3295         struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3296         struct cpl_pass_accept_rpl *rpl;
3297         struct cpl_pass_accept_req *req = cplhdr(m);
3298         unsigned int tid = GET_TID(req);
3299         struct tom_data *d = TOM_DATA(tdev);
3300         struct t3cdev *cdev = d->cdev;
3301         struct tcpcb *tp = so_sototcpcb(so);
3302         struct toepcb *newtoep;
3303         struct rtentry *dst;
3304         struct sockaddr_in nam;
3305         struct t3c_data *td = T3C_DATA(cdev);
3306
3307         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3308         if (__predict_false(reply_mbuf == NULL)) {
3309                 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3310                         t3_defer_reply(m, tdev, reject_pass_request);
3311                 else {
3312                         cxgb_queue_tid_release(cdev, tid);
3313                         m_free(m);
3314                 }
3315                 DPRINTF("failed to get reply_mbuf\n");
3316
3317                 goto out;
3318         }
3319
3320         if (tp->t_state != TCPS_LISTEN) {
3321                 DPRINTF("socket not in listen state\n");
3322
3323                 goto reject;
3324         }
3325
3326         tim.mac_addr = req->dst_mac;
3327         tim.vlan_tag = ntohs(req->vlan_tag);
3328         if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3329                 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3330                 goto reject;
3331         }
3332
3333 #ifdef notyet
3334         /*
3335          * XXX do route lookup to confirm that we're still listening on this
3336          * address
3337          */
3338         if (ip_route_input(skb, req->local_ip, req->peer_ip,
3339                            G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3340                 goto reject;
3341         rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3342                 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3343         dst_release(skb->dst);  // done with the input route, release it
3344         skb->dst = NULL;
3345
3346         if ((rt_flags & RTF_LOCAL) == 0)
3347                 goto reject;
3348 #endif
3349         /*
3350          * XXX
3351          */
3352         rt_flags = RTF_LOCAL;
3353         if ((rt_flags & RTF_LOCAL) == 0)
3354                 goto reject;
3355
3356         /*
3357          * Calculate values and add to syncache
3358          */
3359
3360         newtoep = toepcb_alloc();
3361         if (newtoep == NULL)
3362                 goto reject;
3363
3364         bzero(&nam, sizeof(struct sockaddr_in));
3365
3366         nam.sin_len = sizeof(struct sockaddr_in);
3367         nam.sin_family = AF_INET;
3368         nam.sin_addr.s_addr =req->peer_ip;
3369         dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3370
3371         if (dst == NULL) {
3372                 printf("failed to find route\n");
3373                 goto reject;
3374         }
3375         e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3376             (struct sockaddr *)&nam);
3377         if (e == NULL) {
3378                 DPRINTF("failed to get l2t\n");
3379         }
3380         /*
3381          * Point to our listen socket until accept
3382          */
3383         newtoep->tp_tp = tp;
3384         newtoep->tp_flags = TP_SYN_RCVD;
3385         newtoep->tp_tid = tid;
3386         newtoep->tp_toedev = tdev;
3387         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3388
3389         cxgb_insert_tid(cdev, d->client, newtoep, tid);
3390         so_lock(so);
3391         LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3392         so_unlock(so);
3393
3394         newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3395                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3396
3397         if (newtoep->tp_ulp_mode) {
3398                 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3399
3400                 if (ddp_mbuf == NULL)
3401                         newtoep->tp_ulp_mode = 0;
3402         }
3403
3404         CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3405             TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3406         set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3407         /*
3408          * XXX workaround for lack of syncache drop
3409          */
3410         toepcb_hold(newtoep);
3411         syncache_add_accept_req(req, so, newtoep);
3412
3413         rpl = cplhdr(reply_mbuf);
3414         reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3415         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3416         rpl->wr.wr_lo = 0;
3417         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3418         rpl->opt2 = htonl(calc_opt2(so, tdev));
3419         rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3420         rpl->peer_ip = req->peer_ip;    // req->peer_ip is not overwritten
3421
3422         rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3423             V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3424         rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3425                                   CPL_PASS_OPEN_ACCEPT);
3426
3427         DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3428
3429         m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3430
3431         l2t_send(cdev, reply_mbuf, e);
3432         m_free(m);
3433         if (newtoep->tp_ulp_mode) {
3434                 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3435                                 V_TF_DDP_OFF(1) |
3436                                 TP_DDP_TIMER_WORKAROUND_MASK,
3437                                 V_TF_DDP_OFF(1) |
3438                     TP_DDP_TIMER_WORKAROUND_VAL, 1);
3439         } else
3440                 printf("not offloading\n");
3441
3442
3443
3444         return;
3445 reject:
3446         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3447                 mk_pass_accept_rpl(reply_mbuf, m);
3448         else
3449                 mk_tid_release(reply_mbuf, newtoep, tid);
3450         cxgb_ofld_send(cdev, reply_mbuf);
3451         m_free(m);
3452 out:
3453 #if 0
3454         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3455 #else
3456         return;
3457 #endif
3458 }
3459
3460 /*
3461  * Handle a CPL_PASS_ACCEPT_REQ message.
3462  */
3463 static int
3464 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3465 {
3466         struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3467         struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3468         struct tom_data *d = listen_ctx->tom_data;
3469
3470 #if VALIDATE_TID
3471         struct cpl_pass_accept_req *req = cplhdr(m);
3472         unsigned int tid = GET_TID(req);
3473         struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3474
3475         if (unlikely(!lsk)) {
3476                 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3477                        cdev->name,
3478                        (unsigned long)((union listen_entry *)ctx -
3479                                         t->stid_tab));
3480                 return CPL_RET_BUF_DONE;
3481         }
3482         if (unlikely(tid >= t->ntids)) {
3483                 printk(KERN_ERR "%s: passive open TID %u too large\n",
3484                        cdev->name, tid);
3485                 return CPL_RET_BUF_DONE;
3486         }
3487         /*
3488          * For T3A the current user of the TID may have closed but its last
3489          * message(s) may have been backlogged so the TID appears to be still
3490          * in use.  Just take the TID away, the connection can close at its
3491          * own leisure.  For T3B this situation is a bug.
3492          */
3493         if (!valid_new_tid(t, tid) &&
3494             cdev->type != T3A) {
3495                 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3496                        cdev->name, tid);
3497                 return CPL_RET_BUF_DONE;
3498         }
3499 #endif
3500
3501         process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3502         return (0);
3503 }
3504
3505 /*
3506  * Called when a connection is established to translate the TCP options
3507  * reported by HW to FreeBSD's native format.
3508  */
3509 static void
3510 assign_rxopt(struct socket *so, unsigned int opt)
3511 {
3512         struct tcpcb *tp = so_sototcpcb(so);
3513         struct toepcb *toep = tp->t_toe;
3514         const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3515
3516         inp_lock_assert(tp->t_inpcb);
3517
3518         toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3519         tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3520         tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3521         tp->t_flags         |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3522         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3523             (TF_RCVD_SCALE|TF_REQ_SCALE))
3524                 tp->rcv_scale = tp->request_r_scale;
3525 }
3526
3527 /*
3528  * Completes some final bits of initialization for just established connections
3529  * and changes their state to TCP_ESTABLISHED.
3530  *
3531  * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3532  */
3533 static void
3534 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3535 {
3536         struct tcpcb *tp = so_sototcpcb(so);
3537         struct toepcb *toep = tp->t_toe;
3538
3539         toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3540         assign_rxopt(so, opt);
3541
3542         /*
3543          *XXXXXXXXXXX
3544          *
3545          */
3546 #ifdef notyet
3547         so->so_proto->pr_ctloutput = t3_ctloutput;
3548 #endif
3549
3550 #if 0
3551         inet_sk(sk)->id = tp->write_seq ^ jiffies;
3552 #endif
3553         /*
3554          * XXX not clear what rcv_wup maps to
3555          */
3556         /*
3557          * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3558          * pass through opt0.
3559          */
3560         if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3561                 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3562
3563         dump_toepcb(toep);
3564
3565 #ifdef notyet
3566 /*
3567  * no clean interface for marking ARP up to date
3568  */
3569         dst_confirm(sk->sk_dst_cache);
3570 #endif
3571         tp->t_starttime = ticks;
3572         tp->t_state = TCPS_ESTABLISHED;
3573         soisconnected(so);
3574 }
3575
3576 static int
3577 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3578 {
3579
3580         struct in_conninfo inc;
3581         struct tcpopt to;
3582         struct tcphdr th;
3583         int mss, wsf, sack, ts;
3584         struct mbuf *m = NULL;
3585         const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3586         unsigned int opt;
3587
3588 #ifdef MAC
3589 #error  "no MAC support"
3590 #endif
3591
3592         opt = ntohs(req->tcp_opt);
3593
3594         bzero(&to, sizeof(struct tcpopt));
3595
3596         /*
3597          * Fill out information for entering us into the syncache
3598          */
3599         inc.inc_fport = th.th_sport = req->peer_port;
3600         inc.inc_lport = th.th_dport = req->local_port;
3601         th.th_seq = req->rcv_isn;
3602         th.th_flags = TH_ACK;
3603
3604         inc.inc_isipv6 = 0;
3605         inc.inc_len = 0;
3606         inc.inc_faddr.s_addr = req->peer_ip;
3607         inc.inc_laddr.s_addr = req->local_ip;
3608
3609         mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3610         wsf  = G_TCPOPT_WSCALE_OK(opt);
3611         ts   = G_TCPOPT_TSTAMP(opt);
3612         sack = G_TCPOPT_SACK(opt);
3613
3614         to.to_mss = mss;
3615         to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3616         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3617
3618         DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3619             ntohl(req->local_ip), ntohs(req->local_port),
3620             ntohl(req->peer_ip), ntohs(req->peer_port),
3621             mss, wsf, ts, sack);
3622         return tcp_offload_syncache_expand(&inc, &to, &th, so, m);
3623 }
3624
3625
3626 /*
3627  * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3628  * if we are in TCP_SYN_RECV due to crossed SYNs
3629  */
3630 static int
3631 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3632 {
3633         struct cpl_pass_establish *req = cplhdr(m);
3634         struct toepcb *toep = (struct toepcb *)ctx;
3635         struct tcpcb *tp = toep->tp_tp;
3636         struct socket *so, *lso;
3637         struct t3c_data *td = T3C_DATA(cdev);
3638         struct sockbuf *snd, *rcv;
3639
3640         // Complete socket initialization now that we have the SND_ISN
3641
3642         struct toedev *tdev;
3643
3644
3645         tdev = toep->tp_toedev;
3646
3647         inp_wlock(tp->t_inpcb);
3648
3649         /*
3650          *
3651          * XXX need to add reference while we're manipulating
3652          */
3653         so = lso = inp_inpcbtosocket(tp->t_inpcb);
3654
3655         inp_wunlock(tp->t_inpcb);
3656
3657         so_lock(so);
3658         LIST_REMOVE(toep, synq_entry);
3659         so_unlock(so);
3660
3661         if (!syncache_expand_establish_req(req, &so, toep)) {
3662                 /*
3663                  * No entry
3664                  */
3665                 CXGB_UNIMPLEMENTED();
3666         }
3667         if (so == NULL) {
3668                 /*
3669                  * Couldn't create the socket
3670                  */
3671                 CXGB_UNIMPLEMENTED();
3672         }
3673
3674         tp = so_sototcpcb(so);
3675         inp_wlock(tp->t_inpcb);
3676
3677         snd = so_sockbuf_snd(so);
3678         rcv = so_sockbuf_rcv(so);
3679
3680         snd->sb_flags |= SB_NOCOALESCE;
3681         rcv->sb_flags |= SB_NOCOALESCE;
3682
3683         toep->tp_tp = tp;
3684         toep->tp_flags = 0;
3685         tp->t_toe = toep;
3686         reset_wr_list(toep);
3687         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3688         tp->rcv_nxt = toep->tp_copied_seq;
3689         install_offload_ops(so);
3690
3691         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3692         toep->tp_wr_unacked = 0;
3693         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3694         toep->tp_qset_idx = 0;
3695         toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3696
3697         /*
3698          * XXX Cancel any keep alive timer
3699          */
3700
3701         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3702
3703         /*
3704          * XXX workaround for lack of syncache drop
3705          */
3706         toepcb_release(toep);
3707         inp_wunlock(tp->t_inpcb);
3708
3709         CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3710         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3711 #ifdef notyet
3712         /*
3713          * XXX not sure how these checks map to us
3714          */
3715         if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3716                 sk->sk_state_change(sk);
3717                 sk_wake_async(so, 0, POLL_OUT);
3718         }
3719         /*
3720          * The state for the new connection is now up to date.
3721          * Next check if we should add the connection to the parent's
3722          * accept queue.  When the parent closes it resets connections
3723          * on its SYN queue, so check if we are being reset.  If so we
3724          * don't need to do anything more, the coming ABORT_RPL will
3725          * destroy this socket.  Otherwise move the connection to the
3726          * accept queue.
3727          *
3728          * Note that we reset the synq before closing the server so if
3729          * we are not being reset the stid is still open.
3730          */
3731         if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3732                 __kfree_skb(skb);
3733                 goto unlock;
3734         }
3735 #endif
3736         m_free(m);
3737
3738         return (0);
3739 }
3740
3741 /*
3742  * Fill in the right TID for CPL messages waiting in the out-of-order queue
3743  * and send them to the TOE.
3744  */
3745 static void
3746 fixup_and_send_ofo(struct toepcb *toep)
3747 {
3748         struct mbuf *m;
3749         struct toedev *tdev = toep->tp_toedev;
3750         struct tcpcb *tp = toep->tp_tp;
3751         unsigned int tid = toep->tp_tid;
3752
3753         log(LOG_NOTICE, "fixup_and_send_ofo\n");
3754
3755         inp_lock_assert(tp->t_inpcb);
3756         while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3757                 /*
3758                  * A variety of messages can be waiting but the fields we'll
3759                  * be touching are common to all so any message type will do.
3760                  */
3761                 struct cpl_close_con_req *p = cplhdr(m);
3762
3763                 p->wr.wr_lo = htonl(V_WR_TID(tid));
3764                 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3765                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3766         }
3767 }
3768
3769 /*
3770  * Updates socket state from an active establish CPL message.  Runs with the
3771  * socket lock held.
3772  */
3773 static void
3774 socket_act_establish(struct socket *so, struct mbuf *m)
3775 {
3776         struct cpl_act_establish *req = cplhdr(m);
3777         u32 rcv_isn = ntohl(req->rcv_isn);      /* real RCV_ISN + 1 */
3778         struct tcpcb *tp = so_sototcpcb(so);
3779         struct toepcb *toep = tp->t_toe;
3780
3781         if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3782                 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3783                     toep->tp_tid, tp->t_state);
3784
3785         tp->ts_recent_age = ticks;
3786         tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3787         toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3788
3789         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3790
3791         /*
3792          * Now that we finally have a TID send any CPL messages that we had to
3793          * defer for lack of a TID.
3794          */
3795         if (mbufq_len(&toep->out_of_order_queue))
3796                 fixup_and_send_ofo(toep);
3797
3798         if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3799                 /*
3800                  * XXX does this even make sense?
3801                  */
3802                 so_sorwakeup(so);
3803         }
3804         m_free(m);
3805 #ifdef notyet
3806 /*
3807  * XXX assume no write requests permitted while socket connection is
3808  * incomplete
3809  */
3810         /*
3811          * Currently the send queue must be empty at this point because the
3812          * socket layer does not send anything before a connection is
3813          * established.  To be future proof though we handle the possibility
3814          * that there are pending buffers to send (either TX_DATA or
3815          * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3816          * buffers according to the just learned write_seq, and then we send
3817          * them on their way.
3818          */
3819         fixup_pending_writeq_buffers(sk);
3820         if (t3_push_frames(so, 1))
3821                 sk->sk_write_space(sk);
3822 #endif
3823
3824         toep->tp_state = tp->t_state;
3825         V_tcpstat.tcps_connects++;
3826
3827 }
3828
3829 /*
3830  * Process a CPL_ACT_ESTABLISH message.
3831  */
3832 static int
3833 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3834 {
3835         struct cpl_act_establish *req = cplhdr(m);
3836         unsigned int tid = GET_TID(req);
3837         unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3838         struct toepcb *toep = (struct toepcb *)ctx;
3839         struct tcpcb *tp = toep->tp_tp;
3840         struct socket *so;
3841         struct toedev *tdev;
3842         struct tom_data *d;
3843
3844         if (tp == NULL) {
3845                 free_atid(cdev, atid);
3846                 return (0);
3847         }
3848         inp_wlock(tp->t_inpcb);
3849
3850         /*
3851          * XXX
3852          */
3853         so = inp_inpcbtosocket(tp->t_inpcb);
3854         tdev = toep->tp_toedev; /* blow up here if link was down */
3855         d = TOM_DATA(tdev);
3856
3857         /*
3858          * It's OK if the TID is currently in use, the owning socket may have
3859          * backlogged its last CPL message(s).  Just take it away.
3860          */
3861         toep->tp_tid = tid;
3862         toep->tp_tp = tp;
3863         so_insert_tid(d, toep, tid);
3864         free_atid(cdev, atid);
3865         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3866
3867         socket_act_establish(so, m);
3868         inp_wunlock(tp->t_inpcb);
3869         CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3870         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3871
3872         return (0);
3873 }
3874
3875 /*
3876  * Process an acknowledgment of WR completion.  Advance snd_una and send the
3877  * next batch of work requests from the write queue.
3878  */
3879 static void
3880 wr_ack(struct toepcb *toep, struct mbuf *m)
3881 {
3882         struct tcpcb *tp = toep->tp_tp;
3883         struct cpl_wr_ack *hdr = cplhdr(m);
3884         struct socket *so;
3885         unsigned int credits = ntohs(hdr->credits);
3886         u32 snd_una = ntohl(hdr->snd_una);
3887         int bytes = 0;
3888         struct sockbuf *snd;
3889
3890         CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3891
3892         inp_wlock(tp->t_inpcb);
3893         so = inp_inpcbtosocket(tp->t_inpcb);
3894         toep->tp_wr_avail += credits;
3895         if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3896                 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3897
3898         while (credits) {
3899                 struct mbuf *p = peek_wr(toep);
3900
3901                 if (__predict_false(!p)) {
3902                         log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3903                             "nothing pending, state %u wr_avail=%u\n",
3904                             credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3905                         break;
3906                 }
3907                 CTR2(KTR_TOM,
3908                         "wr_ack: p->credits=%d p->bytes=%d",
3909                     p->m_pkthdr.csum_data, p->m_pkthdr.len);
3910                 KASSERT(p->m_pkthdr.csum_data != 0,
3911                     ("empty request still on list"));
3912
3913                 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3914
3915 #if DEBUG_WR > 1
3916                         struct tx_data_wr *w = cplhdr(p);
3917                         log(LOG_ERR,
3918                                "TID %u got %u WR credits, need %u, len %u, "
3919                                "main body %u, frags %u, seq # %u, ACK una %u,"
3920                                " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3921                                toep->tp_tid, credits, p->csum, p->len,
3922                                p->len - p->data_len, skb_shinfo(p)->nr_frags,
3923                                ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3924                             toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3925 #endif
3926                         p->m_pkthdr.csum_data -= credits;
3927                         break;
3928                 } else {
3929                         dequeue_wr(toep);
3930                         credits -= p->m_pkthdr.csum_data;
3931                         bytes += p->m_pkthdr.len;
3932                         CTR3(KTR_TOM,
3933                             "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3934                             p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3935
3936                         m_free(p);
3937                 }
3938         }
3939
3940 #if DEBUG_WR
3941         check_wr_invariants(tp);
3942 #endif
3943
3944         if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3945 #if VALIDATE_SEQ
3946                 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3947
3948                 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3949                     "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3950                     toep->tp_tid, tp->snd_una);
3951 #endif
3952                 goto out_free;
3953         }
3954
3955         if (tp->snd_una != snd_una) {
3956                 tp->snd_una = snd_una;
3957                 tp->ts_recent_age = ticks;
3958 #ifdef notyet
3959                 /*
3960                  * Keep ARP entry "minty fresh"
3961                  */
3962                 dst_confirm(sk->sk_dst_cache);
3963 #endif
3964                 if (tp->snd_una == tp->snd_nxt)
3965                         toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3966         }
3967
3968         snd = so_sockbuf_snd(so);
3969         if (bytes) {
3970                 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3971                 snd = so_sockbuf_snd(so);
3972                 sockbuf_lock(snd);
3973                 sbdrop_locked(snd, bytes);
3974                 so_sowwakeup_locked(so);
3975         }
3976
3977         if (snd->sb_sndptroff < snd->sb_cc)
3978                 t3_push_frames(so, 0);
3979
3980 out_free:
3981         inp_wunlock(tp->t_inpcb);
3982         m_free(m);
3983 }
3984
3985 /*
3986  * Handler for TX_DATA_ACK CPL messages.
3987  */
3988 static int
3989 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3990 {
3991         struct toepcb *toep = (struct toepcb *)ctx;
3992
3993         VALIDATE_SOCK(so);
3994
3995         wr_ack(toep, m);
3996         return 0;
3997 }
3998
/*
 * CPL handler for TRACE_PKT messages.  These packets are of no interest
 * here: the mbuf chain is freed and the message is reported as consumed.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{

        m_freem(m);
        return (0);
}
4008
4009 /*
4010  * Reset a connection that is on a listener's SYN queue or accept queue,
4011  * i.e., one that has not had a struct socket associated with it.
4012  * Must be called from process context.
4013  *
4014  * Modeled after code in inet_csk_listen_stop().
4015  */
4016 static void
4017 t3_reset_listen_child(struct socket *child)
4018 {
4019         struct tcpcb *tp = so_sototcpcb(child);
4020
4021         t3_send_reset(tp->t_toe);
4022 }
4023
4024
4025 static void
4026 t3_child_disconnect(struct socket *so, void *arg)
4027 {
4028         struct tcpcb *tp = so_sototcpcb(so);
4029
4030         if (tp->t_flags & TF_TOE) {
4031                 inp_wlock(tp->t_inpcb);
4032                 t3_reset_listen_child(so);
4033                 inp_wunlock(tp->t_inpcb);
4034         }
4035 }
4036
4037 /*
4038  * Disconnect offloaded established but not yet accepted connections sitting
4039  * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4040  * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4041  */
4042 void
4043 t3_disconnect_acceptq(struct socket *listen_so)
4044 {
4045
4046         so_lock(listen_so);
4047         so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4048         so_unlock(listen_so);
4049 }
4050
4051 /*
4052  * Reset offloaded connections sitting on a server's syn queue.  As above
4053  * we send ABORT_REQ and finish off when we get ABORT_RPL.
4054  */
4055
4056 void
4057 t3_reset_synq(struct listen_ctx *lctx)
4058 {
4059         struct toepcb *toep;
4060
4061         so_lock(lctx->lso);
4062         while (!LIST_EMPTY(&lctx->synq_head)) {
4063                 toep = LIST_FIRST(&lctx->synq_head);
4064                 LIST_REMOVE(toep, synq_entry);
4065                 toep->tp_tp = NULL;
4066                 t3_send_reset(toep);
4067                 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4068                 toepcb_release(toep);
4069         }
4070         so_unlock(lctx->lso);
4071 }
4072
4073
4074 int
4075 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4076                    unsigned int nppods, unsigned int tag, unsigned int maxoff,
4077                    unsigned int pg_off, unsigned int color)
4078 {
4079         unsigned int i, j, pidx;
4080         struct pagepod *p;
4081         struct mbuf *m;
4082         struct ulp_mem_io *req;
4083         unsigned int tid = toep->tp_tid;
4084         const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4085         unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4086
4087         CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4088             gl, nppods, tag, maxoff, pg_off, color);
4089
4090         for (i = 0; i < nppods; ++i) {
4091                 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4092                 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4093                 req = mtod(m, struct ulp_mem_io *);
4094                 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4095                 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4096                 req->wr.wr_lo = 0;
4097                 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4098                                            V_ULPTX_CMD(ULP_MEM_WRITE));
4099                 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4100                                  V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4101
4102                 p = (struct pagepod *)(req + 1);
4103                 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4104                         p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4105                         p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4106                                                   V_PPOD_COLOR(color));
4107                         p->pp_max_offset = htonl(maxoff);
4108                         p->pp_page_offset = htonl(pg_off);
4109                         p->pp_rsvd = 0;
4110                         for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4111                                 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4112                                     htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4113                 } else
4114                         p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4115                 send_or_defer(toep, m, 0);
4116                 ppod_addr += PPOD_SIZE;
4117         }
4118         return (0);
4119 }
4120
4121 /*
4122  * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4123  */
4124 static inline void
4125 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4126 {
4127         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4128
4129         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4130         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4131         b->opcode = CPL_BARRIER;
4132 }
4133
4134 /*
4135  * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4136  */
4137 static inline void
4138 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4139 {
4140         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4141
4142         txpkt = (struct ulp_txpkt *)req;
4143         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4144         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4145         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4146         req->cpuno = htons(cpuno);
4147 }
4148
4149 /*
4150  * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4151  */
4152 static inline void
4153 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4154                      unsigned int word, uint64_t mask, uint64_t val)
4155 {
4156         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4157
4158         CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4159             tid, word, mask, val);
4160
4161         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4162         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4163         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4164         req->reply = V_NO_REPLY(1);
4165         req->cpu_idx = 0;
4166         req->word = htons(word);
4167         req->mask = htobe64(mask);
4168         req->val = htobe64(val);
4169 }
4170
4171 /*
4172  * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4173  */
4174 static void
4175 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4176     unsigned int tid, unsigned int credits)
4177 {
4178         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4179
4180         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4181         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4182         OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4183         ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4184             V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4185                                  V_RX_CREDITS(credits));
4186 }
4187
4188 void
4189 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4190 {
4191         unsigned int wrlen;
4192         struct mbuf *m;
4193         struct work_request_hdr *wr;
4194         struct cpl_barrier *lock;
4195         struct cpl_set_tcb_field *req;
4196         struct cpl_get_tcb *getreq;
4197         struct ddp_state *p = &toep->tp_ddp_state;
4198
4199 #if 0
4200         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4201 #endif
4202         wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4203                 sizeof(*getreq);
4204         m = m_gethdr_nofail(wrlen);
4205         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4206         wr = mtod(m, struct work_request_hdr *);
4207         bzero(wr, wrlen);
4208
4209         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4210         m->m_pkthdr.len = m->m_len = wrlen;
4211
4212         lock = (struct cpl_barrier *)(wr + 1);
4213         mk_cpl_barrier_ulp(lock);
4214
4215         req = (struct cpl_set_tcb_field *)(lock + 1);
4216
4217         CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4218
4219         /* Hmmm, not sure if this actually a good thing: reactivating
4220          * the other buffer might be an issue if it has been completed
4221          * already. However, that is unlikely, since the fact that the UBUF
4222          * is not completed indicates that there is no oustanding data.
4223          */
4224         if (bufidx == 0)
4225                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4226                                      V_TF_DDP_ACTIVE_BUF(1) |
4227                                      V_TF_DDP_BUF0_VALID(1),
4228                                      V_TF_DDP_ACTIVE_BUF(1));
4229         else
4230                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4231                                      V_TF_DDP_ACTIVE_BUF(1) |
4232                                      V_TF_DDP_BUF1_VALID(1), 0);
4233
4234         getreq = (struct cpl_get_tcb *)(req + 1);
4235         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4236
4237         mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4238
4239         /* Keep track of the number of oustanding CPL_GET_TCB requests
4240          */
4241         p->get_tcb_count++;
4242
4243 #ifdef T3_TRACE
4244         T3_TRACE1(TIDTB(so),
4245                   "t3_cancel_ddpbuf: bufidx %u", bufidx);
4246 #endif
4247         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4248 }
4249
4250 /**
4251  * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4252  * @sk: the socket associated with the buffers
4253  * @bufidx: index of HW DDP buffer (0 or 1)
4254  * @tag0: new tag for HW buffer 0
4255  * @tag1: new tag for HW buffer 1
4256  * @len: new length for HW buf @bufidx
4257  *
4258  * Sends a compound WR to overlay a new DDP buffer on top of an existing
4259  * buffer by changing the buffer tag and length and setting the valid and
4260  * active flag accordingly.  The caller must ensure the new buffer is at
4261  * least as big as the existing one.  Since we typically reprogram both HW
4262  * buffers this function sets both tags for convenience. Read the TCB to
4263  * determine how made data was written into the buffer before the overlay
4264  * took place.
4265  */
4266 void
4267 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4268                        unsigned int tag1, unsigned int len)
4269 {
4270         unsigned int wrlen;
4271         struct mbuf *m;
4272         struct work_request_hdr *wr;
4273         struct cpl_get_tcb *getreq;
4274         struct cpl_set_tcb_field *req;
4275         struct ddp_state *p = &toep->tp_ddp_state;
4276
4277         CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4278             bufidx, tag0, tag1, len);
4279 #if 0
4280         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4281 #endif
4282         wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4283         m = m_gethdr_nofail(wrlen);
4284         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4285         wr = mtod(m, struct work_request_hdr *);
4286         m->m_pkthdr.len = m->m_len = wrlen;
4287         bzero(wr, wrlen);
4288
4289
4290         /* Set the ATOMIC flag to make sure that TP processes the following
4291          * CPLs in an atomic manner and no wire segments can be interleaved.
4292          */
4293         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4294         req = (struct cpl_set_tcb_field *)(wr + 1);
4295         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4296                              V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4297                              V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4298                              V_TCB_RX_DDP_BUF0_TAG(tag0) |
4299                              V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4300         req++;
4301         if (bufidx == 0) {
4302                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4303                             V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4304                             V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4305                 req++;
4306                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4307                             V_TF_DDP_PUSH_DISABLE_0(1) |
4308                             V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4309                             V_TF_DDP_PUSH_DISABLE_0(0) |
4310                             V_TF_DDP_BUF0_VALID(1));
4311         } else {
4312                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4313                             V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4314                             V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4315                 req++;
4316                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4317                             V_TF_DDP_PUSH_DISABLE_1(1) |
4318                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4319                             V_TF_DDP_PUSH_DISABLE_1(0) |
4320                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4321         }
4322
4323         getreq = (struct cpl_get_tcb *)(req + 1);
4324         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4325
4326         /* Keep track of the number of outstanding CPL_GET_TCB requests
4327          */
4328         p->get_tcb_count++;
4329
4330 #ifdef T3_TRACE
4331         T3_TRACE4(TIDTB(sk),
4332                   "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4333                   "len %d",
4334                   bufidx, tag0, tag1, len);
4335 #endif
4336         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4337 }
4338
4339 /*
4340  * Sends a compound WR containing all the CPL messages needed to program the
4341  * two HW DDP buffers, namely optionally setting up the length and offset of
4342  * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4343  */
4344 void
4345 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4346                       unsigned int len1, unsigned int offset1,
4347                       uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4348 {
4349         unsigned int wrlen;
4350         struct mbuf *m;
4351         struct work_request_hdr *wr;
4352         struct cpl_set_tcb_field *req;
4353
4354         CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4355             len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4356
4357 #if 0
4358         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4359 #endif
4360         wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4361                 (len1 ? sizeof(*req) : 0) +
4362                 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4363         m = m_gethdr_nofail(wrlen);
4364         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4365         wr = mtod(m, struct work_request_hdr *);
4366         bzero(wr, wrlen);
4367
4368         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4369         m->m_pkthdr.len = m->m_len = wrlen;
4370
4371         req = (struct cpl_set_tcb_field *)(wr + 1);
4372         if (len0) {                  /* program buffer 0 offset and length */
4373                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4374                         V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4375                         V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4376                         V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4377                         V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4378                 req++;
4379         }
4380         if (len1) {                  /* program buffer 1 offset and length */
4381                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4382                         V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4383                         V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4384                         V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4385                         V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4386                 req++;
4387         }
4388
4389         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4390                              ddp_flags);
4391
4392         if (modulate) {
4393                 mk_rx_data_ack_ulp(toep,
4394                     (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4395                     toep->tp_copied_seq - toep->tp_rcv_wup);
4396                 toep->tp_rcv_wup = toep->tp_copied_seq;
4397         }
4398
4399 #ifdef T3_TRACE
4400         T3_TRACE5(TIDTB(sk),
4401                   "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4402                   "modulate %d",
4403                   len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4404                   modulate);
4405 #endif
4406
4407         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4408 }
4409
4410 /* On the first call only, fill mbuf_wrs[] and set wrlen = wr_len * 8. */
4411 void
4412 t3_init_wr_tab(unsigned int wr_len)
4413 {
4414         int idx, sgl_len;
4415
4416         if (mbuf_wrs[1] != 0)   /* table was filled by an earlier call */
4417                 return;
4418         /* NOTE(review): wr_len == 1 divides by zero below -- confirm callers. */
4419         for (idx = 1; idx < ARRAY_SIZE(mbuf_wrs); idx++) {
4420                 sgl_len = 3 + (3 * idx) / 2 + (idx & 1);
4421                 if (sgl_len <= wr_len)
4422                         mbuf_wrs[idx] = 1;
4423                 else
4424                         mbuf_wrs[idx] = 1 + (sgl_len - 2) / (wr_len - 1);
4425         }
4426         wrlen = wr_len * 8;
4427 }
4428 /* Register one handler per CPL opcode via t3tom_register_cpl_handler(). */
4429 int
4430 t3_init_cpl_io(void)
4431 {
4432 #ifdef notyet        /* compiled only when notyet is defined */
4433         tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
4434         if (!tcphdr_skb) {
4435                 log(LOG_ERR,
4436                        "Chelsio TCP offload: can't allocate sk_buff\n");
4437                 return -1;
4438         }
4439         skb_put(tcphdr_skb, sizeof(struct tcphdr));
4440         tcphdr_skb->h.raw = tcphdr_skb->data;
4441         memset(tcphdr_skb->data, 0, tcphdr_skb->len);
4442 #endif
4443         /* Each call below binds one CPL opcode to its handler function. */
4444         t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
4445         t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
4446         t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
4447         t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
4448         t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
4449         t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
4450         t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
4451         t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
4452         t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
4453         t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
4454         t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
4455         t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
4456         t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
4457         t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
4458         t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
4459         return (0);     /* the only other return is inside the notyet block */
4460 }
4461