sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c

   1 /**************************************************************************
   2
   3 Copyright (c) 2007-2008, Chelsio Inc.
   4 All rights reserved.
   5
   6 Redistribution and use in source and binary forms, with or without
   7 modification, are permitted provided that the following conditions are met:
   8
   9  1. Redistributions of source code must retain the above copyright notice,
  10     this list of conditions and the following disclaimer.
  11
  12  2. Neither the name of the Chelsio Corporation nor the names of its
  13     contributors may be used to endorse or promote products derived from
  14     this software without specific prior written permission.
  15
  16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  26 POSSIBILITY OF SUCH DAMAGE.
  27
  28 ***************************************************************************/
  29
  30 #include <sys/cdefs.h>
  31 __FBSDID("$FreeBSD$");
  32
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/fcntl.h>
  36 #include <sys/kernel.h>
  37 #include <sys/limits.h>
  38 #include <sys/ktr.h>
  39 #include <sys/lock.h>
  40 #include <sys/mbuf.h>
  41 #include <sys/mutex.h>
  42 #include <sys/socket.h>
  43 #include <sys/sysctl.h>
  44 #include <sys/syslog.h>
  45 #include <sys/protosw.h>
  46 #include <sys/priv.h>
  47
  48 #include <net/if.h>
  49 #include <net/route.h>
  50
  51 #include <netinet/in.h>
  52 #include <netinet/in_pcb.h>
  53 #include <netinet/in_systm.h>
  54 #include <netinet/in_var.h>
  55
  56
  57 #include <dev/cxgb/cxgb_osdep.h>
  58 #include <dev/cxgb/sys/mbufq.h>
  59
  60 #include <netinet/ip.h>
  61 #include <netinet/tcp_var.h>
  62 #include <netinet/tcp_fsm.h>
  63 #include <netinet/tcp_offload.h>
  64 #include <netinet/tcp_seq.h>
  65 #include <netinet/tcp_syncache.h>
  66 #include <netinet/tcp_timer.h>
  67 #include <net/route.h>
  68
  69 #include <dev/cxgb/t3cdev.h>
  70 #include <dev/cxgb/common/cxgb_firmware_exports.h>
  71 #include <dev/cxgb/common/cxgb_t3_cpl.h>
  72 #include <dev/cxgb/common/cxgb_tcb.h>
  73 #include <dev/cxgb/common/cxgb_ctl_defs.h>
  74 #include <dev/cxgb/cxgb_offload.h>
  75 #include <vm/vm.h>
  76 #include <vm/pmap.h>
  77 #include <machine/bus.h>
  78 #include <dev/cxgb/sys/mvec.h>
  79 #include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
  80 #include <dev/cxgb/ulp/tom/cxgb_defs.h>
  81 #include <dev/cxgb/ulp/tom/cxgb_tom.h>
  82 #include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
  83 #include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
  84 #include <dev/cxgb/ulp/tom/cxgb_tcp.h>
  85
  86 #include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
  87
  88 /*
  89  * For ULP connections HW may add headers, e.g., for digests, that aren't part
  90  * of the messages sent by the host but that are part of the TCP payload and
  91  * therefore consume TCP sequence space.  Tx connection parameters that
  92  * operate in TCP sequence space are affected by the HW additions and need to
  93  * compensate for them to accurately track TCP sequence numbers. This array
  94  * contains the compensating extra lengths for ULP packets.  It is indexed by
  95  * a packet's ULP submode.
  96  */
  97 const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};
  98
  99 #ifdef notyet
 100 /*
 101  * This sk_buff holds a fake header-only TCP segment that we use whenever we
 102  * need to exploit SW TCP functionality that expects TCP headers, such as
 103  * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 104  * CPUs without locking.
 105  */
 106 static struct mbuf *tcphdr_mbuf __read_mostly;
 107 #endif
 108
 109 /*
 110  * Size of WRs in bytes.  Note that we assume all devices we are handling have
 111  * the same WR size.
 112  */
 113 static unsigned int wrlen __read_mostly;
 114
 115 /*
 116  * The number of WRs needed for an skb depends on the number of page fragments
 117  * in the skb and whether it has any payload in its main body.  This maps the
 118  * length of the gather list represented by an skb into the # of necessary WRs.
 119  */
 120 static unsigned int mbuf_wrs[TX_MAX_SEGS + 1] __read_mostly;
 121
 122 /*
 123  * Max receive window supported by HW in bytes.  Only a small part of it can
 124  * be set through option0, the rest needs to be set through RX_DATA_ACK.
 125  */
 126 #define MAX_RCV_WND ((1U << 27) - 1)
 127
 128 /*
 129  * Min receive window.  We want it to be large enough to accommodate receive
 130  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 131  */
 132 #define MIN_RCV_WND (24 * 1024U)
 133 #define INP_TOS(inp) ((inp_ip_tos_get(inp) >> 2) & M_TOS)
 134
 135 #define VALIDATE_SEQ 0
 136 #define VALIDATE_SOCK(so)
 137 #define DEBUG_WR 0
 138
 139 #define TCP_TIMEWAIT    1
 140 #define TCP_CLOSE       2
 141 #define TCP_DROP        3
 142
 143 extern int tcp_do_autorcvbuf;
 144 extern int tcp_do_autosndbuf;
 145 extern int tcp_autorcvbuf_max;
 146 extern int tcp_autosndbuf_max;
 147
 148 static void t3_send_reset(struct toepcb *toep);
 149 static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
 150 static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
 151 static void handle_syncache_event(int event, void *arg);
 152
 153 static inline void
 154 SBAPPEND(struct sockbuf *sb, struct mbuf *n)
 155 {
 156         struct mbuf *m;
 157
 158         m = sb->sb_mb;
 159         while (m) {
 160                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 161                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 162                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 163                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 164                         m->m_next, m->m_nextpkt, m->m_flags));
 165                 m = m->m_next;
 166         }
 167         m = n;
 168         while (m) {
 169                 KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
 170                     !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
 171                         !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
 172                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 173                         m->m_next, m->m_nextpkt, m->m_flags));
 174                 m = m->m_next;
 175         }
 176         KASSERT(sb->sb_flags & SB_NOCOALESCE, ("NOCOALESCE not set"));
 177         sbappendstream_locked(sb, n);
 178         m = sb->sb_mb;
 179
 180         while (m) {
 181                 KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x",
 182                         m->m_next, m->m_nextpkt, m->m_flags));
 183                 m = m->m_next;
 184         }
 185 }
 186
 187 static inline int
 188 is_t3a(const struct toedev *dev)
 189 {
 190         return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
 191 }
 192
 193 static void
 194 dump_toepcb(struct toepcb *toep)
 195 {
 196         DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
 197             toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
 198             toep->tp_mtu_idx, toep->tp_tid);
 199
 200         DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
 201             toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked,
 202             toep->tp_mss_clamp, toep->tp_flags);
 203 }
 204
 205 #ifndef RTALLOC2_DEFINED
 206 static struct rtentry *
 207 rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
 208 {
 209         struct rtentry *rt = NULL;
 210
 211         if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
 212                 RT_UNLOCK(rt);
 213
 214         return (rt);
 215 }
 216 #endif
 217
 218 /*
 219  * Determine whether to send a CPL message now or defer it.  A message is
 220  * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 221  * For connections in other states the message is sent immediately.
 222  * If through_l2t is set the message is subject to ARP processing, otherwise
 223  * it is sent directly.
 224  */
 225 static inline void
 226 send_or_defer(struct toepcb *toep, struct mbuf *m, int through_l2t)
 227 {
 228         struct tcpcb *tp = toep->tp_tp;
 229
 230         if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
 231                 inp_wlock(tp->t_inpcb);
 232                 mbufq_tail(&toep->out_of_order_queue, m);  // defer
 233                 inp_wunlock(tp->t_inpcb);
 234         } else if (through_l2t)
 235                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);  // send through L2T
 236         else
 237                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);          // send directly
 238 }
 239
 240 static inline unsigned int
 241 mkprio(unsigned int cntrl, const struct toepcb *toep)
 242 {
 243         return (cntrl);
 244 }
 245
 246 /*
 247  * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 248  */
 249 static inline void
 250 mk_tid_release(struct mbuf *m, const struct toepcb *toep, unsigned int tid)
 251 {
 252         struct cpl_tid_release *req;
 253
 254         m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, toep));
 255         m->m_pkthdr.len = m->m_len = sizeof(*req);
 256         req = mtod(m, struct cpl_tid_release *);
 257         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 258         req->wr.wr_lo = 0;
 259         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
 260 }
 261
 262 static inline void
 263 make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
 264 {
 265         struct tcpcb *tp = so_sototcpcb(so);
 266         struct toepcb *toep = tp->t_toe;
 267         struct tx_data_wr *req;
 268         struct sockbuf *snd;
 269
 270         inp_lock_assert(tp->t_inpcb);
 271         snd = so_sockbuf_snd(so);
 272
 273         req = mtod(m, struct tx_data_wr *);
 274         m->m_len = sizeof(*req);
 275         req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
 276         req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
 277         /* len includes the length of any HW ULP additions */
 278         req->len = htonl(len);
 279         req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
 280         /* V_TX_ULP_SUBMODE sets both the mode and submode */
 281         req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
 282                            V_TX_URG(/* skb_urgent(skb) */ 0 ) |
 283                            V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
 284                                    (tail ? 0 : 1))));
 285         req->sndseq = htonl(tp->snd_nxt);
 286         if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
 287                 req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
 288                                     V_TX_CPU_IDX(toep->tp_qset));
 289
 290                 /* Sendbuffer is in units of 32KB.
 291                  */
 292                 if (tcp_do_autosndbuf && snd->sb_flags & SB_AUTOSIZE)
 293                         req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
 294                 else {
 295                         req->param |= htonl(V_TX_SNDBUF(snd->sb_hiwat >> 15));
 296                 }
 297
 298                 toep->tp_flags |= TP_DATASENT;
 299         }
 300 }
 301
 302 #define IMM_LEN 64 /* XXX - see WR_LEN in the cxgb driver */
 303
 304 int
 305 t3_push_frames(struct socket *so, int req_completion)
 306 {
 307         struct tcpcb *tp = so_sototcpcb(so);
 308         struct toepcb *toep = tp->t_toe;
 309
 310         struct mbuf *tail, *m0, *last;
 311         struct t3cdev *cdev;
 312         struct tom_data *d;
 313         int state, bytes, count, total_bytes;
 314         bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
 315         struct sockbuf *snd;
 316
 317         if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
 318                 DPRINTF("tcp state=%d\n", tp->t_state);
 319                 return (0);
 320         }
 321
 322         state = so_state_get(so);
 323
 324         if (state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
 325                 DPRINTF("disconnecting\n");
 326
 327                 return (0);
 328         }
 329
 330         inp_lock_assert(tp->t_inpcb);
 331
 332         snd = so_sockbuf_snd(so);
 333         sockbuf_lock(snd);
 334
 335         d = TOM_DATA(toep->tp_toedev);
 336         cdev = d->cdev;
 337
 338         last = tail = snd->sb_sndptr ? snd->sb_sndptr : snd->sb_mb;
 339
 340         total_bytes = 0;
 341         DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
 342             toep->tp_wr_avail, tail, snd->sb_cc, toep->tp_m_last);
 343
 344         if (last && toep->tp_m_last == last  && snd->sb_sndptroff != 0) {
 345                 KASSERT(tail, ("sbdrop error"));
 346                 last = tail = tail->m_next;
 347         }
 348
 349         if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
 350                 DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
 351                 sockbuf_unlock(snd);
 352
 353                 return (0);
 354         }
 355
 356         toep->tp_m_last = NULL;
 357         while (toep->tp_wr_avail && (tail != NULL)) {
 358                 count = bytes = 0;
 359                 segp = segs;
 360                 if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
 361                         sockbuf_unlock(snd);
 362                         return (0);
 363                 }
 364                 /*
 365                  * If the data in tail fits as in-line, then
 366                  * make an immediate data wr.
 367                  */
 368                 if (tail->m_len <= IMM_LEN) {
 369                         count = 1;
 370                         bytes = tail->m_len;
 371                         last = tail;
 372                         tail = tail->m_next;
 373                         m_set_sgl(m0, NULL);
 374                         m_set_sgllen(m0, 0);
 375                         make_tx_data_wr(so, m0, bytes, tail);
 376                         m_append(m0, bytes, mtod(last, caddr_t));
 377                         KASSERT(!m0->m_next, ("bad append"));
 378                 } else {
 379                         while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
 380                             && (tail != NULL) && (count < TX_MAX_SEGS-1)) {
 381                                 bytes += tail->m_len;
 382                                 last = tail;
 383                                 count++;
 384                                 /*
 385                                  * technically an abuse to be using this for a VA
 386                                  * but less gross than defining my own structure
 387                                  * or calling pmap_kextract from here :-|
 388                                  */
 389                                 segp->ds_addr = (bus_addr_t)tail->m_data;
 390                                 segp->ds_len = tail->m_len;
 391                                 DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
 392                                     count, mbuf_wrs[count], tail->m_data, tail->m_len);
 393                                 segp++;
 394                                 tail = tail->m_next;
 395                         }
 396                         DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
 397                             toep->tp_wr_avail, count, mbuf_wrs[count], tail);
 398
 399                         m_set_sgl(m0, segs);
 400                         m_set_sgllen(m0, count);
 401                         make_tx_data_wr(so, m0, bytes, tail);
 402                 }
 403                 m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, toep));
 404
 405                 if (tail) {
 406                         snd->sb_sndptr = tail;
 407                         toep->tp_m_last = NULL;
 408                 } else
 409                         toep->tp_m_last = snd->sb_sndptr = last;
 410
 411
 412                 DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);
 413
 414                 snd->sb_sndptroff += bytes;
 415                 total_bytes += bytes;
 416                 toep->tp_write_seq += bytes;
 417                 CTR6(KTR_TOM, "t3_push_frames: wr_avail=%d mbuf_wrs[%d]=%d tail=%p sndptr=%p sndptroff=%d",
 418                     toep->tp_wr_avail, count, mbuf_wrs[count], tail, snd->sb_sndptr, snd->sb_sndptroff);
 419                 if (tail)
 420                         CTR4(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p tailbuf=%p snd_una=0x%08x",
 421                             total_bytes, toep->tp_m_last, tail->m_data, tp->snd_una);
 422                 else
 423                         CTR3(KTR_TOM, "t3_push_frames: total_bytes=%d tp_m_last=%p snd_una=0x%08x",
 424                             total_bytes, toep->tp_m_last, tp->snd_una);
 425
 426
 427 #ifdef KTR
 428 {
 429                 int i;
 430
 431                 i = 0;
 432                 while (i < count && m_get_sgllen(m0)) {
 433                         if ((count - i) >= 3) {
 434                                 CTR6(KTR_TOM,
 435                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d pa=0x%zx len=%d",
 436                                     segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len,
 437                                     segs[i + 2].ds_addr, segs[i + 2].ds_len);
 438                                     i += 3;
 439                         } else if ((count - i) == 2) {
 440                                 CTR4(KTR_TOM,
 441                                     "t3_push_frames: pa=0x%zx len=%d pa=0x%zx len=%d",
 442                                     segs[i].ds_addr, segs[i].ds_len, segs[i + 1].ds_addr, segs[i + 1].ds_len);
 443                                     i += 2;
 444                         } else {
 445                                 CTR2(KTR_TOM, "t3_push_frames: pa=0x%zx len=%d",
 446                                     segs[i].ds_addr, segs[i].ds_len);
 447                                 i++;
 448                         }
 449
 450                 }
 451 }
 452 #endif
 453                  /*
 454                  * remember credits used
 455                  */
 456                 m0->m_pkthdr.csum_data = mbuf_wrs[count];
 457                 m0->m_pkthdr.len = bytes;
 458                 toep->tp_wr_avail -= mbuf_wrs[count];
 459                 toep->tp_wr_unacked += mbuf_wrs[count];
 460
 461                 if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
 462                     toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
 463                         struct work_request_hdr *wr = cplhdr(m0);
 464
 465                         wr->wr_hi |= htonl(F_WR_COMPL);
 466                         toep->tp_wr_unacked = 0;
 467                 }
 468                 KASSERT((m0->m_pkthdr.csum_data > 0) &&
 469                     (m0->m_pkthdr.csum_data <= 4), ("bad credit count %d",
 470                         m0->m_pkthdr.csum_data));
 471                 m0->m_type = MT_DONTFREE;
 472                 enqueue_wr(toep, m0);
 473                 DPRINTF("sending offload tx with %d bytes in %d segments\n",
 474                     bytes, count);
 475                 l2t_send(cdev, m0, toep->tp_l2t);
 476         }
 477         sockbuf_unlock(snd);
 478         return (total_bytes);
 479 }
 480
 481 /*
 482  * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 483  * under any circumstances.  We take the easy way out and always queue the
 484  * message to the write_queue.  We can optimize the case where the queue is
 485  * already empty though the optimization is probably not worth it.
 486  */
 487 static void
 488 close_conn(struct socket *so)
 489 {
 490         struct mbuf *m;
 491         struct cpl_close_con_req *req;
 492         struct tom_data *d;
 493         struct inpcb *inp = so_sotoinpcb(so);
 494         struct tcpcb *tp;
 495         struct toepcb *toep;
 496         unsigned int tid;
 497
 498
 499         inp_wlock(inp);
 500         tp = so_sototcpcb(so);
 501         toep = tp->t_toe;
 502
 503         if (tp->t_state != TCPS_SYN_SENT)
 504                 t3_push_frames(so, 1);
 505
 506         if (toep->tp_flags & TP_FIN_SENT) {
 507                 inp_wunlock(inp);
 508                 return;
 509         }
 510
 511         tid = toep->tp_tid;
 512
 513         d = TOM_DATA(toep->tp_toedev);
 514
 515         m = m_gethdr_nofail(sizeof(*req));
 516         m_set_priority(m, CPL_PRIORITY_DATA);
 517         m_set_sgl(m, NULL);
 518         m_set_sgllen(m, 0);
 519
 520         toep->tp_flags |= TP_FIN_SENT;
 521         req = mtod(m, struct cpl_close_con_req *);
 522
 523         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
 524         req->wr.wr_lo = htonl(V_WR_TID(tid));
 525         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
 526         req->rsvd = 0;
 527         inp_wunlock(inp);
 528         /*
 529          * XXX - need to defer shutdown while there is still data in the queue
 530          *
 531          */
 532         CTR4(KTR_TOM, "%s CLOSE_CON_REQ so %p tp %p tid=%u", __FUNCTION__, so, tp, tid);
 533         cxgb_ofld_send(d->cdev, m);
 534
 535 }
 536
 537 /*
 538  * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 539  * and send it along.
 540  */
 541 static void
 542 abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
 543 {
 544         struct cpl_abort_req *req = cplhdr(m);
 545
 546         req->cmd = CPL_ABORT_NO_RST;
 547         cxgb_ofld_send(cdev, m);
 548 }
 549
 550 /*
 551  * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 552  * permitted to return without sending the message in case we cannot allocate
 553  * an sk_buff.  Returns the number of credits sent.
 554  */
 555 uint32_t
 556 t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
 557 {
 558         struct mbuf *m;
 559         struct cpl_rx_data_ack *req;
 560         struct toepcb *toep = tp->t_toe;
 561         struct toedev *tdev = toep->tp_toedev;
 562
 563         m = m_gethdr_nofail(sizeof(*req));
 564
 565         DPRINTF("returning %u credits to HW\n", credits);
 566
 567         req = mtod(m, struct cpl_rx_data_ack *);
 568         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 569         req->wr.wr_lo = 0;
 570         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 571         req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
 572         m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toep));
 573         cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
 574         return (credits);
 575 }
 576
 577 /*
 578  * Send RX_DATA_ACK CPL message to request a modulation timer to be scheduled.
 579  * This is only used in DDP mode, so we take the opportunity to also set the
 580  * DACK mode and flush any Rx credits.
 581  */
 582 void
 583 t3_send_rx_modulate(struct toepcb *toep)
 584 {
 585         struct mbuf *m;
 586         struct cpl_rx_data_ack *req;
 587
 588         m = m_gethdr_nofail(sizeof(*req));
 589
 590         req = mtod(m, struct cpl_rx_data_ack *);
 591         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 592         req->wr.wr_lo = 0;
 593         m->m_pkthdr.len = m->m_len = sizeof(*req);
 594
 595         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
 596         req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
 597                                  V_RX_DACK_MODE(1) |
 598                                  V_RX_CREDITS(toep->tp_copied_seq - toep->tp_rcv_wup));
 599         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 600         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
 601         toep->tp_rcv_wup = toep->tp_copied_seq;
 602 }
 603
/*
 * Handle receipt of an urgent pointer.
 *
 * The whole body is compiled only when URGENT_DATA_SUPPORTED is defined;
 * otherwise this function is a no-op and both arguments are unused.
 *
 * NOTE(review): the conditional body still refers to identifiers such as
 * 'sk', 'struct sk_buff' and tom_eat_skb() that are not declared in this
 * function, so it appears not to have been ported yet and presumably would
 * not build if URGENT_DATA_SUPPORTED were defined -- confirm before enabling.
 */
static void
handle_urg_ptr(struct socket *so, uint32_t urg_seq)
{
#ifdef URGENT_DATA_SUPPORTED
        struct tcpcb *tp = so_sototcpcb(so);

        urg_seq--;   /* initially points past the urgent data, per BSD */

        if (tp->urg_data && !after(urg_seq, tp->urg_seq))
                return;                                 /* duplicate pointer */
        sk_send_sigurg(sk);
        /*
         * If the previous urgent byte is the next byte to be copied and is
         * not delivered inline, step over it before recording the new
         * pointer.
         */
        if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
            !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
                struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

                tp->copied_seq++;
                if (skb && tp->copied_seq - TCP_SKB_CB(skb)->seq >= skb->len)
                        tom_eat_skb(sk, skb, 0);
        }
        tp->urg_data = TCP_URG_NOTYET;
        tp->urg_seq = urg_seq;
#endif
}
 630
 631 /*
 632  * Returns true if a socket cannot accept new Rx data.
 633  */
 634 static inline int
 635 so_no_receive(const struct socket *so)
 636 {
 637         return (so_state_get(so) & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
 638 }
 639
 640 /*
 641  * Process an urgent data notification.
 642  */
 643 static void
 644 rx_urg_notify(struct toepcb *toep, struct mbuf *m)
 645 {
 646         struct cpl_rx_urg_notify *hdr = cplhdr(m);
 647         struct socket *so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
 648
 649         VALIDATE_SOCK(so);
 650
 651         if (!so_no_receive(so))
 652                 handle_urg_ptr(so, ntohl(hdr->seq));
 653
 654         m_freem(m);
 655 }
 656
 657 /*
 658  * Handler for RX_URG_NOTIFY CPL messages.
 659  */
 660 static int
 661 do_rx_urg_notify(struct t3cdev *cdev, struct mbuf *m, void *ctx)
 662 {
 663         struct toepcb *toep = (struct toepcb *)ctx;
 664
 665         rx_urg_notify(toep, m);
 666         return (0);
 667 }
 668
 669 static __inline int
 670 is_delack_mode_valid(struct toedev *dev, struct toepcb *toep)
 671 {
 672         return (toep->tp_ulp_mode ||
 673                 (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
 674                     dev->tod_ttid >= TOE_ID_CHELSIO_T3));
 675 }
 676
 677 /*
 678  * Set of states for which we should return RX credits.
 679  */
 680 #define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)
 681
 682 /*
 683  * Called after some received data has been read.  It returns RX credits
 684  * to the HW for the amount of data processed.
 685  */
 686 void
 687 t3_cleanup_rbuf(struct tcpcb *tp, int copied)
 688 {
 689         struct toepcb *toep = tp->t_toe;
 690         struct socket *so;
 691         struct toedev *dev;
 692         int dack_mode, must_send, read;
 693         u32 thres, credits, dack = 0;
 694         struct sockbuf *rcv;
 695
 696         so = inp_inpcbtosocket(tp->t_inpcb);
 697         rcv = so_sockbuf_rcv(so);
 698
 699         if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
 700                 (tp->t_state == TCPS_FIN_WAIT_2))) {
 701                 if (copied) {
 702                         sockbuf_lock(rcv);
 703                         toep->tp_copied_seq += copied;
 704                         sockbuf_unlock(rcv);
 705                 }
 706
 707                 return;
 708         }
 709
 710         inp_lock_assert(tp->t_inpcb);
 711
 712         sockbuf_lock(rcv);
 713         if (copied)
 714                 toep->tp_copied_seq += copied;
 715         else {
 716                 read = toep->tp_enqueued_bytes - rcv->sb_cc;
 717                 toep->tp_copied_seq += read;
 718         }
 719         credits = toep->tp_copied_seq - toep->tp_rcv_wup;
 720         toep->tp_enqueued_bytes = rcv->sb_cc;
 721         sockbuf_unlock(rcv);
 722
 723         if (credits > rcv->sb_mbmax) {
 724                 log(LOG_ERR, "copied_seq=%u rcv_wup=%u credits=%u\n",
 725                     toep->tp_copied_seq, toep->tp_rcv_wup, credits);
 726             credits = rcv->sb_mbmax;
 727         }
 728
 729
 730         /*
 731          * XXX this won't accurately reflect credit return - we need
 732          * to look at the difference between the amount that has been
 733          * put in the recv sockbuf and what is there now
 734          */
 735
 736         if (__predict_false(!credits))
 737                 return;
 738
 739         dev = toep->tp_toedev;
 740         thres = TOM_TUNABLE(dev, rx_credit_thres);
 741
 742         if (__predict_false(thres == 0))
 743                 return;
 744
 745         if (is_delack_mode_valid(dev, toep)) {
 746                 dack_mode = TOM_TUNABLE(dev, delack);
 747                 if (__predict_false(dack_mode != toep->tp_delack_mode)) {
 748                         u32 r = tp->rcv_nxt - toep->tp_delack_seq;
 749
 750                         if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
 751                                 dack = F_RX_DACK_CHANGE |
 752                                        V_RX_DACK_MODE(dack_mode);
 753                 }
 754         } else
 755                 dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
 756
 757         /*
 758          * For coalescing to work effectively ensure the receive window has
 759          * at least 16KB left.
 760          */
 761         must_send = credits + 16384 >= tp->rcv_wnd;
 762
 763         if (must_send || credits >= thres)
 764                 toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
 765 }
 766
 767 static int
 768 cxgb_toe_disconnect(struct tcpcb *tp)
 769 {
 770         struct socket *so;
 771
 772         DPRINTF("cxgb_toe_disconnect\n");
 773
 774         so = inp_inpcbtosocket(tp->t_inpcb);
 775         close_conn(so);
 776         return (0);
 777 }
 778
 779 static int
 780 cxgb_toe_reset(struct tcpcb *tp)
 781 {
 782         struct toepcb *toep = tp->t_toe;
 783
 784         t3_send_reset(toep);
 785
 786         /*
 787          * unhook from socket
 788          */
 789         tp->t_flags &= ~TF_TOE;
 790         toep->tp_tp = NULL;
 791         tp->t_toe = NULL;
 792         return (0);
 793 }
 794
 795 static int
 796 cxgb_toe_send(struct tcpcb *tp)
 797 {
 798         struct socket *so;
 799
 800         DPRINTF("cxgb_toe_send\n");
 801         dump_toepcb(tp->t_toe);
 802
 803         so = inp_inpcbtosocket(tp->t_inpcb);
 804         t3_push_frames(so, 1);
 805         return (0);
 806 }
 807
 808 static int
 809 cxgb_toe_rcvd(struct tcpcb *tp)
 810 {
 811
 812         inp_lock_assert(tp->t_inpcb);
 813
 814         t3_cleanup_rbuf(tp, 0);
 815
 816         return (0);
 817 }
 818
 819 static void
 820 cxgb_toe_detach(struct tcpcb *tp)
 821 {
 822         struct toepcb *toep;
 823
 824         /*
 825          * XXX how do we handle teardown in the SYN_SENT state?
 826          *
 827          */
 828         inp_lock_assert(tp->t_inpcb);
 829         toep = tp->t_toe;
 830         toep->tp_tp = NULL;
 831
 832         /*
 833          * unhook from socket
 834          */
 835         tp->t_flags &= ~TF_TOE;
 836         tp->t_toe = NULL;
 837 }
 838
 839
 840 static struct toe_usrreqs cxgb_toe_usrreqs = {
 841         .tu_disconnect = cxgb_toe_disconnect,
 842         .tu_reset = cxgb_toe_reset,
 843         .tu_send = cxgb_toe_send,
 844         .tu_rcvd = cxgb_toe_rcvd,
 845         .tu_detach = cxgb_toe_detach,
 846         .tu_detach = cxgb_toe_detach,
 847         .tu_syncache_event = handle_syncache_event,
 848 };
 849
 850
 851 static void
 852 __set_tcb_field(struct toepcb *toep, struct mbuf *m, uint16_t word,
 853                             uint64_t mask, uint64_t val, int no_reply)
 854 {
 855         struct cpl_set_tcb_field *req;
 856
 857         CTR4(KTR_TCB, "__set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
 858             toep->tp_tid, word, mask, val);
 859
 860         req = mtod(m, struct cpl_set_tcb_field *);
 861         m->m_pkthdr.len = m->m_len = sizeof(*req);
 862         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
 863         req->wr.wr_lo = 0;
 864         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
 865         req->reply = V_NO_REPLY(no_reply);
 866         req->cpu_idx = 0;
 867         req->word = htons(word);
 868         req->mask = htobe64(mask);
 869         req->val = htobe64(val);
 870
 871         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
 872         send_or_defer(toep, m, 0);
 873 }
 874
 875 static void
 876 t3_set_tcb_field(struct toepcb *toep, uint16_t word, uint64_t mask, uint64_t val)
 877 {
 878         struct mbuf *m;
 879         struct tcpcb *tp = toep->tp_tp;
 880
 881         if (toep == NULL)
 882                 return;
 883
 884         if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN)) {
 885                 printf("not seting field\n");
 886                 return;
 887         }
 888
 889         m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));
 890
 891         __set_tcb_field(toep, m, word, mask, val, 1);
 892 }
 893
 894 /*
 895  * Set one of the t_flags bits in the TCB.
 896  */
 897 static void
 898 set_tcb_tflag(struct toepcb *toep, unsigned int bit_pos, int val)
 899 {
 900
 901         t3_set_tcb_field(toep, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
 902 }
 903
 904 /*
 905  * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 906  */
 907 static void
 908 t3_set_nagle(struct toepcb *toep)
 909 {
 910         struct tcpcb *tp = toep->tp_tp;
 911
 912         set_tcb_tflag(toep, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
 913 }
 914
 915 /*
 916  * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 917  */
 918 void
 919 t3_set_keepalive(struct toepcb *toep, int on_off)
 920 {
 921
 922         set_tcb_tflag(toep, S_TF_KEEPALIVE, on_off);
 923 }
 924
 925 void
 926 t3_set_rcv_coalesce_enable(struct toepcb *toep, int on_off)
 927 {
 928         set_tcb_tflag(toep, S_TF_RCV_COALESCE_ENABLE, on_off);
 929 }
 930
 931 void
 932 t3_set_dack_mss(struct toepcb *toep, int on_off)
 933 {
 934
 935         set_tcb_tflag(toep, S_TF_DACK_MSS, on_off);
 936 }
 937
 938 /*
 939  * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 940  */
 941 static void
 942 t3_set_tos(struct toepcb *toep)
 943 {
 944         int tos = inp_ip_tos_get(toep->tp_tp->t_inpcb);
 945
 946         t3_set_tcb_field(toep, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
 947                          V_TCB_TOS(tos));
 948 }
 949
 950
 951 /*
 952  * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 953  * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 954  * set the PSH bit in the last segment, which would trigger delivery.]
 955  * We work around the issue by setting a DDP buffer in a partial placed state,
 956  * which guarantees that TP will schedule a timer.
 957  */
 958 #define TP_DDP_TIMER_WORKAROUND_MASK\
 959     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
 960      ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
 961        V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
 962 #define TP_DDP_TIMER_WORKAROUND_VAL\
 963     (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
 964      ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
 965       32))
 966
 967 static void
 968 t3_enable_ddp(struct toepcb *toep, int on)
 969 {
 970         if (on) {
 971
 972                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
 973                                  V_TF_DDP_OFF(0));
 974         } else
 975                 t3_set_tcb_field(toep, W_TCB_RX_DDP_FLAGS,
 976                                  V_TF_DDP_OFF(1) |
 977                                  TP_DDP_TIMER_WORKAROUND_MASK,
 978                                  V_TF_DDP_OFF(1) |
 979                                  TP_DDP_TIMER_WORKAROUND_VAL);
 980
 981 }
 982
 983 void
 984 t3_set_ddp_tag(struct toepcb *toep, int buf_idx, unsigned int tag_color)
 985 {
 986         t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
 987                          V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 988                          tag_color);
 989 }
 990
 991 void
 992 t3_set_ddp_buf(struct toepcb *toep, int buf_idx, unsigned int offset,
 993                     unsigned int len)
 994 {
 995         if (buf_idx == 0)
 996                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF0_OFFSET,
 997                          V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 998                          V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 999                          V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
1000                          V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
1001         else
1002                 t3_set_tcb_field(toep, W_TCB_RX_DDP_BUF1_OFFSET,
1003                          V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
1004                          V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
1005                          V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
1006                          V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
1007 }
1008
/*
 * Validate a congestion control algorithm name.
 *
 * When CONGESTION_CONTROL_SUPPORTED is defined, the name must match an entry
 * in t3_cong_ops; otherwise every name is accepted.
 *
 * Returns 0 on success or EINVAL (a positive errno, since the caller returns
 * it directly from its ctloutput path) when the name is unknown.
 */
static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef CONGESTION_CONTROL_SUPPORTED
        int cong_algo;

        for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
                if (strcmp(name, t3_cong_ops[cong_algo].name) == 0)
                        break;

        if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
                return (EINVAL);
#endif
        return (0);
}
1024
1025 int
1026 t3_get_tcb(struct toepcb *toep)
1027 {
1028         struct cpl_get_tcb *req;
1029         struct tcpcb *tp = toep->tp_tp;
1030         struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
1031
1032         if (!m)
1033                 return (ENOMEM);
1034
1035         inp_lock_assert(tp->t_inpcb);
1036         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
1037         req = mtod(m, struct cpl_get_tcb *);
1038         m->m_pkthdr.len = m->m_len = sizeof(*req);
1039         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1040         req->wr.wr_lo = 0;
1041         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
1042         req->cpuno = htons(toep->tp_qset);
1043         req->rsvd = 0;
1044         if (tp->t_state == TCPS_SYN_SENT)
1045                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1046         else
1047                 cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
1048         return 0;
1049 }
1050
1051 static inline void
1052 so_insert_tid(struct tom_data *d, struct toepcb *toep, unsigned int tid)
1053 {
1054
1055         toepcb_hold(toep);
1056
1057         cxgb_insert_tid(d->cdev, d->client, toep, tid);
1058 }
1059
1060 /**
1061  *      find_best_mtu - find the entry in the MTU table closest to an MTU
1062  *      @d: TOM state
1063  *      @mtu: the target MTU
1064  *
1065  *      Returns the index of the value in the MTU table that is closest to but
1066  *      does not exceed the target MTU.
1067  */
1068 static unsigned int
1069 find_best_mtu(const struct t3c_data *d, unsigned short mtu)
1070 {
1071         int i = 0;
1072
1073         while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
1074                 ++i;
1075         return (i);
1076 }
1077
1078 static unsigned int
1079 select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
1080 {
1081         unsigned int idx;
1082
1083 #ifdef notyet
1084         struct rtentry *dst = so_sotoinpcb(so)->inp_route.ro_rt;
1085 #endif
1086         if (tp) {
1087                 tp->t_maxseg = pmtu - 40;
1088                 if (tp->t_maxseg < td->mtus[0] - 40)
1089                         tp->t_maxseg = td->mtus[0] - 40;
1090                 idx = find_best_mtu(td, tp->t_maxseg + 40);
1091
1092                 tp->t_maxseg = td->mtus[idx] - 40;
1093         } else
1094                 idx = find_best_mtu(td, pmtu);
1095
1096         return (idx);
1097 }
1098
1099 static inline void
1100 free_atid(struct t3cdev *cdev, unsigned int tid)
1101 {
1102         struct toepcb *toep = cxgb_free_atid(cdev, tid);
1103
1104         if (toep)
1105                 toepcb_release(toep);
1106 }
1107
1108 /*
1109  * Release resources held by an offload connection (TID, L2T entry, etc.)
1110  */
1111 static void
1112 t3_release_offload_resources(struct toepcb *toep)
1113 {
1114         struct tcpcb *tp = toep->tp_tp;
1115         struct toedev *tdev = toep->tp_toedev;
1116         struct t3cdev *cdev;
1117         struct socket *so;
1118         unsigned int tid = toep->tp_tid;
1119         struct sockbuf *rcv;
1120
1121         CTR0(KTR_TOM, "t3_release_offload_resources");
1122
1123         if (!tdev)
1124                 return;
1125
1126         cdev = TOEP_T3C_DEV(toep);
1127         if (!cdev)
1128                 return;
1129
1130         toep->tp_qset = 0;
1131         t3_release_ddp_resources(toep);
1132
1133 #ifdef CTRL_SKB_CACHE
1134         kfree_skb(CTRL_SKB_CACHE(tp));
1135         CTRL_SKB_CACHE(tp) = NULL;
1136 #endif
1137
1138         if (toep->tp_wr_avail != toep->tp_wr_max) {
1139                 purge_wr_queue(toep);
1140                 reset_wr_list(toep);
1141         }
1142
1143         if (toep->tp_l2t) {
1144                 l2t_release(L2DATA(cdev), toep->tp_l2t);
1145                 toep->tp_l2t = NULL;
1146         }
1147         toep->tp_tp = NULL;
1148         if (tp) {
1149                 inp_lock_assert(tp->t_inpcb);
1150                 so = inp_inpcbtosocket(tp->t_inpcb);
1151                 rcv = so_sockbuf_rcv(so);
1152                 /*
1153                  * cancel any offloaded reads
1154                  *
1155                  */
1156                 sockbuf_lock(rcv);
1157                 tp->t_toe = NULL;
1158                 tp->t_flags &= ~TF_TOE;
1159                 if (toep->tp_ddp_state.user_ddp_pending) {
1160                         t3_cancel_ubuf(toep, rcv);
1161                         toep->tp_ddp_state.user_ddp_pending = 0;
1162                 }
1163                 so_sorwakeup_locked(so);
1164
1165         }
1166
1167         if (toep->tp_state == TCPS_SYN_SENT) {
1168                 free_atid(cdev, tid);
1169 #ifdef notyet
1170                 __skb_queue_purge(&tp->out_of_order_queue);
1171 #endif
1172         } else {                                          // we have TID
1173                 cxgb_remove_tid(cdev, toep, tid);
1174                 toepcb_release(toep);
1175         }
1176 #if 0
1177         log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
1178 #endif
1179 }
1180
1181 static void
1182 install_offload_ops(struct socket *so)
1183 {
1184         struct tcpcb *tp = so_sototcpcb(so);
1185
1186         KASSERT(tp->t_toe != NULL, ("toepcb not set"));
1187
1188         t3_install_socket_ops(so);
1189         tp->t_flags |= TF_TOE;
1190         tp->t_tu = &cxgb_toe_usrreqs;
1191 }
1192
1193 /*
1194  * Determine the receive window scaling factor given a target max
1195  * receive window.
1196  */
1197 static __inline int
1198 select_rcv_wscale(int space)
1199 {
1200         int wscale = 0;
1201
1202         if (space > MAX_RCV_WND)
1203                 space = MAX_RCV_WND;
1204
1205         if (tcp_do_rfc1323)
1206                 for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
1207
1208         return (wscale);
1209 }
1210
1211 /*
1212  * Determine the receive window size for a socket.
1213  */
1214 static unsigned long
1215 select_rcv_wnd(struct toedev *dev, struct socket *so)
1216 {
1217         struct tom_data *d = TOM_DATA(dev);
1218         unsigned int wnd;
1219         unsigned int max_rcv_wnd;
1220         struct sockbuf *rcv;
1221
1222         rcv = so_sockbuf_rcv(so);
1223
1224         if (tcp_do_autorcvbuf)
1225                 wnd = tcp_autorcvbuf_max;
1226         else
1227                 wnd = rcv->sb_hiwat;
1228
1229
1230
1231         /* XXX
1232          * For receive coalescing to work effectively we need a receive window
1233          * that can accomodate a coalesced segment.
1234          */
1235         if (wnd < MIN_RCV_WND)
1236                 wnd = MIN_RCV_WND;
1237
1238         /* PR 5138 */
1239         max_rcv_wnd = (dev->tod_ttid < TOE_ID_CHELSIO_T3C ?
1240                                     (uint32_t)d->rx_page_size * 23 :
1241                                     MAX_RCV_WND);
1242
1243         return min(wnd, max_rcv_wnd);
1244 }
1245
1246 /*
1247  * Assign offload parameters to some socket fields.  This code is used by
1248  * both active and passive opens.
1249  */
1250 static inline void
1251 init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
1252     struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
1253 {
1254         struct tcpcb *tp = so_sototcpcb(so);
1255         struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);
1256         struct sockbuf *snd, *rcv;
1257
1258 #ifdef notyet
1259         SOCK_LOCK_ASSERT(so);
1260 #endif
1261
1262         snd = so_sockbuf_snd(so);
1263         rcv = so_sockbuf_rcv(so);
1264
1265         log(LOG_INFO, "initializing offload socket\n");
1266         /*
1267          * We either need to fix push frames to work with sbcompress
1268          * or we need to add this
1269          */
1270         snd->sb_flags |= SB_NOCOALESCE;
1271         rcv->sb_flags |= SB_NOCOALESCE;
1272
1273         tp->t_toe = toep;
1274         toep->tp_tp = tp;
1275         toep->tp_toedev = dev;
1276
1277         toep->tp_tid = tid;
1278         toep->tp_l2t = e;
1279         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
1280         toep->tp_wr_unacked = 0;
1281         toep->tp_delack_mode = 0;
1282
1283         toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
1284         /*
1285          * XXX broken
1286          *
1287          */
1288         tp->rcv_wnd = select_rcv_wnd(dev, so);
1289
1290         toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
1291                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
1292         toep->tp_qset_idx = 0;
1293
1294         reset_wr_list(toep);
1295         DPRINTF("initialization done\n");
1296 }
1297
1298 /*
1299  * The next two functions calculate the option 0 value for a socket.
1300  */
1301 static inline unsigned int
1302 calc_opt0h(struct socket *so, int mtu_idx)
1303 {
1304         struct tcpcb *tp = so_sototcpcb(so);
1305         int wscale = select_rcv_wscale(tp->rcv_wnd);
1306
1307         return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
1308             V_KEEP_ALIVE((so_options_get(so) & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
1309             V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
1310 }
1311
1312 static inline unsigned int
1313 calc_opt0l(struct socket *so, int ulp_mode)
1314 {
1315         struct tcpcb *tp = so_sototcpcb(so);
1316         unsigned int val;
1317
1318         val = V_TOS(INP_TOS(tp->t_inpcb)) | V_ULP_MODE(ulp_mode) |
1319                V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));
1320
1321         DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", INP_TOS(tp->t_inpcb), tp->rcv_wnd, val);
1322         return (val);
1323 }
1324
1325 static inline unsigned int
1326 calc_opt2(const struct socket *so, struct toedev *dev)
1327 {
1328         int flv_valid;
1329
1330         flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);
1331
1332         return (V_FLAVORS_VALID(flv_valid) |
1333             V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0));
1334 }
1335
1336 #if DEBUG_WR > 1
1337 static int
1338 count_pending_wrs(const struct toepcb *toep)
1339 {
1340         const struct mbuf *m;
1341         int n = 0;
1342
1343         wr_queue_walk(toep, m)
1344                 n += m->m_pkthdr.csum_data;
1345         return (n);
1346 }
1347 #endif
1348
1349 #if 0
1350 (((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
1351 #endif
1352
1353 static void
1354 mk_act_open_req(struct socket *so, struct mbuf *m,
1355     unsigned int atid, const struct l2t_entry *e)
1356 {
1357         struct cpl_act_open_req *req;
1358         struct inpcb *inp = so_sotoinpcb(so);
1359         struct tcpcb *tp = inp_inpcbtotcpcb(inp);
1360         struct toepcb *toep = tp->t_toe;
1361         struct toedev *tdev = toep->tp_toedev;
1362
1363         m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, toep));
1364
1365         req = mtod(m, struct cpl_act_open_req *);
1366         m->m_pkthdr.len = m->m_len = sizeof(*req);
1367
1368         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
1369         req->wr.wr_lo = 0;
1370         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
1371         inp_4tuple_get(inp, &req->local_ip, &req->local_port, &req->peer_ip, &req->peer_port);
1372 #if 0
1373         req->local_port = inp->inp_lport;
1374         req->peer_port = inp->inp_fport;
1375         memcpy(&req->local_ip, &inp->inp_laddr, 4);
1376         memcpy(&req->peer_ip, &inp->inp_faddr, 4);
1377 #endif
1378         req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
1379                            V_TX_CHANNEL(e->smt_idx));
1380         req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
1381         req->params = 0;
1382         req->opt2 = htonl(calc_opt2(so, tdev));
1383 }
1384
1385
1386 /*
1387  * Convert an ACT_OPEN_RPL status to an errno.
1388  */
static int
act_open_rpl_status_to_errno(int status)
{
        int error;

        switch (status) {
        case CPL_ERR_CONN_RESET:
                error = ECONNREFUSED;
                break;
        case CPL_ERR_ARP_MISS:
                error = EHOSTUNREACH;
                break;
        case CPL_ERR_CONN_TIMEDOUT:
                error = ETIMEDOUT;
                break;
        case CPL_ERR_TCAM_FULL:
                error = ENOMEM;
                break;
        case CPL_ERR_CONN_EXIST:
                log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
                error = EADDRINUSE;
                break;
        default:
                /* Any status without a specific mapping is reported as EIO. */
                error = EIO;
                break;
        }

        return (error);
}
1408
1409 static void
1410 fail_act_open(struct toepcb *toep, int errno)
1411 {
1412         struct tcpcb *tp = toep->tp_tp;
1413
1414         t3_release_offload_resources(toep);
1415         if (tp) {
1416                 inp_wunlock(tp->t_inpcb);
1417                 tcp_offload_drop(tp, errno);
1418         }
1419
1420 #ifdef notyet
1421         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1422 #endif
1423 }
1424
1425 /*
1426  * Handle active open failures.
1427  */
1428 static void
1429 active_open_failed(struct toepcb *toep, struct mbuf *m)
1430 {
1431         struct cpl_act_open_rpl *rpl = cplhdr(m);
1432         struct inpcb *inp;
1433
1434         if (toep->tp_tp == NULL)
1435                 goto done;
1436
1437         inp = toep->tp_tp->t_inpcb;
1438
1439 /*
1440  * Don't handle connection retry for now
1441  */
1442 #ifdef notyet
1443         struct inet_connection_sock *icsk = inet_csk(sk);
1444
1445         if (rpl->status == CPL_ERR_CONN_EXIST &&
1446             icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
1447                 icsk->icsk_retransmit_timer.function = act_open_retry_timer;
1448                 sk_reset_timer(so, &icsk->icsk_retransmit_timer,
1449                                jiffies + HZ / 2);
1450         } else
1451 #endif
1452         {
1453                 inp_wlock(inp);
1454                 /*
1455                  * drops the inpcb lock
1456                  */
1457                 fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
1458         }
1459
1460         done:
1461         m_free(m);
1462 }
1463
1464 /*
1465  * Return whether a failed active open has allocated a TID
1466  */
1467 static inline int
1468 act_open_has_tid(int status)
1469 {
1470         return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
1471                status != CPL_ERR_ARP_MISS;
1472 }
1473
1474 /*
1475  * Process an ACT_OPEN_RPL CPL message.
1476  */
1477 static int
1478 do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1479 {
1480         struct toepcb *toep = (struct toepcb *)ctx;
1481         struct cpl_act_open_rpl *rpl = cplhdr(m);
1482
1483         if (cdev->type != T3A && act_open_has_tid(rpl->status))
1484                 cxgb_queue_tid_release(cdev, GET_TID(rpl));
1485
1486         active_open_failed(toep, m);
1487         return (0);
1488 }
1489
1490 /*
1491  * Handle an ARP failure for an active open.   XXX purge ofo queue
1492  *
1493  * XXX badly broken for crossed SYNs as the ATID is no longer valid.
1494  * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
1495  * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
1496  * free the atid.  Hmm.
1497  */
1498 #ifdef notyet
1499 static void
1500 act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
1501 {
1502         struct toepcb *toep = m_get_toep(m);
1503         struct tcpcb *tp = toep->tp_tp;
1504         struct inpcb *inp = tp->t_inpcb;
1505         struct socket *so;
1506
1507         inp_wlock(inp);
1508         if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
1509                 /*
1510                  * drops the inpcb lock
1511                  */
1512                 fail_act_open(so, EHOSTUNREACH);
1513                 printf("freeing %p\n", m);
1514
1515                 m_free(m);
1516         } else
1517                 inp_wunlock(inp);
1518 }
1519 #endif
1520 /*
1521  * Send an active open request.
1522  */
1523 int
1524 t3_connect(struct toedev *tdev, struct socket *so,
1525     struct rtentry *rt, struct sockaddr *nam)
1526 {
1527         struct mbuf *m;
1528         struct l2t_entry *e;
1529         struct tom_data *d = TOM_DATA(tdev);
1530         struct inpcb *inp = so_sotoinpcb(so);
1531         struct tcpcb *tp = intotcpcb(inp);
1532         struct toepcb *toep; /* allocated by init_offload_socket */
1533
1534         int atid;
1535
1536         toep = toepcb_alloc();
1537         if (toep == NULL)
1538                 goto out_err;
1539
1540         if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
1541                 goto out_err;
1542
1543         e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
1544         if (!e)
1545                 goto free_tid;
1546
1547         inp_lock_assert(inp);
1548         m = m_gethdr(MT_DATA, M_WAITOK);
1549
1550 #if 0
1551         m->m_toe.mt_toepcb = tp->t_toe;
1552         set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
1553 #endif
1554         so_lock(so);
1555
1556         init_offload_socket(so, tdev, atid, e, rt, toep);
1557
1558         install_offload_ops(so);
1559
1560         mk_act_open_req(so, m, atid, e);
1561         so_unlock(so);
1562
1563         soisconnecting(so);
1564         toep = tp->t_toe;
1565         m_set_toep(m, tp->t_toe);
1566
1567         toep->tp_state = TCPS_SYN_SENT;
1568         l2t_send(d->cdev, (struct mbuf *)m, e);
1569
1570         if (toep->tp_ulp_mode)
1571                 t3_enable_ddp(toep, 0);
1572         return  (0);
1573
1574 free_tid:
1575         printf("failing connect - free atid\n");
1576
1577         free_atid(d->cdev, atid);
1578 out_err:
1579         printf("return ENOMEM\n");
1580        return (ENOMEM);
1581 }
1582
/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.  It returns
 * nothing; if an abort is already in progress or the toepcb has no TOE
 * device, no ABORT_REQ is generated.
 */
1589 static void
1590 t3_send_reset(struct toepcb *toep)
1591 {
1592
1593         struct cpl_abort_req *req;
1594         unsigned int tid = toep->tp_tid;
1595         int mode = CPL_ABORT_SEND_RST;
1596         struct tcpcb *tp = toep->tp_tp;
1597         struct toedev *tdev = toep->tp_toedev;
1598         struct socket *so = NULL;
1599         struct mbuf *m;
1600         struct sockbuf *snd;
1601
1602         if (tp) {
1603                 inp_lock_assert(tp->t_inpcb);
1604                 so = inp_inpcbtosocket(tp->t_inpcb);
1605         }
1606
1607         if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
1608                 tdev == NULL))
1609                 return;
1610         toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
1611
1612         snd = so_sockbuf_snd(so);
1613         /* Purge the send queue so we don't send anything after an abort. */
1614         if (so)
1615                 sbflush(snd);
1616         if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
1617                 mode |= CPL_ABORT_POST_CLOSE_REQ;
1618
1619         m = m_gethdr_nofail(sizeof(*req));
1620         m_set_priority(m, mkprio(CPL_PRIORITY_DATA, toep));
1621         set_arp_failure_handler(m, abort_arp_failure);
1622
1623         req = mtod(m, struct cpl_abort_req *);
1624         req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
1625         req->wr.wr_lo = htonl(V_WR_TID(tid));
1626         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
1627         req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
1628         req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
1629         req->cmd = mode;
1630         if (tp && (tp->t_state == TCPS_SYN_SENT))
1631                 mbufq_tail(&toep->out_of_order_queue, m);       // defer
1632         else
1633                 l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
1634 }
1635
1636 static int
1637 t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
1638 {
1639         struct inpcb *inp;
1640         int error, optval;
1641
1642         if (sopt->sopt_name == IP_OPTIONS)
1643                 return (ENOPROTOOPT);
1644
1645         if (sopt->sopt_name != IP_TOS)
1646                 return (EOPNOTSUPP);
1647
1648         error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
1649
1650         if (error)
1651                 return (error);
1652
1653         if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
1654                 return (EPERM);
1655
1656         inp = so_sotoinpcb(so);
1657         inp_wlock(inp);
1658         inp_ip_tos_set(inp, optval);
1659 #if 0
1660         inp->inp_ip_tos = optval;
1661 #endif
1662         t3_set_tos(inp_inpcbtotcpcb(inp)->t_toe);
1663         inp_wunlock(inp);
1664
1665         return (0);
1666 }
1667
1668 static int
1669 t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
1670 {
1671         int err = 0;
1672         size_t copied;
1673
1674         if (sopt->sopt_name != TCP_CONGESTION &&
1675             sopt->sopt_name != TCP_NODELAY)
1676                 return (EOPNOTSUPP);
1677
1678         if (sopt->sopt_name == TCP_CONGESTION) {
1679                 char name[TCP_CA_NAME_MAX];
1680                 int optlen = sopt->sopt_valsize;
1681                 struct tcpcb *tp;
1682
1683                 if (optlen < 1)
1684                         return (EINVAL);
1685
1686                 err = copyinstr(sopt->sopt_val, name,
1687                     min(TCP_CA_NAME_MAX - 1, optlen), &copied);
1688                 if (err)
1689                         return (err);
1690                 if (copied < 1)
1691                         return (EINVAL);
1692
1693                 tp = so_sototcpcb(so);
1694                 /*
1695                  * XXX I need to revisit this
1696                  */
1697                 if ((err = t3_set_cong_control(so, name)) == 0) {
1698 #ifdef CONGESTION_CONTROL_SUPPORTED
1699                         tp->t_cong_control = strdup(name, M_CXGB);
1700 #endif
1701                 } else
1702                         return (err);
1703         } else {
1704                 int optval, oldval;
1705                 struct inpcb *inp;
1706                 struct tcpcb *tp;
1707
1708                 err = sooptcopyin(sopt, &optval, sizeof optval,
1709                     sizeof optval);
1710
1711                 if (err)
1712                         return (err);
1713
1714                 inp = so_sotoinpcb(so);
1715                 tp = inp_inpcbtotcpcb(inp);
1716
1717                 inp_wlock(inp);
1718
1719                 oldval = tp->t_flags;
1720                 if (optval)
1721                         tp->t_flags |= TF_NODELAY;
1722                 else
1723                         tp->t_flags &= ~TF_NODELAY;
1724                 inp_wunlock(inp);
1725
1726
1727                 if (oldval != tp->t_flags && (tp->t_toe != NULL))
1728                         t3_set_nagle(tp->t_toe);
1729
1730         }
1731
1732         return (0);
1733 }
1734
1735 int
1736 t3_ctloutput(struct socket *so, struct sockopt *sopt)
1737 {
1738         int err;
1739
1740         if (sopt->sopt_level != IPPROTO_TCP)
1741                 err =  t3_ip_ctloutput(so, sopt);
1742         else
1743                 err = t3_tcp_ctloutput(so, sopt);
1744
1745         if (err != EOPNOTSUPP)
1746                 return (err);
1747
1748         return (tcp_ctloutput(so, sopt));
1749 }
1750
1751 /*
1752  * Returns true if we need to explicitly request RST when we receive new data
1753  * on an RX-closed connection.
1754  */
static inline int
need_rst_on_excess_rx(const struct toepcb *toep)
{
        /* Unconditionally true; the toepcb is not examined. */
        const int always_reset = 1;

        return (always_reset);
}
1760
1761 /*
1762  * Handles Rx data that arrives in a state where the socket isn't accepting
1763  * new data.
1764  */
1765 static void
1766 handle_excess_rx(struct toepcb *toep, struct mbuf *m)
1767 {
1768
1769         if (need_rst_on_excess_rx(toep) &&
1770             !(toep->tp_flags & TP_ABORT_SHUTDOWN))
1771                 t3_send_reset(toep);
1772         m_freem(m);
1773 }
1774
1775 /*
1776  * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
1777  * by getting the DDP offset from the TCB.
1778  */
1779 static void
1780 tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
1781 {
1782         struct ddp_state *q = &toep->tp_ddp_state;
1783         struct ddp_buf_state *bsp;
1784         struct cpl_get_tcb_rpl *hdr;
1785         unsigned int ddp_offset;
1786         struct socket *so;
1787         struct tcpcb *tp;
1788         struct sockbuf *rcv;
1789         int state;
1790
1791         uint64_t t;
1792         __be64 *tcb;
1793
1794         tp = toep->tp_tp;
1795         so = inp_inpcbtosocket(tp->t_inpcb);
1796
1797         inp_lock_assert(tp->t_inpcb);
1798         rcv = so_sockbuf_rcv(so);
1799         sockbuf_lock(rcv);
1800
1801         /* Note that we only accout for CPL_GET_TCB issued by the DDP code.
1802          * We really need a cookie in order to dispatch the RPLs.
1803          */
1804         q->get_tcb_count--;
1805
1806         /* It is a possible that a previous CPL already invalidated UBUF DDP
1807          * and moved the cur_buf idx and hence no further processing of this
1808          * skb is required. However, the app might be sleeping on
1809          * !q->get_tcb_count and we need to wake it up.
1810          */
1811         if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
1812                 int state = so_state_get(so);
1813
1814                 m_freem(m);
1815                 if (__predict_true((state & SS_NOFDREF) == 0))
1816                         so_sorwakeup_locked(so);
1817                 else
1818                         sockbuf_unlock(rcv);
1819
1820                 return;
1821         }
1822
1823         bsp = &q->buf_state[q->cur_buf];
1824         hdr = cplhdr(m);
1825         tcb = (__be64 *)(hdr + 1);
1826         if (q->cur_buf == 0) {
1827                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
1828                 ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
1829         } else {
1830                 t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
1831                 ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
1832         }
1833         ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
1834         m->m_cur_offset = bsp->cur_offset;
1835         bsp->cur_offset = ddp_offset;
1836         m->m_len = m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
1837
1838         CTR5(KTR_TOM,
1839             "tcb_rpl_as_ddp_complete: idx=%d seq=0x%x hwbuf=%u ddp_offset=%u cur_offset=%u",
1840             q->cur_buf, tp->rcv_nxt, q->cur_buf, ddp_offset, m->m_cur_offset);
1841         KASSERT(ddp_offset >= m->m_cur_offset,
1842             ("ddp_offset=%u less than cur_offset=%u",
1843                 ddp_offset, m->m_cur_offset));
1844
1845 #if 0
1846 {
1847         unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
1848
1849         t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
1850         ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
1851
1852         t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
1853         rcv_nxt = t >> S_TCB_RCV_NXT;
1854         rcv_nxt &= M_TCB_RCV_NXT;
1855
1856         t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
1857         rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
1858         rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
1859
1860         T3_TRACE2(TIDTB(sk),
1861                   "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
1862                   ddp_flags, rcv_nxt - rx_hdr_offset);
1863         T3_TRACE4(TB(q),
1864                   "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
1865                   tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
1866         T3_TRACE3(TB(q),
1867                   "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
1868                   rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
1869         T3_TRACE2(TB(q),
1870                   "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
1871                  q->buf_state[0].flags, q->buf_state[1].flags);
1872
1873 }
1874 #endif
1875         if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
1876                 handle_excess_rx(toep, m);
1877                 return;
1878         }
1879
1880 #ifdef T3_TRACE
1881         if ((int)m->m_pkthdr.len < 0) {
1882                 t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
1883         }
1884 #endif
1885         if (bsp->flags & DDP_BF_NOCOPY) {
1886 #ifdef T3_TRACE
1887                 T3_TRACE0(TB(q),
1888                           "tcb_rpl_as_ddp_complete: CANCEL UBUF");
1889
1890                 if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1891                         printk("!cancel_ubuf");
1892                         t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
1893                 }
1894 #endif
1895                 m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
1896                 bsp->flags &= ~(DDP_BF_NOCOPY|DDP_BF_NODATA);
1897                 q->cur_buf ^= 1;
1898         } else if (bsp->flags & DDP_BF_NOFLIP) {
1899
1900                 m->m_ddp_flags = 1;    /* always a kernel buffer */
1901
1902                 /* now HW buffer carries a user buffer */
1903                 bsp->flags &= ~DDP_BF_NOFLIP;
1904                 bsp->flags |= DDP_BF_NOCOPY;
1905
1906                 /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
1907                  * any new data in which case we're done. If in addition the
1908                  * offset is 0, then there wasn't a completion for the kbuf
1909                  * and we need to decrement the posted count.
1910                  */
1911                 if (m->m_pkthdr.len == 0) {
1912                         if (ddp_offset == 0) {
1913                                 q->kbuf_posted--;
1914                                 bsp->flags |= DDP_BF_NODATA;
1915                         }
1916                         sockbuf_unlock(rcv);
1917                         m_free(m);
1918                         return;
1919                 }
1920         } else {
1921                 sockbuf_unlock(rcv);
1922
1923                 /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
1924                  * but it got here way late and nobody cares anymore.
1925                  */
1926                 m_free(m);
1927                 return;
1928         }
1929
1930         m->m_ddp_gl = (unsigned char *)bsp->gl;
1931         m->m_flags |= M_DDP;
1932         m->m_seq = tp->rcv_nxt;
1933         tp->rcv_nxt += m->m_pkthdr.len;
1934         tp->t_rcvtime = ticks;
1935         CTR3(KTR_TOM, "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u m->m_pktlen %u",
1936                   m->m_seq, q->cur_buf, m->m_pkthdr.len);
1937         if (m->m_pkthdr.len == 0) {
1938                 q->user_ddp_pending = 0;
1939                 m_free(m);
1940         } else
1941                 SBAPPEND(rcv, m);
1942
1943         state = so_state_get(so);
1944         if (__predict_true((state & SS_NOFDREF) == 0))
1945                 so_sorwakeup_locked(so);
1946         else
1947                 sockbuf_unlock(rcv);
1948 }
1949
1950 /*
1951  * Process a CPL_GET_TCB_RPL.  These can also be generated by the DDP code,
1952  * in that case they are similar to DDP completions.
1953  */
1954 static int
1955 do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
1956 {
1957         struct toepcb *toep = (struct toepcb *)ctx;
1958
1959         /* OK if socket doesn't exist */
1960         if (toep == NULL) {
1961                 printf("null toep in do_get_tcb_rpl\n");
1962                 return (CPL_RET_BUF_DONE);
1963         }
1964
1965         inp_wlock(toep->tp_tp->t_inpcb);
1966         tcb_rpl_as_ddp_complete(toep, m);
1967         inp_wunlock(toep->tp_tp->t_inpcb);
1968
1969         return (0);
1970 }
1971
1972 static void
1973 handle_ddp_data(struct toepcb *toep, struct mbuf *m)
1974 {
1975         struct tcpcb *tp = toep->tp_tp;
1976         struct socket *so;
1977         struct ddp_state *q;
1978         struct ddp_buf_state *bsp;
1979         struct cpl_rx_data *hdr = cplhdr(m);
1980         unsigned int rcv_nxt = ntohl(hdr->seq);
1981         struct sockbuf *rcv;
1982
1983         if (tp->rcv_nxt == rcv_nxt)
1984                 return;
1985
1986         inp_lock_assert(tp->t_inpcb);
1987         so  = inp_inpcbtosocket(tp->t_inpcb);
1988         rcv = so_sockbuf_rcv(so);
1989         sockbuf_lock(rcv);
1990
1991         q = &toep->tp_ddp_state;
1992         bsp = &q->buf_state[q->cur_buf];
1993         KASSERT(SEQ_GT(rcv_nxt, tp->rcv_nxt), ("tp->rcv_nxt=0x%08x decreased rcv_nxt=0x08%x",
1994                 rcv_nxt, tp->rcv_nxt));
1995         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
1996         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
1997         CTR3(KTR_TOM, "rcv_nxt=0x%x tp->rcv_nxt=0x%x len=%d",
1998             rcv_nxt, tp->rcv_nxt, m->m_pkthdr.len);
1999
2000 #ifdef T3_TRACE
2001         if ((int)m->m_pkthdr.len < 0) {
2002                 t3_ddp_error(so, "handle_ddp_data: neg len");
2003         }
2004 #endif
2005         m->m_ddp_gl = (unsigned char *)bsp->gl;
2006         m->m_flags |= M_DDP;
2007         m->m_cur_offset = bsp->cur_offset;
2008         m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2009         if (bsp->flags & DDP_BF_NOCOPY)
2010                 bsp->flags &= ~DDP_BF_NOCOPY;
2011
2012         m->m_seq = tp->rcv_nxt;
2013         tp->rcv_nxt = rcv_nxt;
2014         bsp->cur_offset += m->m_pkthdr.len;
2015         if (!(bsp->flags & DDP_BF_NOFLIP))
2016                 q->cur_buf ^= 1;
2017         /*
2018          * For now, don't re-enable DDP after a connection fell out of  DDP
2019          * mode.
2020          */
2021         q->ubuf_ddp_ready = 0;
2022         sockbuf_unlock(rcv);
2023 }
2024
2025 /*
2026  * Process new data received for a connection.
2027  */
2028 static void
2029 new_rx_data(struct toepcb *toep, struct mbuf *m)
2030 {
2031         struct cpl_rx_data *hdr = cplhdr(m);
2032         struct tcpcb *tp = toep->tp_tp;
2033         struct socket *so;
2034         struct sockbuf *rcv;
2035         int state;
2036         int len = be16toh(hdr->len);
2037
2038         inp_wlock(tp->t_inpcb);
2039
2040         so  = inp_inpcbtosocket(tp->t_inpcb);
2041
2042         if (__predict_false(so_no_receive(so))) {
2043                 handle_excess_rx(toep, m);
2044                 inp_wunlock(tp->t_inpcb);
2045                 TRACE_EXIT;
2046                 return;
2047         }
2048
2049         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
2050                 handle_ddp_data(toep, m);
2051
2052         m->m_seq = ntohl(hdr->seq);
2053         m->m_ulp_mode = 0;                    /* for iSCSI */
2054
2055 #if VALIDATE_SEQ
2056         if (__predict_false(m->m_seq != tp->rcv_nxt)) {
2057                 log(LOG_ERR,
2058                        "%s: TID %u: Bad sequence number %u, expected %u\n",
2059                     toep->tp_toedev->name, toep->tp_tid, m->m_seq,
2060                        tp->rcv_nxt);
2061                 m_freem(m);
2062                 inp_wunlock(tp->t_inpcb);
2063                 return;
2064         }
2065 #endif
2066         m_adj(m, sizeof(*hdr));
2067
2068 #ifdef URGENT_DATA_SUPPORTED
2069         /*
2070          * We don't handle urgent data yet
2071          */
2072         if (__predict_false(hdr->urg))
2073                 handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
2074         if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
2075                      tp->urg_seq - tp->rcv_nxt < skb->len))
2076                 tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
2077                                                          tp->rcv_nxt];
2078 #endif
2079         if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
2080                 toep->tp_delack_mode = hdr->dack_mode;
2081                 toep->tp_delack_seq = tp->rcv_nxt;
2082         }
2083         CTR6(KTR_TOM, "appending mbuf=%p pktlen=%d m_len=%d len=%d rcv_nxt=0x%x enqueued_bytes=%d",
2084             m, m->m_pkthdr.len, m->m_len, len, tp->rcv_nxt, toep->tp_enqueued_bytes);
2085
2086         if (len < m->m_pkthdr.len)
2087                 m->m_pkthdr.len = m->m_len = len;
2088
2089         tp->rcv_nxt += m->m_pkthdr.len;
2090         tp->t_rcvtime = ticks;
2091         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2092         CTR2(KTR_TOM,
2093             "new_rx_data: seq 0x%x len %u",
2094             m->m_seq, m->m_pkthdr.len);
2095         inp_wunlock(tp->t_inpcb);
2096         rcv = so_sockbuf_rcv(so);
2097         sockbuf_lock(rcv);
2098 #if 0
2099         if (sb_notify(rcv))
2100                 DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, rcv->sb_flags, m->m_pkthdr.len);
2101 #endif
2102         SBAPPEND(rcv, m);
2103
2104 #ifdef notyet
2105         /*
2106          * We're giving too many credits to the card - but disable this check so we can keep on moving :-|
2107          *
2108          */
2109         KASSERT(rcv->sb_cc < (rcv->sb_mbmax << 1),
2110
2111             ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
2112                 so, rcv->sb_cc, rcv->sb_mbmax));
2113 #endif
2114
2115
2116         CTR2(KTR_TOM, "sb_cc=%d sb_mbcnt=%d",
2117             rcv->sb_cc, rcv->sb_mbcnt);
2118
2119         state = so_state_get(so);
2120         if (__predict_true((state & SS_NOFDREF) == 0))
2121                 so_sorwakeup_locked(so);
2122         else
2123                 sockbuf_unlock(rcv);
2124 }
2125
2126 /*
2127  * Handler for RX_DATA CPL messages.
2128  */
2129 static int
2130 do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2131 {
2132         struct toepcb *toep = (struct toepcb *)ctx;
2133
2134         DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
2135
2136         new_rx_data(toep, m);
2137
2138         return (0);
2139 }
2140
2141 static void
2142 new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
2143 {
2144         struct tcpcb *tp;
2145         struct ddp_state *q;
2146         struct ddp_buf_state *bsp;
2147         struct cpl_rx_data_ddp *hdr;
2148         struct socket *so;
2149         unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
2150         int nomoredata = 0;
2151         unsigned int delack_mode;
2152         struct sockbuf *rcv;
2153
2154         tp = toep->tp_tp;
2155         inp_wlock(tp->t_inpcb);
2156         so = inp_inpcbtosocket(tp->t_inpcb);
2157
2158         if (__predict_false(so_no_receive(so))) {
2159
2160                 handle_excess_rx(toep, m);
2161                 inp_wunlock(tp->t_inpcb);
2162                 return;
2163         }
2164
2165         q = &toep->tp_ddp_state;
2166         hdr = cplhdr(m);
2167         ddp_report = ntohl(hdr->u.ddp_report);
2168         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2169         bsp = &q->buf_state[buf_idx];
2170
2171         CTR4(KTR_TOM,
2172             "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
2173             "hdr seq 0x%x len %u",
2174             tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
2175             ntohs(hdr->len));
2176         CTR3(KTR_TOM,
2177             "new_rx_data_ddp: offset %u ddp_report 0x%x buf_idx=%d",
2178             G_DDP_OFFSET(ddp_report), ddp_report, buf_idx);
2179
2180         ddp_len = ntohs(hdr->len);
2181         rcv_nxt = ntohl(hdr->seq) + ddp_len;
2182
2183         delack_mode = G_DDP_DACK_MODE(ddp_report);
2184         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2185                 toep->tp_delack_mode = delack_mode;
2186                 toep->tp_delack_seq = tp->rcv_nxt;
2187         }
2188
2189         m->m_seq = tp->rcv_nxt;
2190         tp->rcv_nxt = rcv_nxt;
2191
2192         tp->t_rcvtime = ticks;
2193         /*
2194          * Store the length in m->m_len.  We are changing the meaning of
2195          * m->m_len here, we need to be very careful that nothing from now on
2196          * interprets ->len of this packet the usual way.
2197          */
2198         m->m_len = m->m_pkthdr.len = rcv_nxt - m->m_seq;
2199         inp_wunlock(tp->t_inpcb);
2200         CTR3(KTR_TOM,
2201             "new_rx_data_ddp: m_len=%u rcv_next 0x%08x rcv_nxt_prev=0x%08x ",
2202             m->m_len, rcv_nxt, m->m_seq);
2203         /*
2204          * Figure out where the new data was placed in the buffer and store it
2205          * in when.  Assumes the buffer offset starts at 0, consumer needs to
2206          * account for page pod's pg_offset.
2207          */
2208         end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
2209         m->m_cur_offset = end_offset - m->m_pkthdr.len;
2210
2211         rcv = so_sockbuf_rcv(so);
2212         sockbuf_lock(rcv);
2213
2214         m->m_ddp_gl = (unsigned char *)bsp->gl;
2215         m->m_flags |= M_DDP;
2216         bsp->cur_offset = end_offset;
2217         toep->tp_enqueued_bytes += m->m_pkthdr.len;
2218
2219         /*
2220          * Length is only meaningful for kbuf
2221          */
2222         if (!(bsp->flags & DDP_BF_NOCOPY))
2223                 KASSERT(m->m_len <= bsp->gl->dgl_length,
2224                     ("length received exceeds ddp pages: len=%d dgl_length=%d",
2225                         m->m_len, bsp->gl->dgl_length));
2226
2227         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2228         KASSERT(m->m_next == NULL, ("m_len=%p", m->m_next));
2229         /*
2230          * Bit 0 of flags stores whether the DDP buffer is completed.
2231          * Note that other parts of the code depend on this being in bit 0.
2232          */
2233         if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
2234                 panic("spurious ddp completion");
2235         } else {
2236                 m->m_ddp_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
2237                 if (m->m_ddp_flags && !(bsp->flags & DDP_BF_NOFLIP))
2238                         q->cur_buf ^= 1;                     /* flip buffers */
2239         }
2240
2241         if (bsp->flags & DDP_BF_NOCOPY) {
2242                 m->m_ddp_flags |= (bsp->flags & DDP_BF_NOCOPY);
2243                 bsp->flags &= ~DDP_BF_NOCOPY;
2244         }
2245
2246         if (ddp_report & F_DDP_PSH)
2247                 m->m_ddp_flags |= DDP_BF_PSH;
2248         if (nomoredata)
2249                 m->m_ddp_flags |= DDP_BF_NODATA;
2250
2251 #ifdef notyet
2252         skb_reset_transport_header(skb);
2253         tcp_hdr(skb)->fin = 0;          /* changes original hdr->ddp_report */
2254 #endif
2255         SBAPPEND(rcv, m);
2256
2257         if ((so_state_get(so) & SS_NOFDREF) == 0 && ((ddp_report & F_DDP_PSH) ||
2258             (((m->m_ddp_flags & (DDP_BF_NOCOPY|1)) == (DDP_BF_NOCOPY|1))
2259                 || !(m->m_ddp_flags & DDP_BF_NOCOPY))))
2260                 so_sorwakeup_locked(so);
2261         else
2262                 sockbuf_unlock(rcv);
2263 }
2264
/*
 * Mask of the DDP error bits that do_rx_data_ddp() checks for in the
 * ddpvld_status word of an RX_DATA_DDP message.
 */
#define DDP_ERR (F_DDP_PPOD_MISMATCH |		\
		 F_DDP_LLIMIT_ERR |		\
		 F_DDP_ULIMIT_ERR |		\
		 F_DDP_PPOD_PARITY_ERR |	\
		 F_DDP_PADDING_ERR |		\
		 F_DDP_OFFSET_ERR |		\
		 F_DDP_INVALID_TAG |		\
		 F_DDP_COLOR_ERR |		\
		 F_DDP_TID_MISMATCH |		\
		 F_DDP_INVALID_PPOD)
2269
2270 /*
2271  * Handler for RX_DATA_DDP CPL messages.
2272  */
2273 static int
2274 do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2275 {
2276         struct toepcb *toep = ctx;
2277         const struct cpl_rx_data_ddp *hdr = cplhdr(m);
2278
2279         VALIDATE_SOCK(so);
2280
2281         if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
2282                 log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
2283                        GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
2284                 return (CPL_RET_BUF_DONE);
2285         }
2286 #if 0
2287         skb->h.th = tcphdr_skb->h.th;
2288 #endif
2289         new_rx_data_ddp(toep, m);
2290         return (0);
2291 }
2292
2293 static void
2294 process_ddp_complete(struct toepcb *toep, struct mbuf *m)
2295 {
2296         struct tcpcb *tp = toep->tp_tp;
2297         struct socket *so;
2298         struct ddp_state *q;
2299         struct ddp_buf_state *bsp;
2300         struct cpl_rx_ddp_complete *hdr;
2301         unsigned int ddp_report, buf_idx, when, delack_mode;
2302         int nomoredata = 0;
2303         struct sockbuf *rcv;
2304
2305         inp_wlock(tp->t_inpcb);
2306         so = inp_inpcbtosocket(tp->t_inpcb);
2307
2308         if (__predict_false(so_no_receive(so))) {
2309                 struct inpcb *inp = so_sotoinpcb(so);
2310
2311                 handle_excess_rx(toep, m);
2312                 inp_wunlock(inp);
2313                 return;
2314         }
2315         q = &toep->tp_ddp_state;
2316         hdr = cplhdr(m);
2317         ddp_report = ntohl(hdr->ddp_report);
2318         buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
2319         m->m_pkthdr.csum_data = tp->rcv_nxt;
2320
2321         rcv = so_sockbuf_rcv(so);
2322         sockbuf_lock(rcv);
2323
2324         bsp = &q->buf_state[buf_idx];
2325         when = bsp->cur_offset;
2326         m->m_len = m->m_pkthdr.len = G_DDP_OFFSET(ddp_report) - when;
2327         tp->rcv_nxt += m->m_len;
2328         tp->t_rcvtime = ticks;
2329
2330         delack_mode = G_DDP_DACK_MODE(ddp_report);
2331         if (__predict_false(G_DDP_DACK_MODE(ddp_report) != toep->tp_delack_mode)) {
2332                 toep->tp_delack_mode = delack_mode;
2333                 toep->tp_delack_seq = tp->rcv_nxt;
2334         }
2335 #ifdef notyet
2336         skb_reset_transport_header(skb);
2337         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2338 #endif
2339         inp_wunlock(tp->t_inpcb);
2340
2341         KASSERT(m->m_len >= 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2342         CTR5(KTR_TOM,
2343                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2344                   "ddp_report 0x%x offset %u, len %u",
2345                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2346                    G_DDP_OFFSET(ddp_report), m->m_len);
2347
2348         m->m_cur_offset = bsp->cur_offset;
2349         bsp->cur_offset += m->m_len;
2350
2351         if (!(bsp->flags & DDP_BF_NOFLIP)) {
2352                 q->cur_buf ^= 1;                     /* flip buffers */
2353                 if (G_DDP_OFFSET(ddp_report) < q->kbuf[0]->dgl_length)
2354                         nomoredata=1;
2355         }
2356
2357         CTR4(KTR_TOM,
2358                   "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
2359                   "ddp_report %u offset %u",
2360                   tp->rcv_nxt, bsp->cur_offset, ddp_report,
2361                    G_DDP_OFFSET(ddp_report));
2362
2363         m->m_ddp_gl = (unsigned char *)bsp->gl;
2364         m->m_flags |= M_DDP;
2365         m->m_ddp_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
2366         if (bsp->flags & DDP_BF_NOCOPY)
2367                 bsp->flags &= ~DDP_BF_NOCOPY;
2368         if (nomoredata)
2369                 m->m_ddp_flags |= DDP_BF_NODATA;
2370
2371         SBAPPEND(rcv, m);
2372         if ((so_state_get(so) & SS_NOFDREF) == 0)
2373                 so_sorwakeup_locked(so);
2374         else
2375                 sockbuf_unlock(rcv);
2376 }
2377
2378 /*
2379  * Handler for RX_DDP_COMPLETE CPL messages.
2380  */
2381 static int
2382 do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2383 {
2384         struct toepcb *toep = ctx;
2385
2386         VALIDATE_SOCK(so);
2387 #if 0
2388         skb->h.th = tcphdr_skb->h.th;
2389 #endif
2390         process_ddp_complete(toep, m);
2391         return (0);
2392 }
2393
2394 /*
2395  * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
2396  * socket state before calling tcp_time_wait to comply with its expectations.
2397  */
2398 static void
2399 enter_timewait(struct tcpcb *tp)
2400 {
2401         /*
2402          * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2403          * process peer_close because we don't want to carry the peer FIN in
2404          * the socket's receive queue and if we increment rcv_nxt without
2405          * having the FIN in the receive queue we'll confuse facilities such
2406          * as SIOCINQ.
2407          */
2408         inp_wlock(tp->t_inpcb);
2409         tp->rcv_nxt++;
2410
2411         tp->ts_recent_age = 0;       /* defeat recycling */
2412         tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2413         inp_wunlock(tp->t_inpcb);
2414         tcp_offload_twstart(tp);
2415 }
2416
2417 static void
2418 enter_timewait_disconnect(struct tcpcb *tp)
2419 {
2420         /*
2421          * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
2422          * process peer_close because we don't want to carry the peer FIN in
2423          * the socket's receive queue and if we increment rcv_nxt without
2424          * having the FIN in the receive queue we'll confuse facilities such
2425          * as SIOCINQ.
2426          */
2427         inp_wlock(tp->t_inpcb);
2428         tp->rcv_nxt++;
2429
2430         tp->ts_recent_age = 0;       /* defeat recycling */
2431         tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
2432         inp_wunlock(tp->t_inpcb);
2433         tcp_offload_twstart_disconnect(tp);
2434 }
2435
2436 /*
2437  * For TCP DDP a PEER_CLOSE may also be an implicit RX_DDP_COMPLETE.  This
2438  * function deals with the data that may be reported along with the FIN.
2439  * Returns -1 if no further processing of the PEER_CLOSE is needed, >= 0 to
2440  * perform normal FIN-related processing.  In the latter case 1 indicates that
2441  * there was an implicit RX_DDP_COMPLETE and the skb should not be freed, 0 the
2442  * skb can be freed.
2443  */
2444 static int
2445 handle_peer_close_data(struct socket *so, struct mbuf *m)
2446 {
2447         struct tcpcb *tp = so_sototcpcb(so);
2448         struct toepcb *toep = tp->t_toe;
2449         struct ddp_state *q;
2450         struct ddp_buf_state *bsp;
2451         struct cpl_peer_close *req = cplhdr(m);
2452         unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
2453         struct sockbuf *rcv;
2454
2455         if (tp->rcv_nxt == rcv_nxt)                     /* no data */
2456                 return (0);
2457
2458         CTR0(KTR_TOM, "handle_peer_close_data");
2459         if (__predict_false(so_no_receive(so))) {
2460                 handle_excess_rx(toep, m);
2461
2462                 /*
2463                  * Although we discard the data we want to process the FIN so
2464                  * that PEER_CLOSE + data behaves the same as RX_DATA_DDP +
2465                  * PEER_CLOSE without data.  In particular this PEER_CLOSE
2466                  * may be what will close the connection.  We return 1 because
2467                  * handle_excess_rx() already freed the packet.
2468                  */
2469                 return (1);
2470         }
2471
2472         inp_lock_assert(tp->t_inpcb);
2473         q = &toep->tp_ddp_state;
2474         rcv = so_sockbuf_rcv(so);
2475         sockbuf_lock(rcv);
2476
2477         bsp = &q->buf_state[q->cur_buf];
2478         m->m_len = m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
2479         KASSERT(m->m_len > 0, ("%s m_len=%d", __FUNCTION__, m->m_len));
2480         m->m_ddp_gl = (unsigned char *)bsp->gl;
2481         m->m_flags |= M_DDP;
2482         m->m_cur_offset = bsp->cur_offset;
2483         m->m_ddp_flags =
2484             DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
2485         m->m_seq = tp->rcv_nxt;
2486         tp->rcv_nxt = rcv_nxt;
2487         bsp->cur_offset += m->m_pkthdr.len;
2488         if (!(bsp->flags & DDP_BF_NOFLIP))
2489                 q->cur_buf ^= 1;
2490 #ifdef notyet
2491         skb_reset_transport_header(skb);
2492         tcp_hdr(skb)->fin = 0;          /* changes valid memory past CPL */
2493 #endif
2494         tp->t_rcvtime = ticks;
2495         SBAPPEND(rcv, m);
2496         if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
2497                 so_sorwakeup_locked(so);
2498         else
2499                 sockbuf_unlock(rcv);
2500
2501         return (1);
2502 }
2503
2504 /*
2505  * Handle a peer FIN.
2506  */
2507 static void
2508 do_peer_fin(struct toepcb *toep, struct mbuf *m)
2509 {
2510         struct socket *so;
2511         struct tcpcb *tp = toep->tp_tp;
2512         int keep, action;
2513
2514         action = keep = 0;
2515         CTR1(KTR_TOM, "do_peer_fin state=%d", tp->t_state);
2516         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2517                 printf("abort_pending set\n");
2518
2519                 goto out;
2520         }
2521         inp_wlock(tp->t_inpcb);
2522         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
2523         if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
2524                 keep = handle_peer_close_data(so, m);
2525                 if (keep < 0) {
2526                         inp_wunlock(tp->t_inpcb);
2527                         return;
2528                 }
2529         }
2530         if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
2531                 CTR1(KTR_TOM,
2532                     "waking up waiters for cantrcvmore on %p ", so);
2533                 socantrcvmore(so);
2534
2535                 /*
2536                  * If connection is half-synchronized
2537                  * (ie NEEDSYN flag on) then delay ACK,
2538                  * so it may be piggybacked when SYN is sent.
2539                  * Otherwise, since we received a FIN then no
2540                  * more input can be expected, send ACK now.
2541                  */
2542                 if (tp->t_flags & TF_NEEDSYN)
2543                         tp->t_flags |= TF_DELACK;
2544                 else
2545                         tp->t_flags |= TF_ACKNOW;
2546                 tp->rcv_nxt++;
2547         }
2548
2549         switch (tp->t_state) {
2550         case TCPS_SYN_RECEIVED:
2551             tp->t_starttime = ticks;
2552         /* FALLTHROUGH */
2553         case TCPS_ESTABLISHED:
2554                 tp->t_state = TCPS_CLOSE_WAIT;
2555                 break;
2556         case TCPS_FIN_WAIT_1:
2557                 tp->t_state = TCPS_CLOSING;
2558                 break;
2559         case TCPS_FIN_WAIT_2:
2560                 /*
2561                  * If we've sent an abort_req we must have sent it too late,
2562                  * HW will send us a reply telling us so, and this peer_close
2563                  * is really the last message for this connection and needs to
2564                  * be treated as an abort_rpl, i.e., transition the connection
2565                  * to TCP_CLOSE (note that the host stack does this at the
2566                  * time of generating the RST but we must wait for HW).
2567                  * Otherwise we enter TIME_WAIT.
2568                  */
2569                 t3_release_offload_resources(toep);
2570                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2571                         action = TCP_CLOSE;
2572                 } else {
2573                         action = TCP_TIMEWAIT;
2574                 }
2575                 break;
2576         default:
2577                 log(LOG_ERR,
2578                        "%s: TID %u received PEER_CLOSE in bad state %d\n",
2579                     toep->tp_toedev->tod_name, toep->tp_tid, tp->t_state);
2580         }
2581         inp_wunlock(tp->t_inpcb);
2582
2583         if (action == TCP_TIMEWAIT) {
2584                 enter_timewait(tp);
2585         } else if (action == TCP_DROP) {
2586                 tcp_offload_drop(tp, 0);
2587         } else if (action == TCP_CLOSE) {
2588                 tcp_offload_close(tp);
2589         }
2590
2591 #ifdef notyet
2592         /* Do not send POLL_HUP for half duplex close. */
2593         if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
2594             sk->sk_state == TCP_CLOSE)
2595                 sk_wake_async(so, 1, POLL_HUP);
2596         else
2597                 sk_wake_async(so, 1, POLL_IN);
2598 #endif
2599
2600 out:
2601         if (!keep)
2602                 m_free(m);
2603 }
2604
2605 /*
2606  * Handler for PEER_CLOSE CPL messages.
2607  */
2608 static int
2609 do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2610 {
2611         struct toepcb *toep = (struct toepcb *)ctx;
2612
2613         VALIDATE_SOCK(so);
2614
2615         do_peer_fin(toep, m);
2616         return (0);
2617 }
2618
2619 static void
2620 process_close_con_rpl(struct toepcb *toep, struct mbuf *m)
2621 {
2622         struct cpl_close_con_rpl *rpl = cplhdr(m);
2623         struct tcpcb *tp = toep->tp_tp;
2624         struct socket *so;
2625         int action = 0;
2626         struct sockbuf *rcv;
2627
2628         inp_wlock(tp->t_inpcb);
2629         so = inp_inpcbtosocket(tp->t_inpcb);
2630
2631         tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */
2632
2633         if (!is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
2634                 inp_wunlock(tp->t_inpcb);
2635                 goto out;
2636         }
2637
2638         CTR3(KTR_TOM, "process_close_con_rpl(%p) state=%d dead=%d", toep,
2639             tp->t_state, !!(so_state_get(so) & SS_NOFDREF));
2640
2641         switch (tp->t_state) {
2642         case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
2643                 t3_release_offload_resources(toep);
2644                 if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2645                         action = TCP_CLOSE;
2646
2647                 } else {
2648                         action = TCP_TIMEWAIT;
2649                 }
2650                 break;
2651         case TCPS_LAST_ACK:
2652                 /*
2653                  * In this state we don't care about pending abort_rpl.
2654                  * If we've sent abort_req it was post-close and was sent too
2655                  * late, this close_con_rpl is the actual last message.
2656                  */
2657                 t3_release_offload_resources(toep);
2658                 action = TCP_CLOSE;
2659                 break;
2660         case TCPS_FIN_WAIT_1:
2661                 /*
2662                  * If we can't receive any more
2663                  * data, then closing user can proceed.
2664                  * Starting the timer is contrary to the
2665                  * specification, but if we don't get a FIN
2666                  * we'll hang forever.
2667                  *
2668                  * XXXjl:
2669                  * we should release the tp also, and use a
2670                  * compressed state.
2671                  */
2672                 if (so)
2673                         rcv = so_sockbuf_rcv(so);
2674                 else
2675                         break;
2676
2677                 if (rcv->sb_state & SBS_CANTRCVMORE) {
2678                         int timeout;
2679
2680                         if (so)
2681                                 soisdisconnected(so);
2682                         timeout = (tcp_fast_finwait2_recycle) ?
2683                             tcp_finwait2_timeout : tcp_maxidle;
2684                         tcp_timer_activate(tp, TT_2MSL, timeout);
2685                 }
2686                 tp->t_state = TCPS_FIN_WAIT_2;
2687                 if ((so_options_get(so) & SO_LINGER) && so_linger_get(so) == 0 &&
2688                     (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
2689                         action = TCP_DROP;
2690                 }
2691
2692                 break;
2693         default:
2694                 log(LOG_ERR,
2695                        "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
2696                        toep->tp_toedev->tod_name, toep->tp_tid,
2697                        tp->t_state);
2698         }
2699         inp_wunlock(tp->t_inpcb);
2700
2701
2702         if (action == TCP_TIMEWAIT) {
2703                 enter_timewait_disconnect(tp);
2704         } else if (action == TCP_DROP) {
2705                 tcp_offload_drop(tp, 0);
2706         } else if (action == TCP_CLOSE) {
2707                 tcp_offload_close(tp);
2708         }
2709 out:
2710         m_freem(m);
2711 }
2712
/*
 * CPL handler for CLOSE_CON_RPL messages.  ctx is interpreted as the
 * connection's toepcb; the real work is done by process_close_con_rpl(),
 * which also disposes of the mbuf.  Always returns 0.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
        process_close_con_rpl((struct toepcb *)ctx, m);
        return (0);
}
2725
2726 /*
2727  * Process abort replies.  We only process these messages if we anticipate
2728  * them as the coordination between SW and HW in this area is somewhat lacking
2729  * and sometimes we get ABORT_RPLs after we are done with the connection that
2730  * originated the ABORT_REQ.
2731  */
2732 static void
2733 process_abort_rpl(struct toepcb *toep, struct mbuf *m)
2734 {
2735         struct tcpcb *tp = toep->tp_tp;
2736         struct socket *so;
2737         int needclose = 0;
2738
2739 #ifdef T3_TRACE
2740         T3_TRACE1(TIDTB(sk),
2741                   "process_abort_rpl: GTS rpl pending %d",
2742                   sock_flag(sk, ABORT_RPL_PENDING));
2743 #endif
2744
2745         inp_wlock(tp->t_inpcb);
2746         so = inp_inpcbtosocket(tp->t_inpcb);
2747
2748         if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
2749                 /*
2750                  * XXX panic on tcpdrop
2751                  */
2752                 if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(toep->tp_toedev))
2753                         toep->tp_flags |= TP_ABORT_RPL_RCVD;
2754                 else {
2755                         toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
2756                         if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
2757                             !is_t3a(toep->tp_toedev)) {
2758                                 if (toep->tp_flags & TP_ABORT_REQ_RCVD)
2759                                         panic("TP_ABORT_REQ_RCVD set");
2760                                 t3_release_offload_resources(toep);
2761                                 needclose = 1;
2762                         }
2763                 }
2764         }
2765         inp_wunlock(tp->t_inpcb);
2766
2767         if (needclose)
2768                 tcp_offload_close(tp);
2769
2770         m_free(m);
2771 }
2772
2773 /*
2774  * Handle an ABORT_RPL_RSS CPL message.
2775  */
2776 static int
2777 do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
2778 {
2779         struct cpl_abort_rpl_rss *rpl = cplhdr(m);
2780         struct toepcb *toep;
2781
2782         /*
2783          * Ignore replies to post-close aborts indicating that the abort was
2784          * requested too late.  These connections are terminated when we get
2785          * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
2786          * arrives the TID is either no longer used or it has been recycled.
2787          */
2788         if (rpl->status == CPL_ERR_ABORT_FAILED) {
2789 discard:
2790                 m_free(m);
2791                 return (0);
2792         }
2793
2794         toep = (struct toepcb *)ctx;
2795
2796         /*
2797          * Sometimes we've already closed the socket, e.g., a post-close
2798          * abort races with ABORT_REQ_RSS, the latter frees the socket
2799          * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
2800          * but FW turns the ABORT_REQ into a regular one and so we get
2801          * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
2802          */
2803         if (!toep)
2804                 goto discard;
2805
2806         if (toep->tp_tp == NULL) {
2807                 log(LOG_NOTICE, "removing tid for abort\n");
2808                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
2809                 if (toep->tp_l2t)
2810                         l2t_release(L2DATA(cdev), toep->tp_l2t);
2811
2812                 toepcb_release(toep);
2813                 goto discard;
2814         }
2815
2816         log(LOG_NOTICE, "toep=%p\n", toep);
2817         log(LOG_NOTICE, "tp=%p\n", toep->tp_tp);
2818
2819         toepcb_hold(toep);
2820         process_abort_rpl(toep, m);
2821         toepcb_release(toep);
2822         return (0);
2823 }
2824
2825 /*
2826  * Convert the status code of an ABORT_REQ into a FreeBSD error code.  Also
2827  * indicate whether RST should be sent in response.
2828  */
2829 static int
2830 abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
2831 {
2832         struct tcpcb *tp = so_sototcpcb(so);
2833
2834         switch (abort_reason) {
2835         case CPL_ERR_BAD_SYN:
2836 #if 0
2837                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);      // fall through
2838 #endif
2839         case CPL_ERR_CONN_RESET:
2840                 // XXX need to handle SYN_RECV due to crossed SYNs
2841                 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
2842         case CPL_ERR_XMIT_TIMEDOUT:
2843         case CPL_ERR_PERSIST_TIMEDOUT:
2844         case CPL_ERR_FINWAIT2_TIMEDOUT:
2845         case CPL_ERR_KEEPALIVE_TIMEDOUT:
2846 #if 0
2847                 NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
2848 #endif
2849                 return (ETIMEDOUT);
2850         default:
2851                 return (EIO);
2852         }
2853 }
2854
2855 static inline void
2856 set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
2857 {
2858         struct cpl_abort_rpl *rpl = cplhdr(m);
2859
2860         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
2861         rpl->wr.wr_lo = htonl(V_WR_TID(tid));
2862         m->m_len = m->m_pkthdr.len = sizeof(*rpl);
2863
2864         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
2865         rpl->cmd = cmd;
2866 }
2867
2868 static void
2869 send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
2870 {
2871         struct mbuf *reply_mbuf;
2872         struct cpl_abort_req_rss *req = cplhdr(m);
2873
2874         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
2875         m_set_priority(m, CPL_PRIORITY_DATA);
2876         m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
2877         set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
2878         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2879         m_free(m);
2880 }
2881
2882 /*
2883  * Returns whether an ABORT_REQ_RSS message is a negative advice.
2884  */
2885 static inline int
2886 is_neg_adv_abort(unsigned int status)
2887 {
2888         return status == CPL_ERR_RTX_NEG_ADVICE ||
2889             status == CPL_ERR_PERSIST_NEG_ADVICE;
2890 }
2891
2892 static void
2893 send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
2894 {
2895         struct mbuf  *reply_mbuf;
2896         struct cpl_abort_req_rss *req = cplhdr(m);
2897
2898         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
2899
2900         if (!reply_mbuf) {
2901                 /* Defer the reply.  Stick rst_status into req->cmd. */
2902                 req->status = rst_status;
2903                 t3_defer_reply(m, tdev, send_deferred_abort_rpl);
2904                 return;
2905         }
2906
2907         m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
2908         set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
2909         m_free(m);
2910
2911         /*
2912          * XXX need to sync with ARP as for SYN_RECV connections we can send
2913          * these messages while ARP is pending.  For other connection states
2914          * it's not a problem.
2915          */
2916         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
2917 }
2918
2919 #ifdef notyet
/*
 * Placeholder for removing a SYN_RECV child from its parent's SYN queue.
 * The only active statement is CXGB_UNIMPLEMENTED(); the code under the
 * inner "notyet" guard is an unported sketch that still uses Linux socket
 * fields and helpers and is never compiled.
 */
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
        CXGB_UNIMPLEMENTED();
#ifdef notyet
        /* Unported: detach the request sock from the parent and free it. */
        struct request_sock *req = child->sk_user_data;

        inet_csk_reqsk_queue_removed(parent, req);
        synq_remove(tcp_sk(child));
        __reqsk_free(req);
        child->sk_user_data = NULL;
#endif
}
2933
2934
2935 /*
2936  * Performs the actual work to abort a SYN_RECV connection.
2937  */
2938 static void
2939 do_abort_syn_rcv(struct socket *child, struct socket *parent)
2940 {
2941         struct tcpcb *parenttp = so_sototcpcb(parent);
2942         struct tcpcb *childtp = so_sototcpcb(child);
2943
2944         /*
2945          * If the server is still open we clean up the child connection,
2946          * otherwise the server already did the clean up as it was purging
2947          * its SYN queue and the skb was just sitting in its backlog.
2948          */
2949         if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
2950                 cleanup_syn_rcv_conn(child, parent);
2951                 inp_wlock(childtp->t_inpcb);
2952                 t3_release_offload_resources(childtp->t_toe);
2953                 inp_wunlock(childtp->t_inpcb);
2954                 tcp_offload_close(childtp);
2955         }
2956 }
2957 #endif
2958
/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
 *
 * As compiled today this only invokes CXGB_UNIMPLEMENTED() and returns 0;
 * neither so nor m is touched.  The intended implementation is kept under
 * "notyet" and would return -1 when the socket is not on a SYN queue.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();
#ifdef notyet
        /*
         * NOTE(review): this disabled code refers to `toep', which is not
         * declared in this function, and reads ts_recent from a socket; it
         * needs porting before the guard can be removed.
         */
        struct socket *parent;
        struct toedev *tdev = toep->tp_toedev;
        struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
        struct socket *oreq = so->so_incomp;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;

        if (!oreq)
                return -1;        /* somehow we are not on the SYN queue */

        /* Find the listening parent through the server TID table. */
        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, oreq->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        /* Abort the child and answer the request under the parent's lock. */
        so_lock(parent);
        do_abort_syn_rcv(so, parent);
        send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
        so_unlock(parent);
#endif
        return (0);
}
2989
2990 /*
2991  * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
2992  * request except that we need to reply to it.
2993  */
2994 static void
2995 process_abort_req(struct toepcb *toep, struct mbuf *m, struct toedev *tdev)
2996 {
2997         int rst_status = CPL_ABORT_NO_RST;
2998         const struct cpl_abort_req_rss *req = cplhdr(m);
2999         struct tcpcb *tp = toep->tp_tp;
3000         struct socket *so;
3001         int needclose = 0;
3002
3003         inp_wlock(tp->t_inpcb);
3004         so = inp_inpcbtosocket(toep->tp_tp->t_inpcb);
3005         if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
3006                 toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
3007                 m_free(m);
3008                 goto skip;
3009         }
3010
3011         toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
3012         /*
3013          * Three cases to consider:
3014          * a) We haven't sent an abort_req; close the connection.
3015          * b) We have sent a post-close abort_req that will get to TP too late
3016          *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
3017          *    be ignored and the connection should be closed now.
3018          * c) We have sent a regular abort_req that will get to TP too late.
3019          *    That will generate an abort_rpl with status 0, wait for it.
3020          */
3021         if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
3022             (is_t3a(toep->tp_toedev) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
3023                 int error;
3024
3025                 error = abort_status_to_errno(so, req->status,
3026                     &rst_status);
3027                 so_error_set(so, error);
3028
3029                 if (__predict_true((so_state_get(so) & SS_NOFDREF) == 0))
3030                         so_sorwakeup(so);
3031                 /*
3032                  * SYN_RECV needs special processing.  If abort_syn_rcv()
3033                  * returns 0 is has taken care of the abort.
3034                  */
3035                 if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
3036                         goto skip;
3037
3038                 t3_release_offload_resources(toep);
3039                 needclose = 1;
3040         }
3041         inp_wunlock(tp->t_inpcb);
3042
3043         if (needclose)
3044                 tcp_offload_close(tp);
3045
3046         send_abort_rpl(m, tdev, rst_status);
3047         return;
3048 skip:
3049         inp_wunlock(tp->t_inpcb);
3050 }
3051
3052 /*
3053  * Handle an ABORT_REQ_RSS CPL message.
3054  */
3055 static int
3056 do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3057 {
3058         const struct cpl_abort_req_rss *req = cplhdr(m);
3059         struct toepcb *toep = (struct toepcb *)ctx;
3060
3061         if (is_neg_adv_abort(req->status)) {
3062                 m_free(m);
3063                 return (0);
3064         }
3065
3066         log(LOG_NOTICE, "aborting tid=%d\n", toep->tp_tid);
3067
3068         if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
3069                 cxgb_remove_tid(cdev, toep, toep->tp_tid);
3070                 toep->tp_flags |= TP_ABORT_REQ_RCVD;
3071
3072                 send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
3073                 if (toep->tp_l2t)
3074                         l2t_release(L2DATA(cdev), toep->tp_l2t);
3075
3076                 /*
3077                  *  Unhook
3078                  */
3079                 toep->tp_tp->t_toe = NULL;
3080                 toep->tp_tp->t_flags &= ~TF_TOE;
3081                 toep->tp_tp = NULL;
3082                 /*
3083                  * XXX need to call syncache_chkrst - but we don't
3084                  * have a way of doing that yet
3085                  */
3086                 toepcb_release(toep);
3087                 log(LOG_ERR, "abort for unestablished connection :-(\n");
3088                 return (0);
3089         }
3090         if (toep->tp_tp == NULL) {
3091                 log(LOG_NOTICE, "disconnected toepcb\n");
3092                 /* should be freed momentarily */
3093                 return (0);
3094         }
3095
3096
3097         toepcb_hold(toep);
3098         process_abort_req(toep, m, toep->tp_toedev);
3099         toepcb_release(toep);
3100         return (0);
3101 }
3102 #ifdef notyet
3103 static void
3104 pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
3105 {
3106         struct toedev *tdev = TOE_DEV(parent);
3107
3108         do_abort_syn_rcv(child, parent);
3109         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
3110                 struct cpl_pass_accept_rpl *rpl = cplhdr(m);
3111
3112                 rpl->opt0h = htonl(F_TCAM_BYPASS);
3113                 rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3114                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3115         } else
3116                 m_free(m);
3117 }
3118 #endif
/*
 * Intended to tear down a SYN_RECV connection whose CPL_PASS_ACCEPT_RPL
 * could not be sent because ARP resolution failed.  As compiled today the
 * function only invokes CXGB_UNIMPLEMENTED(); the code under "notyet" is
 * the unported implementation and is never built.
 */
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
        CXGB_UNIMPLEMENTED();

#ifdef notyet
        struct t3cdev *cdev;
        struct socket *parent;
        struct socket *oreq;
        struct t3c_tid_entry *t3c_stid;
        struct tid_info *t;
        struct tcpcb *otp, *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;

        /*
         * If the connection is being aborted due to the parent listening
         * socket going away there's nothing to do, the ABORT_REQ will close
         * the connection.
         */
        if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
                m_free(m);
                return;
        }

        oreq = so->so_incomp;
        otp = so_sototcpcb(oreq);

        /* Find the listening parent through the server TID table. */
        cdev = T3C_DEV(so);
        t = &(T3C_DATA(cdev))->tid_maps;
        t3c_stid = lookup_stid(t, otp->ts_recent);
        parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

        /* Abort the passive open with the parent locked. */
        so_lock(parent);
        pass_open_abort(so, parent, m);
        so_unlock(parent);
#endif
}
3156
/*
 * ARP-failure callback for a CPL_PASS_ACCEPT_RPL.  This is treated
 * similarly to an ABORT_REQ_RSS in SYN_RECV as both events need to tear
 * down a SYN_RECV connection.  The socket recorded in the mbuf is looked up
 * with m_get_socket() and passed on to handle_pass_open_arp_failure().
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
        struct socket *so;

#ifdef notyet
        TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
        BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
        so = m_get_socket(m);
        handle_pass_open_arp_failure(so, m);
}
3172
3173 /*
3174  * Populate a reject CPL_PASS_ACCEPT_RPL WR.
3175  */
3176 static void
3177 mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
3178 {
3179         struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
3180         struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
3181         unsigned int tid = GET_TID(req);
3182
3183         m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
3184         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3185         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3186         rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
3187         rpl->opt0h = htonl(F_TCAM_BYPASS);
3188         rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
3189         rpl->opt2 = 0;
3190         rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
3191 }
3192
3193 /*
3194  * Send a deferred reject to an accept request.
3195  */
3196 static void
3197 reject_pass_request(struct toedev *tdev, struct mbuf *m)
3198 {
3199         struct mbuf *reply_mbuf;
3200
3201         reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
3202         mk_pass_accept_rpl(reply_mbuf, m);
3203         cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
3204         m_free(m);
3205 }
3206
3207 static void
3208 handle_syncache_event(int event, void *arg)
3209 {
3210         struct toepcb *toep = arg;
3211
3212         switch (event) {
3213         case TOE_SC_ENTRY_PRESENT:
3214                 /*
3215                  * entry already exists - free toepcb
3216                  * and l2t
3217                  */
3218                 printf("syncache entry present\n");
3219                 toepcb_release(toep);
3220                 break;
3221         case TOE_SC_DROP:
3222                 /*
3223                  * The syncache has given up on this entry
3224                  * either it timed out, or it was evicted
3225                  * we need to explicitly release the tid
3226                  */
3227                 printf("syncache entry dropped\n");
3228                 toepcb_release(toep);
3229                 break;
3230         default:
3231                 log(LOG_ERR, "unknown syncache event %d\n", event);
3232                 break;
3233         }
3234 }
3235
3236 static void
3237 syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
3238 {
3239         struct in_conninfo inc;
3240         struct tcpopt to;
3241         struct tcphdr th;
3242         struct inpcb *inp;
3243         int mss, wsf, sack, ts;
3244         uint32_t rcv_isn = ntohl(req->rcv_isn);
3245
3246         bzero(&to, sizeof(struct tcpopt));
3247         inp = so_sotoinpcb(lso);
3248
3249         /*
3250          * Fill out information for entering us into the syncache
3251          */
3252         inc.inc_fport = th.th_sport = req->peer_port;
3253         inc.inc_lport = th.th_dport = req->local_port;
3254         th.th_seq = req->rcv_isn;
3255         th.th_flags = TH_SYN;
3256
3257         toep->tp_iss = toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = rcv_isn + 1;
3258
3259
3260         inc.inc_isipv6 = 0;
3261         inc.inc_len = 0;
3262         inc.inc_faddr.s_addr = req->peer_ip;
3263         inc.inc_laddr.s_addr = req->local_ip;
3264
3265         DPRINTF("syncache add of %d:%d %d:%d\n",
3266             ntohl(req->local_ip), ntohs(req->local_port),
3267             ntohl(req->peer_ip), ntohs(req->peer_port));
3268
3269         mss = req->tcp_options.mss;
3270         wsf = req->tcp_options.wsf;
3271         ts = req->tcp_options.tstamp;
3272         sack = req->tcp_options.sack;
3273         to.to_mss = mss;
3274         to.to_wscale = wsf;
3275         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3276         syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
3277 }
3278
3279
3280 /*
3281  * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
3282  * lock held.  Note that the sock here is a listening socket that is not owned
3283  * by the TOE.
3284  */
3285 static void
3286 process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
3287     struct listen_ctx *lctx)
3288 {
3289         int rt_flags;
3290         struct l2t_entry *e;
3291         struct iff_mac tim;
3292         struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
3293         struct cpl_pass_accept_rpl *rpl;
3294         struct cpl_pass_accept_req *req = cplhdr(m);
3295         unsigned int tid = GET_TID(req);
3296         struct tom_data *d = TOM_DATA(tdev);
3297         struct t3cdev *cdev = d->cdev;
3298         struct tcpcb *tp = so_sototcpcb(so);
3299         struct toepcb *newtoep;
3300         struct rtentry *dst;
3301         struct sockaddr_in nam;
3302         struct t3c_data *td = T3C_DATA(cdev);
3303
3304         reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3305         if (__predict_false(reply_mbuf == NULL)) {
3306                 if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3307                         t3_defer_reply(m, tdev, reject_pass_request);
3308                 else {
3309                         cxgb_queue_tid_release(cdev, tid);
3310                         m_free(m);
3311                 }
3312                 DPRINTF("failed to get reply_mbuf\n");
3313
3314                 goto out;
3315         }
3316
3317         if (tp->t_state != TCPS_LISTEN) {
3318                 DPRINTF("socket not in listen state\n");
3319
3320                 goto reject;
3321         }
3322
3323         tim.mac_addr = req->dst_mac;
3324         tim.vlan_tag = ntohs(req->vlan_tag);
3325         if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
3326                 DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
3327                 goto reject;
3328         }
3329
3330 #ifdef notyet
3331         /*
3332          * XXX do route lookup to confirm that we're still listening on this
3333          * address
3334          */
3335         if (ip_route_input(skb, req->local_ip, req->peer_ip,
3336                            G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
3337                 goto reject;
3338         rt_flags = ((struct rtable *)skb->dst)->rt_flags &
3339                 (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
3340         dst_release(skb->dst);  // done with the input route, release it
3341         skb->dst = NULL;
3342
3343         if ((rt_flags & RTF_LOCAL) == 0)
3344                 goto reject;
3345 #endif
3346         /*
3347          * XXX
3348          */
3349         rt_flags = RTF_LOCAL;
3350         if ((rt_flags & RTF_LOCAL) == 0)
3351                 goto reject;
3352
3353         /*
3354          * Calculate values and add to syncache
3355          */
3356
3357         newtoep = toepcb_alloc();
3358         if (newtoep == NULL)
3359                 goto reject;
3360
3361         bzero(&nam, sizeof(struct sockaddr_in));
3362
3363         nam.sin_len = sizeof(struct sockaddr_in);
3364         nam.sin_family = AF_INET;
3365         nam.sin_addr.s_addr =req->peer_ip;
3366         dst = rtalloc2((struct sockaddr *)&nam, 1, 0);
3367
3368         if (dst == NULL) {
3369                 printf("failed to find route\n");
3370                 goto reject;
3371         }
3372         e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
3373             (struct sockaddr *)&nam);
3374         if (e == NULL) {
3375                 DPRINTF("failed to get l2t\n");
3376         }
3377         /*
3378          * Point to our listen socket until accept
3379          */
3380         newtoep->tp_tp = tp;
3381         newtoep->tp_flags = TP_SYN_RCVD;
3382         newtoep->tp_tid = tid;
3383         newtoep->tp_toedev = tdev;
3384         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3385
3386         cxgb_insert_tid(cdev, d->client, newtoep, tid);
3387         so_lock(so);
3388         LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
3389         so_unlock(so);
3390
3391         newtoep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so_options_get(so) & SO_NO_DDP) &&
3392                        tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
3393
3394         if (newtoep->tp_ulp_mode) {
3395                 ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
3396
3397                 if (ddp_mbuf == NULL)
3398                         newtoep->tp_ulp_mode = 0;
3399         }
3400
3401         CTR4(KTR_TOM, "ddp=%d rcv_wnd=%ld min_win=%d ulp_mode=%d",
3402             TOM_TUNABLE(tdev, ddp), tp->rcv_wnd, MIN_DDP_RCV_WIN, newtoep->tp_ulp_mode);
3403         set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);
3404         /*
3405          * XXX workaround for lack of syncache drop
3406          */
3407         toepcb_hold(newtoep);
3408         syncache_add_accept_req(req, so, newtoep);
3409
3410         rpl = cplhdr(reply_mbuf);
3411         reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
3412         rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
3413         rpl->wr.wr_lo = 0;
3414         OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
3415         rpl->opt2 = htonl(calc_opt2(so, tdev));
3416         rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
3417         rpl->peer_ip = req->peer_ip;    // req->peer_ip is not overwritten
3418
3419         rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
3420             V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
3421         rpl->opt0l_status = htonl(calc_opt0l(so, newtoep->tp_ulp_mode) |
3422                                   CPL_PASS_OPEN_ACCEPT);
3423
3424         DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
3425
3426         m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, newtoep));
3427
3428         l2t_send(cdev, reply_mbuf, e);
3429         m_free(m);
3430         if (newtoep->tp_ulp_mode) {
3431                 __set_tcb_field(newtoep, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
3432                                 V_TF_DDP_OFF(1) |
3433                                 TP_DDP_TIMER_WORKAROUND_MASK,
3434                                 V_TF_DDP_OFF(1) |
3435                     TP_DDP_TIMER_WORKAROUND_VAL, 1);
3436         } else
3437                 printf("not offloading\n");
3438
3439
3440
3441         return;
3442 reject:
3443         if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
3444                 mk_pass_accept_rpl(reply_mbuf, m);
3445         else
3446                 mk_tid_release(reply_mbuf, newtoep, tid);
3447         cxgb_ofld_send(cdev, reply_mbuf);
3448         m_free(m);
3449 out:
3450 #if 0
3451         TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
3452 #else
3453         return;
3454 #endif
3455 }
3456
3457 /*
3458  * Handle a CPL_PASS_ACCEPT_REQ message.
3459  */
3460 static int
3461 do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3462 {
3463         struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
3464         struct socket *lso = listen_ctx->lso; /* XXX need an interlock against the listen socket going away */
3465         struct tom_data *d = listen_ctx->tom_data;
3466
3467 #if VALIDATE_TID
3468         struct cpl_pass_accept_req *req = cplhdr(m);
3469         unsigned int tid = GET_TID(req);
3470         struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;
3471
3472         if (unlikely(!lsk)) {
3473                 printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
3474                        cdev->name,
3475                        (unsigned long)((union listen_entry *)ctx -
3476                                         t->stid_tab));
3477                 return CPL_RET_BUF_DONE;
3478         }
3479         if (unlikely(tid >= t->ntids)) {
3480                 printk(KERN_ERR "%s: passive open TID %u too large\n",
3481                        cdev->name, tid);
3482                 return CPL_RET_BUF_DONE;
3483         }
3484         /*
3485          * For T3A the current user of the TID may have closed but its last
3486          * message(s) may have been backlogged so the TID appears to be still
3487          * in use.  Just take the TID away, the connection can close at its
3488          * own leisure.  For T3B this situation is a bug.
3489          */
3490         if (!valid_new_tid(t, tid) &&
3491             cdev->type != T3A) {
3492                 printk(KERN_ERR "%s: passive open uses existing TID %u\n",
3493                        cdev->name, tid);
3494                 return CPL_RET_BUF_DONE;
3495         }
3496 #endif
3497
3498         process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
3499         return (0);
3500 }
3501
3502 /*
3503  * Called when a connection is established to translate the TCP options
3504  * reported by HW to FreeBSD's native format.
3505  */
3506 static void
3507 assign_rxopt(struct socket *so, unsigned int opt)
3508 {
3509         struct tcpcb *tp = so_sototcpcb(so);
3510         struct toepcb *toep = tp->t_toe;
3511         const struct t3c_data *td = T3C_DATA(TOEP_T3C_DEV(toep));
3512
3513         inp_lock_assert(tp->t_inpcb);
3514
3515         toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3516         tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
3517         tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
3518         tp->t_flags         |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
3519         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
3520             (TF_RCVD_SCALE|TF_REQ_SCALE))
3521                 tp->rcv_scale = tp->request_r_scale;
3522 }
3523
3524 /*
3525  * Completes some final bits of initialization for just established connections
3526  * and changes their state to TCP_ESTABLISHED.
3527  *
3528  * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
3529  */
3530 static void
3531 make_established(struct socket *so, u32 snd_isn, unsigned int opt)
3532 {
3533         struct tcpcb *tp = so_sototcpcb(so);
3534         struct toepcb *toep = tp->t_toe;
3535
3536         toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
3537         assign_rxopt(so, opt);
3538
3539         /*
3540          *XXXXXXXXXXX
3541          *
3542          */
3543 #ifdef notyet
3544         so->so_proto->pr_ctloutput = t3_ctloutput;
3545 #endif
3546
3547 #if 0
3548         inet_sk(sk)->id = tp->write_seq ^ jiffies;
3549 #endif
3550         /*
3551          * XXX not clear what rcv_wup maps to
3552          */
3553         /*
3554          * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
3555          * pass through opt0.
3556          */
3557         if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
3558                 toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);
3559
3560         dump_toepcb(toep);
3561
3562 #ifdef notyet
3563 /*
3564  * no clean interface for marking ARP up to date
3565  */
3566         dst_confirm(sk->sk_dst_cache);
3567 #endif
3568         tp->t_starttime = ticks;
3569         tp->t_state = TCPS_ESTABLISHED;
3570         soisconnected(so);
3571 }
3572
3573 static int
3574 syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
3575 {
3576
3577         struct in_conninfo inc;
3578         struct tcpopt to;
3579         struct tcphdr th;
3580         int mss, wsf, sack, ts;
3581         struct mbuf *m = NULL;
3582         const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
3583         unsigned int opt;
3584
3585 #ifdef MAC
3586 #error  "no MAC support"
3587 #endif
3588
3589         opt = ntohs(req->tcp_opt);
3590
3591         bzero(&to, sizeof(struct tcpopt));
3592
3593         /*
3594          * Fill out information for entering us into the syncache
3595          */
3596         inc.inc_fport = th.th_sport = req->peer_port;
3597         inc.inc_lport = th.th_dport = req->local_port;
3598         th.th_seq = req->rcv_isn;
3599         th.th_flags = TH_ACK;
3600
3601         inc.inc_isipv6 = 0;
3602         inc.inc_len = 0;
3603         inc.inc_faddr.s_addr = req->peer_ip;
3604         inc.inc_laddr.s_addr = req->local_ip;
3605
3606         mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
3607         wsf  = G_TCPOPT_WSCALE_OK(opt);
3608         ts   = G_TCPOPT_TSTAMP(opt);
3609         sack = G_TCPOPT_SACK(opt);
3610
3611         to.to_mss = mss;
3612         to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
3613         to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);
3614
3615         DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
3616             ntohl(req->local_ip), ntohs(req->local_port),
3617             ntohl(req->peer_ip), ntohs(req->peer_port),
3618             mss, wsf, ts, sack);
3619         return syncache_offload_expand(&inc, &to, &th, so, m);
3620 }
3621
3622
3623 /*
3624  * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
3625  * if we are in TCP_SYN_RECV due to crossed SYNs
3626  */
3627 static int
3628 do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3629 {
3630         struct cpl_pass_establish *req = cplhdr(m);
3631         struct toepcb *toep = (struct toepcb *)ctx;
3632         struct tcpcb *tp = toep->tp_tp;
3633         struct socket *so, *lso;
3634         struct t3c_data *td = T3C_DATA(cdev);
3635         struct sockbuf *snd, *rcv;
3636
3637         // Complete socket initialization now that we have the SND_ISN
3638
3639         struct toedev *tdev;
3640
3641
3642         tdev = toep->tp_toedev;
3643
3644         inp_wlock(tp->t_inpcb);
3645
3646         /*
3647          *
3648          * XXX need to add reference while we're manipulating
3649          */
3650         so = lso = inp_inpcbtosocket(tp->t_inpcb);
3651
3652         inp_wunlock(tp->t_inpcb);
3653
3654         so_lock(so);
3655         LIST_REMOVE(toep, synq_entry);
3656         so_unlock(so);
3657
3658         if (!syncache_expand_establish_req(req, &so, toep)) {
3659                 /*
3660                  * No entry
3661                  */
3662                 CXGB_UNIMPLEMENTED();
3663         }
3664         if (so == NULL) {
3665                 /*
3666                  * Couldn't create the socket
3667                  */
3668                 CXGB_UNIMPLEMENTED();
3669         }
3670
3671         tp = so_sototcpcb(so);
3672         inp_wlock(tp->t_inpcb);
3673
3674         snd = so_sockbuf_snd(so);
3675         rcv = so_sockbuf_rcv(so);
3676
3677         snd->sb_flags |= SB_NOCOALESCE;
3678         rcv->sb_flags |= SB_NOCOALESCE;
3679
3680         toep->tp_tp = tp;
3681         toep->tp_flags = 0;
3682         tp->t_toe = toep;
3683         reset_wr_list(toep);
3684         tp->rcv_wnd = select_rcv_wnd(tdev, so);
3685         tp->rcv_nxt = toep->tp_copied_seq;
3686         install_offload_ops(so);
3687
3688         toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
3689         toep->tp_wr_unacked = 0;
3690         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3691         toep->tp_qset_idx = 0;
3692         toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
3693
3694         /*
3695          * XXX Cancel any keep alive timer
3696          */
3697
3698         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3699
3700         /*
3701          * XXX workaround for lack of syncache drop
3702          */
3703         toepcb_release(toep);
3704         inp_wunlock(tp->t_inpcb);
3705
3706         CTR1(KTR_TOM, "do_pass_establish tid=%u", toep->tp_tid);
3707         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3708 #ifdef notyet
3709         /*
3710          * XXX not sure how these checks map to us
3711          */
3712         if (unlikely(sk->sk_socket)) {   // simultaneous opens only
3713                 sk->sk_state_change(sk);
3714                 sk_wake_async(so, 0, POLL_OUT);
3715         }
3716         /*
3717          * The state for the new connection is now up to date.
3718          * Next check if we should add the connection to the parent's
3719          * accept queue.  When the parent closes it resets connections
3720          * on its SYN queue, so check if we are being reset.  If so we
3721          * don't need to do anything more, the coming ABORT_RPL will
3722          * destroy this socket.  Otherwise move the connection to the
3723          * accept queue.
3724          *
3725          * Note that we reset the synq before closing the server so if
3726          * we are not being reset the stid is still open.
3727          */
3728         if (unlikely(!tp->forward_skb_hint)) { // removed from synq
3729                 __kfree_skb(skb);
3730                 goto unlock;
3731         }
3732 #endif
3733         m_free(m);
3734
3735         return (0);
3736 }
3737
3738 /*
3739  * Fill in the right TID for CPL messages waiting in the out-of-order queue
3740  * and send them to the TOE.
3741  */
3742 static void
3743 fixup_and_send_ofo(struct toepcb *toep)
3744 {
3745         struct mbuf *m;
3746         struct toedev *tdev = toep->tp_toedev;
3747         struct tcpcb *tp = toep->tp_tp;
3748         unsigned int tid = toep->tp_tid;
3749
3750         log(LOG_NOTICE, "fixup_and_send_ofo\n");
3751
3752         inp_lock_assert(tp->t_inpcb);
3753         while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
3754                 /*
3755                  * A variety of messages can be waiting but the fields we'll
3756                  * be touching are common to all so any message type will do.
3757                  */
3758                 struct cpl_close_con_req *p = cplhdr(m);
3759
3760                 p->wr.wr_lo = htonl(V_WR_TID(tid));
3761                 OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
3762                 cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
3763         }
3764 }
3765
3766 /*
3767  * Updates socket state from an active establish CPL message.  Runs with the
3768  * socket lock held.
3769  */
3770 static void
3771 socket_act_establish(struct socket *so, struct mbuf *m)
3772 {
3773         struct cpl_act_establish *req = cplhdr(m);
3774         u32 rcv_isn = ntohl(req->rcv_isn);      /* real RCV_ISN + 1 */
3775         struct tcpcb *tp = so_sototcpcb(so);
3776         struct toepcb *toep = tp->t_toe;
3777
3778         if (__predict_false(tp->t_state != TCPS_SYN_SENT))
3779                 log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
3780                     toep->tp_tid, tp->t_state);
3781
3782         tp->ts_recent_age = ticks;
3783         tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
3784         toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
3785
3786         make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
3787
3788         /*
3789          * Now that we finally have a TID send any CPL messages that we had to
3790          * defer for lack of a TID.
3791          */
3792         if (mbufq_len(&toep->out_of_order_queue))
3793                 fixup_and_send_ofo(toep);
3794
3795         if (__predict_false(so_state_get(so) & SS_NOFDREF)) {
3796                 /*
3797                  * XXX does this even make sense?
3798                  */
3799                 so_sorwakeup(so);
3800         }
3801         m_free(m);
3802 #ifdef notyet
3803 /*
3804  * XXX assume no write requests permitted while socket connection is
3805  * incomplete
3806  */
3807         /*
3808          * Currently the send queue must be empty at this point because the
3809          * socket layer does not send anything before a connection is
3810          * established.  To be future proof though we handle the possibility
3811          * that there are pending buffers to send (either TX_DATA or
3812          * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
3813          * buffers according to the just learned write_seq, and then we send
3814          * them on their way.
3815          */
3816         fixup_pending_writeq_buffers(sk);
3817         if (t3_push_frames(so, 1))
3818                 sk->sk_write_space(sk);
3819 #endif
3820
3821         toep->tp_state = tp->t_state;
3822         tcpstat.tcps_connects++;
3823
3824 }
3825
3826 /*
3827  * Process a CPL_ACT_ESTABLISH message.
3828  */
3829 static int
3830 do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
3831 {
3832         struct cpl_act_establish *req = cplhdr(m);
3833         unsigned int tid = GET_TID(req);
3834         unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
3835         struct toepcb *toep = (struct toepcb *)ctx;
3836         struct tcpcb *tp = toep->tp_tp;
3837         struct socket *so;
3838         struct toedev *tdev;
3839         struct tom_data *d;
3840
3841         if (tp == NULL) {
3842                 free_atid(cdev, atid);
3843                 return (0);
3844         }
3845         inp_wlock(tp->t_inpcb);
3846
3847         /*
3848          * XXX
3849          */
3850         so = inp_inpcbtosocket(tp->t_inpcb);
3851         tdev = toep->tp_toedev; /* blow up here if link was down */
3852         d = TOM_DATA(tdev);
3853
3854         /*
3855          * It's OK if the TID is currently in use, the owning socket may have
3856          * backlogged its last CPL message(s).  Just take it away.
3857          */
3858         toep->tp_tid = tid;
3859         toep->tp_tp = tp;
3860         so_insert_tid(d, toep, tid);
3861         free_atid(cdev, atid);
3862         toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
3863
3864         socket_act_establish(so, m);
3865         inp_wunlock(tp->t_inpcb);
3866         CTR1(KTR_TOM, "do_act_establish tid=%u", toep->tp_tid);
3867         cxgb_log_tcb(cdev->adapter, toep->tp_tid);
3868
3869         return (0);
3870 }
3871
3872 /*
3873  * Process an acknowledgment of WR completion.  Advance snd_una and send the
3874  * next batch of work requests from the write queue.
3875  */
3876 static void
3877 wr_ack(struct toepcb *toep, struct mbuf *m)
3878 {
3879         struct tcpcb *tp = toep->tp_tp;
3880         struct cpl_wr_ack *hdr = cplhdr(m);
3881         struct socket *so;
3882         unsigned int credits = ntohs(hdr->credits);
3883         u32 snd_una = ntohl(hdr->snd_una);
3884         int bytes = 0;
3885         struct sockbuf *snd;
3886
3887         CTR2(KTR_SPARE2, "wr_ack: snd_una=%u credits=%d", snd_una, credits);
3888
3889         inp_wlock(tp->t_inpcb);
3890         so = inp_inpcbtosocket(tp->t_inpcb);
3891         toep->tp_wr_avail += credits;
3892         if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
3893                 toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;
3894
3895         while (credits) {
3896                 struct mbuf *p = peek_wr(toep);
3897
3898                 if (__predict_false(!p)) {
3899                         log(LOG_ERR, "%u WR_ACK credits for TID %u with "
3900                             "nothing pending, state %u wr_avail=%u\n",
3901                             credits, toep->tp_tid, tp->t_state, toep->tp_wr_avail);
3902                         break;
3903                 }
3904                 CTR2(KTR_TOM,
3905                         "wr_ack: p->credits=%d p->bytes=%d",
3906                     p->m_pkthdr.csum_data, p->m_pkthdr.len);
3907                 KASSERT(p->m_pkthdr.csum_data != 0,
3908                     ("empty request still on list"));
3909
3910                 if (__predict_false(credits < p->m_pkthdr.csum_data)) {
3911
3912 #if DEBUG_WR > 1
3913                         struct tx_data_wr *w = cplhdr(p);
3914                         log(LOG_ERR,
3915                                "TID %u got %u WR credits, need %u, len %u, "
3916                                "main body %u, frags %u, seq # %u, ACK una %u,"
3917                                " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
3918                                toep->tp_tid, credits, p->csum, p->len,
3919                                p->len - p->data_len, skb_shinfo(p)->nr_frags,
3920                                ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
3921                             toep->tp_wr_avail, count_pending_wrs(tp) - credits);
3922 #endif
3923                         p->m_pkthdr.csum_data -= credits;
3924                         break;
3925                 } else {
3926                         dequeue_wr(toep);
3927                         credits -= p->m_pkthdr.csum_data;
3928                         bytes += p->m_pkthdr.len;
3929                         CTR3(KTR_TOM,
3930                             "wr_ack: done with wr of %d bytes remain credits=%d wr credits=%d",
3931                             p->m_pkthdr.len, credits, p->m_pkthdr.csum_data);
3932
3933                         m_free(p);
3934                 }
3935         }
3936
3937 #if DEBUG_WR
3938         check_wr_invariants(tp);
3939 #endif
3940
3941         if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
3942 #if VALIDATE_SEQ
3943                 struct tom_data *d = TOM_DATA(TOE_DEV(so));
3944
3945                 log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
3946                     "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
3947                     toep->tp_tid, tp->snd_una);
3948 #endif
3949                 goto out_free;
3950         }
3951
3952         if (tp->snd_una != snd_una) {
3953                 tp->snd_una = snd_una;
3954                 tp->ts_recent_age = ticks;
3955 #ifdef notyet
3956                 /*
3957                  * Keep ARP entry "minty fresh"
3958                  */
3959                 dst_confirm(sk->sk_dst_cache);
3960 #endif
3961                 if (tp->snd_una == tp->snd_nxt)
3962                         toep->tp_flags &= ~TP_TX_WAIT_IDLE;
3963         }
3964
3965         snd = so_sockbuf_snd(so);
3966         if (bytes) {
3967                 CTR1(KTR_SPARE2, "wr_ack: sbdrop(%d)", bytes);
3968                 snd = so_sockbuf_snd(so);
3969                 sockbuf_lock(snd);
3970                 sbdrop_locked(snd, bytes);
3971                 so_sowwakeup_locked(so);
3972         }
3973
3974         if (snd->sb_sndptroff < snd->sb_cc)
3975                 t3_push_frames(so, 0);
3976
3977 out_free:
3978         inp_wunlock(tp->t_inpcb);
3979         m_free(m);
3980 }
3981
3982 /*
3983  * Handler for TX_DATA_ACK CPL messages.
3984  */
3985 static int
3986 do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
3987 {
3988         struct toepcb *toep = (struct toepcb *)ctx;
3989
3990         VALIDATE_SOCK(so);
3991
3992         wr_ack(toep, m);
3993         return 0;
3994 }
3995
/*
 * TRACE_PKT CPL handler.  Trace packets are of no interest here, so the
 * mbuf chain is simply freed.  Always returns 0.
 */
static int
do_trace_pkt(struct t3cdev *dev, struct mbuf *m, void *ctx)
{

	m_freem(m);
	return (0);
}
4005
4006 /*
4007  * Reset a connection that is on a listener's SYN queue or accept queue,
4008  * i.e., one that has not had a struct socket associated with it.
4009  * Must be called from process context.
4010  *
4011  * Modeled after code in inet_csk_listen_stop().
4012  */
4013 static void
4014 t3_reset_listen_child(struct socket *child)
4015 {
4016         struct tcpcb *tp = so_sototcpcb(child);
4017
4018         t3_send_reset(tp->t_toe);
4019 }
4020
4021
4022 static void
4023 t3_child_disconnect(struct socket *so, void *arg)
4024 {
4025         struct tcpcb *tp = so_sototcpcb(so);
4026
4027         if (tp->t_flags & TF_TOE) {
4028                 inp_wlock(tp->t_inpcb);
4029                 t3_reset_listen_child(so);
4030                 inp_wunlock(tp->t_inpcb);
4031         }
4032 }
4033
4034 /*
4035  * Disconnect offloaded established but not yet accepted connections sitting
4036  * on a server's accept_queue.  We just send an ABORT_REQ at this point and
4037  * finish off the disconnect later as we may need to wait for the ABORT_RPL.
4038  */
4039 void
4040 t3_disconnect_acceptq(struct socket *listen_so)
4041 {
4042
4043         so_lock(listen_so);
4044         so_listeners_apply_all(listen_so, t3_child_disconnect, NULL);
4045         so_unlock(listen_so);
4046 }
4047
4048 /*
4049  * Reset offloaded connections sitting on a server's syn queue.  As above
4050  * we send ABORT_REQ and finish off when we get ABORT_RPL.
4051  */
4052
4053 void
4054 t3_reset_synq(struct listen_ctx *lctx)
4055 {
4056         struct toepcb *toep;
4057
4058         so_lock(lctx->lso);
4059         while (!LIST_EMPTY(&lctx->synq_head)) {
4060                 toep = LIST_FIRST(&lctx->synq_head);
4061                 LIST_REMOVE(toep, synq_entry);
4062                 toep->tp_tp = NULL;
4063                 t3_send_reset(toep);
4064                 cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
4065                 toepcb_release(toep);
4066         }
4067         so_unlock(lctx->lso);
4068 }
4069
4070
4071 int
4072 t3_setup_ppods(struct toepcb *toep, const struct ddp_gather_list *gl,
4073                    unsigned int nppods, unsigned int tag, unsigned int maxoff,
4074                    unsigned int pg_off, unsigned int color)
4075 {
4076         unsigned int i, j, pidx;
4077         struct pagepod *p;
4078         struct mbuf *m;
4079         struct ulp_mem_io *req;
4080         unsigned int tid = toep->tp_tid;
4081         const struct tom_data *td = TOM_DATA(toep->tp_toedev);
4082         unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
4083
4084         CTR6(KTR_TOM, "t3_setup_ppods(gl=%p nppods=%u tag=%u maxoff=%u pg_off=%u color=%u)",
4085             gl, nppods, tag, maxoff, pg_off, color);
4086
4087         for (i = 0; i < nppods; ++i) {
4088                 m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
4089                 m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4090                 req = mtod(m, struct ulp_mem_io *);
4091                 m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
4092                 req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4093                 req->wr.wr_lo = 0;
4094                 req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
4095                                            V_ULPTX_CMD(ULP_MEM_WRITE));
4096                 req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
4097                                  V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
4098
4099                 p = (struct pagepod *)(req + 1);
4100                 if (__predict_false(i < nppods - NUM_SENTINEL_PPODS)) {
4101                         p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
4102                         p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
4103                                                   V_PPOD_COLOR(color));
4104                         p->pp_max_offset = htonl(maxoff);
4105                         p->pp_page_offset = htonl(pg_off);
4106                         p->pp_rsvd = 0;
4107                         for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
4108                                 p->pp_addr[j] = pidx < gl->dgl_nelem ?
4109                                     htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
4110                 } else
4111                         p->pp_vld_tid = 0;   /* mark sentinel page pods invalid */
4112                 send_or_defer(toep, m, 0);
4113                 ppod_addr += PPOD_SIZE;
4114         }
4115         return (0);
4116 }
4117
4118 /*
4119  * Build a CPL_BARRIER message as payload of a ULP_TX_PKT command.
4120  */
4121 static inline void
4122 mk_cpl_barrier_ulp(struct cpl_barrier *b)
4123 {
4124         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)b;
4125
4126         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4127         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*b) / 8));
4128         b->opcode = CPL_BARRIER;
4129 }
4130
4131 /*
4132  * Build a CPL_GET_TCB message as payload of a ULP_TX_PKT command.
4133  */
4134 static inline void
4135 mk_get_tcb_ulp(struct cpl_get_tcb *req, unsigned int tid, unsigned int cpuno)
4136 {
4137         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4138
4139         txpkt = (struct ulp_txpkt *)req;
4140         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4141         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4142         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, tid));
4143         req->cpuno = htons(cpuno);
4144 }
4145
4146 /*
4147  * Build a CPL_SET_TCB_FIELD message as payload of a ULP_TX_PKT command.
4148  */
4149 static inline void
4150 mk_set_tcb_field_ulp(struct cpl_set_tcb_field *req, unsigned int tid,
4151                      unsigned int word, uint64_t mask, uint64_t val)
4152 {
4153         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)req;
4154
4155         CTR4(KTR_TCB, "mk_set_tcb_field_ulp(tid=%u word=0x%x mask=%jx val=%jx",
4156             tid, word, mask, val);
4157
4158         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4159         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*req) / 8));
4160         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
4161         req->reply = V_NO_REPLY(1);
4162         req->cpu_idx = 0;
4163         req->word = htons(word);
4164         req->mask = htobe64(mask);
4165         req->val = htobe64(val);
4166 }
4167
4168 /*
4169  * Build a CPL_RX_DATA_ACK message as payload of a ULP_TX_PKT command.
4170  */
4171 static void
4172 mk_rx_data_ack_ulp(struct toepcb *toep, struct cpl_rx_data_ack *ack,
4173     unsigned int tid, unsigned int credits)
4174 {
4175         struct ulp_txpkt *txpkt = (struct ulp_txpkt *)ack;
4176
4177         txpkt->cmd_dest = htonl(V_ULPTX_CMD(ULP_TXPKT));
4178         txpkt->len = htonl(V_ULPTX_NFLITS(sizeof(*ack) / 8));
4179         OPCODE_TID(ack) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, tid));
4180         ack->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
4181             V_RX_DACK_MODE(TOM_TUNABLE(toep->tp_toedev, delack)) |
4182                                  V_RX_CREDITS(credits));
4183 }
4184
4185 void
4186 t3_cancel_ddpbuf(struct toepcb *toep, unsigned int bufidx)
4187 {
4188         unsigned int wrlen;
4189         struct mbuf *m;
4190         struct work_request_hdr *wr;
4191         struct cpl_barrier *lock;
4192         struct cpl_set_tcb_field *req;
4193         struct cpl_get_tcb *getreq;
4194         struct ddp_state *p = &toep->tp_ddp_state;
4195
4196 #if 0
4197         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4198 #endif
4199         wrlen = sizeof(*wr) + sizeof(*req) + 2 * sizeof(*lock) +
4200                 sizeof(*getreq);
4201         m = m_gethdr_nofail(wrlen);
4202         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4203         wr = mtod(m, struct work_request_hdr *);
4204         bzero(wr, wrlen);
4205
4206         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4207         m->m_pkthdr.len = m->m_len = wrlen;
4208
4209         lock = (struct cpl_barrier *)(wr + 1);
4210         mk_cpl_barrier_ulp(lock);
4211
4212         req = (struct cpl_set_tcb_field *)(lock + 1);
4213
4214         CTR1(KTR_TCB, "t3_cancel_ddpbuf(bufidx=%u)", bufidx);
4215
4216         /* Hmmm, not sure if this actually a good thing: reactivating
4217          * the other buffer might be an issue if it has been completed
4218          * already. However, that is unlikely, since the fact that the UBUF
4219          * is not completed indicates that there is no oustanding data.
4220          */
4221         if (bufidx == 0)
4222                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4223                                      V_TF_DDP_ACTIVE_BUF(1) |
4224                                      V_TF_DDP_BUF0_VALID(1),
4225                                      V_TF_DDP_ACTIVE_BUF(1));
4226         else
4227                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4228                                      V_TF_DDP_ACTIVE_BUF(1) |
4229                                      V_TF_DDP_BUF1_VALID(1), 0);
4230
4231         getreq = (struct cpl_get_tcb *)(req + 1);
4232         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4233
4234         mk_cpl_barrier_ulp((struct cpl_barrier *)(getreq + 1));
4235
4236         /* Keep track of the number of oustanding CPL_GET_TCB requests
4237          */
4238         p->get_tcb_count++;
4239
4240 #ifdef T3_TRACE
4241         T3_TRACE1(TIDTB(so),
4242                   "t3_cancel_ddpbuf: bufidx %u", bufidx);
4243 #endif
4244         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4245 }
4246
4247 /**
4248  * t3_overlay_ddpbuf - overlay an existing DDP buffer with a new one
4249  * @sk: the socket associated with the buffers
4250  * @bufidx: index of HW DDP buffer (0 or 1)
4251  * @tag0: new tag for HW buffer 0
4252  * @tag1: new tag for HW buffer 1
4253  * @len: new length for HW buf @bufidx
4254  *
4255  * Sends a compound WR to overlay a new DDP buffer on top of an existing
4256  * buffer by changing the buffer tag and length and setting the valid and
4257  * active flag accordingly.  The caller must ensure the new buffer is at
4258  * least as big as the existing one.  Since we typically reprogram both HW
4259  * buffers this function sets both tags for convenience. Read the TCB to
4260  * determine how made data was written into the buffer before the overlay
4261  * took place.
4262  */
4263 void
4264 t3_overlay_ddpbuf(struct toepcb *toep, unsigned int bufidx, unsigned int tag0,
4265                        unsigned int tag1, unsigned int len)
4266 {
4267         unsigned int wrlen;
4268         struct mbuf *m;
4269         struct work_request_hdr *wr;
4270         struct cpl_get_tcb *getreq;
4271         struct cpl_set_tcb_field *req;
4272         struct ddp_state *p = &toep->tp_ddp_state;
4273
4274         CTR4(KTR_TCB, "t3_setup_ppods(bufidx=%u tag0=%u tag1=%u len=%u)",
4275             bufidx, tag0, tag1, len);
4276 #if 0
4277         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4278 #endif
4279         wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
4280         m = m_gethdr_nofail(wrlen);
4281         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4282         wr = mtod(m, struct work_request_hdr *);
4283         m->m_pkthdr.len = m->m_len = wrlen;
4284         bzero(wr, wrlen);
4285
4286
4287         /* Set the ATOMIC flag to make sure that TP processes the following
4288          * CPLs in an atomic manner and no wire segments can be interleaved.
4289          */
4290         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS) | F_WR_ATOMIC);
4291         req = (struct cpl_set_tcb_field *)(wr + 1);
4292         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_TAG,
4293                              V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG) |
4294                              V_TCB_RX_DDP_BUF1_TAG(M_TCB_RX_DDP_BUF1_TAG) << 32,
4295                              V_TCB_RX_DDP_BUF0_TAG(tag0) |
4296                              V_TCB_RX_DDP_BUF1_TAG((uint64_t)tag1) << 32);
4297         req++;
4298         if (bufidx == 0) {
4299                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_LEN,
4300                             V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4301                             V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
4302                 req++;
4303                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4304                             V_TF_DDP_PUSH_DISABLE_0(1) |
4305                             V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4306                             V_TF_DDP_PUSH_DISABLE_0(0) |
4307                             V_TF_DDP_BUF0_VALID(1));
4308         } else {
4309                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_LEN,
4310                             V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN),
4311                             V_TCB_RX_DDP_BUF1_LEN((uint64_t)len));
4312                 req++;
4313                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS,
4314                             V_TF_DDP_PUSH_DISABLE_1(1) |
4315                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
4316                             V_TF_DDP_PUSH_DISABLE_1(0) |
4317                             V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1));
4318         }
4319
4320         getreq = (struct cpl_get_tcb *)(req + 1);
4321         mk_get_tcb_ulp(getreq, toep->tp_tid, toep->tp_qset);
4322
4323         /* Keep track of the number of oustanding CPL_GET_TCB requests
4324          */
4325         p->get_tcb_count++;
4326
4327 #ifdef T3_TRACE
4328         T3_TRACE4(TIDTB(sk),
4329                   "t3_overlay_ddpbuf: bufidx %u tag0 %u tag1 %u "
4330                   "len %d",
4331                   bufidx, tag0, tag1, len);
4332 #endif
4333         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4334 }
4335
4336 /*
4337  * Sends a compound WR containing all the CPL messages needed to program the
4338  * two HW DDP buffers, namely optionally setting up the length and offset of
4339  * each buffer, programming the DDP flags, and optionally sending RX_DATA_ACK.
4340  */
4341 void
4342 t3_setup_ddpbufs(struct toepcb *toep, unsigned int len0, unsigned int offset0,
4343                       unsigned int len1, unsigned int offset1,
4344                       uint64_t ddp_flags, uint64_t flag_mask, int modulate)
4345 {
4346         unsigned int wrlen;
4347         struct mbuf *m;
4348         struct work_request_hdr *wr;
4349         struct cpl_set_tcb_field *req;
4350
4351         CTR6(KTR_TCB, "t3_setup_ddpbufs(len0=%u offset0=%u len1=%u offset1=%u ddp_flags=0x%08x%08x ",
4352             len0, offset0, len1, offset1, ddp_flags >> 32, ddp_flags & 0xffffffff);
4353
4354 #if 0
4355         SOCKBUF_LOCK_ASSERT(&toeptoso(toep)->so_rcv);
4356 #endif
4357         wrlen = sizeof(*wr) + sizeof(*req) + (len0 ? sizeof(*req) : 0) +
4358                 (len1 ? sizeof(*req) : 0) +
4359                 (modulate ? sizeof(struct cpl_rx_data_ack) : 0);
4360         m = m_gethdr_nofail(wrlen);
4361         m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
4362         wr = mtod(m, struct work_request_hdr *);
4363         bzero(wr, wrlen);
4364
4365         wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
4366         m->m_pkthdr.len = m->m_len = wrlen;
4367
4368         req = (struct cpl_set_tcb_field *)(wr + 1);
4369         if (len0) {                  /* program buffer 0 offset and length */
4370                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
4371                         V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
4372                         V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
4373                         V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset0) |
4374                         V_TCB_RX_DDP_BUF0_LEN((uint64_t)len0));
4375                 req++;
4376         }
4377         if (len1) {                  /* program buffer 1 offset and length */
4378                 mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF1_OFFSET,
4379                         V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
4380                         V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN) << 32,
4381                         V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset1) |
4382                         V_TCB_RX_DDP_BUF1_LEN((uint64_t)len1) << 32);
4383                 req++;
4384         }
4385
4386         mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_FLAGS, flag_mask,
4387                              ddp_flags);
4388
4389         if (modulate) {
4390                 mk_rx_data_ack_ulp(toep,
4391                     (struct cpl_rx_data_ack *)(req + 1), toep->tp_tid,
4392                     toep->tp_copied_seq - toep->tp_rcv_wup);
4393                 toep->tp_rcv_wup = toep->tp_copied_seq;
4394         }
4395
4396 #ifdef T3_TRACE
4397         T3_TRACE5(TIDTB(sk),
4398                   "t3_setup_ddpbufs: len0 %u len1 %u ddp_flags 0x%08x%08x "
4399                   "modulate %d",
4400                   len0, len1, ddp_flags >> 32, ddp_flags & 0xffffffff,
4401                   modulate);
4402 #endif
4403
4404         cxgb_ofld_send(TOEP_T3C_DEV(toep), m);
4405 }
4406
4407 void
4408 t3_init_wr_tab(unsigned int wr_len)
4409 {
4410         int i;
4411
4412         if (mbuf_wrs[1])     /* already initialized */
4413                 return;
4414
4415         for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
4416                 int sgl_len = (3 * i) / 2 + (i & 1);
4417
4418                 sgl_len += 3;
4419                 mbuf_wrs[i] = sgl_len <= wr_len ?
4420                         1 : 1 + (sgl_len - 2) / (wr_len - 1);
4421         }
4422
4423         wrlen = wr_len * 8;
4424 }
4425
/*
 * Register this file's CPL message handlers with the TOM layer through
 * t3tom_register_cpl_handler(), one call per CPL opcode.
 *
 * Returns 0.  The only other return path (-1) sits inside the "notyet"
 * section below and is compiled only when that macro is defined.
 */
int
t3_init_cpl_io(void)
{
#ifdef notyet
        /*
         * Compiled out unless "notyet" is defined: allocates and zeroes a
         * TCP-header-sized sk_buff using sk_buff calls (alloc_skb, skb_put).
         */
        tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
        if (!tcphdr_skb) {
                log(LOG_ERR,
                       "Chelsio TCP offload: can't allocate sk_buff\n");
                return -1;
        }
        skb_put(tcphdr_skb, sizeof(struct tcphdr));
        tcphdr_skb->h.raw = tcphdr_skb->data;
        memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

        /* Pair each CPL opcode with its do_*() handler. */
        t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
        t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
        /* Note: CPL_TX_DMA_ACK is the opcode routed to do_wr_ack. */
        t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
        t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
        t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
        t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
        t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
        t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
        /* The *_RSS abort opcodes map to do_abort_req / do_abort_rpl. */
        t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
        t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
        t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
        t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
        t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
        t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
        t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
        return (0);
}
4458