/**************************************************************************

Copyright (c) 2007, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>


#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <net/route.h>


#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <machine/bus.h>
#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>


/*
 * For ULP connections HW may add headers, e.g., for digests, that aren't part
 * of the messages sent by the host but that are part of the TCP payload and
 * therefore consume TCP sequence space.  Tx connection parameters that
 * operate in TCP sequence space are affected by the HW additions and need to
 * compensate for them to accurately track TCP sequence numbers. This array
 * contains the compensating extra lengths for ULP packets.  It is indexed by
 * a packet's ULP submode.
 */
const unsigned int t3_ulp_extra_len[] = {0, 4, 4, 8};

#ifdef notyet
/*
 * This sk_buff holds a fake header-only TCP segment that we use whenever we
 * need to exploit SW TCP functionality that expects TCP headers, such as
 * tcp_create_openreq_child().  It's a RO buffer that may be used by multiple
 * CPUs without locking.
 */
static struct mbuf *tcphdr_mbuf __read_mostly;
#endif

/*
 * Size of WRs in bytes.  Note that we assume all devices we are handling have
 * the same WR size.
 */
static unsigned int wrlen __read_mostly;

/*
 * The number of WRs needed for an skb depends on the number of page fragments
 * in the skb and whether it has any payload in its main body.  This maps the
 * length of the gather list represented by an skb into the # of necessary WRs.
 */
static unsigned int mbuf_wrs[TX_MAX_SEGS] __read_mostly;

/*
 * Max receive window supported by HW in bytes.  Only a small part of it can
 * be set through option0, the rest needs to be set through RX_DATA_ACK.
 */
#define MAX_RCV_WND ((1U << 27) - 1)

/*
 * Min receive window.  We want it to be large enough to accommodate receive
 * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
 */
#define MIN_RCV_WND (24 * 1024U)
#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)

#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
#define DEBUG_WR 0

extern int tcp_do_autorcvbuf;
extern int tcp_do_autosndbuf;
extern int tcp_autorcvbuf_max;
extern int tcp_autosndbuf_max;

static void t3_send_reset(struct toepcb *toep);
static void send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status);
static inline void free_atid(struct t3cdev *cdev, unsigned int tid);
static void handle_syncache_event(int event, void *arg);


static inline int
is_t3a(const struct toedev *dev)
{
	return (dev->tod_ttid == TOE_ID_CHELSIO_T3);
}

static void
dump_toepcb(struct toepcb *toep)
{
	DPRINTF("qset_idx=%d qset=%d ulp_mode=%d mtu_idx=%d tid=%d\n",
	    toep->tp_qset_idx, toep->tp_qset, toep->tp_ulp_mode,
	    toep->tp_mtu_idx, toep->tp_tid);

	DPRINTF("wr_max=%d wr_avail=%d wr_unacked=%d mss_clamp=%d flags=0x%x\n",
	    toep->tp_wr_max, toep->tp_wr_avail, toep->tp_wr_unacked, 
	    toep->tp_mss_clamp, toep->tp_flags);
}

static struct rtentry *
rtalloc2(struct sockaddr *dst, int report, u_long ignflags)
{
	struct rtentry *rt = NULL;
	
	if ((rt = rtalloc1(dst, report, ignflags)) != NULL)
		RT_UNLOCK(rt);

	return (rt);
}

/*
 * Determine whether to send a CPL message now or defer it.  A message is
 * deferred if the connection is in SYN_SENT since we don't know the TID yet.
 * For connections in other states the message is sent immediately.
 * If through_l2t is set the message is subject to ARP processing, otherwise
 * it is sent directly.
 */
static inline void
send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
{
	struct toepcb *toep = tp->t_toe;

	
	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
		INP_LOCK(tp->t_inpcb);
		mbufq_tail(&toep->out_of_order_queue, m);  // defer
		INP_UNLOCK(tp->t_inpcb);
	} else if (through_l2t)
		l2t_send(T3C_DEV(so), m, toep->tp_l2t);  // send through L2T
	else
		cxgb_ofld_send(T3C_DEV(so), m);          // send directly
}

static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
        return cntrl;
}

/*
 * Populate a TID_RELEASE WR.  The skb must be already propely sized.
 */
static inline void
mk_tid_release(struct mbuf *m, const struct socket *so, unsigned int tid)
{
	struct cpl_tid_release *req;

	m_set_priority(m, mkprio(CPL_PRIORITY_SETUP, so));
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req = mtod(m, struct cpl_tid_release *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, tid));
}

static inline void
make_tx_data_wr(struct socket *so, struct mbuf *m, int len, struct mbuf *tail)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct tx_data_wr *req;

	INP_LOCK_ASSERT(tp->t_inpcb);

	req = mtod(m, struct tx_data_wr *);
	m->m_len = sizeof(*req);
	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
	req->wr_lo = htonl(V_WR_TID(toep->tp_tid));
	/* len includes the length of any HW ULP additions */
	req->len = htonl(len);
	req->param = htonl(V_TX_PORT(toep->tp_l2t->smt_idx));
	/* V_TX_ULP_SUBMODE sets both the mode and submode */
	req->flags = htonl(V_TX_ULP_SUBMODE(/*skb_ulp_mode(skb)*/ 0) |
	                   V_TX_URG(/* skb_urgent(skb) */ 0 ) |
	                   V_TX_SHOVE((!(tp->t_flags & TF_MORETOCOME) &&
				   (tail ? 0 : 1))));
	req->sndseq = htonl(tp->snd_nxt);
	if (__predict_false((toep->tp_flags & TP_DATASENT) == 0)) {
		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT | 
				    V_TX_CPU_IDX(toep->tp_qset));
 
		/* Sendbuffer is in units of 32KB.
		 */
		if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) 
			req->param |= htonl(V_TX_SNDBUF(tcp_autosndbuf_max >> 15));
		else
			req->param |= htonl(V_TX_SNDBUF(so->so_snd.sb_hiwat >> 15));
		toep->tp_flags |= TP_DATASENT;
	}
}

int
t3_push_frames(struct socket *so, int req_completion)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	struct mbuf *tail, *m0, *last;
	struct t3cdev *cdev;
	struct tom_data *d;
	int bytes, count, total_bytes;
	bus_dma_segment_t segs[TX_MAX_SEGS], *segp;
	segp = segs;

	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_CLOSED) {
		DPRINTF("tcp state=%d\n", tp->t_state);	
		return (0);
	}	

	if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) {
		DPRINTF("disconnecting\n");
		
		return (0);
	}

	INP_LOCK_ASSERT(tp->t_inpcb);
	
	SOCKBUF_LOCK(&so->so_snd);
	
	d = TOM_DATA(TOE_DEV(so));
	cdev = d->cdev;
	last = tail = so->so_snd.sb_sndptr ? so->so_snd.sb_sndptr : so->so_snd.sb_mb;
	total_bytes = 0;
	DPRINTF("wr_avail=%d tail=%p snd.cc=%d tp_last=%p\n",
	    toep->tp_wr_avail, tail, so->so_snd.sb_cc, toep->tp_m_last);

	if (last && toep->tp_m_last == last  && so->so_snd.sb_sndptroff != 0) {
		KASSERT(tail, ("sbdrop error"));
		last = tail = tail->m_next;
	}

	if ((toep->tp_wr_avail == 0 ) || (tail == NULL)) {
		DPRINTF("wr_avail=%d tail=%p\n", toep->tp_wr_avail, tail);
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);		
	}
			
	toep->tp_m_last = NULL;
	while (toep->tp_wr_avail && (tail != NULL)) {
		count = bytes = 0;
		if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			return (0);
		}
		while ((mbuf_wrs[count + 1] <= toep->tp_wr_avail)
		    && (tail != NULL) && (count < TX_MAX_SEGS)) {
			bytes += tail->m_len;
			count++;
			last = tail;
			/*
			 * technically an abuse to be using this for a VA
			 * but less gross than defining my own structure
			 * or calling pmap_kextract from here :-|
			 */
			segp->ds_addr = (bus_addr_t)tail->m_data;
			segp->ds_len = tail->m_len;
			DPRINTF("count=%d wr_needed=%d ds_addr=%p ds_len=%d\n",
			    count, mbuf_wrs[count], tail->m_data, tail->m_len);
			
			segp++;
			tail = tail->m_next;
		}
		DPRINTF("wr_avail=%d mbuf_wrs[%d]=%d tail=%p\n",
		    toep->tp_wr_avail, count, mbuf_wrs[count], tail);	
		if (tail) {
			so->so_snd.sb_sndptr = tail;
			toep->tp_m_last = NULL;
		} else 
			toep->tp_m_last = so->so_snd.sb_sndptr = last;

		DPRINTF("toep->tp_m_last=%p\n", toep->tp_m_last);

		so->so_snd.sb_sndptroff += bytes;
		total_bytes += bytes;
		toep->tp_write_seq += bytes;


		SOCKBUF_UNLOCK(&so->so_snd);
		
		/*
		 * XXX can drop socket buffer lock here
		 */
	
		toep->tp_wr_avail -= mbuf_wrs[count];
		toep->tp_wr_unacked += mbuf_wrs[count];
		
		make_tx_data_wr(so, m0, bytes, tail);
		m_set_priority(m0, mkprio(CPL_PRIORITY_DATA, so));
		m_set_sgl(m0, segs);
		m_set_sgllen(m0, count);
		/*
		 * remember credits used
		 */
		m0->m_pkthdr.csum_data = mbuf_wrs[count];
		m0->m_pkthdr.len = bytes;
		if ((req_completion && toep->tp_wr_unacked == mbuf_wrs[count]) ||
		    toep->tp_wr_unacked >= toep->tp_wr_max / 2) {
			struct work_request_hdr *wr = cplhdr(m0);

			wr->wr_hi |= htonl(F_WR_COMPL);
			toep->tp_wr_unacked = 0;	
		}

		m0->m_type = MT_DONTFREE;
		enqueue_wr(toep, m0);
		DPRINTF("sending offload tx with %d bytes in %d segments\n",
		    bytes, count);
		
		l2t_send(cdev, m0, toep->tp_l2t);
		if (toep->tp_wr_avail && (tail != NULL)) 
			SOCKBUF_LOCK(&so->so_snd);
	}

	SOCKBUF_UNLOCK_ASSERT(&so->so_snd);
	return (total_bytes);
}

/*
 * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
 * under any circumstances.  We take the easy way out and always queue the
 * message to the write_queue.  We can optimize the case where the queue is
 * already empty though the optimization is probably not worth it.
 */
static void
close_conn(struct socket *so)
{
	struct mbuf *m;
	struct cpl_close_con_req *req;
	struct tom_data *d;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp;
	struct toepcb *toep;
	unsigned int tid; 


	INP_LOCK(inp);
	tp = sototcpcb(so);
	toep = tp->t_toe;
	
	if (tp->t_state != TCPS_SYN_SENT)
		t3_push_frames(so, 1);
	
	if (toep->tp_flags & TP_FIN_SENT) {
		INP_UNLOCK(inp);
		return;
	}

	tid = toep->tp_tid;
	    
	d = TOM_DATA(toep->tp_toedev);
	
	m = m_gethdr_nofail(sizeof(*req));

	toep->tp_flags |= TP_FIN_SENT;
	req = mtod(m, struct cpl_close_con_req *);
	
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
	req->rsvd = htonl(toep->tp_write_seq);
	INP_UNLOCK(inp);
	/*
	 * XXX - need to defer shutdown while there is still data in the queue
	 *
	 */
	cxgb_ofld_send(d->cdev, m);

}

/*
 * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
 * and send it along.
 */
static void
abort_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{
	struct cpl_abort_req *req = cplhdr(m);

	req->cmd = CPL_ABORT_NO_RST;
	cxgb_ofld_send(cdev, m);
}

/*
 * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
 * permitted to return without sending the message in case we cannot allocate
 * an sk_buff.  Returns the number of credits sent.
 */
uint32_t
t3_send_rx_credits(struct tcpcb *tp, uint32_t credits, uint32_t dack, int nofail)
{
	struct mbuf *m;
	struct cpl_rx_data_ack *req;
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = toep->tp_toedev;
	
	m = m_gethdr_nofail(sizeof(*req));

	DPRINTF("returning %u credits to HW\n", credits);
	
	req = mtod(m, struct cpl_rx_data_ack *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
	m_set_priority(m, mkprio(CPL_PRIORITY_ACK, toeptoso(toep))); 
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	return (credits);
}


/*
 * Set of states for which we should return RX credits.
 */
#define CREDIT_RETURN_STATE (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2)

/*
 * Called after some received data has been read.  It returns RX credits
 * to the HW for the amount of data processed.
 */
void
t3_cleanup_rbuf(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
	struct socket *so;
	struct toedev *dev;
	int dack_mode, must_send, read;
	u32 thres, credits, dack = 0;

	if (!((tp->t_state == TCPS_ESTABLISHED) || (tp->t_state == TCPS_FIN_WAIT_1) ||
		(tp->t_state == TCPS_FIN_WAIT_2)))
		return;
	INP_LOCK_ASSERT(tp->t_inpcb);
	
	so = tp->t_inpcb->inp_socket;
	SOCKBUF_LOCK(&so->so_rcv);
	read = toep->tp_enqueued_bytes - so->so_rcv.sb_cc;
	toep->tp_copied_seq += read;
	toep->tp_enqueued_bytes -= read;
	credits = toep->tp_copied_seq - toep->tp_rcv_wup;
	SOCKBUF_UNLOCK(&so->so_rcv);

	if (credits > so->so_rcv.sb_mbmax)
	    printf("copied_seq=%u rcv_wup=%u credits=%u\n",
		toep->tp_copied_seq, toep->tp_rcv_wup, credits);
	/*
	 * XXX this won't accurately reflect credit return - we need
	 * to look at the difference between the amount that has been 
	 * put in the recv sockbuf and what is there now
	 */

	if (__predict_false(!credits))
		return;

	dev = toep->tp_toedev;
	thres = TOM_TUNABLE(dev, rx_credit_thres);

	if (__predict_false(thres == 0))
		return;

	if (toep->tp_ulp_mode)
		dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
	else {
		dack_mode = TOM_TUNABLE(dev, delack);
		if (__predict_false(dack_mode != toep->tp_delack_mode)) {
			u32 r = tp->rcv_nxt - toep->tp_delack_seq;

			if (r >= tp->rcv_wnd || r >= 16 * toep->tp_mss_clamp)
				dack = F_RX_DACK_CHANGE |
				       V_RX_DACK_MODE(dack_mode);
		}
	}

	/*
	 * For coalescing to work effectively ensure the receive window has
	 * at least 16KB left.
	 */
	must_send = credits + 16384 >= tp->rcv_wnd;

	if (must_send || credits >= thres)
		toep->tp_rcv_wup += t3_send_rx_credits(tp, credits, dack, must_send);
}

static int
cxgb_toe_disconnect(struct tcpcb *tp)
{
	struct socket *so;
	
	DPRINTF("cxgb_toe_disconnect\n");

	so = tp->t_inpcb->inp_socket;
	close_conn(so);
	return (0);
}

static int
cxgb_toe_reset(struct tcpcb *tp)
{
	struct toepcb *toep = tp->t_toe;
       
	
	t3_send_reset(toep);

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	toep->tp_tp = NULL;
	tp->t_toe = NULL;
	return (0);
}

static int
cxgb_toe_send(struct tcpcb *tp)
{
	struct socket *so;
	
	DPRINTF("cxgb_toe_send\n");
	dump_toepcb(tp->t_toe);

	so = tp->t_inpcb->inp_socket;
	t3_push_frames(so, 1);
	return (0);
}

static int
cxgb_toe_rcvd(struct tcpcb *tp)
{
	INP_LOCK_ASSERT(tp->t_inpcb);
	t3_cleanup_rbuf(tp);
	
	return (0);
}

static void
cxgb_toe_detach(struct tcpcb *tp)
{
	struct toepcb *toep;
	/*
	 * XXX how do we handle teardown in the SYN_SENT state?
	 *
	 */
	INP_INFO_WLOCK(&tcbinfo);
	toep = tp->t_toe;
	toep->tp_tp = NULL;

	/*
	 * unhook from socket
	 */
	tp->t_flags &= ~TF_TOE;
	tp->t_toe = NULL;
	INP_INFO_WUNLOCK(&tcbinfo);
}
	

static struct toe_usrreqs cxgb_toe_usrreqs = {
	.tu_disconnect = cxgb_toe_disconnect,
	.tu_reset = cxgb_toe_reset,
	.tu_send = cxgb_toe_send,
	.tu_rcvd = cxgb_toe_rcvd,
	.tu_detach = cxgb_toe_detach,
	.tu_detach = cxgb_toe_detach,
	.tu_syncache_event = handle_syncache_event,
};


static void
__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
			    uint64_t mask, uint64_t val, int no_reply)
{
	struct cpl_set_tcb_field *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	req = mtod(m, struct cpl_set_tcb_field *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
	req->reply = V_NO_REPLY(no_reply);
	req->cpu_idx = 0;
	req->word = htons(word);
	req->mask = htobe64(mask);
	req->val = htobe64(val);

	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
	send_or_defer(so, tp, m, 0);
}

static void
t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
{
	struct mbuf *m;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	if (toep == NULL)
		return;

	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
		return;

	m = m_gethdr_nofail(sizeof(struct cpl_set_tcb_field));

	__set_tcb_field(so, m, word, mask, val, 1);
}

/*
 * Set one of the t_flags bits in the TCB.
 */
static void
set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
{
	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos, val << bit_pos);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's Nagle setting.
 */
static void
t3_set_nagle(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);
	
	set_tcb_tflag(so, S_TF_NAGLE, !(tp->t_flags & TF_NODELAY));
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's keepalive setting.
 */
void
t3_set_keepalive(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
}

void
t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
{
	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
}

/*
 * Send a SET_TCB_FIELD CPL message to change a connection's TOS setting.
 */
static void
t3_set_tos(struct socket *so)
{
	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
			 V_TCB_TOS(SO_TOS(so)));
}


/*
 * In DDP mode, TP fails to schedule a timer to push RX data to the host when
 * DDP is disabled (data is delivered to freelist). [Note that, the peer should
 * set the PSH bit in the last segment, which would trigger delivery.]
 * We work around the issue by setting a DDP buffer in a partial placed state,
 * which guarantees that TP will schedule a timer.
 */
#define TP_DDP_TIMER_WORKAROUND_MASK\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
#define TP_DDP_TIMER_WORKAROUND_VAL\
    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
     ((V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)1) | V_TCB_RX_DDP_BUF0_LEN((uint64_t)2)) <<\
      32))

static void
t3_enable_ddp(struct socket *so, int on)
{
	if (on)
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
				 V_TF_DDP_OFF(0));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_MASK,
				 V_TF_DDP_OFF(1) |
				 TP_DDP_TIMER_WORKAROUND_VAL);

}


void
t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
{
	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
			 tag_color);
}

void
t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
		    unsigned int len)
{
	if (buf_idx == 0)
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
	else
		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
}

static int
t3_set_cong_control(struct socket *so, const char *name)
{
#ifdef notyet	
	int cong_algo;

	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
		if (!strcmp(name, t3_cong_ops[cong_algo].name))
			break;

	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
		return -EINVAL;
#endif
	return 0;
}

int
t3_get_tcb(struct socket *so)
{
	struct cpl_get_tcb *req;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);

	if (!m)
		return (ENOMEM);
	
	INP_LOCK_ASSERT(tp->t_inpcb);	
	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));	
	req = mtod(m, struct cpl_get_tcb *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
	req->cpuno = htons(toep->tp_qset);
	if (sototcpcb(so)->t_state == TCPS_SYN_SENT)
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		cxgb_ofld_send(T3C_DEV(so), m);
	return 0;
}

static inline void
so_insert_tid(struct tom_data *d, struct socket *so, unsigned int tid)
{
	struct toepcb *toep = sototoep(so);
	toepcb_hold(toep);

	cxgb_insert_tid(d->cdev, d->client, toep, tid);
}

/**
 *	find_best_mtu - find the entry in the MTU table closest to an MTU
 *	@d: TOM state
 *	@mtu: the target MTU
 *
 *	Returns the index of the value in the MTU table that is closest to but
 *	does not exceed the target MTU.
 */
static unsigned int
find_best_mtu(const struct t3c_data *d, unsigned short mtu)
{
	int i = 0;

	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
		++i;
	return (i);
}

static unsigned int
select_mss(struct t3c_data *td, struct tcpcb *tp, unsigned int pmtu)
{
	unsigned int idx;
	
#ifdef notyet
	struct rtentry *dst = sotoinpcb(so)->inp_route.ro_rt;
#endif
	if (tp) {
		tp->t_maxseg = pmtu - 40;
		if (tp->t_maxseg < td->mtus[0] - 40)
			tp->t_maxseg = td->mtus[0] - 40;
		idx = find_best_mtu(td, tp->t_maxseg + 40);

		tp->t_maxseg = td->mtus[idx] - 40;
	} else
		idx = find_best_mtu(td, pmtu);
	
	return (idx);
}

void
t3_release_ddp_resources(struct toepcb *toep)
{
	/*
	 * This is a no-op until we have DDP support
	 */
}

static inline void
free_atid(struct t3cdev *cdev, unsigned int tid)
{
	struct toepcb *toep = cxgb_free_atid(cdev, tid);

	if (toep)
		toepcb_release(toep);
}

/*
 * Release resources held by an offload connection (TID, L2T entry, etc.)
 */
static void
t3_release_offload_resources(struct toepcb *toep)
{
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct t3cdev *cdev;
	unsigned int tid = toep->tp_tid;

	if (!tdev)
		return;

	cdev = TOEP_T3C_DEV(toep);
	if (!cdev)
		return;

	toep->tp_qset = 0;
	t3_release_ddp_resources(toep);

#ifdef CTRL_SKB_CACHE
	kfree_skb(CTRL_SKB_CACHE(tp));
	CTRL_SKB_CACHE(tp) = NULL;
#endif

	if (toep->tp_wr_avail != toep->tp_wr_max) {
		purge_wr_queue(toep);
		reset_wr_list(toep);
	}

	if (toep->tp_l2t) {
		l2t_release(L2DATA(cdev), toep->tp_l2t);
		toep->tp_l2t = NULL;
	}
	printf("setting toep->tp_tp to NULL\n");
	
	toep->tp_tp = NULL;
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		tp->t_toe = NULL;
		tp->t_flags &= ~TF_TOE;
	}
	
	if (toep->tp_state == TCPS_SYN_SENT) {
		free_atid(cdev, tid);
#ifdef notyet		
		__skb_queue_purge(&tp->out_of_order_queue);
#endif		
	} else {                                          // we have TID
		cxgb_remove_tid(cdev, toep, tid);
		toepcb_release(toep);
	}
#if 0
	log(LOG_INFO, "closing TID %u, state %u\n", tid, tp->t_state);
#endif
}

static void
install_offload_ops(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	KASSERT(tp->t_toe != NULL, ("toepcb not set"));
	
	t3_install_socket_ops(so);
	tp->t_flags |= TF_TOE;
	tp->t_tu = &cxgb_toe_usrreqs;
}

/*
 * Determine the receive window scaling factor given a target max
 * receive window.
 */
static __inline int
select_rcv_wscale(int space)
{
	int wscale = 0;

	if (space > MAX_RCV_WND)
		space = MAX_RCV_WND;

	if (tcp_do_rfc1323)
		for (; space > 65535 && wscale < 14; space >>= 1, ++wscale) ;
	return wscale;
}

/*
 * Determine the receive window size for a socket.
 */
static unsigned int
select_rcv_wnd(struct socket *so)
{
	struct toedev *dev = TOE_DEV(so);
	struct tom_data *d = TOM_DATA(dev);
	unsigned int wnd;
	unsigned int max_rcv_wnd;

	if (tcp_do_autorcvbuf)
		wnd = tcp_autorcvbuf_max;
	else
		wnd = sbspace(&so->so_rcv);
	
	/* XXX
	 * For receive coalescing to work effectively we need a receive window
	 * that can accomodate a coalesced segment.
	 */	
	if (wnd < MIN_RCV_WND)
		wnd = MIN_RCV_WND; 
	
	/* PR 5138 */
	max_rcv_wnd = (dev->tod_ttid == TOE_ID_CHELSIO_T3B ? 
				    (uint32_t)d->rx_page_size * 23 :
				    MAX_RCV_WND);
	
	return min(wnd, max_rcv_wnd);
}

/*
 * Assign offload parameters to some socket fields.  This code is used by
 * both active and passive opens.
 */
static inline void
init_offload_socket(struct socket *so, struct toedev *dev, unsigned int tid,
    struct l2t_entry *e, struct rtentry *dst, struct toepcb *toep)
{
	struct tcpcb *tp = sototcpcb(so);
	struct t3c_data *td = T3C_DATA(TOM_DATA(dev)->cdev);

	SOCK_LOCK_ASSERT(so);
	
	printf("initializing offload socket\n");
	/*
	 * We either need to fix push frames to work with sbcompress
	 * or we need to add this
	 */
	so->so_snd.sb_flags |= SB_NOCOALESCE;

	tp->t_toe = toep;
	toep->tp_tp = tp;
	toep->tp_toedev = dev;
	
	toep->tp_tid = tid;
	toep->tp_l2t = e;
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(dev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_delack_mode = 0;
	
	toep->tp_mtu_idx = select_mss(td, tp, dst->rt_ifp->if_mtu);
	/*
	 * XXX broken
	 * 
	 */
	tp->rcv_wnd = select_rcv_wnd(so);
        toep->tp_ulp_mode = TOM_TUNABLE(dev, ddp) && !(so->so_options & SO_NO_DDP) &&
		       tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;
	
	reset_wr_list(toep);
	DPRINTF("initialization done\n");
}

/*
 * The next two functions calculate the option 0 value for a socket.
 */
static inline unsigned int
calc_opt0h(struct socket *so, int mtu_idx)
{
	struct tcpcb *tp = sototcpcb(so);
	int wscale = select_rcv_wscale(tp->rcv_wnd);

	return V_NAGLE((tp->t_flags & TF_NODELAY) == 0) |
	    V_KEEP_ALIVE((so->so_options & SO_KEEPALIVE) != 0) | F_TCAM_BYPASS |
	    V_WND_SCALE(wscale) | V_MSS_IDX(mtu_idx);
}

static inline unsigned int
calc_opt0l(struct socket *so, int ulp_mode)
{
	struct tcpcb *tp = sototcpcb(so);
	unsigned int val;
	
	val = V_TOS(SO_TOS(so)) | V_ULP_MODE(ulp_mode) |
	       V_RCV_BUFSIZ(min(tp->rcv_wnd >> 10, (u32)M_RCV_BUFSIZ));

	DPRINTF("opt0l tos=%08x rcv_wnd=%ld opt0l=%08x\n", SO_TOS(so), tp->rcv_wnd, val);
	return (val);
}

static inline unsigned int
calc_opt2(const struct socket *so, struct toedev *dev)
{
	int flv_valid;

	flv_valid = (TOM_TUNABLE(dev, cong_alg) != -1);

	return V_FLAVORS_VALID(flv_valid) |
	    V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif
	
static void
mk_act_open_req(struct socket *so, struct mbuf *m,
    unsigned int atid, const struct l2t_entry *e)
{
	struct cpl_act_open_req *req;
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep = tp->t_toe;
	struct toedev *tdev = TOE_DEV(so);
	
	m_set_priority((struct mbuf *)m, mkprio(CPL_PRIORITY_SETUP, so));
	
	req = mtod(m, struct cpl_act_open_req *);
	m->m_pkthdr.len = m->m_len = sizeof(*req);
	
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
	req->local_port = inp->inp_lport;
	req->peer_port = inp->inp_fport;
	memcpy(&req->local_ip, &inp->inp_laddr, 4);
	memcpy(&req->peer_ip, &inp->inp_faddr, 4);
	DPRINTF("connect smt_idx=%d\n", e->smt_idx);
	req->opt0h = htonl(calc_opt0h(so, toep->tp_mtu_idx) | V_L2T_IDX(e->idx) |
			   V_TX_CHANNEL(e->smt_idx));
	req->opt0l = htonl(calc_opt0l(so, toep->tp_ulp_mode));
	req->params = 0;
	req->opt2 = htonl(calc_opt2(so, tdev));
}


/*
 * Convert an ACT_OPEN_RPL status to an errno.
 */
static int
act_open_rpl_status_to_errno(int status)
{
	switch (status) {
	case CPL_ERR_CONN_RESET:
		return (ECONNREFUSED);
	case CPL_ERR_ARP_MISS:
		return (EHOSTUNREACH);
	case CPL_ERR_CONN_TIMEDOUT:
		return (ETIMEDOUT);
	case CPL_ERR_TCAM_FULL:
		return (ENOMEM);
	case CPL_ERR_CONN_EXIST:
		log(LOG_ERR, "ACTIVE_OPEN_RPL: 4-tuple in use\n");
		return (EADDRINUSE);
	default:
		return (EIO);
	}
}

static void
fail_act_open(struct toepcb *toep, int errno)
{
	struct tcpcb *tp = toep->tp_tp;

	t3_release_offload_resources(toep);
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		cxgb_tcp_drop(tp, errno);
	}
	
#ifdef notyet
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#endif
}

/*
 * Handle active open failures.
 */
static void
active_open_failed(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	struct inpcb *inp;

	INP_INFO_WLOCK(&tcbinfo);
	if (toep->tp_tp == NULL)
		goto done;

	inp = toep->tp_tp->t_inpcb;
	INP_LOCK(inp);

/*
 * Don't handle connection retry for now
 */
#ifdef notyet
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (rpl->status == CPL_ERR_CONN_EXIST &&
	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
			       jiffies + HZ / 2);
	} else
#endif		
		fail_act_open(toep, act_open_rpl_status_to_errno(rpl->status));
	INP_UNLOCK(inp);
done:
	INP_INFO_WUNLOCK(&tcbinfo);

	m_free(m);
}

/*
 * Return whether a failed active open has allocated a TID
 */
static inline int
act_open_has_tid(int status)
{
	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
	       status != CPL_ERR_ARP_MISS;
}

/*
 * Process an ACT_OPEN_RPL CPL message.
 */
static int
do_act_open_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct cpl_act_open_rpl *rpl = cplhdr(m);
	
	if (cdev->type != T3A && act_open_has_tid(rpl->status))
		cxgb_queue_tid_release(cdev, GET_TID(rpl));
	
	active_open_failed(toep, m);
	return (0);
}

/*
 * Handle an ARP failure for an active open.   XXX purge ofo queue
 *
 * XXX badly broken for crossed SYNs as the ATID is no longer valid.
 * XXX crossed SYN errors should be generated by PASS_ACCEPT_RPL which should
 * check SOCK_DEAD or sk->sk_sock.  Or maybe generate the error here but don't
 * free the atid.  Hmm.
 */
#ifdef notyet
static void
act_open_req_arp_failure(struct t3cdev *dev, struct mbuf *m)
{
	struct toepcb *toep = m_get_toep(m);
	struct tcpcb *tp = toep->tp_tp;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = toeptoso(toep);
	
	INP_LOCK(inp);
	if (tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED) {
		fail_act_open(so, EHOSTUNREACH);
		printf("freeing %p\n", m);
		
		m_free(m);
	}
	INP_UNLOCK(inp);
}
#endif
/*
 * Send an active open request.
 */
int
t3_connect(struct toedev *tdev, struct socket *so,
    struct rtentry *rt, struct sockaddr *nam)
{
	struct mbuf *m;
	struct l2t_entry *e;
	struct tom_data *d = TOM_DATA(tdev);
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct toepcb *toep; /* allocated by init_offload_socket */
		
	int atid;

	toep = toepcb_alloc();
	if (toep == NULL)
		goto out_err;
	
	if ((atid = cxgb_alloc_atid(d->cdev, d->client, toep)) < 0)
		goto out_err;
	
	e = t3_l2t_get(d->cdev, rt, rt->rt_ifp, nam);
	if (!e)
		goto free_tid;

	INP_LOCK_ASSERT(inp);
	m = m_gethdr(MT_DATA, M_WAITOK);
	
#if 0	
	m->m_toe.mt_toepcb = tp->t_toe;
	set_arp_failure_handler((struct mbuf *)m, act_open_req_arp_failure);
#endif
	SOCK_LOCK(so);
	
	init_offload_socket(so, tdev, atid, e, rt, toep);
	
	install_offload_ops(so);
	
	mk_act_open_req(so, m, atid, e);
	SOCK_UNLOCK(so);
	
	soisconnecting(so);
	toep = tp->t_toe;
	m_set_toep(m, tp->t_toe);
	
	printf("sending off request\n");
	
	toep->tp_state = TCPS_SYN_SENT;
	l2t_send(d->cdev, (struct mbuf *)m, e);

	if (toep->tp_ulp_mode)
		t3_enable_ddp(so, 0);
	return 	(0);
	
free_tid:
	printf("failing connect - free atid\n");
	
	free_atid(d->cdev, atid);
out_err:
	printf("return ENOMEM\n");
       return (ENOMEM);
}

/*
 * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
 * not send multiple ABORT_REQs for the same connection and also that we do
 * not try to send a message after the connection has closed.  Returns 1 if
 * an ABORT_REQ wasn't generated after all, 0 otherwise.
 */
static void
t3_send_reset(struct toepcb *toep)
{
	
	struct cpl_abort_req *req;
	unsigned int tid = toep->tp_tid;
	int mode = CPL_ABORT_SEND_RST;
	struct tcpcb *tp = toep->tp_tp;
	struct toedev *tdev = toep->tp_toedev;
	struct socket *so = NULL;
	struct mbuf *m;
	
	if (tp) {
		INP_LOCK_ASSERT(tp->t_inpcb);
		so = toeptoso(toep);
	}
	
	if (__predict_false((toep->tp_flags & TP_ABORT_SHUTDOWN) ||
		tdev == NULL))
		return;
	toep->tp_flags |= (TP_ABORT_RPL_PENDING|TP_ABORT_SHUTDOWN);
	
	/* Purge the send queue so we don't send anything after an abort. */
	if (so)
		sbflush(&so->so_snd);
	if ((toep->tp_flags & TP_CLOSE_CON_REQUESTED) && is_t3a(tdev))
		mode |= CPL_ABORT_POST_CLOSE_REQ;

	m = m_gethdr_nofail(sizeof(*req));
	m_set_priority(m, mkprio(CPL_PRIORITY_DATA, so));
	set_arp_failure_handler(m, abort_arp_failure);

	req = mtod(m, struct cpl_abort_req *);
	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
	req->wr.wr_lo = htonl(V_WR_TID(tid));
	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
	req->rsvd0 = tp ? htonl(tp->snd_nxt) : 0;
	req->rsvd1 = !(toep->tp_flags & TP_DATASENT);
	req->cmd = mode;
	if (tp && (tp->t_state == TCPS_SYN_SENT))
		mbufq_tail(&toep->out_of_order_queue, m);	// defer
	else
		l2t_send(TOEP_T3C_DEV(toep), m, toep->tp_l2t);
}

static int
t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
{
	struct inpcb *inp;
	int error, optval;
	
	if (sopt->sopt_name == IP_OPTIONS)
		return (ENOPROTOOPT);

	if (sopt->sopt_name != IP_TOS)
		return (EOPNOTSUPP);
	
	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);

	if (error)
		return (error);

	if (optval > IPTOS_PREC_CRITIC_ECP && !suser(curthread))
		return (EPERM);

	inp = sotoinpcb(so);
	inp->inp_ip_tos = optval;

	t3_set_tos(so);
	
	return (0);
}

static int
t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err = 0;
	size_t copied;

	if (sopt->sopt_name != TCP_CONGESTION &&
	    sopt->sopt_name != TCP_NODELAY)
		return (EOPNOTSUPP);
	
	if (sopt->sopt_name == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];
		int optlen = sopt->sopt_valsize;
		struct tcpcb *tp;
		
		if (optlen < 1)
			return (EINVAL);
		
		err = copyinstr(sopt->sopt_val, name, 
		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
		if (err)
			return (err);
		if (copied < 1)
			return (EINVAL);

		tp = sototcpcb(so);
		/*
		 * XXX I need to revisit this
		 */
		if ((err = t3_set_cong_control(so, name)) == 0) {
#ifdef notyet
			tp->t_cong_control = strdup(name, M_CXGB);
#endif			
		} else
			return (err);
	} else {
		int optval, oldval;
		struct inpcb *inp;
		struct tcpcb *tp;
		
		err = sooptcopyin(sopt, &optval, sizeof optval,
		    sizeof optval);

		if (err)
			return (err);

		inp = sotoinpcb(so);
		tp = intotcpcb(inp);
		    
		INP_LOCK(inp);
		
		oldval = tp->t_flags;
		if (optval)
			tp->t_flags |= TF_NODELAY;
		else
			tp->t_flags &= ~TF_NODELAY;
		INP_UNLOCK(inp);
		
		if (oldval != tp->t_flags)
			t3_set_nagle(so);

	}

	return (0);
}

static int
t3_ctloutput(struct socket *so, struct sockopt *sopt)
{
	int err;

	if (sopt->sopt_level != IPPROTO_TCP) 
		err =  t3_ip_ctloutput(so, sopt);
	else
		err = t3_tcp_ctloutput(so, sopt);

	if (err != EOPNOTSUPP)
		return (err);

	return tcp_ctloutput(so, sopt);
}

/*
 * Process new data received for a connection.
 */
static void
new_rx_data(struct toepcb *toep, struct mbuf *m)
{
	struct cpl_rx_data *hdr = cplhdr(m);
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so = toeptoso(toep);
	int len = be16toh(hdr->len);

	INP_LOCK(tp->t_inpcb);
	
#ifdef notyet	
	if (__predict_false(sk_no_receive(sk))) {
		handle_excess_rx(so, skb);
		return;
	}

	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
		handle_ddp_data(so, skb);

	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
	TCP_SKB_CB(skb)->flags = 0;
	skb_ulp_mode(skb) = 0;                    /* for iSCSI */
#endif
#if VALIDATE_SEQ
	if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
		printk(KERN_ERR
		       "%s: TID %u: Bad sequence number %u, expected %u\n",
		       TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
		       tp->rcv_nxt);
		__kfree_skb(skb);
		return;
	}
#endif
	m_adj(m, sizeof(*hdr));

#ifdef notyet
	/*
	 * We don't handle urgent data yet
	 */
	if (__predict_false(hdr->urg))
		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
		     tp->urg_seq - tp->rcv_nxt < skb->len))
		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
							 tp->rcv_nxt];
#endif	
	if (__predict_false(hdr->dack_mode != toep->tp_delack_mode)) {
		toep->tp_delack_mode = hdr->dack_mode;
		toep->tp_delack_seq = tp->rcv_nxt;
	}

	DPRINTF("appending mbuf=%p pktlen=%d m_len=%d len=%d\n", m, m->m_pkthdr.len, m->m_len, len);
	
	if (len < m->m_pkthdr.len)
		m->m_pkthdr.len = m->m_len = len;

	tp->rcv_nxt += m->m_pkthdr.len;
	tp->t_rcvtime = ticks;
	toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
	T3_TRACE2(TIDTB(sk),
		  "new_rx_data: seq 0x%x len %u",
		  TCP_SKB_CB(skb)->seq, skb->len);
#endif
	SOCKBUF_LOCK(&so->so_rcv);
	if (sb_notify(&so->so_rcv))
		DPRINTF("rx_data so=%p flags=0x%x len=%d\n", so, so->so_rcv.sb_flags, m->m_pkthdr.len);

	sbappend_locked(&so->so_rcv, m);
	KASSERT(so->so_rcv.sb_cc < so->so_rcv.sb_mbmax,

	    ("so=%p, data contents exceed mbmax, sb_cc=%d sb_mbmax=%d",
		so, so->so_rcv.sb_cc, so->so_rcv.sb_mbmax));
	
	INP_UNLOCK(tp->t_inpcb);
	DPRINTF("sb_cc=%d sb_mbcnt=%d\n",
	    so->so_rcv.sb_cc, so->so_rcv.sb_mbcnt);
	    
	if (__predict_true((so->so_state & SS_NOFDREF) == 0))
		sorwakeup_locked(so);
	else
		SOCKBUF_UNLOCK(&so->so_rcv);
}

/*
 * Handler for RX_DATA CPL messages.
 */
static int
do_rx_data(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("rx_data len=%d\n", m->m_pkthdr.len);
	
	new_rx_data(toep, m);

	return (0);
}

static void
new_rx_data_ddp(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_data_ddp *hdr;
	unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;

#ifdef notyet
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(so, m);
		return;
	}
#endif
	tp = sototcpcb(so);
	q = &toep->tp_ddp_state;
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->u.ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "new_rx_data_ddp: tp->rcv_nxt 0x%x cur_offset %u "
		  "hdr seq 0x%x len %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ntohl(hdr->seq),
		  ntohs(hdr->len), G_DDP_OFFSET(ddp_report));
	T3_TRACE1(TIDTB(sk),
		  "new_rx_data_ddp: ddp_report 0x%x",
		  ddp_report);
#endif

	ddp_len = ntohs(hdr->len);
	rcv_nxt = ntohl(hdr->seq) + ddp_len;

	/*
	 * Overload to store old rcv_next
	 */
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt = rcv_nxt;

	/*
	 * Store the length in m->m_len.  We are changing the meaning of
	 * m->m_len here, we need to be very careful that nothing from now on
	 * interprets ->len of this packet the usual way.
	 */
	m->m_len = tp->rcv_nxt - m->m_pkthdr.csum_data;

	/*
	 * Figure out where the new data was placed in the buffer and store it
	 * in when.  Assumes the buffer offset starts at 0, consumer needs to
	 * account for page pod's pg_offset.
	 */
	end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
#ifdef notyet	
	TCP_SKB_CB(skb)->when = end_offset - skb->len;

	/*
	 * We store in mac.raw the address of the gather list where the
	 * placement happened.
	 */
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif	
	bsp->cur_offset = end_offset;

	/*
	 * Bit 0 of flags stores whether the DDP buffer is completed.
	 * Note that other parts of the code depend on this being in bit 0.
	 */
	if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
#if 0		
		TCP_SKB_CB(skb)->flags = 0;  /* potential spurious completion */
#endif		
		panic("spurious ddp completion");
	} else {
		m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
		if (m->m_pkthdr.csum_flags && !(bsp->flags & DDP_BF_NOFLIP))
			q->cur_buf ^= 1;                     /* flip buffers */
	}

	if (bsp->flags & DDP_BF_NOCOPY) {
		m->m_pkthdr.csum_flags |= (bsp->flags & DDP_BF_NOCOPY);
		bsp->flags &= ~DDP_BF_NOCOPY;
	}

	if (ddp_report & F_DDP_PSH)
		m->m_pkthdr.csum_flags |= DDP_BF_PSH;
	
	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet	
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif	
}

#define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
		 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
		 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
		 F_DDP_INVALID_PPOD)

/*
 * Handler for RX_DATA_DDP CPL messages.
 */
static int
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);
	const struct cpl_rx_data_ddp *hdr = cplhdr(m);

	VALIDATE_SOCK(so);

	if (__predict_false(ntohl(hdr->ddpvld_status) & DDP_ERR)) {
		log(LOG_ERR, "RX_DATA_DDP for TID %u reported error 0x%x\n",
		       GET_TID(hdr), G_DDP_VALID(ntohl(hdr->ddpvld_status)));
		return CPL_RET_BUF_DONE;
	}
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif	
	new_rx_data_ddp(so, m);
	return (0);
}

static void
process_ddp_complete(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct ddp_state *q;
	struct ddp_buf_state *bsp;
	struct cpl_rx_ddp_complete *hdr;
	unsigned int ddp_report, buf_idx, when;

#ifdef notyet	
	if (unlikely(sk_no_receive(sk))) {
		handle_excess_rx(sk, skb);
		return;
	}
#endif
	q = &toep->tp_ddp_state; 
	hdr = cplhdr(m);
	ddp_report = ntohl(hdr->ddp_report);
	buf_idx = (ddp_report >> S_DDP_BUF_IDX) & 1;
	bsp = &q->buf_state[buf_idx];

	when = bsp->cur_offset;
	m->m_len = G_DDP_OFFSET(ddp_report) - when;

#ifdef T3_TRACE
	T3_TRACE5(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report 0x%x offset %u, len %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report), skb->len);
#endif

	bsp->cur_offset += m->m_len;

	if (!(bsp->flags & DDP_BF_NOFLIP))
		q->cur_buf ^= 1;                     /* flip buffers */

#ifdef T3_TRACE
	T3_TRACE4(TIDTB(sk),
		  "process_ddp_complete: tp->rcv_nxt 0x%x cur_offset %u "
		  "ddp_report %u offset %u",
		  tp->rcv_nxt, bsp->cur_offset, ddp_report,
		   G_DDP_OFFSET(ddp_report));
#endif
#if 0	
	skb->mac.raw = (unsigned char *)bsp->gl;
#endif	
	m->m_pkthdr.csum_flags = (bsp->flags & DDP_BF_NOCOPY) | 1;
	if (bsp->flags & DDP_BF_NOCOPY)
		bsp->flags &= ~DDP_BF_NOCOPY;
	m->m_pkthdr.csum_data = tp->rcv_nxt;
	tp->rcv_nxt += m->m_len;

	tp->t_rcvtime = ticks;
	sbappendstream_locked(&so->so_rcv, m);
#ifdef notyet	
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, 0);
#endif	
}

/*
 * Handler for RX_DDP_COMPLETE CPL messages.
 */
static int
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);
#if 0
	skb->h.th = tcphdr_skb->h.th;
#endif	
	process_ddp_complete(so, m);
	return (0);
}

/*
 * Move a socket to TIME_WAIT state.  We need to make some adjustments to the
 * socket state before calling tcp_time_wait to comply with its expectations.
 */
static void
enter_timewait(struct socket *so)
{
	struct tcpcb *tp = sototcpcb(so);

	INP_LOCK_ASSERT(tp->t_inpcb);
	/*
	 * Bump rcv_nxt for the peer FIN.  We don't do this at the time we
	 * process peer_close because we don't want to carry the peer FIN in
	 * the socket's receive queue and if we increment rcv_nxt without
	 * having the FIN in the receive queue we'll confuse facilities such
	 * as SIOCINQ.
	 */
	tp->rcv_nxt++;

	tp->ts_recent_age = 0;	     /* defeat recycling */
	tp->t_srtt = 0;                        /* defeat tcp_update_metrics */
	tcp_twstart(tp);
}

/*
 * Handle a peer FIN.
 */
static void
do_peer_fin(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	int keep = 0, dead = (so->so_state & SS_NOFDREF);

	DPRINTF("do_peer_fin state=%d dead=%d\n", tp->t_state, !!dead);
	
#ifdef T3_TRACE
	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif

	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING)) {
		printf("abort_pending set\n");
		
		goto out;
	}
	
#ifdef notyet
	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
		keep = handle_peer_close_data(so, skb);
		if (keep < 0)
			return;
	}
	sk->sk_shutdown |= RCV_SHUTDOWN;
	sock_set_flag(so, SOCK_DONE);
#endif
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	if (TCPS_HAVERCVDFIN(tp->t_state) == 0) 
		socantrcvmore(so);
	switch (tp->t_state) {
	case TCPS_SYN_RECEIVED:
	    tp->t_starttime = ticks;
	/* FALLTHROUGH */ 
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_CLOSE_WAIT;
		break;
	case TCPS_FIN_WAIT_1:
		tp->t_state = TCPS_CLOSING;
		break;
	case TCPS_FIN_WAIT_2:
		/*
		 * If we've sent an abort_req we must have sent it too late,
		 * HW will send us a reply telling us so, and this peer_close
		 * is really the last message for this connection and needs to
		 * be treated as an abort_rpl, i.e., transition the connection
		 * to TCP_CLOSE (note that the host stack does this at the
		 * time of generating the RST but we must wait for HW).
		 * Otherwise we enter TIME_WAIT.
		 */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);
		} else
			enter_timewait(so);
		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid, tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);					

	if (!dead) {
		DPRINTF("waking up waiters on %p rcv_notify=%d flags=0x%x\n", so, sb_notify(&so->so_rcv), so->so_rcv.sb_flags);
		
		sorwakeup(so);
		sowwakeup(so);
		wakeup(&so->so_timeo);
#ifdef notyet		
		sk->sk_state_change(sk);

		/* Do not send POLL_HUP for half duplex close. */
		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
		    sk->sk_state == TCP_CLOSE)
			sk_wake_async(so, 1, POLL_HUP);
		else
			sk_wake_async(so, 1, POLL_IN);
#endif
	}
out:
	if (!keep)
		m_free(m);
}

/*
 * Handler for PEER_CLOSE CPL messages.
 */
static int
do_peer_close(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	do_peer_fin(so, m);
	return (0);
}

static void
process_close_con_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct cpl_close_con_rpl *rpl = cplhdr(m);
	struct toepcb *toep = tp->t_toe;

	tp->snd_una = ntohl(rpl->snd_nxt) - 1;  /* exclude FIN */

	DPRINTF("process_close_con_rpl(%p) state=%d dead=%d\n", so, tp->t_state,
	    !!(so->so_state & SS_NOFDREF));
	if (!is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_ABORT_RPL_PENDING))
		goto out;
	
	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(tp->t_inpcb);
	switch (tp->t_state) {
	case TCPS_CLOSING:              /* see FIN_WAIT2 case in do_peer_fin */
		t3_release_offload_resources(toep);
		if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
			tp = tcp_close(tp);

		} else
			enter_timewait(so);
		break;
	case TCPS_LAST_ACK:
		/*
		 * In this state we don't care about pending abort_rpl.
		 * If we've sent abort_req it was post-close and was sent too
		 * late, this close_con_rpl is the actual last message.
		 */
		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
		break;
	case TCPS_FIN_WAIT_1:
#ifdef notyet
		dst_confirm(sk->sk_dst_cache);
#endif
		soisdisconnecting(so);
		
		if ((so->so_state & SS_NOFDREF) == 0) {
			/*
			 * Wake up lingering close
			 */
			sowwakeup(so);
			sorwakeup(so);
			wakeup(&so->so_timeo);
		} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
		    (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
			tp = cxgb_tcp_drop(tp, 0);
		}

		break;
	default:
		log(LOG_ERR,
		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
		       TOE_DEV(so)->tod_name, toep->tp_tid,
		       tp->t_state);
	}
	INP_INFO_WUNLOCK(&tcbinfo);
	if (tp)
		INP_UNLOCK(tp->t_inpcb);
out:
	m_free(m);
}

/*
 * Handler for CLOSE_CON_RPL CPL messages.
 */
static int
do_close_con_rpl(struct t3cdev *cdev, struct mbuf *m,
			    void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so = toeptoso(toep);

	VALIDATE_SOCK(so);

	process_close_con_rpl(so, m);
	return (0);
}

/*
 * Process abort replies.  We only process these messages if we anticipate
 * them as the coordination between SW and HW in this area is somewhat lacking
 * and sometimes we get ABORT_RPLs after we are done with the connection that
 * originated the ABORT_REQ.
 */
static void
process_abort_rpl(struct socket *so, struct mbuf *m)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
#ifdef T3_TRACE
	T3_TRACE1(TIDTB(sk),
		  "process_abort_rpl: GTS rpl pending %d",
		  sock_flag(sk, ABORT_RPL_PENDING));
#endif
	INP_LOCK(tp->t_inpcb);
	
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		/*
		 * XXX panic on tcpdrop
		 */
		if (!(toep->tp_flags & TP_ABORT_RPL_RCVD) && !is_t3a(TOE_DEV(so)))
			toep->tp_flags |= TP_ABORT_RPL_RCVD;
		else {
			toep->tp_flags &= ~(TP_ABORT_RPL_RCVD|TP_ABORT_RPL_PENDING);
			if (!(toep->tp_flags & TP_ABORT_REQ_RCVD) ||
			    !is_t3a(TOE_DEV(so))) {
				if (toep->tp_flags & TP_ABORT_REQ_RCVD)
					panic("TP_ABORT_REQ_RCVD set");
				INP_INFO_WLOCK(&tcbinfo);
				INP_LOCK(tp->t_inpcb);
				t3_release_offload_resources(toep);
				tp = tcp_close(tp);
				INP_INFO_WUNLOCK(&tcbinfo);
			}
		}
	}
	if (tp)
		INP_UNLOCK(tp->t_inpcb);

	m_free(m);
}

/*
 * Handle an ABORT_RPL_RSS CPL message.
 */
static int
do_abort_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct socket *so;
	struct cpl_abort_rpl_rss *rpl = cplhdr(m);
	struct toepcb *toep;
	
	/*
	 * Ignore replies to post-close aborts indicating that the abort was
	 * requested too late.  These connections are terminated when we get
	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
	 * arrives the TID is either no longer used or it has been recycled.
	 */
	if (rpl->status == CPL_ERR_ABORT_FAILED) {
discard:
		m_free(m);
		return (0);
	}

	toep = (struct toepcb *)ctx;
	
        /*
	 * Sometimes we've already closed the socket, e.g., a post-close
	 * abort races with ABORT_REQ_RSS, the latter frees the socket
	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
	 * but FW turns the ABORT_REQ into a regular one and so we get
	 * ABORT_RPL_RSS with status 0 and no socket.  Only on T3A.
	 */
	if (!toep)
		goto discard;

	if (toep->tp_tp == NULL) {
		printf("removing tid for abort\n");
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		if (toep->tp_l2t) 
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		toepcb_release(toep);
		goto discard;
	}
	
	printf("toep=%p\n", toep);
	printf("tp=%p\n", toep->tp_tp);

	so = toeptoso(toep); /* <- XXX panic */
	toepcb_hold(toep);
	process_abort_rpl(so, m);
	toepcb_release(toep);
	return (0);
}

/*
 * Convert the status code of an ABORT_REQ into a Linux error code.  Also
 * indicate whether RST should be sent in response.
 */
static int
abort_status_to_errno(struct socket *so, int abort_reason, int *need_rst)
{
	struct tcpcb *tp = sototcpcb(so);

	switch (abort_reason) {
	case CPL_ERR_BAD_SYN:
#if 0		
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);	// fall through
#endif		
	case CPL_ERR_CONN_RESET:
		// XXX need to handle SYN_RECV due to crossed SYNs
		return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
	case CPL_ERR_XMIT_TIMEDOUT:
	case CPL_ERR_PERSIST_TIMEDOUT:
	case CPL_ERR_FINWAIT2_TIMEDOUT:
	case CPL_ERR_KEEPALIVE_TIMEDOUT:
#if 0		
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
#endif		
		return (ETIMEDOUT);
	default:
		return (EIO);
	}
}

static inline void
set_abort_rpl_wr(struct mbuf *m, unsigned int tid, int cmd)
{
	struct cpl_abort_rpl *rpl = cplhdr(m);

	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
	m->m_len = m->m_pkthdr.len = sizeof(*rpl);
	
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
	rpl->cmd = cmd;
}

static void
send_deferred_abort_rpl(struct toedev *tdev, struct mbuf *m)
{
	struct mbuf *reply_mbuf;
	struct cpl_abort_req_rss *req = cplhdr(m);

	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_abort_rpl));
	m_set_priority(m, CPL_PRIORITY_DATA);
	m->m_len = m->m_pkthdr.len = sizeof(struct cpl_abort_rpl);
	set_abort_rpl_wr(reply_mbuf, GET_TID(req), req->status);
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
	m_free(m);
}

/*
 * Returns whether an ABORT_REQ_RSS message is a negative advice.
 */
static inline int
is_neg_adv_abort(unsigned int status)
{
	return status == CPL_ERR_RTX_NEG_ADVICE ||
	    status == CPL_ERR_PERSIST_NEG_ADVICE;
}

static void
send_abort_rpl(struct mbuf *m, struct toedev *tdev, int rst_status)
{
	struct mbuf  *reply_mbuf;
	struct cpl_abort_req_rss *req = cplhdr(m);

	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);

	if (!reply_mbuf) {
		/* Defer the reply.  Stick rst_status into req->cmd. */
		req->status = rst_status;
		t3_defer_reply(m, tdev, send_deferred_abort_rpl);
		return;
	}

	m_set_priority(reply_mbuf, CPL_PRIORITY_DATA);
	set_abort_rpl_wr(reply_mbuf, GET_TID(req), rst_status);
	m_free(m);

	/*
	 * XXX need to sync with ARP as for SYN_RECV connections we can send
	 * these messages while ARP is pending.  For other connection states
	 * it's not a problem.
	 */
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
}

#ifdef notyet
static void
cleanup_syn_rcv_conn(struct socket *child, struct socket *parent)
{
	UNIMPLEMENTED();
#ifdef notyet	
	struct request_sock *req = child->sk_user_data;

	inet_csk_reqsk_queue_removed(parent, req);
	synq_remove(tcp_sk(child));
	__reqsk_free(req);
	child->sk_user_data = NULL;
#endif
}


/*
 * Performs the actual work to abort a SYN_RECV connection.
 */
static void
do_abort_syn_rcv(struct socket *child, struct socket *parent)
{
	struct tcpcb *parenttp = sototcpcb(parent);
	struct tcpcb *childtp = sototcpcb(child);

	/*
	 * If the server is still open we clean up the child connection,
	 * otherwise the server already did the clean up as it was purging
	 * its SYN queue and the skb was just sitting in its backlog.
	 */
	if (__predict_false(parenttp->t_state == TCPS_LISTEN)) {
		cleanup_syn_rcv_conn(child, parent);
		INP_INFO_WLOCK(&tcbinfo);
		INP_LOCK(childtp->t_inpcb);
		t3_release_offload_resources(childtp->t_toe);
		childtp = tcp_close(childtp);
		INP_INFO_WUNLOCK(&tcbinfo);
		if (childtp)
			INP_UNLOCK(childtp->t_inpcb);
	}
}
#endif

/*
 * Handle abort requests for a SYN_RECV connection.  These need extra work
 * because the socket is on its parent's SYN queue.
 */
static int
abort_syn_rcv(struct socket *so, struct mbuf *m)
{
	UNIMPLEMENTED();
#ifdef notyet	
	struct socket *parent;
	struct toedev *tdev = TOE_DEV(so);
	struct t3cdev *cdev = TOM_DATA(tdev)->cdev;
	struct socket *oreq = so->so_incomp;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;

	if (!oreq)
		return -1;        /* somehow we are not on the SYN queue */

	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, oreq->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	do_abort_syn_rcv(so, parent);
	send_abort_rpl(m, tdev, CPL_ABORT_NO_RST);
	SOCK_UNLOCK(parent);
#endif
	return (0);
}

/*
 * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
 * request except that we need to reply to it.
 */
static void
process_abort_req(struct socket *so, struct mbuf *m, struct toedev *tdev)
{
	int rst_status = CPL_ABORT_NO_RST;
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK(tp->t_inpcb);
	if ((toep->tp_flags & TP_ABORT_REQ_RCVD) == 0) {
		toep->tp_flags |= (TP_ABORT_REQ_RCVD|TP_ABORT_SHUTDOWN);
		m_free(m);
		goto skip;
	}

	toep->tp_flags &= ~TP_ABORT_REQ_RCVD;
	/*
	 * Three cases to consider:
	 * a) We haven't sent an abort_req; close the connection.
	 * b) We have sent a post-close abort_req that will get to TP too late
	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
	 *    be ignored and the connection should be closed now.
	 * c) We have sent a regular abort_req that will get to TP too late.
	 *    That will generate an abort_rpl with status 0, wait for it.
	 */
	if (((toep->tp_flags & TP_ABORT_RPL_PENDING) == 0) ||
	    (is_t3a(TOE_DEV(so)) && (toep->tp_flags & TP_CLOSE_CON_REQUESTED))) {
		so->so_error = abort_status_to_errno(so, req->status,
		    &rst_status);
#if 0	
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
#endif		
		/*
		 * SYN_RECV needs special processing.  If abort_syn_rcv()
		 * returns 0 is has taken care of the abort.
		 */
		if ((tp->t_state == TCPS_SYN_RECEIVED) && !abort_syn_rcv(so, m))
			goto skip;

		t3_release_offload_resources(toep);
		tp = tcp_close(tp);
	}
	if (tp)
		INP_UNLOCK(tp->t_inpcb);	
	send_abort_rpl(m, tdev, rst_status);
	return;
	
skip:
	INP_UNLOCK(tp->t_inpcb);		
}

/*
 * Handle an ABORT_REQ_RSS CPL message.
 */
static int
do_abort_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	const struct cpl_abort_req_rss *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct socket *so;
	struct inpcb *inp;
	
	if (is_neg_adv_abort(req->status)) {
		m_free(m);
		return (0);
	}

	printf("aborting tid=%d\n", toep->tp_tid);
	
	if ((toep->tp_flags & (TP_SYN_RCVD|TP_ABORT_REQ_RCVD)) == TP_SYN_RCVD) {
		cxgb_remove_tid(cdev, toep, toep->tp_tid);
		toep->tp_flags |= TP_ABORT_REQ_RCVD;
		printf("sending abort rpl\n");
		
		send_abort_rpl(m, toep->tp_toedev, CPL_ABORT_NO_RST);
		printf("sent\n");
		if (toep->tp_l2t) 
			l2t_release(L2DATA(cdev), toep->tp_l2t);

		/*
		 *  Unhook
		 */
		toep->tp_tp->t_toe = NULL;
		toep->tp_tp->t_flags &= ~TF_TOE;
		toep->tp_tp = NULL;
		/*
		 * XXX need to call syncache_chkrst - but we don't
		 * have a way of doing that yet
		 */
		toepcb_release(toep);
		printf("abort for unestablished connection :-(\n");
		return (0);
	}
	if (toep->tp_tp == NULL) {
		printf("disconnected toepcb\n");
		/* should be freed momentarily */
		return (0);
	}

	so = toeptoso(toep);
	inp = sotoinpcb(so);
	
	VALIDATE_SOCK(so);
	toepcb_hold(toep);
	INP_INFO_WLOCK(&tcbinfo);
	process_abort_req(so, m, TOE_DEV(so));
	INP_INFO_WUNLOCK(&tcbinfo);
	toepcb_release(toep);
	return (0);
}
#ifdef notyet
static void
pass_open_abort(struct socket *child, struct socket *parent, struct mbuf *m)
{
	struct toedev *tdev = TOE_DEV(parent);

	do_abort_syn_rcv(child, parent);
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3) {
		struct cpl_pass_accept_rpl *rpl = cplhdr(m);

		rpl->opt0h = htonl(F_TCAM_BYPASS);
		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	} else
		m_free(m);
}
#endif
static void
handle_pass_open_arp_failure(struct socket *so, struct mbuf *m)
{
	UNIMPLEMENTED();
	
#ifdef notyet	
	struct t3cdev *cdev;
	struct socket *parent;
	struct socket *oreq;
	struct t3c_tid_entry *t3c_stid;
	struct tid_info *t;
	struct tcpcb *otp, *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	/*
	 * If the connection is being aborted due to the parent listening
	 * socket going away there's nothing to do, the ABORT_REQ will close
	 * the connection.
	 */
	if (toep->tp_flags & TP_ABORT_RPL_PENDING) {
		m_free(m);
		return;
	}

	oreq = so->so_incomp;
	otp = sototcpcb(oreq);
	
	cdev = T3C_DEV(so);
	t = &(T3C_DATA(cdev))->tid_maps;
	t3c_stid = lookup_stid(t, otp->ts_recent);
	parent = ((struct listen_ctx *)t3c_stid->ctx)->lso;

	SOCK_LOCK(parent);
	pass_open_abort(so, parent, m);
	SOCK_UNLOCK(parent);
#endif	
}

/*
 * Handle an ARP failure for a CPL_PASS_ACCEPT_RPL.  This is treated similarly
 * to an ABORT_REQ_RSS in SYN_RECV as both events need to tear down a SYN_RECV
 * connection.
 */
static void
pass_accept_rpl_arp_failure(struct t3cdev *cdev, struct mbuf *m)
{

#ifdef notyet	
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
	BLOG_SKB_CB(skb)->dev = TOE_DEV(skb->sk);
#endif
	handle_pass_open_arp_failure(m_get_socket(m), m);
}

/*
 * Populate a reject CPL_PASS_ACCEPT_RPL WR.
 */
static void
mk_pass_accept_rpl(struct mbuf *reply_mbuf, struct mbuf *req_mbuf)
{
	struct cpl_pass_accept_req *req = cplhdr(req_mbuf);
	struct cpl_pass_accept_rpl *rpl = cplhdr(reply_mbuf);
	unsigned int tid = GET_TID(req);

	m_set_priority(reply_mbuf, CPL_PRIORITY_SETUP);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->peer_ip = req->peer_ip;   // req->peer_ip not overwritten yet
	rpl->opt0h = htonl(F_TCAM_BYPASS);
	rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
	rpl->opt2 = 0;
	rpl->rsvd = rpl->opt2;   /* workaround for HW bug */
}

/*
 * Send a deferred reject to an accept request.
 */
static void
reject_pass_request(struct toedev *tdev, struct mbuf *m)
{
	struct mbuf *reply_mbuf;

	reply_mbuf = m_gethdr_nofail(sizeof(struct cpl_pass_accept_rpl));
	mk_pass_accept_rpl(reply_mbuf, m);
	cxgb_ofld_send(TOM_DATA(tdev)->cdev, reply_mbuf);
	m_free(m);
}

static void
handle_syncache_event(int event, void *arg)
{
	struct toepcb *toep = arg;

	switch (event) {
	case TOE_SC_ENTRY_PRESENT:
		/*
		 * entry already exists - free toepcb
		 * and l2t
		 */
		printf("syncache entry present\n");
		toepcb_release(toep);
		break;
	case TOE_SC_DROP:
		/*
		 * The syncache has given up on this entry
		 * either it timed out, or it was evicted
		 * we need to explicitly release the tid
		 */
		printf("syncache entry dropped\n");
		toepcb_release(toep);		
		break;
	default:
		log(LOG_ERR, "unknown syncache event %d\n", event);
		break;
	}
}

static void
syncache_add_accept_req(struct cpl_pass_accept_req *req, struct socket *lso, struct toepcb *toep)
{
	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	struct inpcb *inp;
	int mss, wsf, sack, ts;

	bzero(&to, sizeof(struct tcpopt));
	inp = sotoinpcb(lso);
	
	/*
	 * Fill out information for entering us into the syncache
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	toep->tp_iss = th.th_seq = req->rcv_isn;
	th.th_flags = TH_SYN;

	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = ntohl(req->rcv_isn);
	
	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;

	DPRINTF("syncache add of %d:%d %d:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port));
	
	mss = req->tcp_options.mss;
	wsf = req->tcp_options.wsf;
	ts = req->tcp_options.tstamp;
	sack = req->tcp_options.sack;
	to.to_mss = mss;
	to.to_wscale = wsf;
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	INP_INFO_WLOCK(&tcbinfo);
	INP_LOCK(inp);
	syncache_offload_add(&inc, &to, &th, inp, &lso, &cxgb_toe_usrreqs, toep);
}


/*
 * Process a CPL_PASS_ACCEPT_REQ message.  Does the part that needs the socket
 * lock held.  Note that the sock here is a listening socket that is not owned
 * by the TOE.
 */
static void
process_pass_accept_req(struct socket *so, struct mbuf *m, struct toedev *tdev,
    struct listen_ctx *lctx)
{
	int rt_flags;
	struct l2t_entry *e;
	struct iff_mac tim;
	struct mbuf *reply_mbuf, *ddp_mbuf = NULL;
	struct cpl_pass_accept_rpl *rpl;
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tom_data *d = TOM_DATA(tdev);
	struct t3cdev *cdev = d->cdev;
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *newtoep;
	struct rtentry *dst;
	struct sockaddr_in nam;
	struct t3c_data *td = T3C_DATA(cdev);

	reply_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
	if (__predict_false(reply_mbuf == NULL)) {
		if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
			t3_defer_reply(m, tdev, reject_pass_request);
		else {
			cxgb_queue_tid_release(cdev, tid);
			m_free(m);
		}
		DPRINTF("failed to get reply_mbuf\n");
		
		goto out;
	}

	if (tp->t_state != TCPS_LISTEN) {
		DPRINTF("socket not in listen state\n");
		
		goto reject;
	}
	
	tim.mac_addr = req->dst_mac;
	tim.vlan_tag = ntohs(req->vlan_tag);
	if (cdev->ctl(cdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
		DPRINTF("rejecting from failed GET_IFF_FROM_MAC\n");
		goto reject;
	}
	
#ifdef notyet
	/*
	 * XXX do route lookup to confirm that we're still listening on this
	 * address
	 */
	if (ip_route_input(skb, req->local_ip, req->peer_ip,
			   G_PASS_OPEN_TOS(ntohl(req->tos_tid)), tim.dev))
		goto reject;
	rt_flags = ((struct rtable *)skb->dst)->rt_flags &
		(RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL);
	dst_release(skb->dst);	// done with the input route, release it
	skb->dst = NULL;
	
	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;
#endif
	/*
	 * XXX
	 */
	rt_flags = RTF_LOCAL;
	if ((rt_flags & RTF_LOCAL) == 0)
		goto reject;
	
	/*
	 * Calculate values and add to syncache
	 */

	newtoep = toepcb_alloc();
	if (newtoep == NULL)
		goto reject;

	bzero(&nam, sizeof(struct sockaddr_in));
	
	nam.sin_len = sizeof(struct sockaddr_in);
	nam.sin_family = AF_INET;
	nam.sin_addr.s_addr =req->peer_ip;
	dst = rtalloc2((struct sockaddr *)&nam, 1, 0);

	if (dst == NULL) {
		printf("failed to find route\n");
		goto reject;
	}
	e = newtoep->tp_l2t = t3_l2t_get(d->cdev, dst, tim.dev,
	    (struct sockaddr *)&nam);
	if (e == NULL) {
		DPRINTF("failed to get l2t\n");
	}
	/*
	 * Point to our listen socket until accept
	 */
	newtoep->tp_tp = tp;
	newtoep->tp_flags = TP_SYN_RCVD;
	newtoep->tp_tid = tid;
	newtoep->tp_toedev = tdev;
	
	printf("inserting tid=%d\n", tid);
	cxgb_insert_tid(cdev, d->client, newtoep, tid);
	SOCK_LOCK(so);
	LIST_INSERT_HEAD(&lctx->synq_head, newtoep, synq_entry);
	SOCK_UNLOCK(so);
	
	
	if (lctx->ulp_mode) {
		ddp_mbuf = m_gethdr(M_NOWAIT, MT_DATA);
		
		if (!ddp_mbuf)
			newtoep->tp_ulp_mode = 0;
		else
			newtoep->tp_ulp_mode = lctx->ulp_mode;
	}

	set_arp_failure_handler(reply_mbuf, pass_accept_rpl_arp_failure);

	DPRINTF("adding request to syn cache\n");

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_hold(newtoep);
	syncache_add_accept_req(req, so, newtoep);

	
	rpl = cplhdr(reply_mbuf);
	reply_mbuf->m_pkthdr.len = reply_mbuf->m_len = sizeof(*rpl);
	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
	rpl->wr.wr_lo = 0;
	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, tid));
	rpl->opt2 = htonl(calc_opt2(so, tdev));
	rpl->rsvd = rpl->opt2;                /* workaround for HW bug */
	rpl->peer_ip = req->peer_ip;	// req->peer_ip is not overwritten

	DPRINTF("accept smt_idx=%d\n", e->smt_idx);
	
	rpl->opt0h = htonl(calc_opt0h(so, select_mss(td, NULL, dst->rt_ifp->if_mtu)) |
	    V_L2T_IDX(e->idx) | V_TX_CHANNEL(e->smt_idx));
	rpl->opt0l_status = htonl(calc_opt0l(so, lctx->ulp_mode) |
				  CPL_PASS_OPEN_ACCEPT);

	DPRINTF("opt0l_status=%08x\n", rpl->opt0l_status);
	
	m_set_priority(reply_mbuf, mkprio(CPL_PRIORITY_SETUP, so));
	
#ifdef DEBUG_PRINT		
	{
		int i;

		DPRINTF("rpl:\n");
		uint32_t *rplbuf = mtod(reply_mbuf, uint32_t *);
		
		for (i = 0; i < sizeof(*rpl)/sizeof(uint32_t); i++)
			DPRINTF("[%d] %08x\n", i, rplbuf[i]);
	}
#endif	

		
	l2t_send(cdev, reply_mbuf, e);
	m_free(m);
#ifdef notyet
	/*
	 * XXX this call path has to be converted to not depend on sockets
	 */
	if (newtoep->tp_ulp_mode) 
		__set_tcb_field(newso, ddp_mbuf, W_TCB_RX_DDP_FLAGS,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_MASK,
				V_TF_DDP_OFF(1) |
				TP_DDP_TIMER_WORKAROUND_VAL, 1);

#endif	
	return;
reject:
	if (tdev->tod_ttid == TOE_ID_CHELSIO_T3)
		mk_pass_accept_rpl(reply_mbuf, m);
	else 
		mk_tid_release(reply_mbuf, NULL, tid);
	cxgb_ofld_send(cdev, reply_mbuf);
	m_free(m);
out:
#if 0
	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
#else
	return;
#endif	
}      

/*
 * Handle a CPL_PASS_ACCEPT_REQ message.
 */
static int
do_pass_accept_req(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct listen_ctx *listen_ctx = (struct listen_ctx *)ctx;
	struct socket *lso = listen_ctx->lso;
	struct tom_data *d = listen_ctx->tom_data;

#if VALIDATE_TID
	struct cpl_pass_accept_req *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	struct tid_info *t = &(T3C_DATA(cdev))->tid_maps;

	if (unlikely(!lsk)) {
		printk(KERN_ERR "%s: PASS_ACCEPT_REQ had unknown STID %lu\n",
		       cdev->name,
		       (unsigned long)((union listen_entry *)ctx -
					t->stid_tab));
		return CPL_RET_BUF_DONE;
	}
	if (unlikely(tid >= t->ntids)) {
		printk(KERN_ERR "%s: passive open TID %u too large\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
	/*
	 * For T3A the current user of the TID may have closed but its last
	 * message(s) may have been backlogged so the TID appears to be still
	 * in use.  Just take the TID away, the connection can close at its
	 * own leisure.  For T3B this situation is a bug.
	 */
	if (!valid_new_tid(t, tid) &&
	    cdev->type != T3A) {
		printk(KERN_ERR "%s: passive open uses existing TID %u\n",
		       cdev->name, tid);
		return CPL_RET_BUF_DONE;
	}
#endif

	process_pass_accept_req(lso, m, &d->tdev, listen_ctx);
	return (0);
}

/*
 * Called when a connection is established to translate the TCP options
 * reported by HW to Linux's native format.
 */
static void
assign_rxopt(struct socket *so, unsigned int opt)
{
	const struct t3c_data *td = T3C_DATA(T3C_DEV(so));
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;

	INP_LOCK_ASSERT(tp->t_inpcb);
	
	toep->tp_mss_clamp = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	tp->t_flags         |= G_TCPOPT_TSTAMP(opt) ? TF_RCVD_TSTMP : 0;
	tp->t_flags         |= G_TCPOPT_SACK(opt) ? TF_SACK_PERMIT : 0;
	tp->t_flags 	    |= G_TCPOPT_WSCALE_OK(opt) ? TF_RCVD_SCALE : 0;
	if (tp->t_flags & TF_RCVD_SCALE)
		tp->rcv_scale = 0;
}

/*
 * Completes some final bits of initialization for just established connections
 * and changes their state to TCP_ESTABLISHED.
 *
 * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
 */
static void
make_established(struct socket *so, u32 snd_isn, unsigned int opt)
{
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
	assign_rxopt(so, opt);
	so->so_proto->pr_ctloutput = t3_ctloutput;
	
#if 0	
	inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif	


	/*
	 * XXX not clear what rcv_wup maps to
	 */
	/*
	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
	 * pass through opt0.
	 */
	if (tp->rcv_wnd > (M_RCV_BUFSIZ << 10))
		toep->tp_rcv_wup -= tp->rcv_wnd - (M_RCV_BUFSIZ << 10);

	dump_toepcb(toep);

#ifdef notyet
/*
 * no clean interface for marking ARP up to date
 */
	dst_confirm(sk->sk_dst_cache);
#endif
	tp->t_state = TCPS_ESTABLISHED;
}

static int
syncache_expand_establish_req(struct cpl_pass_establish *req, struct socket **so, struct toepcb *toep)
{

	struct in_conninfo inc;
	struct tcpopt to;
	struct tcphdr th;
	int mss, wsf, sack, ts;
	struct mbuf *m = NULL;
	const struct t3c_data *td = T3C_DATA(TOM_DATA(toep->tp_toedev)->cdev);
	unsigned int opt;
	
#ifdef MAC
#error	"no MAC support"
#endif	
	
	opt = ntohs(req->tcp_opt);
	
	bzero(&to, sizeof(struct tcpopt));
	
	/*
	 * Fill out information for entering us into the syncache
	 */
	inc.inc_fport = th.th_sport = req->peer_port;
	inc.inc_lport = th.th_dport = req->local_port;
	th.th_seq = req->rcv_isn;
	th.th_flags = TH_ACK;
	
	inc.inc_isipv6 = 0;
	inc.inc_len = 0;
	inc.inc_faddr.s_addr = req->peer_ip;
	inc.inc_laddr.s_addr = req->local_ip;
	
	mss  = td->mtus[G_TCPOPT_MSS(opt)] - 40;
	wsf  = G_TCPOPT_WSCALE_OK(opt);
	ts   = G_TCPOPT_TSTAMP(opt);
	sack = G_TCPOPT_SACK(opt);
	
	to.to_mss = mss;
	to.to_wscale =  G_TCPOPT_SND_WSCALE(opt);
	to.to_flags = (mss ? TOF_MSS : 0) | (wsf ? TOF_SCALE : 0) | (ts ? TOF_TS : 0) | (sack ? TOF_SACKPERM : 0);

	DPRINTF("syncache expand of %d:%d %d:%d mss:%d wsf:%d ts:%d sack:%d\n",
	    ntohl(req->local_ip), ntohs(req->local_port),
	    ntohl(req->peer_ip), ntohs(req->peer_port),
	    mss, wsf, ts, sack);
	return syncache_expand(&inc, &to, &th, so, m);
}


/*
 * Process a CPL_PASS_ESTABLISH message.  XXX a lot of the locking doesn't work
 * if we are in TCP_SYN_RECV due to crossed SYNs
 */
static int
do_pass_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_pass_establish *req = cplhdr(m);
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp;
	struct socket *so, *lso;
	struct t3c_data *td = T3C_DATA(cdev);
	// Complete socket initialization now that we have the SND_ISN
	
	struct toedev *tdev;

	so = lso = toeptoso(toep);
	tdev = toep->tp_toedev;

	SOCK_LOCK(so);
	LIST_REMOVE(toep, synq_entry);
	SOCK_UNLOCK(so);
	
	INP_INFO_WLOCK(&tcbinfo);
	if (!syncache_expand_establish_req(req, &so, toep)) {
		/*
		 * No entry 
		 */
		UNIMPLEMENTED();
	}
	if (so == NULL) {
		/*
		 * Couldn't create the socket
		 */
		UNIMPLEMENTED();
	}

	/*
	 * XXX workaround for lack of syncache drop
	 */
	toepcb_release(toep);
	
	tp = sototcpcb(so);
	INP_LOCK(tp->t_inpcb);
#ifdef notyet	
	so->so_snd.sb_flags |= SB_TOE;
	so->so_rcv.sb_flags |= SB_TOE;
#endif	
	toep->tp_tp = tp;
	toep->tp_flags = 0;
	tp->t_toe = toep;
	reset_wr_list(toep);
	tp->rcv_wnd = select_rcv_wnd(so);
	DPRINTF("rcv_wnd=%ld\n", tp->rcv_wnd);
	install_offload_ops(so);
	
	toep->tp_wr_max = toep->tp_wr_avail = TOM_TUNABLE(tdev, max_wrs);
	toep->tp_wr_unacked = 0;
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));
	toep->tp_ulp_mode = TOM_TUNABLE(tdev, ddp) && !(so->so_options & SO_NO_DDP) &&
	    tp->rcv_wnd >= MIN_DDP_RCV_WIN ? ULP_MODE_TCPDDP : 0;
	toep->tp_qset_idx = 0;
	toep->tp_mtu_idx = select_mss(td, tp, toep->tp_l2t->neigh->rt_ifp->if_mtu);
	
	/*
	 * XXX Cancel any keep alive timer
	 */
	     
	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	INP_INFO_WUNLOCK(&tcbinfo);
	INP_UNLOCK(tp->t_inpcb);
	soisconnected(so);
	
#ifdef notyet
	/*
	 * XXX not sure how these checks map to us
	 */
	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
	}
	/*
	 * The state for the new connection is now up to date.
	 * Next check if we should add the connection to the parent's
	 * accept queue.  When the parent closes it resets connections
	 * on its SYN queue, so check if we are being reset.  If so we
	 * don't need to do anything more, the coming ABORT_RPL will
	 * destroy this socket.  Otherwise move the connection to the
	 * accept queue.
	 *
	 * Note that we reset the synq before closing the server so if
	 * we are not being reset the stid is still open.
	 */
	if (unlikely(!tp->forward_skb_hint)) { // removed from synq
		__kfree_skb(skb);
		goto unlock;
	}
#endif
	m_free(m);

	return (0);
}

/*
 * Fill in the right TID for CPL messages waiting in the out-of-order queue
 * and send them to the TOE.
 */
static void
fixup_and_send_ofo(struct socket *so)
{
	struct mbuf *m;
	struct toedev *tdev = TOE_DEV(so);
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	unsigned int tid = toep->tp_tid;

	printf("fixup_and_send_ofo\n");
	
	INP_LOCK_ASSERT(tp->t_inpcb);
	while ((m = mbufq_dequeue(&toep->out_of_order_queue)) != NULL) {
		/*
		 * A variety of messages can be waiting but the fields we'll
		 * be touching are common to all so any message type will do.
		 */
		struct cpl_close_con_req *p = cplhdr(m);

		p->wr.wr_lo = htonl(V_WR_TID(tid));
		OPCODE_TID(p) = htonl(MK_OPCODE_TID(p->ot.opcode, tid));
		cxgb_ofld_send(TOM_DATA(tdev)->cdev, m);
	}
}

/*
 * Updates socket state from an active establish CPL message.  Runs with the
 * socket lock held.
 */
static void
socket_act_establish(struct socket *so, struct mbuf *m)
{
	struct cpl_act_establish *req = cplhdr(m);
	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
	struct tcpcb *tp = sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	
	if (__predict_false(tp->t_state != TCPS_SYN_SENT))
		log(LOG_ERR, "TID %u expected SYN_SENT, found %d\n",
		    toep->tp_tid, tp->t_state);

	tp->ts_recent_age = ticks;
	tp->irs = tp->rcv_wnd = tp->rcv_nxt = rcv_isn;
	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;

	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
	
	/*
	 * Now that we finally have a TID send any CPL messages that we had to
	 * defer for lack of a TID.
	 */
	if (mbufq_len(&toep->out_of_order_queue))
		fixup_and_send_ofo(so);

	if (__predict_false(so->so_state & SS_NOFDREF)) {
#ifdef notyet
		/*	
		 * XXX 	not clear what should be done here
		 * appears to correspond to sorwakeup_locked
		 */
		sk->sk_state_change(sk);
		sk_wake_async(so, 0, POLL_OUT);
#endif
	}
	m_free(m);
#ifdef notyet
/*
 * XXX assume no write requests permitted while socket connection is
 * incomplete
 */
	/*
	 * Currently the send queue must be empty at this point because the
	 * socket layer does not send anything before a connection is
	 * established.  To be future proof though we handle the possibility
	 * that there are pending buffers to send (either TX_DATA or
	 * CLOSE_CON_REQ).  First we need to adjust the sequence number of the
	 * buffers according to the just learned write_seq, and then we send
	 * them on their way.
	 */
	fixup_pending_writeq_buffers(sk);
	if (t3_push_frames(so, 1))
		sk->sk_write_space(sk);
#endif

	soisconnected(so);
	toep->tp_state = tp->t_state = TCPS_ESTABLISHED;
	tcpstat.tcps_connects++;
				
}

/*
 * Process a CPL_ACT_ESTABLISH message.
 */
static int
do_act_establish(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
	struct cpl_act_establish *req = cplhdr(m);
	unsigned int tid = GET_TID(req);
	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
	struct toepcb *toep = (struct toepcb *)ctx;
	struct tcpcb *tp = toep->tp_tp;
	struct socket *so; 
	struct toedev *tdev;
	struct tom_data *d;
	
	if (tp == NULL) {
		free_atid(cdev, atid);
		return (0);
	}
	
	so = toeptoso(toep);
	tdev = TOE_DEV(so); /* blow up here if link was down */
	d = TOM_DATA(tdev);
	
	INP_LOCK(tp->t_inpcb);
	
	/*
	 * It's OK if the TID is currently in use, the owning socket may have
	 * backlogged its last CPL message(s).  Just take it away.
	 */
	toep->tp_tid = tid;
	toep->tp_tp = tp;
	so_insert_tid(d, so, tid);
	free_atid(cdev, atid);
	toep->tp_qset = G_QNUM(ntohl(m->m_pkthdr.csum_data));

	socket_act_establish(so, m);
	INP_UNLOCK(tp->t_inpcb);
	return (0);
}

/*
 * Process an acknowledgment of WR completion.  Advance snd_una and send the
 * next batch of work requests from the write queue.
 */
static void
wr_ack(struct toepcb *toep, struct mbuf *m)
{
	struct tcpcb *tp = toep->tp_tp;
	struct cpl_wr_ack *hdr = cplhdr(m);
	struct socket *so = toeptoso(toep);
	unsigned int credits = ntohs(hdr->credits);
	u32 snd_una = ntohl(hdr->snd_una);
	int bytes = 0;
	
	DPRINTF("wr_ack: snd_una=%u credits=%d\n", snd_una, credits);

	INP_LOCK(tp->t_inpcb);
	
	toep->tp_wr_avail += credits;
	if (toep->tp_wr_unacked > toep->tp_wr_max - toep->tp_wr_avail)
		toep->tp_wr_unacked = toep->tp_wr_max - toep->tp_wr_avail;

	while (credits) {
		struct mbuf *p = peek_wr(toep);
		DPRINTF("p->credits=%d p->bytes=%d\n", p->m_pkthdr.csum_data, p->m_pkthdr.len) ;
		
		if (__predict_false(!p)) {
			log(LOG_ERR, "%u WR_ACK credits for TID %u with "
			    "nothing pending, state %u\n",
			       credits, toep->tp_tid, tp->t_state);
			break;
		}
		if (__predict_false(credits < p->m_pkthdr.csum_data)) {
#if DEBUG_WR > 1
			struct tx_data_wr *w = cplhdr(p);
#ifdef notyet
			log(LOG_ERR,
			       "TID %u got %u WR credits, need %u, len %u, "
			       "main body %u, frags %u, seq # %u, ACK una %u,"
			       " ACK nxt %u, WR_AVAIL %u, WRs pending %u\n",
			       toep->tp_tid, credits, p->csum, p->len,
			       p->len - p->data_len, skb_shinfo(p)->nr_frags,
			       ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
			       WR_AVAIL(tp), count_pending_wrs(tp) - credits);
#endif			
#endif
			p->m_pkthdr.csum_data -= credits;
			break;
		} else {
			dequeue_wr(toep);
			credits -= p->m_pkthdr.csum_data;
			bytes += p->m_pkthdr.len;
			DPRINTF("done with wr of %d bytes\n", p->m_pkthdr.len);
	
			m_free(p);
		}
	}

#if DEBUG_WR
	check_wr_invariants(tp);
#endif

	if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
#if VALIDATE_SEQ
		struct tom_data *d = TOM_DATA(TOE_DEV(so));

		log(LOG_ERR "%s: unexpected sequence # %u in WR_ACK "
		    "for TID %u, snd_una %u\n", (&d->tdev)->name, snd_una,
		    toep->tp_tid, tp->snd_una);
#endif
		goto out_free;
	}

	if (tp->snd_una != snd_una) {
		tp->snd_una = snd_una;
		tp->ts_recent_age = ticks;
#ifdef notyet
		/*
		 * Keep ARP entry "minty fresh"
		 */
		dst_confirm(sk->sk_dst_cache);
#endif
		if (tp->snd_una == tp->snd_nxt)
			toep->tp_flags &= ~TP_TX_WAIT_IDLE;
	}
	if (bytes) {
		DPRINTF("sbdrop(%d)\n", bytes);
		SOCKBUF_LOCK(&so->so_snd);
		sbdrop_locked(&so->so_snd, bytes);
		sowwakeup_locked(so);
	}
	
	if (so->so_snd.sb_sndptroff < so->so_snd.sb_cc)
		t3_push_frames(so, 0);

out_free:
	INP_UNLOCK(tp->t_inpcb);
	m_free(m);
}

/*
 * Handler for TX_DATA_ACK CPL messages.
 */
static int
do_wr_ack(struct t3cdev *dev, struct mbuf *m, void *ctx)
{
	struct toepcb *toep = (struct toepcb *)ctx;

	DPRINTF("do_wr_ack\n");
	dump_toepcb(toep);
	
	VALIDATE_SOCK(so);

	wr_ack(toep, m);
	return 0;
}


/*
 * Reset a connection that is on a listener's SYN queue or accept queue,
 * i.e., one that has not had a struct socket associated with it.
 * Must be called from process context.
 *
 * Modeled after code in inet_csk_listen_stop().
 */
static void
t3_reset_listen_child(struct socket *child)
{
	struct tcpcb *tp = sototcpcb(child);
	
	t3_send_reset(tp->t_toe);
}

/*
 * Disconnect offloaded established but not yet accepted connections sitting
 * on a server's accept_queue.  We just send an ABORT_REQ at this point and
 * finish off the disconnect later as we may need to wait for the ABORT_RPL.
 */
void
t3_disconnect_acceptq(struct socket *listen_so)
{
	struct socket *so;
	struct tcpcb *tp;

	TAILQ_FOREACH(so, &listen_so->so_comp, so_list) {
		tp = sototcpcb(so);
		
		if (tp->t_flags & TF_TOE) {
			INP_LOCK(tp->t_inpcb);
			t3_reset_listen_child(so);
			INP_UNLOCK(tp->t_inpcb);
		}
		
	}
}

/*
 * Reset offloaded connections sitting on a server's syn queue.  As above
 * we send ABORT_REQ and finish off when we get ABORT_RPL.
 */

void
t3_reset_synq(struct listen_ctx *lctx)
{
	struct toepcb *toep;

	SOCK_LOCK(lctx->lso);
	while (!LIST_EMPTY(&lctx->synq_head)) {
		toep = LIST_FIRST(&lctx->synq_head);
		LIST_REMOVE(toep, synq_entry);
		toep->tp_tp = NULL;
		t3_send_reset(toep);
		cxgb_remove_tid(TOEP_T3C_DEV(toep), toep, toep->tp_tid);
		toepcb_release(toep);
	}
	SOCK_UNLOCK(lctx->lso);
}

void
t3_init_wr_tab(unsigned int wr_len)
{
	int i;

	if (mbuf_wrs[1])     /* already initialized */
		return;

	for (i = 1; i < ARRAY_SIZE(mbuf_wrs); i++) {
		int sgl_len = (3 * i) / 2 + (i & 1);

		sgl_len += 3;
		mbuf_wrs[i] = sgl_len <= wr_len ?
		       	1 : 1 + (sgl_len - 2) / (wr_len - 1);
	}

	wrlen = wr_len * 8;
}

int
t3_init_cpl_io(void)
{
#ifdef notyet
	tcphdr_skb = alloc_skb(sizeof(struct tcphdr), GFP_KERNEL);
	if (!tcphdr_skb) {
		log(LOG_ERR,
		       "Chelsio TCP offload: can't allocate sk_buff\n");
		return -1;
	}
	skb_put(tcphdr_skb, sizeof(struct tcphdr));
	tcphdr_skb->h.raw = tcphdr_skb->data;
	memset(tcphdr_skb->data, 0, tcphdr_skb->len);
#endif

	
	t3tom_register_cpl_handler(CPL_ACT_ESTABLISH, do_act_establish);
	t3tom_register_cpl_handler(CPL_ACT_OPEN_RPL, do_act_open_rpl);
	t3tom_register_cpl_handler(CPL_TX_DMA_ACK, do_wr_ack);
	t3tom_register_cpl_handler(CPL_RX_DATA, do_rx_data);
	t3tom_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
	t3tom_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
	t3tom_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
	t3tom_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
	t3tom_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
	t3tom_register_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl);
	t3tom_register_cpl_handler(CPL_RX_DATA_DDP, do_rx_data_ddp);
	t3tom_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
#ifdef notyet	
	t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
	t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
	t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
#endif
	return (0);
}