/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Neither the name of the Chelsio Corporation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/syslog.h>

#include <machine/bus.h>
#include <machine/cpu.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <cxgb_osdep.h>
#include <sys/mbufq.h>
#include <ulp/tom/cxgb_tcp_offload.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>

#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>

#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);
#define TMP_IOV_MAX	16

#define PG_FRAME	(~PAGE_MASK)

#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
void
t3_init_socket_ops(void)
{
	struct protosw *prp;

	prp = pffindtype(AF_INET, SOCK_STREAM);
	pru_sosend = prp->pr_usrreqs->pru_sosend;
	pru_soreceive = prp->pr_usrreqs->pru_soreceive;
}
struct cxgb_dma_info {
	bus_size_t		cdi_mapped;
	int			cdi_nsegs;
	bus_dma_segment_t	*cdi_segs;
};

static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
	struct cxgb_dma_info *cdi = arg;

	cdi->cdi_mapped = mapsize;
	cdi->cdi_nsegs = nsegs;
	cdi->cdi_segs = segs;
}
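
/*
 * Illustrative sketch (not part of the driver): bus_dmamap_load_uio(9)
 * invokes its callback immediately, so the segment list captured in the
 * cxgb_dma_info above can be consumed as soon as the load returns.  The
 * 'tag' and 'map' names are assumed to be a previously created DMA tag
 * and map:
 *
 *	struct cxgb_dma_info cdi;
 *	int i, err;
 *
 *	err = bus_dmamap_load_uio(tag, map, uio, cxgb_dma_callback, &cdi, 0);
 *	if (err == 0)
 *		for (i = 0; i < cdi.cdi_nsegs; i++)
 *			printf("seg %d: %#jx/%ju\n", i,
 *			    (uintmax_t)cdi.cdi_segs[i].ds_addr,
 *			    (uintmax_t)cdi.cdi_segs[i].ds_len);
 */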
static void
iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
{
	struct iovec *iovtmp;
	char *ptmp;

	if (count > 0) {
		/* Shave count bytes off the front of the array. */
		iovtmp = *iov;
		while (count > 0) {
			if (count < iovtmp->iov_len) {
				ptmp = (char *)iovtmp->iov_base + count;
				iovtmp->iov_base = ptmp;
				iovtmp->iov_len -= count;
				break;
			}
			count -= iovtmp->iov_len;
			iovtmp++;
			(*iovcnt)--;
		}
		*iov = iovtmp;
	} else if (count < 0) {
		/* Shave -count bytes off the back of the array. */
		iovtmp = &(*iov)[*iovcnt - 1];
		while (count < 0) {
			if (-count < iovtmp->iov_len) {
				iovtmp->iov_len += count;
				break;
			}
			count += iovtmp->iov_len;
			iovtmp--;
			(*iovcnt)--;
		}
	}
}
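
/*
 * Usage sketch: t3_sosend() below works on a scratch copy of the caller's
 * iovec array, shaving 'sent' bytes off the front (already transmitted)
 * and 'hold_resid' bytes off the back (not yet held), so that the next
 * hold/DMA pass covers exactly the held-but-unsent region:
 *
 *	memcpy(iovtmp, iov, iovcnt * sizeof(*iov));
 *	iovtmpp = iovtmp;
 *	iov_adj(&iovtmpp, &iovcnt, sent);		(trim from front)
 *	iov_adj(&iovtmpp, &iovcnt, -hold_resid);	(trim from back)
 */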
static void
cxgb_zero_copy_free(void *cl, void *arg)
{
	struct mbuf *m = (struct mbuf *)cl;

	/*
	 * These are physical addresses - don't try to free them here.
	 * The backing pages are unheld separately from the sbdrop path.
	 */
	m_free_iovec(m, m->m_type);
}
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, vm_prot_t prot)
{
	struct iovec *iov = uio->uio_iov;
	int iovcnt = uio->uio_iovcnt;
	int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
	uint64_t start, end;
	vm_page_t *mp;
	vm_map_t map;

	map = &uio->uio_td->td_proc->p_vmspace->vm_map;
	totbytes = totcount = 0;
	maxcount = *held;

	mp = m;
	for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
		count = maxcount - totcount;
		start = (uintptr_t)iov->iov_base;
		end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
		start &= PG_FRAME;
		end = (end + PAGE_MASK) & PG_FRAME;
		npages = (end - start) >> PAGE_SHIFT;

		count = min(count, npages);

		/* The following return value is not used. XXX */
		err = vm_fault_quick_hold_pages(map,
		    (vm_offset_t)iov->iov_base, iov->iov_len, prot, mp, count);
		mp += count;
		totcount += count;
		curbytes = iov->iov_len;
		if (count != npages)
			curbytes = count*PAGE_SIZE -
			    (((uintptr_t)iov->iov_base) & PAGE_MASK);
		totbytes += curbytes;
	}
	uio->uio_resid -= totbytes;
	*held = totcount;
	return (0);
}
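
/*
 * Worked example of the byte accounting above, assuming PAGE_SIZE = 4096:
 * if iov_base starts 512 bytes into its first page and only count = 3 of
 * the iovec's pages could be held, the iovec contributes
 *
 *	curbytes = 3 * 4096 - 512 = 11776
 *
 * bytes, i.e. the partially covered first page counts from iov_base onward.
 */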
/*
 * Returns whether a connection should enable DDP.  This happens when all of
 * the following conditions are met:
 * - the connection's ULP mode is DDP
 * - DDP is not already enabled
 * - the last receive was above the DDP threshold
 * - receive buffers are in user space
 * - receive side isn't shutdown (handled by caller)
 * - the connection's receive window is big enough so that sizable buffers
 *   can be posted without closing the window in the middle of DDP (checked
 *   when the connection is offloaded)
 */
static int
so_should_ddp(const struct toepcb *toep, int last_recv_len)
{
	DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
	    toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres),
	    toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));

	return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) &&
	    last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
	    toep->tp_tp->rcv_wnd >
	    (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
}
static __inline int
is_ddp(const struct mbuf *m)
{
	return ((m->m_flags & M_DDP) != 0);
}

static __inline int
is_ddp_psh(const struct mbuf *m)
{
	return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0);
}
static int
m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	int curlen, startlen, resid_init, err = 0;
	caddr_t buf;

	DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
	    m, offset, len);

	startlen = len;
	resid_init = uio->uio_resid;
	while (m && len) {
		buf = mtod(m, caddr_t);
		curlen = m->m_len;
		if (offset && (offset < curlen)) {
			buf += offset;
			curlen -= offset;
			offset = 0;
		} else if (offset) {
			offset -= curlen;
			m = m->m_next;
			continue;
		}
		err = uiomove(buf, min(len, curlen), uio);
		if (err) {
			printf("uiomove returned %d\n", err);
			return (err);
		}

		len -= min(len, curlen);
		m = m->m_next;
	}
	DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
	    startlen - len, resid_init, uio->uio_resid);
	return (err);
}
/*
 * Copy data from an mbuf chain to an iovec.  Deals with RX_DATA, which carry
 * the data in the mbuf body, and with RX_DATA_DDP, which place the data in a
 * DDP buffer.
 */
static __inline int
copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	struct iovec *to = uio->uio_iov;
	int err;

	if (__predict_true(!is_ddp(m)))				/* RX_DATA */
		return (m_uiomove(m, offset, len, uio));
	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) {	/* user DDP */
		to->iov_base = ((caddr_t)to->iov_base) + len;
		to->iov_len -= len;
		uio->uio_resid -= len;
		return (0);
	}
	err = t3_ddp_copy(m, offset, uio, len);			/* kernel DDP */
	return (err);
}
static void
cxgb_wait_dma_completion(struct toepcb *toep)
{
	struct rwlock *lock;

	lock = &toep->tp_tp->t_inpcb->inp_lock;
	inp_wlock(toep->tp_tp->t_inpcb);
	cv_wait_unlock(&toep->tp_cv, lock);
}
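
/*
 * Note that cv_wait_unlock(9) atomically releases the inpcb lock acquired
 * just above and does not reacquire it before returning; the completion
 * side signals tp_cv once the card has DMA'd the pending zero-copy data.
 */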
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
	int i, seg_count, err, type;
	struct mbuf *m0;
	struct cxgb_dma_info cdi;
	struct mbuf_vec *mv;
	struct mbuf_iovec *mi;
	bus_dma_segment_t *segs;

	err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
	    cxgb_dma_callback, &cdi, 0);

	if (err)
		return (err);
	seg_count = cdi.cdi_nsegs;
	if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
		bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
		return (ENOMEM);
	}

	m0->m_type = type;
	m0->m_flags = (M_EXT|M_NOFREE);
	m0->m_ext.ext_type = EXT_EXTREF;
	m0->m_ext.ext_free = cxgb_zero_copy_free;
#if __FreeBSD_version >= 800016
	m0->m_ext.ext_arg1 = NULL;	/* XXX: probably wrong /phk */
	m0->m_ext.ext_arg2 = NULL;
#else
	m0->m_ext.ext_args = NULL;
#endif

	mv = mtomv(m0);
	mv->mv_count = seg_count;
	segs = cdi.cdi_segs;
	for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
		mi_collapse_sge(mi, segs);

	*m = m0;

	/*
	 * This appears to be a no-op at the moment: busdma is
	 * all-or-nothing, so we need to make sure the tag values
	 * are large enough.
	 */
	if (cdi.cdi_mapped < uio->uio_resid) {
		uio->uio_resid -= cdi.cdi_mapped;
	} else
		uio->uio_resid = 0;

	m0->m_pkthdr.len = cdi.cdi_mapped;
	return (0);
}
static int
t3_sosend(struct socket *so, struct uio *uio)
{
	int rv, count, hold_resid, sent, iovcnt;
	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	struct uio uiotmp;
	struct sockbuf *snd;

	/*
	 * Events requiring iteration:
	 * - number of pages exceeds max hold pages for process or system
	 * - number of pages exceeds maximum sg entries for a single WR
	 *
	 * We're limited to holding 128 pages at once - and we're limited to
	 * 34 SG entries per work request, but each SG entry can be any number
	 * of contiguous pages.
	 */
	uiotmp = *uio;
	iovcnt = uio->uio_iovcnt;
	iov = uio->uio_iov;
	sent = 0;
	snd = so_sockbuf_snd(so);
sendmore:
	/*
	 * Make sure we don't exceed the socket buffer.
	 */
	count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
	rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, VM_PROT_READ);
	hold_resid = uiotmp.uio_resid;
	if (rv)
		return (rv);

	/*
	 * Bump past sent and shave off the unheld amount.
	 */
	if (hold_resid > 0) {
		iovtmpp = iovtmp;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		iov_adj(&iovtmpp, &iovcnt, sent);
		iov_adj(&iovtmpp, &iovcnt, -hold_resid);
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
	}
	uiotmp.uio_resid = uio->uio_resid - hold_resid;

	/*
	 * Push off all held pages.
	 */
	while (uiotmp.uio_resid > 0) {
		rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
		if (rv) {
			vm_page_unhold_pages(toep->tp_pages, count);
			return (rv);
		}
		uio->uio_resid -= m->m_pkthdr.len;
		sent += m->m_pkthdr.len;
		sbappendstream(snd, m);
		t3_push_frames(so, TRUE);
		iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
	}

	/*
	 * Wait for pending I/O to be DMA'd to the card.
	 */
	cxgb_wait_dma_completion(toep);
	vm_page_unhold_pages(toep->tp_pages, count);

	/*
	 * If there is more data to send, adjust the local copy of the
	 * iov to point to the start of the unsent data.
	 */
	if (hold_resid) {
		iovtmpp = iovtmp;
		iovcnt = uio->uio_iovcnt;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		iov_adj(&iovtmpp, &iovcnt, sent);
		uiotmp = *uio;
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
		goto sendmore;
	}

	return (0);
}
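
/*
 * Worked example of the limits described above, assuming PAGE_SIZE = 4096:
 * one hold pass covers at most 128 pages (512KB), and a single work request
 * carries at most 34 SG entries, each of which may span any number of
 * physically contiguous pages, so a large uio is pushed out through several
 * hold/DMA/wait iterations of the loop above.
 */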
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toedev *tdev;
	int zcopy_thres, zcopy_enabled, rv;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 * - the connection is currently offloaded
	 * - the number of bytes to be transferred exceeds the threshold
	 * - the number of bytes currently in flight won't exceed the
	 *   in-flight threshold
	 * - vm_fault_quick_hold_pages succeeds
	 * - blocking socket XXX for now
	 */
	if (tp && tp->t_flags & TF_TOE) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

		if (uio && (uio->uio_resid > zcopy_thres) &&
		    (uio->uio_iovcnt < TMP_IOV_MAX) && ((so_state_get(so) & SS_NBIO) == 0)
		    && zcopy_enabled) {
			rv = t3_sosend(so, uio);
			if (rv != EAGAIN)
				return (rv);
		}
	}
	return pru_sosend(so, addr, uio, top, control, flags, td);
}
/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
	sockbuf_lock_assert(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
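
/*
 * Usage sketch from the receive path below: the caller snapshots the record
 * chain before freeing the lead mbuf, then resynchronizes the socket buffer:
 *
 *	nextrecord = m->m_nextpkt;
 *	rcv->sb_mb = m_free(m);
 *	sockbuf_pushsync(rcv, nextrecord);
 */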
#define IS_NONBLOCKING(so)	(so_state_get(so) & SS_NBIO)
static int
t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	uint32_t offset;
	int err, flags, avail, len, copied, copied_unacked;
	int target;		/* Read at least this many bytes */
	int user_ddp_ok, buffers_freed = 0;
	struct ddp_state *p;
	struct inpcb *inp = so_sotoinpcb(so);
	int socket_state, socket_error;
	struct sockbuf *rcv;

	avail = offset = copied = copied_unacked = 0;
	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
	rcv = so_sockbuf_rcv(so);

	err = sblock(rcv, SBLOCKWAIT(flags));
	p = &toep->tp_ddp_state;
	if (err)
		return (err);

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	p->user_ddp_pending = 0;
restart:
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	len = uio->uio_resid;
	m = rcv->sb_mb;
	target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat;
	user_ddp_ok = p->ubuf_ddp_ready;

	if (len == 0)
		goto done;
	if (m)
		goto got_mbuf;
	/* empty receive queue */
	if (copied >= target && (rcv->sb_mb == NULL) &&
	    !p->user_ddp_pending)
		goto done;

	socket_state = so_state_get(so);
	socket_error = so_error_get(so);
	rcv = so_sockbuf_rcv(so);

	if (copied) {
		if (socket_error || tp->t_state == TCPS_CLOSED ||
		    (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
			goto done;
	} else {
		if (socket_state & SS_NOFDREF)
			goto done;
		if (socket_error) {
			err = socket_error;
			goto done;
		}
		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;
		if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			goto done;
		if (tp->t_state == TCPS_CLOSED) {
			err = ENOTCONN;
			goto done;
		}
	}
	if (rcv->sb_mb && !p->user_ddp_pending) {
		sockbuf_unlock(rcv);
		inp_wlock(inp);
		t3_cleanup_rbuf(tp, copied_unacked);
		inp_wunlock(inp);
		sockbuf_lock(rcv);
		copied_unacked = 0;
		goto restart;
	}
	if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending &&
	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
	    p->ubuf_ddp_ready) {
		p->user_ddp_pending =
		    !t3_overlay_ubuf(toep, rcv, uio,
			IS_NONBLOCKING(so), flags, 1, 1);
		if (p->user_ddp_pending) {
			p->kbuf_posted++;
			user_ddp_ok = 0;
		}
	}
	if (p->kbuf[0] && (p->kbuf_posted == 0)) {
		t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
		p->kbuf_posted++;
	}
	if (p->user_ddp_pending) {
		/* One shot at DDP if we already have enough data */
		if (copied >= target)
			user_ddp_ok = 0;

		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;
		CTR0(KTR_TOM, "ddp pending -- waiting");
		if ((err = sbwait(rcv)) != 0)
			goto done;
		/* for timers to work: await_ddp_completion(sk, flags, &timeo); */
	} else if (copied >= target)
		goto done;
	else {
		if (copied_unacked) {
			int i = 0;

			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			copied_unacked = 0;
			while (i++ < 200 && rcv->sb_mb == NULL)
				cpu_spinwait();
			sockbuf_lock(rcv);
		}
		if ((m = rcv->sb_mb) != NULL)
			goto got_mbuf;

		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;

		CTR0(KTR_TOM, "no buffers -- waiting");

		if ((err = sbwait(rcv)) != 0)
			goto done;
	}
	goto restart;
got_mbuf:
	/*
	 * Adjust the mbuf seqno if it has already been partially processed by
	 * soreceive_generic.
	 */
	if (m->m_pkthdr.len != m->m_len) {
		m->m_seq += m->m_pkthdr.len - m->m_len;
		m->m_pkthdr.len = m->m_len;
	}

	CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u "
	    "m_seq=0x%08x c_seq=0x%08x c_unack=%u",
	    (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len,
	    m->m_seq, toep->tp_copied_seq, copied_unacked);
	KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT),
	    ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT),
	    m->m_ext.ext_type, m->m_len, m->m_pkthdr.len));
	KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p"
	    " m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len));
	if (m->m_pkthdr.len == 0) {
		if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
			panic("empty mbuf and NOCOPY not set\n");
		CTR0(KTR_TOM, "ddp done notification");
		p->user_ddp_pending = 0;
		sbdroprecord_locked(rcv);
		goto done;
	}

	KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0,
	    ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x",
	    offset, toep->tp_copied_seq, copied_unacked, m->m_seq));
	offset = toep->tp_copied_seq + copied_unacked - m->m_seq;

	if (offset >= m->m_pkthdr.len)
		panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x "
		    "seq 0x%x pktlen %d ddp flags 0x%x", offset,
		    toep->tp_copied_seq + copied_unacked, m->m_seq,
		    m->m_pkthdr.len, m->m_ddp_flags);

	avail = m->m_pkthdr.len - offset;
	if (len < avail) {
		if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
			panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset);
		avail = len;
		rcv->sb_flags |= SB_IN_TOE;
	} else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0)
		rcv->sb_flags &= ~SB_IN_TOE;

#ifdef URGENT_DATA_SUPPORTED
	/*
	 * Check if the data we are preparing to copy contains urgent
	 * data.  Either stop short of urgent data or skip it if it's
	 * first and we are not delivering urgent data inline.
	 */
	if (__predict_false(toep->tp_urg_data)) {
		uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;

		if (urg_offset < avail) {
			if (urg_offset) {
				/* stop short of the urgent data */
				avail = urg_offset;
			} else if ((so_options_get(so) & SO_OOBINLINE) == 0) {
				/* First byte is urgent, skip */
				toep->tp_copied_seq++;
				offset++;
				avail--;
				if (!avail)
					goto skip_copy;
			}
		}
	}
#endif

	if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) {
		user_ddp_ok = 0;
#ifdef T3_TRACE
		T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
#endif
	}

	if (user_ddp_ok && !p->user_ddp_pending &&
	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
	    p->ubuf_ddp_ready) {
		p->user_ddp_pending =
		    !t3_overlay_ubuf(toep, rcv, uio,
			IS_NONBLOCKING(so), flags, 1, 1);
		if (p->user_ddp_pending) {
			p->kbuf_posted++;
			user_ddp_ok = 0;
		}
		DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
	} else
		DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n",
		    user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
		    p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted);

	/*
	 * If MSG_TRUNC is specified the data is discarded.
	 * XXX need to check pr_atomic
	 */
	KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset));
	if (__predict_true(!(flags & MSG_TRUNC))) {
		int resid = uio->uio_resid;

		sockbuf_unlock(rcv);
		if ((err = copy_data(m, offset, avail, uio))) {
			err = EFAULT;
			goto done_unlocked;
		}

		sockbuf_lock(rcv);
		if (avail != (resid - uio->uio_resid))
			printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n",
			    avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m));

		if ((tp->t_flags & TF_TOE) == 0) {
			sockbuf_unlock(rcv);
			err = EAGAIN;
			goto done_unlocked;
		}
	}

	copied += avail;
	copied_unacked += avail;
	len -= avail;

#ifdef URGENT_DATA_SUPPORTED
skip_copy:
	if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
		tp->urg_data = 0;
#endif

	/*
	 * If the buffer is fully consumed free it.  If it's a DDP
	 * buffer also handle any events it indicates.
	 */
	if (avail + offset >= m->m_pkthdr.len) {
		unsigned int fl = m->m_ddp_flags;
		int exitnow, got_psh = 0, nomoredata = 0;
		int count;
		struct mbuf *nextrecord;

		if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
			if (is_ddp_psh(m) && p->user_ddp_pending)
				got_psh = 1;

			if (fl & DDP_BF_NOCOPY)
				p->user_ddp_pending = 0;
			else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
				p->kbuf_posted--;
				nomoredata = 1;
			}
			p->ubuf_ddp_ready = 1;
		}

		nextrecord = m->m_nextpkt;
		count = m->m_pkthdr.len;
		while (count > 0) {
			count -= m->m_len;
			KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT), ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n", !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
			CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len);
			sbfree(rcv, m);
			rcv->sb_mb = m_free(m);
			m = rcv->sb_mb;
		}
		buffers_freed++;
		sockbuf_pushsync(rcv, nextrecord);
#if 0
		sbdrop_locked(rcv, m->m_pkthdr.len);
#endif

		exitnow = got_psh || nomoredata;
		if (copied >= target && (rcv->sb_mb == NULL) && exitnow)
			goto done;
		if (copied_unacked > (rcv->sb_hiwat >> 2)) {
			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			sockbuf_lock(rcv);
			copied_unacked = 0;
		}
	}

	if (len > 0)
		goto restart;

	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	/*
	 * If we can still receive decide what to do in preparation for the
	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
	 * transitioned to CLOSE but not if it was in that state to begin with.
	 */
	if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
		if (p->user_ddp_pending) {
			user_ddp_ok = 0;
			t3_cancel_ubuf(toep, rcv);
			if (rcv->sb_mb) {
				if (copied < 0)
					copied = 0;
				if (len > 0)
					goto restart;
			}
			p->user_ddp_pending = 0;
		}
		if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
#ifdef T3_TRACE
			T3_TRACE0(TIDTB(so),
			    "chelsio_recvmsg: about to exit, repost kbuf");
#endif
			t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
			p->kbuf_posted++;
		} else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
			CTR1(KTR_TOM, "entering ddp on tid=%u", toep->tp_tid);
			if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev,
			    ddp_copy_limit), 0, IS_NONBLOCKING(so))) {
				rcv->sb_flags |= SB_IN_TOE;
				p->kbuf_posted = 1;
			}
		}
	}
#ifdef T3_TRACE
	T3_TRACE5(TIDTB(so),
	    "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
	    "kbuf_posted %d user_ddp_pending %u",
	    copied, len, buffers_freed, p ? p->kbuf_posted : -1,
	    p->user_ddp_pending);
#endif

done:
	sockbuf_unlock(rcv);
done_unlocked:
	if (copied_unacked && (tp->t_flags & TF_TOE)) {
		inp_wlock(inp);
		t3_cleanup_rbuf(tp, copied_unacked);
		inp_wunlock(inp);
	}
	sbunlock(rcv);

	return (err);
}
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct toedev *tdev;
	int rv, zcopy_thres, zcopy_enabled, flags;
	struct tcpcb *tp = so_sototcpcb(so);
	struct sockbuf *rcv = so_sockbuf_rcv(so);

	flags = flagsp ? *flagsp &~ MSG_EOR : 0;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 * - the connection is currently offloaded
	 * - the number of bytes to be transferred exceeds the threshold
	 * - the number of bytes currently in flight won't exceed the
	 *   in-flight threshold
	 * - vm_fault_quick_hold_pages succeeds
	 * - blocking socket XXX for now
	 */
	if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
	    && (uio->uio_iovcnt == 1) && (mp0 == NULL) &&
	    ((rcv->sb_flags & SB_IN_TOE) || (uio->uio_iovcnt == 1))) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
		if ((rcv->sb_flags & SB_IN_TOE) || ((uio->uio_resid > zcopy_thres) &&
		    (uio->uio_iovcnt == 1) && zcopy_enabled)) {
			CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d",
			    rcv->sb_flags, tp->t_flags, flags, uio->uio_resid);
			rv = t3_soreceive(so, flagsp, uio);
			if (rv != EAGAIN)
				return (rv);
			else
				printf("returned EAGAIN\n");
		}
	} else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) {
		struct sockbuf *rcv = so_sockbuf_rcv(so);

		log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
		    flags, uio->uio_iovcnt, rcv->sb_state);
	}

	return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
struct protosw cxgb_protosw;
struct pr_usrreqs cxgb_tcp_usrreqs;

void
t3_install_socket_ops(struct socket *so)
{
	static int copied = 0;
	struct pr_usrreqs *pru;
	struct protosw *psw;

	psw = so_protosw_get(so);
	pru = psw->pr_usrreqs;

	if (copied == 0) {
		copied = 1;
		bcopy(psw, &cxgb_protosw, sizeof(*psw));
		bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru));

		cxgb_protosw.pr_ctloutput = t3_ctloutput;
		cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs;
		cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend;
		cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive;
	}
	so_protosw_set(so, &cxgb_protosw);
	so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
	so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}
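
/*
 * Usage sketch (assumed call sequence, not taken verbatim from the TOM
 * code): t3_init_socket_ops() runs once at module load to capture the stock
 * TCP entry points; t3_install_socket_ops(so) is then applied per offloaded
 * socket so that its sosend/soreceive go through the zero-copy/DDP paths
 * above:
 *
 *	t3_init_socket_ops();		(once, at initialization)
 *	...
 *	t3_install_socket_ops(so);	(per offloaded connection)
 */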