/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sockstate.h>
#include <sys/sockopt.h>
#include <sys/socket.h>
#include <sys/sockbuf.h>
#include <sys/syslog.h>
#include <sys/uio.h>

#include <machine/bus.h>
#include <machine/cpu.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <cxgb_config.h>
#include <cxgb_osdep.h>
#include <sys/mbufq.h>
#include <ulp/tom/cxgb_tcp_offload.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>

#include <common/cxgb_firmware_exports.h>
#include <common/cxgb_t3_cpl.h>
#include <common/cxgb_tcb.h>
#include <common/cxgb_ctl_defs.h>
#include <cxgb_offload.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <ulp/toecore/cxgb_toedev.h>
#include <ulp/tom/cxgb_defs.h>
#include <ulp/tom/cxgb_tom.h>
#include <ulp/tom/cxgb_t3_ddp.h>
#include <ulp/tom/cxgb_toepcb.h>
#include <ulp/tom/cxgb_tcp.h>
#include <ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);

#define TMP_IOV_MAX 16
#ifndef PG_FRAME
#define PG_FRAME	~PAGE_MASK
#endif
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
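
/*
 * Cache the stock TCP pru_sosend/pru_soreceive handlers at initialization
 * time so the zero-copy paths below can fall back to them whenever a
 * socket is not offloaded or the fast path declines the transfer.
 */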
void
t3_init_socket_ops(void)
{
	struct protosw *prp;

	prp = pffindtype(AF_INET, SOCK_STREAM);
	pru_sosend = prp->pr_usrreqs->pru_sosend;
	pru_soreceive = prp->pr_usrreqs->pru_soreceive;
}
struct cxgb_dma_info {
	size_t			cdi_mapped;	/* total bytes mapped */
	int			cdi_nsegs;	/* number of DMA segments */
	bus_dma_segment_t	*cdi_segs;
};

static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
	struct cxgb_dma_info *cdi = arg;

	cdi->cdi_mapped = mapsize;
	cdi->cdi_nsegs = nsegs;
	cdi->cdi_segs = segs;
}
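
/*
 * Advance (count > 0) or trim (count < 0) an iovec array in place.
 * A positive count consumes bytes from the front, skipping fully
 * consumed entries; a negative count shaves bytes off the tail.
 * Worked example: with iov = {{b0, 4096}, {b1, 4096}} and count = 6000,
 * the array is left pointing at the second entry with iov_base advanced
 * by 1904 and iov_len reduced to 2192.
 */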
static void
iov_adj(struct iovec **iov, int *iovcnt, size_t count)
{
	struct iovec *iovtmp;
	int iovcnttmp;
	caddr_t ptmp;

	if (count > 0) {
		iovtmp = *iov;
		iovcnttmp = *iovcnt;
		while (count > 0) {
			if (count < iovtmp->iov_len) {
				ptmp = iovtmp->iov_base;
				ptmp += count;
				iovtmp->iov_base = ptmp;
				iovtmp->iov_len -= count;
				break;
			} else
				count -= iovtmp->iov_len;
			iovtmp++;
			iovcnttmp--;
		}
		*iov = iovtmp;
		*iovcnt = iovcnttmp;
	} else if (count < 0) {
		iovtmp = &(*iov)[*iovcnt - 1];
		iovcnttmp = *iovcnt;
		while (count < 0) {
			if (-count < iovtmp->iov_len) {
				iovtmp->iov_len += count;
				break;
			} else
				count += iovtmp->iov_len;
			iovtmp--;
			iovcnttmp--;
		}
		*iovcnt = iovcnttmp;
	}
}
static void
cxgb_zero_copy_free(void *cl, void *arg)
{
	struct mbuf *m = (struct mbuf *)cl;

	/*
	 * The iovecs carry physical addresses; don't try to free them
	 * here.  The backing pages are unheld separately from sbdrop.
	 */
	m_free_iovec(m, m->m_type);
}
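
/*
 * Wire the user pages backing the iovecs in 'uio' (up to *held pages,
 * an in/out parameter) into the page array 'm', and deduct the bytes
 * actually covered from uio_resid so the caller can see how much of
 * the request remains unheld.
 */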
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, vm_prot_t prot)
{
	struct iovec *iov = uio->uio_iov;
	int iovcnt = uio->uio_iovcnt;
	int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
	uint64_t start, end;
	vm_page_t *mp;
	vm_map_t map;

	map = &uio->uio_td->td_proc->p_vmspace->vm_map;
	totbytes = totcount = 0;
	err = 0;
	maxcount = *held;

	mp = m;
	for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
		count = maxcount - totcount;

		start = (uintptr_t)iov->iov_base;
		end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
		start &= PG_FRAME;
		end += PAGE_MASK;
		end &= PG_FRAME;
		npages = (end - start) >> PAGE_SHIFT;

		count = min(count, npages);

		err = vm_fault_hold_user_pages(map,
		    (vm_offset_t)iov->iov_base, mp, count, prot);
		if (err)
			break;
		mp += count;
		totcount += count;
		curbytes = iov->iov_len;
		if (count != npages)
			curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK);
		totbytes += curbytes;
	}
	uio->uio_resid -= totbytes;
	*held = totcount;

	return (err);
}
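
/*
 * Byte accounting above, by example (assuming 4KB pages): if an iovec
 * begins 256 bytes into a page and only 2 of its 3 pages could be held,
 * then curbytes = 2*PAGE_SIZE - 256 = 7936, i.e. only the bytes actually
 * backed by held pages are charged against uio_resid, not the full
 * iov_len.
 */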
/*
 * Returns whether a connection should enable DDP.  This happens when all of
 * the following conditions are met:
 * - the connection's ULP mode is DDP
 * - DDP is not already enabled
 * - the last receive was above the DDP threshold
 * - receive buffers are in user space
 * - receive side isn't shutdown (handled by caller)
 * - the connection's receive window is big enough so that sizable buffers
 *   can be posted without closing the window in the middle of DDP (checked
 *   when the connection is offloaded)
 */
static int
so_should_ddp(const struct toepcb *toep, int last_recv_len)
{
	DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
	    toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres),
	    toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));

	return (toep->tp_ulp_mode == ULP_MODE_TCPDDP &&
	    (toep->tp_ddp_state.kbuf[0] == NULL) &&
	    last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
	    toep->tp_tp->rcv_wnd >
	    (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));
}
static inline int
is_ddp(const struct mbuf *m)
{
	return ((m->m_flags & M_DDP) != 0);
}

static inline int
is_ddp_psh(const struct mbuf *m)
{
	return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0);
}
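
/*
 * Copy up to 'len' bytes at 'offset' from the mbuf chain 'm' into the
 * user buffers described by 'uio'.
 */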
static int
m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	int curlen, startlen, resid_init, err = 0;
	caddr_t buf;

	DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
	    m, offset, len);

	startlen = len;
	resid_init = uio->uio_resid;
	while (m && len) {
		buf = mtod(m, caddr_t);
		curlen = m->m_len;
		if (offset && (offset < curlen)) {
			curlen -= offset;
			buf += offset;
			offset = 0;
		} else if (offset) {
			offset -= curlen;
			m = m->m_next;
			continue;
		}
		err = uiomove(buf, min(len, curlen), uio);
		if (err) {
			printf("uiomove returned %d\n", err);
			return (err);
		}
		len -= min(len, curlen);
		m = m->m_next;
	}
	DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
	    startlen - len, resid_init, uio->uio_resid);
	return (err);
}
/*
 * Copy data from an mbuf to an iovec.  Deals with RX_DATA, which carry the
 * data in the mbuf body, and with RX_DATA_DDP, which place the data in a
 * DDP buffer.
 */
static inline int
copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	struct iovec *to = uio->uio_iov;
	int err;

	if (__predict_true(!is_ddp(m)))				/* RX_DATA */
		return (m_uiomove(m, offset, len, uio));
	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) {	/* user DDP */
		to->iov_len -= len;
		to->iov_base = ((caddr_t)to->iov_base) + len;
		uio->uio_iov = to;
		uio->uio_resid -= len;
		return (0);
	}
	err = t3_ddp_copy(m, offset, uio, len);			/* kernel DDP */
	return (err);
}
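
/*
 * Block until outstanding zero-copy payload has been DMA'd to the card.
 * The completion path signals tp_cv; cv_wait_unlock() atomically drops
 * the inpcb lock when it goes to sleep and returns with it released.
 */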
static void
cxgb_wait_dma_completion(struct toepcb *toep)
{
	struct rwlock *lock;

	lock = &toep->tp_tp->t_inpcb->inp_lock;
	inp_wlock(toep->tp_tp->t_inpcb);
	cv_wait_unlock(&toep->tp_cv, lock);
}
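
/*
 * Map the user buffers described by 'uio' for DMA and wrap the resulting
 * scatter/gather segments in a zero-copy mbuf_vec mbuf, to be handed to
 * the transmit path.
 */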
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
	int i, seg_count, err, type;
	struct mbuf *m0;
	struct cxgb_dma_info cdi;
	struct mbuf_vec *mv;
	struct mbuf_iovec *mi;
	bus_dma_segment_t *segs;

	err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
	    cxgb_dma_callback, &cdi, 0);
	if (err)
		return (err);

	seg_count = cdi.cdi_nsegs;
	if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
		bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
		return (ENOMEM);
	}
	segs = cdi.cdi_segs;
	m0->m_type = type;
	m0->m_flags = (M_EXT|M_NOFREE);
	m0->m_ext.ext_type = EXT_EXTREF;
	m0->m_ext.ext_free = cxgb_zero_copy_free;
#if __FreeBSD_version >= 800016
	m0->m_ext.ext_arg1 = NULL;	/* XXX: probably wrong /phk */
	m0->m_ext.ext_arg2 = NULL;
#else
	m0->m_ext.ext_args = NULL;
#endif

	mv = mtomv(m0);
	mv->mv_count = seg_count;
	mv->mv_first = 0;
	for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
		mi_collapse_sge(mi, segs);

	m0->m_pkthdr.len = cdi.cdi_mapped;
	*m = m0;

	/*
	 * This appears to be a no-op at the moment as busdma is
	 * all-or-nothing; we need to make sure the tag values are
	 * large enough.
	 */
	if (cdi.cdi_mapped < uio->uio_resid) {
		uio->uio_resid -= cdi.cdi_mapped;
	} else
		uio->uio_resid = 0;

	return (0);
}
static int
t3_sosend(struct socket *so, struct uio *uio)
{
	int rv, count, hold_resid, sent, iovcnt;
	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	struct uio uiotmp;
	struct sockbuf *snd;

	/*
	 * Events requiring iteration:
	 * - number of pages exceeds max hold pages for process or system
	 * - number of pages exceeds maximum sg entries for a single WR
	 *
	 * We're limited to holding 128 pages at once - and we're limited to
	 * 34 SG entries per work request, but each SG entry can be any number
	 * of contiguous pages.
	 */
	uiotmp = *uio;
	iovcnt = uio->uio_iovcnt;
	iov = uio->uio_iov;
	sent = 0;
	snd = so_sockbuf_snd(so);
sendmore:
	/*
	 * Make sure we don't exceed the socket buffer.
	 */
	count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
	rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, VM_PROT_READ);
	hold_resid = uiotmp.uio_resid;
	if (rv)
		return (rv);

	/*
	 * Bump past what has been sent and shave off the unheld amount.
	 */
	if (hold_resid > 0) {
		iovtmpp = iovtmp;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		if (sent)
			iov_adj(&iovtmpp, &iovcnt, sent);
		iov_adj(&iovtmpp, &iovcnt, -hold_resid);
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
	}
	uiotmp.uio_resid = uio->uio_resid - hold_resid;

	/*
	 * Push off all held pages.
	 */
	while (uiotmp.uio_resid > 0) {
		rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
		if (rv) {
			vm_fault_unhold_pages(toep->tp_pages, count);
			return (rv);
		}
		uio->uio_resid -= m->m_pkthdr.len;
		sent += m->m_pkthdr.len;
		sbappendstream(snd, m);
		t3_push_frames(so, TRUE);
		iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
	}

	/*
	 * Wait for pending I/O to be DMA'd to the card.
	 */
	cxgb_wait_dma_completion(toep);
	vm_fault_unhold_pages(toep->tp_pages, count);

	/*
	 * If there is more data to send, adjust the local copy of the iov
	 * to point to the start of the unsent data and go around again.
	 */
	if (hold_resid) {
		iovtmpp = iovtmp;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		iov_adj(&iovtmpp, &iovcnt, sent);
		uiotmp.uio_resid = uio->uio_resid;
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
		goto sendmore;
	}

	return (0);
}
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toedev *tdev;
	int zcopy_thres, zcopy_enabled, rv;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 *  - the connection is currently offloaded
	 *  - zero-copy sosend is enabled
	 *  - the number of bytes to be transferred exceeds the threshold
	 *  - the number of bytes currently in flight won't exceed the
	 *    in-flight threshold XXX TODO
	 *  - vm_fault_hold_user_pages succeeds
	 *  - blocking socket XXX for now
	 */
	if (tp && (tp->t_flags & TF_TOE)) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

		if (uio && (uio->uio_resid > zcopy_thres) &&
		    (uio->uio_iovcnt < TMP_IOV_MAX) &&
		    ((so_state_get(so) & SS_NBIO) == 0) && zcopy_enabled) {
			rv = t3_sosend(so, uio);
			if (rv != EAGAIN)
				return (rv);
		}
	}
	return pru_sosend(so, addr, uio, top, control, flags, td);
}
/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
	sockbuf_lock_assert(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
#define IS_NONBLOCKING(so)	(so_state_get(so) & SS_NBIO)
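
/*
 * DDP-aware receive path.  Structurally this mirrors soreceive_generic():
 * take the receive sockbuf lock, wait until enough data is available,
 * then, for each mbuf, either copy inline payload out with uiomove(),
 * copy out of a kernel DDP buffer, or simply account for data the card
 * already placed in a user-posted DDP buffer.
 */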
static int
t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	uint32_t offset;
	int err, flags, avail, len, copied, copied_unacked;
	int target;		/* Read at least this many bytes */
	int user_ddp_ok;
	int buffers_freed = 0;
	struct ddp_state *p;
	struct inpcb *inp = so_sotoinpcb(so);
	int socket_state, socket_error;
	struct sockbuf *rcv;

	avail = offset = copied = copied_unacked = 0;
	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
	rcv = so_sockbuf_rcv(so);

	err = sblock(rcv, SBLOCKWAIT(flags));
	p = &toep->tp_ddp_state;
	if (err)
		return (err);

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	p->user_ddp_pending = 0;
restart:
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	len = uio->uio_resid;
	target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat;
	user_ddp_ok = p->ubuf_ddp_ready;
	for (;;) {
		/* empty receive queue */
		if (copied >= target && (rcv->sb_mb == NULL) &&
		    !p->user_ddp_pending)
			break;

		socket_state = so_state_get(so);
		socket_error = so_error_get(so);
		rcv = so_sockbuf_rcv(so);

		if (copied) {
			if (socket_error || tp->t_state == TCPS_CLOSED ||
			    (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
				break;
		} else {
			if (socket_state & SS_NOFDREF)
				break;
			if (socket_error) {
				err = socket_error;
				break;
			}
			if (rcv->sb_state & SBS_CANTRCVMORE)
				break;
			if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
				break;
			if (tp->t_state == TCPS_CLOSED) {
				err = ENOTCONN;
				break;
			}
		}
		if (rcv->sb_mb && !p->user_ddp_pending) {
			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			sockbuf_lock(rcv);
			copied_unacked = 0;
			goto got_mbuf;
		}
		if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending &&
		    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
		    p->ubuf_ddp_ready) {
			p->user_ddp_pending =
			    !t3_overlay_ubuf(toep, rcv, uio,
				IS_NONBLOCKING(so), flags, 1, 1);
			if (p->user_ddp_pending) {
				p->kbuf_posted++;
				user_ddp_ok = 0;
			}
		}
		if (p->kbuf[0] && (p->kbuf_posted == 0)) {
			t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
			p->kbuf_posted++;
		}
		if (p->user_ddp_pending) {
			/* One shot at DDP if we already have enough data */
			if (copied >= target)
				user_ddp_ok = 0;
			if (rcv->sb_state & SBS_CANTRCVMORE)
				break;
			CTR0(KTR_TOM, "ddp pending -- waiting");
			if ((err = sbwait(rcv)) != 0)
				break;
			/* for timers to work await_ddp_completion(sk, flags, &timeo); */
		} else if (copied >= target)
			break;
		else {
			if (copied_unacked) {
				int i = 0;

				sockbuf_unlock(rcv);
				inp_wlock(inp);
				t3_cleanup_rbuf(tp, copied_unacked);
				inp_wunlock(inp);
				copied_unacked = 0;
				/*
				 * Give the card a moment to queue more data
				 * before we go to sleep.
				 */
				while (i++ < 200 && rcv->sb_mb == NULL)
					cpu_spinwait();
				sockbuf_lock(rcv);
			}
			if (rcv->sb_mb)
				continue;
			if (rcv->sb_state & SBS_CANTRCVMORE)
				break;

			CTR0(KTR_TOM, "no buffers -- waiting");
			if ((err = sbwait(rcv)) != 0)
				break;
		}
		continue;
got_mbuf:
		m = rcv->sb_mb;

		/*
		 * Adjust the mbuf seqno if it has already been partially
		 * processed.
		 */
		if (m->m_pkthdr.len != m->m_len) {
			m->m_seq += m->m_pkthdr.len - m->m_len;
			m->m_pkthdr.len = m->m_len;
		}

		CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u "
		    "m_seq=0x%08x c_seq=0x%08x c_unack=%u",
		    (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len,
		    m->m_seq, toep->tp_copied_seq, copied_unacked);
		KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
		    !(m->m_flags & M_EXT),
		    ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d",
		    !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len,
		    m->m_pkthdr.len));
		KASSERT(m->m_next != (struct mbuf *)0xffffffff,
		    ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d",
		    m->m_next, m->m_nextpkt, m->m_flags, m->m_len));

		if (m->m_pkthdr.len == 0) {
			if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
				panic("empty mbuf and NOCOPY not set");
			CTR0(KTR_TOM, "ddp done notification");
			p->user_ddp_pending = 0;
			sbdroprecord_locked(rcv);
			continue;
		}

		KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0,
		    ("offset will go negative: offset=%d copied_seq=0x%08x "
		    "copied_unacked=%d m_seq=0x%08x",
		    offset, toep->tp_copied_seq, copied_unacked, m->m_seq));
		offset = toep->tp_copied_seq + copied_unacked - m->m_seq;

		if (offset >= m->m_pkthdr.len)
			panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x "
			    "seq 0x%x pktlen %d ddp flags 0x%x", offset,
			    toep->tp_copied_seq + copied_unacked, m->m_seq,
			    m->m_pkthdr.len, m->m_ddp_flags);

		avail = m->m_pkthdr.len - offset;
		if (len < avail) {
			if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
				panic("bad state in t3_soreceive len=%d avail=%d offset=%d",
				    len, avail, offset);
			avail = len;
			rcv->sb_flags |= SB_IN_TOE;
		} else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0)
			rcv->sb_flags &= ~SB_IN_TOE;
#ifdef URGENT_DATA_SUPPORTED
		/*
		 * Check if the data we are preparing to copy contains urgent
		 * data.  Either stop short of urgent data or skip it if it's
		 * first and we are not delivering urgent data inline.
		 */
		if (__predict_false(toep->tp_urg_data)) {
			uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;

			if (urg_offset < avail) {
				if (urg_offset) {
					/* stop short of the urgent data */
					avail = urg_offset;
				} else if ((so_options_get(so) & SO_OOBINLINE) == 0) {
					/* First byte is urgent, skip */
					toep->tp_copied_seq++;
					offset++;
					avail--;
					if (!avail)
						goto skip_copy;
				}
			}
		}
#endif /* URGENT_DATA_SUPPORTED */
		if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) {
			user_ddp_ok = 0;
			T3_TRACE0(TIDTB(so), "t3_sosend: PSH");
		}

		if (user_ddp_ok && !p->user_ddp_pending &&
		    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
		    p->ubuf_ddp_ready) {
			p->user_ddp_pending =
			    !t3_overlay_ubuf(toep, rcv, uio,
				IS_NONBLOCKING(so), flags, 1, 1);
			if (p->user_ddp_pending) {
				p->kbuf_posted++;
				user_ddp_ok = 0;
			}
			DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
		} else
			DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d "
			    "ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n",
			    user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len,
			    p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
			    p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m),
			    m->m_ddp_flags, p->ubuf, p->kbuf_posted);

		/*
		 * If MSG_TRUNC is specified the data is discarded.
		 * XXX need to check pr_atomic
		 */
		KASSERT(avail > 0, ("avail=%d resid=%d offset=%d",
		    avail, uio->uio_resid, offset));
		if (__predict_true(!(flags & MSG_TRUNC))) {
			int resid = uio->uio_resid;

			sockbuf_unlock(rcv);
			if ((err = copy_data(m, offset, avail, uio)))
				goto done_unlocked;
			sockbuf_lock(rcv);

			if (avail != (resid - uio->uio_resid))
				printf("didn't copy all bytes :-/ avail=%d offset=%d "
				    "pktlen=%d resid=%d uio_resid=%d copied=%d "
				    "copied_unacked=%d is_ddp(m)=%d\n",
				    avail, offset, m->m_pkthdr.len, resid, uio->uio_resid,
				    copied, copied_unacked, is_ddp(m));

			if ((tp->t_flags & TF_TOE) == 0) {
				sockbuf_unlock(rcv);
				err = EAGAIN;
				goto done_unlocked;
			}
		}

		copied += avail;
		copied_unacked += avail;
		len -= avail;
#ifdef URGENT_DATA_SUPPORTED
skip_copy:
		if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
			tp->urg_data = 0;
#endif
		/*
		 * If the buffer is fully consumed free it.  If it's a DDP
		 * buffer also handle any events it indicates.
		 */
		if (avail + offset >= m->m_pkthdr.len) {
			unsigned int fl = m->m_ddp_flags;
			int exitnow, got_psh = 0, nomoredata = 0;
			int count;
			struct mbuf *nextrecord;

			if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
				if (is_ddp_psh(m) && p->user_ddp_pending)
					got_psh = 1;

				if (fl & DDP_BF_NOCOPY)
					p->user_ddp_pending = 0;
				else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
					p->kbuf_posted--;
					nomoredata = 1;
				} else {
					p->kbuf_posted--;
					p->ubuf_ddp_ready = 1;
				}
			}

			if (is_ddp(m)) {
				nextrecord = m->m_nextpkt;
				count = m->m_pkthdr.len;
				while (count > 0) {
					count -= m->m_len;
					KASSERT(((m->m_flags & M_EXT) &&
					    (m->m_ext.ext_type == EXT_EXTREF)) ||
					    !(m->m_flags & M_EXT),
					    ("unexpected type M_EXT=%d ext_type=%d m_len=%d",
					    !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
					CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d",
					    m->m_len, m->m_pkthdr.len);
					sbfree(rcv, m);
					rcv->sb_mb = m_free(m);
					m = rcv->sb_mb;
					buffers_freed++;
				}
				sockbuf_pushsync(rcv, nextrecord);
			} else
				sbdrop_locked(rcv, m->m_pkthdr.len);

			exitnow = got_psh || nomoredata;
			if (copied >= target && (rcv->sb_mb == NULL) && exitnow)
				break;
		}
		if (copied_unacked > (rcv->sb_hiwat >> 2)) {
			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			copied_unacked = 0;
			sockbuf_lock(rcv);
		}
	}
done:
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}
	/*
	 * If we can still receive decide what to do in preparation for the
	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
	 * transitioned to CLOSE but not if it was in that state to begin with.
	 */
	if (__predict_true((so_state_get(so) &
	    (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
		if (p->user_ddp_pending) {
			user_ddp_ok = 0;
			t3_cancel_ubuf(toep, rcv);
			if (rcv->sb_mb && len > 0)
				goto restart;
			p->user_ddp_pending = 0;
		}
		if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
			CTR0(KTR_TOM,
			    "chelsio_recvmsg: about to exit, repost kbuf");

			t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
			p->kbuf_posted++;
		} else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
			CTR1(KTR_TOM, "entering ddp on tid=%u", toep->tp_tid);
			if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev,
			    ddp_copy_limit), 0, IS_NONBLOCKING(so))) {
				rcv->sb_flags |= SB_IN_TOE;
				p->kbuf_posted = 1;
			}
		}
	}
#ifdef T3_TRACE
	T3_TRACE5(TIDTB(so),
	    "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
	    "kbuf_posted %d user_ddp_pending %u",
	    copied, len, buffers_freed, p ? p->kbuf_posted : -1,
	    p->user_ddp_pending);
#endif
	sockbuf_unlock(rcv);
done_unlocked:
	if (copied_unacked && (tp->t_flags & TF_TOE)) {
		inp_wlock(inp);
		t3_cleanup_rbuf(tp, copied_unacked);
		inp_wunlock(inp);
	}
	sbunlock(rcv);

	return (err);
}
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct toedev *tdev;
	int rv, zcopy_thres, zcopy_enabled, flags;
	struct tcpcb *tp = so_sototcpcb(so);
	struct sockbuf *rcv = so_sockbuf_rcv(so);

	flags = flagsp ? *flagsp &~ MSG_EOR : 0;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 *  - the connection is currently offloaded
	 *  - ddp is enabled
	 *  - the number of bytes to be transferred exceeds the threshold
	 *  - the number of bytes currently in flight won't exceed the
	 *    in-flight threshold XXX TODO
	 *  - vm_fault_hold_user_pages succeeds
	 *  - blocking socket XXX for now
	 *  - iovcnt is 1
	 */
	if (tp && (tp->t_flags & TF_TOE) && uio &&
	    ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) &&
	    (uio->uio_iovcnt == 1) && (mp0 == NULL)) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
		if ((rcv->sb_flags & SB_IN_TOE) ||
		    ((uio->uio_resid > zcopy_thres) && zcopy_enabled)) {
			CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x "
			    "flags=0x%x uio_resid=%d",
			    rcv->sb_flags, tp->t_flags, flags, uio->uio_resid);
			rv = t3_soreceive(so, flagsp, uio);
			if (rv != EAGAIN)
				return (rv);
			else
				printf("returned EAGAIN\n");
		}
	} else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) {
		log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
		    flags, uio->uio_iovcnt, rcv->sb_state);
	}

	return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
struct protosw cxgb_protosw;
struct pr_usrreqs cxgb_tcp_usrreqs;
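
/*
 * Give an offloaded socket a private copy of its protosw whose
 * pru_sosend/pru_soreceive point at the zero-copy handlers above.
 * The copy is made once and then shared by all offloaded sockets.
 */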
void
t3_install_socket_ops(struct socket *so)
{
	static int copied = 0;
	struct pr_usrreqs *pru;
	struct protosw *psw;

	if (copied == 0) {
		psw = so_protosw_get(so);
		pru = psw->pr_usrreqs;

		bcopy(psw, &cxgb_protosw, sizeof(*psw));
		bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru));

		cxgb_protosw.pr_ctloutput = t3_ctloutput;
		cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs;
		cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend;
		cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive;
		copied = 1;
	}
	so_protosw_set(so, &cxgb_protosw);

	so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
	so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}