/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
   this list of conditions and the following disclaimer.

2. Neither the name of the Chelsio Corporation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/syslog.h>

#include <machine/bus.h>
#include <machine/cpu.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_config.h>
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>
#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
#include <dev/cxgb/ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);
#define TMP_IOV_MAX 16

#define PG_FRAME	(~PAGE_MASK)

#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
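/*
 * PG_FRAME masks off the page-offset bits of an address, leaving the page
 * frame.  SBLOCKWAIT() selects the sblock() wait semantics from the
 * MSG_DONTWAIT flag: non-blocking lock attempts for non-blocking receives.
 */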
void
t3_init_socket_ops(void)
{
        struct protosw *prp;

        prp = pffindtype(AF_INET, SOCK_STREAM);
        pru_sosend = prp->pr_usrreqs->pru_sosend;
        pru_soreceive = prp->pr_usrreqs->pru_soreceive;
}
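/*
 * The pointers saved by t3_init_socket_ops() are the stock TCP entry points;
 * cxgb_sosend() and cxgb_soreceive() below fall back to them whenever the
 * zero-copy fast path does not apply.
 */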
struct cxgb_dma_info {
        bus_size_t cdi_mapped;
        int cdi_nsegs;
        bus_dma_segment_t *cdi_segs;
};

static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
        struct cxgb_dma_info *cdi = arg;

        cdi->cdi_mapped = mapsize;
        cdi->cdi_nsegs = nsegs;
        cdi->cdi_segs = segs;
}
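/*
 * iov_adj() below edits an iovec array in place: a positive count consumes
 * "count" bytes from the front of the array (advancing iov_base within a
 * partially consumed entry), while a negative count trims bytes off the
 * tail.  A minimal usage sketch -- buffer names and sizes are illustrative
 * only, not from the original source:
 *
 *	struct iovec v[2] = {
 *		{ .iov_base = buf0, .iov_len = 100 },
 *		{ .iov_base = buf1, .iov_len = 100 },
 *	};
 *	struct iovec *vp = v;
 *	int cnt = 2;
 *
 *	iov_adj(&vp, &cnt, 150);	// skip 150 already-consumed bytes
 *	// vp now points at v[1], with iov_base advanced by 50 bytes,
 *	// iov_len reduced to 50, and cnt reduced to 1.
 */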
static void
iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
{
        struct iovec *iovtmp;
        int iovcnttmp;
        caddr_t ptmp;

        if (count > 0) {
                iovtmp = *iov;
                iovcnttmp = *iovcnt;
                while (count > 0) {
                        if (count < iovtmp->iov_len) {
                                ptmp = iovtmp->iov_base;
                                ptmp += count;
                                iovtmp->iov_base = ptmp;
                                iovtmp->iov_len -= count;
                                break;
                        } else
                                count -= iovtmp->iov_len;
                        iovtmp++;
                        iovcnttmp--;
                }
                *iov = iovtmp;
                *iovcnt = iovcnttmp;
        } else if (count < 0) {
                iovtmp = &(*iov)[*iovcnt - 1];
                iovcnttmp = *iovcnt;
                while (count < 0) {
                        if (-count < iovtmp->iov_len) {
                                iovtmp->iov_len += count;
                                break;
                        } else
                                count += iovtmp->iov_len;
                        iovtmp--;
                        iovcnttmp--;
                }
                *iovcnt = iovcnttmp;
        }
}
static void
cxgb_zero_copy_free(void *cl, void *arg)
{
        struct mbuf *m = (struct mbuf *)cl;

        /*
         * These are physical addresses; don't try to free them.  The
         * backing pages are unheld separately from sbdrop.
         */
        m_free_iovec(m, m->m_type);
}
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
{
        struct iovec *iov = uio->uio_iov;
        int iovcnt = uio->uio_iovcnt;
        int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
        uint64_t start, end;
        vm_page_t *mp;

        totbytes = totcount = 0;
        maxcount = *held;

        mp = m;
        for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
                count = maxcount - totcount;

                start = (uintptr_t)iov->iov_base;
                end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
                start &= PG_FRAME;
                end += PAGE_MASK;
                end &= PG_FRAME;
                npages = (end - start) >> PAGE_SHIFT;

                count = min(count, npages);

                err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags);
                if (err) {
                        vm_fault_unhold_pages(m, totcount);
                        return (err);
                }
                mp += count;
                totcount += count;
                curbytes = iov->iov_len;
                if (count != npages)
                        curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK);
                totbytes += curbytes;
        }
        uio->uio_resid -= totbytes;
        *held = totcount;

        return (0);
}
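/*
 * A sketch of the accounting above, as used by t3_sosend(): the caller
 * passes the maximum number of pages it may hold in *held, the routine
 * wires at most that many pages backing the iovec and returns the actual
 * count in *held, and uio_resid is reduced by the bytes those pages cover.
 * On any fault the pages held so far are released and the error returned.
 */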
/*
 * Returns whether a connection should enable DDP.  This happens when all of
 * the following conditions are met:
 * - the connection's ULP mode is DDP
 * - DDP is not already enabled
 * - the last receive was above the DDP threshold
 * - receive buffers are in user space
 * - receive side isn't shutdown (handled by caller)
 * - the connection's receive window is big enough so that sizable buffers
 *   can be posted without closing the window in the middle of DDP (checked
 *   when the connection is offloaded)
 */
static inline int
so_should_ddp(const struct toepcb *toep, int last_recv_len)
{

        DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
            toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres),
            toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));

        return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) &&
            last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
            toep->tp_tp->rcv_wnd >
            (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
}
static inline int
is_ddp(const struct mbuf *m)
{
        return ((m->m_flags & M_DDP) != 0);
}

static inline int
is_ddp_psh(const struct mbuf *m)
{
        return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0);
}
static int
m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
{
        int curlen, startlen, resid_init, err = 0;
        caddr_t buf;

        DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
            m, offset, len);

        startlen = len;
        resid_init = uio->uio_resid;

        buf = mtod(m, caddr_t);
        curlen = m->m_len;
        if (offset && (offset < curlen)) {
                /* Skip the leading "offset" bytes within this mbuf. */
                buf += offset;
                curlen -= offset;
                offset = 0;
        }
        /* ... */

        err = uiomove(buf, min(len, curlen), uio);
        if (err) {
                printf("uiomove returned %d\n", err);
                return (err);
        }

        len -= min(len, curlen);

        DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
            startlen - len, resid_init, uio->uio_resid);

        return (err);
}
/*
 * Copy data from an mbuf to an iovec.  Deals with RX_DATA, which carries
 * the data in the mbuf body, and with RX_DATA_DDP, which places the data
 * in a DDP buffer.
 */
static int
copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
{
        struct iovec *to = uio->uio_iov;
        int err;

        if (__predict_true(!is_ddp(m)))                         /* RX_DATA */
                return m_uiomove(m, offset, len, uio);
        if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) {   /* user DDP */
                to->iov_len -= len;
                to->iov_base = ((caddr_t)to->iov_base) + len;
                uio->uio_resid -= len;
                return (0);
        }
        err = t3_ddp_copy(m, offset, uio, len);                 /* kernel DDP */
        return (err);
}
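/*
 * Note the user-DDP case above: the hardware has already placed the payload
 * directly into the user buffer, so no bytes are copied here -- only the
 * iovec and uio bookkeeping is advanced past data that has, in effect,
 * already been received.
 */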
static void
cxgb_wait_dma_completion(struct toepcb *toep)
{
        struct rwlock *lock;

        lock = &toep->tp_tp->t_inpcb->inp_lock;
        inp_wlock(toep->tp_tp->t_inpcb);
        cv_wait_unlock(&toep->tp_cv, lock);
}
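/*
 * The matching wakeup is not visible in this file; presumably the tx
 * completion path signals toep->tp_cv once the zero-copy mbufs queued by
 * t3_sosend() have been DMA'd to the card.
 */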
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
        int i, seg_count, err, type;
        struct mbuf *m0;
        struct cxgb_dma_info cdi;
        struct mbuf_vec *mv;
        struct mbuf_iovec *mi;
        bus_dma_segment_t *segs;

        err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
            cxgb_dma_callback, &cdi, 0);
        if (err)
                return (err);

        seg_count = cdi.cdi_nsegs;
        if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
                bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
                return (ENOMEM);
        }
        segs = cdi.cdi_segs;
        m0->m_flags = (M_EXT|M_NOFREE);
        m0->m_ext.ext_type = EXT_EXTREF;
        m0->m_ext.ext_free = cxgb_zero_copy_free;
        m0->m_ext.ext_arg1 = NULL;      /* XXX: probably wrong /phk */
        m0->m_ext.ext_arg2 = NULL;

        mv = mtomv(m0);
        mv->mv_count = seg_count;
        for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
                mi_collapse_sge(mi, segs);

        *m = m0;

        /*
         * This appears to be a no-op at the moment: busdma is all or
         * nothing, so we need to make sure the tag values are large
         * enough.
         */
        if (cdi.cdi_mapped < uio->uio_resid) {
                uio->uio_resid -= cdi.cdi_mapped;
                /* ... */
        }

        m0->m_pkthdr.len = cdi.cdi_mapped;
        return (0);
}
static int
t3_sosend(struct socket *so, struct uio *uio)
{
        int rv, count, hold_resid, sent, iovcnt;
        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
        struct tcpcb *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        struct mbuf *m;
        struct uio uiotmp;
        struct sockbuf *snd;

        /*
         * Events requiring iteration:
         * - number of pages exceeds max hold pages for process or system
         * - number of pages exceeds maximum sg entries for a single WR
         *
         * We're limited to holding 128 pages at once - and we're limited to
         * 34 SG entries per work request, but each SG entry can be any number
         * of contiguous pages.
         */
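        /*
         * Concretely, with 4KB pages the 128-page hold limit caps a single
         * iteration at 512KB of user data; the 34 SG entries per work
         * request are less of a constraint because one entry can cover a
         * run of physically contiguous pages.
         */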
        uiotmp = *uio;
        iovcnt = uio->uio_iovcnt;
        iov = uio->uio_iov;
        sent = 0;
        snd = so_sockbuf_snd(so);
sendmore:
        /*
         * Make sure we don't exceed the socket buffer.
         * XXX the 2*PAGE_SIZE addend looks like it should be a page count
         * rather than a byte count, since sbspace() is shifted down to pages.
         */
        count = min(toep->tp_page_count,
            (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
        rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
        hold_resid = uiotmp.uio_resid;
        if (rv)
                return (rv);

        /*
         * Bump past sent and shave off the unheld amount
         */
        if (hold_resid > 0) {
                iovtmpp = iovtmp;
                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
                iov_adj(&iovtmpp, &iovcnt, sent);
                iov_adj(&iovtmpp, &iovcnt, -hold_resid);
                uiotmp.uio_iov = iovtmpp;
                uiotmp.uio_iovcnt = iovcnt;
        }
        uiotmp.uio_resid = uio->uio_resid - hold_resid;

        /*
         * Push off all held pages
         */
        while (uiotmp.uio_resid > 0) {
                rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
                if (rv) {
                        vm_fault_unhold_pages(toep->tp_pages, count);
                        return (rv);
                }
                uio->uio_resid -= m->m_pkthdr.len;
                sent += m->m_pkthdr.len;
                sbappend(snd, m);
                t3_push_frames(so, TRUE);
                iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
        }

        /*
         * Wait for pending I/O to be DMA'd to the card
         */
        cxgb_wait_dma_completion(toep);
        vm_fault_unhold_pages(toep->tp_pages, count);

        /*
         * If there is more data to send, adjust the local copy of iov
         * to point to the start of the unsent data.
         */
        if (uio->uio_resid > 0) {
                iovtmpp = iovtmp;
                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
                iov_adj(&iovtmpp, &iovcnt, sent);
                uiotmp.uio_resid = uio->uio_resid;
                uiotmp.uio_iov = iovtmpp;
                uiotmp.uio_iovcnt = iovcnt;
                goto sendmore;
        }

        return (0);
}
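/*
 * To recap the fast path above: hold the user pages backing the iovec,
 * wrap them in zero-copy mbufs, push them to the card, wait for the DMA
 * to complete, unhold the pages, and iterate until the uio is drained.
 */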
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        struct tcpcb *tp = so_sototcpcb(so);
        struct toedev *tdev;
        int zcopy_thres, zcopy_enabled, rv;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         *  - the connection is currently offloaded
         *  - the number of bytes to be transferred exceeds the threshold
         *  - the number of bytes currently in flight won't exceed the
         *    in-flight threshold
         *  - vm_fault_hold_user_pages succeeds
         *  - blocking socket XXX for now
         */
        if (tp && tp->t_flags & TF_TOE) {
                struct toepcb *toep = tp->t_toe;

                tdev = toep->tp_toedev;
                zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

                if (uio && (uio->uio_resid > zcopy_thres) &&
                    (uio->uio_iovcnt < TMP_IOV_MAX) &&
                    ((so_state_get(so) & SS_NBIO) == 0) && zcopy_enabled) {
                        rv = t3_sosend(so, uio);
                        if (rv != EAGAIN)
                                return (rv);
                }
        }
        return pru_sosend(so, addr, uio, top, control, flags, td);
}
/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{

        sockbuf_lock_assert(sb);
        /*
         * First, update for the new value of nextrecord.  If necessary, make
         * it the first record.
         */
        if (sb->sb_mb != NULL)
                sb->sb_mb->m_nextpkt = nextrecord;
        else
                sb->sb_mb = nextrecord;

        /*
         * Now update any dependent socket buffer fields to reflect the new
         * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
         * addition of a second clause that takes care of the case where
         * sb_mb has been updated, but remains the last record.
         */
        if (sb->sb_mb == NULL) {
                sb->sb_mbtail = NULL;
                sb->sb_lastrecord = NULL;
        } else if (sb->sb_mb->m_nextpkt == NULL)
                sb->sb_lastrecord = sb->sb_mb;
}
#define IS_NONBLOCKING(so)	(so_state_get(so) & SS_NBIO)
static int
t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
        struct tcpcb *tp = so_sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        struct ddp_state *p;
        struct mbuf *m;
        struct sockbuf *rcv;
        struct inpcb *inp = so_sotoinpcb(so);
        int err, flags, avail, len, copied, copied_unacked;
        int target;             /* Read at least this many bytes */
        int offset, user_ddp_ok, buffers_freed = 0, i = 0;
        int socket_state, socket_error;

        avail = offset = copied = copied_unacked = 0;
        flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
        rcv = so_sockbuf_rcv(so);

        err = sblock(rcv, SBLOCKWAIT(flags));
        p = &toep->tp_ddp_state;

        rcv = so_sockbuf_rcv(so);

        /* Punt if the connection has fallen out of offload mode. */
        if ((tp->t_flags & TF_TOE) == 0) {
                /* ... */
        }

        p->user_ddp_pending = 0;

        if ((tp->t_flags & TF_TOE) == 0) {
                /* ... */
        }

        len = uio->uio_resid;

        target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat;
        user_ddp_ok = p->ubuf_ddp_ready;
        /* empty receive queue */
        if (copied >= target && (rcv->sb_mb == NULL) &&
            !p->user_ddp_pending)
                /* ... */

        socket_state = so_state_get(so);
        socket_error = so_error_get(so);
        rcv = so_sockbuf_rcv(so);

        if (socket_error || tp->t_state == TCPS_CLOSED ||
            (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
                /* ... */

        if (socket_state & SS_NOFDREF)
                /* ... */

        if (rcv->sb_state & SBS_CANTRCVMORE)
                /* ... */

        if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
                /* ... */

        if (tp->t_state == TCPS_CLOSED) {
                /* ... */
        }

        if (rcv->sb_mb && !p->user_ddp_pending) {
                /* ... */
                t3_cleanup_rbuf(tp, copied_unacked);
                /* ... */
        }

        if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending &&
            uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
            /* ... */) {
                p->user_ddp_pending =
                    !t3_overlay_ubuf(toep, rcv, uio,
                        IS_NONBLOCKING(so), flags, 1, 1);
                if (p->user_ddp_pending) {
                        /* ... */
                }
        }

        if (p->kbuf[0] && (p->kbuf_posted == 0)) {
                t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
                /* ... */
        }

        if (p->user_ddp_pending) {
                /* One shot at DDP if we already have enough data */
                if (copied >= target)
                        /* ... */

                if (rcv->sb_state & SBS_CANTRCVMORE)
                        /* ... */

                CTR0(KTR_TOM, "ddp pending -- waiting");
                if ((err = sbwait(rcv)) != 0)
                        /* ... */
                /* for timers to work: await_ddp_completion(sk, flags, &timeo); */
        } else if (copied >= target)
                /* ... */

        if (copied_unacked) {
                /* ... */
                t3_cleanup_rbuf(tp, copied_unacked);
                /* ... */
        }

        while (i++ < 200 && rcv->sb_mb == NULL)
                /* ... */

        if (rcv->sb_state & SBS_CANTRCVMORE)
                /* ... */

        CTR0(KTR_TOM, "no buffers -- waiting");

        if ((err = sbwait(rcv)) != 0)
                /* ... */
        /*
         * Adjust the mbuf seqno if part of the mbuf has already been
         * consumed (m_len no longer matches m_pkthdr.len).
         */
        m = rcv->sb_mb;
        if (m->m_pkthdr.len != m->m_len) {
                m->m_seq += m->m_pkthdr.len - m->m_len;
                m->m_pkthdr.len = m->m_len;
        }

        CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u "
            "m_seq=0x%08x c_seq=0x%08x c_unack=%u",
            (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len,
            m->m_seq, toep->tp_copied_seq, copied_unacked);
        KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) ||
            !(m->m_flags & M_EXT),
            ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d",
            !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len,
            m->m_pkthdr.len));
        KASSERT(m->m_next != (struct mbuf *)0xffffffff,
            ("bad next value m_next=%p m_nextpkt=%p m_flags=0x%x m->m_len=%d",
            m->m_next, m->m_nextpkt, m->m_flags, m->m_len));

        if (m->m_pkthdr.len == 0) {
                if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
                        panic("empty mbuf and NOCOPY not set");
                CTR0(KTR_TOM, "ddp done notification");
                p->user_ddp_pending = 0;
                sbdroprecord_locked(rcv);
                /* ... */
        }

        KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0,
            ("offset will go negative: offset=%d copied_seq=0x%08x "
            "copied_unacked=%d m_seq=0x%08x",
            offset, toep->tp_copied_seq, copied_unacked, m->m_seq));
        offset = toep->tp_copied_seq + copied_unacked - m->m_seq;

        if (offset >= m->m_pkthdr.len)
                panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x "
                    "seq 0x%x pktlen %d ddp flags 0x%x", offset,
                    toep->tp_copied_seq + copied_unacked, m->m_seq,
                    m->m_pkthdr.len, m->m_ddp_flags);

        avail = m->m_pkthdr.len - offset;
        if (len < avail) {
                if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
                        panic("bad state in t3_soreceive len=%d avail=%d offset=%d",
                            len, avail, offset);
                avail = len;
                rcv->sb_flags |= SB_IN_TOE;
        } else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0)
                rcv->sb_flags &= ~SB_IN_TOE;
#ifdef URGENT_DATA_SUPPORTED
        /*
         * Check if the data we are preparing to copy contains urgent
         * data.  Either stop short of urgent data or skip it if it's
         * first and we are not delivering urgent data inline.
         */
        if (__predict_false(toep->tp_urg_data)) {
                uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;

                if (urg_offset < avail) {
                        if (urg_offset) {
                                /* stop short of the urgent data */
                                avail = urg_offset;
                        } else if ((so_options_get(so) & SO_OOBINLINE) == 0) {
                                /* First byte is urgent, skip */
                                toep->tp_copied_seq++;
                                offset++;
                                avail--;
                        }
                        /* ... */
                }
        }
#endif
        if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) {
                user_ddp_ok = 0;
                T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
        }

        if (user_ddp_ok && !p->user_ddp_pending &&
            uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
            /* ... */) {
                p->user_ddp_pending =
                    !t3_overlay_ubuf(toep, rcv, uio,
                        IS_NONBLOCKING(so), flags, 1, 1);
                if (p->user_ddp_pending) {
                        /* ... */
                }
                DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
        } else
                DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld "
                    "dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d "
                    "flags=0x%x ubuf=%p kbuf_posted=%d\n",
                    user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len,
                    p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
                    p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m),
                    m->m_ddp_flags, p->ubuf, p->kbuf_posted);

        /*
         * If MSG_TRUNC is specified the data is discarded.
         * XXX need to check pr_atomic
         */
        KASSERT(avail > 0, ("avail=%d resid=%d offset=%d",
            avail, uio->uio_resid, offset));
        if (__predict_true(!(flags & MSG_TRUNC))) {
                int resid = uio->uio_resid;

                if ((err = copy_data(m, offset, avail, uio))) {
                        /* ... */
                }

                if (avail != (resid - uio->uio_resid))
                        printf("didn't copy all bytes :-/ avail=%d offset=%d "
                            "pktlen=%d resid=%d uio_resid=%d copied=%d "
                            "copied_unacked=%d is_ddp(m)=%d\n",
                            avail, offset, m->m_pkthdr.len, resid,
                            uio->uio_resid, copied, copied_unacked, is_ddp(m));

                if ((tp->t_flags & TF_TOE) == 0) {
                        /* ... */
                }
        }

        copied += avail;
        copied_unacked += avail;
        len -= avail;
#ifdef URGENT_DATA_SUPPORTED
        if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
                tp->urg_data = 0;
#endif

        /*
         * If the buffer is fully consumed free it.  If it's a DDP
         * buffer also handle any events it indicates.
         */
        if (avail + offset >= m->m_pkthdr.len) {
                unsigned int fl = m->m_ddp_flags;
                int exitnow, got_psh = 0, nomoredata = 0;
                int count;
                struct mbuf *nextrecord;

                if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
                        if (is_ddp_psh(m) && p->user_ddp_pending)
                                got_psh = 1;

                        if (fl & DDP_BF_NOCOPY)
                                p->user_ddp_pending = 0;
                        else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
                                nomoredata = 1;
                                /* ... */
                        }
                        p->ubuf_ddp_ready = 1;
                }

                nextrecord = m->m_nextpkt;
                count = m->m_pkthdr.len;
                /* ... */
                KASSERT(((m->m_flags & M_EXT) &&
                    (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT),
                    ("unexpected type M_EXT=%d ext_type=%d m_len=%d",
                    !!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
                CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d",
                    m->m_len, m->m_pkthdr.len);
                if (is_ddp(m)) {
                        rcv->sb_mb = m_free(m);
                        buffers_freed++;
                        sockbuf_pushsync(rcv, nextrecord);
                } else
                        sbdrop_locked(rcv, m->m_pkthdr.len);

                exitnow = got_psh || nomoredata;
                if (copied >= target && (rcv->sb_mb == NULL) && exitnow)
                        /* ... */
                if (copied_unacked > (rcv->sb_hiwat >> 2)) {
                        /* ... */
                        t3_cleanup_rbuf(tp, copied_unacked);
                        /* ... */
                }
        }
        if ((tp->t_flags & TF_TOE) == 0) {
                /* ... */
        }

        /*
         * If we can still receive decide what to do in preparation for the
         * next receive.  Note that RCV_SHUTDOWN is set if the connection
         * transitioned to CLOSE but not if it was in that state to begin
         * with.
         */
        if (__predict_true((so_state_get(so) &
            (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
                if (p->user_ddp_pending) {
                        /* ... */
                        t3_cancel_ubuf(toep, rcv);
                        /* ... */
                        p->user_ddp_pending = 0;
                }
                if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
                        CTR0(KTR_TOM,
                            "chelsio_recvmsg: about to exit, repost kbuf");

                        t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
                        /* ... */
                } else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
                        CTR1(KTR_TOM, "entering ddp on tid=%u", toep->tp_tid);
                        if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev,
                            ddp_copy_limit), 0, IS_NONBLOCKING(so))) {
                                rcv->sb_flags |= SB_IN_TOE;
                                /* ... */
                        }
                }
        }

        CTR5(KTR_TOM,
            "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
            "kbuf_posted %d user_ddp_pending %u",
            copied, len, buffers_freed, p ? p->kbuf_posted : -1,
            p->user_ddp_pending);

        if (copied_unacked && (tp->t_flags & TF_TOE)) {
                /* ... */
                t3_cleanup_rbuf(tp, copied_unacked);
        }
        /* ... */

        return (err);
}
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
        struct toedev *tdev;
        int rv, zcopy_thres, zcopy_enabled, flags;
        struct tcpcb *tp = so_sototcpcb(so);
        struct sockbuf *rcv = so_sockbuf_rcv(so);

        flags = flagsp ? *flagsp &~ MSG_EOR : 0;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         *  - the connection is currently offloaded
         *  - the number of bytes to be transferred exceeds the threshold
         *  - the number of bytes currently in flight won't exceed the
         *    in-flight threshold
         *  - vm_fault_hold_user_pages succeeds
         *  - blocking socket XXX for now
         */
        if (tp && (tp->t_flags & TF_TOE) && uio &&
            ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0) &&
            (uio->uio_iovcnt == 1) && (mp0 == NULL)) {
                struct toepcb *toep = tp->t_toe;

                tdev = toep->tp_toedev;
                zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, ddp);
                if ((rcv->sb_flags & SB_IN_TOE) ||
                    ((uio->uio_resid > zcopy_thres) && zcopy_enabled)) {
                        CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x "
                            "t_flags=0x%x flags=0x%x uio_resid=%d",
                            rcv->sb_flags, tp->t_flags, flags, uio->uio_resid);
                        rv = t3_soreceive(so, flagsp, uio);
                        if (rv != EAGAIN)
                                return (rv);
                        else
                                printf("returned EAGAIN\n");
                }
        } else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) {
                log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
                    flags, uio->uio_iovcnt, rcv->sb_state);
        }

        return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
struct protosw cxgb_protosw;
struct pr_usrreqs cxgb_tcp_usrreqs;

void
t3_install_socket_ops(struct socket *so)
{
        static int copied = 0;
        struct pr_usrreqs *pru;
        struct protosw *psw;

        psw = so_protosw_get(so);
        pru = psw->pr_usrreqs;

        bcopy(psw, &cxgb_protosw, sizeof(*psw));
        bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru));

        cxgb_protosw.pr_ctloutput = t3_ctloutput;
        cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs;
        cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend;
        cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive;

        so_protosw_set(so, &cxgb_protosw);

        so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
        so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}
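/*
 * Usage sketch: when a connection is offloaded, calling
 * t3_install_socket_ops(so) points the socket's protosw at the local
 * copies above, whose pru_sosend/pru_soreceive are cxgb_sosend() and
 * cxgb_soreceive().  Subsequent sosend(9)/soreceive(9) calls on that
 * socket can then take the zero-copy and DDP fast paths, while every
 * other case falls through to the stock TCP usrreqs saved in
 * t3_init_socket_ops().
 */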