/**************************************************************************

Copyright (c) 2007-2008, Chelsio Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/sockbuf.h>
#include <sys/sockopt.h>
#include <sys/sockstate.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/uio.h>

#include <machine/bus.h>
#include <machine/cpu.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_config.h>
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp_offload.h>

#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_offload.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
#include <dev/cxgb/ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);
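/*
 * The stock TCP pru_sosend/pru_soreceive handlers are captured above at
 * initialization time so that the zero-copy paths below can fall back to
 * them whenever a request does not qualify for (or bails out of) the
 * offloaded fast path.
 */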
#define TMP_IOV_MAX	16
#define PG_FRAME	~PAGE_MASK
#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
void
t3_init_socket_ops(void)
{
	struct protosw *prp;

	prp = pffindtype(AF_INET, SOCK_STREAM);
	pru_sosend = prp->pr_usrreqs->pru_sosend;
	pru_soreceive = prp->pr_usrreqs->pru_soreceive;
}
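/*
 * Scratch state filled in by cxgb_dma_callback() when a uio is mapped for
 * DMA: the number of bytes actually mapped and the resulting S/G segments.
 */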
struct cxgb_dma_info {
	size_t			cdi_mapped;
	int			cdi_nsegs;
	bus_dma_segment_t	*cdi_segs;
};
static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
	struct cxgb_dma_info *cdi = arg;

	cdi->cdi_mapped = mapsize;
	cdi->cdi_nsegs = nsegs;
	cdi->cdi_segs = segs;
}
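/*
 * Adjust an iovec array in place: a positive count consumes count bytes
 * from the front (advancing *iov and shrinking *iovcnt as whole entries
 * are consumed), while a negative count trims -count bytes off the tail.
 * Used to skip over data that has already been sent and to shave off the
 * portion of a request whose pages could not be held.
 *
 * E.g., given iov = {"abc", "defg"} and count = 4, the array becomes the
 * single entry {"efg"}; with count = -2 it becomes {"abc", "de"}.
 */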
static void
iov_adj(struct iovec **iov, int *iovcnt, ssize_t count)
{
	struct iovec *iovtmp;
	int iovcnttmp;
	caddr_t ptmp;

	if (count > 0) {
		iovtmp = *iov;
		iovcnttmp = *iovcnt;
		while (count > 0) {
			if (count < iovtmp->iov_len) {
				ptmp = iovtmp->iov_base;
				ptmp += count;
				iovtmp->iov_base = ptmp;
				iovtmp->iov_len -= count;
				break;
			} else
				count -= iovtmp->iov_len;
			iovtmp++;
			iovcnttmp--;
		}
		*iov = iovtmp;
		*iovcnt = iovcnttmp;
	} else if (count < 0) {
		iovtmp = &(*iov)[*iovcnt - 1];
		iovcnttmp = *iovcnt;
		while (count < 0) {
			if (-count < iovtmp->iov_len) {
				iovtmp->iov_len += count;
				break;
			} else
				count += iovtmp->iov_len;
			iovtmp--;
			iovcnttmp--;
		}
		*iovcnt = iovcnttmp;
	}
}
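/*
 * External-buffer free routine for zero-copy send mbufs.  The "cluster"
 * here is the mbuf itself; the pages it maps are wired user pages.
 */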
static void
cxgb_zero_copy_free(void *cl, void *arg)
{
	struct mbuf *m = (struct mbuf *)cl;

	/*
	 * These are physical addresses: there is nothing to free here.
	 * The wired pages are unheld separately from sbdrop.
	 */
	m_free_iovec(m, m->m_type);
}
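/*
 * Wire down up to *held pages backing the user buffers described by the
 * uio, recording them in m[].  On success *held is set to the number of
 * pages actually held and uio_resid is reduced by the bytes they cover;
 * on failure every page held so far is released.  E.g., a 3-page hold on
 * a buffer starting 100 bytes into a page covers 3*PAGE_SIZE - 100 bytes.
 */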
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
{
	struct iovec *iov = uio->uio_iov;
	int iovcnt = uio->uio_iovcnt;
	int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
	uint64_t start, end;
	vm_page_t *mp;

	totbytes = totcount = 0;
	maxcount = *held;

	mp = m;
	for (totcount = i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
		count = maxcount - totcount;
		start = (uintptr_t)iov->iov_base;
		end = (uintptr_t)((caddr_t)iov->iov_base + iov->iov_len);
		start &= PG_FRAME;
		end = (end + PAGE_MASK) & PG_FRAME;
		npages = (end - start) >> PAGE_SHIFT;

		count = min(count, npages);

		err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags);
		if (err) {
			vm_fault_unhold_pages(m, totcount);
			return (err);
		}
		mp += count;
		totcount += count;
		curbytes = iov->iov_len;
		if (count != npages)
			curbytes = count*PAGE_SIZE - (((uintptr_t)iov->iov_base)&PAGE_MASK);
		totbytes += curbytes;
	}
	uio->uio_resid -= totbytes;
	*held = totcount;
	return (0);
}
/*
 * Returns whether a connection should enable DDP.  This happens when all of
 * the following conditions are met:
 * - the connection's ULP mode is DDP
 * - DDP is not already enabled
 * - the last receive was above the DDP threshold
 * - receive buffers are in user space
 * - receive side isn't shutdown (handled by caller)
 * - the connection's receive window is big enough so that sizable buffers
 *   can be posted without closing the window in the middle of DDP (checked
 *   when the connection is offloaded)
 */
static int
so_should_ddp(const struct toepcb *toep, int last_recv_len)
{
	DPRINTF("ulp_mode=%d last_recv_len=%d ddp_thresh=%d rcv_wnd=%ld ddp_copy_limit=%d\n",
	    toep->tp_ulp_mode, last_recv_len, TOM_TUNABLE(toep->tp_toedev, ddp_thres),
	    toep->tp_tp->rcv_wnd, (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN));

	return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.kbuf[0] == NULL) &&
	    last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
	    toep->tp_tp->rcv_wnd >
	    (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
}
static __inline int
is_ddp(const struct mbuf *m)
{
	return ((m->m_flags & M_DDP) != 0);
}

static __inline int
is_ddp_psh(const struct mbuf *m)
{
	return ((is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH)) != 0);
}
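/*
 * Copy up to len bytes from an mbuf chain into a uio, skipping the first
 * offset bytes within the chain.
 */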
static int
m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	int curlen, startlen, resid_init, err = 0;
	caddr_t buf;

	DPRINTF("m_uiomove(m=%p, offset=%d, len=%d, ...)\n",
	    m, offset, len);

	startlen = len;
	resid_init = uio->uio_resid;
	while (m && len) {
		buf = mtod(m, caddr_t);
		curlen = m->m_len;
		if (offset && (offset < curlen)) {
			buf += offset;
			curlen -= offset;
			offset = 0;
		} else if (offset) {
			offset -= curlen;
			m = m->m_next;
			continue;
		}
		err = uiomove(buf, min(len, curlen), uio);
		if (err) {
			printf("uiomove returned %d\n", err);
			return (err);
		}

		len -= min(len, curlen);
		m = m->m_next;
	}
	DPRINTF("copied %d bytes - resid_init=%d uio_resid=%d\n",
	    startlen - len, resid_init, uio->uio_resid);
	return (err);
}
/*
 * Copy data from an mbuf chain to an iovec.  Deals with RX_DATA, which
 * carries the data in the mbuf body, and with RX_DATA_DDP, which places
 * the data in a DDP buffer.
 */
static __inline int
copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
{
	struct iovec *to = uio->uio_iov;
	int err;

	if (__predict_true(!is_ddp(m)))				/* RX_DATA */
		return m_uiomove(m, offset, len, uio);
	if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) {	/* user DDP */
		to->iov_len -= len;
		to->iov_base = ((caddr_t)to->iov_base) + len;
		uio->uio_iov = to;
		uio->uio_resid -= len;
		return (0);
	}
	err = t3_ddp_copy(m, offset, uio, len);			/* kernel DDP */
	return (err);
}
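/*
 * Block until the card has finished DMAing all data previously queued on
 * this connection.  tp_cv is signaled once pending I/O completes; the
 * inpcb lock is handed to cv_wait_unlock() so it is dropped while asleep.
 */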
static void
cxgb_wait_dma_completion(struct toepcb *toep)
{
	struct rwlock *lock;

	lock = &toep->tp_tp->t_inpcb->inp_lock;
	inp_wlock(toep->tp_tp->t_inpcb);
	cv_wait_unlock(&toep->tp_cv, lock);
}
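/*
 * Map the (already wired) user pages described by the uio for DMA and wrap
 * the resulting scatter/gather segments in a zero-copy iovec mbuf.  On
 * return *m holds the mbuf and uio_resid has been advanced past the mapped
 * bytes.
 */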
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
	int i, seg_count, err, type;
	struct mbuf *m0;
	struct cxgb_dma_info cdi;
	struct mbuf_vec *mv;
	struct mbuf_iovec *mi;
	bus_dma_segment_t *segs;

	err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
	    cxgb_dma_callback, &cdi, 0);

	if (err)
		return (err);
	seg_count = cdi.cdi_nsegs;
	if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
		bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
		return (ENOMEM);
	}
	segs = cdi.cdi_segs;
	m0->m_type = type;
	m0->m_flags = (M_EXT|M_NOFREE);
	m0->m_ext.ext_type = EXT_EXTREF;
	m0->m_ext.ext_free = cxgb_zero_copy_free;
	m0->m_ext.ext_args = NULL;	/* XXX: probably wrong /phk */

	mv = mtomv(m0);
	mv->mv_count = seg_count;
	for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
		mi_collapse_sge(mi, segs);

	*m = m0;

	/*
	 * This appears to be a no-op at the moment: busdma mapping is all
	 * or nothing, so the residual should always be fully consumed.
	 * Need to make sure the tag values are large enough.
	 */
	if (cdi.cdi_mapped < uio->uio_resid) {
		uio->uio_resid -= cdi.cdi_mapped;
	} else
		uio->uio_resid = 0;

	m0->m_pkthdr.len = cdi.cdi_mapped;
	m0->m_nextpkt = NULL;

	return (0);
}
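/*
 * Zero-copy send path: wire down as many user pages as the socket buffer
 * allows, map them for DMA, queue them to the card as iovec mbufs, and
 * loop until the entire uio has been transmitted and DMA'd.
 */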
static int
t3_sosend(struct socket *so, struct uio *uio)
{
	int rv, count, hold_resid, sent, iovcnt;
	struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	struct uio uiotmp;
	struct sockbuf *snd;

	/*
	 * Events requiring iteration:
	 * - number of pages exceeds max hold pages for process or system
	 * - number of pages exceeds maximum sg entries for a single WR
	 *
	 * We're limited to holding 128 pages at once - and we're limited to
	 * 34 SG entries per work request, but each SG entry can be any number
	 * of contiguous pages.
	 */
	uiotmp = *uio;
	iovcnt = uio->uio_iovcnt;
	iov = uio->uio_iov;
	sent = 0;
	snd = so_sockbuf_snd(so);
sendmore:
	/*
	 * Make sure we don't exceed the socket buffer.
	 */
	count = min(toep->tp_page_count, (sockbuf_sbspace(snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
	rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
	hold_resid = uiotmp.uio_resid;
	if (rv)
		return (rv);

	/*
	 * Bump past sent and shave off the unheld amount.
	 */
	if (hold_resid > 0) {
		iovtmpp = iovtmp;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		iov_adj(&iovtmpp, &iovcnt, sent);
		iov_adj(&iovtmpp, &iovcnt, -hold_resid);
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
	}
	uiotmp.uio_resid = uio->uio_resid - hold_resid;

	/*
	 * Push off all held pages.
	 */
	while (uiotmp.uio_resid > 0) {
		rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
		if (rv) {
			vm_fault_unhold_pages(toep->tp_pages, count);
			return (rv);
		}
		uio->uio_resid -= m->m_pkthdr.len;
		sent += m->m_pkthdr.len;
		sbappend(snd, m);
		t3_push_frames(so, TRUE);
		iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
	}

	/*
	 * Wait for pending I/O to be DMA'd to the card.
	 */
	cxgb_wait_dma_completion(toep);
	vm_fault_unhold_pages(toep->tp_pages, count);
	/*
	 * If there is more data to send, adjust the local copy of the iov
	 * to point to the start of the unsent data.
	 */
	if (hold_resid) {
		iovtmpp = iovtmp;
		memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
		iov_adj(&iovtmpp, &iovcnt, sent);
		uiotmp.uio_iov = iovtmpp;
		uiotmp.uio_iovcnt = iovcnt;
		goto sendmore;
	}

	return (0);
}
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toedev *tdev;
	int zcopy_thres, zcopy_enabled, rv;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 *  - the connection is currently offloaded
	 *  - zero-copy send is enabled
	 *  - the number of bytes to be transferred exceeds the threshold
	 *  - the number of bytes currently in flight won't exceed the in-flight
	 *    threshold XXX TODO
	 *  - vm_fault_hold_user_pages succeeds
	 *  - blocking socket XXX for now
	 */
	if (tp && tp->t_flags & TF_TOE) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

		if (uio && (uio->uio_resid > zcopy_thres) &&
		    (uio->uio_iovcnt < TMP_IOV_MAX) && ((so_state_get(so) & SS_NBIO) == 0)
		    && zcopy_enabled) {
			rv = t3_sosend(so, uio);
			if (rv != EAGAIN)
				return (rv);
		}
	}
	return pru_sosend(so, addr, uio, top, control, flags, td);
}
/*
 * Following replacement or removal of the first mbuf on the first mbuf chain
 * of a socket buffer, push necessary state changes back into the socket
 * buffer so that other consumers see the values consistently.  'nextrecord'
 * is the caller's locally stored value of the original value of
 * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
 * NOTE: 'nextrecord' may be NULL.
 */
static __inline void
sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
	sockbuf_lock_assert(sb);
	/*
	 * First, update for the new value of nextrecord.  If necessary, make
	 * it the first record.
	 */
	if (sb->sb_mb != NULL)
		sb->sb_mb->m_nextpkt = nextrecord;
	else
		sb->sb_mb = nextrecord;

	/*
	 * Now update any dependent socket buffer fields to reflect the new
	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
	 * addition of a second clause that takes care of the case where
	 * sb_mb has been updated, but remains the last record.
	 */
	if (sb->sb_mb == NULL) {
		sb->sb_mbtail = NULL;
		sb->sb_lastrecord = NULL;
	} else if (sb->sb_mb->m_nextpkt == NULL)
		sb->sb_lastrecord = sb->sb_mb;
}
#define IS_NONBLOCKING(so)	(so_state_get(so) & SS_NBIO)
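/*
 * DDP-aware receive path.  Data may arrive as ordinary RX_DATA mbufs, as
 * kernel DDP buffers that still require one copy, or directly in a posted
 * user buffer (zero-copy).  Returns EAGAIN if the connection loses its
 * offload state mid-receive so the caller can fall back to the stock
 * soreceive.
 */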
static int
t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
	struct tcpcb *tp = so_sototcpcb(so);
	struct toepcb *toep = tp->t_toe;
	struct mbuf *m;
	uint32_t offset;
	int err, flags, avail, len, copied, copied_unacked;
	int target;		/* Read at least this many bytes */
	int user_ddp_ok;
	struct ddp_state *p;
	struct inpcb *inp = so_sotoinpcb(so);
	int socket_state, socket_error;
	struct sockbuf *rcv;

	avail = offset = copied = copied_unacked = 0;
	flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
	rcv = so_sockbuf_rcv(so);

	err = sblock(rcv, SBLOCKWAIT(flags));
	p = &toep->tp_ddp_state;

	if (err)
		return (err);

	rcv = so_sockbuf_rcv(so);
	sockbuf_lock(rcv);
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	p->user_ddp_pending = 0;
restart:
	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}

	len = uio->uio_resid;
	m = rcv->sb_mb;
	target = (flags & MSG_WAITALL) ? len : rcv->sb_lowat;
	user_ddp_ok = p->ubuf_ddp_ready;
	p->cancel_ubuf = 0;

	if (len == 0)
		goto done;
	if (m)
		goto got_mbuf;
	/* empty receive queue */
	if (copied >= target && (rcv->sb_mb == NULL) &&
	    !p->user_ddp_pending)
		goto done;

	socket_state = so_state_get(so);
	socket_error = so_error_get(so);
	rcv = so_sockbuf_rcv(so);

	if (copied) {
		if (socket_error || tp->t_state == TCPS_CLOSED ||
		    (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
			goto done;
	} else {
		if (socket_state & SS_NOFDREF)
			goto done;
		if (socket_error) {
			err = socket_error;
			goto done;
		}
		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;
		if (socket_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			goto done;
		if (tp->t_state == TCPS_CLOSED) {
			err = ENOTCONN;
			goto done;
		}
	}
	if (rcv->sb_mb && !p->user_ddp_pending) {
		sockbuf_unlock(rcv);
		inp_wlock(inp);
		t3_cleanup_rbuf(tp, copied_unacked);
		inp_wunlock(inp);
		sockbuf_lock(rcv);
		copied_unacked = 0;
		goto restart;
	}
	if (p->kbuf[0] && user_ddp_ok && !p->user_ddp_pending &&
	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
	    p->ubuf_ddp_ready) {
		p->user_ddp_pending =
		    !t3_overlay_ubuf(toep, rcv, uio,
			IS_NONBLOCKING(so), flags, 1, 1);
		if (p->user_ddp_pending) {
			p->kbuf_posted++;
			user_ddp_ok = 0;
		}
	}
	if (p->kbuf[0] && (p->kbuf_posted == 0)) {
		t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
		p->kbuf_posted++;
	}
	if (p->user_ddp_pending) {
		/* One shot at DDP if we already have enough data */
		if (copied >= target)
			user_ddp_ok = 0;

		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;
		CTR0(KTR_TOM, "ddp pending -- waiting");
		if ((err = sbwait(rcv)) != 0)
			goto done;
		/* XXX for timers to work: await_ddp_completion(so, flags, &timeo); */
	} else if (copied >= target)
		goto done;
	else {
		if (copied_unacked) {
			int i = 0;

			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			copied_unacked = 0;
			while (i++ < 200 && rcv->sb_mb == NULL)
				cpu_spinwait();
			sockbuf_lock(rcv);
		}
		if (rcv->sb_mb)
			goto restart;

		if (rcv->sb_state & SBS_CANTRCVMORE)
			goto done;

		CTR0(KTR_TOM, "no buffers -- waiting");

		if ((err = sbwait(rcv)) != 0)
			goto done;
	}
	goto restart;
got_mbuf:
	/*
	 * Adjust the mbuf seqno if it has already been partially processed by
	 * soreceive_generic().
	 */
	if (m->m_pkthdr.len != m->m_len) {
		m->m_seq += m->m_pkthdr.len - m->m_len;
		m->m_pkthdr.len = m->m_len;
	}

	CTR6(KTR_TOM, "t3_soreceive: ddp_flags=0x%x m_len=%u resid=%u "
	    "m_seq=0x%08x c_seq=0x%08x c_unack=%u",
	    (is_ddp(m) ? m->m_ddp_flags : 0), m->m_pkthdr.len, len,
	    m->m_seq, toep->tp_copied_seq, copied_unacked);
	KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT),
	    ("unexpected type M_EXT=%d ext_type=%d m_len=%d m_pktlen=%d\n", !!(m->m_flags & M_EXT),
		m->m_ext.ext_type, m->m_len, m->m_pkthdr.len));
	KASSERT(m->m_next != (struct mbuf *)0xffffffff, ("bad next value m_next=%p m_nextpkt=%p"
		" m_flags=0x%x m->m_len=%d", m->m_next, m->m_nextpkt, m->m_flags, m->m_len));
	if (m->m_pkthdr.len == 0) {
		if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
			panic("empty mbuf and NOCOPY not set\n");
		CTR0(KTR_TOM, "ddp done notification");
		p->user_ddp_pending = 0;
		sbdroprecord_locked(rcv);
		goto done;
	}

	KASSERT((int32_t)(toep->tp_copied_seq + copied_unacked - m->m_seq) >= 0,
	    ("offset will go negative: offset=%d copied_seq=0x%08x copied_unacked=%d m_seq=0x%08x",
		offset, toep->tp_copied_seq, copied_unacked, m->m_seq));
	offset = toep->tp_copied_seq + copied_unacked - m->m_seq;

	if (offset >= m->m_pkthdr.len)
		panic("t3_soreceive: OFFSET >= LEN offset %d copied_seq 0x%x "
		    "seq 0x%x pktlen %d ddp flags 0x%x", offset,
		    toep->tp_copied_seq + copied_unacked, m->m_seq,
		    m->m_pkthdr.len, m->m_ddp_flags);

	avail = m->m_pkthdr.len - offset;
	if (len < avail) {
		if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
			panic("bad state in t3_soreceive len=%d avail=%d offset=%d\n", len, avail, offset);
		avail = len;
		rcv->sb_flags |= SB_IN_TOE;
	} else if (p->kbuf_posted == 0 && p->user_ddp_pending == 0)
		rcv->sb_flags &= ~SB_IN_TOE;
#ifdef URGENT_DATA_SUPPORTED
	/*
	 * Check if the data we are preparing to copy contains urgent
	 * data.  Either stop short of urgent data or skip it if it's
	 * first and we are not delivering urgent data inline.
	 */
	if (__predict_false(toep->tp_urg_data)) {
		uint32_t urg_offset = tp->rcv_up - tp->copied_seq + copied_unacked;

		if (urg_offset < avail) {
			if (urg_offset) {
				/* stop short of the urgent data */
				avail = urg_offset;
			} else if ((so_options_get(so) & SO_OOBINLINE) == 0) {
				/* First byte is urgent, skip */
				toep->tp_copied_seq++;
				offset++;
				avail--;
				if (avail == 0)
					goto skip_copy;
			}
		}
	}
#endif
	if (is_ddp_psh(m) || offset || (rcv->sb_mb && !is_ddp(m))) {
		user_ddp_ok = 0;
#ifdef T3_TRACE
		T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
#endif
	}

	if (user_ddp_ok && !p->user_ddp_pending &&
	    uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
	    p->ubuf_ddp_ready) {
		p->user_ddp_pending =
		    !t3_overlay_ubuf(toep, rcv, uio,
			IS_NONBLOCKING(so), flags, 1, 1);
		if (p->user_ddp_pending) {
			p->kbuf_posted++;
			user_ddp_ok = 0;
		}
		DPRINTF("user_ddp_pending=%d\n", p->user_ddp_pending);
	} else
		DPRINTF("user_ddp_ok=%d user_ddp_pending=%d iov_len=%ld dgl_length=%d ubuf_ddp_ready=%d ulp_mode=%d is_ddp(m)=%d flags=0x%x ubuf=%p kbuf_posted=%d\n",
		    user_ddp_ok, p->user_ddp_pending, uio->uio_iov->iov_len, p->kbuf[0] ? p->kbuf[0]->dgl_length : 0,
		    p->ubuf_ddp_ready, toep->tp_ulp_mode, !!is_ddp(m), m->m_ddp_flags, p->ubuf, p->kbuf_posted);

	/*
	 * If MSG_TRUNC is specified the data is discarded.
	 * XXX need to check pr_atomic
	 */
	KASSERT(avail > 0, ("avail=%d resid=%d offset=%d", avail, uio->uio_resid, offset));
	if (__predict_true(!(flags & MSG_TRUNC))) {
		int resid = uio->uio_resid;

		sockbuf_unlock(rcv);
		if ((err = copy_data(m, offset, avail, uio))) {
			err = EFAULT;
			goto done_unlocked;
		}

		sockbuf_lock(rcv);
		if (avail != (resid - uio->uio_resid))
			printf("didn't copy all bytes :-/ avail=%d offset=%d pktlen=%d resid=%d uio_resid=%d copied=%d copied_unacked=%d is_ddp(m)=%d\n",
			    avail, offset, m->m_pkthdr.len, resid, uio->uio_resid, copied, copied_unacked, is_ddp(m));

		if ((tp->t_flags & TF_TOE) == 0) {
			sockbuf_unlock(rcv);
			err = EAGAIN;
			goto done_unlocked;
		}
	}

	copied += avail;
	copied_unacked += avail;
	len -= avail;
#ifdef URGENT_DATA_SUPPORTED
skip_copy:
	if (tp->urg_data && after(tp->copied_seq + copied_unacked, tp->urg_seq))
		tp->urg_data = 0;
#endif
	/*
	 * If the buffer is fully consumed free it.  If it's a DDP
	 * buffer also handle any events it indicates.
	 */
	if (avail + offset >= m->m_pkthdr.len) {
		unsigned int fl = m->m_ddp_flags;
		int count;
		int exitnow, got_psh = 0, nomoredata = 0;
		struct mbuf *nextrecord;

		if (p->kbuf[0] != NULL && is_ddp(m) && (fl & 1)) {
			if (is_ddp_psh(m) && p->user_ddp_pending)
				got_psh = 1;

			if (fl & DDP_BF_NOCOPY)
				p->user_ddp_pending = 0;
			else if ((fl & DDP_BF_NODATA) && IS_NONBLOCKING(so)) {
				p->kbuf_posted--;
				nomoredata = 1;
			}
		}

		nextrecord = m->m_nextpkt;
		count = m->m_pkthdr.len;
		while (count > 0) {
			count -= m->m_len;
			KASSERT(((m->m_flags & M_EXT) && (m->m_ext.ext_type == EXT_EXTREF)) || !(m->m_flags & M_EXT),
			    ("unexpected type M_EXT=%d ext_type=%d m_len=%d\n",
				!!(m->m_flags & M_EXT), m->m_ext.ext_type, m->m_len));
			CTR2(KTR_TOM, "freeing mbuf m_len = %d pktlen = %d", m->m_len, m->m_pkthdr.len);
			sbfree(rcv, m);
			rcv->sb_mb = m_free(m);
			m = rcv->sb_mb;
		}
		sockbuf_pushsync(rcv, nextrecord);
#if 0
		/* record already freed above, so sbdrop_locked() must not run */
		sbdrop_locked(rcv, m->m_pkthdr.len);
#endif
		exitnow = got_psh || nomoredata;
		if (copied >= target && (rcv->sb_mb == NULL) && exitnow)
			goto done;
		if (copied_unacked > (rcv->sb_hiwat >> 2)) {
			sockbuf_unlock(rcv);
			inp_wlock(inp);
			t3_cleanup_rbuf(tp, copied_unacked);
			inp_wunlock(inp);
			copied_unacked = 0;
			sockbuf_lock(rcv);
		}
	}
	if (len > 0)
		goto restart;

	if ((tp->t_flags & TF_TOE) == 0) {
		sockbuf_unlock(rcv);
		err = EAGAIN;
		goto done_unlocked;
	}
done:
	/*
	 * If we can still receive decide what to do in preparation for the
	 * next receive.  Note that RCV_SHUTDOWN is set if the connection
	 * transitioned to CLOSE but not if it was in that state to begin with.
	 */
	if (__predict_true((so_state_get(so) & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
		if (p->user_ddp_pending) {
			user_ddp_ok = 0;
			t3_cancel_ubuf(toep, rcv);
			if (rcv->sb_mb) {
				if (copied < 0)
					copied = 0;
				if (len > 0)
					goto restart;
			}
			p->user_ddp_pending = 0;
		}
		if ((p->kbuf[0] != NULL) && (p->kbuf_posted == 0)) {
#ifdef T3_TRACE
			T3_TRACE0(TIDTB(so),
			    "chelsio_recvmsg: about to exit, repost kbuf");
#endif
			t3_post_kbuf(toep, 1, IS_NONBLOCKING(so));
			p->kbuf_posted++;
		} else if (so_should_ddp(toep, copied) && uio->uio_iovcnt == 1) {
			CTR1(KTR_TOM, "entering ddp on tid=%u", toep->tp_tid);
			if (!t3_enter_ddp(toep, TOM_TUNABLE(toep->tp_toedev,
				    ddp_copy_limit), 0, IS_NONBLOCKING(so))) {
				rcv->sb_flags |= SB_IN_TOE;
				p->kbuf_posted = 1;
			}
		}
	}
#ifdef T3_TRACE
	T3_TRACE5(TIDTB(so),
	    "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
	    "kbuf_posted %d user_ddp_pending %u",
	    copied, len, buffers_freed, p ? p->kbuf_posted : -1,
	    p->user_ddp_pending);
#endif
	sockbuf_unlock(rcv);
done_unlocked:
	if (copied_unacked && (tp->t_flags & TF_TOE)) {
		inp_wlock(inp);
		t3_cleanup_rbuf(tp, copied_unacked);
		inp_wunlock(inp);
	}
	sbunlock(rcv);

	return (err);
}
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct toedev *tdev;
	int rv, zcopy_thres, zcopy_enabled, flags;
	struct tcpcb *tp = so_sototcpcb(so);
	struct sockbuf *rcv = so_sockbuf_rcv(so);

	flags = flagsp ? *flagsp &~ MSG_EOR : 0;

	/*
	 * In order to use DMA direct from userspace the following
	 * conditions must be met:
	 *  - the connection is currently offloaded
	 *  - ddp is enabled
	 *  - the number of bytes to be transferred exceeds the threshold
	 *  - the number of bytes currently in flight won't exceed the in-flight
	 *    threshold XXX TODO
	 *  - vm_fault_hold_user_pages succeeds
	 *  - blocking socket XXX for now
	 *  - iovcnt is 1
	 */
	if (tp && (tp->t_flags & TF_TOE) && uio && ((flags & (MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
	    && (uio->uio_iovcnt == 1) && (mp0 == NULL)) {
		struct toepcb *toep = tp->t_toe;

		tdev = toep->tp_toedev;
		zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
		zcopy_enabled = TOM_TUNABLE(tdev, ddp);
		if ((rcv->sb_flags & SB_IN_TOE) || ((uio->uio_resid > zcopy_thres) &&
			zcopy_enabled)) {
			CTR4(KTR_TOM, "cxgb_soreceive: sb_flags=0x%x t_flags=0x%x flags=0x%x uio_resid=%d",
			    rcv->sb_flags, tp->t_flags, flags, uio->uio_resid);
			rv = t3_soreceive(so, flagsp, uio);
			if (rv != EAGAIN)
				return (rv);
			else
				printf("returned EAGAIN\n");
		}
	} else if (tp && (tp->t_flags & TF_TOE) && uio && mp0 == NULL) {
		log(LOG_INFO, "skipping t3_soreceive flags=0x%x iovcnt=%d sb_state=0x%x\n",
		    flags, uio->uio_iovcnt, rcv->sb_state);
	}

	return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
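/*
 * Private copies of the protocol switch and usrreqs, patched to point at
 * the zero-copy send/receive routines above and installed per socket by
 * t3_install_socket_ops().
 */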
struct protosw cxgb_protosw;
struct pr_usrreqs cxgb_tcp_usrreqs;

void
t3_install_socket_ops(struct socket *so)
{
	static int copied = 0;
	struct pr_usrreqs *pru;
	struct protosw *psw;

	if (copied == 0) {
		psw = so_protosw_get(so);
		pru = psw->pr_usrreqs;

		bcopy(psw, &cxgb_protosw, sizeof(*psw));
		bcopy(pru, &cxgb_tcp_usrreqs, sizeof(*pru));

		cxgb_protosw.pr_ctloutput = t3_ctloutput;
		cxgb_protosw.pr_usrreqs = &cxgb_tcp_usrreqs;
		cxgb_tcp_usrreqs.pru_sosend = cxgb_sosend;
		cxgb_tcp_usrreqs.pru_soreceive = cxgb_soreceive;
		copied = 1;
	}
	so_protosw_set(so, &cxgb_protosw);

	/* Redundant after so_protosw_set(), but harmless. */
	so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
	so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}