/**************************************************************************

Copyright (c) 2007, Chelsio Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/uio.h>

#include <machine/bus.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
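
/*
 * The stock TCP pru_sosend/pru_soreceive handlers, saved by
 * t3_init_socket_ops() so that the zero-copy wrappers below can fall
 * back to them.
 */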
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);

#define VM_HOLD_WRITEABLE	0x1

static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
    int *count, int flags);
static void vm_fault_unhold_pages(vm_page_t *m, int count);

#define TMP_IOV_MAX 16
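
/*
 * Capture the default TCP protocol switch handlers and, when
 * TCP_USRREQS_OVERLOAD is configured, redirect tcp_usrreqs to the
 * TOE-aware versions provided by the cxgb TOM module.
 */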
void
t3_init_socket_ops(void)
{
        struct protosw *prp;

        prp = pffindtype(AF_INET, SOCK_STREAM);
        pru_sosend = prp->pr_usrreqs->pru_sosend;
        pru_soreceive = prp->pr_usrreqs->pru_soreceive;
#ifdef TCP_USRREQS_OVERLOAD
        tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect;
        tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
        tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen;
        tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send;
        tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect;
        tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close;
        tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown;
        tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd;
#endif
}
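
/*
 * Results of bus_dmamap_load_uio(), recorded by cxgb_dma_callback() and
 * consumed by cxgb_vm_page_to_miov().
 */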
struct cxgb_dma_info {
        bus_size_t              cdi_mapped;
        int                     cdi_nsegs;
        bus_dma_segment_t       *cdi_segs;
};

static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
        struct cxgb_dma_info *cdi = arg;

        cdi->cdi_mapped = mapsize;
        cdi->cdi_nsegs = nsegs;
        cdi->cdi_segs = segs;
}
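
/*
 * Adjust an iovec array in place: a positive count steps past bytes
 * already consumed at the front, a negative count trims bytes off the
 * tail (for example the portion of a request that could not be held).
 */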
iov_adj(struct iovec **iov, int *iovcnt, size_t count)
        struct iovec *iovtmp;
                if (count < iovtmp->iov_len) {
                        ptmp = iovtmp->iov_base;
                        iovtmp->iov_base = ptmp;
                        iovtmp->iov_len -= count;
                        count -= iovtmp->iov_len;
        } else if (count < 0) {
                iovtmp = &(*iov)[*iovcnt - 1];
                if (-count < iovtmp->iov_len) {
                        iovtmp->iov_len += count;
                        count += iovtmp->iov_len;
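
/*
 * ext_free handler for the zero-copy mbufs built below; it is a no-op
 * because the user pages backing the data are unheld explicitly once
 * the DMA has completed.
 */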
static void
cxgb_zero_copy_free(void *cl, void *arg) {}
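
/*
 * Wire down the user pages backing the uio's iovecs (up to *held pages)
 * so that the adapter can DMA directly to or from them.
 */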
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
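
/*
 * Block until the adapter has finished DMAing everything queued on this
 * toepcb so that the pages backing the data can safely be unheld.
 */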
cxgb_wait_dma_completion(struct toepcb *tp)
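
/*
 * DMA-map the (already held) user pages described by the uio and wrap
 * the resulting segments in a zero-copy mbuf backed by an mbuf_iovec
 * cluster, returned via *m.
 */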
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
        int i, seg_count, err, type;
        struct mbuf *m0;
        struct cxgb_dma_info cdi;
        struct mbuf_iovec *mi;
        bus_dma_segment_t *segs;

        err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
            cxgb_dma_callback, &cdi, 0);
        if (err)
                return (err);

        seg_count = cdi.cdi_nsegs;
        if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
                bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
                return (ENOMEM);
        }

        m0->m_flags = (M_EXT|M_NOFREE);
        m0->m_ext.ext_type = EXT_EXTREF;
        m0->m_ext.ext_free = cxgb_zero_copy_free;
        m0->m_ext.ext_args = NULL;

        mv->mv_count = seg_count;
        for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
                mi_collapse_sge(mi, segs);

        if (cdi.cdi_mapped < uio->uio_resid) {
                uio->uio_resid -= cdi.cdi_mapped;
        }
        m0->m_pkthdr.len = cdi.cdi_mapped;
        *m = m0;
        return (0);
}
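
/*
 * Zero-copy transmit path: hold the user pages backing the uio, map
 * them into zero-copy mbufs, append those to the send buffer and push
 * them out to the adapter, then wait for the DMA to complete before
 * unholding the pages.
 */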
static int
t3_sosend(struct socket *so, struct uio *uio)
{
        int rv, count, hold_resid, sent, iovcnt;
        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        struct uio uiotmp;
        struct mbuf *m;

        /*
         * Events requiring iteration:
         * - number of pages exceeds max hold pages for process or system
         * - number of pages exceeds maximum sg entries for a single WR
         *
         * We're limited to holding 128 pages at once - and we're limited to
         * 34 SG entries per work request, but each SG entry can be any number
         * of contiguous pages
         */
        uiotmp = *uio;
        iov = uio->uio_iov;
        iovcnt = uio->uio_iovcnt;
        sent = 0;

        /*
         * Make sure we don't exceed the socket buffer
         */
        count = min(toep->tp_page_count,
            (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
        rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
        hold_resid = uiotmp.uio_resid;

        /*
         * Bump past sent and shave off the unheld amount
         */
        if (hold_resid > 0) {
                iovtmpp = iovtmp;
                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
                iov_adj(&iovtmpp, &iovcnt, sent);
                iov_adj(&iovtmpp, &iovcnt, -hold_resid);
                uiotmp.uio_iov = iovtmpp;
                uiotmp.uio_iovcnt = iovcnt;
        }
        uiotmp.uio_resid = uio->uio_resid - hold_resid;

        /*
         * Push off all held pages
         */
        while (uiotmp.uio_resid > 0) {
                rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
                if (rv) {
                        vm_fault_unhold_pages(toep->tp_pages, count);
                        return (rv);
                }
                uio->uio_resid -= m->m_pkthdr.len;
                sent += m->m_pkthdr.len;
                sbappend_locked(&so->so_snd, m);
                t3_push_frames(so, TRUE);
                iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
        }

        /*
         * Wait for pending I/O to be DMA'd to the card
         */
        cxgb_wait_dma_completion(toep);
        vm_fault_unhold_pages(toep->tp_pages, count);

        /*
         * If there is more data to send adjust local copy of iov
         * to point to the start
         */
        memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
        iov_adj(&iovtmpp, &iovcnt, sent);
        uiotmp.uio_iov = iovtmpp;
        uiotmp.uio_iovcnt = iovcnt;
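
/*
 * pru_sosend replacement: try the zero-copy transmit path when the
 * connection is offloaded and the criteria listed below are met,
 * falling back to the stock handler otherwise.
 */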
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        struct tcpcb *tp = sototcpcb(so);
        struct toedev *tdev;
        int zcopy_thres, zcopy_enabled, rv;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         * - the connection is currently offloaded
         * - the number of bytes to be transferred exceeds the threshold
         * - the number of bytes currently in flight won't exceed the in-flight
         * - vm_fault_hold_user_pages succeeds
         * - blocking socket XXX for now
         */
        if (tp->t_flags & TF_TOE) {
                zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

                if ((uio->uio_resid > zcopy_thres) &&
                    (uio->uio_iovcnt < TMP_IOV_MAX) &&
                    ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) {
                        rv = t3_sosend(so, uio);
                        if (rv != EAGAIN)
                                return (rv);
                }
        }
        return pru_sosend(so, addr, uio, top, control, flags, td);
}
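
/*
 * Zero-copy receive path: the DDP-based counterpart to t3_sosend,
 * placing data from the adapter directly into the user's buffers.
 */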
static int
t3_soreceive(struct socket *so, struct uio *uio)
{
        int i, rv, count, hold_resid, sent, iovcnt;
        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;

        /*
         * Events requiring iteration:
         * - number of pages exceeds max hold pages for process or system
         * - number of pages exceeds maximum sg entries for a single WR
         *
         * We're limited to holding 128 pages at once - and we're limited to
         * 34 SG entries per work request, but each SG entry can be any number
         * of contiguous pages
         */
        iovcnt = uio->uio_iovcnt;
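
/*
 * pru_soreceive replacement: use DDP-based zero-copy receive when the
 * connection is offloaded and the criteria listed below are met,
 * falling back to the stock handler otherwise.
 */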
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
        int rv, zcopy_thres, zcopy_enabled;
        struct tcpcb *tp = sototcpcb(so);
        struct toedev *tdev;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         * - the connection is currently offloaded
         * - the number of bytes to be transferred exceeds the threshold
         * - the number of bytes currently in flight won't exceed the in-flight
         * - vm_fault_hold_user_pages succeeds
         * - blocking socket XXX for now
         */
        if (tp->t_flags & TF_TOE) {
                zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, ddp);
                if ((uio->uio_resid > zcopy_thres) &&
                    (uio->uio_iovcnt == 1) &&
                    ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) {
                        rv = t3_soreceive(so, uio);
                        if (rv != EAGAIN)
                                return (rv);
                }
        }
        return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
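
/*
 * Point this socket's protocol usrreqs at the zero-copy aware send and
 * receive wrappers. Note that pr_usrreqs is shared by all sockets using
 * the protocol, so the override is effectively global.
 */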
void
t3_install_socket_ops(struct socket *so)
{
        so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
        so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}

/*
 * This routine takes a user address range and does the following:
 * - validate that the user has access to those pages (flags indicates
 *   read or write) - if not fail
 * - validate that count is enough to hold range number of pages - if not fail
 * - fault in any non-resident pages
 * - if the user is doing a read force a write fault for any COWed pages
 * - if the user is doing a read mark all pages as dirty
 * - return number of pages in count
 */
static int
vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
{
        vm_offset_t start, va;
        vm_paddr_t pa;
        int pageslen, faults, rv;
        struct thread *td;
        vm_map_t map;
        pmap_t pmap;
        vm_page_t m, *pages;
        vm_prot_t prot;

        start = addr & ~PAGE_MASK;
        pageslen = roundup2(addr + len, PAGE_SIZE);
        if (*count < (pageslen >> PAGE_SHIFT))
                return (EFAULT);
        *count = pageslen >> PAGE_SHIFT;
        /*
         * Check that virtual address range is legal
         * This check is somewhat bogus as on some architectures kernel
         * and user do not share VA - however, it appears that all FreeBSD
         * architectures define it
         */
        if (addr + len > VM_MAXUSER_ADDRESS)
                return (EFAULT);

        td = curthread;
        map = &td->td_proc->p_vmspace->vm_map;
        pmap = &td->td_proc->p_vmspace->vm_pmap;
        pages = mp;
        prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
        bzero(pages, sizeof(vm_page_t *) * (*count));
retry:
        /*
         * First optimistically assume that all pages are resident (and R/W if for write)
         * if so just mark pages as held (and dirty if for write) and return
         */
        vm_page_lock_queues();
        for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
                /*
                 * Assure that we only hold the page once
                 */
                if (*pages == NULL) {
                        /*
                         * page queue mutex is recursable so this is OK
                         * it would be really nice if we had an unlocked version of this so
                         * we were only acquiring the pmap lock 1 time as opposed to potentially
                         * many dozens of times
                         */
                        m = pmap_extract_and_hold(pmap, va, prot);
                        if (m == NULL) {
                                faults++;
                                continue;
                        }
                        *pages = m;
                        if (flags & VM_HOLD_WRITEABLE)
                                vm_page_dirty(m);
                }
        }
        vm_page_unlock_queues();

        if (faults == 0)
                return (0);
        /*
         * Pages either have insufficient permissions or are not present
         * trigger a fault where necessary
         */
        for (va = start; va < pageslen; va += PAGE_SIZE) {
                m = NULL;
                rv = 0;
                pa = pmap_extract(pmap, va);
                if (pa)
                        m = PHYS_TO_VM_PAGE(pa);
                if (flags & VM_HOLD_WRITEABLE) {
                        if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
                                rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
                } else if (m == NULL)
                        rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
                if (rv)
                        goto error;
        }
        goto retry;

error:
        vm_page_lock_queues();
        for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
                if (*pages)
                        vm_page_unhold(*pages);
        vm_page_unlock_queues();
        return (EFAULT);
}
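
/*
 * Drop the hold acquired on each page by vm_fault_hold_user_pages().
 */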
static void
vm_fault_unhold_pages(vm_page_t *mp, int count)
{
        KASSERT(count >= 0, ("negative count %d", count));
        vm_page_lock_queues();
        while (count--)
                vm_page_unhold(*mp++);
        vm_page_unlock_queues();
}