/**************************************************************************

Copyright (c) 2007, Chelsio Inc.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Chelsio Corporation nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/uio.h>

#include <machine/bus.h>

#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>

#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>

#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_offload.h>

#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
#include <dev/cxgb/common/cxgb_tcb.h>
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <dev/cxgb/sys/mvec.h>
#include <dev/cxgb/ulp/toecore/cxgb_toedev.h>
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
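
/*
 * The stock TCP pru_sosend/pru_soreceive handlers, saved by
 * t3_init_socket_ops() so that the zero-copy wrappers below can fall
 * back to them.
 */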
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top, struct mbuf *control,
    int flags, struct thread *td);

static int (*pru_soreceive)(struct socket *so, struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
    int *flagsp);

#define VM_HOLD_WRITEABLE	0x1

static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
    int *count, int flags);
static void vm_fault_unhold_pages(vm_page_t *m, int count);

#define TMP_IOV_MAX 16
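
/*
 * Capture the default TCP protocol switch handlers and, when
 * TCP_USRREQS_OVERLOAD is configured, redirect tcp_usrreqs to the
 * TOE-aware versions provided by the cxgb TOM module.
 */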
void
t3_init_socket_ops(void)
{
        struct protosw *prp;

        prp = pffindtype(AF_INET, SOCK_STREAM);
        pru_sosend = prp->pr_usrreqs->pru_sosend;
        pru_soreceive = prp->pr_usrreqs->pru_soreceive;
#ifdef TCP_USRREQS_OVERLOAD
        tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect;
        tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
        tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen;
        tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send;
        tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect;
        tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close;
        tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown;
        tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd;
#endif
}
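
/*
 * Results of bus_dmamap_load_uio(), recorded by cxgb_dma_callback() and
 * consumed by cxgb_vm_page_to_miov().
 */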
struct cxgb_dma_info {
        bus_size_t              cdi_mapped;
        int                     cdi_nsegs;
        bus_dma_segment_t       *cdi_segs;
};

static void
cxgb_dma_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
    bus_size_t mapsize, int error)
{
        struct cxgb_dma_info *cdi = arg;

        cdi->cdi_mapped = mapsize;
        cdi->cdi_nsegs = nsegs;
        cdi->cdi_segs = segs;
}
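
/*
 * Adjust an iovec array in place: a positive count steps past bytes
 * already consumed at the front, a negative count trims bytes off the
 * tail (for example the portion of a request that could not be held).
 */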
iov_adj(struct iovec **iov, int *iovcnt, size_t count)
        struct iovec *iovtmp;
                if (count < iovtmp->iov_len) {
                        ptmp = iovtmp->iov_base;
                        iovtmp->iov_base = ptmp;
                        iovtmp->iov_len -= count;
                        count -= iovtmp->iov_len;
        } else if (count < 0) {
                iovtmp = &(*iov)[*iovcnt - 1];
                if (-count < iovtmp->iov_len) {
                        iovtmp->iov_len += count;
                        count += iovtmp->iov_len;
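
/*
 * ext_free handler for the zero-copy mbufs built below; it is a no-op
 * because the user pages backing the data are unheld explicitly once
 * the DMA has completed.
 */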
static void
cxgb_zero_copy_free(void *cl, void *arg) {}
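
/*
 * Wire down the user pages backing the uio's iovecs (up to *held pages)
 * so that the adapter can DMA directly to or from them.
 */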
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
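
/*
 * Block until the adapter has finished DMAing everything queued on this
 * toepcb so that the pages backing the data can safely be unheld.
 */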
cxgb_wait_dma_completion(struct toepcb *tp)
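
/*
 * DMA-map the (already held) user pages described by the uio and wrap
 * the resulting segments in a zero-copy mbuf backed by an mbuf_iovec
 * cluster, returned via *m.
 */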
static int
cxgb_vm_page_to_miov(struct toepcb *toep, struct uio *uio, struct mbuf **m)
{
        int i, seg_count, err, type;
        struct mbuf *m0;
        struct cxgb_dma_info cdi;
        struct mbuf_iovec *mi;
        bus_dma_segment_t *segs;

        err = bus_dmamap_load_uio(toep->tp_tx_dmat, toep->tp_dmamap, uio,
            cxgb_dma_callback, &cdi, 0);
        if (err)
                return (err);

        seg_count = cdi.cdi_nsegs;
        if ((m0 = mcl_alloc(seg_count, &type)) == NULL) {
                bus_dmamap_unload(toep->tp_tx_dmat, toep->tp_dmamap);
                return (ENOMEM);
        }

        m0->m_flags = (M_EXT|M_NOFREE);
        m0->m_ext.ext_type = EXT_EXTREF;
        m0->m_ext.ext_free = cxgb_zero_copy_free;
        m0->m_ext.ext_args = NULL;

        mv->mv_count = seg_count;
        for (i = 0, mi = mv->mv_vec; i < seg_count; mi++, segs++, i++)
                mi_collapse_sge(mi, segs);

        if (cdi.cdi_mapped < uio->uio_resid) {
                uio->uio_resid -= cdi.cdi_mapped;
        }
        m0->m_pkthdr.len = cdi.cdi_mapped;
        *m = m0;
        return (0);
}
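
/*
 * Zero-copy transmit path: hold the user pages backing the uio, map
 * them into zero-copy mbufs, append those to the send buffer and push
 * them out to the adapter, then wait for the DMA to complete before
 * unholding the pages.
 */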
static int
t3_sosend(struct socket *so, struct uio *uio)
{
        int rv, count, hold_resid, sent, iovcnt;
        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;
        struct uio uiotmp;
        struct mbuf *m;

        /*
         * Events requiring iteration:
         * - number of pages exceeds max hold pages for process or system
         * - number of pages exceeds maximum sg entries for a single WR
         *
         * We're limited to holding 128 pages at once - and we're limited to
         * 34 SG entries per work request, but each SG entry can be any number
         * of contiguous pages
         */
        uiotmp = *uio;
        iov = uio->uio_iov;
        iovcnt = uio->uio_iovcnt;
        sent = 0;

        /*
         * Make sure we don't exceed the socket buffer
         */
        count = min(toep->tp_page_count,
            (sbspace(&so->so_snd) >> PAGE_SHIFT) + 2*PAGE_SIZE);
        rv = cxgb_hold_iovec_pages(&uiotmp, toep->tp_pages, &count, 0);
        hold_resid = uiotmp.uio_resid;

        /*
         * Bump past sent and shave off the unheld amount
         */
        if (hold_resid > 0) {
                iovtmpp = iovtmp;
                memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
                iov_adj(&iovtmpp, &iovcnt, sent);
                iov_adj(&iovtmpp, &iovcnt, -hold_resid);
                uiotmp.uio_iov = iovtmpp;
                uiotmp.uio_iovcnt = iovcnt;
        }
        uiotmp.uio_resid = uio->uio_resid - hold_resid;

        /*
         * Push off all held pages
         */
        while (uiotmp.uio_resid > 0) {
                rv = cxgb_vm_page_to_miov(toep, &uiotmp, &m);
                if (rv) {
                        vm_fault_unhold_pages(toep->tp_pages, count);
                        return (rv);
                }
                uio->uio_resid -= m->m_pkthdr.len;
                sent += m->m_pkthdr.len;
                sbappend_locked(&so->so_snd, m);
                t3_push_frames(so, TRUE);
                iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
        }

        /*
         * Wait for pending I/O to be DMA'd to the card
         */
        cxgb_wait_dma_completion(toep);
        vm_fault_unhold_pages(toep->tp_pages, count);

        /*
         * If there is more data to send adjust local copy of iov
         * to point to the start
         */
        memcpy(iovtmp, iov, iovcnt*sizeof(*iov));
        iov_adj(&iovtmpp, &iovcnt, sent);
        uiotmp.uio_iov = iovtmpp;
        uiotmp.uio_iovcnt = iovcnt;
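
/*
 * pru_sosend replacement: try the zero-copy transmit path when the
 * connection is offloaded and the criteria listed below are met,
 * falling back to the stock handler otherwise.
 */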
static int
cxgb_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
        struct tcpcb *tp = sototcpcb(so);
        struct toedev *tdev;
        int zcopy_thres, zcopy_enabled, rv;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         * - the connection is currently offloaded
         * - the number of bytes to be transferred exceeds the threshold
         * - the number of bytes currently in flight won't exceed the in-flight
         * - vm_fault_hold_user_pages succeeds
         * - blocking socket XXX for now
         */
        if (tp->t_flags & TF_TOE) {
                zcopy_thres = TOM_TUNABLE(tdev, zcopy_sosend_partial_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, zcopy_sosend_enabled);

                if ((uio->uio_resid > zcopy_thres) &&
                    (uio->uio_iovcnt < TMP_IOV_MAX) &&
                    ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) {
                        rv = t3_sosend(so, uio);
                        if (rv != EAGAIN)
                                return (rv);
                }
        }
        return pru_sosend(so, addr, uio, top, control, flags, td);
}
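
/*
 * Zero-copy receive path: the DDP-based counterpart to t3_sosend,
 * placing data from the adapter directly into the user's buffers.
 */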
static int
t3_soreceive(struct socket *so, struct uio *uio)
{
        int i, rv, count, hold_resid, sent, iovcnt;
        struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
        struct tcpcb *tp = sototcpcb(so);
        struct toepcb *toep = tp->t_toe;

        /*
         * Events requiring iteration:
         * - number of pages exceeds max hold pages for process or system
         * - number of pages exceeds maximum sg entries for a single WR
         *
         * We're limited to holding 128 pages at once - and we're limited to
         * 34 SG entries per work request, but each SG entry can be any number
         * of contiguous pages
         */
        iovcnt = uio->uio_iovcnt;
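
/*
 * pru_soreceive replacement: use DDP-based zero-copy receive when the
 * connection is offloaded and the criteria listed below are met,
 * falling back to the stock handler otherwise.
 */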
static int
cxgb_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
        int rv, zcopy_thres, zcopy_enabled;
        struct tcpcb *tp = sototcpcb(so);
        struct toedev *tdev;

        /*
         * In order to use DMA direct from userspace the following
         * conditions must be met:
         * - the connection is currently offloaded
         * - the number of bytes to be transferred exceeds the threshold
         * - the number of bytes currently in flight won't exceed the in-flight
         * - vm_fault_hold_user_pages succeeds
         * - blocking socket XXX for now
         */
        if (tp->t_flags & TF_TOE) {
                zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
                zcopy_enabled = TOM_TUNABLE(tdev, ddp);
                if ((uio->uio_resid > zcopy_thres) &&
                    (uio->uio_iovcnt == 1) &&
                    ((so->so_state & SS_NBIO) == 0) && zcopy_enabled) {
                        rv = t3_soreceive(so, uio);
                        if (rv != EAGAIN)
                                return (rv);
                }
        }
        return pru_soreceive(so, psa, uio, mp0, controlp, flagsp);
}
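
/*
 * Point this socket's protocol usrreqs at the zero-copy aware send and
 * receive wrappers. Note that pr_usrreqs is shared by all sockets using
 * the protocol, so the override is effectively global.
 */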
void
t3_install_socket_ops(struct socket *so)
{
        so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
        so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}

/*
 * This routine takes a user address range and does the following:
 * - validate that the user has access to those pages (flags indicates
 *   read or write) - if not fail
 * - validate that count is enough to hold range number of pages - if not fail
 * - fault in any non-resident pages
 * - if the user is doing a read force a write fault for any COWed pages
 * - if the user is doing a read mark all pages as dirty
 * - return number of pages in count
 */
static int
vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
{
        vm_offset_t start, va;
        vm_paddr_t pa;
        int pageslen, faults, rv;
        struct thread *td;
        vm_map_t map;
        pmap_t pmap;
        vm_page_t m, *pages;
        vm_prot_t prot;

        start = addr & ~PAGE_MASK;
        pageslen = roundup2(addr + len, PAGE_SIZE);
        if (*count < (pageslen >> PAGE_SHIFT))
                return (EFAULT);
        *count = pageslen >> PAGE_SHIFT;
        /*
         * Check that virtual address range is legal
         * This check is somewhat bogus as on some architectures kernel
         * and user do not share VA - however, it appears that all FreeBSD
         * architectures define it
         */
        if (addr + len > VM_MAXUSER_ADDRESS)
                return (EFAULT);

        td = curthread;
        map = &td->td_proc->p_vmspace->vm_map;
        pmap = &td->td_proc->p_vmspace->vm_pmap;
        pages = mp;
        prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
        bzero(pages, sizeof(vm_page_t *) * (*count));
retry:
        /*
         * First optimistically assume that all pages are resident (and R/W if for write)
         * if so just mark pages as held (and dirty if for write) and return
         */
        vm_page_lock_queues();
        for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
                /*
                 * Assure that we only hold the page once
                 */
                if (*pages == NULL) {
                        /*
                         * page queue mutex is recursable so this is OK
                         * it would be really nice if we had an unlocked version of this so
                         * we were only acquiring the pmap lock 1 time as opposed to potentially
                         * many dozens of times
                         */
                        m = pmap_extract_and_hold(pmap, va, prot);
                        if (m == NULL) {
                                faults++;
                                continue;
                        }
                        *pages = m;
                        if (flags & VM_HOLD_WRITEABLE)
                                vm_page_dirty(m);
                }
        }
        vm_page_unlock_queues();

        if (faults == 0)
                return (0);
        /*
         * Pages either have insufficient permissions or are not present
         * trigger a fault where necessary
         */
        for (va = start; va < pageslen; va += PAGE_SIZE) {
                m = NULL;
                rv = 0;
                pa = pmap_extract(pmap, va);
                if (pa)
                        m = PHYS_TO_VM_PAGE(pa);
                if (flags & VM_HOLD_WRITEABLE) {
                        if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
                                rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
                } else if (m == NULL)
                        rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
                if (rv)
                        goto error;
        }
        goto retry;

error:
        vm_page_lock_queues();
        for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
                if (*pages)
                        vm_page_unhold(*pages);
        vm_page_unlock_queues();
        return (EFAULT);
}
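
/*
 * Drop the hold acquired on each page by vm_fault_hold_user_pages().
 */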
static void
vm_fault_unhold_pages(vm_page_t *mp, int count)
{
        KASSERT(count >= 0, ("negative count %d", count));
        vm_page_lock_queues();
        while (count--)
                vm_page_unhold(*mp++);
        vm_page_unlock_queues();
}