2 * Copyright (c) 2011 NetApp, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
33 #include <sys/linker_set.h>
34 #include <sys/select.h>
36 #include <sys/ioctl.h>
/* Ring size returned by pci_vtnet_qsize() for every queue except the control queue. */
55 #define VTNET_RINGSZ 256
/*
 * Bound on the number of data segments gathered for one transmit chain;
 * pci_vtnet_proctx() sizes its iovec array as VTNET_MAXSEGS + 1.
 */
57 #define VTNET_MAXSEGS 32
60 * PCI config-space register offsets
/*
 * Offsets of the device-specific configuration bytes within the register
 * BAR. The BAR read/write handlers map VTNET_R_CFG0..VTNET_R_CFG5 onto the
 * six bytes of the MAC address.
 */
#define VTNET_R_CFG0	24
#define VTNET_R_CFG1	25
#define VTNET_R_CFG2	26
#define VTNET_R_CFG3	27
#define VTNET_R_CFG4	28
#define VTNET_R_CFG5	29
#define VTNET_R_CFG6	30
#define VTNET_R_CFG7	31
#define VTNET_R_MAX	31

/*
 * Size of the register space in bytes. The replacement list is
 * parenthesized so the macro acts as a single value in any expression;
 * without the parentheses "VTNET_REGSZ * 2" would expand to
 * "VTNET_R_MAX+1 * 2".
 */
#define VTNET_REGSZ	(VTNET_R_MAX + 1)
/*
 * Feature bits offered to the guest: returned on a read of the host
 * capability register and used to mask the features the guest writes back.
 */
#define VTNET_S_HOSTCAPS \
	((1 << 5) |	/* bit 5:  host supplies MAC */ \
	 (1 << 15) |	/* bit 15: host can merge Rx buffers */ \
	 (1 << 16))	/* bit 16: config status available */
/*
 * Interrupt-mode selector, set to 1 by default. pci_vtnet_init() consults
 * the BHYVE_USE_MSI environment variable; the statement that acts on it is
 * not visible in this excerpt (presumably it clears this flag -- TODO
 * confirm).
 */
91 static int use_msix = 1;
/*
 * Fields of the host-side view of one virtqueue (the structure's opening
 * line is not visible in this excerpt). The pointers are filled in by
 * pci_vtnet_ring_init().
 */
/* Next avail-ring position the host will consume; advanced by the rx and tx paths. */
96 uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
98 /* Host-context pointers to the queue */
/* Start of the descriptor table. */
99 struct virtio_desc *hq_dtable;
/* Avail area: flags word, then index word, then the ring entries. */
100 uint16_t *hq_avail_flags;
101 uint16_t *hq_avail_idx; /* monotonically increasing */
102 uint16_t *hq_avail_ring;
/* Used area: flags word, then index word, then the ring entries. */
104 uint16_t *hq_used_flags;
105 uint16_t *hq_used_idx; /* monotonically increasing */
106 struct virtio_used *hq_used_ring;
110 * Fixed network header size
/*
 * Header placed at the start of each guest receive buffer.
 * pci_vtnet_tap_rx() zeroes it and stores the frame data immediately
 * after it. Not every field is visible in this excerpt.
 */
112 struct virtio_net_rxhdr {
114 uint8_t vrh_gso_type;
115 uint16_t vrh_hdr_len;
116 uint16_t vrh_gso_size;
117 uint16_t vrh_csum_start;
118 uint16_t vrh_csum_offset;
/* Run-time switch for DPRINTF output; zero (silent) unless changed. */
static int pci_vtnet_debug;

/*
 * Debug and warning output. Both take one fully parenthesized printf
 * argument list, e.g. DPRINTF(("value %d\n", v));
 *
 * DPRINTF is wrapped in do { } while (0) so that it always expands to
 * exactly one statement. As a bare "if" it would capture the "else" of an
 * enclosing if/else and silently change the caller's control flow.
 */
#define DPRINTF(params)	do { if (pci_vtnet_debug) printf params; } while (0)
#define WPRINTF(params)	printf params
/*
 * Per-device state of one emulated virtio network adapter. Not every
 * field is visible in this excerpt.
 */
132 struct pci_vtnet_softc {
133 struct pci_devinst *vsc_pi;
/* Taken by the tap callback and by the BAR read/write handlers. */
134 pthread_mutex_t vsc_mtx;
/* Event-loop registration for the tap descriptor (see pci_vtnet_init()). */
135 struct mevent *vsc_mevp;
/* Features written by the guest, masked with VTNET_S_HOSTCAPS. */
144 uint32_t vsc_features;
145 uint8_t vsc_macaddr[6];
/* Per-queue ring address as stored by pci_vtnet_ring_init(): pfn << VRING_PFN. */
147 uint64_t vsc_pfn[VTNET_MAXQ];
148 struct vring_hqueue vsc_hq[VTNET_MAXQ];
/* Per-queue MSI-X table index; set to VIRTIO_MSI_NO_VECTOR at init. */
149 uint16_t vsc_msix_table_idx[VTNET_MAXQ];
/* VM context of the PCI device instance that owns softc 'sc'. */
151 #define vtnet_ctx(sc) ((sc)->vsc_pi->pi_vmctx)
154 * Return the size of the IO BAR that maps the virtio header and the
155 * device-specific region. The size varies depending on whether MSI-X is
155 * enabled or not.
/*
 * Size in bytes of the register window that the BAR read/write handlers
 * accept: the full VTNET_REGSZ when MSI-X is enabled, otherwise
 * VTNET_REGSZ reduced by (VTCFG_R_CFG1 - VTCFG_R_MSIX).
 */
159 pci_vtnet_iosize(struct pci_devinst *pi)
161 if (pci_msix_enabled(pi))
162 return (VTNET_REGSZ);
/* MSI-X not enabled: the window shrinks by the gap between the two offsets. */
164 return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
168 * Return the number of available descriptors in the vring taking care
169 * of the 16-bit index wraparound.
/*
 * Number of avail-ring entries the guest has published that the host has
 * not consumed yet: the guest's avail index minus hq_cur_aidx.
 *
 * NOTE(review): "GF(216)" in the comment below looks like a garbled
 * "2^16"; the subtraction is meant modulo 2^16 (16-bit index wraparound).
 */
172 hq_num_avail(struct vring_hqueue *hq)
177 * We're just computing (a-b) in GF(216).
179 * The only glitch here is that in standard C,
180 * uint16_t promotes to (signed) int when int has
181 * more than 16 bits (pretty much always now), so
182 * we have to force it back to unsigned.
184 ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
/* The backlog is asserted never to exceed the ring size. */
186 assert(ndesc <= hq->hq_size);
/*
 * Ring size for queue 'qnum'. The control queue is special-cased (its
 * return statement is not visible in this excerpt); every other queue
 * gets the fixed VTNET_RINGSZ.
 */
192 pci_vtnet_qsize(int qnum)
194 /* XXX no ctl queue currently */
195 if (qnum == VTNET_CTLQ) {
199 /* XXX fixed currently. Maybe different for tx/rx/ctl */
200 return (VTNET_RINGSZ);
/*
 * Reset the host-side state of queue 'ring' (sc->vsc_hq[ring]).
 * 'ring' is asserted to be below VTNET_MAXQ. The statements that clear
 * the state are not visible in this excerpt.
 */
204 pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
206 struct vring_hqueue *hq;
208 assert(ring < VTNET_MAXQ);
210 hq = &sc->vsc_hq[ring];
213 * Reset all soft state
/*
 * Store a new device status value. On the reset path the rx and tx rings
 * are reset and vsc_rx_ready is cleared, so the receive path drops frames
 * until pci_vtnet_ping_rxq() sets it again. The condition that selects
 * the reset path is not visible in this excerpt.
 */
219 pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
223 DPRINTF(("vtnet: device reset requested !\n"));
224 pci_vtnet_ring_reset(sc, VTNET_RXQ);
225 pci_vtnet_ring_reset(sc, VTNET_TXQ);
226 sc->vsc_rx_ready = 0;
229 sc->vsc_status = value;
233 * Called to send a buffer chain out to the tap device
/*
 * Write the frame described by iov[0..iovcnt-1] to the tap descriptor.
 * The tap descriptor is checked against -1 first. Short frames are
 * extended with a zero-filled segment stored in iov[iovcnt], so the
 * caller must provide one spare iovec entry. 'len' appears to be the
 * total frame length -- its declaration is not visible here.
 */
236 pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
241 if (sc->vsc_tapfd == -1)
245 * If the length is < 60, pad out to that and add the
246 * extra zero'd segment to the iov. It is guaranteed that
247 * there is always an extra iov available by the caller.
250 memset(pad, 0, 60 - len);
251 iov[iovcnt].iov_base = pad;
252 iov[iovcnt].iov_len = 60 - len;
/* Best effort: the result of writev() is explicitly discarded. */
255 (void) writev(sc->vsc_tapfd, iov, iovcnt);
259 * Called when there is read activity on the tap file descriptor.
260 * Each buffer posted by the guest is assumed to be able to contain
261 * an entire ethernet frame + rx header.
262 * MP note: the dummybuf is only used for discarding frames, so there
263 * is no need for it to be per-vtnet or locked.
/* Scratch buffer: pci_vtnet_tap_rx() reads frames into it to discard them. */
265 static uint8_t dummybuf[2048];
/*
 * Receive path: move frames from the tap descriptor into guest rx buffers.
 *
 * While the guest has not yet notified the rx queue (vsc_rx_ready == 0),
 * or when no rx descriptors are available, the pending frame is read into
 * dummybuf and thereby dropped. Otherwise frames are read straight into
 * the guest buffers, each behind a zeroed virtio_net_rxhdr, and the
 * descriptors are returned through the used ring. Finally the used index
 * and hq_cur_aidx are stored back and an MSI-X or MSI interrupt is raised
 * unless the guest set VRING_AVAIL_F_NO_INTERRUPT.
 *
 * pci_vtnet_tap_callback() calls this with vsc_mtx held.
 * pci_vtnet_ping_rxq() also calls it -- presumably from the BAR write
 * handler, which holds the same mutex; confirm.
 */
268 pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
270 struct virtio_desc *vd;
271 struct virtio_used *vu;
272 struct vring_hqueue *hq;
273 struct virtio_net_rxhdr *vrx;
278 int didx, uidx, aidx; /* descriptor, avail and used index */
281 * Should never be called without a valid tap fd
283 assert(sc->vsc_tapfd != -1);
286 * But, will be called when the rx ring hasn't yet
289 if (sc->vsc_rx_ready == 0) {
291 * Drop the packet and try later.
293 (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
298 * Calculate the number of available rx buffers
300 hq = &sc->vsc_hq[VTNET_RXQ];
302 ndescs = hq_num_avail(hq);
306 * Need to wait for host notification to read
308 if (sc->vsc_rxpend == 0) {
309 WPRINTF(("vtnet: no rx descriptors !\n"));
314 * Drop the packet and try later
316 (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
/* Work on local copies of both ring positions; they are stored back after the loop. */
320 aidx = hq->hq_cur_aidx;
321 uidx = *hq->hq_used_idx;
322 for (i = 0; i < ndescs; i++) {
324 * 'aidx' indexes into the an array of descriptor indexes
326 didx = hq->hq_avail_ring[aidx % hq->hq_size];
327 assert(didx >= 0 && didx < hq->hq_size);
329 vd = &hq->hq_dtable[didx];
332 * Get a pointer to the rx header, and use the
333 * data immediately following it for the packet buffer.
335 vrx = paddr_guest2host(vtnet_ctx(sc), vd->vd_addr, vd->vd_len);
336 buf = (uint8_t *)(vrx + 1);
/* Read one frame into the space left after the rx header. */
338 len = read(sc->vsc_tapfd, buf,
339 vd->vd_len - sizeof(struct virtio_net_rxhdr));
/* The tap descriptor is put in non-blocking mode at init; EWOULDBLOCK means no frame is waiting. */
341 if (len < 0 && errno == EWOULDBLOCK) {
346 * The only valid field in the rx packet header is the
347 * number of buffers, which is always 1 without TSO
350 memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
354 * Write this descriptor into the used ring
356 vu = &hq->hq_used_ring[uidx % hq->hq_size];
/* Length reported to the guest covers the header plus the frame. */
358 vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
364 * Update the used pointer, and signal an interrupt if allowed
366 *hq->hq_used_idx = uidx;
367 hq->hq_cur_aidx = aidx;
369 if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
371 pci_generate_msix(sc->vsc_pi,
372 sc->vsc_msix_table_idx[VTNET_RXQ]);
375 pci_generate_msi(sc->vsc_pi, 0);
/*
 * Event-loop callback registered on the tap descriptor by
 * pci_vtnet_init(). 'param' is the softc. Runs the receive path with
 * vsc_mtx held; 'fd' and 'type' are not used in the visible code.
 */
381 pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
383 struct pci_vtnet_softc *sc = param;
385 pthread_mutex_lock(&sc->vsc_mtx);
386 pci_vtnet_tap_rx(sc);
387 pthread_mutex_unlock(&sc->vsc_mtx);
/*
 * Rx queue notification. The first notification marks the rx side ready
 * (vsc_rx_ready). If a receive was left pending for lack of rx buffers
 * (vsc_rxpend), the receive path is retried.
 */
392 pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
395 * A qnotify means that the rx process can now begin
397 if (sc->vsc_rx_ready == 0) {
398 sc->vsc_rx_ready = 1;
402 * If the rx queue was empty, attempt to receive a
403 * packet that was previously blocked due to no rx bufs
406 if (sc->vsc_rxpend) {
407 WPRINTF(("vtnet: rx resumed\n\r"));
409 pci_vtnet_tap_rx(sc);
/*
 * Transmit one descriptor chain from queue 'hq'.
 *
 * Takes the chain found at hq_cur_aidx, skips its first (header)
 * descriptor, gathers the following descriptors into an iovec and passes
 * it to pci_vtnet_tap_tx(). The chain is then returned through the used
 * ring, hq_cur_aidx and the used index each advance by one, and an MSI-X
 * or MSI interrupt is raised unless the guest set
 * VRING_AVAIL_F_NO_INTERRUPT.
 */
414 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
/* One entry beyond VTNET_MAXSEGS: pci_vtnet_tap_tx() may append a padding segment. */
416 struct iovec iov[VTNET_MAXSEGS + 1];
417 struct virtio_desc *vd;
418 struct virtio_used *vu;
422 int uidx, aidx, didx;
424 uidx = *hq->hq_used_idx;
425 aidx = hq->hq_cur_aidx;
426 didx = hq->hq_avail_ring[aidx % hq->hq_size];
427 assert(didx >= 0 && didx < hq->hq_size);
429 vd = &hq->hq_dtable[didx];
432 * Run through the chain of descriptors, ignoring the
433 * first header descriptor. However, include the header
434 * length in the total length that will be put into the
/* Step past the header descriptor to the first data descriptor. */
438 vd = &hq->hq_dtable[vd->vd_next];
440 for (i = 0, plen = 0;
442 i++, vd = &hq->hq_dtable[vd->vd_next]) {
443 iov[i].iov_base = paddr_guest2host(vtnet_ctx(sc),
444 vd->vd_addr, vd->vd_len);
445 iov[i].iov_len = vd->vd_len;
/* A descriptor without the NEXT flag ends the chain. */
449 if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
452 assert(i < VTNET_MAXSEGS);
454 DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
455 pci_vtnet_tap_tx(sc, iov, i + 1, plen);
458 * Return this chain back to the host
460 vu = &hq->hq_used_ring[uidx % hq->hq_size];
463 hq->hq_cur_aidx = aidx + 1;
464 *hq->hq_used_idx = uidx + 1;
467 * Generate an interrupt if able
469 if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
471 pci_generate_msix(sc->vsc_pi,
472 sc->vsc_msix_table_idx[VTNET_TXQ]);
475 pci_generate_msi(sc->vsc_pi, 0);
/*
 * Tx queue notification: process every chain the guest has made
 * available, one pci_vtnet_proctx() call per avail-ring entry.
 */
481 pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
483 struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
488 * Calculate number of ring entries to process
490 ndescs = hq_num_avail(hq);
496 * Run through all the entries, placing them into iovecs and
497 * sending when an end-of-packet is found
499 for (i = 0; i < ndescs; i++)
500 pci_vtnet_proctx(sc, hq);
/* Control queue notification: the visible code only emits a debug message. */
504 pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
507 DPRINTF(("vtnet: control qnotify!\n\r"));
/*
 * Set up the currently selected queue (vsc_curq) from the page frame
 * number 'pfn'. Records pfn << VRING_PFN in vsc_pfn[] and derives host
 * pointers to the descriptor table, the avail area and the used area.
 */
511 pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
513 struct vring_hqueue *hq;
514 int qnum = sc->vsc_curq;
516 assert(qnum < VTNET_MAXQ);
518 sc->vsc_pfn[qnum] = pfn << VRING_PFN;
521 * Set up host pointers to the various parts of the
524 hq = &sc->vsc_hq[qnum];
525 hq->hq_size = pci_vtnet_qsize(qnum);
/* Map the whole ring (vring_size() bytes) into host address space. */
527 hq->hq_dtable = paddr_guest2host(vtnet_ctx(sc), pfn << VRING_PFN,
528 vring_size(hq->hq_size));
/* The avail area starts right after the hq_size descriptors. */
529 hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
530 hq->hq_avail_idx = hq->hq_avail_flags + 1;
531 hq->hq_avail_ring = hq->hq_avail_flags + 2;
/*
 * NOTE(review): the used area is located by rounding up the START of the
 * avail ring entries, not the address just past their end. Confirm this
 * yields the intended address for every ring size in use.
 */
532 hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
534 hq->hq_used_idx = hq->hq_used_flags + 1;
535 hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
538 * Initialize queue indexes
/*
 * Device-model initialization: allocate and zero the softc, open the tap
 * device /dev/<opts> and register it with the event loop, derive the MAC
 * address, fill in the PCI config space, add the interrupt capability and
 * allocate BAR 0 as an I/O BAR of VTNET_REGSZ bytes.
 */
544 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
547 unsigned char digest[16];
549 struct pci_vtnet_softc *sc;
/* NOTE(review): the malloc() result is passed to memset() without a NULL check. */
552 sc = malloc(sizeof(struct pci_vtnet_softc));
553 memset(sc, 0, sizeof(struct pci_vtnet_softc));
558 pthread_mutex_init(&sc->vsc_mtx, NULL);
561 * Use MSI if set by user
563 if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
564 if (strcasecmp(env_msi, "yes") == 0)
569 * Attempt to open the tap device
/* Build the device path "/dev/" + opts; strlcat() bounds the result to tbuf. */
575 strcpy(tbuf, "/dev/");
576 strlcat(tbuf, opts, sizeof(tbuf));
578 sc->vsc_tapfd = open(tbuf, O_RDWR);
579 if (sc->vsc_tapfd == -1) {
580 WPRINTF(("open of tap device %s failed\n", tbuf));
583 * Set non-blocking and register for read
584 * notifications with the event loop
/* FIONBIO switches the descriptor to non-blocking I/O (assuming 'opt' is non-zero; its definition is not visible). */
587 if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
588 WPRINTF(("tap device O_NONBLOCK failed\n"));
589 close(sc->vsc_tapfd);
593 sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
595 pci_vtnet_tap_callback,
597 if (sc->vsc_mevp == NULL) {
598 WPRINTF(("Could not register event\n"));
599 close(sc->vsc_tapfd);
606 * The MAC address is the standard NetApp OUI of 00-a0-98,
607 * followed by an MD5 of the vm name. The slot/func number is
608 * prepended to this for slots other than 1:0, so that
609 * a bootloader can netboot from the equivalent of slot 1.
611 if (pi->pi_slot == 1 && pi->pi_func == 0) {
/*
 * NOTE(review): strncpy() leaves nstr unterminated when vmname is at
 * least sizeof(nstr) long, and strlen(nstr) below relies on a terminator.
 * Confirm the sizes involved.
 */
612 strncpy(nstr, vmname, sizeof(nstr));
614 snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
615 pi->pi_func, vmname);
619 MD5Update(&mdctx, nstr, strlen(nstr));
620 MD5Final(digest, &mdctx);
/* Fixed three-byte prefix followed by the first three digest bytes. */
622 sc->vsc_macaddr[0] = 0x00;
623 sc->vsc_macaddr[1] = 0xa0;
624 sc->vsc_macaddr[2] = 0x98;
625 sc->vsc_macaddr[3] = digest[0];
626 sc->vsc_macaddr[4] = digest[1];
627 sc->vsc_macaddr[5] = digest[2];
629 /* initialize config space */
630 pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
631 pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
632 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
633 pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
/* No queue has an MSI-X vector assigned until the guest writes one. */
639 for (i = 0; i < VTNET_MAXQ; i++)
640 sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
643 * BAR 1 used to map MSI-X table and PBA
645 if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
649 pci_emul_add_msicap(pi, 1);
652 pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
658 * Function pointer array to handle queue notifications
/*
 * Indexed by queue number; the VTCFG_R_QNOTIFY write handler calls
 * pci_vtnet_qnotify[value](sc). The initializer entries are not visible
 * in this excerpt.
 */
660 static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
/*
 * Translate a guest BAR offset into the offset used by the register
 * switch in the read/write handlers. When MSI-X is not enabled, offsets
 * at or beyond VTCFG_R_MSIX are shifted up by
 * (VTCFG_R_CFG1 - VTCFG_R_MSIX). The return for the remaining cases is
 * not visible in this excerpt.
 */
667 vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
670 * Device specific offsets used by guest would change based on
671 * whether MSI-X capability is enabled or not
673 if (!pci_msix_enabled(pi)) {
674 if (offset >= VTCFG_R_MSIX)
675 return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
/*
 * BAR write handler. Writes to the MSI-X table or PBA BAR are forwarded
 * to pci_emul_msix_twrite(). Writes to the register BAR are checked
 * against pci_vtnet_iosize(), translated with vtnet_adjust_offset() and
 * dispatched on the register offset with vsc_mtx held.
 *
 * NOTE(review): 'value' is supplied by the guest, so the asserts on it
 * below abort the process on an out-of-range write. Confirm that is
 * intended.
 */
682 pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
683 int baridx, uint64_t offset, int size, uint64_t value)
685 struct pci_vtnet_softc *sc = pi->pi_arg;
689 if (baridx == pci_msix_table_bar(pi) ||
690 baridx == pci_msix_pba_bar(pi)) {
691 pci_emul_msix_twrite(pi, offset, size, value);
698 if (offset + size > pci_vtnet_iosize(pi)) {
699 DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
704 pthread_mutex_lock(&sc->vsc_mtx);
706 offset = vtnet_adjust_offset(pi, offset);
709 case VTCFG_R_GUESTCAP:
/* Keep only the feature bits the host offers. */
711 sc->vsc_features = value & VTNET_S_HOSTCAPS;
715 pci_vtnet_ring_init(sc, value);
719 assert(value < VTNET_MAXQ);
720 sc->vsc_curq = value;
722 case VTCFG_R_QNOTIFY:
724 assert(value < VTNET_MAXQ);
725 (*pci_vtnet_qnotify[value])(sc);
729 pci_vtnet_update_status(sc, value);
733 sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
737 assert(sc->vsc_curq != VTNET_CTLQ);
738 sc->vsc_msix_table_idx[sc->vsc_curq] = value;
/* The access must end within the six MAC bytes (CFG0..CFG5). */
746 assert((size + offset) <= (VTNET_R_CFG5 + 1));
747 ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
749 * The driver is allowed to change the MAC address
/* NOTE(review): this single-byte store is repeated by the size-dependent store that follows. */
751 sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
753 *(uint8_t *) ptr = value;
754 } else if (size == 2) {
755 *(uint16_t *) ptr = value;
757 *(uint32_t *) ptr = value;
760 case VTCFG_R_HOSTCAP:
765 DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
768 DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
773 pthread_mutex_unlock(&sc->vsc_mtx);
/*
 * BAR read handler. Reads of the MSI-X table or PBA BAR are answered by
 * pci_emul_msix_tread(). Reads of the register BAR are checked against
 * pci_vtnet_iosize(), translated with vtnet_adjust_offset() and
 * dispatched on the register offset with vsc_mtx held. The selected
 * register's contents are placed in 'value'.
 */
777 pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
778 int baridx, uint64_t offset, int size)
780 struct pci_vtnet_softc *sc = pi->pi_arg;
785 if (baridx == pci_msix_table_bar(pi) ||
786 baridx == pci_msix_pba_bar(pi)) {
787 return (pci_emul_msix_tread(pi, offset, size));
793 if (offset + size > pci_vtnet_iosize(pi)) {
794 DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
799 pthread_mutex_lock(&sc->vsc_mtx);
801 offset = vtnet_adjust_offset(pi, offset);
804 case VTCFG_R_HOSTCAP:
806 value = VTNET_S_HOSTCAPS;
808 case VTCFG_R_GUESTCAP:
810 value = sc->vsc_features; /* XXX never read ? */
/* Convert the stored ring address back to a page frame number. */
814 value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
818 value = pci_vtnet_qsize(sc->vsc_curq);
822 value = sc->vsc_curq; /* XXX never read ? */
824 case VTCFG_R_QNOTIFY:
826 value = sc->vsc_curq; /* XXX never read ? */
830 value = sc->vsc_status;
835 sc->vsc_isr = 0; /* a read clears this flag */
839 value = sc->vsc_msix_table_idx[VTNET_CTLQ];
843 assert(sc->vsc_curq != VTNET_CTLQ);
844 value = sc->vsc_msix_table_idx[sc->vsc_curq];
/* The access must end within the six MAC bytes (CFG0..CFG5). */
852 assert((size + offset) <= (VTNET_R_CFG5 + 1));
853 ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
855 value = *(uint8_t *) ptr;
856 } else if (size == 2) {
857 value = *(uint16_t *) ptr;
859 value = *(uint32_t *) ptr;
864 value = 0x01; /* XXX link always up */
868 value = 0; /* XXX link status in LSB */
871 DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
876 pthread_mutex_unlock(&sc->vsc_mtx);
/*
 * Device-emulation descriptor for "virtio-net": ties the name to the init
 * routine and the BAR read/write handlers, and is passed to PCI_EMUL_SET.
 */
881 struct pci_devemu pci_de_vnet = {
882 .pe_emu = "virtio-net",
883 .pe_init = pci_vtnet_init,
884 .pe_barwrite = pci_vtnet_write,
885 .pe_barread = pci_vtnet_read
887 PCI_EMUL_SET(pci_de_vnet);