/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
33 #include <sys/linker_set.h>
34 #include <sys/select.h>
36 #include <sys/ioctl.h>
#define VTNET_RINGSZ	256

#define VTNET_MAXSEGS	32

/*
 * PCI config-space register offsets (bytes past the virtio common header).
 * CFG0-CFG5 hold the MAC address; CFG6/CFG7 hold the link status word.
 */
#define VTNET_R_CFG0	20
#define VTNET_R_CFG1	21
#define VTNET_R_CFG2	22
#define VTNET_R_CFG3	23
#define VTNET_R_CFG4	24
#define VTNET_R_CFG5	25
#define VTNET_R_CFG6	26
#define VTNET_R_CFG7	27
#define VTNET_R_MAX	27

/* Parenthesized so the macro expands safely inside larger expressions */
#define VTNET_REGSZ	(VTNET_R_MAX + 1)

/*
 * Host capabilities advertised to the guest driver.
 */
#define VTNET_S_HOSTCAPS \
  ( 0x00000020 |	/* host supplies MAC */ \
    0x00008000 |	/* host can merge Rx buffers */ \
    0x00010000 )	/* config status available */
/*
 * Host-side bookkeeping for one guest virtio ring.
 * NOTE(review): the struct opener and the hq_size field were dropped by the
 * extraction; hq_size is grounded by its uses (hq->hq_size) elsewhere in
 * this file — confirm field order against the original source.
 */
struct vring_hqueue {
	/* Internal state */
	uint16_t	hq_size;	/* number of ring entries */
	uint16_t	hq_cur_aidx;	/* trails behind 'avail_idx' */

	/* Host-context pointers to the queue */
	struct virtio_desc *hq_dtable;
	uint16_t	*hq_avail_flags;
	uint16_t	*hq_avail_idx;	/* monotonically increasing */
	uint16_t	*hq_avail_ring;

	uint16_t	*hq_used_flags;
	uint16_t	*hq_used_idx;	/* monotonically increasing */
	struct virtio_used *hq_used_ring;
};
/*
 * Fixed network header size (prepended to every received frame).
 * NOTE(review): the first and last fields and the closing brace were lost in
 * extraction; vrh_bufs is implied by the "number of buffers" comment in
 * pci_vtnet_tap_rx — confirm against the original header layout.
 */
struct virtio_net_rxhdr {
	uint8_t		vrh_flags;
	uint8_t		vrh_gso_type;
	uint16_t	vrh_hdr_len;
	uint16_t	vrh_gso_size;
	uint16_t	vrh_csum_start;
	uint16_t	vrh_csum_offset;
	uint16_t	vrh_bufs;
} __packed;
/* Debug tracing: DPRINTF is gated on pci_vtnet_debug, WPRINTF always prints.
 * Both are wrapped in do/while(0) so they behave as single statements and
 * cannot capture a following 'else' (dangling-else hazard of a bare 'if'). */
static int pci_vtnet_debug;
#define DPRINTF(params) do {			\
	if (pci_vtnet_debug)			\
		printf params;			\
} while (0)
#define WPRINTF(params) do { printf params; } while (0)
130 struct pci_vtnet_softc {
131 struct pci_devinst *vsc_pi;
132 pthread_mutex_t vsc_mtx;
133 struct mevent *vsc_mevp;
142 uint32_t vsc_features;
143 uint8_t vsc_macaddr[6];
145 uint64_t vsc_pfn[VTNET_MAXQ];
146 struct vring_hqueue vsc_hq[VTNET_MAXQ];
150 * Return the number of available descriptors in the vring taking care
151 * of the 16-bit index wraparound.
154 hq_num_avail(struct vring_hqueue *hq)
158 if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
159 ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
161 ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;
163 assert(ndesc >= 0 && ndesc <= hq->hq_size);
169 pci_vtnet_qsize(int qnum)
171 /* XXX no ctl queue currently */
172 if (qnum == VTNET_CTLQ) {
176 /* XXX fixed currently. Maybe different for tx/rx/ctl */
177 return (VTNET_RINGSZ);
181 pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
184 DPRINTF(("vtnet: device reset requested !\n"));
187 sc->vsc_status = value;
191 * Called to send a buffer chain out to the tap device
194 pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
199 if (sc->vsc_tapfd == -1)
203 * If the length is < 60, pad out to that and add the
204 * extra zero'd segment to the iov. It is guaranteed that
205 * there is always an extra iov available by the caller.
208 memset(pad, 0, 60 - len);
209 iov[iovcnt].iov_base = pad;
210 iov[iovcnt].iov_len = 60 - len;
213 (void) writev(sc->vsc_tapfd, iov, iovcnt);
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
226 pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
228 struct virtio_desc *vd;
229 struct virtio_used *vu;
230 struct vring_hqueue *hq;
231 struct virtio_net_rxhdr *vrx;
236 int didx, uidx, aidx; /* descriptor, avail and used index */
239 * Should never be called without a valid tap fd
241 assert(sc->vsc_tapfd != -1);
244 * But, will be called when the rx ring hasn't yet
247 if (sc->vsc_rx_ready == 0) {
249 * Drop the packet and try later.
251 (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
256 * Calculate the number of available rx buffers
258 hq = &sc->vsc_hq[VTNET_RXQ];
260 ndescs = hq_num_avail(hq);
264 * Need to wait for host notification to read
266 if (sc->vsc_rxpend == 0) {
267 WPRINTF(("vtnet: no rx descriptors !\n"));
272 * Drop the packet and try later
274 (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
278 aidx = hq->hq_cur_aidx;
279 uidx = *hq->hq_used_idx;
280 for (i = 0; i < ndescs; i++) {
282 * 'aidx' indexes into the an array of descriptor indexes
284 didx = hq->hq_avail_ring[aidx % hq->hq_size];
285 assert(didx >= 0 && didx < hq->hq_size);
287 vd = &hq->hq_dtable[didx];
290 * Get a pointer to the rx header, and use the
291 * data immediately following it for the packet buffer.
293 vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
294 buf = (uint8_t *)(vrx + 1);
296 len = read(sc->vsc_tapfd, buf,
297 vd->vd_len - sizeof(struct virtio_net_rxhdr));
299 if (len < 0 && errno == EWOULDBLOCK) {
304 * The only valid field in the rx packet header is the
305 * number of buffers, which is always 1 without TSO
308 memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
312 * Write this descriptor into the used ring
314 vu = &hq->hq_used_ring[uidx % hq->hq_size];
316 vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
322 * Update the used pointer, and signal an interrupt if allowed
324 *hq->hq_used_idx = uidx;
325 hq->hq_cur_aidx = aidx;
327 if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
329 pci_generate_msi(sc->vsc_pi, 0);
334 pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
336 struct pci_vtnet_softc *sc = param;
338 pthread_mutex_lock(&sc->vsc_mtx);
339 pci_vtnet_tap_rx(sc);
340 pthread_mutex_unlock(&sc->vsc_mtx);
345 pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
348 * A qnotify means that the rx process can now begin
350 if (sc->vsc_rx_ready == 0) {
351 sc->vsc_rx_ready = 1;
355 * If the rx queue was empty, attempt to receive a
356 * packet that was previously blocked due to no rx bufs
359 if (sc->vsc_rxpend) {
360 WPRINTF(("vtnet: rx resumed\n\r"));
362 pci_vtnet_tap_rx(sc);
367 pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
369 struct iovec iov[VTNET_MAXSEGS + 1];
370 struct virtio_desc *vd;
371 struct virtio_used *vu;
375 int uidx, aidx, didx;
377 uidx = *hq->hq_used_idx;
378 aidx = hq->hq_cur_aidx;
379 didx = hq->hq_avail_ring[aidx % hq->hq_size];
380 assert(didx >= 0 && didx < hq->hq_size);
382 vd = &hq->hq_dtable[didx];
385 * Run through the chain of descriptors, ignoring the
386 * first header descriptor. However, include the header
387 * length in the total length that will be put into the
391 vd = &hq->hq_dtable[vd->vd_next];
393 for (i = 0, plen = 0;
395 i++, vd = &hq->hq_dtable[vd->vd_next]) {
396 iov[i].iov_base = paddr_guest2host(vd->vd_addr);
397 iov[i].iov_len = vd->vd_len;
401 if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
404 assert(i < VTNET_MAXSEGS);
406 DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
407 pci_vtnet_tap_tx(sc, iov, i + 1, plen);
410 * Return this chain back to the host
412 vu = &hq->hq_used_ring[uidx % hq->hq_size];
415 hq->hq_cur_aidx = aidx + 1;
416 *hq->hq_used_idx = uidx + 1;
419 * Generate an interrupt if able
421 if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
423 pci_generate_msi(sc->vsc_pi, 0);
428 pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
430 struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
435 * Calculate number of ring entries to process
437 ndescs = hq_num_avail(hq);
443 * Run through all the entries, placing them into iovecs and
444 * sending when an end-of-packet is found
446 for (i = 0; i < ndescs; i++)
447 pci_vtnet_proctx(sc, hq);
/*
 * Guest notification on the (unimplemented) control queue — log only.
 */
static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{

	DPRINTF(("vtnet: control qnotify!\n\r"));
}
458 pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
460 struct vring_hqueue *hq;
461 int qnum = sc->vsc_curq;
463 assert(qnum < VTNET_MAXQ);
465 sc->vsc_pfn[qnum] = pfn << VRING_PFN;
468 * Set up host pointers to the various parts of the
471 hq = &sc->vsc_hq[qnum];
472 hq->hq_size = pci_vtnet_qsize(qnum);
474 hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
475 hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
476 hq->hq_avail_idx = hq->hq_avail_flags + 1;
477 hq->hq_avail_ring = hq->hq_avail_flags + 2;
478 hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
480 hq->hq_used_idx = hq->hq_used_flags + 1;
481 hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
484 * Initialize queue indexes
490 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
493 unsigned char digest[16];
495 struct pci_vtnet_softc *sc;
498 * Access to guest memory is required. Fail if
501 if (paddr_guest2host(0) == NULL)
504 sc = malloc(sizeof(struct pci_vtnet_softc));
505 memset(sc, 0, sizeof(struct pci_vtnet_softc));
510 pthread_mutex_init(&sc->vsc_mtx, NULL);
513 * Attempt to open the tap device
519 strcpy(tbuf, "/dev/");
520 strlcat(tbuf, opts, sizeof(tbuf));
522 sc->vsc_tapfd = open(tbuf, O_RDWR);
523 if (sc->vsc_tapfd == -1) {
524 WPRINTF(("open of tap device %s failed\n", tbuf));
527 * Set non-blocking and register for read
528 * notifications with the event loop
531 if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
532 WPRINTF(("tap device O_NONBLOCK failed\n"));
533 close(sc->vsc_tapfd);
537 sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
539 pci_vtnet_tap_callback,
541 if (sc->vsc_mevp == NULL) {
542 WPRINTF(("Could not register event\n"));
543 close(sc->vsc_tapfd);
550 * The MAC address is the standard NetApp OUI of 00-a0-98,
551 * followed by an MD5 of the vm name. The slot/func number is
552 * prepended to this for slots other than 1:0, so that
553 * a bootloader can netboot from the equivalent of slot 1.
555 if (pi->pi_slot == 1 && pi->pi_func == 0) {
556 strncpy(nstr, vmname, sizeof(nstr));
558 snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
559 pi->pi_func, vmname);
563 MD5Update(&mdctx, nstr, strlen(nstr));
564 MD5Final(digest, &mdctx);
566 sc->vsc_macaddr[0] = 0x00;
567 sc->vsc_macaddr[1] = 0xa0;
568 sc->vsc_macaddr[2] = 0x98;
569 sc->vsc_macaddr[3] = digest[0];
570 sc->vsc_macaddr[4] = digest[1];
571 sc->vsc_macaddr[5] = digest[2];
573 /* initialize config space */
574 pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
575 pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
576 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
577 pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
578 pci_emul_add_msicap(pi, 1);
579 pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
585 * Function pointer array to handle queue notifications
587 static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
594 pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
595 int baridx, uint64_t offset, int size, uint64_t value)
597 struct pci_vtnet_softc *sc = pi->pi_arg;
602 if (offset + size > VTNET_REGSZ) {
603 DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
608 pthread_mutex_lock(&sc->vsc_mtx);
611 case VTCFG_R_GUESTCAP:
613 sc->vsc_features = value & VTNET_S_HOSTCAPS;
617 pci_vtnet_ring_init(sc, value);
621 assert(value < VTNET_MAXQ);
622 sc->vsc_curq = value;
624 case VTCFG_R_QNOTIFY:
626 assert(value < VTNET_MAXQ);
627 (*pci_vtnet_qnotify[value])(sc);
631 pci_vtnet_update_status(sc, value);
639 assert((size + offset) <= (VTNET_R_CFG5 + 1));
640 ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
642 * The driver is allowed to change the MAC address
644 sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
646 *(uint8_t *) ptr = value;
647 } else if (size == 2) {
648 *(uint16_t *) ptr = value;
650 *(uint32_t *) ptr = value;
653 case VTCFG_R_HOSTCAP:
658 DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
661 DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
666 pthread_mutex_unlock(&sc->vsc_mtx);
670 pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
671 int baridx, uint64_t offset, int size)
673 struct pci_vtnet_softc *sc = pi->pi_arg;
679 if (offset + size > VTNET_REGSZ) {
680 DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
685 pthread_mutex_lock(&sc->vsc_mtx);
688 case VTCFG_R_HOSTCAP:
690 value = VTNET_S_HOSTCAPS;
692 case VTCFG_R_GUESTCAP:
694 value = sc->vsc_features; /* XXX never read ? */
698 value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
702 value = pci_vtnet_qsize(sc->vsc_curq);
706 value = sc->vsc_curq; /* XXX never read ? */
708 case VTCFG_R_QNOTIFY:
710 value = sc->vsc_curq; /* XXX never read ? */
714 value = sc->vsc_status;
719 sc->vsc_isr = 0; /* a read clears this flag */
727 assert((size + offset) <= (VTNET_R_CFG5 + 1));
728 ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
730 value = *(uint8_t *) ptr;
731 } else if (size == 2) {
732 value = *(uint16_t *) ptr;
734 value = *(uint32_t *) ptr;
739 value = 0x01; /* XXX link always up */
743 value = 0; /* XXX link status in LSB */
746 DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
751 pthread_mutex_unlock(&sc->vsc_mtx);
756 struct pci_devemu pci_de_vnet = {
757 .pe_emu = "virtio-net",
758 .pe_init = pci_vtnet_init,
759 .pe_barwrite = pci_vtnet_write,
760 .pe_barread = pci_vtnet_read
762 PCI_EMUL_SET(pci_de_vnet);