1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
42 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
48 #include <sys/taskqueue.h>
51 #include <net/if_arp.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
58 #include <net/if_types.h>
59 #include <net/if_vlan_var.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/ip6.h>
66 #include <netinet/tcp.h>
67 #include <netinet/tcp_lro.h>
68 #include <netinet6/ip6_var.h>
70 #include <machine/bus.h>
71 #include <machine/in_cksum.h>
72 #include <machine/resource.h>
77 #include <dev/pci/pcireg.h>
78 #include <dev/pci/pcivar.h>
79 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
81 #include <vm/vm.h> /* for pmap_mapdev() */
84 #if defined(__i386) || defined(__amd64)
85 #include <machine/specialreg.h>
88 #include <dev/mxge/mxge_mcp.h>
89 #include <dev/mxge/mcp_gen_header.h>
90 /*#define MXGE_FAKE_IFP*/
91 #include <dev/mxge/if_mxge_var.h>
93 #include <sys/buf_ring.h>
97 #include "opt_inet6.h"
/*
 * Module-wide tunables controlling firmware selection, interrupt
 * coalescing, flow control and RSS behaviour.
 * NOTE(review): this excerpt is line-sampled; declarations between the
 * visible lines may be missing from view.
 */
100 static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
101 static int mxge_force_firmware = 0;	/* override aligned/unaligned fw probe */
102 static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay, usecs */
103 static int mxge_deassert_wait = 1;
104 static int mxge_flow_control = 1;
105 static int mxge_verbose = 0;
106 static int mxge_ticks;
107 static int mxge_max_slices = 1;
108 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
109 static int mxge_always_promisc = 0;
110 static int mxge_initial_mtu = ETHERMTU_JUMBO;
111 static int mxge_throttle = 0;
/* Firmware image names: "eth" variants assume aligned PCIe completions,
 * "ethp" variants work around unaligned completions; "rss" variants
 * support multiple slices. */
112 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
113 static char *mxge_fw_aligned = "mxge_eth_z8e";
114 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
115 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/*
 * newbus glue: forward declarations of the device methods, the method
 * table, driver/devclass declarations and module dependencies.
 * NOTE(review): line-sampled excerpt — the mxge_methods terminator and
 * parts of the driver_t initializer are not visible here.
 */
117 static int mxge_probe(device_t dev);
118 static int mxge_attach(device_t dev);
119 static int mxge_detach(device_t dev);
120 static int mxge_shutdown(device_t dev);
121 static void mxge_intr(void *arg);
123 static device_method_t mxge_methods[] =
125 /* Device interface */
126 DEVMETHOD(device_probe, mxge_probe),
127 DEVMETHOD(device_attach, mxge_attach),
128 DEVMETHOD(device_detach, mxge_detach),
129 DEVMETHOD(device_shutdown, mxge_shutdown),
133 static driver_t mxge_driver =
137 sizeof(mxge_softc_t),
140 static devclass_t mxge_devclass;
142 /* Declare ourselves to be a child of the PCI bus.*/
143 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* The driver needs firmware(9) to load images and zlib to inflate them. */
144 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
145 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
147 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
148 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
149 static int mxge_close(mxge_softc_t *sc, int down);
150 static int mxge_open(mxge_softc_t *sc);
151 static void mxge_tick(void *arg);
/*
 * device_probe method: match Myricom Z8E / Z8E_9 PCI IDs and set a
 * human-readable description based on the PCI revision ID.
 * NOTE(review): the switch statement's braces/breaks and the return
 * statements are missing from this sampled view.
 */
154 mxge_probe(device_t dev)
159 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
160 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
161 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
162 rev = pci_get_revid(dev);
164 case MXGE_PCI_REV_Z8E:
165 device_set_desc(dev, "Myri10G-PCIE-8A");
167 case MXGE_PCI_REV_Z8ES:
168 device_set_desc(dev, "Myri10G-PCIE-8B");
/* default: unknown revision — still attach, but warn */
171 device_set_desc(dev, "Myri10G-PCIE-8??");
172 device_printf(dev, "Unrecognized rev %d NIC\n",
/*
 * Enable write-combining mapping for the NIC's SRAM window (x86/amd64
 * only) via pmap_change_attr(); faster PIO copies to the NIC.
 */
182 mxge_enable_wc(mxge_softc_t *sc)
184 #if defined(__i386) || defined(__amd64)
189 len = rman_get_size(sc->mem_res);
190 err = pmap_change_attr((vm_offset_t) sc->sram,
191 len, PAT_WRITE_COMBINING);
193 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
201 /* callback to get our DMA address */
/* bus_dmamap_load callback: stores the first segment's bus address into
 * the caller-provided bus_addr_t (arg).  Assumes a single segment. */
203 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
207 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a DMA-able region of 'bytes' with the given alignment:
 * create a tag, allocate+zero the memory, and load the map to obtain
 * dma->bus_addr.  Unwinds via the abort_with_* labels on failure.
 * NOTE(review): boundary/maxsegsize setup and some error-path lines are
 * missing from this sampled view.
 */
212 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
213 bus_size_t alignment)
216 device_t dev = sc->dev;
217 bus_size_t boundary, maxsegsize;
/* 4KB-aligned allocations larger than 4KB presumably get a page
 * boundary constraint — TODO confirm against full source */
219 if (bytes > 4096 && alignment == 4096) {
227 /* allocate DMAable memory tags */
228 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
229 alignment, /* alignment */
230 boundary, /* boundary */
231 BUS_SPACE_MAXADDR, /* low */
232 BUS_SPACE_MAXADDR, /* high */
233 NULL, NULL, /* filter */
236 maxsegsize, /* maxsegsize */
237 BUS_DMA_COHERENT, /* flags */
238 NULL, NULL, /* lock */
239 &dma->dmat); /* tag */
241 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
245 /* allocate DMAable memory & map */
246 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
247 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
248 | BUS_DMA_ZERO), &dma->map);
250 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
251 goto abort_with_dmat;
254 /* load the memory */
255 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
256 mxge_dmamap_callback,
257 (void *)&dma->bus_addr, 0);
259 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* error unwind: free memory, then destroy the tag */
265 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
267 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Release a region allocated by mxge_dma_alloc(): unload the map, free
 * the memory, destroy the tag — the reverse of the allocation order.
 */
273 mxge_dma_free(mxge_dma_t *dma)
275 bus_dmamap_unload(dma->dmat, dma->map);
276 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
277 (void)bus_dma_tag_destroy(dma->dmat);
281 * The eeprom strings on the lanaiX have the format
/*
 * Parse the NUL-separated EEPROM strings: extract the MAC address
 * ("MAC="), product code ("PC="), and serial number ("SN=" / "SN2=",
 * with SN2 taking precedence).  Prints an error if no MAC was found.
 * NOTE(review): loop/brace structure and the found_mac bookkeeping are
 * partially missing from this sampled view.
 */
288 mxge_parse_strings(mxge_softc_t *sc)
291 int i, found_mac, found_sn2;
294 ptr = sc->eeprom_strings;
297 while (*ptr != '\0') {
298 if (strncmp(ptr, "MAC=", 4) == 0) {
/* MAC bytes are two hex digits each, ':'-separated */
301 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
302 if (endptr - ptr != 2)
311 } else if (strncmp(ptr, "PC=", 3) == 0) {
313 strlcpy(sc->product_code_string, ptr,
314 sizeof(sc->product_code_string));
315 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
317 strlcpy(sc->serial_number_string, ptr,
318 sizeof(sc->serial_number_string));
319 } else if (strncmp(ptr, "SN2=", 4) == 0) {
320 /* SN2 takes precedence over SN */
323 strlcpy(sc->serial_number_string, ptr,
324 sizeof(sc->serial_number_string));
/* advance past the current NUL-terminated string */
326 while (*ptr++ != '\0') {}
333 device_printf(sc->dev, "failed to parse eeprom_strings\n");
338 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe
 * bridge so that completion packets arrive 8-byte aligned.  Because
 * FreeBSD (at the time) lacked extended config space access, the
 * extended config registers are reached by pmap_mapdev()'ing the
 * chipset's memory-mapped config window directly.
 * NOTE(review): several lines (returns, brace closures, the off
 * computation pieces) are missing from this sampled view.
 */
340 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
343 unsigned long base, off;
345 device_t pdev, mcp55;
346 uint16_t vendor_id, device_id, word;
347 uintptr_t bus, slot, func, ivend, idev;
/* honor the tunable: skip entirely when disabled */
351 if (!mxge_nvidia_ecrc_enable)
354 pdev = device_get_parent(device_get_parent(sc->dev));
356 device_printf(sc->dev, "could not find parent?\n");
359 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
360 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges are handled */
362 if (vendor_id != 0x10de)
367 if (device_id == 0x005d) {
368 /* ck804, base address is magic */
370 } else if (device_id >= 0x0374 && device_id <= 0x378) {
371 /* mcp55, base address stored in chipset */
372 mcp55 = pci_find_bsf(0, 0, 0);
374 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
375 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
376 word = pci_read_config(mcp55, 0x90, 2);
377 base = ((unsigned long)word & 0x7ffeU) << 25;
384 Test below is commented because it is believed that doing
385 config read/write beyond 0xff will access the config space
386 for the next larger function. Uncomment this and remove
387 the hacky pmap_mapdev() way of accessing config space when
388 FreeBSD grows support for extended pcie config space access
391 /* See if we can, by some miracle, access the extended
393 val = pci_read_config(pdev, 0x178, 4);
394 if (val != 0xffffffff) {
396 pci_write_config(pdev, 0x178, val, 4);
400 /* Rather than using normal pci config space writes, we must
401 * map the Nvidia config space ourselves. This is because on
402 * opteron/nvidia class machine the 0xe000000 mapping is
403 * handled by the nvidia chipset, that means the internal PCI
404 * device (the on-chip northbridge), or the amd-8131 bridge
405 * and things behind them are not visible by this method.
/* gather BDF + vendor/device via bus IVARs to locate and verify the
 * bridge's slot in the mapped config window */
408 BUS_READ_IVAR(device_get_parent(pdev), pdev,
410 BUS_READ_IVAR(device_get_parent(pdev), pdev,
411 PCI_IVAR_SLOT, &slot);
412 BUS_READ_IVAR(device_get_parent(pdev), pdev,
413 PCI_IVAR_FUNCTION, &func);
414 BUS_READ_IVAR(device_get_parent(pdev), pdev,
415 PCI_IVAR_VENDOR, &ivend);
416 BUS_READ_IVAR(device_get_parent(pdev), pdev,
417 PCI_IVAR_DEVICE, &idev);
420 + 0x00100000UL * (unsigned long)bus
421 + 0x00001000UL * (unsigned long)(func
424 /* map it into the kernel */
425 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
429 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
432 /* get a pointer to the config space mapped into the kernel */
433 cfgptr = va + (off & PAGE_MASK);
435 /* make sure that we can really access it */
436 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
437 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
438 if (! (vendor_id == ivend && device_id == idev)) {
439 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
440 vendor_id, device_id);
441 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* extended config reg 0x178 holds the ECRC enable bits */
445 ptr32 = (uint32_t*)(cfgptr + 0x178);
448 if (val == 0xffffffff) {
449 device_printf(sc->dev, "extended mapping failed\n");
450 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
454 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
456 device_printf(sc->dev,
457 "Enabled ECRC on upstream Nvidia bridge "
459 (int)bus, (int)slot, (int)func);
/* Non-x86 stub: an Nforce4 chipset cannot appear here, so just warn. */
464 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
466 device_printf(sc->dev,
467 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Ask the firmware to run read, write, and read+write DMA benchmarks
 * against the dmabench buffer; results (MB/s) are derived from the
 * transfer count and tick time packed into cmd.data0 and stored in
 * sc->read_dma / write_dma / read_write_dma.  Also used with
 * MXGEFW_CMD_UNALIGNED_TEST to detect unaligned PCIe completions.
 * NOTE(review): error-branch lines between the three phases are missing
 * from this sampled view.
 */
474 mxge_dma_test(mxge_softc_t *sc, int test_type)
477 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
483 /* Run a small DMA test.
484 * The magic multipliers to the length tell the firmware
485 * to do DMA read, write, or read+write tests. The
486 * results are returned in cmd.data0. The upper 16
487 * bits of the return is the number of transfers completed.
488 * The lower 16 bits is the time in 0.5us ticks that the
489 * transfers took to complete.
492 len = sc->tx_boundary;
494 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
495 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
496 cmd.data2 = len * 0x10000;	/* read test */
497 status = mxge_send_cmd(sc, test_type, &cmd);
502 sc->read_dma = ((cmd.data0>>16) * len * 2) /
503 (cmd.data0 & 0xffff);
504 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506 cmd.data2 = len * 0x1;	/* write test */
507 status = mxge_send_cmd(sc, test_type, &cmd);
512 sc->write_dma = ((cmd.data0>>16) * len * 2) /
513 (cmd.data0 & 0xffff);
515 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
516 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
517 cmd.data2 = len * 0x10001;	/* read+write test */
518 status = mxge_send_cmd(sc, test_type, &cmd);
523 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
524 (cmd.data0 & 0xffff);
/* the unaligned test is expected to fail on some chipsets; only
 * complain for the plain benchmark */
527 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
528 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
535 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
536 * when the PCI-E Completion packets are aligned on an 8-byte
537 * boundary. Some PCI-E chip sets always align Completion packets; on
538 * the ones that do not, the alignment can be enforced by enabling
539 * ECRC generation (if supported).
541 * When PCI-E Completion packets are not aligned, it is actually more
542 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
544 * If the driver can neither enable ECRC nor verify that it has
545 * already been enabled, then it must use a firmware image which works
546 * around unaligned completion packets (ethp_z8e.dat), and it should
547 * also ensure that it never gives the device a Read-DMA which is
548 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
549 * enabled, then the driver should use the aligned (eth_z8e.dat)
550 * firmware image, and set tx_boundary to 4KB.
/*
 * Probe whether the aligned-completion firmware is usable on this host:
 * verify the PCIe Max Read Request size, load the aligned image, enable
 * ECRC if possible, then run the unaligned-completion DMA test.
 * Returns 0 if the aligned firmware may be kept.
 * NOTE(review): the "®" in the pci_find_cap() call looks like a
 * mangled "&reg" (HTML-entity damage) — restore before compiling.
 */
554 mxge_firmware_probe(mxge_softc_t *sc)
556 device_t dev = sc->dev;
560 sc->tx_boundary = 4096;
562 * Verify the max read request size was set to 4KB
563 * before trying the test with 4KB.
565 if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) {
566 pectl = pci_read_config(dev, reg + 0x8, 2);
567 if ((pectl & (5 << 12)) != (5 << 12)) {
568 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
570 sc->tx_boundary = 2048;
575 * load the optimized firmware (which assumes aligned PCIe
576 * completions) in order to see if it works on this host.
578 sc->fw_name = mxge_fw_aligned;
579 status = mxge_load_firmware(sc, 1);
585 * Enable ECRC if possible
587 mxge_enable_nvidia_ecrc(sc);
590 * Run a DMA test which watches for unaligned completions and
591 * aborts on the first one seen. Not required on Z8ES or newer.
593 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
595 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
597 return 0; /* keep the aligned firmware */
600 device_printf(dev, "DMA test failed: %d\n", status);
601 if (status == ENOSYS)
602 device_printf(dev, "Falling back to ethp! "
603 "Please install up to date fw\n");
/*
 * Choose between the aligned and unaligned firmware images and set
 * tx_boundary accordingly (4KB aligned / 2KB unaligned), honoring the
 * force_firmware tunable and throttle setting, then load the choice.
 */
608 mxge_select_firmware(mxge_softc_t *sc)
611 int force_firmware = mxge_force_firmware;
/* throttling presumably requires a specific image — TODO confirm */
614 force_firmware = sc->throttle;
616 if (force_firmware != 0) {
617 if (force_firmware == 1)
622 device_printf(sc->dev,
623 "Assuming %s completions (forced)\n",
624 aligned ? "aligned" : "unaligned");
628 /* if the PCIe link width is 4 or less, we can use the aligned
629 firmware and skip any checks */
630 if (sc->link_width != 0 && sc->link_width <= 4) {
631 device_printf(sc->dev,
632 "PCIe x%d Link, expect reduced performance\n",
/* probe decides: 0 means the aligned firmware works here */
638 if (0 == mxge_firmware_probe(sc))
643 sc->fw_name = mxge_fw_aligned;
644 sc->tx_boundary = 4096;
646 sc->fw_name = mxge_fw_unaligned;
647 sc->tx_boundary = 2048;
649 return (mxge_load_firmware(sc, 0));
/*
 * Validate a firmware header: check the MCP type, record the version
 * string for sysctl, parse major/minor/tiny, and reject images whose
 * major.minor do not match what this driver requires.
 */
653 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
657 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
658 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
659 be32toh(hdr->mcp_type));
663 /* save firmware version for sysctl */
664 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
666 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
668 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
669 &sc->fw_ver_minor, &sc->fw_ver_tiny);
671 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
672 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
673 device_printf(sc->dev, "Found firmware version %s\n",
675 device_printf(sc->dev, "Driver needs %d.%d\n",
676 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
/* zlib allocator hooks: route zlib's allocations through malloc(9)
 * (M_TEMP, non-sleeping).  'nil' is the unused zlib opaque pointer. */
684 z_alloc(void *nil, u_int items, u_int size)
688 ptr = malloc(items * size, M_TEMP, M_NOWAIT);
693 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware image via firmware(9), inflate it with zlib
 * (uncompressed size is smuggled in fw->version), validate its MCP
 * header, and PIO-copy it into NIC SRAM in 256-byte chunks.
 * Unwinds through abort_with_* labels on failure.
 * NOTE(review): several error checks and label lines are missing from
 * this sampled view.
 */
700 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
703 char *inflate_buffer;
704 const struct firmware *fw;
705 const mcp_gen_header_t *hdr;
712 fw = firmware_get(sc->fw_name);
714 device_printf(sc->dev, "Could not find firmware image %s\n",
721 /* setup zlib and decompress f/w */
722 bzero(&zs, sizeof (zs));
725 status = inflateInit(&zs);
726 if (status != Z_OK) {
731 /* the uncompressed size is stored as the firmware version,
732 which would otherwise go unused */
733 fw_len = (size_t) fw->version;
734 inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
735 if (inflate_buffer == NULL)
737 zs.avail_in = fw->datasize;
738 zs.next_in = __DECONST(char *, fw->data);
739 zs.avail_out = fw_len;
740 zs.next_out = inflate_buffer;
741 status = inflate(&zs, Z_FINISH);
742 if (status != Z_STREAM_END) {
743 device_printf(sc->dev, "zlib %d\n", status);
745 goto abort_with_buffer;
/* locate and sanity-check the MCP header inside the inflated image */
749 hdr_offset = htobe32(*(const uint32_t *)
750 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
751 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
752 device_printf(sc->dev, "Bad firmware file");
754 goto abort_with_buffer;
756 hdr = (const void*)(inflate_buffer + hdr_offset);
758 status = mxge_validate_firmware(sc, hdr);
760 goto abort_with_buffer;
762 /* Copy the inflated firmware to NIC SRAM. */
763 for (i = 0; i < fw_len; i += 256) {
764 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
766 min(256U, (unsigned)(fw_len - i)));
775 free(inflate_buffer, M_TEMP);
779 firmware_put(fw, FIRMWARE_UNLOAD);
784 * Enable or disable periodic RDMAs from the host to make certain
785 * chipsets resend dropped PCIe messages
/*
 * Build a 6-word big-endian command block (confirmation address,
 * confirmation value, dummy RDMA address, enable flag) on an 8-byte
 * aligned stack buffer, PIO-copy it to the boot dummy-RDMA submit
 * register, then poll the confirmation word for the firmware's
 * 0xffffffff acknowledgment (up to ~20 iterations).
 */
789 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
792 volatile uint32_t *confirm;
793 volatile char *submit;
794 uint32_t *buf, dma_low, dma_high;
/* align buf to 8 bytes within buf_bytes */
797 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
799 /* clear confirmation addr */
800 confirm = (volatile uint32_t *)sc->cmd;
804 /* send an rdma command to the PCIe engine, and wait for the
805 response in the confirmation address. The firmware should
806 write a -1 there to indicate it is alive and well
809 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
810 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
811 buf[0] = htobe32(dma_high); /* confirm addr MSW */
812 buf[1] = htobe32(dma_low); /* confirm addr LSW */
813 buf[2] = htobe32(0xffffffff); /* confirm data */
814 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
815 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
816 buf[3] = htobe32(dma_high); /* dummy addr MSW */
817 buf[4] = htobe32(dma_low); /* dummy addr LSW */
818 buf[5] = htobe32(enable); /* enable? */
821 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
823 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgment */
828 while (*confirm != 0xffffffff && i < 20) {
832 if (*confirm != 0xffffffff) {
833 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
834 (enable ? "enable" : "disable"), confirm,
/*
 * Send a command to the running firmware: marshal cmd/data into a
 * big-endian mcp_cmd_t on an 8-byte aligned stack buffer, PIO-copy it
 * to the NIC's command register, then poll the DMA'd response block
 * (under cmd_mtx) for up to ~20ms.  Firmware result codes are mapped
 * to errno values; data0 is returned through 'data' on success.
 * NOTE(review): the result-to-errno assignments and the DELAY between
 * polls are missing from this sampled view.
 */
841 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
844 char buf_bytes[sizeof(*buf) + 8];
845 volatile mcp_cmd_response_t *response = sc->cmd;
846 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
847 uint32_t dma_low, dma_high;
848 int err, sleep_total = 0;
850 /* ensure buf is aligned to 8 bytes */
851 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
853 buf->data0 = htobe32(data->data0);
854 buf->data1 = htobe32(data->data1);
855 buf->data2 = htobe32(data->data2);
856 buf->cmd = htobe32(cmd);
857 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
858 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
860 buf->response_addr.low = htobe32(dma_low);
861 buf->response_addr.high = htobe32(dma_high);
/* cmd_mtx serializes command submission; result sentinel is -1 until
 * the firmware DMAs the real result back */
862 mtx_lock(&sc->cmd_mtx);
863 response->result = 0xffffffff;
865 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
867 /* wait up to 20ms */
869 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
870 bus_dmamap_sync(sc->cmd_dma.dmat,
871 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
873 switch (be32toh(response->result)) {
875 data->data0 = be32toh(response->data);
881 case MXGEFW_CMD_UNKNOWN:
884 case MXGEFW_CMD_ERROR_UNALIGNED:
887 case MXGEFW_CMD_ERROR_BUSY:
890 case MXGEFW_CMD_ERROR_I2C_ABSENT:
894 device_printf(sc->dev,
896 "failed, result = %d\n",
897 cmd, be32toh(response->result));
905 device_printf(sc->dev, "mxge: command %d timed out"
907 cmd, be32toh(response->result));
908 mtx_unlock(&sc->cmd_mtx);
/*
 * Adopt the firmware already running on the NIC: read its header
 * pointer from SRAM, bounds-check it, copy the header to host memory,
 * and validate it.  Also flags the fw 1.4.4–1.4.11 rx-filter bug so
 * the driver keeps the NIC in ALLMULTI mode.
 */
913 mxge_adopt_running_firmware(mxge_softc_t *sc)
915 struct mcp_gen_header *hdr;
916 const size_t bytes = sizeof (struct mcp_gen_header);
920 /* find running firmware header */
921 hdr_offset = htobe32(*(volatile uint32_t *)
922 (sc->sram + MCP_HEADER_PTR_OFFSET));
924 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
925 device_printf(sc->dev,
926 "Running firmware has bad header offset (%d)\n",
931 /* copy header of running firmware from SRAM to host memory to
932 * validate firmware */
933 hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
935 device_printf(sc->dev, "could not malloc firmware hdr\n");
938 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
939 rman_get_bushandle(sc->mem_res),
940 hdr_offset, (char *)hdr, bytes);
941 status = mxge_validate_firmware(sc, hdr);
945 * check to see if adopted firmware has bug where adopting
946 * it will cause broadcasts to be filtered unless the NIC
947 * is kept in ALLMULTI mode
949 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
950 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
951 sc->adopted_rx_filter_bug = 1;
952 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
953 "working around rx filter bug\n",
954 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load firmware into the NIC.  If the helper fails and 'adopt' is set,
 * fall back to adopting the firmware already running (dropping
 * tx_boundary to 2KB since its alignment is unknown).  Otherwise hand
 * off to the bootstrap MCP: build the big-endian handoff block
 * (confirmation address/data, code offset/length/destination/entry),
 * PIO-copy it to the handoff register, and poll for the firmware's
 * 0xffffffff confirmation.
 * NOTE(review): several control-flow lines (returns, DELAYs, brace
 * closures) are missing from this sampled view.
 */
963 mxge_load_firmware(mxge_softc_t *sc, int adopt)
965 volatile uint32_t *confirm;
966 volatile char *submit;
968 uint32_t *buf, size, dma_low, dma_high;
/* align buf to 8 bytes within buf_bytes */
971 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
973 size = sc->sram_size;
974 status = mxge_load_firmware_helper(sc, &size);
978 /* Try to use the currently running firmware, if
980 status = mxge_adopt_running_firmware(sc);
982 device_printf(sc->dev,
983 "failed to adopt running firmware\n");
986 device_printf(sc->dev,
987 "Successfully adopted running firmware\n");
988 if (sc->tx_boundary == 4096) {
989 device_printf(sc->dev,
990 "Using firmware currently running on NIC"
992 device_printf(sc->dev,
993 "performance consider loading optimized "
/* adopted firmware has unknown alignment: be conservative */
996 sc->fw_name = mxge_fw_unaligned;
997 sc->tx_boundary = 2048;
1000 /* clear confirmation addr */
1001 confirm = (volatile uint32_t *)sc->cmd;
1004 /* send a reload command to the bootstrap MCP, and wait for the
1005 response in the confirmation address. The firmware should
1006 write a -1 there to indicate it is alive and well
1009 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1010 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1012 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1013 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1014 buf[2] = htobe32(0xffffffff); /* confirm data */
1016 /* FIX: All newest firmware should un-protect the bottom of
1017 the sram before handoff. However, the very first interfaces
1018 do not. Therefore the handoff copy must skip the first 8 bytes
1020 /* where the code starts*/
1021 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1022 buf[4] = htobe32(size - 8); /* length of code */
1023 buf[5] = htobe32(8); /* where to copy to */
1024 buf[6] = htobe32(0); /* where to jump to */
1026 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1027 mxge_pio_copy(submit, buf, 64);
/* poll for the firmware's -1 acknowledgment */
1032 while (*confirm != 0xffffffff && i < 20) {
1035 bus_dmamap_sync(sc->cmd_dma.dmat,
1036 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1038 if (*confirm != 0xffffffff) {
1039 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
/*
 * Program the station MAC address into the firmware: bytes 0-3 packed
 * big-endian into data0, bytes 4-5 into data1.
 */
1048 mxge_update_mac_address(mxge_softc_t *sc)
1051 uint8_t *addr = sc->mac_addr;
1055 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1056 | (addr[2] << 8) | addr[3]);
1058 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1060 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
/*
 * Enable or disable PAUSE (flow control) in the firmware; logs on
 * failure.  Presumably also updates sc->pause on success — the lines
 * are missing from this sampled view (TODO confirm).
 */
1065 mxge_change_pause(mxge_softc_t *sc, int pause)
1071 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1074 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1078 device_printf(sc->dev, "Failed to set flow control mode\n");
/*
 * Enable or disable promiscuous mode in the firmware; the
 * mxge_always_promisc tunable forces it on.
 */
1086 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1091 if (mxge_always_promisc)
1095 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1098 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1102 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Push the interface's multicast list to the firmware: enter ALLMULTI
 * while rewriting, flush all filters, join each AF_LINK group address,
 * then re-enable filtering.  Bails out (leaving ALLMULTI on) for
 * IFF_ALLMULTI, the adopted-firmware rx-filter bug, or any command
 * error.
 */
1107 mxge_set_multicast_list(mxge_softc_t *sc)
1110 struct ifmultiaddr *ifma;
1111 struct ifnet *ifp = sc->ifp;
1114 /* This firmware is known to not support multicast */
1115 if (!sc->fw_multicast_support)
1118 /* Disable multicast filtering while we play with the lists*/
1119 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1121 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1122 " error status: %d\n", err);
/* buggy adopted firmware must stay in ALLMULTI mode */
1126 if (sc->adopted_rx_filter_bug)
1129 if (ifp->if_flags & IFF_ALLMULTI)
1130 /* request to disable multicast filtering, so quit here */
1133 /* Flush all the filters */
1135 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1137 device_printf(sc->dev,
1138 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1139 ", error status: %d\n", err);
1143 /* Walk the multicast list, and add each address */
1145 if_maddr_rlock(ifp);
1146 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1147 if (ifma->ifma_addr->sa_family != AF_LINK)
/* first 4 MAC bytes go in data0, last 2 in data1 (network order) */
1149 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1151 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1153 cmd.data0 = htonl(cmd.data0);
1154 cmd.data1 = htonl(cmd.data1);
1155 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1157 device_printf(sc->dev, "Failed "
1158 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1160 /* abort, leaving multicast filtering off */
1161 if_maddr_runlock(ifp);
1165 if_maddr_runlock(ifp);
1166 /* Enable multicast filtering */
1167 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1169 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1170 ", error status: %d\n", err);
/*
 * Determine the largest usable MTU: firmware max if page-size jumbo
 * clusters cover it, otherwise probe whether the firmware can chain
 * multiple big buffers per frame; fall back to MJUMPAGESIZE.
 */
1175 mxge_max_mtu(mxge_softc_t *sc)
1180 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1181 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1183 /* try to set nbufs to see if it we can
1184 use virtually contiguous jumbos */
1186 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1189 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1191 /* otherwise, we're limited to MJUMPAGESIZE */
1192 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish shared state with the firmware:
 * issue MXGEFW_CMD_RESET, restart dummy RDMAs, set the interrupt queue
 * size, (re)negotiate RSS slices, exchange interrupt queue DMA
 * addresses and claim/deassert register offsets, re-run the DMA
 * benchmark, zero all per-slice counters, and reprogram MAC address,
 * promiscuity, pause and the multicast list.
 * NOTE(review): a number of error-return and brace lines are missing
 * from this sampled view.
 */
1196 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1198 struct mxge_slice_state *ss;
1199 mxge_rx_done_t *rx_done;
1200 volatile uint32_t *irq_claim;
1204 /* try to send a reset command to the card to see if it
1206 memset(&cmd, 0, sizeof (cmd));
1207 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1209 device_printf(sc->dev, "failed reset\n");
1213 mxge_dummy_rdma(sc, 1);
1216 /* set the intrq size */
1217 cmd.data0 = sc->rx_ring_size;
1218 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1221 * Even though we already know how many slices are supported
1222 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1223 * has magic side effects, and must be called after a reset.
1224 * It must be called prior to calling any RSS related cmds,
1225 * including assigning an interrupt queue for anything but
1226 * slice 0. It must also be called *after*
1227 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1228 * the firmware to compute offsets.
1231 if (sc->num_slices > 1) {
1232 /* ask the maximum number of slices it supports */
1233 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1236 device_printf(sc->dev,
1237 "failed to get number of slices\n");
1241 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1242 * to setting up the interrupt queue DMA
1244 cmd.data0 = sc->num_slices;
1245 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1246 #ifdef IFNET_BUF_RING
1247 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1249 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1252 device_printf(sc->dev,
1253 "failed to set number of slices\n");
1259 if (interrupts_setup) {
1260 /* Now exchange information about interrupts */
1261 for (slice = 0; slice < sc->num_slices; slice++) {
1262 rx_done = &sc->ss[slice].rx_done;
1263 memset(rx_done->entry, 0, sc->rx_ring_size);
1264 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1265 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1267 status |= mxge_send_cmd(sc,
1268 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for coalescing delay, irq ack and deassert */
1273 status |= mxge_send_cmd(sc,
1274 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1277 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1279 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1280 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1283 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1285 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1287 device_printf(sc->dev, "failed set interrupt parameters\n");
1292 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1295 /* run a DMA benchmark */
1296 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
/* reset per-slice mcp/driver shared state */
1298 for (slice = 0; slice < sc->num_slices; slice++) {
1299 ss = &sc->ss[slice];
1301 ss->irq_claim = irq_claim + (2 * slice);
1302 /* reset mcp/driver shared state back to 0 */
1303 ss->rx_done.idx = 0;
1304 ss->rx_done.cnt = 0;
1307 ss->tx.pkt_done = 0;
1308 ss->tx.queue_active = 0;
1309 ss->tx.activate = 0;
1310 ss->tx.deactivate = 0;
1315 ss->rx_small.cnt = 0;
1316 ss->lc.lro_bad_csum = 0;
1317 ss->lc.lro_queued = 0;
1318 ss->lc.lro_flushed = 0;
1319 if (ss->fw_stats != NULL) {
1320 bzero(ss->fw_stats, sizeof *ss->fw_stats);
1323 sc->rdma_tags_available = 15;
1324 status = mxge_update_mac_address(sc);
1325 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1326 mxge_change_pause(sc, sc->pause);
1327 mxge_set_multicast_list(sc);
1329 cmd.data0 = sc->throttle;
1330 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1332 device_printf(sc->dev,
1333 "can't enable throttle\n");
/*
 * sysctl handler for hw.mxge.throttle: validate the requested value
 * against MXGE_MIN/MAX_THROTTLE, push it to the firmware under
 * driver_mtx, and record it in sc->throttle on success.
 */
1340 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1345 unsigned int throttle;
1348 throttle = sc->throttle;
1349 err = sysctl_handle_int(oidp, &throttle, arg2, req);
/* nothing to do if the value is unchanged */
1354 if (throttle == sc->throttle)
1357 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1360 mtx_lock(&sc->driver_mtx);
1361 cmd.data0 = throttle;
1362 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1364 sc->throttle = throttle;
1365 mtx_unlock(&sc->driver_mtx);
/*
 * sysctl handler for the interrupt coalescing delay: validate the new
 * value (1..10^6 usecs), write it big-endian straight into the NIC's
 * SRAM via intr_coal_delay_ptr under driver_mtx, and cache it.
 */
1370 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1373 unsigned int intr_coal_delay;
1377 intr_coal_delay = sc->intr_coal_delay;
1378 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1382 if (intr_coal_delay == sc->intr_coal_delay)
1385 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1388 mtx_lock(&sc->driver_mtx);
1389 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1390 sc->intr_coal_delay = intr_coal_delay;
1392 mtx_unlock(&sc->driver_mtx);
/*
 * sysctl handler for flow control: forward the new enabled/disabled
 * state to mxge_change_pause() under driver_mtx.
 */
1397 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1400 unsigned int enabled;
1404 enabled = sc->pause;
1405 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1409 if (enabled == sc->pause)
1412 mtx_lock(&sc->driver_mtx);
1413 err = mxge_change_pause(sc, enabled);
1414 mtx_unlock(&sc->driver_mtx);
/* Read-only sysctl handler exposing a big-endian firmware counter
 * (arg1) as a host-order integer. */
1419 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1425 arg2 = be32toh(*(int *)arg1);
1427 err = sysctl_handle_int(oidp, arg1, arg2, req);
/*
 * Tear down the per-slice sysctl trees and the slice parent tree.
 * Safe to call when the trees were never created (NULL check).
 */
1433 mxge_rem_sysctls(mxge_softc_t *sc)
1435 struct mxge_slice_state *ss;
1438 if (sc->slice_sysctl_tree == NULL)
1441 for (slice = 0; slice < sc->num_slices; slice++) {
1442 ss = &sc->ss[slice];
1443 if (ss == NULL || ss->sysctl_tree == NULL)
1445 sysctl_ctx_free(&ss->sysctl_ctx);
1446 ss->sysctl_tree = NULL;
1448 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1449 sc->slice_sysctl_tree = NULL;
/*
 * Register all device sysctl nodes: static device information,
 * performance tunables (handled by the sysctl procs above),
 * big-endian firmware statistics (exported via mxge_handle_be32),
 * and per-slice debug counters under a "slice" subtree.
 */
1453 mxge_add_sysctls(mxge_softc_t *sc)
1455 struct sysctl_ctx_list *ctx;
1456 struct sysctl_oid_list *children;
1458 struct mxge_slice_state *ss;
1462 ctx = device_get_sysctl_ctx(sc->dev);
1463 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
/* firmware stats block lives in slice 0 */
1464 fw = sc->ss[0].fw_stats;
1466 /* random information */
1467 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1469 CTLFLAG_RD, &sc->fw_version,
1470 0, "firmware version");
1471 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1473 CTLFLAG_RD, &sc->serial_number_string,
1474 0, "serial number");
1475 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1477 CTLFLAG_RD, &sc->product_code_string,
1479 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1481 CTLFLAG_RD, &sc->link_width,
1483 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1485 CTLFLAG_RD, &sc->tx_boundary,
1487 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1489 CTLFLAG_RD, &sc->wc,
1490 0, "write combining PIO?");
1491 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1493 CTLFLAG_RD, &sc->read_dma,
1494 0, "DMA Read speed in MB/s");
1495 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1497 CTLFLAG_RD, &sc->write_dma,
1498 0, "DMA Write speed in MB/s");
1499 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1500 "read_write_dma_MBs",
1501 CTLFLAG_RD, &sc->read_write_dma,
1502 0, "DMA concurrent Read/Write speed in MB/s");
1503 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1505 CTLFLAG_RD, &sc->watchdog_resets,
1506 0, "Number of times NIC was reset");
1509 /* performance related tunables */
1510 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1512 CTLTYPE_INT|CTLFLAG_RW, sc,
1513 0, mxge_change_intr_coal,
1514 "I", "interrupt coalescing delay in usecs");
1516 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1518 CTLTYPE_INT|CTLFLAG_RW, sc,
1519 0, mxge_change_throttle,
1520 "I", "transmit throttling");
1522 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1523 "flow_control_enabled",
1524 CTLTYPE_INT|CTLFLAG_RW, sc,
1525 0, mxge_change_flow_control,
/* NOTE(review): description below looks copy-pasted from the
 * intr_coal sysctl; should probably say "flow control enabled" */
1526 "I", "interrupt coalescing delay in usecs");
1528 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530 CTLFLAG_RW, &mxge_deassert_wait,
1531 0, "Wait for IRQ line to go low in ihandler");
1533 /* stats block from firmware is in network byte order.
1535 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1538 0, mxge_handle_be32,
1540 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541 "rdma_tags_available",
1542 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1543 0, mxge_handle_be32,
1544 "I", "rdma_tags_available");
1545 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 "dropped_bad_crc32",
1547 CTLTYPE_INT|CTLFLAG_RD,
1548 &fw->dropped_bad_crc32,
1549 0, mxge_handle_be32,
1550 "I", "dropped_bad_crc32");
1551 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1553 CTLTYPE_INT|CTLFLAG_RD,
1554 &fw->dropped_bad_phy,
1555 0, mxge_handle_be32,
1556 "I", "dropped_bad_phy");
1557 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1558 "dropped_link_error_or_filtered",
1559 CTLTYPE_INT|CTLFLAG_RD,
1560 &fw->dropped_link_error_or_filtered,
1561 0, mxge_handle_be32,
1562 "I", "dropped_link_error_or_filtered");
1563 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1564 "dropped_link_overflow",
1565 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1566 0, mxge_handle_be32,
1567 "I", "dropped_link_overflow");
1568 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1569 "dropped_multicast_filtered",
1570 CTLTYPE_INT|CTLFLAG_RD,
1571 &fw->dropped_multicast_filtered,
1572 0, mxge_handle_be32,
1573 "I", "dropped_multicast_filtered");
1574 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1575 "dropped_no_big_buffer",
1576 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1577 0, mxge_handle_be32,
1578 "I", "dropped_no_big_buffer");
1579 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1580 "dropped_no_small_buffer",
1581 CTLTYPE_INT|CTLFLAG_RD,
1582 &fw->dropped_no_small_buffer,
1583 0, mxge_handle_be32,
1584 "I", "dropped_no_small_buffer");
1585 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1587 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1588 0, mxge_handle_be32,
1589 "I", "dropped_overrun");
1590 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 CTLTYPE_INT|CTLFLAG_RD,
1594 0, mxge_handle_be32,
1595 "I", "dropped_pause");
1596 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1599 0, mxge_handle_be32,
1600 "I", "dropped_runt");
1602 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1603 "dropped_unicast_filtered",
1604 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1605 0, mxge_handle_be32,
1606 "I", "dropped_unicast_filtered");
1608 /* verbose printing? */
1609 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1611 CTLFLAG_RW, &mxge_verbose,
1612 0, "verbose printing");
1614 /* add counters exported for debugging from all slices */
1615 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1616 sc->slice_sysctl_tree =
1617 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1618 "slice", CTLFLAG_RD, 0, "");
/* one numbered child node per slice, each with its own sysctl ctx */
1620 for (slice = 0; slice < sc->num_slices; slice++) {
1621 ss = &sc->ss[slice];
1622 sysctl_ctx_init(&ss->sysctl_ctx);
1623 ctx = &ss->sysctl_ctx;
1624 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1625 sprintf(slice_num, "%d", slice);
1627 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1629 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1630 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 CTLFLAG_RD, &ss->rx_small.cnt,
1634 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1636 CTLFLAG_RD, &ss->rx_big.cnt,
1638 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1640 0, "number of lro merge queues flushed");
1642 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1643 "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1644 0, "number of bad csums preventing LRO");
1646 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1647 "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1648 0, "number of frames appended to lro merge"
1651 #ifndef IFNET_BUF_RING
1652 /* only transmit from slice 0 for now */
1656 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1658 CTLFLAG_RD, &ss->tx.req,
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->tx.done,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->tx.pkt_done,
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 CTLFLAG_RD, &ss->tx.stall,
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 CTLFLAG_RD, &ss->tx.wake,
1677 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 CTLFLAG_RD, &ss->tx.defrag,
1681 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 CTLFLAG_RD, &ss->tx.queue_active,
1684 0, "tx_queue_active");
1685 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 CTLFLAG_RD, &ss->tx.activate,
1689 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 CTLFLAG_RD, &ss->tx.deactivate,
1692 0, "tx_deactivate");
1696 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1697 backwards one at a time and handle ring wraps */
/*
 * Writes descriptors last-to-first so the first slot (the one the
 * NIC watches for validity) is written by the caller afterwards;
 * tx->mask handles ring wrap-around.
 */
1700 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1701 mcp_kreq_ether_send_t *src, int cnt)
1703 int idx, starting_slot;
1704 starting_slot = tx->req;
/* compute the wrapped ring index for this descriptor */
1707 idx = (starting_slot + cnt) & tx->mask;
1708 mxge_pio_copy(&tx->lanai[idx],
1709 &src[cnt], sizeof(*src));
1715 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1716 * at most 32 bytes at a time, so as to avoid involving the software
1717 * pio handler in the nic. We re-write the first segment's flags
1718 * to mark them valid only after writing the entire chain
1722 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1727 volatile uint32_t *dst_ints;
1728 mcp_kreq_ether_send_t *srcp;
1729 volatile mcp_kreq_ether_send_t *dstp, *dst;
/* first free slot in the ring */
1732 idx = tx->req & tx->mask;
/* stash the valid flags; they are restored only once the whole
 * chain has been written to the NIC */
1734 last_flags = src->flags;
1737 dst = dstp = &tx->lanai[idx];
/* fast path: chain fits without wrapping the ring */
1740 if ((idx + cnt) < tx->mask) {
1741 for (i = 0; i < (cnt - 1); i += 2) {
1742 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1743 wmb(); /* force write every 32 bytes */
1748 /* submit all but the first request, and ensure
1749 that it is submitted below */
1750 mxge_submit_req_backwards(tx, src, cnt);
1754 /* submit the first request */
1755 mxge_pio_copy(dstp, srcp, sizeof(*src));
1756 wmb(); /* barrier before setting valid flag */
1759 /* re-write the last 32-bits with the valid flags */
1760 src->flags = last_flags;
1761 src_ints = (uint32_t *)src;
1763 dst_ints = (volatile uint32_t *)dst;
1765 *dst_ints = *src_ints;
/*
 * Parse the ethernet/IP/TCP headers of an outgoing frame into "pi"
 * (IP offset, IP header length, TCP header pointer) for checksum
 * offload and TSO.  Headers not contiguous in the first mbuf are
 * copied into the slice scratch buffer first.
 */
1771 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1772 struct mxge_pkt_info *pi)
1774 struct ether_vlan_header *eh;
1776 int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1777 #if IFCAP_TSO6 && defined(INET6)
/* determine ethertype and IP header offset (VLAN-tag aware) */
1781 eh = mtod(m, struct ether_vlan_header *);
1782 if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1783 etype = ntohs(eh->evl_proto);
1784 pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1786 etype = ntohs(eh->evl_encap_proto);
1787 pi->ip_off = ETHER_HDR_LEN;
1793 * ensure ip header is in first mbuf, copy it to a
1794 * scratch buffer if not
1796 pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1798 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1799 m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1801 pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1803 pi->ip_hlen = pi->ip->ip_hl << 2;
/* likewise pull the TCP header into contiguous memory */
1807 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1808 sizeof(struct tcphdr))) {
1809 m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1810 sizeof(struct tcphdr), ss->scratch);
1811 pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1813 pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1815 #if IFCAP_TSO6 && defined(INET6)
1816 case ETHERTYPE_IPV6:
1817 pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1818 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1819 m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1821 pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
/* walk IPv6 extension headers to find the transport header */
1824 pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1825 pi->ip_hlen -= pi->ip_off;
1826 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
/* reject headers longer than the NIC's TSO limit */
1832 if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1835 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1836 sizeof(struct tcphdr))) {
1837 m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1838 sizeof(struct tcphdr), ss->scratch);
1839 pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1841 pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
/*
 * Build and submit the chain of send descriptors for a TSO frame.
 * Walks the busdma segment list, marking segment-boundary cuts at
 * mss multiples (MXGEFW_FLAGS_TSO_CHOP) and back-filling each
 * request's rdma_count once the cut points are known.
 */
1853 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1854 int busdma_seg_cnt, struct mxge_pkt_info *pi)
1857 mcp_kreq_ether_send_t *req;
1858 bus_dma_segment_t *seg;
1859 uint32_t low, high_swapped;
1860 int len, seglen, cum_len, cum_len_next;
1861 int next_is_first, chop, cnt, rdma_count, small;
1862 uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
1863 uint8_t flags, flags_next;
1866 mss = m->m_pkthdr.tso_segsz;
1868 /* negative cum_len signifies to the
1869 * send loop that we are still in the
1870 * header portion of the TSO packet.
1873 cksum_offset = pi->ip_off + pi->ip_hlen;
1874 cum_len = -(cksum_offset + (pi->tcp->th_off << 2));
1876 /* TSO implies checksum offload on this hardware */
1877 if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
1879 * If packet has full TCP csum, replace it with pseudo hdr
1880 * sum that the NIC expects, otherwise the NIC will emit
1881 * packets with bad TCP checksums.
1883 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1885 #if (CSUM_TCP_IPV6 != 0) && defined(INET6)
1886 m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
1887 sum = in6_cksum_pseudo(pi->ip6,
1888 m->m_pkthdr.len - cksum_offset,
1893 m->m_pkthdr.csum_flags |= CSUM_TCP;
1894 sum = in_pseudo(pi->ip->ip_src.s_addr,
1895 pi->ip->ip_dst.s_addr,
1896 htons(IPPROTO_TCP + (m->m_pkthdr.len -
/* write the pseudo-header sum into the packet's th_sum field */
1900 m_copyback(m, offsetof(struct tcphdr, th_sum) +
1901 cksum_offset, sizeof(sum), (caddr_t)&sum);
1903 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1906 /* for TSO, pseudo_hdr_offset holds mss.
1907 * The firmware figures out where to put
1908 * the checksum by parsing the header. */
1909 pseudo_hdr_offset = htobe16(mss);
1913 * for IPv6 TSO, the "checksum offset" is re-purposed
1914 * to store the TCP header len
1916 cksum_offset = (pi->tcp->th_off << 2);
1924 /* "rdma_count" is the number of RDMAs belonging to the
1925 * current packet BEFORE the current send request. For
1926 * non-TSO packets, this is equal to "count".
1927 * For TSO packets, rdma_count needs to be reset
1928 * to 0 after a segment cut.
1930 * The rdma_count field of the send request is
1931 * the number of RDMAs of the packet starting at
1932 * that request. For TSO send requests with one ore more cuts
1933 * in the middle, this is the number of RDMAs starting
1934 * after the last cut in the request. All previous
1935 * segments before the last cut implicitly have 1 RDMA.
1937 * Since the number of RDMAs is not known beforehand,
1938 * it must be filled-in retroactively - after each
1939 * segmentation cut or at the end of the entire packet.
1942 while (busdma_seg_cnt) {
1943 /* Break the busdma segment up into pieces*/
1944 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1945 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1949 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1951 cum_len_next = cum_len + seglen;
/* retroactively fill in the rdma_count of the previous run */
1952 (req-rdma_count)->rdma_count = rdma_count + 1;
1953 if (__predict_true(cum_len >= 0)) {
/* in payload: chop where the segment crosses an mss boundary */
1955 chop = (cum_len_next > mss);
1956 cum_len_next = cum_len_next % mss;
1957 next_is_first = (cum_len_next == 0);
1958 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1959 flags_next |= next_is_first *
/* branchless: reset rdma_count to 0/1 on a cut */
1961 rdma_count |= -(chop | next_is_first);
1962 rdma_count += chop & !next_is_first;
1963 } else if (cum_len_next >= 0) {
/* transition from header into payload */
1968 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1969 flags_next = MXGEFW_FLAGS_TSO_PLD |
1970 MXGEFW_FLAGS_FIRST |
1971 (small * MXGEFW_FLAGS_SMALL);
1974 req->addr_high = high_swapped;
1975 req->addr_low = htobe32(low);
1976 req->pseudo_hdr_offset = pseudo_hdr_offset;
1978 req->rdma_count = 1;
1979 req->length = htobe16(seglen);
1980 req->cksum_offset = cksum_offset;
1981 req->flags = flags | ((cum_len & 1) *
1982 MXGEFW_FLAGS_ALIGN_ODD);
1985 cum_len = cum_len_next;
/* cksum_offset is consumed as segments are emitted (IPv4 only;
 * for IPv6 it holds the TCP header length instead) */
1990 if (cksum_offset != 0 && !pi->ip6) {
1991 if (__predict_false(cksum_offset > seglen))
1992 cksum_offset -= seglen;
/* bail out if we produced more descriptors than the ring allows */
1996 if (__predict_false(cnt > tx->max_desc))
2002 (req-rdma_count)->rdma_count = rdma_count;
/* walk backwards marking the tail of the packet as TSO_LAST */
2006 req->flags |= MXGEFW_FLAGS_TSO_LAST;
2007 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
2009 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2010 mxge_submit_req(tx, tx->req_list, cnt);
2011 #ifdef IFNET_BUF_RING
2012 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2013 /* tell the NIC to start polling this slice */
2015 tx->queue_active = 1;
/* error path: unload the DMA map and drop the frame */
2023 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
2027 printf("tx->max_desc exceeded via TSO!\n");
2028 printf("mss = %d, %ld, %d!\n", mss,
2029 (long)seg - (long)tx->seg_list, tx->max_desc);
2036 #endif /* IFCAP_TSO4 */
2038 #ifdef MXGE_NEW_VLAN_API
2040 * We reproduce the software vlan tag insertion from
2041 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2042 * vlan tag insertion. We need to advertise this in order to have the
2043 * vlan interface respect our csum offload flags.
2045 static struct mbuf *
2046 mxge_vlan_tag_insert(struct mbuf *m)
2048 struct ether_vlan_header *evl;
/* make room for the 4-byte 802.1Q tag in front of the frame */
2050 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2051 if (__predict_false(m == NULL))
2053 if (m->m_len < sizeof(*evl)) {
2054 m = m_pullup(m, sizeof(*evl));
2055 if (__predict_false(m == NULL))
2059 * Transform the Ethernet header into an Ethernet header
2060 * with 802.1Q encapsulation.
2062 evl = mtod(m, struct ether_vlan_header *);
/* slide the MAC addresses forward over the newly prepended space */
2063 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2064 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2065 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2066 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag now lives in the packet itself; clear the out-of-band flag */
2067 m->m_flags &= ~M_VLANTAG;
2070 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map an outgoing mbuf chain for DMA and build its send
 * descriptors: parses headers for checksum offload, defers TSO
 * frames to mxge_encap_tso(), pads runts to 60 bytes using the
 * shared zero-pad DMA buffer, then submits the request list.
 */
2073 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2075 struct mxge_pkt_info pi = {0,0,0,0};
2077 mcp_kreq_ether_send_t *req;
2078 bus_dma_segment_t *seg;
2082 int cnt, cum_len, err, i, idx, odd_flag;
2083 uint16_t pseudo_hdr_offset;
2084 uint8_t flags, cksum_offset;
2091 #ifdef MXGE_NEW_VLAN_API
/* software-insert the VLAN tag so csum offload flags stay honored */
2092 if (m->m_flags & M_VLANTAG) {
2093 m = mxge_vlan_tag_insert(m);
2094 if (__predict_false(m == NULL))
2095 goto drop_without_m;
/* header parse is needed for both csum offload and TSO */
2098 if (m->m_pkthdr.csum_flags &
2099 (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2100 if (mxge_parse_tx(ss, m, &pi))
2104 /* (try to) map the frame for DMA */
2105 idx = tx->req & tx->mask;
2106 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2107 m, tx->seg_list, &cnt,
2109 if (__predict_false(err == EFBIG)) {
2110 /* Too many segments in the chain. Try
2112 m_tmp = m_defrag(m, M_NOWAIT);
2113 if (m_tmp == NULL) {
/* retry the mapping after defragmentation */
2118 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2120 m, tx->seg_list, &cnt,
2123 if (__predict_false(err != 0)) {
2124 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2125 " packet len = %d\n", err, m->m_pkthdr.len);
2128 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2129 BUS_DMASYNC_PREWRITE);
2130 tx->info[idx].m = m;
2133 /* TSO is different enough, we handle it in another routine */
2134 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2135 mxge_encap_tso(ss, m, cnt, &pi);
2142 pseudo_hdr_offset = 0;
2143 flags = MXGEFW_FLAGS_NO_TSO;
2145 /* checksum offloading? */
2146 if (m->m_pkthdr.csum_flags &
2147 (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2148 /* ensure ip header is in first mbuf, copy
2149 it to a scratch buffer if not */
2150 cksum_offset = pi.ip_off + pi.ip_hlen;
2151 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2152 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2153 req->cksum_offset = cksum_offset;
2154 flags |= MXGEFW_FLAGS_CKSUM;
2155 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2159 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2160 flags |= MXGEFW_FLAGS_SMALL;
2162 /* convert segments into a request list */
2165 req->flags = MXGEFW_FLAGS_FIRST;
2166 for (i = 0; i < cnt; i++) {
2168 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2170 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2171 req->length = htobe16(seg->ds_len);
2172 req->cksum_offset = cksum_offset;
/* cksum_offset only applies within the leading segment(s) */
2173 if (cksum_offset > seg->ds_len)
2174 cksum_offset -= seg->ds_len;
2177 req->pseudo_hdr_offset = pseudo_hdr_offset;
2178 req->pad = 0; /* complete solid 16-byte block */
2179 req->rdma_count = 1;
2180 req->flags |= flags | ((cum_len & 1) * odd_flag);
2181 cum_len += seg->ds_len;
2187 /* pad runts to 60 bytes */
/* extra descriptor points at the shared zero-filled pad buffer */
2191 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2193 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2194 req->length = htobe16(60 - cum_len);
2195 req->cksum_offset = 0;
2196 req->pseudo_hdr_offset = pseudo_hdr_offset;
2197 req->pad = 0; /* complete solid 16-byte block */
2198 req->rdma_count = 1;
2199 req->flags |= flags | ((cum_len & 1) * odd_flag);
2203 tx->req_list[0].rdma_count = cnt;
2205 /* print what the firmware will see */
2206 for (i = 0; i < cnt; i++) {
2207 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2208 "cso:%d, flags:0x%x, rdma:%d\n",
2209 i, (int)ntohl(tx->req_list[i].addr_high),
2210 (int)ntohl(tx->req_list[i].addr_low),
2211 (int)ntohs(tx->req_list[i].length),
2212 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2213 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2214 tx->req_list[i].rdma_count);
2216 printf("--------------\n");
2218 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2219 mxge_submit_req(tx, tx->req_list, cnt);
2220 #ifdef IFNET_BUF_RING
2221 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2222 /* tell the NIC to start polling this slice */
2224 tx->queue_active = 1;
2238 #ifdef IFNET_BUF_RING
/*
 * if_qflush handler: drain and free every mbuf queued on each
 * slice's buf_ring under that slice's tx mutex.
 */
2240 mxge_qflush(struct ifnet *ifp)
2242 mxge_softc_t *sc = ifp->if_softc;
2247 for (slice = 0; slice < sc->num_slices; slice++) {
2248 tx = &sc->ss[slice].tx;
2250 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2252 mtx_unlock(&tx->mtx);
/*
 * Drain the slice's drbr queue while transmit descriptors remain;
 * sets IFF_DRV_OACTIVE when the ring fills with packets still
 * queued.  Caller holds the slice tx mutex.
 */
2258 mxge_start_locked(struct mxge_slice_state *ss)
/* stop when fewer than max_desc free slots remain in the ring */
2269 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2270 m = drbr_dequeue(ifp, tx->br);
2274 /* let BPF see it */
2277 /* give it to the nic */
2280 /* ran out of transmit slots */
2281 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2282 && (!drbr_empty(ifp, tx->br))) {
2283 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Transmit one mbuf on this slice (tx mutex held): send directly
 * when the drbr queue is empty and ring space allows, otherwise
 * enqueue and let mxge_start_locked() drain the queue.
 */
2289 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
/* interface must be running and not flow-controlled */
2300 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2302 err = drbr_enqueue(ifp, tx->br, m);
/* fast path: bypass the queue when it is empty and space remains */
2306 if (!drbr_needs_enqueue(ifp, tx->br) &&
2307 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2308 /* let BPF see it */
2310 /* give it to the nic */
2312 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2315 if (!drbr_empty(ifp, tx->br))
2316 mxge_start_locked(ss);
/*
 * if_transmit handler: pick a slice from the mbuf's flowid
 * (num_slices is a power of 2, so masking suffices) and hand the
 * packet to that slice, queuing on the drbr instead of blocking
 * when the slice tx mutex is contended.
 */
2321 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2323 mxge_softc_t *sc = ifp->if_softc;
2324 struct mxge_slice_state *ss;
2329 slice = m->m_pkthdr.flowid;
2330 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2332 ss = &sc->ss[slice];
/* avoid blocking: enqueue if another thread owns the lock */
2335 if (mtx_trylock(&tx->mtx)) {
2336 err = mxge_transmit_locked(ss, m);
2337 mtx_unlock(&tx->mtx);
2339 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Legacy (non-IFNET_BUF_RING) transmit path: drain if_snd while
 * descriptors remain, setting IFF_DRV_OACTIVE when the ring is
 * exhausted.  Caller holds the tx mutex.
 */
2348 mxge_start_locked(struct mxge_slice_state *ss)
2358 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2359 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2363 /* let BPF see it */
2366 /* give it to the nic */
2369 /* ran out of transmit slots */
2370 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2371 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start handler for the legacy path; all transmits go through
 * slice 0.
 */
2377 mxge_start(struct ifnet *ifp)
2379 mxge_softc_t *sc = ifp->if_softc;
2380 struct mxge_slice_state *ss;
2382 /* only use the first slice for now */
2384 mtx_lock(&ss->tx.mtx);
2385 mxge_start_locked(ss);
2386 mtx_unlock(&ss->tx.mtx);
2390 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2391 * at most 32 bytes at a time, so as to avoid involving the software
2392 * pio handler in the nic. We re-write the first segment's low
2393 * DMA address to mark it valid only after we write the entire chunk
2397 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2398 mcp_kreq_ether_recv_t *src)
/* temporarily poison the first low address so the NIC ignores
 * the chunk until it is fully written */
2402 low = src->addr_low;
2403 src->addr_low = 0xffffffff;
2404 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2406 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2408 src->addr_low = low;
/* restore the real address last: this marks the 8 entries valid */
2409 dst->addr_low = low;
/*
 * Allocate and DMA-map a replacement mbuf for the small receive
 * ring at slot "idx"; descriptors are pushed to the NIC in
 * batches of 8 via mxge_submit_8rx().  Non-zero return indicates
 * failure (callers drop the frame and recycle the old mbuf).
 */
2414 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2416 bus_dma_segment_t seg;
2418 mxge_rx_ring_t *rx = &ss->rx_small;
2421 m = m_gethdr(M_NOWAIT, MT_DATA);
2428 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2429 &seg, &cnt, BUS_DMA_NOWAIT);
2434 rx->info[idx].m = m;
/* record the bus address in the shadow ring, big-endian for the NIC */
2435 rx->shadow[idx].addr_low =
2436 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2437 rx->shadow[idx].addr_high =
2438 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* push a completed group of 8 descriptors to the NIC */
2442 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a replacement jumbo cluster for the big
 * receive ring at slot "idx".  A cluster may span several ring
 * entries (rx->nbufs); groups of 8 descriptors are flushed to the
 * NIC via mxge_submit_8rx().
 */
2447 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2449 bus_dma_segment_t seg[3];
2451 mxge_rx_ring_t *rx = &ss->rx_big;
2454 m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2460 m->m_len = rx->mlen;
2461 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2462 seg, &cnt, BUS_DMA_NOWAIT);
2467 rx->info[idx].m = m;
/* big-endian bus addresses go into the shadow ring for the NIC */
2468 rx->shadow[idx].addr_low =
2469 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2470 rx->shadow[idx].addr_high =
2471 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2473 #if MXGE_VIRT_JUMBOS
/* virtually-contiguous jumbos: one descriptor per DMA segment */
2474 for (i = 1; i < cnt; i++) {
2475 rx->shadow[idx + i].addr_low =
2476 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2477 rx->shadow[idx + i].addr_high =
2478 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
/* flush every 8th descriptor slot to the NIC */
2483 for (i = 0; i < rx->nbufs; i++) {
2484 if ((idx & 7) == 7) {
2485 mxge_submit_8rx(&rx->lanai[idx - 7],
2486 &rx->shadow[idx - 7]);
/*
 * Fold a raw 32-bit ones-complement sum into a 16-bit internet
 * checksum (two folds absorb any carry produced by the first).
 */
2496 mxge_csum_generic(uint16_t *raw, int len)
2507 csum = (csum >> 16) + (csum & 0xffff);
2508 csum = (csum >> 16) + (csum & 0xffff);
2509 return (uint16_t)csum;
/*
 * Validate the firmware's partial checksum for an IPv6 frame:
 * subtract the IPv6 header bytes from the partial sum (IPv6
 * headers do not checksum to zero like IPv4), then compare
 * against the pseudo-header checksum.
 */
2512 static inline uint16_t
2513 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2516 int nxt, cksum_offset;
2517 struct ip6_hdr *ip6 = p;
2521 cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2522 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
/* skip any extension headers to find the transport protocol */
2523 cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2524 IPPROTO_IPV6, &nxt);
2525 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2530 * IPv6 headers do not contain a checksum, and hence
2531 * do not checksum to zero, so they don't "fall out"
2532 * of the partial checksum calculation like IPv4
2533 * headers do. We need to fix the partial checksum by
2534 * subtracting the checksum of the IPv6 header.
2537 partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
/* ones-complement subtraction with end-around carry and folds */
2540 csum += (csum < ~partial);
2541 csum = (csum >> 16) + (csum & 0xFFFF);
2542 csum = (csum >> 16) + (csum & 0xFFFF);
2543 c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2550 * Myri10GE hardware checksums are not valid if the sender
2551 * padded the frame with non-zero padding. This is because
2552 * the firmware just does a simple 16-bit 1s complement
2553 * checksum across the entire frame, excluding the first 14
2554 * bytes. It is best to simply to check the checksum and
2555 * tell the stack about it only if the checksum is good
2558 static inline uint16_t
2559 mxge_rx_csum(struct mbuf *m, int csum)
2561 struct ether_header *eh;
2565 #if defined(INET) || defined(INET6)
2566 int cap = m->m_pkthdr.rcvif->if_capenable;
/* dispatch on ethertype; validate only for enabled offloads */
2571 eh = mtod(m, struct ether_header *);
2572 etype = ntohs(eh->ether_type);
2576 if ((cap & IFCAP_RXCSUM) == 0)
2578 ip = (struct ip *)(eh + 1);
/* only TCP and UDP checksums are validated here */
2579 if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
/* compute the pseudo-header checksum from the firmware's partial sum */
2581 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2582 htonl(ntohs(csum) + ntohs(ip->ip_len) -
2583 (ip->ip_hl << 2) + ip->ip_p));
2588 case ETHERTYPE_IPV6:
2589 if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2591 c = mxge_rx_csum6((eh + 1), m, csum);
/*
 * Strip an 802.1Q header from a received frame: adjust the
 * firmware's partial checksum to exclude the 4 removed bytes,
 * record the tag in the mbuf (new VLAN API or legacy m_tag), and
 * close the gap in the ethernet header.
 */
2601 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2603 struct ether_vlan_header *evl;
2604 struct ether_header *eh;
2607 evl = mtod(m, struct ether_vlan_header *);
2608 eh = mtod(m, struct ether_header *);
2611 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2612 * after what the firmware thought was the end of the ethernet
2616 /* put checksum into host byte order */
2617 *csum = ntohs(*csum);
/* ones-complement subtract the 4 VLAN bytes from the partial sum */
2618 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2619 (*csum) += ~partial;
2620 (*csum) += ((*csum) < ~partial);
/* fold carries back into 16 bits (twice for any second carry) */
2621 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2622 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2624 /* restore checksum to network byte order;
2625 later consumers expect this */
2626 *csum = htons(*csum);
2629 #ifdef MXGE_NEW_VLAN_API
2630 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
/* legacy path: carry the tag as an m_tag on the mbuf */
2634 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2638 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2639 m_tag_prepend(m, mtag);
2643 m->m_flags |= M_VLANTAG;
2646 * Remove the 802.1q header by copying the Ethernet
2647 * addresses over it and adjusting the beginning of
2648 * the data in the mbuf. The encapsulated Ethernet
2649 * type field is already in place.
2651 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2652 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2653 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Receive-completion handler for the big (jumbo) ring: replace
 * the filled mbuf, strip VLAN tags, validate the hardware
 * checksum, optionally run LRO, and pass the frame up the stack.
 */
2658 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2659 uint32_t csum, int lro)
2664 struct ether_header *eh;
2666 bus_dmamap_t old_map;
2672 idx = rx->cnt & rx->mask;
/* a jumbo frame consumes nbufs ring entries */
2673 rx->cnt += rx->nbufs;
2674 /* save a pointer to the received mbuf */
2675 m = rx->info[idx].m;
2676 /* try to replace the received mbuf */
2677 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2678 /* drop the frame -- the old mbuf is re-cycled */
2683 /* unmap the received buffer */
2684 old_map = rx->info[idx].map;
2685 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2686 bus_dmamap_unload(rx->dmat, old_map);
2688 /* swap the bus_dmamap_t's */
2689 rx->info[idx].map = rx->extra_map;
2690 rx->extra_map = old_map;
2692 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2694 m->m_data += MXGEFW_PAD;
2696 m->m_pkthdr.rcvif = ifp;
2697 m->m_len = m->m_pkthdr.len = len;
2699 eh = mtod(m, struct ether_header *);
2700 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2701 mxge_vlan_tag_remove(m, &csum);
2703 /* if the checksum is valid, mark it in the mbuf header */
2705 if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2706 (0 == mxge_rx_csum(m, csum))) {
2707 /* Tell the stack that the checksum is good */
2708 m->m_pkthdr.csum_data = 0xffff;
2709 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2712 #if defined(INET) || defined (INET6)
/* NOTE(review): the big path hands csum 0 to tcp_lro_rx while
 * the small path passes the real partial csum -- confirm this
 * asymmetry is intentional */
2713 if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2717 /* flowid only valid if RSS hashing is enabled */
2718 if (sc->num_slices > 1) {
2719 m->m_pkthdr.flowid = (ss - sc->ss);
2720 m->m_flags |= M_FLOWID;
2722 /* pass the frame up the stack */
2723 (*ifp->if_input)(ifp, m);
/*
 * Receive-completion handler for the small ring: same flow as
 * mxge_rx_done_big() but one ring entry per frame, and the real
 * partial checksum is handed to tcp_lro_rx().
 */
2727 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2728 uint32_t csum, int lro)
2732 struct ether_header *eh;
2735 bus_dmamap_t old_map;
2741 idx = rx->cnt & rx->mask;
2743 /* save a pointer to the received mbuf */
2744 m = rx->info[idx].m;
2745 /* try to replace the received mbuf */
2746 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2747 /* drop the frame -- the old mbuf is re-cycled */
2752 /* unmap the received buffer */
2753 old_map = rx->info[idx].map;
2754 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2755 bus_dmamap_unload(rx->dmat, old_map);
2757 /* swap the bus_dmamap_t's */
2758 rx->info[idx].map = rx->extra_map;
2759 rx->extra_map = old_map;
2761 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2763 m->m_data += MXGEFW_PAD;
2765 m->m_pkthdr.rcvif = ifp;
2766 m->m_len = m->m_pkthdr.len = len;
2768 eh = mtod(m, struct ether_header *);
2769 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2770 mxge_vlan_tag_remove(m, &csum);
2772 /* if the checksum is valid, mark it in the mbuf header */
2773 if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2774 (0 == mxge_rx_csum(m, csum))) {
2775 /* Tell the stack that the checksum is good */
2776 m->m_pkthdr.csum_data = 0xffff;
2777 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2780 #if defined(INET) || defined (INET6)
2781 if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2785 /* flowid only valid if RSS hashing is enabled */
2786 if (sc->num_slices > 1) {
2787 m->m_pkthdr.flowid = (ss - sc->ss);
2788 m->m_flags |= M_FLOWID;
2790 /* pass the frame up the stack */
2791 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's receive completion ring: dispatch each entry
 * to the small or big handler based on frame length, with a
 * livelock limit, then flush any active LRO sessions.
 */
2795 mxge_clean_rx_done(struct mxge_slice_state *ss)
2797 mxge_rx_done_t *rx_done = &ss->rx_done;
2803 lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
/* a zero length marks the end of valid completions */
2804 while (rx_done->entry[rx_done->idx].length != 0) {
2805 length = ntohs(rx_done->entry[rx_done->idx].length);
2806 rx_done->entry[rx_done->idx].length = 0;
2807 checksum = rx_done->entry[rx_done->idx].checksum;
/* frames fitting an mbuf header came from the small ring */
2808 if (length <= (MHLEN - MXGEFW_PAD))
2809 mxge_rx_done_small(ss, length, checksum, lro);
2811 mxge_rx_done_big(ss, length, checksum, lro);
2813 rx_done->idx = rx_done->cnt & rx_done->mask;
2815 /* limit potential for livelock */
2816 if (__predict_false(++limit > rx_done->mask / 2))
2819 #if defined(INET) || defined (INET6)
/* flush every in-progress LRO merge before returning */
2820 while (!SLIST_EMPTY(&ss->lc.lro_active)) {
2821 struct lro_entry *lro = SLIST_FIRST(&ss->lc.lro_active);
2822 SLIST_REMOVE_HEAD(&ss->lc.lro_active, next);
2823 tcp_lro_flush(&ss->lc, lro);
/*
 * Reclaim transmit descriptors up to the firmware's completion
 * index "mcp_idx": free mbufs and DMA maps, update byte/multicast
 * counters, clear IFF_DRV_OACTIVE once enough space frees up, and
 * (buf-ring builds) let the NIC stop polling an idle queue.
 */
2830 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2841 while (tx->pkt_done != mcp_idx) {
2842 idx = tx->done & tx->mask;
2844 m = tx->info[idx].m;
2845 /* mbuf and DMA map only attached to the first
2848 ss->obytes += m->m_pkthdr.len;
2849 if (m->m_flags & M_MCAST)
2852 tx->info[idx].m = NULL;
2853 map = tx->info[idx].map;
2854 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet */
2857 if (tx->info[idx].flag) {
2858 tx->info[idx].flag = 0;
2863 /* If we have space, clear IFF_OACTIVE to tell the stack that
2864 its OK to send packets */
2865 #ifdef IFNET_BUF_RING
2866 flags = &ss->if_drv_flags;
2868 flags = &ifp->if_drv_flags;
2870 mtx_lock(&ss->tx.mtx);
/* resume transmit once at most a quarter of the ring is in use */
2871 if ((*flags) & IFF_DRV_OACTIVE &&
2872 tx->req - tx->done < (tx->mask + 1)/4) {
2873 *(flags) &= ~IFF_DRV_OACTIVE;
2875 mxge_start_locked(ss);
2877 #ifdef IFNET_BUF_RING
2878 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2879 /* let the NIC stop polling this queue, since there
2880 * are no more transmits pending */
2881 if (tx->req == tx->done) {
2883 tx->queue_active = 0;
2889 mtx_unlock(&ss->tx.mtx);
/*
 * XFP module compliance-bitmap decode table: maps bits of the XFP
 * 10GbE compliance byte to FreeBSD ifmedia types (0 = no ifmedia
 * equivalent; only the human-readable name is reported).
 */
2893 static struct mxge_media_type mxge_xfp_media_types[] =
2895 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2896 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2897 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2898 {0, (1 << 5), "10GBASE-ER"},
2899 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2900 {0, (1 << 3), "10GBASE-SW"},
2901 {0, (1 << 2), "10GBASE-LW"},
2902 {0, (1 << 1), "10GBASE-EW"},
2903 {0, (1 << 0), "Reserved"}
/*
 * SFP+ module compliance-bitmap decode table; the first entry
 * (bitmask 0) is the Twinax fallback matched specially by the probe.
 */
2905 static struct mxge_media_type mxge_sfp_media_types[] =
2907 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2908 {0, (1 << 7), "Reserved"},
2909 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2910 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2911 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2912 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
/*
 * mxge_media_set(): register media_type (always full-duplex Ethernet)
 * with the ifmedia layer, make it current, and cache it in the softc.
 */
2916 mxge_media_set(mxge_softc_t *sc, int media_type)
2920 ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2922 ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2923 sc->current_media = media_type;
2924 sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
/*
 * mxge_media_init(): reset the media list to autoselect, then classify
 * the NIC's connector (CX4 / XFP / SFP+ / Quad Ribbon Fiber) from the
 * EEPROM product-code string cached in the softc.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
2928 mxge_media_init(mxge_softc_t *sc)
2933 ifmedia_removeall(&sc->media);
2934 mxge_media_set(sc, IFM_AUTO);
2937 * parse the product code to determine the interface type
2938 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2939 * after the 3rd dash in the driver's cached copy of the
2940 * EEPROM's product code string.
2942 ptr = sc->product_code_string;
2944 device_printf(sc->dev, "Missing product code\n");
/* advance past the third '-' in the product code */
2948 for (i = 0; i < 3; i++, ptr++) {
2949 ptr = index(ptr, '-');
2951 device_printf(sc->dev,
2952 "only %d dashes in PC?!?\n", i);
2956 if (*ptr == 'C' || *(ptr +1) == 'C') {
2958 sc->connector = MXGE_CX4;
2959 mxge_media_set(sc, IFM_10G_CX4);
2960 } else if (*ptr == 'Q') {
2961 /* -Q is Quad Ribbon Fiber */
2962 sc->connector = MXGE_QRF;
2963 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2964 /* FreeBSD has no media type for Quad ribbon fiber */
2965 } else if (*ptr == 'R') {
2967 sc->connector = MXGE_XFP;
2968 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2969 /* -S or -2S is SFP+ */
2970 sc->connector = MXGE_SFP;
2972 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2977 * Determine the media type for a NIC. Some XFPs will identify
2978 * themselves only when their link is up, so this is initiated via a
2979 * link up interrupt. However, this can potentially take up to
2980 * several milliseconds, so it is run via the watchdog routine, rather
2981 * than in the interrupt handler itself.
/*
 * mxge_media_probe(): read the transceiver's compliance byte over the
 * firmware's I2C command interface, decode it against the XFP or SFP+
 * table, and (re)register the matching ifmedia type if it changed.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
2984 mxge_media_probe(mxge_softc_t *sc)
2989 struct mxge_media_type *mxge_media_types = NULL;
2990 int i, err, ms, mxge_media_type_entries;
2993 sc->need_media_probe = 0;
/* pick the decode table and compliance-byte offset per connector */
2995 if (sc->connector == MXGE_XFP) {
2997 mxge_media_types = mxge_xfp_media_types;
2998 mxge_media_type_entries =
2999 sizeof (mxge_xfp_media_types) /
3000 sizeof (mxge_xfp_media_types[0]);
3001 byte = MXGE_XFP_COMPLIANCE_BYTE;
3003 } else if (sc->connector == MXGE_SFP) {
3004 /* -S or -2S is SFP+ */
3005 mxge_media_types = mxge_sfp_media_types;
3006 mxge_media_type_entries =
3007 sizeof (mxge_sfp_media_types) /
3008 sizeof (mxge_sfp_media_types[0]);
3012 /* nothing to do; media type cannot change */
3017 * At this point we know the NIC has an XFP cage, so now we
3018 * try to determine what is in the cage by using the
3019 * firmware's XFP I2C commands to read the XFP 10GbE compliance
3020 * register. We read just one byte, which may take over
3024 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
3026 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3027 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3028 device_printf(sc->dev, "failed to read XFP\n");
3030 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3031 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3033 if (err != MXGEFW_CMD_OK) {
3037 /* now we wait for the data to be cached */
3039 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware's I2C read to complete */
3040 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3043 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3045 if (err != MXGEFW_CMD_OK) {
3046 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3047 cage_type, err, ms);
/* entry 0 matches on equality (e.g. SFP+ Twinax), the rest on bits */
3051 if (cmd.data0 == mxge_media_types[0].bitmask) {
3053 device_printf(sc->dev, "%s:%s\n", cage_type,
3054 mxge_media_types[0].name);
3055 if (sc->current_media != mxge_media_types[0].flag) {
3056 mxge_media_init(sc);
3057 mxge_media_set(sc, mxge_media_types[0].flag);
3061 for (i = 1; i < mxge_media_type_entries; i++) {
3062 if (cmd.data0 & mxge_media_types[i].bitmask) {
3064 device_printf(sc->dev, "%s:%s\n",
3066 mxge_media_types[i].name);
3068 if (sc->current_media != mxge_media_types[i].flag) {
3069 mxge_media_init(sc);
3070 mxge_media_set(sc, mxge_media_types[i].flag);
3076 device_printf(sc->dev, "%s media 0x%x unknown\n",
3077 cage_type, cmd.data0);
/*
 * mxge_intr(): per-slice interrupt handler.  Processes TX completions
 * and RX frames, handles legacy-IRQ deassertion, propagates link-state
 * and RDMA-tag changes from the firmware stats block (slice 0 only),
 * and finally writes the irq_claim registers to re-arm the interrupt.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
3083 mxge_intr(void *arg)
3085 struct mxge_slice_state *ss = arg;
3086 mxge_softc_t *sc = ss->sc;
3087 mcp_irq_data_t *stats = ss->fw_stats;
3088 mxge_tx_ring_t *tx = &ss->tx;
3089 mxge_rx_done_t *rx_done = &ss->rx_done;
3090 uint32_t send_done_count;
3094 #ifndef IFNET_BUF_RING
3095 /* an interrupt on a non-zero slice is implicitly valid
3096 since MSI-X irqs are not shared */
3098 mxge_clean_rx_done(ss);
3099 *ss->irq_claim = be32toh(3);
3104 /* make sure the DMA has finished */
3105 if (!stats->valid) {
3108 valid = stats->valid;
3110 if (sc->legacy_irq) {
3111 /* lower legacy IRQ */
3112 *sc->irq_deassert = 0;
3113 if (!mxge_deassert_wait)
3114 /* don't wait for conf. that irq is low */
3120 /* loop while waiting for legacy irq deassertion */
3122 /* check for transmit completes and receives */
3123 send_done_count = be32toh(stats->send_done_count);
3124 while ((send_done_count != tx->pkt_done) ||
3125 (rx_done->entry[rx_done->idx].length != 0)) {
3126 if (send_done_count != tx->pkt_done)
3127 mxge_tx_done(ss, (int)send_done_count);
3128 mxge_clean_rx_done(ss);
3129 send_done_count = be32toh(stats->send_done_count);
3131 if (sc->legacy_irq && mxge_deassert_wait)
/* volatile re-read: firmware clears valid when the irq is lowered */
3133 } while (*((volatile uint8_t *) &stats->valid));
3135 /* fw link & error stats meaningful only on the first slice */
3136 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3137 if (sc->link_state != stats->link_up) {
3138 sc->link_state = stats->link_up;
3139 if (sc->link_state) {
3140 if_link_state_change(sc->ifp, LINK_STATE_UP);
3141 sc->ifp->if_baudrate = IF_Gbps(10UL);
3143 device_printf(sc->dev, "link up\n");
3145 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3146 sc->ifp->if_baudrate = 0;
3148 device_printf(sc->dev, "link down\n");
/* some modules only identify themselves at link-up; re-probe */
3150 sc->need_media_probe = 1;
3152 if (sc->rdma_tags_available !=
3153 be32toh(stats->rdma_tags_available)) {
3154 sc->rdma_tags_available =
3155 be32toh(stats->rdma_tags_available);
3156 device_printf(sc->dev, "RDMA timed out! %d tags "
3157 "left\n", sc->rdma_tags_available);
3160 if (stats->link_down) {
3161 sc->down_cnt += stats->link_down;
3163 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3167 /* check to see if we have rx token to pass back */
3169 *ss->irq_claim = be32toh(3);
3170 *(ss->irq_claim + 1) = be32toh(3);
/*
 * mxge_init(): ifnet if_init entry point — bring the interface up
 * (via mxge_open) if it is not already running, under the driver lock.
 */
3174 mxge_init(void *arg)
3176 mxge_softc_t *sc = arg;
3177 struct ifnet *ifp = sc->ifp;
3180 mtx_lock(&sc->driver_mtx);
3181 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3182 (void) mxge_open(sc);
3183 mtx_unlock(&sc->driver_mtx);
/*
 * mxge_free_slice_mbufs(): release every mbuf still attached to this
 * slice's big-RX, small-RX and TX rings (unloading each DMA map first)
 * and free the slice's LRO state.  Ring masks are (size - 1), hence
 * the inclusive "<=" loops.
 */
3189 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3193 #if defined(INET) || defined(INET6)
3194 tcp_lro_free(&ss->lc);
3196 for (i = 0; i <= ss->rx_big.mask; i++) {
3197 if (ss->rx_big.info[i].m == NULL)
3199 bus_dmamap_unload(ss->rx_big.dmat,
3200 ss->rx_big.info[i].map);
3201 m_freem(ss->rx_big.info[i].m);
3202 ss->rx_big.info[i].m = NULL;
3205 for (i = 0; i <= ss->rx_small.mask; i++) {
3206 if (ss->rx_small.info[i].m == NULL)
3208 bus_dmamap_unload(ss->rx_small.dmat,
3209 ss->rx_small.info[i].map);
3210 m_freem(ss->rx_small.info[i].m);
3211 ss->rx_small.info[i].m = NULL;
3214 /* transmit ring used only on the first slice */
3215 if (ss->tx.info == NULL)
3218 for (i = 0; i <= ss->tx.mask; i++) {
3219 ss->tx.info[i].flag = 0;
3220 if (ss->tx.info[i].m == NULL)
3222 bus_dmamap_unload(ss->tx.dmat,
3223 ss->tx.info[i].map);
3224 m_freem(ss->tx.info[i].m);
3225 ss->tx.info[i].m = NULL;
/* mxge_free_mbufs(): free ring mbufs on every slice. */
3230 mxge_free_mbufs(mxge_softc_t *sc)
3234 for (slice = 0; slice < sc->num_slices; slice++)
3235 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings(): tear down everything mxge_alloc_slice_rings
 * created for one slice — rx-done DMA block, TX request/segment
 * scratch buffers, shadow rings, per-slot DMA maps, the extra RX maps,
 * the busdma tags, and the host info arrays.  Safe to call on a
 * partially-initialized slice (every step is NULL-checked).
 */
3239 mxge_free_slice_rings(struct mxge_slice_state *ss)
3244 if (ss->rx_done.entry != NULL)
3245 mxge_dma_free(&ss->rx_done.dma);
3246 ss->rx_done.entry = NULL;
3248 if (ss->tx.req_bytes != NULL)
3249 free(ss->tx.req_bytes, M_DEVBUF);
3250 ss->tx.req_bytes = NULL;
3252 if (ss->tx.seg_list != NULL)
3253 free(ss->tx.seg_list, M_DEVBUF);
3254 ss->tx.seg_list = NULL;
3256 if (ss->rx_small.shadow != NULL)
3257 free(ss->rx_small.shadow, M_DEVBUF);
3258 ss->rx_small.shadow = NULL;
3260 if (ss->rx_big.shadow != NULL)
3261 free(ss->rx_big.shadow, M_DEVBUF);
3262 ss->rx_big.shadow = NULL;
/* destroy per-slot TX dmamaps before the tag they came from */
3264 if (ss->tx.info != NULL) {
3265 if (ss->tx.dmat != NULL) {
3266 for (i = 0; i <= ss->tx.mask; i++) {
3267 bus_dmamap_destroy(ss->tx.dmat,
3268 ss->tx.info[i].map);
3270 bus_dma_tag_destroy(ss->tx.dmat);
3272 free(ss->tx.info, M_DEVBUF);
3276 if (ss->rx_small.info != NULL) {
3277 if (ss->rx_small.dmat != NULL) {
3278 for (i = 0; i <= ss->rx_small.mask; i++) {
3279 bus_dmamap_destroy(ss->rx_small.dmat,
3280 ss->rx_small.info[i].map);
3282 bus_dmamap_destroy(ss->rx_small.dmat,
3283 ss->rx_small.extra_map);
3284 bus_dma_tag_destroy(ss->rx_small.dmat);
3286 free(ss->rx_small.info, M_DEVBUF);
3288 ss->rx_small.info = NULL;
3290 if (ss->rx_big.info != NULL) {
3291 if (ss->rx_big.dmat != NULL) {
3292 for (i = 0; i <= ss->rx_big.mask; i++) {
3293 bus_dmamap_destroy(ss->rx_big.dmat,
3294 ss->rx_big.info[i].map);
3296 bus_dmamap_destroy(ss->rx_big.dmat,
3297 ss->rx_big.extra_map);
3298 bus_dma_tag_destroy(ss->rx_big.dmat);
3300 free(ss->rx_big.info, M_DEVBUF);
3302 ss->rx_big.info = NULL;
/* mxge_free_rings(): free ring resources on every slice. */
3306 mxge_free_rings(mxge_softc_t *sc)
3310 for (slice = 0; slice < sc->num_slices; slice++)
3311 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings(): allocate one slice's host-side ring state:
 * RX shadow/info arrays, RX busdma tags and per-slot maps, then (for
 * slices that transmit) the TX request-copy block, segment list, info
 * ring, tag and per-slot maps.  Undone by mxge_free_slice_rings().
 * NOTE(review): interior lines (error gotos, #else branches) are
 * missing from this extract; confirm against the full source.
 */
3315 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3316 int tx_ring_entries)
3318 mxge_softc_t *sc = ss->sc;
3322 /* allocate per-slice receive resources */
/* masks are (entries - 1); rx_done is sized for both rings */
3324 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3325 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3327 /* allocate the rx shadow rings */
3328 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3329 ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3332 ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334 /* allocate the rx host info rings */
3335 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3336 ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3338 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3339 ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3341 /* allocate the rx busdma resources */
3342 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3344 4096, /* boundary */
3345 BUS_SPACE_MAXADDR, /* low */
3346 BUS_SPACE_MAXADDR, /* high */
3347 NULL, NULL, /* filter */
3348 MHLEN, /* maxsize */
3350 MHLEN, /* maxsegsize */
3351 BUS_DMA_ALLOCNOW, /* flags */
3352 NULL, NULL, /* lock */
3353 &ss->rx_small.dmat); /* tag */
3355 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3360 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3362 #if MXGE_VIRT_JUMBOS
3363 4096, /* boundary */
3367 BUS_SPACE_MAXADDR, /* low */
3368 BUS_SPACE_MAXADDR, /* high */
3369 NULL, NULL, /* filter */
3370 3*4096, /* maxsize */
3371 #if MXGE_VIRT_JUMBOS
3373 4096, /* maxsegsize*/
3376 MJUM9BYTES, /* maxsegsize*/
3378 BUS_DMA_ALLOCNOW, /* flags */
3379 NULL, NULL, /* lock */
3380 &ss->rx_big.dmat); /* tag */
3382 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
/* one dmamap per RX slot, plus an extra map used during refill */
3386 for (i = 0; i <= ss->rx_small.mask; i++) {
3387 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3388 &ss->rx_small.info[i].map);
3390 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3395 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3396 &ss->rx_small.extra_map);
3398 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3403 for (i = 0; i <= ss->rx_big.mask; i++) {
3404 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3405 &ss->rx_big.info[i].map);
3407 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3412 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3413 &ss->rx_big.extra_map);
3415 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3420 /* now allocate TX resources */
3422 #ifndef IFNET_BUF_RING
3423 /* only use a single TX ring for now */
3424 if (ss != ss->sc->ss)
3428 ss->tx.mask = tx_ring_entries - 1;
3429 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3432 /* allocate the tx request copy block */
3434 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3435 ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3436 /* ensure req_list entries are aligned to 8 bytes */
3437 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3438 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3440 /* allocate the tx busdma segment list */
3441 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3442 ss->tx.seg_list = (bus_dma_segment_t *)
3443 malloc(bytes, M_DEVBUF, M_WAITOK);
3445 /* allocate the tx host info ring */
3446 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3447 ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3449 /* allocate the tx busdma resources */
3450 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3452 sc->tx_boundary, /* boundary */
3453 BUS_SPACE_MAXADDR, /* low */
3454 BUS_SPACE_MAXADDR, /* high */
3455 NULL, NULL, /* filter */
3456 65536 + 256, /* maxsize */
3457 ss->tx.max_desc - 2, /* num segs */
3458 sc->tx_boundary, /* maxsegsz */
3459 BUS_DMA_ALLOCNOW, /* flags */
3460 NULL, NULL, /* lock */
3461 &ss->tx.dmat); /* tag */
3464 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3469 /* now use these tags to setup dmamaps for each slot
3471 for (i = 0; i <= ss->tx.mask; i++) {
3472 err = bus_dmamap_create(ss->tx.dmat, 0,
3473 &ss->tx.info[i].map);
3475 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * mxge_alloc_rings(): query the firmware for ring sizes, size the
 * ifnet send queue accordingly, then allocate per-slice rings;
 * on failure the partially allocated rings are freed.
 */
3485 mxge_alloc_rings(mxge_softc_t *sc)
3489 int tx_ring_entries, rx_ring_entries;
3492 /* get ring sizes */
3493 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3494 tx_ring_size = cmd.data0;
3496 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
/* convert byte sizes to entry counts */
3500 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3501 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3502 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3503 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3504 IFQ_SET_READY(&sc->ifp->if_snd);
3506 for (slice = 0; slice < sc->num_slices; slice++) {
3507 err = mxge_alloc_slice_rings(&sc->ss[slice],
3516 mxge_free_rings(sc);
/*
 * mxge_choose_params(): given the MTU, pick the big-RX buffer size,
 * the mbuf cluster size to allocate, and how many firmware buffers
 * each cluster provides.  The frame must fit in one cluster unless
 * MXGE_VIRT_JUMBOS splits a 9k cluster into 4k virtual buffers.
 */
3523 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
/* worst-case wire frame: MTU + Ethernet header + VLAN tag + fw pad */
3525 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3527 if (bufsize < MCLBYTES) {
3528 /* easy, everything fits in a single buffer */
3529 *big_buf_size = MCLBYTES;
3530 *cl_size = MCLBYTES;
3535 if (bufsize < MJUMPAGESIZE) {
3536 /* still easy, everything still fits in a single buffer */
3537 *big_buf_size = MJUMPAGESIZE;
3538 *cl_size = MJUMPAGESIZE;
3542 #if MXGE_VIRT_JUMBOS
3543 /* now we need to use virtually contiguous buffers */
3544 *cl_size = MJUM9BYTES;
3545 *big_buf_size = 4096;
3546 *nbufs = mtu / 4096 + 1;
3547 /* needs to be a power of two, so round up */
3551 *cl_size = MJUM9BYTES;
3552 *big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open(): per-slice bring-up — initialize LRO, fetch the
 * LANai (NIC SRAM) pointers for the send and receive rings from the
 * firmware, then pre-stock the small and big receive rings.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
3558 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3567 slice = ss - sc->ss;
3569 #if defined(INET) || defined(INET6)
3570 (void)tcp_lro_init(&ss->lc);
3572 ss->lc.ifp = sc->ifp;
3574 /* get the lanai pointers to the send and receive rings */
3577 #ifndef IFNET_BUF_RING
3578 /* We currently only send from the first slice */
3582 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3584 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3585 ss->tx.send_go = (volatile uint32_t *)
3586 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3587 ss->tx.send_stop = (volatile uint32_t *)
3588 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3589 #ifndef IFNET_BUF_RING
3593 err |= mxge_send_cmd(sc,
3594 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3595 ss->rx_small.lanai =
3596 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3598 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3600 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3603 device_printf(sc->dev,
3604 "failed to get ring sizes or locations\n");
3608 /* stock receive rings */
3609 for (i = 0; i <= ss->rx_small.mask; i++) {
3610 map = ss->rx_small.info[i].map;
3611 err = mxge_get_buf_small(ss, map, i);
3613 device_printf(sc->dev, "alloced %d/%d smalls\n",
3614 i, ss->rx_small.mask + 1);
/* poison big-ring shadow addresses so unfilled slots are obvious */
3618 for (i = 0; i <= ss->rx_big.mask; i++) {
3619 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3620 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3622 ss->rx_big.nbufs = nbufs;
3623 ss->rx_big.cl_size = cl_size;
3624 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3625 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
/* big ring is stocked one cluster (nbufs slots) at a time */
3626 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3627 map = ss->rx_big.info[i].map;
3628 err = mxge_get_buf_big(ss, map, i);
3630 device_printf(sc->dev, "alloced %d/%d bigs\n",
3631 i, ss->rx_big.mask + 1);
/*
 * mxge_open(): full interface bring-up.  Resets the NIC, programs the
 * RSS indirection table (multi-slice), sizes the firmware's buffers
 * from the MTU, points the firmware at the per-slice stats DMA blocks,
 * opens each slice, starts the firmware's ethernet engine, and marks
 * the interface RUNNING.  On failure, allocated mbufs are released.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
3639 mxge_open(mxge_softc_t *sc)
3642 int err, big_bytes, nbufs, slice, cl_size, i;
3644 volatile uint8_t *itable;
3645 struct mxge_slice_state *ss;
3647 /* Copy the MAC address in case it was overridden */
3648 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3650 err = mxge_reset(sc, 1);
3652 device_printf(sc->dev, "failed to reset\n");
3656 if (sc->num_slices > 1) {
3657 /* setup the indirection table */
3658 cmd.data0 = sc->num_slices;
3659 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3662 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3665 device_printf(sc->dev,
3666 "failed to setup rss tables\n");
3670 /* just enable an identity mapping */
3671 itable = sc->sram + cmd.data0;
3672 for (i = 0; i < sc->num_slices; i++)
3673 itable[i] = (uint8_t)i;
3676 cmd.data1 = mxge_rss_hash_type;
3677 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3679 device_printf(sc->dev, "failed to enable slices\n");
3685 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3688 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3690 /* error is only meaningful if we're trying to set
3691 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3692 if (err && nbufs > 1) {
3693 device_printf(sc->dev,
3694 "Failed to set alway-use-n to %d\n",
3698 /* Give the firmware the mtu and the big and small buffer
3699 sizes. The firmware wants the big buf size to be a power
3700 of two. Luckily, FreeBSD's clusters are powers of two */
3701 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3702 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3703 cmd.data0 = MHLEN - MXGEFW_PAD;
3704 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3706 cmd.data0 = big_bytes;
3707 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3710 device_printf(sc->dev, "failed to setup params\n");
3714 /* Now give him the pointer to the stats block */
3716 #ifdef IFNET_BUF_RING
3717 slice < sc->num_slices;
3722 ss = &sc->ss[slice];
3724 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3726 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3727 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice number is encoded in the high half of data2 */
3728 cmd.data2 |= (slice << 16);
3729 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
/* fall back to the obsolete stats-DMA command for old firmware */
3733 bus = sc->ss->fw_stats_dma.bus_addr;
3734 bus += offsetof(struct mcp_irq_data, send_done_count);
3735 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3736 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3737 err = mxge_send_cmd(sc,
3738 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3740 /* Firmware cannot support multicast without STATS_DMA_V2 */
3741 sc->fw_multicast_support = 0;
3743 sc->fw_multicast_support = 1;
3747 device_printf(sc->dev, "failed to setup params\n");
3751 for (slice = 0; slice < sc->num_slices; slice++) {
3752 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3754 device_printf(sc->dev, "couldn't open slice %d\n",
3760 /* Finally, start the firmware running */
3761 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3763 device_printf(sc->dev, "Couldn't bring up link\n");
3766 #ifdef IFNET_BUF_RING
3767 for (slice = 0; slice < sc->num_slices; slice++) {
3768 ss = &sc->ss[slice];
3769 ss->if_drv_flags |= IFF_DRV_RUNNING;
3770 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3773 sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3774 sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
/* error path: release any mbufs stocked into the rings */
3780 mxge_free_mbufs(sc);
/*
 * mxge_close(): take the interface down — clear RUNNING, tell the
 * firmware to stop the ethernet engine, wait (via down_cnt) for the
 * "down" interrupt to arrive, then free the ring mbufs.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
3786 mxge_close(mxge_softc_t *sc, int down)
3789 int err, old_down_cnt;
3790 #ifdef IFNET_BUF_RING
3791 struct mxge_slice_state *ss;
3795 #ifdef IFNET_BUF_RING
3796 for (slice = 0; slice < sc->num_slices; slice++) {
3797 ss = &sc->ss[slice];
3798 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3801 sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3803 old_down_cnt = sc->down_cnt;
3805 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3807 device_printf(sc->dev,
3808 "Couldn't bring down link\n");
3810 if (old_down_cnt == sc->down_cnt) {
3811 /* wait for down irq */
3812 DELAY(10 * sc->intr_coal_delay);
3815 if (old_down_cnt == sc->down_cnt) {
3816 device_printf(sc->dev, "never got down irq\n");
3819 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space(): read the PCIe link width, set the max read
 * request size to 4KB in the PCIe device-control register (or restore
 * the value saved before a watchdog reset), and enable bus mastering
 * and memory-space decoding.
 * NOTE(review): the '®' below appears to be mojibake for '&reg' (the
 * capability-offset out-parameter of pci_find_cap) introduced by the
 * extraction — confirm against the full source.
 */
3825 mxge_setup_cfg_space(mxge_softc_t *sc)
3827 device_t dev = sc->dev;
3829 uint16_t cmd, lnk, pectl;
3831 /* find the PCIe link width and set max read request to 4KB*/
3832 if (pci_find_cap(dev, PCIY_EXPRESS, ®) == 0) {
/* link status register lives at cap offset 0x12 */
3833 lnk = pci_read_config(dev, reg + 0x12, 2);
3834 sc->link_width = (lnk >> 4) & 0x3f;
3836 if (sc->pectl == 0) {
/* device control register at cap offset 0x8: MRRS field = 5 (4KB) */
3837 pectl = pci_read_config(dev, reg + 0x8, 2);
3838 pectl = (pectl & ~0x7000) | (5 << 12);
3839 pci_write_config(dev, reg + 0x8, pectl, 2);
3842 /* restore saved pectl after watchdog reset */
3843 pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3847 /* Enable DMA and Memory space access */
3848 pci_enable_busmaster(dev);
3849 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3850 cmd |= PCIM_CMD_MEMEN;
3851 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * mxge_read_reboot(): fetch the NIC's reboot-status register through
 * the vendor-specific PCI capability's read32 window.  Returns the
 * status word, or (uint32_t)-1 if the capability cannot be found.
 */
3855 mxge_read_reboot(mxge_softc_t *sc)
3857 device_t dev = sc->dev;
3860 /* find the vendor specific offset */
3861 if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3862 device_printf(sc->dev,
3863 "could not find vendor specific offset\n");
3864 return (uint32_t)-1;
3866 /* enable read32 mode */
3867 pci_write_config(dev, vs + 0x10, 0x3, 1);
3868 /* tell NIC which register to read */
3869 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3870 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset(): recover a hung/rebooted NIC.  Detects a NIC
 * reboot by the cleared busmaster bit, quiesces TX, restores PCI
 * config space, reloads firmware and reopens the interface; then
 * re-arms the periodic tick callout.
 * NOTE(review): interior lines are missing from this extract; confirm
 * against the full source.
 */
3874 mxge_watchdog_reset(mxge_softc_t *sc)
3876 struct pci_devinfo *dinfo;
3877 struct mxge_slice_state *ss;
3878 int err, running, s, num_tx_slices = 1;
3884 device_printf(sc->dev, "Watchdog reset!\n");
3887 * check to see if the NIC rebooted. If it did, then all of
3888 * PCI config space has been reset, and things like the
3889 * busmaster bit will be zero. If this is the case, then we
3890 * must restore PCI config space before the NIC can be used
3893 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
/* 0xffff means config reads are failing entirely */
3894 if (cmd == 0xffff) {
3896 * maybe the watchdog caught the NIC rebooting; wait
3897 * up to 100ms for it to finish. If it does not come
3898 * back, then give up
3901 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3902 if (cmd == 0xffff) {
3903 device_printf(sc->dev, "NIC disappeared!\n");
3906 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3907 /* print the reboot status */
3908 reboot = mxge_read_reboot(sc);
3909 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3911 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3915 * quiesce NIC so that TX routines will not try to
3916 * xmit after restoration of BAR
3919 /* Mark the link as down */
3920 if (sc->link_state) {
3922 if_link_state_change(sc->ifp,
3925 #ifdef IFNET_BUF_RING
3926 num_tx_slices = sc->num_slices;
3928 /* grab all TX locks to ensure no tx */
3929 for (s = 0; s < num_tx_slices; s++) {
3931 mtx_lock(&ss->tx.mtx);
3935 /* restore PCI configuration space */
3936 dinfo = device_get_ivars(sc->dev);
3937 pci_cfg_restore(sc->dev, dinfo);
3939 /* and redo any changes we made to our config space */
3940 mxge_setup_cfg_space(sc);
3943 err = mxge_load_firmware(sc, 0);
3945 device_printf(sc->dev,
3946 "Unable to re-load f/w\n");
3950 err = mxge_open(sc);
3951 /* release all TX locks */
3952 for (s = 0; s < num_tx_slices; s++) {
3954 #ifdef IFNET_BUF_RING
3955 mxge_start_locked(ss);
3957 mtx_unlock(&ss->tx.mtx);
3960 sc->watchdog_resets++;
3962 device_printf(sc->dev,
3963 "NIC did not reboot, not resetting\n");
3967 device_printf(sc->dev, "watchdog reset failed\n");
/* re-arm the periodic tick regardless of outcome */
3971 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * mxge_watchdog_task(): taskqueue wrapper that runs the watchdog
 * reset with the driver lock held (reset can sleep, so it cannot run
 * from the tick callout directly).
 */
3976 mxge_watchdog_task(void *arg, int pending)
3978 mxge_softc_t *sc = arg;
3981 mtx_lock(&sc->driver_mtx);
3982 mxge_watchdog_reset(sc);
3983 mtx_unlock(&sc->driver_mtx);
/*
 * mxge_warn_stuck(): dump the TX ring state of a slice that appears
 * wedged (diagnostics only; the tx argument is re-pointed at the
 * slice's ring before use).
 */
3987 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3989 tx = &sc->ss[slice].tx;
3990 device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3991 device_printf(sc->dev,
3992 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3993 tx->req, tx->done, tx->queue_active);
3994 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3995 tx->activate, tx->deactivate);
3996 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3998 be32toh(sc->ss->fw_stats->send_done_count));
/*
 * mxge_watchdog(): periodic TX-stall detector.  If a slice has made no
 * TX progress since the last tick and is not merely blocked by
 * received pause frames, schedule a watchdog reset; otherwise snapshot
 * the counters for the next comparison, and run a media probe if one
 * was requested by the interrupt handler.
 */
4002 mxge_watchdog(mxge_softc_t *sc)
4005 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
4008 /* see if we have outstanding transmits, which
4009 have been pending for more than mxge_ticks */
4011 #ifdef IFNET_BUF_RING
4012 (i < sc->num_slices) && (err == 0);
4014 (i < 1) && (err == 0);
/* stalled: requests pending, none completed since last tick */
4018 if (tx->req != tx->done &&
4019 tx->watchdog_req != tx->watchdog_done &&
4020 tx->done == tx->watchdog_done) {
4021 /* check for pause blocking before resetting */
4022 if (tx->watchdog_rx_pause == rx_pause) {
4023 mxge_warn_stuck(sc, tx, i);
4024 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4028 device_printf(sc->dev, "Flow control blocking "
4029 "xmits, check link partner\n");
/* remember current state for the next watchdog pass */
4032 tx->watchdog_req = tx->req;
4033 tx->watchdog_done = tx->done;
4034 tx->watchdog_rx_pause = rx_pause;
4037 if (sc->need_media_probe)
4038 mxge_media_probe(sc);
/*
 * mxge_update_stats(): aggregate per-slice counters into the ifnet
 * statistics.  Returns the packet delta since the previous call
 * (used by the tick routine to detect an idle NIC — inferred from
 * the pkts accumulation; confirm against the full source).
 */
4043 mxge_update_stats(mxge_softc_t *sc)
4045 struct mxge_slice_state *ss;
4047 u_long ipackets = 0;
4048 u_long opackets = 0;
4049 #ifdef IFNET_BUF_RING
4057 for (slice = 0; slice < sc->num_slices; slice++) {
4058 ss = &sc->ss[slice];
4059 ipackets += ss->ipackets;
4060 opackets += ss->opackets;
4061 #ifdef IFNET_BUF_RING
4062 obytes += ss->obytes;
4063 omcasts += ss->omcasts;
4064 odrops += ss->tx.br->br_drops;
4066 oerrors += ss->oerrors;
/* packet delta since the last aggregation */
4068 pkts = (ipackets - sc->ifp->if_ipackets);
4069 pkts += (opackets - sc->ifp->if_opackets);
4070 sc->ifp->if_ipackets = ipackets;
4071 sc->ifp->if_opackets = opackets;
4072 #ifdef IFNET_BUF_RING
4073 sc->ifp->if_obytes = obytes;
4074 sc->ifp->if_omcasts = omcasts;
4075 sc->ifp->if_snd.ifq_drops = odrops;
4077 sc->ifp->if_oerrors = oerrors;
/*
 * mxge_tick(): periodic callout — aggregate stats, run the watchdog
 * every few ticks while the interface is up, verify the NIC has not
 * dropped bus mastering while idle, and re-schedule itself (less
 * frequently when idle).
 */
4082 mxge_tick(void *arg)
4084 mxge_softc_t *sc = arg;
4091 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4093 /* aggregate stats from different slices */
4094 pkts = mxge_update_stats(sc);
/* run the full watchdog only every 5th tick */
4095 if (!sc->watchdog_countdown) {
4096 err = mxge_watchdog(sc);
4097 sc->watchdog_countdown = 4;
4099 sc->watchdog_countdown--;
4102 /* ensure NIC did not suffer h/w fault while idle */
4103 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4104 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4106 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4109 /* look less often if NIC is idle */
4114 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
/* mxge_media_change(): ifmedia change callback (body not visible in
 * this extract — confirm against the full source). */
4119 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu(): validate and apply a new MTU.  Rejects frames
 * larger than the NIC's max or smaller than 60 bytes on the wire; if
 * the interface is running, closes and reopens it, rolling back to
 * the old MTU when the reopen fails.
 */
4125 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4127 struct ifnet *ifp = sc->ifp;
4128 int real_mtu, old_mtu;
4132 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4133 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4135 mtx_lock(&sc->driver_mtx);
4136 old_mtu = ifp->if_mtu;
4138 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4140 err = mxge_open(sc);
/* reopen failed: restore the previous MTU and reopen with it */
4142 ifp->if_mtu = old_mtu;
4144 (void) mxge_open(sc);
4147 mtx_unlock(&sc->driver_mtx);
/*
 * mxge_media_status(): ifmedia status callback — report link validity,
 * full-duplex Ethernet, active state, and the cached media type.
 */
4152 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4154 mxge_softc_t *sc = ifp->if_softc;
4159 ifmr->ifm_status = IFM_AVALID;
4160 ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4161 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4162 ifmr->ifm_active |= sc->current_media;
/*
 * mxge_ioctl(): ifnet ioctl handler — MTU changes, interface up/down,
 * multicast list updates, capability toggles (checksum offload, TSO,
 * TSO6, LRO, VLAN offloads) and media requests.  TSO requires the
 * corresponding TX checksum capability; VLAN_HWTSO requires
 * VLAN_HWTAGGING.
 * NOTE(review): interior lines (case labels, defaults) are missing
 * from this extract; confirm against the full source.
 */
4166 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4168 mxge_softc_t *sc = ifp->if_softc;
4169 struct ifreq *ifr = (struct ifreq *)data;
4176 err = ether_ioctl(ifp, command, data);
4180 err = mxge_change_mtu(sc, ifr->ifr_mtu);
4184 mtx_lock(&sc->driver_mtx);
4186 mtx_unlock(&sc->driver_mtx);
4189 if (ifp->if_flags & IFF_UP) {
4190 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4191 err = mxge_open(sc);
4193 /* take care of promis can allmulti
4195 mxge_change_promisc(sc,
4196 ifp->if_flags & IFF_PROMISC);
4197 mxge_set_multicast_list(sc);
4200 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4204 mtx_unlock(&sc->driver_mtx);
4209 mtx_lock(&sc->driver_mtx);
4210 mxge_set_multicast_list(sc);
4211 mtx_unlock(&sc->driver_mtx);
4215 mtx_lock(&sc->driver_mtx);
/* mask = capabilities the caller wants toggled */
4216 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4217 if (mask & IFCAP_TXCSUM) {
4218 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TX csum also disables TSO4, which depends on it */
4219 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4220 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4222 ifp->if_capenable |= IFCAP_TXCSUM;
4223 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4225 } else if (mask & IFCAP_RXCSUM) {
4226 if (IFCAP_RXCSUM & ifp->if_capenable) {
4227 ifp->if_capenable &= ~IFCAP_RXCSUM;
4229 ifp->if_capenable |= IFCAP_RXCSUM;
4232 if (mask & IFCAP_TSO4) {
4233 if (IFCAP_TSO4 & ifp->if_capenable) {
4234 ifp->if_capenable &= ~IFCAP_TSO4;
4235 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4236 ifp->if_capenable |= IFCAP_TSO4;
4237 ifp->if_hwassist |= CSUM_TSO;
4239 printf("mxge requires tx checksum offload"
4240 " be enabled to use TSO\n");
4245 if (mask & IFCAP_TXCSUM_IPV6) {
4246 if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4247 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4249 ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4252 ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4253 ifp->if_hwassist |= (CSUM_TCP_IPV6
4256 } else if (mask & IFCAP_RXCSUM_IPV6) {
4257 if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4258 ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4260 ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4263 if (mask & IFCAP_TSO6) {
4264 if (IFCAP_TSO6 & ifp->if_capenable) {
4265 ifp->if_capenable &= ~IFCAP_TSO6;
4266 } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4267 ifp->if_capenable |= IFCAP_TSO6;
4268 ifp->if_hwassist |= CSUM_TSO;
4270 printf("mxge requires tx checksum offload"
4271 " be enabled to use TSO\n");
4275 #endif /*IFCAP_TSO6 */
4277 if (mask & IFCAP_LRO)
4278 ifp->if_capenable ^= IFCAP_LRO;
4279 if (mask & IFCAP_VLAN_HWTAGGING)
4280 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4281 if (mask & IFCAP_VLAN_HWTSO)
4282 ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
/* VLAN HW TSO is only valid with HW tagging enabled */
4284 if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4285 !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4286 ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4288 mtx_unlock(&sc->driver_mtx);
4289 VLAN_CAPABILITIES(ifp);
4294 mtx_lock(&sc->driver_mtx);
4295 mxge_media_probe(sc);
4296 mtx_unlock(&sc->driver_mtx);
4297 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4298 &sc->media, command);
/*
 * mxge_fetch_tunables(): read hw.mxge.* loader tunables, clamp each
 * to a sane range, and cache flow-control and throttle settings in
 * the softc.
 */
4308 mxge_fetch_tunables(mxge_softc_t *sc)
4311 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4312 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4313 &mxge_flow_control);
4314 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4315 &mxge_intr_coal_delay);
4316 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4317 &mxge_nvidia_ecrc_enable);
4318 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4319 &mxge_force_firmware);
4320 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4321 &mxge_deassert_wait);
4322 TUNABLE_INT_FETCH("hw.mxge.verbose",
4324 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4325 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
/* both spellings of the rss hash tunable are accepted */
4326 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4327 TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4328 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4329 TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
/* clamp tunables to valid ranges */
4333 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4334 mxge_intr_coal_delay = 30;
4335 if (mxge_ticks == 0)
4336 mxge_ticks = hz / 2;
4337 sc->pause = mxge_flow_control;
4338 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4339 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4340 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4342 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4343 mxge_initial_mtu < ETHER_MIN_LEN)
4344 mxge_initial_mtu = ETHERMTU_JUMBO;
4346 if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4347 mxge_throttle = MXGE_MAX_THROTTLE;
4348 if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4349 mxge_throttle = MXGE_MIN_THROTTLE;
4350 sc->throttle = mxge_throttle;
/*
 * Tear down all per-slice state allocated by mxge_alloc_slices():
 * firmware stats DMA, the optional tx buf_ring and its mutex, and the
 * rx completion ("rx_done") DMA ring; finally free the slice array.
 */
4355 mxge_free_slices(mxge_softc_t *sc)
4357 struct mxge_slice_state *ss;
4364 for (i = 0; i < sc->num_slices; i++) {
/* fw_stats non-NULL marks a fully initialized slice. */
4366 if (ss->fw_stats != NULL) {
4367 mxge_dma_free(&ss->fw_stats_dma);
4368 ss->fw_stats = NULL;
4369 #ifdef IFNET_BUF_RING
4370 if (ss->tx.br != NULL) {
4371 drbr_free(ss->tx.br, M_DEVBUF);
4375 mtx_destroy(&ss->tx.mtx);
4377 if (ss->rx_done.entry != NULL) {
4378 mxge_dma_free(&ss->rx_done.dma);
4379 ss->rx_done.entry = NULL;
4382 free(sc->ss, M_DEVBUF);
/*
 * Allocate the per-slice state array (sc->ss) and, for each slice, the
 * rx completion queue DMA region, the firmware stats DMA block, the tx
 * mutex, and (with IFNET_BUF_RING) a 2048-entry tx buf_ring.
 * Returns 0 on success; on failure unwinds via mxge_free_slices().
 */
4387 mxge_alloc_slices(mxge_softc_t *sc)
4390 struct mxge_slice_state *ss;
4392 int err, i, max_intr_slots;
/* Ask the firmware for the rx ring size to size the intr queues. */
4394 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4396 device_printf(sc->dev, "Cannot determine rx ring size\n");
4399 sc->rx_ring_size = cmd.data0;
/* Two intr slots per rx descriptor (small + big rings). */
4400 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4402 bytes = sizeof (*sc->ss) * sc->num_slices;
4403 sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4406 for (i = 0; i < sc->num_slices; i++) {
4411 /* allocate per-slice rx interrupt queues */
4413 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
/* 4 KB alignment for the rx completion DMA ring. */
4414 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4417 ss->rx_done.entry = ss->rx_done.dma.addr;
4418 bzero(ss->rx_done.entry, bytes);
4421 * allocate the per-slice firmware stats; stats
4422 * (including tx) are used used only on the first
4425 #ifndef IFNET_BUF_RING
4430 bytes = sizeof (*ss->fw_stats);
/* 64-byte alignment for the firmware stats block. */
4431 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4432 sizeof (*ss->fw_stats), 64);
4435 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4436 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4437 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4438 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4439 #ifdef IFNET_BUF_RING
4440 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
/* Error path: release anything allocated so far. */
4448 mxge_free_slices(sc);
/*
 * Decide how many slices (RSS queues) to use.  Multi-slice operation
 * requires: the tunable enabled, an SMP system, MSI-X vectors, the
 * RSS-capable firmware loading and answering RESET / INTRQ_SIZE /
 * GET_MAX_RSS_QUEUES.  The result is capped by the MSI-X vector count,
 * the CPU count (or the tunable), and rounded down to a power of two.
 * On any failure, fall back to the original firmware and one slice.
 */
4453 mxge_slice_probe(mxge_softc_t *sc)
4457 int msix_cnt, status, max_intr_slots;
4461 * don't enable multiple slices if they are not enabled,
4462 * or if this is not an SMP system
4465 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4468 /* see how many MSI-X interrupts are available */
4469 msix_cnt = pci_msix_count(sc->dev);
4473 /* now load the slice aware firmware see what it supports */
4474 old_fw = sc->fw_name;
/* Pick the RSS variant matching the current aligned/unaligned choice. */
4475 if (old_fw == mxge_fw_aligned)
4476 sc->fw_name = mxge_fw_rss_aligned;
4478 sc->fw_name = mxge_fw_rss_unaligned;
4479 status = mxge_load_firmware(sc, 0);
4481 device_printf(sc->dev, "Falling back to a single slice\n");
4485 /* try to send a reset command to the card to see if it
4487 memset(&cmd, 0, sizeof (cmd));
4488 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4490 device_printf(sc->dev, "failed reset\n");
4494 /* get rx ring size */
4495 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4497 device_printf(sc->dev, "Cannot determine rx ring size\n");
4500 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4502 /* tell it the size of the interrupt queues */
4503 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4504 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4506 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4510 /* ask the maximum number of slices it supports */
4511 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4513 device_printf(sc->dev,
4514 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4517 sc->num_slices = cmd.data0;
/* Cannot use more slices than MSI-X vectors. */
4518 if (sc->num_slices > msix_cnt)
4519 sc->num_slices = msix_cnt;
/* -1 means "auto": cap to CPUs; otherwise cap to the tunable. */
4521 if (mxge_max_slices == -1) {
4522 /* cap to number of CPUs in system */
4523 if (sc->num_slices > mp_ncpus)
4524 sc->num_slices = mp_ncpus;
4526 if (sc->num_slices > mxge_max_slices)
4527 sc->num_slices = mxge_max_slices;
4529 /* make sure it is a power of two */
4530 while (sc->num_slices & (sc->num_slices - 1))
4534 device_printf(sc->dev, "using %d slices\n",
/* Failure path: restore and reload the non-RSS firmware. */
4540 sc->fw_name = old_fw;
4541 (void) mxge_load_firmware(sc, 0);
/*
 * Set up one MSI-X interrupt per slice: map the MSI-X table (BAR 2),
 * allocate num_slices vectors, then per-slice IRQ resources and
 * handlers (all routed to mxge_intr with the slice as the argument).
 * Errors unwind through the goto chain in reverse allocation order.
 */
4545 mxge_add_msix_irqs(mxge_softc_t *sc)
4548 int count, err, i, rid;
4551 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4554 if (sc->msix_table_res == NULL) {
4555 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4559 count = sc->num_slices;
4560 err = pci_alloc_msix(sc->dev, &count);
4562 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4563 "err = %d \n", sc->num_slices, err);
4564 goto abort_with_msix_table;
/* Partial vector grants are not usable; tell the admin how to retune. */
4566 if (count < sc->num_slices) {
4567 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4568 count, sc->num_slices);
4569 device_printf(sc->dev,
4570 "Try setting hw.mxge.max_slices to %d\n",
4573 goto abort_with_msix;
4575 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4576 sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4577 if (sc->msix_irq_res == NULL) {
4579 goto abort_with_msix;
/* One SYS_RES_IRQ resource per slice. */
4582 for (i = 0; i < sc->num_slices; i++) {
4584 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4587 if (sc->msix_irq_res[i] == NULL) {
4588 device_printf(sc->dev, "couldn't allocate IRQ res"
4589 " for message %d\n", i);
4591 goto abort_with_res;
4595 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4596 sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
/* Hook each vector to mxge_intr with its slice state as argument. */
4598 for (i = 0; i < sc->num_slices; i++) {
4599 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4600 INTR_TYPE_NET | INTR_MPSAFE,
4601 #if __FreeBSD_version > 700030
4604 mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4606 device_printf(sc->dev, "couldn't setup intr for "
4608 goto abort_with_intr;
4610 bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4611 sc->msix_ih[i], "s%d", i);
4615 device_printf(sc->dev, "using %d msix IRQs:",
4617 for (i = 0; i < sc->num_slices; i++)
4618 printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* ---- error unwind, reverse order of the allocations above ---- */
4624 for (i = 0; i < sc->num_slices; i++) {
4625 if (sc->msix_ih[i] != NULL) {
4626 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4628 sc->msix_ih[i] = NULL;
4631 free(sc->msix_ih, M_DEVBUF);
4635 for (i = 0; i < sc->num_slices; i++) {
4637 if (sc->msix_irq_res[i] != NULL)
4638 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4639 sc->msix_irq_res[i]);
4640 sc->msix_irq_res[i] = NULL;
4642 free(sc->msix_irq_res, M_DEVBUF);
4646 pci_release_msi(sc->dev);
4648 abort_with_msix_table:
4649 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4650 sc->msix_table_res);
/*
 * Single-interrupt setup: use MSI when exactly one message is
 * available and allocation succeeds, otherwise fall back to legacy
 * INTx (rid 0 for INTx, rid 1 for MSI).  Hooks mxge_intr on slice 0.
 */
4656 mxge_add_single_irq(mxge_softc_t *sc)
4658 int count, err, rid;
4660 count = pci_msi_count(sc->dev);
4661 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4667 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4668 1, RF_SHAREABLE | RF_ACTIVE);
4669 if (sc->irq_res == NULL) {
4670 device_printf(sc->dev, "could not alloc interrupt\n");
4674 device_printf(sc->dev, "using %s irq %ld\n",
4675 sc->legacy_irq ? "INTx" : "MSI",
4676 rman_get_start(sc->irq_res));
4677 err = bus_setup_intr(sc->dev, sc->irq_res,
4678 INTR_TYPE_NET | INTR_MPSAFE,
4679 #if __FreeBSD_version > 700030
4682 mxge_intr, &sc->ss[0], &sc->ih);
/* On failure release the IRQ resource and, for MSI, the message. */
4684 bus_release_resource(sc->dev, SYS_RES_IRQ,
4685 sc->legacy_irq ? 0 : 1, sc->irq_res);
4686 if (!sc->legacy_irq)
4687 pci_release_msi(sc->dev);
/*
 * Undo mxge_add_msix_irqs(): tear down each slice's handler and IRQ
 * resource, free the bookkeeping arrays, release the MSI-X table
 * mapping (BAR 2) and the MSI-X vectors.
 */
4693 mxge_rem_msix_irqs(mxge_softc_t *sc)
4697 for (i = 0; i < sc->num_slices; i++) {
4698 if (sc->msix_ih[i] != NULL) {
4699 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4701 sc->msix_ih[i] = NULL;
4704 free(sc->msix_ih, M_DEVBUF);
4706 for (i = 0; i < sc->num_slices; i++) {
4708 if (sc->msix_irq_res[i] != NULL)
4709 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4710 sc->msix_irq_res[i]);
4711 sc->msix_irq_res[i] = NULL;
4713 free(sc->msix_irq_res, M_DEVBUF);
4715 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4716 sc->msix_table_res);
4718 pci_release_msi(sc->dev);
/*
 * Undo mxge_add_single_irq(): tear down the handler, release the IRQ
 * resource (rid 0 = INTx, rid 1 = MSI), and free the MSI message when
 * one was allocated.
 */
4723 mxge_rem_single_irq(mxge_softc_t *sc)
4725 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4726 bus_release_resource(sc->dev, SYS_RES_IRQ,
4727 sc->legacy_irq ? 0 : 1, sc->irq_res);
4728 if (!sc->legacy_irq)
4729 pci_release_msi(sc->dev);
/* Dispatch interrupt teardown: MSI-X for multi-slice, else single IRQ. */
4733 mxge_rem_irq(mxge_softc_t *sc)
4735 if (sc->num_slices > 1)
4736 mxge_rem_msix_irqs(sc);
4738 mxge_rem_single_irq(sc);
/*
 * Dispatch interrupt setup: MSI-X when running multi-slice, otherwise
 * a single MSI/INTx interrupt.  Returns the setup routine's error.
 */
4742 mxge_add_irq(mxge_softc_t *sc)
4746 if (sc->num_slices > 1)
4747 err = mxge_add_msix_irqs(sc);
4749 err = mxge_add_single_irq(sc);
/*
 * NOTE(review): the `0 &&` makes this retry branch dead code —
 * presumably a disabled debug/experiment path left in place.
 */
4751 if (0 && err == 0 && sc->num_slices > 1) {
4752 mxge_rem_msix_irqs(sc);
4753 err = mxge_add_msix_irqs(sc);
/*
 * Device attach: bring the NIC from PCI probe to a registered ifnet.
 * Order: tunables -> watchdog taskqueue -> parent DMA tag -> ifnet +
 * mutexes -> BAR mapping / EEPROM strings -> command/zeropad/dmabench
 * DMA -> firmware select -> slices/rings/IRQs -> capability setup ->
 * ether_ifattach -> sysctls -> start taskqueue + tick callout.
 * Every failure unwinds through the abort_* label chain in reverse.
 */
4760 mxge_attach(device_t dev)
4763 mxge_softc_t *sc = device_get_softc(dev);
4768 mxge_fetch_tunables(sc);
/* Watchdog runs from a private taskqueue so it can sleep. */
4770 TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4771 sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4772 taskqueue_thread_enqueue, &sc->tq);
4773 if (sc->tq == NULL) {
4775 goto abort_with_nothing;
/* Parent tag all per-ring DMA tags derive from; 64 KB max segment. */
4778 err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
4781 BUS_SPACE_MAXADDR, /* low */
4782 BUS_SPACE_MAXADDR, /* high */
4783 NULL, NULL, /* filter */
4784 65536 + 256, /* maxsize */
4785 MXGE_MAX_SEND_DESC, /* num segs */
4786 65536, /* maxsegsize */
4788 NULL, NULL, /* lock */
4789 &sc->parent_dmat); /* tag */
4792 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4797 ifp = sc->ifp = if_alloc(IFT_ETHER);
4799 device_printf(dev, "can not if_alloc()\n");
4801 goto abort_with_parent_dmat;
4803 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
/* cmd mutex serializes firmware commands; driver mutex is the main lock. */
4805 snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4806 device_get_nameunit(dev));
4807 mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4808 snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4809 "%s:drv", device_get_nameunit(dev));
4810 mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4811 MTX_NETWORK_LOCK, MTX_DEF);
4813 callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4815 mxge_setup_cfg_space(sc);
4817 /* Map the board into the kernel */
4819 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4821 if (sc->mem_res == NULL) {
4822 device_printf(dev, "could not map memory\n");
4824 goto abort_with_lock;
4826 sc->sram = rman_get_virtual(sc->mem_res);
/* 2 MB SRAM minus firmware-reserved regions and a 256-byte guard. */
4827 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4828 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4829 device_printf(dev, "impossible memory region size %ld\n",
4830 rman_get_size(sc->mem_res));
4832 goto abort_with_mem_res;
4835 /* make NULL terminated copy of the EEPROM strings section of
4837 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4838 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4839 rman_get_bushandle(sc->mem_res),
4840 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4842 MXGE_EEPROM_STRINGS_SIZE - 2);
4843 err = mxge_parse_strings(sc);
4845 goto abort_with_mem_res;
4847 /* Enable write combining for efficient use of PCIe bus */
4850 /* Allocate the out of band dma memory */
4851 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4852 sizeof (mxge_cmd_t), 64);
4854 goto abort_with_mem_res;
4855 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
/* 64-byte zero pad block the firmware DMAs short frames from. */
4856 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4858 goto abort_with_cmd_dma;
/* Scratch page for the DMA benchmark used at reset time. */
4860 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4862 goto abort_with_zeropad_dma;
4864 /* select & load the firmware */
4865 err = mxge_select_firmware(sc);
4867 goto abort_with_dmabench;
4868 sc->intr_coal_delay = mxge_intr_coal_delay;
4870 mxge_slice_probe(sc);
4871 err = mxge_alloc_slices(sc);
4873 goto abort_with_dmabench;
4875 err = mxge_reset(sc, 0);
4877 goto abort_with_slices;
4879 err = mxge_alloc_rings(sc);
4881 device_printf(sc->dev, "failed to allocate rings\n");
4882 goto abort_with_slices;
4885 err = mxge_add_irq(sc);
4887 device_printf(sc->dev, "failed to add irq\n");
4888 goto abort_with_rings;
/* ---- advertise interface capabilities ---- */
4891 ifp->if_baudrate = IF_Gbps(10UL);
4892 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4893 IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4895 #if defined(INET) || defined(INET6)
4896 ifp->if_capabilities |= IFCAP_LRO;
4899 #ifdef MXGE_NEW_VLAN_API
4900 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4902 /* Only FW 1.4.32 and newer can do TSO over vlans */
4903 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4904 sc->fw_ver_tiny >= 32)
4905 ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4907 sc->max_mtu = mxge_max_mtu(sc);
4908 if (sc->max_mtu >= 9000)
4909 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4911 device_printf(dev, "MTU limited to %d. Install "
4912 "latest firmware for 9000 byte jumbo support\n",
4913 sc->max_mtu - ETHER_HDR_LEN);
4914 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4915 ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4916 /* check to see if f/w supports TSO for IPv6 */
4917 if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4919 ifp->if_capabilities |= IFCAP_TSO6;
4920 sc->max_tso6_hlen = min(cmd.data0,
4921 sizeof (sc->ss[0].scratch));
4923 ifp->if_capenable = ifp->if_capabilities;
4924 if (sc->lro_cnt == 0)
4925 ifp->if_capenable &= ~IFCAP_LRO;
4926 ifp->if_init = mxge_init;
4928 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4929 ifp->if_ioctl = mxge_ioctl;
4930 ifp->if_start = mxge_start;
4931 /* Initialise the ifmedia structure */
4932 ifmedia_init(&sc->media, 0, mxge_media_change,
4934 mxge_media_init(sc);
4935 mxge_media_probe(sc);
4937 ether_ifattach(ifp, sc->mac_addr);
4938 /* ether_ifattach sets mtu to ETHERMTU */
4939 if (mxge_initial_mtu != ETHERMTU)
4940 mxge_change_mtu(sc, mxge_initial_mtu);
4942 mxge_add_sysctls(sc);
4943 #ifdef IFNET_BUF_RING
4944 ifp->if_transmit = mxge_transmit;
4945 ifp->if_qflush = mxge_qflush;
4947 taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4948 device_get_nameunit(sc->dev));
4949 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/* ---- error unwind, reverse order of the setup above ---- */
4953 mxge_free_rings(sc);
4955 mxge_free_slices(sc);
4956 abort_with_dmabench:
4957 mxge_dma_free(&sc->dmabench_dma);
4958 abort_with_zeropad_dma:
4959 mxge_dma_free(&sc->zeropad_dma);
4961 mxge_dma_free(&sc->cmd_dma);
4963 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4965 pci_disable_busmaster(dev);
4966 mtx_destroy(&sc->cmd_mtx);
4967 mtx_destroy(&sc->driver_mtx);
4969 abort_with_parent_dmat:
4970 bus_dma_tag_destroy(sc->parent_dmat);
4972 if (sc->tq != NULL) {
4973 taskqueue_drain(sc->tq, &sc->watchdog_task);
4974 taskqueue_free(sc->tq);
/*
 * Device detach: refuse while vlans are attached, stop the interface
 * if running, then release everything mxge_attach() set up, in
 * reverse order (ifnet, taskqueue, callout, media, sysctls, rings,
 * slices, DMA blocks, BAR mapping, busmaster, mutexes, DMA tag).
 */
4982 mxge_detach(device_t dev)
4984 mxge_softc_t *sc = device_get_softc(dev);
4986 if (mxge_vlans_active(sc)) {
4987 device_printf(sc->dev,
4988 "Detach vlans before removing module\n");
4991 mtx_lock(&sc->driver_mtx);
4993 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4995 mtx_unlock(&sc->driver_mtx);
4996 ether_ifdetach(sc->ifp);
4997 if (sc->tq != NULL) {
4998 taskqueue_drain(sc->tq, &sc->watchdog_task);
4999 taskqueue_free(sc->tq);
5002 callout_drain(&sc->co_hdl);
5003 ifmedia_removeall(&sc->media);
/* Quiesce the firmware's confirmation DMA before freeing memory. */
5004 mxge_dummy_rdma(sc, 0);
5005 mxge_rem_sysctls(sc);
5007 mxge_free_rings(sc);
5008 mxge_free_slices(sc);
5009 mxge_dma_free(&sc->dmabench_dma);
5010 mxge_dma_free(&sc->zeropad_dma);
5011 mxge_dma_free(&sc->cmd_dma);
5012 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5013 pci_disable_busmaster(dev);
5014 mtx_destroy(&sc->cmd_mtx);
5015 mtx_destroy(&sc->driver_mtx);
5017 bus_dma_tag_destroy(sc->parent_dmat);
5022 mxge_shutdown(device_t dev)
5028 This file uses Myri10GE driver indentation.
5031 c-file-style:"linux"