1 /******************************************************************************
3 Copyright (c) 2006-2009, Myricom Inc.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 ***************************************************************************/
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
40 #include <sys/malloc.h>
42 #include <sys/kernel.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
49 /* count xmits ourselves, rather than via drbr */
52 #include <net/if_arp.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
59 #include <net/if_types.h>
60 #include <net/if_vlan_var.h>
63 #include <netinet/in_systm.h>
64 #include <netinet/in.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
68 #include <machine/bus.h>
69 #include <machine/in_cksum.h>
70 #include <machine/resource.h>
75 #include <dev/pci/pcireg.h>
76 #include <dev/pci/pcivar.h>
77 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79 #include <vm/vm.h> /* for pmap_mapdev() */
82 #if defined(__i386) || defined(__amd64)
83 #include <machine/specialreg.h>
86 #include <dev/mxge/mxge_mcp.h>
87 #include <dev/mxge/mcp_gen_header.h>
88 /*#define MXGE_FAKE_IFP*/
89 #include <dev/mxge/if_mxge_var.h>
91 #include <sys/buf_ring.h>
/*
 * Module-scope tunables and newbus glue.
 * NOTE(review): this extraction carries stray leading line numbers and
 * has dropped lines (e.g. the driver_t initializer is truncated);
 * verify against the original sys/dev/mxge/if_mxge.c before building.
 */
97 static int mxge_nvidia_ecrc_enable = 1;
98 static int mxge_force_firmware = 0;
99 static int mxge_intr_coal_delay = 30;
100 static int mxge_deassert_wait = 1;
101 static int mxge_flow_control = 1;
102 static int mxge_verbose = 0;
103 static int mxge_lro_cnt = 8;
104 static int mxge_ticks;
105 static int mxge_max_slices = 1;
106 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
107 static int mxge_always_promisc = 0;
108 static int mxge_initial_mtu = ETHERMTU_JUMBO;
/* firmware image names: aligned vs. unaligned PCIe completions, RSS vs. single-slice */
109 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
110 static char *mxge_fw_aligned = "mxge_eth_z8e";
111 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
112 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
/* newbus device-method forward declarations */
114 static int mxge_probe(device_t dev);
115 static int mxge_attach(device_t dev);
116 static int mxge_detach(device_t dev);
117 static int mxge_shutdown(device_t dev);
118 static void mxge_intr(void *arg);
120 static device_method_t mxge_methods[] =
122 /* Device interface */
123 DEVMETHOD(device_probe, mxge_probe),
124 DEVMETHOD(device_attach, mxge_attach),
125 DEVMETHOD(device_detach, mxge_detach),
126 DEVMETHOD(device_shutdown, mxge_shutdown),
/* NOTE(review): driver_t initializer below is missing its name/methods
 * fields in this extraction */
130 static driver_t mxge_driver =
134 sizeof(mxge_softc_t),
137 static devclass_t mxge_devclass;
139 /* Declare ourselves to be a child of the PCI bus.*/
140 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
141 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
142 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
/* internal helper forward declarations */
144 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
145 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
146 static int mxge_close(mxge_softc_t *sc, int down);
147 static int mxge_open(mxge_softc_t *sc);
148 static void mxge_tick(void *arg);
151 mxge_probe(device_t dev)
156 if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
157 ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
158 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
159 rev = pci_get_revid(dev);
161 case MXGE_PCI_REV_Z8E:
162 device_set_desc(dev, "Myri10G-PCIE-8A");
164 case MXGE_PCI_REV_Z8ES:
165 device_set_desc(dev, "Myri10G-PCIE-8B");
168 device_set_desc(dev, "Myri10G-PCIE-8??");
169 device_printf(dev, "Unrecognized rev %d NIC\n",
179 mxge_enable_wc(mxge_softc_t *sc)
181 #if defined(__i386) || defined(__amd64)
186 len = rman_get_size(sc->mem_res);
187 err = pmap_change_attr((vm_offset_t) sc->sram,
188 len, PAT_WRITE_COMBINING);
190 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
198 /* callback to get our DMA address */
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
204 *(bus_addr_t *) arg = segs->ds_addr;
/*
 * Allocate a coherent, zeroed DMA region of `bytes` with the given
 * alignment: create a busdma tag, allocate/map memory, and load the
 * map, recording the bus address via mxge_dmamap_callback().
 * Pairs with mxge_dma_free().
 * NOTE(review): this extraction dropped lines (the boundary/maxsegsize
 * computation branch, the error checks after each bus_dma call, and
 * the abort labels' surroundings) — do not edit without consulting
 * the original file.
 */
209 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
210 bus_size_t alignment)
213 device_t dev = sc->dev;
214 bus_size_t boundary, maxsegsize;
/* NOTE(review): the 4KB-vs-unbounded boundary/maxsegsize assignments
 * that followed this condition are missing from the extraction */
216 if (bytes > 4096 && alignment == 4096) {
224 /* allocate DMAable memory tags */
225 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
226 alignment, /* alignment */
227 boundary, /* boundary */
228 BUS_SPACE_MAXADDR, /* low */
229 BUS_SPACE_MAXADDR, /* high */
230 NULL, NULL, /* filter */
231 maxsegsize, /* maxsegsize */
234 BUS_DMA_COHERENT, /* flags */
235 NULL, NULL, /* lock */
236 &dma->dmat); /* tag */
238 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
242 /* allocate DMAable memory & map */
243 err = bus_dmamem_alloc(dma->dmat, &dma->addr,
244 (BUS_DMA_WAITOK | BUS_DMA_COHERENT
245 | BUS_DMA_ZERO), &dma->map);
247 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
248 goto abort_with_dmat;
251 /* load the memory */
252 err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
253 mxge_dmamap_callback,
254 (void *)&dma->bus_addr, 0);
256 device_printf(dev, "couldn't load map (err = %d)\n", err);
/* unwind path: free memory, then destroy tag */
262 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
264 (void)bus_dma_tag_destroy(dma->dmat);
270 mxge_dma_free(mxge_dma_t *dma)
272 bus_dmamap_unload(dma->dmat, dma->map);
273 bus_dmamem_free(dma->dmat, dma->addr, dma->map);
274 (void)bus_dma_tag_destroy(dma->dmat);
/*
 * Parse the EEPROM strings ("MAC=", "PC=", "SN=" key=value records)
 * into sc->mac_addr / product_code_string / serial_number_string.
 * NOTE(review): this extraction dropped lines (pointer advances, the
 * hex-pair loop body, early-exit and the abort path); do not edit
 * without the original file.
 * NOTE(review): strncpy with sizeof-1 leaves the destination
 * unterminated unless the softc was zeroed at allocation — TODO
 * confirm the softc is M_ZERO'd (or switch to strlcpy).
 */
278 * The eeprom strings on the lanaiX have the format
285 mxge_parse_strings(mxge_softc_t *sc)
/* advances ptr past the current NUL-terminated record, bounded by limit */
287 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
292 ptr = sc->eeprom_strings;
293 limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
295 while (ptr < limit && *ptr != '\0') {
296 if (memcmp(ptr, "MAC=", 4) == 0) {
298 sc->mac_addr_string = ptr;
/* six colon-separated hex byte pairs follow "MAC=" */
299 for (i = 0; i < 6; i++) {
301 if ((ptr + 2) > limit)
303 sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 } else if (memcmp(ptr, "PC=", 3) == 0) {
308 strncpy(sc->product_code_string, ptr,
309 sizeof (sc->product_code_string) - 1);
310 } else if (memcmp(ptr, "SN=", 3) == 0) {
312 strncpy(sc->serial_number_string, ptr,
313 sizeof (sc->serial_number_string) - 1);
315 MXGE_NEXT_STRING(ptr);
/* reached only via the abort path when a record is malformed */
322 device_printf(sc->dev, "failed to parse eeprom_strings\n");
/*
 * Enable ECRC generation on an upstream Nvidia (CK804/MCP55) PCIe
 * bridge so the NIC sees aligned completions.  On x86/amd64 this maps
 * the bridge's extended config space by hand via pmap_mapdev() because
 * FreeBSD (at the time) could not access config offsets beyond 0xff;
 * the non-x86 variant at the bottom only prints a warning.
 * NOTE(review): this extraction dropped many lines (returns, the
 * read-modify-write of reg 0x178, #else/#endif of the arch
 * conditional); treat as reference only and consult the original.
 */
327 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
329 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 unsigned long base, off;
334 device_t pdev, mcp55;
335 uint16_t vendor_id, device_id, word;
336 uintptr_t bus, slot, func, ivend, idev;
/* honor the mxge_nvidia_ecrc_enable tunable */
340 if (!mxge_nvidia_ecrc_enable)
343 pdev = device_get_parent(device_get_parent(sc->dev));
345 device_printf(sc->dev, "could not find parent?\n");
348 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
349 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
/* only Nvidia bridges (vendor 0x10de) are handled */
351 if (vendor_id != 0x10de)
356 if (device_id == 0x005d) {
357 /* ck804, base address is magic */
359 } else if (device_id >= 0x0374 && device_id <= 0x378) {
360 /* mcp55, base address stored in chipset */
361 mcp55 = pci_find_bsf(0, 0, 0);
363 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
364 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
365 word = pci_read_config(mcp55, 0x90, 2);
366 base = ((unsigned long)word & 0x7ffeU) << 25;
373 Test below is commented because it is believed that doing
374 config read/write beyond 0xff will access the config space
375 for the next larger function. Uncomment this and remove
376 the hacky pmap_mapdev() way of accessing config space when
377 FreeBSD grows support for extended pcie config space access
380 /* See if we can, by some miracle, access the extended
382 val = pci_read_config(pdev, 0x178, 4);
383 if (val != 0xffffffff) {
385 pci_write_config(pdev, 0x178, val, 4);
389 /* Rather than using normal pci config space writes, we must
390 * map the Nvidia config space ourselves. This is because on
391 * opteron/nvidia class machine the 0xe000000 mapping is
392 * handled by the nvidia chipset, that means the internal PCI
393 * device (the on-chip northbridge), or the amd-8131 bridge
394 * and things behind them are not visible by this method.
/* gather BDF + vendor/device of the bridge to locate and verify the mapping */
397 BUS_READ_IVAR(device_get_parent(pdev), pdev,
399 BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 PCI_IVAR_SLOT, &slot);
401 BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 PCI_IVAR_FUNCTION, &func);
403 BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 PCI_IVAR_VENDOR, &ivend);
405 BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 PCI_IVAR_DEVICE, &idev);
/* compute the ECAM offset of this function's config space */
409 + 0x00100000UL * (unsigned long)bus
410 + 0x00001000UL * (unsigned long)(func
413 /* map it into the kernel */
414 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
418 device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
421 /* get a pointer to the config space mapped into the kernel */
422 cfgptr = va + (off & PAGE_MASK);
424 /* make sure that we can really access it */
425 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
426 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
427 if (! (vendor_id == ivend && device_id == idev)) {
428 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
429 vendor_id, device_id);
430 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
/* reg 0x178 holds the ECRC enable bit; read-modify-write (write dropped here) */
434 ptr32 = (uint32_t*)(cfgptr + 0x178);
437 if (val == 0xffffffff) {
438 device_printf(sc->dev, "extended mapping failed\n");
439 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
443 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
445 device_printf(sc->dev,
446 "Enabled ECRC on upstream Nvidia bridge "
448 (int)bus, (int)slot, (int)func);
/* non-x86 stub variant: ECRC trick is x86-specific, just warn */
453 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
455 device_printf(sc->dev,
456 "Nforce 4 chipset on non-x86/amd64!?!?!\n");
/*
 * Run the firmware's DMA benchmark three times (read, write,
 * read+write) against the dmabench buffer and record throughput in
 * MB/s into sc->read_dma / write_dma / read_write_dma.
 * NOTE(review): this extraction dropped the per-step status checks and
 * the abort label/return; do not edit without the original file.
 */
463 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
472 /* Run a small DMA test.
473 * The magic multipliers to the length tell the firmware
474 * to do DMA read, write, or read+write tests. The
475 * results are returned in cmd.data0. The upper 16
476 * bits of the return is the number of transfers completed.
477 * The lower 16 bits is the time in 0.5us ticks that the
478 * transfers took to complete.
481 len = sc->tx_boundary;
/* read test: data2 multiplier 0x10000 selects DMA-read */
483 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
484 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
485 cmd.data2 = len * 0x10000;
486 status = mxge_send_cmd(sc, test_type, &cmd);
491 sc->read_dma = ((cmd.data0>>16) * len * 2) /
492 (cmd.data0 & 0xffff);
/* write test: multiplier 0x1 selects DMA-write */
493 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
494 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
495 cmd.data2 = len * 0x1;
496 status = mxge_send_cmd(sc, test_type, &cmd);
501 sc->write_dma = ((cmd.data0>>16) * len * 2) /
502 (cmd.data0 & 0xffff);
/* combined read+write test: multiplier 0x10001 */
504 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506 cmd.data2 = len * 0x10001;
507 status = mxge_send_cmd(sc, test_type, &cmd);
512 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
513 (cmd.data0 & 0xffff);
/* the unaligned-probe variant is expected to fail; stay quiet for it */
516 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
517 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
524 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525 * when the PCI-E Completion packets are aligned on an 8-byte
526 * boundary. Some PCI-E chip sets always align Completion packets; on
527 * the ones that do not, the alignment can be enforced by enabling
528 * ECRC generation (if supported).
530 * When PCI-E Completion packets are not aligned, it is actually more
531 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
533 * If the driver can neither enable ECRC nor verify that it has
534 * already been enabled, then it must use a firmware image which works
535 * around unaligned completion packets (ethp_z8e.dat), and it should
536 * also ensure that it never gives the device a Read-DMA which is
537 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
538 * enabled, then the driver should use the aligned (eth_z8e.dat)
539 * firmware image, and set tx_boundary to 4KB.
543 mxge_firmware_probe(mxge_softc_t *sc)
545 device_t dev = sc->dev;
549 sc->tx_boundary = 4096;
551 * Verify the max read request size was set to 4KB
552 * before trying the test with 4KB.
554 if (pci_find_extcap(dev, PCIY_EXPRESS, ®) == 0) {
555 pectl = pci_read_config(dev, reg + 0x8, 2);
556 if ((pectl & (5 << 12)) != (5 << 12)) {
557 device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
559 sc->tx_boundary = 2048;
564 * load the optimized firmware (which assumes aligned PCIe
565 * completions) in order to see if it works on this host.
567 sc->fw_name = mxge_fw_aligned;
568 status = mxge_load_firmware(sc, 1);
574 * Enable ECRC if possible
576 mxge_enable_nvidia_ecrc(sc);
579 * Run a DMA test which watches for unaligned completions and
580 * aborts on the first one seen.
583 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
585 return 0; /* keep the aligned firmware */
588 device_printf(dev, "DMA test failed: %d\n", status);
589 if (status == ENOSYS)
590 device_printf(dev, "Falling back to ethp! "
591 "Please install up to date fw\n");
596 mxge_select_firmware(mxge_softc_t *sc)
601 if (mxge_force_firmware != 0) {
602 if (mxge_force_firmware == 1)
607 device_printf(sc->dev,
608 "Assuming %s completions (forced)\n",
609 aligned ? "aligned" : "unaligned");
613 /* if the PCIe link width is 4 or less, we can use the aligned
614 firmware and skip any checks */
615 if (sc->link_width != 0 && sc->link_width <= 4) {
616 device_printf(sc->dev,
617 "PCIe x%d Link, expect reduced performance\n",
623 if (0 == mxge_firmware_probe(sc))
628 sc->fw_name = mxge_fw_aligned;
629 sc->tx_boundary = 4096;
631 sc->fw_name = mxge_fw_unaligned;
632 sc->tx_boundary = 2048;
634 return (mxge_load_firmware(sc, 0));
644 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
648 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
649 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
650 be32toh(hdr->mcp_type));
654 /* save firmware version for sysctl */
655 strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
657 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
659 sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
660 &sc->fw_ver_minor, &sc->fw_ver_tiny);
662 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
663 && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
664 device_printf(sc->dev, "Found firmware version %s\n",
666 device_printf(sc->dev, "Driver needs %d.%d\n",
667 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
675 z_alloc(void *nil, u_int items, u_int size)
679 ptr = malloc(items * size, M_TEMP, M_NOWAIT);
684 z_free(void *nil, void *ptr)
/*
 * Fetch the firmware image via firmware(9), inflate it with zlib
 * (the uncompressed size is smuggled in fw->version), validate its
 * header, and PIO-copy it into NIC SRAM in 256-byte chunks.
 * On success *limit is set to the firmware length (assignment dropped
 * from this extraction).
 * NOTE(review): this extraction dropped lines (z_stream decl, zalloc/
 * zfree hookup, several error checks and the final return); do not
 * edit without the original file.
 */
691 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
694 char *inflate_buffer;
695 const struct firmware *fw;
696 const mcp_gen_header_t *hdr;
703 fw = firmware_get(sc->fw_name);
705 device_printf(sc->dev, "Could not find firmware image %s\n",
712 /* setup zlib and decompress f/w */
713 bzero(&zs, sizeof (zs));
716 status = inflateInit(&zs);
717 if (status != Z_OK) {
722 /* the uncompressed size is stored as the firmware version,
723 which would otherwise go unused */
724 fw_len = (size_t) fw->version;
725 inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
726 if (inflate_buffer == NULL)
728 zs.avail_in = fw->datasize;
729 zs.next_in = __DECONST(char *, fw->data);
730 zs.avail_out = fw_len;
731 zs.next_out = inflate_buffer;
732 status = inflate(&zs, Z_FINISH);
733 if (status != Z_STREAM_END) {
734 device_printf(sc->dev, "zlib %d\n", status);
736 goto abort_with_buffer;
/* locate and sanity-check the embedded MCP header */
740 hdr_offset = htobe32(*(const uint32_t *)
741 (inflate_buffer + MCP_HEADER_PTR_OFFSET));
742 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
743 device_printf(sc->dev, "Bad firmware file");
745 goto abort_with_buffer;
747 hdr = (const void*)(inflate_buffer + hdr_offset);
749 status = mxge_validate_firmware(sc, hdr);
751 goto abort_with_buffer;
753 /* Copy the inflated firmware to NIC SRAM. */
754 for (i = 0; i < fw_len; i += 256) {
755 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
757 min(256U, (unsigned)(fw_len - i)));
/* cleanup path: buffer, then the firmware(9) reference */
766 free(inflate_buffer, M_TEMP);
770 firmware_put(fw, FIRMWARE_UNLOAD);
/*
 * Tell the boot firmware to start/stop periodic dummy RDMAs, a
 * workaround for chipsets that drop PCIe messages.  Builds an
 * 8-byte-aligned command block on the stack, PIO-copies it to the
 * MXGEFW_BOOT_DUMMY_RDMA doorbell, and polls sc->cmd for the 0xffffffff
 * confirmation the firmware writes back.
 * NOTE(review): this extraction dropped lines (buf_bytes declaration,
 * *confirm = 0 / wmb() calls, the DELAY/sync loop body); do not edit
 * without the original file.
 */
775 * Enable or disable periodic RDMAs from the host to make certain
776 * chipsets resend dropped PCIe messages
780 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
783 volatile uint32_t *confirm;
784 volatile char *submit;
785 uint32_t *buf, dma_low, dma_high;
/* align the on-stack command block to 8 bytes */
788 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
790 /* clear confirmation addr */
791 confirm = (volatile uint32_t *)sc->cmd;
795 /* send an rdma command to the PCIe engine, and wait for the
796 response in the confirmation address. The firmware should
797 write a -1 there to indicate it is alive and well
800 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
802 buf[0] = htobe32(dma_high); /* confirm addr MSW */
803 buf[1] = htobe32(dma_low); /* confirm addr LSW */
804 buf[2] = htobe32(0xffffffff); /* confirm data */
805 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
806 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
807 buf[3] = htobe32(dma_high); /* dummy addr MSW */
808 buf[4] = htobe32(dma_low); /* dummy addr LSW */
809 buf[5] = htobe32(enable); /* enable? */
812 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
814 mxge_pio_copy(submit, buf, 64);
/* poll (bounded) for the firmware's -1 confirmation */
819 while (*confirm != 0xffffffff && i < 20) {
823 if (*confirm != 0xffffffff) {
824 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
825 (enable ? "enable" : "disable"), confirm,
832 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
835 char buf_bytes[sizeof(*buf) + 8];
836 volatile mcp_cmd_response_t *response = sc->cmd;
837 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
838 uint32_t dma_low, dma_high;
839 int err, sleep_total = 0;
841 /* ensure buf is aligned to 8 bytes */
842 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
844 buf->data0 = htobe32(data->data0);
845 buf->data1 = htobe32(data->data1);
846 buf->data2 = htobe32(data->data2);
847 buf->cmd = htobe32(cmd);
848 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
849 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
851 buf->response_addr.low = htobe32(dma_low);
852 buf->response_addr.high = htobe32(dma_high);
853 mtx_lock(&sc->cmd_mtx);
854 response->result = 0xffffffff;
856 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
858 /* wait up to 20ms */
860 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
861 bus_dmamap_sync(sc->cmd_dma.dmat,
862 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
864 switch (be32toh(response->result)) {
866 data->data0 = be32toh(response->data);
872 case MXGEFW_CMD_UNKNOWN:
875 case MXGEFW_CMD_ERROR_UNALIGNED:
878 case MXGEFW_CMD_ERROR_BUSY:
882 device_printf(sc->dev,
884 "failed, result = %d\n",
885 cmd, be32toh(response->result));
893 device_printf(sc->dev, "mxge: command %d timed out"
895 cmd, be32toh(response->result));
896 mtx_unlock(&sc->cmd_mtx);
/*
 * Adopt the firmware already running on the NIC: locate its header in
 * SRAM, copy it to host memory, validate it, and flag the known
 * fw 1.4.4–1.4.11 rx-filter bug so the driver keeps ALLMULTI set.
 * NOTE(review): this extraction dropped lines (hdr_offset decl,
 * returns, free(hdr) and the final status return); consult the
 * original before editing.
 */
901 mxge_adopt_running_firmware(mxge_softc_t *sc)
903 struct mcp_gen_header *hdr;
904 const size_t bytes = sizeof (struct mcp_gen_header);
908 /* find running firmware header */
909 hdr_offset = htobe32(*(volatile uint32_t *)
910 (sc->sram + MCP_HEADER_PTR_OFFSET));
912 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
913 device_printf(sc->dev,
914 "Running firmware has bad header offset (%d)\n",
919 /* copy header of running firmware from SRAM to host memory to
920 * validate firmware */
921 hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
923 device_printf(sc->dev, "could not malloc firmware hdr\n");
926 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
927 rman_get_bushandle(sc->mem_res),
928 hdr_offset, (char *)hdr, bytes);
929 status = mxge_validate_firmware(sc, hdr);
933 * check to see if adopted firmware has bug where adopting
934 * it will cause broadcasts to be filtered unless the NIC
935 * is kept in ALLMULTI mode
937 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
938 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
939 sc->adopted_rx_filter_bug = 1;
940 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
941 "working around rx filter bug\n",
942 sc->fw_ver_major, sc->fw_ver_minor,
/*
 * Load (or, if `adopt` and loading fails, adopt) firmware onto the
 * NIC: decompress + copy via mxge_load_firmware_helper(), then hand
 * off to the bootstrap MCP through the MXGEFW_BOOT_HANDOFF doorbell
 * and poll sc->cmd for the 0xffffffff confirmation.
 * NOTE(review): this extraction dropped lines (buf_bytes/status/i
 * declarations, several if/else frames, DELAY loop body, returns);
 * consult the original before editing.
 */
951 mxge_load_firmware(mxge_softc_t *sc, int adopt)
953 volatile uint32_t *confirm;
954 volatile char *submit;
956 uint32_t *buf, size, dma_low, dma_high;
/* align the on-stack handoff block to 8 bytes */
959 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
961 size = sc->sram_size;
962 status = mxge_load_firmware_helper(sc, &size);
966 /* Try to use the currently running firmware, if
968 status = mxge_adopt_running_firmware(sc);
970 device_printf(sc->dev,
971 "failed to adopt running firmware\n");
974 device_printf(sc->dev,
975 "Successfully adopted running firmware\n");
976 if (sc->tx_boundary == 4096) {
977 device_printf(sc->dev,
978 "Using firmware currently running on NIC"
980 device_printf(sc->dev,
981 "performance consider loading optimized "
/* adopted firmware: fall back to unaligned settings */
984 sc->fw_name = mxge_fw_unaligned;
985 sc->tx_boundary = 2048;
988 /* clear confirmation addr */
989 confirm = (volatile uint32_t *)sc->cmd;
992 /* send a reload command to the bootstrap MCP, and wait for the
993 response in the confirmation address. The firmware should
994 write a -1 there to indicate it is alive and well
997 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
998 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1000 buf[0] = htobe32(dma_high); /* confirm addr MSW */
1001 buf[1] = htobe32(dma_low); /* confirm addr LSW */
1002 buf[2] = htobe32(0xffffffff); /* confirm data */
1004 /* FIX: All newest firmware should un-protect the bottom of
1005 the sram before handoff. However, the very first interfaces
1006 do not. Therefore the handoff copy must skip the first 8 bytes
1008 /* where the code starts*/
1009 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1010 buf[4] = htobe32(size - 8); /* length of code */
1011 buf[5] = htobe32(8); /* where to copy to */
1012 buf[6] = htobe32(0); /* where to jump to */
1014 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1015 mxge_pio_copy(submit, buf, 64);
/* poll (bounded) for the bootstrap MCP's -1 confirmation */
1020 while (*confirm != 0xffffffff && i < 20) {
1023 bus_dmamap_sync(sc->cmd_dma.dmat,
1024 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1026 if (*confirm != 0xffffffff) {
1027 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1036 mxge_update_mac_address(mxge_softc_t *sc)
1039 uint8_t *addr = sc->mac_addr;
1043 cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1044 | (addr[2] << 8) | addr[3]);
1046 cmd.data1 = ((addr[4] << 8) | (addr[5]));
1048 status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1053 mxge_change_pause(mxge_softc_t *sc, int pause)
1059 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1062 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1066 device_printf(sc->dev, "Failed to set flow control mode\n");
1074 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1079 if (mxge_always_promisc)
1083 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1086 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1090 device_printf(sc->dev, "Failed to set promisc mode\n");
/*
 * Push the interface's multicast filter list to the firmware:
 * temporarily enable ALLMULTI, flush the firmware's filters, walk
 * ifp->if_multiaddrs joining each AF_LINK address, then re-enable
 * filtering.  Stays in ALLMULTI when IFF_ALLMULTI is set or when
 * working around the adopted-firmware rx filter bug.
 * NOTE(review): this extraction dropped lines (cmd decl, returns,
 * `continue`, the bcopy destinations into cmd.data0/1); consult the
 * original before editing.
 */
1095 mxge_set_multicast_list(mxge_softc_t *sc)
1098 struct ifmultiaddr *ifma;
1099 struct ifnet *ifp = sc->ifp;
1102 /* This firmware is known to not support multicast */
1103 if (!sc->fw_multicast_support)
1106 /* Disable multicast filtering while we play with the lists*/
1107 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1109 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1110 " error status: %d\n", err);
/* adopted fw 1.4.4–1.4.11 filters broadcasts unless ALLMULTI stays on */
1114 if (sc->adopted_rx_filter_bug)
1117 if (ifp->if_flags & IFF_ALLMULTI)
1118 /* request to disable multicast filtering, so quit here */
1121 /* Flush all the filters */
1123 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1125 device_printf(sc->dev,
1126 "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1127 ", error status: %d\n", err);
1131 /* Walk the multicast list, and add each address */
1133 if_maddr_rlock(ifp);
1134 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1135 if (ifma->ifma_addr->sa_family != AF_LINK)
/* split the 6-byte lladdr across cmd.data0 (4B) and cmd.data1 (2B) */
1137 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1139 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1141 cmd.data0 = htonl(cmd.data0);
1142 cmd.data1 = htonl(cmd.data1);
1143 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1145 device_printf(sc->dev, "Failed "
1146 "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1148 /* abort, leaving multicast filtering off */
1149 if_maddr_runlock(ifp);
1153 if_maddr_runlock(ifp);
1154 /* Enable multicast filtering */
1155 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1157 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1158 ", error status: %d\n", err);
1163 mxge_max_mtu(mxge_softc_t *sc)
1168 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1169 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1171 /* try to set nbufs to see if it we can
1172 use virtually contiguous jumbos */
1174 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1177 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1179 /* otherwise, we're limited to MJUMPAGESIZE */
1180 return MJUMPAGESIZE - MXGEFW_PAD;
/*
 * Reset the NIC and re-establish firmware state: issue
 * MXGEFW_CMD_RESET, restart dummy RDMAs, size the interrupt queue,
 * (re)configure RSS slices, optionally re-exchange interrupt DMA
 * addresses, fetch the coalescing/irq-ack/deassert SRAM offsets, run
 * the DMA benchmark, zero per-slice shared counters, and re-apply MAC
 * address, promisc, pause, and multicast settings.
 * NOTE(review): this extraction dropped many lines (cmd/status/slice
 * declarations, returns, #endif, several loop/if frames); consult the
 * original before editing.
 */
1184 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1186 struct mxge_slice_state *ss;
1187 mxge_rx_done_t *rx_done;
1188 volatile uint32_t *irq_claim;
1192 /* try to send a reset command to the card to see if it
1194 memset(&cmd, 0, sizeof (cmd));
1195 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1197 device_printf(sc->dev, "failed reset\n");
/* restart the dummy-RDMA PCIe keepalive after reset */
1201 mxge_dummy_rdma(sc, 1);
1204 /* set the intrq size */
1205 cmd.data0 = sc->rx_ring_size;
1206 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1209 * Even though we already know how many slices are supported
1210 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1211 * has magic side effects, and must be called after a reset.
1212 * It must be called prior to calling any RSS related cmds,
1213 * including assigning an interrupt queue for anything but
1214 * slice 0. It must also be called *after*
1215 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1216 * the firmware to compute offsets.
1219 if (sc->num_slices > 1) {
1220 /* ask the maximum number of slices it supports */
1221 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1224 device_printf(sc->dev,
1225 "failed to get number of slices\n");
1229 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1230 * to setting up the interrupt queue DMA
1232 cmd.data0 = sc->num_slices;
1233 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1234 #ifdef IFNET_BUF_RING
1235 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1237 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1240 device_printf(sc->dev,
1241 "failed to set number of slices\n");
1247 if (interrupts_setup) {
1248 /* Now exchange information about interrupts */
1249 for (slice = 0; slice < sc->num_slices; slice++) {
1250 rx_done = &sc->ss[slice].rx_done;
1251 memset(rx_done->entry, 0, sc->rx_ring_size);
1252 cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1253 cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1255 status |= mxge_send_cmd(sc,
1256 MXGEFW_CMD_SET_INTRQ_DMA,
/* fetch SRAM offsets for coalescing delay, irq ack, irq deassert */
1261 status |= mxge_send_cmd(sc,
1262 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1265 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1267 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1268 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1271 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1273 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1275 device_printf(sc->dev, "failed set interrupt parameters\n");
1280 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1283 /* run a DMA benchmark */
1284 (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1286 for (slice = 0; slice < sc->num_slices; slice++) {
1287 ss = &sc->ss[slice];
/* two uint32 irq-claim slots per slice in SRAM */
1289 ss->irq_claim = irq_claim + (2 * slice);
1290 /* reset mcp/driver shared state back to 0 */
1291 ss->rx_done.idx = 0;
1292 ss->rx_done.cnt = 0;
1295 ss->tx.pkt_done = 0;
1296 ss->tx.queue_active = 0;
1297 ss->tx.activate = 0;
1298 ss->tx.deactivate = 0;
1303 ss->rx_small.cnt = 0;
1304 ss->lro_bad_csum = 0;
1306 ss->lro_flushed = 0;
1307 if (ss->fw_stats != NULL) {
1308 bzero(ss->fw_stats, sizeof *ss->fw_stats);
1311 sc->rdma_tags_available = 15;
/* re-apply host-side settings lost by the reset */
1312 status = mxge_update_mac_address(sc);
1313 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1314 mxge_change_pause(sc, sc->pause);
1315 mxge_set_multicast_list(sc);
1320 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1323 unsigned int intr_coal_delay;
1327 intr_coal_delay = sc->intr_coal_delay;
1328 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1332 if (intr_coal_delay == sc->intr_coal_delay)
1335 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1338 mtx_lock(&sc->driver_mtx);
1339 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1340 sc->intr_coal_delay = intr_coal_delay;
1342 mtx_unlock(&sc->driver_mtx);
1347 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1350 unsigned int enabled;
1354 enabled = sc->pause;
1355 err = sysctl_handle_int(oidp, &enabled, arg2, req);
1359 if (enabled == sc->pause)
1362 mtx_lock(&sc->driver_mtx);
1363 err = mxge_change_pause(sc, enabled);
1364 mtx_unlock(&sc->driver_mtx);
1369 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1376 ifp->if_capenable &= ~IFCAP_LRO;
1378 ifp->if_capenable |= IFCAP_LRO;
1379 sc->lro_cnt = lro_cnt;
1380 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1382 err = mxge_open(sc);
1388 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1391 unsigned int lro_cnt;
1395 lro_cnt = sc->lro_cnt;
1396 err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1400 if (lro_cnt == sc->lro_cnt)
1406 mtx_lock(&sc->driver_mtx);
1407 err = mxge_change_lro_locked(sc, lro_cnt);
1408 mtx_unlock(&sc->driver_mtx);
1413 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1419 arg2 = be32toh(*(int *)arg1);
1421 err = sysctl_handle_int(oidp, arg1, arg2, req);
1427 mxge_rem_sysctls(mxge_softc_t *sc)
1429 struct mxge_slice_state *ss;
1432 if (sc->slice_sysctl_tree == NULL)
1435 for (slice = 0; slice < sc->num_slices; slice++) {
1436 ss = &sc->ss[slice];
1437 if (ss == NULL || ss->sysctl_tree == NULL)
1439 sysctl_ctx_free(&ss->sysctl_ctx);
1440 ss->sysctl_tree = NULL;
1442 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1443 sc->slice_sysctl_tree = NULL;
/*
 * Register the driver's sysctl nodes: static device information
 * (firmware version, serial number, DMA benchmark results), runtime
 * tunables (interrupt coalescing, flow control, LRO), the firmware's
 * big-endian statistics counters (exported via mxge_handle_be32), and
 * one sub-tree of debug counters per slice under "slice.N".
 *
 * Fix: the "flow_control_enabled" sysctl's help string was a copy/paste
 * of the interrupt-coalescing description; it now describes the knob it
 * actually controls.
 */
1447 mxge_add_sysctls(mxge_softc_t *sc)
1449 struct sysctl_ctx_list *ctx;
1450 struct sysctl_oid_list *children;
1452 struct mxge_slice_state *ss;
1456 ctx = device_get_sysctl_ctx(sc->dev);
1457 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
/* firmware stats live in slice 0's DMA-shared block */
1458 fw = sc->ss[0].fw_stats;
1460 /* random information */
1461 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1463 CTLFLAG_RD, &sc->fw_version,
1464 0, "firmware version");
1465 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1467 CTLFLAG_RD, &sc->serial_number_string,
1468 0, "serial number");
1469 SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471 CTLFLAG_RD, &sc->product_code_string,
1473 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1475 CTLFLAG_RD, &sc->link_width,
1477 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1479 CTLFLAG_RD, &sc->tx_boundary,
1481 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483 CTLFLAG_RD, &sc->wc,
1484 0, "write combining PIO?");
1485 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 CTLFLAG_RD, &sc->read_dma,
1488 0, "DMA Read speed in MB/s");
1489 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 CTLFLAG_RD, &sc->write_dma,
1492 0, "DMA Write speed in MB/s");
1493 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1494 "read_write_dma_MBs",
1495 CTLFLAG_RD, &sc->read_write_dma,
1496 0, "DMA concurrent Read/Write speed in MB/s");
1497 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 CTLFLAG_RD, &sc->watchdog_resets,
1500 0, "Number of times NIC was reset");
1503 /* performance related tunables */
1504 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1506 CTLTYPE_INT|CTLFLAG_RW, sc,
1507 0, mxge_change_intr_coal,
1508 "I", "interrupt coalescing delay in usecs");
1510 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1511 "flow_control_enabled",
1512 CTLTYPE_INT|CTLFLAG_RW, sc,
1513 0, mxge_change_flow_control,
1514 "I", "enable/disable link-level flow control");
1516 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1518 CTLFLAG_RW, &mxge_deassert_wait,
1519 0, "Wait for IRQ line to go low in ihandler");
1521 /* stats block from firmware is in network byte order.
1523 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1525 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1526 0, mxge_handle_be32,
1528 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1529 "rdma_tags_available",
1530 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1531 0, mxge_handle_be32,
1532 "I", "rdma_tags_available");
1533 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1534 "dropped_bad_crc32",
1535 CTLTYPE_INT|CTLFLAG_RD,
1536 &fw->dropped_bad_crc32,
1537 0, mxge_handle_be32,
1538 "I", "dropped_bad_crc32");
1539 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1541 CTLTYPE_INT|CTLFLAG_RD,
1542 &fw->dropped_bad_phy,
1543 0, mxge_handle_be32,
1544 "I", "dropped_bad_phy");
1545 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1546 "dropped_link_error_or_filtered",
1547 CTLTYPE_INT|CTLFLAG_RD,
1548 &fw->dropped_link_error_or_filtered,
1549 0, mxge_handle_be32,
1550 "I", "dropped_link_error_or_filtered");
1551 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1552 "dropped_link_overflow",
1553 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1554 0, mxge_handle_be32,
1555 "I", "dropped_link_overflow");
1556 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 "dropped_multicast_filtered",
1558 CTLTYPE_INT|CTLFLAG_RD,
1559 &fw->dropped_multicast_filtered,
1560 0, mxge_handle_be32,
1561 "I", "dropped_multicast_filtered");
1562 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 "dropped_no_big_buffer",
1564 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1565 0, mxge_handle_be32,
1566 "I", "dropped_no_big_buffer");
1567 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1568 "dropped_no_small_buffer",
1569 CTLTYPE_INT|CTLFLAG_RD,
1570 &fw->dropped_no_small_buffer,
1571 0, mxge_handle_be32,
1572 "I", "dropped_no_small_buffer");
1573 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1575 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1576 0, mxge_handle_be32,
1577 "I", "dropped_overrun");
1578 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1580 CTLTYPE_INT|CTLFLAG_RD,
1582 0, mxge_handle_be32,
1583 "I", "dropped_pause");
1584 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1587 0, mxge_handle_be32,
1588 "I", "dropped_runt");
1590 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1591 "dropped_unicast_filtered",
1592 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1593 0, mxge_handle_be32,
1594 "I", "dropped_unicast_filtered");
1596 /* verbose printing? */
1597 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1599 CTLFLAG_RW, &mxge_verbose,
1600 0, "verbose printing");
1603 SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1605 CTLTYPE_INT|CTLFLAG_RW, sc,
1607 "I", "number of lro merge queues");
1610 /* add counters exported for debugging from all slices */
1611 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1612 sc->slice_sysctl_tree =
1613 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1614 "slice", CTLFLAG_RD, 0, "");
1616 for (slice = 0; slice < sc->num_slices; slice++) {
1617 ss = &sc->ss[slice];
1618 sysctl_ctx_init(&ss->sysctl_ctx);
1619 ctx = &ss->sysctl_ctx;
1620 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1621 sprintf(slice_num, "%d", slice);
1623 SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1625 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1626 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1628 CTLFLAG_RD, &ss->rx_small.cnt,
1630 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1632 CTLFLAG_RD, &ss->rx_big.cnt,
1634 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1635 "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1636 0, "number of lro merge queues flushed");
1638 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1639 "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1640 0, "number of frames appended to lro merge"
1643 #ifndef IFNET_BUF_RING
1644 /* only transmit from slice 0 for now */
1648 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 CTLFLAG_RD, &ss->tx.req,
1653 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1655 CTLFLAG_RD, &ss->tx.done,
1657 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1659 CTLFLAG_RD, &ss->tx.pkt_done,
1661 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1663 CTLFLAG_RD, &ss->tx.stall,
1665 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1667 CTLFLAG_RD, &ss->tx.wake,
1669 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1671 CTLFLAG_RD, &ss->tx.defrag,
1673 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1675 CTLFLAG_RD, &ss->tx.queue_active,
1676 0, "tx_queue_active");
1677 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 CTLFLAG_RD, &ss->tx.activate,
1681 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 CTLFLAG_RD, &ss->tx.deactivate,
1684 0, "tx_deactivate");
1688 /* copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1689 backwards one at a time and handle ring wraps */
/*
 * Slow path for mxge_submit_req(): the request list wraps the TX ring,
 * so descriptors are copied to the NIC one at a time from the highest
 * index downward; each copy masks the index to handle the wrap.
 */
1692 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1693 mcp_kreq_ether_send_t *src, int cnt)
1695 int idx, starting_slot;
1696 starting_slot = tx->req;
1699 idx = (starting_slot + cnt) & tx->mask;
1700 mxge_pio_copy(&tx->lanai[idx],
1701 &src[cnt], sizeof(*src));
1707 * copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1708 * at most 32 bytes at a time, so as to avoid involving the software
1709 * pio handler in the nic. We re-write the first segment's flags
1710 * to mark them valid only after writing the entire chain
/*
 * Fast path copies descriptor pairs (32 bytes) when the list does not
 * wrap the ring; otherwise defer to mxge_submit_req_backwards().  The
 * first descriptor's flags are saved and only written (as a single
 * 32-bit store) after everything else is on the NIC, so the firmware
 * never sees a partially written chain marked valid.
 */
1714 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1719 volatile uint32_t *dst_ints;
1720 mcp_kreq_ether_send_t *srcp;
1721 volatile mcp_kreq_ether_send_t *dstp, *dst;
1724 idx = tx->req & tx->mask;
1726 last_flags = src->flags;
1729 dst = dstp = &tx->lanai[idx];
/* no wrap: stream the requests in 2-descriptor (32-byte) chunks */
1732 if ((idx + cnt) < tx->mask) {
1733 for (i = 0; i < (cnt - 1); i += 2) {
1734 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1735 wmb(); /* force write every 32 bytes */
1740 /* submit all but the first request, and ensure
1741 that it is submitted below */
1742 mxge_submit_req_backwards(tx, src, cnt);
1746 /* submit the first request */
1747 mxge_pio_copy(dstp, srcp, sizeof(*src));
1748 wmb(); /* barrier before setting valid flag */
1751 /* re-write the last 32-bits with the valid flags */
1752 src->flags = last_flags;
1753 src_ints = (uint32_t *)src;
1755 dst_ints = (volatile uint32_t *)dst;
1757 *dst_ints = *src_ints;
/*
 * Build and submit the send-descriptor chain for a TSO frame.
 * Walks the busdma segment list, slicing each segment at MSS boundaries
 * ("cuts"); cum_len starts negative so the loop can tell header bytes
 * from payload.  rdma_count bookkeeping is explained in the long comment
 * below.  On overflow of tx->max_desc the mapping is unloaded and the
 * frame is dropped with a diagnostic printf.
 */
1765 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1766 int busdma_seg_cnt, int ip_off)
1769 mcp_kreq_ether_send_t *req;
1770 bus_dma_segment_t *seg;
1773 uint32_t low, high_swapped;
1774 int len, seglen, cum_len, cum_len_next;
1775 int next_is_first, chop, cnt, rdma_count, small;
1776 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1777 uint8_t flags, flags_next;
1780 mss = m->m_pkthdr.tso_segsz;
1782 /* negative cum_len signifies to the
1783 * send loop that we are still in the
1784 * header portion of the TSO packet.
1787 /* ensure we have the ethernet, IP and TCP
1788 header together in the first mbuf, copy
1789 it to a scratch buffer if not */
1790 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1791 m_copydata(m, 0, ip_off + sizeof (*ip),
1793 ip = (struct ip *)(ss->scratch + ip_off);
1795 ip = (struct ip *)(mtod(m, char *) + ip_off);
1797 if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1799 m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1800 + sizeof (*tcp), ss->scratch);
1801 ip = (struct ip *)(mtod(m, char *) + ip_off);
1804 tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1805 cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1807 /* TSO implies checksum offload on this hardware */
1808 cksum_offset = ip_off + (ip->ip_hl << 2);
1809 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1812 /* for TSO, pseudo_hdr_offset holds mss.
1813 * The firmware figures out where to put
1814 * the checksum by parsing the header. */
1815 pseudo_hdr_offset = htobe16(mss);
1822 /* "rdma_count" is the number of RDMAs belonging to the
1823 * current packet BEFORE the current send request. For
1824 * non-TSO packets, this is equal to "count".
1825 * For TSO packets, rdma_count needs to be reset
1826 * to 0 after a segment cut.
1828 * The rdma_count field of the send request is
1829 * the number of RDMAs of the packet starting at
1830 * that request. For TSO send requests with one or more cuts
1831 * in the middle, this is the number of RDMAs starting
1832 * after the last cut in the request. All previous
1833 * segments before the last cut implicitly have 1 RDMA.
1835 * Since the number of RDMAs is not known beforehand,
1836 * it must be filled-in retroactively - after each
1837 * segmentation cut or at the end of the entire packet.
1840 while (busdma_seg_cnt) {
1841 /* Break the busdma segment up into pieces*/
1842 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1843 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1847 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1849 cum_len_next = cum_len + seglen;
/* retroactively patch the request that started this RDMA run */
1850 (req-rdma_count)->rdma_count = rdma_count + 1;
1851 if (__predict_true(cum_len >= 0)) {
1853 chop = (cum_len_next > mss);
1854 cum_len_next = cum_len_next % mss;
1855 next_is_first = (cum_len_next == 0);
/* branch-free flag math: chop/next_is_first are 0 or 1 */
1856 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1857 flags_next |= next_is_first *
1859 rdma_count |= -(chop | next_is_first);
1860 rdma_count += chop & !next_is_first;
1861 } else if (cum_len_next >= 0) {
/* header just ended: first payload descriptor of the TSO burst */
1866 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1867 flags_next = MXGEFW_FLAGS_TSO_PLD |
1868 MXGEFW_FLAGS_FIRST |
1869 (small * MXGEFW_FLAGS_SMALL);
1872 req->addr_high = high_swapped;
1873 req->addr_low = htobe32(low);
1874 req->pseudo_hdr_offset = pseudo_hdr_offset;
1876 req->rdma_count = 1;
1877 req->length = htobe16(seglen);
1878 req->cksum_offset = cksum_offset;
1879 req->flags = flags | ((cum_len & 1) *
1880 MXGEFW_FLAGS_ALIGN_ODD);
1883 cum_len = cum_len_next;
1888 if (__predict_false(cksum_offset > seglen))
1889 cksum_offset -= seglen;
1892 if (__predict_false(cnt > tx->max_desc))
/* final retroactive rdma_count fix-up for the last run */
1898 (req-rdma_count)->rdma_count = rdma_count;
1902 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1903 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1905 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1906 mxge_submit_req(tx, tx->req_list, cnt);
1907 #ifdef IFNET_BUF_RING
1908 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1909 /* tell the NIC to start polling this slice */
1911 tx->queue_active = 1;
/* error path: too many descriptors — unmap and drop the frame */
1919 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1923 printf("tx->max_desc exceeded via TSO!\n");
1924 printf("mss = %d, %ld, %d!\n", mss,
1925 (long)seg - (long)tx->seg_list, tx->max_desc);
1932 #endif /* IFCAP_TSO4 */
1934 #ifdef MXGE_NEW_VLAN_API
1936 * We reproduce the software vlan tag insertion from
1937 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1938 * vlan tag insertion. We need to advertise this in order to have the
1939 * vlan interface respect our csum offload flags.
/*
 * Prepend a 4-byte 802.1Q encapsulation to the mbuf and move the
 * Ethernet addresses in front of it, filling in the tag from
 * m_pkthdr.ether_vtag.  Returns NULL (freeing m) on allocation failure.
 */
1941 static struct mbuf *
1942 mxge_vlan_tag_insert(struct mbuf *m)
1944 struct ether_vlan_header *evl;
1946 M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1947 if (__predict_false(m == NULL))
1949 if (m->m_len < sizeof(*evl)) {
1950 m = m_pullup(m, sizeof(*evl));
1951 if (__predict_false(m == NULL))
1955 * Transform the Ethernet header into an Ethernet header
1956 * with 802.1Q encapsulation.
1958 evl = mtod(m, struct ether_vlan_header *);
1959 bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1960 (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1961 evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1962 evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
/* tag is now in the frame itself; clear the out-of-band flag */
1963 m->m_flags &= ~M_VLANTAG;
1966 #endif /* MXGE_NEW_VLAN_API */
/*
 * Map an outgoing mbuf chain for DMA and build its send-descriptor list.
 * Handles software VLAN tag insertion, defragmentation on EFBIG, IPv4
 * checksum offload (CSUM_DELAY_DATA), runt padding to the 60-byte
 * minimum via the zeropad DMA buffer, and hands TSO frames off to
 * mxge_encap_tso().  The built list is pushed to the NIC with
 * mxge_submit_req().
 */
1969 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1972 mcp_kreq_ether_send_t *req;
1973 bus_dma_segment_t *seg;
1978 int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1979 uint16_t pseudo_hdr_offset;
1980 uint8_t flags, cksum_offset;
1987 ip_off = sizeof (struct ether_header);
1988 #ifdef MXGE_NEW_VLAN_API
1989 if (m->m_flags & M_VLANTAG) {
1990 m = mxge_vlan_tag_insert(m);
1991 if (__predict_false(m == NULL))
/* IP header moves 4 bytes deeper once the tag is in-line */
1993 ip_off += ETHER_VLAN_ENCAP_LEN;
1996 /* (try to) map the frame for DMA */
1997 idx = tx->req & tx->mask;
1998 err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1999 m, tx->seg_list, &cnt,
2001 if (__predict_false(err == EFBIG)) {
2002 /* Too many segments in the chain. Try
2004 m_tmp = m_defrag(m, M_NOWAIT);
2005 if (m_tmp == NULL) {
2010 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2012 m, tx->seg_list, &cnt,
2015 if (__predict_false(err != 0)) {
2016 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2017 " packet len = %d\n", err, m->m_pkthdr.len);
2020 bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2021 BUS_DMASYNC_PREWRITE);
2022 tx->info[idx].m = m;
2025 /* TSO is different enough, we handle it in another routine */
2026 if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2027 mxge_encap_tso(ss, m, cnt, ip_off);
2034 pseudo_hdr_offset = 0;
2035 flags = MXGEFW_FLAGS_NO_TSO;
2037 /* checksum offloading? */
2038 if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2039 /* ensure ip header is in first mbuf, copy
2040 it to a scratch buffer if not */
2041 if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2042 m_copydata(m, 0, ip_off + sizeof (*ip),
2044 ip = (struct ip *)(ss->scratch + ip_off);
2046 ip = (struct ip *)(mtod(m, char *) + ip_off);
2048 cksum_offset = ip_off + (ip->ip_hl << 2);
2049 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
2050 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2051 req->cksum_offset = cksum_offset;
2052 flags |= MXGEFW_FLAGS_CKSUM;
2053 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2057 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2058 flags |= MXGEFW_FLAGS_SMALL;
2060 /* convert segments into a request list */
2063 req->flags = MXGEFW_FLAGS_FIRST;
2064 for (i = 0; i < cnt; i++) {
2066 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2068 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2069 req->length = htobe16(seg->ds_len);
2070 req->cksum_offset = cksum_offset;
2071 if (cksum_offset > seg->ds_len)
2072 cksum_offset -= seg->ds_len;
2075 req->pseudo_hdr_offset = pseudo_hdr_offset;
2076 req->pad = 0; /* complete solid 16-byte block */
2077 req->rdma_count = 1;
2078 req->flags |= flags | ((cum_len & 1) * odd_flag);
2079 cum_len += seg->ds_len;
2085 /* pad runts to 60 bytes */
2089 htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2091 htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2092 req->length = htobe16(60 - cum_len);
2093 req->cksum_offset = 0;
2094 req->pseudo_hdr_offset = pseudo_hdr_offset;
2095 req->pad = 0; /* complete solid 16-byte block */
2096 req->rdma_count = 1;
2097 req->flags |= flags | ((cum_len & 1) * odd_flag);
2101 tx->req_list[0].rdma_count = cnt;
2103 /* print what the firmware will see */
2104 for (i = 0; i < cnt; i++) {
2105 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2106 "cso:%d, flags:0x%x, rdma:%d\n",
2107 i, (int)ntohl(tx->req_list[i].addr_high),
2108 (int)ntohl(tx->req_list[i].addr_low),
2109 (int)ntohs(tx->req_list[i].length),
2110 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2111 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2112 tx->req_list[i].rdma_count);
2114 printf("--------------\n");
2116 tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2117 mxge_submit_req(tx, tx->req_list, cnt);
2118 #ifdef IFNET_BUF_RING
2119 if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2120 /* tell the NIC to start polling this slice */
2122 tx->queue_active = 1;
2135 #ifdef IFNET_BUF_RING
/*
 * if_qflush method (multi-queue build): drain and free every mbuf
 * queued on each slice's buf_ring, under the per-ring TX mutex.
 */
2137 mxge_qflush(struct ifnet *ifp)
2139 mxge_softc_t *sc = ifp->if_softc;
2144 for (slice = 0; slice < sc->num_slices; slice++) {
2145 tx = &sc->ss[slice].tx;
2147 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2149 mtx_unlock(&tx->mtx);
/*
 * Multi-queue TX start (buf_ring build); caller holds tx->mtx.
 * Dequeues from the slice's drbr and encapsulates while descriptor
 * space remains; sets the per-slice OACTIVE flag if the ring fills
 * with work still queued.
 */
2155 mxge_start_locked(struct mxge_slice_state *ss)
2166 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2167 m = drbr_dequeue(ifp, tx->br);
2171 /* let BPF see it */
2174 /* give it to the nic */
2177 /* ran out of transmit slots */
2178 if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2179 && (!drbr_empty(ifp, tx->br))) {
2180 ss->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * Transmit one mbuf on a slice; caller holds tx->mtx.
 * If the interface is not up/active the frame is only enqueued.  A frame
 * may bypass the drbr entirely when the ring is empty and descriptor
 * space is available; otherwise it is enqueued and the queue drained
 * via mxge_start_locked().
 */
2186 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2197 if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2199 err = drbr_enqueue(ifp, tx->br, m);
2203 if (drbr_empty(ifp, tx->br) &&
2204 ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2205 /* let BPF see it */
2207 /* give it to the nic */
2209 } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2212 if (!drbr_empty(ifp, tx->br))
2213 mxge_start_locked(ss);
/*
 * if_transmit method: pick a TX slice from the mbuf's flowid (num_slices
 * is a power of 2, so masking suffices) and hand off under the ring
 * mutex.  If the mutex is contended, just enqueue on the slice's drbr
 * instead of blocking.
 */
2218 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2220 mxge_softc_t *sc = ifp->if_softc;
2221 struct mxge_slice_state *ss;
2226 slice = m->m_pkthdr.flowid;
2227 slice &= (sc->num_slices - 1); /* num_slices always power of 2 */
2229 ss = &sc->ss[slice];
2232 if (mtx_trylock(&tx->mtx)) {
2233 err = mxge_transmit_locked(ss, m);
2234 mtx_unlock(&tx->mtx);
2236 err = drbr_enqueue(ifp, tx->br, m);
/*
 * Single-queue TX start (non-buf_ring build); caller holds the TX mutex.
 * Drains the ifnet send queue into the NIC while descriptor space
 * remains, then raises IFF_DRV_OACTIVE when the ring is full.
 */
2245 mxge_start_locked(struct mxge_slice_state *ss)
2255 while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2256 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2260 /* let BPF see it */
2263 /* give it to the nic */
2266 /* ran out of transmit slots */
2267 if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2268 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
/*
 * if_start method: acquire slice 0's TX mutex and drain the send queue.
 * Only the first slice transmits in this (non-buf_ring) configuration.
 */
2274 mxge_start(struct ifnet *ifp)
2276 mxge_softc_t *sc = ifp->if_softc;
2277 struct mxge_slice_state *ss;
2279 /* only use the first slice for now */
2281 mtx_lock(&ss->tx.mtx);
2282 mxge_start_locked(ss);
2283 mtx_unlock(&ss->tx.mtx);
2287 * copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2288 * at most 32 bytes at a time, so as to avoid involving the software
2289 * pio handler in the nic. We re-write the first segment's low
2290 * DMA address to mark it valid only after we write the entire chunk
/*
 * Push 8 receive descriptors to the NIC in two 32-byte PIO copies.
 * The first descriptor's low address is temporarily poisoned to
 * 0xffffffff so the firmware ignores the chunk until the final single
 * store publishes the real value.
 */
2294 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2295 mcp_kreq_ether_recv_t *src)
2299 low = src->addr_low;
2300 src->addr_low = 0xffffffff;
2301 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2303 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2305 src->addr_low = low;
2306 dst->addr_low = low;
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for slot idx of the
 * small RX ring, recording its address in the shadow ring.  Descriptors
 * are batched to the NIC 8 at a time via mxge_submit_8rx() when idx
 * reaches the end of an 8-slot group.
 */
2311 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2313 bus_dma_segment_t seg;
2315 mxge_rx_ring_t *rx = &ss->rx_small;
2318 m = m_gethdr(M_DONTWAIT, MT_DATA);
2325 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2326 &seg, &cnt, BUS_DMA_NOWAIT);
2331 rx->info[idx].m = m;
2332 rx->shadow[idx].addr_low =
2333 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2334 rx->shadow[idx].addr_high =
2335 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
/* flush a full group of 8 descriptors to the NIC */
2339 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
/*
 * Allocate and DMA-map a cluster mbuf for the big RX ring.  Uses a
 * standard cluster when cl_size == MCLBYTES, otherwise a jumbo cluster.
 * With MXGE_VIRT_JUMBOS a jumbo buffer may map to several segments, each
 * getting its own shadow descriptor; rx->nbufs descriptors are consumed
 * per frame and flushed to the NIC in groups of 8.
 */
2344 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2346 bus_dma_segment_t seg[3];
2348 mxge_rx_ring_t *rx = &ss->rx_big;
2351 if (rx->cl_size == MCLBYTES)
2352 m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2354 m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2360 m->m_len = rx->mlen;
2361 err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2362 seg, &cnt, BUS_DMA_NOWAIT);
2367 rx->info[idx].m = m;
2368 rx->shadow[idx].addr_low =
2369 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2370 rx->shadow[idx].addr_high =
2371 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2373 #if MXGE_VIRT_JUMBOS
/* additional segments of a virtually-contiguous jumbo buffer */
2374 for (i = 1; i < cnt; i++) {
2375 rx->shadow[idx + i].addr_low =
2376 htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2377 rx->shadow[idx + i].addr_high =
2378 htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2383 for (i = 0; i < rx->nbufs; i++) {
2384 if ((idx & 7) == 7) {
2385 mxge_submit_8rx(&rx->lanai[idx - 7],
2386 &rx->shadow[idx - 7]);
2394 * Myri10GE hardware checksums are not valid if the sender
2395 * padded the frame with non-zero padding. This is because
2396 * the firmware just does a simple 16-bit 1s complement
2397 * checksum across the entire frame, excluding the first 14
2398 * bytes. It is best to simply to check the checksum and
2399 * tell the stack about it only if the checksum is good
/*
 * Validate the firmware's partial checksum for an IPv4 TCP/UDP frame.
 * Folds in the pseudo-header so a correct packet yields 0; non-IPv4 or
 * non-TCP/UDP frames are not handled here (elided paths presumably
 * return a nonzero "unverified" result — TODO confirm against full
 * source).
 */
2402 static inline uint16_t
2403 mxge_rx_csum(struct mbuf *m, int csum)
2405 struct ether_header *eh;
2409 eh = mtod(m, struct ether_header *);
2411 /* only deal with IPv4 TCP & UDP for now */
2412 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2414 ip = (struct ip *)(eh + 1);
2415 if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2416 ip->ip_p != IPPROTO_UDP))
2419 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2420 htonl(ntohs(csum) + ntohs(ip->ip_len) +
2421 - (ip->ip_hl << 2) + ip->ip_p));
/*
 * Strip an in-line 802.1Q header from a received frame: record the tag
 * in the mbuf (ether_vtag/M_VLANTAG on the new API, an m_tag on the old
 * one), adjust the firmware's partial checksum to exclude the 4
 * encapsulation bytes, and slide the Ethernet addresses over the
 * removed header.
 */
2430 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2432 struct ether_vlan_header *evl;
2433 struct ether_header *eh;
2436 evl = mtod(m, struct ether_vlan_header *);
2437 eh = mtod(m, struct ether_header *);
2440 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2441 * after what the firmware thought was the end of the ethernet
2445 /* put checksum into host byte order */
2446 *csum = ntohs(*csum);
2447 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
/* one's-complement subtraction of the encap bytes, with carry folds */
2448 (*csum) += ~partial;
2449 (*csum) += ((*csum) < ~partial);
2450 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2451 (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2453 /* restore checksum to network byte order;
2454 later consumers expect this */
2455 *csum = htons(*csum);
2458 #ifdef MXGE_NEW_VLAN_API
2459 m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2463 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2467 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2468 m_tag_prepend(m, mtag);
2472 m->m_flags |= M_VLANTAG;
2475 * Remove the 802.1q header by copying the Ethernet
2476 * addresses over it and adjusting the beginning of
2477 * the data in the mbuf. The encapsulated Ethernet
2478 * type field is already in place.
2480 bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2481 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2482 m_adj(m, ETHER_VLAN_ENCAP_LEN);
/*
 * Deliver a received frame that landed in a big-ring cluster buffer.
 * Replaces the buffer first (dropping the frame if allocation fails so
 * the ring never loses a slot), swaps DMA maps, strips any VLAN tag,
 * verifies the checksum (optionally feeding the frame to LRO), then
 * passes it to the stack via if_input.
 */
2487 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2492 struct ether_header *eh;
2494 bus_dmamap_t old_map;
2496 uint16_t tcpudp_csum;
2501 idx = rx->cnt & rx->mask;
2502 rx->cnt += rx->nbufs;
2503 /* save a pointer to the received mbuf */
2504 m = rx->info[idx].m;
2505 /* try to replace the received mbuf */
2506 if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2507 /* drop the frame -- the old mbuf is re-cycled */
2512 /* unmap the received buffer */
2513 old_map = rx->info[idx].map;
2514 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2515 bus_dmamap_unload(rx->dmat, old_map);
2517 /* swap the bus_dmamap_t's */
2518 rx->info[idx].map = rx->extra_map;
2519 rx->extra_map = old_map;
2521 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2523 m->m_data += MXGEFW_PAD;
2525 m->m_pkthdr.rcvif = ifp;
2526 m->m_len = m->m_pkthdr.len = len;
2528 eh = mtod(m, struct ether_header *);
2529 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2530 mxge_vlan_tag_remove(m, &csum);
2532 /* if the checksum is valid, mark it in the mbuf header */
2533 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2534 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2536 /* otherwise, it was a UDP frame, or a TCP frame which
2537 we could not do LRO on. Tell the stack that the
2539 m->m_pkthdr.csum_data = 0xffff;
2540 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2542 /* flowid only valid if RSS hashing is enabled */
2543 if (sc->num_slices > 1) {
2544 m->m_pkthdr.flowid = (ss - sc->ss);
2545 m->m_flags |= M_FLOWID;
2547 /* pass the frame up the stack */
2548 (*ifp->if_input)(ifp, m);
/*
 * Deliver a received frame from the small (MHLEN) RX ring.  Mirrors
 * mxge_rx_done_big(): replace-then-deliver, map swap, VLAN strip,
 * checksum/LRO handling, flowid stamping, if_input.
 */
2552 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2556 struct ether_header *eh;
2559 bus_dmamap_t old_map;
2561 uint16_t tcpudp_csum;
2566 idx = rx->cnt & rx->mask;
2568 /* save a pointer to the received mbuf */
2569 m = rx->info[idx].m;
2570 /* try to replace the received mbuf */
2571 if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2572 /* drop the frame -- the old mbuf is re-cycled */
2577 /* unmap the received buffer */
2578 old_map = rx->info[idx].map;
2579 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2580 bus_dmamap_unload(rx->dmat, old_map);
2582 /* swap the bus_dmamap_t's */
2583 rx->info[idx].map = rx->extra_map;
2584 rx->extra_map = old_map;
2586 /* mcp implicitly skips 1st 2 bytes so that packet is properly
2588 m->m_data += MXGEFW_PAD;
2590 m->m_pkthdr.rcvif = ifp;
2591 m->m_len = m->m_pkthdr.len = len;
2593 eh = mtod(m, struct ether_header *);
2594 if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2595 mxge_vlan_tag_remove(m, &csum);
2597 /* if the checksum is valid, mark it in the mbuf header */
2598 if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2599 if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2601 /* otherwise, it was a UDP frame, or a TCP frame which
2602 we could not do LRO on. Tell the stack that the
2604 m->m_pkthdr.csum_data = 0xffff;
2605 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2607 /* flowid only valid if RSS hashing is enabled */
2608 if (sc->num_slices > 1) {
2609 m->m_pkthdr.flowid = (ss - sc->ss);
2610 m->m_flags |= M_FLOWID;
2612 /* pass the frame up the stack */
2613 (*ifp->if_input)(ifp, m);
/*
 * Drain the slice's RX completion ring: dispatch each entry to the
 * small- or big-buffer handler based on its length, bounded by half the
 * ring to avoid livelock, then flush any active LRO sessions.
 */
2617 mxge_clean_rx_done(struct mxge_slice_state *ss)
2619 mxge_rx_done_t *rx_done = &ss->rx_done;
/* a zero length marks an entry the firmware has not filled yet */
2625 while (rx_done->entry[rx_done->idx].length != 0) {
2626 length = ntohs(rx_done->entry[rx_done->idx].length);
2627 rx_done->entry[rx_done->idx].length = 0;
2628 checksum = rx_done->entry[rx_done->idx].checksum;
2629 if (length <= (MHLEN - MXGEFW_PAD))
2630 mxge_rx_done_small(ss, length, checksum);
2632 mxge_rx_done_big(ss, length, checksum);
2634 rx_done->idx = rx_done->cnt & rx_done->mask;
2636 /* limit potential for livelock */
2637 if (__predict_false(++limit > rx_done->mask / 2))
2641 while (!SLIST_EMPTY(&ss->lro_active)) {
2642 struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2643 SLIST_REMOVE_HEAD(&ss->lro_active, next);
2644 mxge_lro_flush(ss, lro);
/*
 * Reap completed TX descriptors up to the firmware's reported packet
 * index (mcp_idx): account bytes/mcast, unload DMA maps and free mbufs
 * attached to first descriptors, then clear OACTIVE when at least a
 * quarter of the ring is free.  In the multi-queue build, also tell the
 * NIC to stop polling a ring that has fully drained.
 */
2651 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2662 while (tx->pkt_done != mcp_idx) {
2663 idx = tx->done & tx->mask;
2665 m = tx->info[idx].m;
2666 /* mbuf and DMA map only attached to the first
2669 ss->obytes += m->m_pkthdr.len;
2670 if (m->m_flags & M_MCAST)
2673 tx->info[idx].m = NULL;
2674 map = tx->info[idx].map;
2675 bus_dmamap_unload(tx->dmat, map);
/* flag marks the last descriptor of a packet */
2678 if (tx->info[idx].flag) {
2679 tx->info[idx].flag = 0;
2684 /* If we have space, clear IFF_OACTIVE to tell the stack that
2685 its OK to send packets */
2686 #ifdef IFNET_BUF_RING
2687 flags = &ss->if_drv_flags;
2689 flags = &ifp->if_drv_flags;
2691 mtx_lock(&ss->tx.mtx);
2692 if ((*flags) & IFF_DRV_OACTIVE &&
2693 tx->req - tx->done < (tx->mask + 1)/4) {
2694 *(flags) &= ~IFF_DRV_OACTIVE;
2696 mxge_start_locked(ss);
2698 #ifdef IFNET_BUF_RING
2699 if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2700 /* let the NIC stop polling this queue, since there
2701 * are no more transmits pending */
2702 if (tx->req == tx->done) {
2704 tx->queue_active = 0;
2710 mtx_unlock(&ss->tx.mtx);
/*
 * XFP module compliance-code table: maps bits of the XFP compliance
 * byte to ifmedia types.  First entry (mask 0x7f) is the CX4 catch-all;
 * flag 0 marks codes FreeBSD has no media type for.
 */
2714 static struct mxge_media_type mxge_xfp_media_types[] =
2716 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2717 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2718 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2719 {0, (1 << 5), "10GBASE-ER"},
2720 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2721 {0, (1 << 3), "10GBASE-SW"},
2722 {0, (1 << 2), "10GBASE-LW"},
2723 {0, (1 << 1), "10GBASE-EW"},
2724 {0, (1 << 0), "Reserved"}
/* SFP+ module compliance-code table (same layout as the XFP table). */
2726 static struct mxge_media_type mxge_sfp_media_types[] =
2728 {0, (1 << 7), "Reserved"},
2729 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2730 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2731 {IFM_10G_SR, (1 << 4), "10GBASE-SR"}
/*
 * Record a detected media type: fold it into the cached media flags and
 * register/select it with the ifmedia layer.
 */
2735 mxge_set_media(mxge_softc_t *sc, int type)
2737 sc->media_flags |= type;
2738 ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2739 ifmedia_set(&sc->media, sc->media_flags);
2744 * Determine the media type for a NIC. Some XFPs will identify
2745 * themselves only when their link is up, so this is initiated via a
2746 * link up interrupt. However, this can potentially take up to
2747 * several milliseconds, so it is run via the watchdog routine, rather
2748 * than in the interrupt handler itself. This need only be done
2749 * once, not each time the link is up.
2752 mxge_media_probe(mxge_softc_t *sc)
2757 struct mxge_media_type *mxge_media_types = NULL;
2758 int i, err, ms, mxge_media_type_entries;
2761 sc->need_media_probe = 0;
2763 /* if we've already set a media type, we're done */
2764 if (sc->media_flags != (IFM_ETHER | IFM_AUTO))
2768 * parse the product code to determine the interface type
2769 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2770 * after the 3rd dash in the driver's cached copy of the
2771 * EEPROM's product code string.
2773 ptr = sc->product_code_string;
2775 device_printf(sc->dev, "Missing product code\n");
2778 for (i = 0; i < 3; i++, ptr++) {
2779 ptr = index(ptr, '-');
2781 device_printf(sc->dev,
2782 "only %d dashes in PC?!?\n", i);
/* -C is CX4: copper, no module to interrogate */
2788 mxge_set_media(sc, IFM_10G_CX4);
2791 else if (*ptr == 'Q') {
2792 /* -Q is Quad Ribbon Fiber */
2793 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2794 /* FreeBSD has no media type for Quad ribbon fiber */
/* -R: XFP cage — select the XFP compliance table */
2800 mxge_media_types = mxge_xfp_media_types;
2801 mxge_media_type_entries =
2802 sizeof (mxge_xfp_media_types) /
2803 sizeof (mxge_xfp_media_types[0]);
2804 byte = MXGE_XFP_COMPLIANCE_BYTE;
2808 if (*ptr == 'S' || *(ptr +1) == 'S') {
2809 /* -S or -2S is SFP+ */
2810 mxge_media_types = mxge_sfp_media_types;
2811 mxge_media_type_entries =
2812 sizeof (mxge_sfp_media_types) /
2813 sizeof (mxge_sfp_media_types[0]);
2818 if (mxge_media_types == NULL) {
2819 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2824 * At this point we know the NIC has an XFP cage, so now we
2825 * try to determine what is in the cage by using the
2826 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2827 * register. We read just one byte, which may take over
2831 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2833 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2834 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2835 device_printf(sc->dev, "failed to read XFP\n");
2837 if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2838 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2840 if (err != MXGEFW_CMD_OK) {
2844 /* now we wait for the data to be cached */
2846 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
/* poll up to ~50ms for the firmware to cache the I2C byte */
2847 for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2850 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2852 if (err != MXGEFW_CMD_OK) {
2853 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2854 cage_type, err, ms);
/* entry 0 is a catch-all mask, handled separately from the loop */
2858 if (cmd.data0 == mxge_media_types[0].bitmask) {
2860 device_printf(sc->dev, "%s:%s\n", cage_type,
2861 mxge_media_types[0].name);
2862 mxge_set_media(sc, IFM_10G_CX4);
2865 for (i = 1; i < mxge_media_type_entries; i++) {
2866 if (cmd.data0 & mxge_media_types[i].bitmask) {
2868 device_printf(sc->dev, "%s:%s\n",
2870 mxge_media_types[i].name);
2872 mxge_set_media(sc, mxge_media_types[i].flag);
2876 device_printf(sc->dev, "%s media 0x%x unknown\n", cage_type,
/*
 * mxge_intr(): per-slice interrupt handler (legacy INTx, MSI, or MSI-X).
 * Claims completed TX descriptors and received frames from the
 * firmware-updated stats block, then returns the irq "token" to the NIC
 * via irq_claim.  Link-state and firmware error stats are processed only
 * on slice 0 (ss == sc->ss).
 * NOTE(review): this copy of the file is elided (original line numbers
 * embedded, lines missing) -- code left byte-identical.
 */
2883 mxge_intr(void *arg)
2885 struct mxge_slice_state *ss = arg;
2886 mxge_softc_t *sc = ss->sc;
2887 mcp_irq_data_t *stats = ss->fw_stats;
2888 mxge_tx_ring_t *tx = &ss->tx;
2889 mxge_rx_done_t *rx_done = &ss->rx_done;
2890 uint32_t send_done_count;
2894 #ifndef IFNET_BUF_RING
2895 /* an interrupt on a non-zero slice is implicitly valid
2896 since MSI-X irqs are not shared */
2898 mxge_clean_rx_done(ss);
2899 *ss->irq_claim = be32toh(3);
2904 /* make sure the DMA has finished */
2905 if (!stats->valid) {
2908 valid = stats->valid;
2910 if (sc->legacy_irq) {
2911 /* lower legacy IRQ */
2912 *sc->irq_deassert = 0;
2913 if (!mxge_deassert_wait)
2914 /* don't wait for conf. that irq is low */
2920 /* loop while waiting for legacy irq deassertion */
2922 /* check for transmit completes and receives */
2923 send_done_count = be32toh(stats->send_done_count);
2924 while ((send_done_count != tx->pkt_done) ||
2925 (rx_done->entry[rx_done->idx].length != 0)) {
2926 if (send_done_count != tx->pkt_done)
2927 mxge_tx_done(ss, (int)send_done_count);
2928 mxge_clean_rx_done(ss);
2929 send_done_count = be32toh(stats->send_done_count);
2931 if (sc->legacy_irq && mxge_deassert_wait)
/* re-check stats->valid; firmware clears it when the irq is deasserted */
2933 } while (*((volatile uint8_t *) &stats->valid));
2935 /* fw link & error stats meaningful only on the first slice */
2936 if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
2937 if (sc->link_state != stats->link_up) {
2938 sc->link_state = stats->link_up;
2939 if (sc->link_state) {
2940 if_link_state_change(sc->ifp, LINK_STATE_UP);
2942 device_printf(sc->dev, "link up\n");
2944 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2946 device_printf(sc->dev, "link down\n");
/* schedule a media re-probe from the tick callout */
2948 sc->need_media_probe = 1;
2950 if (sc->rdma_tags_available !=
2951 be32toh(stats->rdma_tags_available)) {
2952 sc->rdma_tags_available =
2953 be32toh(stats->rdma_tags_available);
2954 device_printf(sc->dev, "RDMA timed out! %d tags "
2955 "left\n", sc->rdma_tags_available);
2958 if (stats->link_down) {
2959 sc->down_cnt += stats->link_down;
2961 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2965 /* check to see if we have rx token to pass back */
2967 *ss->irq_claim = be32toh(3);
2968 *(ss->irq_claim + 1) = be32toh(3);
/*
 * mxge_init(): ifnet if_init entry point.
 * NOTE(review): body is entirely elided in this copy -- presumably it
 * locks the driver mutex and (re)opens the interface; confirm against
 * the full source.
 */
2972 mxge_init(void *arg)
/*
 * mxge_free_slice_mbufs(): release all per-slice mbuf resources --
 * the LRO free list, every posted big/small receive buffer (unloading
 * its DMA map first), and, on the slice that owns a TX ring, any
 * still-pending transmit mbufs.
 */
2979 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2981 struct lro_entry *lro_entry;
/* drain and free the LRO descriptor free list */
2984 while (!SLIST_EMPTY(&ss->lro_free)) {
2985 lro_entry = SLIST_FIRST(&ss->lro_free);
2986 SLIST_REMOVE_HEAD(&ss->lro_free, next);
2987 free(lro_entry, M_DEVBUF);
2990 for (i = 0; i <= ss->rx_big.mask; i++) {
2991 if (ss->rx_big.info[i].m == NULL)
2993 bus_dmamap_unload(ss->rx_big.dmat,
2994 ss->rx_big.info[i].map);
2995 m_freem(ss->rx_big.info[i].m);
2996 ss->rx_big.info[i].m = NULL;
2999 for (i = 0; i <= ss->rx_small.mask; i++) {
3000 if (ss->rx_small.info[i].m == NULL)
3002 bus_dmamap_unload(ss->rx_small.dmat,
3003 ss->rx_small.info[i].map);
3004 m_freem(ss->rx_small.info[i].m);
3005 ss->rx_small.info[i].m = NULL;
3008 /* transmit ring used only on the first slice */
3009 if (ss->tx.info == NULL)
3012 for (i = 0; i <= ss->tx.mask; i++) {
3013 ss->tx.info[i].flag = 0;
3014 if (ss->tx.info[i].m == NULL)
3016 bus_dmamap_unload(ss->tx.dmat,
3017 ss->tx.info[i].map);
3018 m_freem(ss->tx.info[i].m);
3019 ss->tx.info[i].m = NULL;
/* mxge_free_mbufs(): free mbuf resources on every slice. */
3024 mxge_free_mbufs(mxge_softc_t *sc)
3028 for (slice = 0; slice < sc->num_slices; slice++)
3029 mxge_free_slice_mbufs(&sc->ss[slice]);
/*
 * mxge_free_slice_rings(): tear down per-slice ring bookkeeping --
 * the rx_done DMA area, TX request/segment scratch buffers, the rx
 * shadow rings, and the tx/rx_small/rx_big info arrays together with
 * their busdma maps and tags.  Safe to call on partially-allocated
 * slices (every free is NULL-guarded).
 */
3033 mxge_free_slice_rings(struct mxge_slice_state *ss)
3038 if (ss->rx_done.entry != NULL)
3039 mxge_dma_free(&ss->rx_done.dma);
3040 ss->rx_done.entry = NULL;
3042 if (ss->tx.req_bytes != NULL)
3043 free(ss->tx.req_bytes, M_DEVBUF);
3044 ss->tx.req_bytes = NULL;
3046 if (ss->tx.seg_list != NULL)
3047 free(ss->tx.seg_list, M_DEVBUF);
3048 ss->tx.seg_list = NULL;
3050 if (ss->rx_small.shadow != NULL)
3051 free(ss->rx_small.shadow, M_DEVBUF);
3052 ss->rx_small.shadow = NULL;
3054 if (ss->rx_big.shadow != NULL)
3055 free(ss->rx_big.shadow, M_DEVBUF);
3056 ss->rx_big.shadow = NULL;
/* destroy per-slot dmamaps before their parent tag */
3058 if (ss->tx.info != NULL) {
3059 if (ss->tx.dmat != NULL) {
3060 for (i = 0; i <= ss->tx.mask; i++) {
3061 bus_dmamap_destroy(ss->tx.dmat,
3062 ss->tx.info[i].map);
3064 bus_dma_tag_destroy(ss->tx.dmat);
3066 free(ss->tx.info, M_DEVBUF);
3070 if (ss->rx_small.info != NULL) {
3071 if (ss->rx_small.dmat != NULL) {
3072 for (i = 0; i <= ss->rx_small.mask; i++) {
3073 bus_dmamap_destroy(ss->rx_small.dmat,
3074 ss->rx_small.info[i].map);
3076 bus_dmamap_destroy(ss->rx_small.dmat,
3077 ss->rx_small.extra_map);
3078 bus_dma_tag_destroy(ss->rx_small.dmat);
3080 free(ss->rx_small.info, M_DEVBUF);
3082 ss->rx_small.info = NULL;
3084 if (ss->rx_big.info != NULL) {
3085 if (ss->rx_big.dmat != NULL) {
3086 for (i = 0; i <= ss->rx_big.mask; i++) {
3087 bus_dmamap_destroy(ss->rx_big.dmat,
3088 ss->rx_big.info[i].map);
3090 bus_dmamap_destroy(ss->rx_big.dmat,
3091 ss->rx_big.extra_map);
3092 bus_dma_tag_destroy(ss->rx_big.dmat);
3094 free(ss->rx_big.info, M_DEVBUF);
3096 ss->rx_big.info = NULL;
/* mxge_free_rings(): free ring resources on every slice. */
3100 mxge_free_rings(mxge_softc_t *sc)
3104 for (slice = 0; slice < sc->num_slices; slice++)
3105 mxge_free_slice_rings(&sc->ss[slice]);
/*
 * mxge_alloc_slice_rings(): allocate one slice's host-side ring state:
 * rx shadow/info arrays, small and big receive busdma tags plus one
 * dmamap per ring slot (and one "extra" map each, used for buffer
 * replacement), then -- only on slices that own a TX ring -- the TX
 * request copy block, segment list, info array, tag, and per-slot maps.
 * Returns 0 on success; on error the caller is expected to unwind via
 * mxge_free_slice_rings().
 * NOTE(review): error-path gotos/returns are elided in this copy.
 */
3109 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3110 int tx_ring_entries)
3112 mxge_softc_t *sc = ss->sc;
3118 /* allocate per-slice receive resources */
3120 ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3121 ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3123 /* allocate the rx shadow rings */
3124 bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3125 ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3126 if (ss->rx_small.shadow == NULL)
3129 bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3130 ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3131 if (ss->rx_big.shadow == NULL)
3134 /* allocate the rx host info rings */
3135 bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3136 ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3137 if (ss->rx_small.info == NULL)
3140 bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3141 ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3142 if (ss->rx_big.info == NULL)
3145 /* allocate the rx busdma resources */
3146 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3148 4096, /* boundary */
3149 BUS_SPACE_MAXADDR, /* low */
3150 BUS_SPACE_MAXADDR, /* high */
3151 NULL, NULL, /* filter */
3152 MHLEN, /* maxsize */
3154 MHLEN, /* maxsegsize */
3155 BUS_DMA_ALLOCNOW, /* flags */
3156 NULL, NULL, /* lock */
3157 &ss->rx_small.dmat); /* tag */
3159 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3164 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3166 #if MXGE_VIRT_JUMBOS
3167 4096, /* boundary */
3171 BUS_SPACE_MAXADDR, /* low */
3172 BUS_SPACE_MAXADDR, /* high */
3173 NULL, NULL, /* filter */
3174 3*4096, /* maxsize */
3175 #if MXGE_VIRT_JUMBOS
3177 4096, /* maxsegsize*/
3180 MJUM9BYTES, /* maxsegsize*/
3182 BUS_DMA_ALLOCNOW, /* flags */
3183 NULL, NULL, /* lock */
3184 &ss->rx_big.dmat); /* tag */
3186 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3190 for (i = 0; i <= ss->rx_small.mask; i++) {
3191 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3192 &ss->rx_small.info[i].map);
3194 device_printf(sc->dev, "Err %d rx_small dmamap\n",
3199 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3200 &ss->rx_small.extra_map);
3202 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3207 for (i = 0; i <= ss->rx_big.mask; i++) {
3208 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3209 &ss->rx_big.info[i].map);
3211 device_printf(sc->dev, "Err %d rx_big dmamap\n",
3216 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3217 &ss->rx_big.extra_map);
3219 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3224 /* now allocate TX resouces */
3226 #ifndef IFNET_BUF_RING
3227 /* only use a single TX ring for now */
3228 if (ss != ss->sc->ss)
3232 ss->tx.mask = tx_ring_entries - 1;
3233 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3236 /* allocate the tx request copy block */
3238 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3239 ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3240 if (ss->tx.req_bytes == NULL)
3242 /* ensure req_list entries are aligned to 8 bytes */
3243 ss->tx.req_list = (mcp_kreq_ether_send_t *)
3244 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3246 /* allocate the tx busdma segment list */
3247 bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3248 ss->tx.seg_list = (bus_dma_segment_t *)
3249 malloc(bytes, M_DEVBUF, M_WAITOK);
3250 if (ss->tx.seg_list == NULL)
3253 /* allocate the tx host info ring */
3254 bytes = tx_ring_entries * sizeof (*ss->tx.info);
3255 ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3256 if (ss->tx.info == NULL)
3259 /* allocate the tx busdma resources */
3260 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3262 sc->tx_boundary, /* boundary */
3263 BUS_SPACE_MAXADDR, /* low */
3264 BUS_SPACE_MAXADDR, /* high */
3265 NULL, NULL, /* filter */
3266 65536 + 256, /* maxsize */
3267 ss->tx.max_desc - 2, /* num segs */
3268 sc->tx_boundary, /* maxsegsz */
3269 BUS_DMA_ALLOCNOW, /* flags */
3270 NULL, NULL, /* lock */
3271 &ss->tx.dmat); /* tag */
3274 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3279 /* now use these tags to setup dmamaps for each slot
3281 for (i = 0; i <= ss->tx.mask; i++) {
3282 err = bus_dmamap_create(ss->tx.dmat, 0,
3283 &ss->tx.info[i].map);
3285 device_printf(sc->dev, "Err %d tx dmamap\n",
/*
 * mxge_alloc_rings(): query the firmware for TX/RX ring sizes, size the
 * ifnet send queue accordingly, then allocate per-slice ring resources;
 * on any slice failure, free everything already allocated.
 */
3295 mxge_alloc_rings(mxge_softc_t *sc)
3299 int tx_ring_entries, rx_ring_entries;
3302 /* get ring sizes */
3303 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3304 tx_ring_size = cmd.data0;
3306 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
/* ring sizes are in bytes; convert to descriptor counts */
3310 tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3311 rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3312 IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3313 sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3314 IFQ_SET_READY(&sc->ifp->if_snd);
3316 for (slice = 0; slice < sc->num_slices; slice++) {
3317 err = mxge_alloc_slice_rings(&sc->ss[slice],
3326 mxge_free_rings(sc);
/*
 * mxge_choose_params(): pick the big receive-buffer geometry for a given
 * MTU.  Outputs: *big_buf_size (size advertised to firmware), *cl_size
 * (mbuf cluster size to allocate), *nbufs (buffers per cluster; >1 only
 * with MXGE_VIRT_JUMBOS, where one 9k cluster is carved into 4k pieces).
 */
3333 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3335 int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3337 if (bufsize < MCLBYTES) {
3338 /* easy, everything fits in a single buffer */
3339 *big_buf_size = MCLBYTES;
3340 *cl_size = MCLBYTES;
3345 if (bufsize < MJUMPAGESIZE) {
3346 /* still easy, everything still fits in a single buffer */
3347 *big_buf_size = MJUMPAGESIZE;
3348 *cl_size = MJUMPAGESIZE;
3352 #if MXGE_VIRT_JUMBOS
3353 /* now we need to use virtually contiguous buffers */
3354 *cl_size = MJUM9BYTES;
3355 *big_buf_size = 4096;
3356 *nbufs = mtu / 4096 + 1;
3357 /* needs to be a power of two, so round up */
3361 *cl_size = MJUM9BYTES;
3362 *big_buf_size = MJUM9BYTES;
/*
 * mxge_slice_open(): bring one slice online.  Builds the LRO free list,
 * fetches the lanai (NIC SRAM) pointers for the send and receive rings
 * from firmware, then stocks the small ring completely and the big ring
 * every `nbufs` slots (big shadow entries are poisoned to all-ones
 * first).  Returns 0 or an errno-style failure.
 */
3368 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3373 struct lro_entry *lro_entry;
3378 slice = ss - sc->ss;
3380 SLIST_INIT(&ss->lro_free);
3381 SLIST_INIT(&ss->lro_active);
3383 for (i = 0; i < sc->lro_cnt; i++) {
3384 lro_entry = (struct lro_entry *)
3385 malloc(sizeof (*lro_entry), M_DEVBUF,
3387 if (lro_entry == NULL) {
3391 SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3393 /* get the lanai pointers to the send and receive rings */
3396 #ifndef IFNET_BUF_RING
3397 /* We currently only send from the first slice */
3401 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3403 (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3404 ss->tx.send_go = (volatile uint32_t *)
3405 (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3406 ss->tx.send_stop = (volatile uint32_t *)
3407 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3408 #ifndef IFNET_BUF_RING
3412 err |= mxge_send_cmd(sc,
3413 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3414 ss->rx_small.lanai =
3415 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3417 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3419 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3422 device_printf(sc->dev,
3423 "failed to get ring sizes or locations\n");
3427 /* stock receive rings */
3428 for (i = 0; i <= ss->rx_small.mask; i++) {
3429 map = ss->rx_small.info[i].map;
3430 err = mxge_get_buf_small(ss, map, i);
3432 device_printf(sc->dev, "alloced %d/%d smalls\n",
3433 i, ss->rx_small.mask + 1);
/* poison big-ring shadow addresses so unstocked slots are detectable */
3437 for (i = 0; i <= ss->rx_big.mask; i++) {
3438 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3439 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3441 ss->rx_big.nbufs = nbufs;
3442 ss->rx_big.cl_size = cl_size;
3443 ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3444 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3445 for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3446 map = ss->rx_big.info[i].map;
3447 err = mxge_get_buf_big(ss, map, i);
3449 device_printf(sc->dev, "alloced %d/%d bigs\n",
3450 i, ss->rx_big.mask + 1);
/*
 * mxge_open(): bring the interface up.  Resets the NIC, programs the RSS
 * indirection table when multiple slices are enabled, chooses buffer
 * sizes from the MTU, tells firmware the MTU and small/big buffer
 * sizes, points firmware at each slice's stats DMA block (falling back
 * to the obsolete single-stats command, which disables multicast
 * support), opens every slice, issues ETHERNET_UP, marks the ifnet
 * RUNNING, and arms the tick callout.  On failure, frees mbufs.
 * NOTE(review): intermediate error returns are elided in this copy.
 */
3458 mxge_open(mxge_softc_t *sc)
3461 int err, big_bytes, nbufs, slice, cl_size, i;
3463 volatile uint8_t *itable;
3464 struct mxge_slice_state *ss;
3466 /* Copy the MAC address in case it was overridden */
3467 bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3469 err = mxge_reset(sc, 1);
3471 device_printf(sc->dev, "failed to reset\n");
3475 if (sc->num_slices > 1) {
3476 /* setup the indirection table */
3477 cmd.data0 = sc->num_slices;
3478 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3481 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3484 device_printf(sc->dev,
3485 "failed to setup rss tables\n");
3489 /* just enable an identity mapping */
3490 itable = sc->sram + cmd.data0;
3491 for (i = 0; i < sc->num_slices; i++)
3492 itable[i] = (uint8_t)i;
3495 cmd.data1 = mxge_rss_hash_type;
3496 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3498 device_printf(sc->dev, "failed to enable slices\n");
3504 mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3507 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3509 /* error is only meaningful if we're trying to set
3510 MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3511 if (err && nbufs > 1) {
3512 device_printf(sc->dev,
3513 "Failed to set alway-use-n to %d\n",
3517 /* Give the firmware the mtu and the big and small buffer
3518 sizes. The firmware wants the big buf size to be a power
3519 of two. Luckily, FreeBSD's clusters are powers of two */
3520 cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3521 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3522 cmd.data0 = MHLEN - MXGEFW_PAD;
3523 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3525 cmd.data0 = big_bytes;
3526 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3529 device_printf(sc->dev, "failed to setup params\n");
3533 /* Now give him the pointer to the stats block */
3535 #ifdef IFNET_BUF_RING
3536 slice < sc->num_slices;
3541 ss = &sc->ss[slice];
3543 MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3545 MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3546 cmd.data2 = sizeof(struct mcp_irq_data);
/* slice index is encoded in the upper 16 bits of data2 */
3547 cmd.data2 |= (slice << 16);
3548 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3552 bus = sc->ss->fw_stats_dma.bus_addr;
3553 bus += offsetof(struct mcp_irq_data, send_done_count);
3554 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3555 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3556 err = mxge_send_cmd(sc,
3557 MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3559 /* Firmware cannot support multicast without STATS_DMA_V2 */
3560 sc->fw_multicast_support = 0;
3562 sc->fw_multicast_support = 1;
3566 device_printf(sc->dev, "failed to setup params\n");
3570 for (slice = 0; slice < sc->num_slices; slice++) {
3571 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3573 device_printf(sc->dev, "couldn't open slice %d\n",
3579 /* Finally, start the firmware running */
3580 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3582 device_printf(sc->dev, "Couldn't bring up link\n");
3585 #ifdef IFNET_BUF_RING
3586 for (slice = 0; slice < sc->num_slices; slice++) {
3587 ss = &sc->ss[slice];
3588 ss->if_drv_flags |= IFF_DRV_RUNNING;
3589 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3592 sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3593 sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3594 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3600 mxge_free_mbufs(sc);
/*
 * mxge_close(): bring the interface down.  Stops the tick callout,
 * clears RUNNING, issues ETHERNET_DOWN to firmware, then waits (by
 * polling sc->down_cnt against its old value with delays) for the
 * "down" interrupt before freeing all posted mbufs.
 */
3606 mxge_close(mxge_softc_t *sc, int down)
3609 int err, old_down_cnt;
3610 #ifdef IFNET_BUF_RING
3611 struct mxge_slice_state *ss;
3615 callout_stop(&sc->co_hdl);
3616 #ifdef IFNET_BUF_RING
3617 for (slice = 0; slice < sc->num_slices; slice++) {
3618 ss = &sc->ss[slice];
3619 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3622 sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3624 old_down_cnt = sc->down_cnt;
3626 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3628 device_printf(sc->dev,
3629 "Couldn't bring down link\n");
3631 if (old_down_cnt == sc->down_cnt) {
3632 /* wait for down irq */
3633 DELAY(10 * sc->intr_coal_delay);
3636 if (old_down_cnt == sc->down_cnt) {
3637 device_printf(sc->dev, "never got down irq\n");
3640 mxge_free_mbufs(sc);
/*
 * mxge_setup_cfg_space(): read the PCIe link width from the express
 * capability, raise the max read request size to 4KB (field at cap+0x8,
 * bits 14:12 = 5) -- or restore the pectl value saved before a watchdog
 * reset -- then enable bus mastering and memory-space decoding.
 */
3646 mxge_setup_cfg_space(mxge_softc_t *sc)
3648 device_t dev = sc->dev;
3650 uint16_t cmd, lnk, pectl;
3652 /* find the PCIe link width and set max read request to 4KB*/
3653 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3654 lnk = pci_read_config(dev, reg + 0x12, 2);
3655 sc->link_width = (lnk >> 4) & 0x3f;
3657 if (sc->pectl == 0) {
3658 pectl = pci_read_config(dev, reg + 0x8, 2);
3659 pectl = (pectl & ~0x7000) | (5 << 12);
3660 pci_write_config(dev, reg + 0x8, pectl, 2);
3663 /* restore saved pectl after watchdog reset */
3664 pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3668 /* Enable DMA and Memory space access */
3669 pci_enable_busmaster(dev);
3670 cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3671 cmd |= PCIM_CMD_MEMEN;
3672 pci_write_config(dev, PCIR_COMMAND, cmd, 2);
/*
 * mxge_read_reboot(): fetch the NIC's reboot status register through the
 * vendor-specific PCI capability (enable read32 mode, select the
 * register, read back the value).  Returns (uint32_t)-1 when the
 * vendor capability cannot be located.
 */
3676 mxge_read_reboot(mxge_softc_t *sc)
3678 device_t dev = sc->dev;
3681 /* find the vendor specific offset */
3682 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3683 device_printf(sc->dev,
3684 "could not find vendor specific offset\n");
3685 return (uint32_t)-1;
3687 /* enable read32 mode */
3688 pci_write_config(dev, vs + 0x10, 0x3, 1);
3689 /* tell NIC which register to read */
3690 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3691 return (pci_read_config(dev, vs + 0x14, 4));
/*
 * mxge_watchdog_reset(): recovery path when a TX ring stalls.  Detects
 * whether the NIC rebooted (PCI command register reads 0xffff while the
 * device re-enumerates, or busmaster bit cleared afterwards); if so,
 * quiesces TX by taking every TX lock, restores PCI config space,
 * reloads firmware, and re-opens the interface.  If the NIC did NOT
 * reboot, it only dumps the stalled ring state and does not reset.
 * Returns an err value (elided paths) used by mxge_watchdog().
 */
3695 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
3697 struct pci_devinfo *dinfo;
3698 struct mxge_slice_state *ss;
3700 int err, running, s, num_tx_slices = 1;
3706 device_printf(sc->dev, "Watchdog reset!\n");
3709 * check to see if the NIC rebooted. If it did, then all of
3710 * PCI config space has been reset, and things like the
3711 * busmaster bit will be zero. If this is the case, then we
3712 * must restore PCI config space before the NIC can be used
3715 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3716 if (cmd == 0xffff) {
3718 * maybe the watchdog caught the NIC rebooting; wait
3719 * up to 100ms for it to finish. If it does not come
3720 * back, then give up
3723 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3724 if (cmd == 0xffff) {
3725 device_printf(sc->dev, "NIC disappeared!\n");
3729 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3730 /* print the reboot status */
3731 reboot = mxge_read_reboot(sc);
3732 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3734 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3738 * quiesce NIC so that TX routines will not try to
3739 * xmit after restoration of BAR
3742 /* Mark the link as down */
3743 if (sc->link_state) {
3745 if_link_state_change(sc->ifp,
3748 #ifdef IFNET_BUF_RING
3749 num_tx_slices = sc->num_slices;
3751 /* grab all TX locks to ensure no tx */
3752 for (s = 0; s < num_tx_slices; s++) {
3754 mtx_lock(&ss->tx.mtx);
3758 /* restore PCI configuration space */
3759 dinfo = device_get_ivars(sc->dev);
3760 pci_cfg_restore(sc->dev, dinfo);
3762 /* and redo any changes we made to our config space */
3763 mxge_setup_cfg_space(sc);
3766 err = mxge_load_firmware(sc, 0);
3768 device_printf(sc->dev,
3769 "Unable to re-load f/w\n");
3773 err = mxge_open(sc);
3774 /* release all TX locks */
3775 for (s = 0; s < num_tx_slices; s++) {
3777 #ifdef IFNET_BUF_RING
3778 mxge_start_locked(ss);
3780 mtx_unlock(&ss->tx.mtx);
3783 sc->watchdog_resets++;
3785 tx = &sc->ss[slice].tx;
3786 device_printf(sc->dev,
3787 "NIC did not reboot, slice %d ring state:\n",
3789 device_printf(sc->dev,
3790 "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3791 tx->req, tx->done, tx->queue_active);
3792 device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3793 tx->activate, tx->deactivate);
3794 device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3796 be32toh(sc->ss->fw_stats->send_done_count));
3797 device_printf(sc->dev, "not resetting\n");
3800 device_printf(sc->dev, "watchdog reset failed\n");
/*
 * mxge_watchdog(): called from the tick callout.  For each TX-capable
 * slice, detect a stalled ring (requests outstanding and no progress
 * since the last tick); reset the NIC unless the stall is explained by
 * received pause frames, in which case only warn.  Also triggers a
 * deferred media probe when the interrupt handler requested one.
 */
3806 mxge_watchdog(mxge_softc_t *sc)
3809 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3812 /* see if we have outstanding transmits, which
3813 have been pending for more than mxge_ticks */
3815 #ifdef IFNET_BUF_RING
3816 (i < sc->num_slices) && (err == 0);
3818 (i < 1) && (err == 0);
/* stalled: work pending, watchdog saw it last tick, and no progress */
3822 if (tx->req != tx->done &&
3823 tx->watchdog_req != tx->watchdog_done &&
3824 tx->done == tx->watchdog_done) {
3825 /* check for pause blocking before resetting */
3826 if (tx->watchdog_rx_pause == rx_pause)
3827 err = mxge_watchdog_reset(sc, i);
3829 device_printf(sc->dev, "Flow control blocking "
3830 "xmits, check link partner\n");
3833 tx->watchdog_req = tx->req;
3834 tx->watchdog_done = tx->done;
3835 tx->watchdog_rx_pause = rx_pause;
3838 if (sc->need_media_probe)
3839 mxge_media_probe(sc);
/*
 * mxge_update_stats(): aggregate per-slice packet/error counters into
 * the ifnet statistics (plus bytes/mcasts/drops when per-slice TX rings
 * are in use via IFNET_BUF_RING).
 */
3844 mxge_update_stats(mxge_softc_t *sc)
3846 struct mxge_slice_state *ss;
3847 u_long ipackets = 0;
3848 u_long opackets = 0;
3849 #ifdef IFNET_BUF_RING
3857 for (slice = 0; slice < sc->num_slices; slice++) {
3858 ss = &sc->ss[slice];
3859 ipackets += ss->ipackets;
3860 opackets += ss->opackets;
3861 #ifdef IFNET_BUF_RING
3862 obytes += ss->obytes;
3863 omcasts += ss->omcasts;
3864 odrops += ss->tx.br->br_drops;
3866 oerrors += ss->oerrors;
3868 sc->ifp->if_ipackets = ipackets;
3869 sc->ifp->if_opackets = opackets;
3870 #ifdef IFNET_BUF_RING
3871 sc->ifp->if_obytes = obytes;
3872 sc->ifp->if_omcasts = omcasts;
3873 sc->ifp->if_snd.ifq_drops = odrops;
3875 sc->ifp->if_oerrors = oerrors;
/*
 * mxge_tick(): periodic callout.  Aggregates slice statistics every
 * tick and runs the TX watchdog every 5th tick (countdown from 4),
 * then re-arms itself at mxge_ticks.
 */
3879 mxge_tick(void *arg)
3881 mxge_softc_t *sc = arg;
3884 /* aggregate stats from different slices */
3885 mxge_update_stats(sc);
3886 if (!sc->watchdog_countdown) {
3887 err = mxge_watchdog(sc);
3888 sc->watchdog_countdown = 4;
3890 sc->watchdog_countdown--;
3892 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
/*
 * mxge_media_change(): ifmedia change callback.
 * NOTE(review): body elided in this copy -- presumably a no-op/EINVAL
 * stub since the hardware autoselects; confirm against the full source.
 */
3897 mxge_media_change(struct ifnet *ifp)
/*
 * mxge_change_mtu(): validate the requested MTU (framed size must be
 * <= sc->max_mtu and >= 60), then, under the driver mutex, apply it;
 * if the interface was running, close and re-open it, restoring the
 * old MTU (and re-opening) if the new configuration fails.
 */
3903 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3905 struct ifnet *ifp = sc->ifp;
3906 int real_mtu, old_mtu;
3910 real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3911 if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3913 mtx_lock(&sc->driver_mtx);
3914 old_mtu = ifp->if_mtu;
3916 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3918 err = mxge_open(sc);
/* re-open failed: roll back to the previous MTU */
3920 ifp->if_mtu = old_mtu;
3922 (void) mxge_open(sc);
3925 mtx_unlock(&sc->driver_mtx);
/*
 * mxge_media_status(): ifmedia status callback -- reports link validity,
 * active/inactive from sc->link_state, and autoselected full-duplex
 * Ethernet media.
 */
3930 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3932 mxge_softc_t *sc = ifp->if_softc;
3937 ifmr->ifm_status = IFM_AVALID;
3938 ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3939 ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3940 ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
/*
 * mxge_ioctl(): ifnet ioctl handler.  Visible cases: SIOCSIFMTU (via
 * mxge_change_mtu), interface up/down with promiscuous/multicast
 * refresh, multicast list updates, capability toggles (TX/RX checksum,
 * TSO4 -- which requires TXCSUM -- LRO, VLAN hw tagging) under the
 * driver mutex, and media ioctls forwarded to ifmedia_ioctl().
 */
3944 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3946 mxge_softc_t *sc = ifp->if_softc;
3947 struct ifreq *ifr = (struct ifreq *)data;
3954 err = ether_ioctl(ifp, command, data);
3958 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3962 mtx_lock(&sc->driver_mtx);
3964 mtx_unlock(&sc->driver_mtx);
3967 if (ifp->if_flags & IFF_UP) {
3968 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3969 err = mxge_open(sc);
3971 /* take care of promis can allmulti
3973 mxge_change_promisc(sc,
3974 ifp->if_flags & IFF_PROMISC);
3975 mxge_set_multicast_list(sc);
3978 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3982 mtx_unlock(&sc->driver_mtx);
3987 mtx_lock(&sc->driver_mtx);
3988 mxge_set_multicast_list(sc);
3989 mtx_unlock(&sc->driver_mtx);
3993 mtx_lock(&sc->driver_mtx);
3994 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3995 if (mask & IFCAP_TXCSUM) {
3996 if (IFCAP_TXCSUM & ifp->if_capenable) {
/* disabling TXCSUM also disables TSO4, which depends on it */
3997 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3998 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4001 ifp->if_capenable |= IFCAP_TXCSUM;
4002 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4004 } else if (mask & IFCAP_RXCSUM) {
4005 if (IFCAP_RXCSUM & ifp->if_capenable) {
4006 ifp->if_capenable &= ~IFCAP_RXCSUM;
4009 ifp->if_capenable |= IFCAP_RXCSUM;
4013 if (mask & IFCAP_TSO4) {
4014 if (IFCAP_TSO4 & ifp->if_capenable) {
4015 ifp->if_capenable &= ~IFCAP_TSO4;
4016 ifp->if_hwassist &= ~CSUM_TSO;
4017 } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4018 ifp->if_capenable |= IFCAP_TSO4;
4019 ifp->if_hwassist |= CSUM_TSO;
4021 printf("mxge requires tx checksum offload"
4022 " be enabled to use TSO\n");
4026 if (mask & IFCAP_LRO) {
4027 if (IFCAP_LRO & ifp->if_capenable)
4028 err = mxge_change_lro_locked(sc, 0);
4030 err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4032 if (mask & IFCAP_VLAN_HWTAGGING)
4033 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4034 mtx_unlock(&sc->driver_mtx);
4035 VLAN_CAPABILITIES(ifp);
4040 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4041 &sc->media, command);
/*
 * mxge_fetch_tunables(): read hw.mxge.* loader tunables into the module
 * globals (and sc->lro_cnt), then clamp them to sane ranges: intr
 * coalescing delay 0..10000us (else 30), tick period defaults to hz/2,
 * RSS hash type within firmware limits, and initial MTU within
 * [ETHER_MIN_LEN, ETHERMTU_JUMBO].
 * Note both "rss_hash_type" and the legacy "rss_hashtype" spellings are
 * accepted; the latter wins when both are set.
 */
4051 mxge_fetch_tunables(mxge_softc_t *sc)
4054 TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4055 TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4056 &mxge_flow_control);
4057 TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4058 &mxge_intr_coal_delay);
4059 TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4060 &mxge_nvidia_ecrc_enable);
4061 TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4062 &mxge_force_firmware);
4063 TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4064 &mxge_deassert_wait);
4065 TUNABLE_INT_FETCH("hw.mxge.verbose",
4067 TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4068 TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4069 TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4070 TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4071 TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4072 TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4073 if (sc->lro_cnt != 0)
4074 mxge_lro_cnt = sc->lro_cnt;
4078 if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4079 mxge_intr_coal_delay = 30;
4080 if (mxge_ticks == 0)
4081 mxge_ticks = hz / 2;
4082 sc->pause = mxge_flow_control;
4083 if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4084 || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4085 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
4087 if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4088 mxge_initial_mtu < ETHER_MIN_LEN)
4089 mxge_initial_mtu = ETHERMTU_JUMBO;
/*
 * mxge_free_slices(): release per-slice allocations made by
 * mxge_alloc_slices() -- fw stats DMA block, buf_ring (IFNET_BUF_RING),
 * TX mutex, and rx_done DMA area -- then the slice array itself.
 */
4094 mxge_free_slices(mxge_softc_t *sc)
4096 struct mxge_slice_state *ss;
4103 for (i = 0; i < sc->num_slices; i++) {
4105 if (ss->fw_stats != NULL) {
4106 mxge_dma_free(&ss->fw_stats_dma);
4107 ss->fw_stats = NULL;
4108 #ifdef IFNET_BUF_RING
4109 if (ss->tx.br != NULL) {
4110 drbr_free(ss->tx.br, M_DEVBUF);
4114 mtx_destroy(&ss->tx.mtx);
4116 if (ss->rx_done.entry != NULL) {
4117 mxge_dma_free(&ss->rx_done.dma);
4118 ss->rx_done.entry = NULL;
4121 free(sc->ss, M_DEVBUF);
/*
 * mxge_alloc_slices(): allocate the sc->ss slice array and, per slice,
 * a 4KB-aligned rx_done interrupt queue (sized from the firmware rx
 * ring size), a 64-byte-aligned firmware stats DMA block, the TX mutex,
 * and (with IFNET_BUF_RING) a 2048-entry buf_ring.  Unwinds through
 * mxge_free_slices() on failure.
 */
4126 mxge_alloc_slices(mxge_softc_t *sc)
4129 struct mxge_slice_state *ss;
4131 int err, i, max_intr_slots;
4133 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4135 device_printf(sc->dev, "Cannot determine rx ring size\n");
4138 sc->rx_ring_size = cmd.data0;
/* intr queue must hold completions for both small and big rings */
4139 max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4141 bytes = sizeof (*sc->ss) * sc->num_slices;
4142 sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4145 for (i = 0; i < sc->num_slices; i++) {
4150 /* allocate per-slice rx interrupt queues */
4152 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4153 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4156 ss->rx_done.entry = ss->rx_done.dma.addr;
4157 bzero(ss->rx_done.entry, bytes);
4160 * allocate the per-slice firmware stats; stats
4161 * (including tx) are used used only on the first
4164 #ifndef IFNET_BUF_RING
4169 bytes = sizeof (*ss->fw_stats);
4170 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4171 sizeof (*ss->fw_stats), 64);
4174 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4175 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4176 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4177 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4178 #ifdef IFNET_BUF_RING
4179 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4187 mxge_free_slices(sc);
/*
 * mxge_slice_probe(): decide how many RSS slices to use.  Bails to a
 * single slice unless multi-slice is enabled, the system is SMP, and
 * enough MSI-X vectors exist.  Loads the RSS-aware firmware variant,
 * resets the NIC, configures the interrupt queue size, queries the
 * firmware's maximum RSS queue count, then caps sc->num_slices by the
 * MSI-X count, mp_ncpus (or hw.mxge.max_slices), and rounds down to a
 * power of two.  On any failure the original firmware is reloaded.
 */
4192 mxge_slice_probe(mxge_softc_t *sc)
4196 int msix_cnt, status, max_intr_slots;
4200 * don't enable multiple slices if they are not enabled,
4201 * or if this is not an SMP system
4204 if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4207 /* see how many MSI-X interrupts are available */
4208 msix_cnt = pci_msix_count(sc->dev);
4212 /* now load the slice aware firmware see what it supports */
4213 old_fw = sc->fw_name;
4214 if (old_fw == mxge_fw_aligned)
4215 sc->fw_name = mxge_fw_rss_aligned;
4217 sc->fw_name = mxge_fw_rss_unaligned;
4218 status = mxge_load_firmware(sc, 0);
4220 device_printf(sc->dev, "Falling back to a single slice\n");
4224 /* try to send a reset command to the card to see if it
4226 memset(&cmd, 0, sizeof (cmd));
4227 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4229 device_printf(sc->dev, "failed reset\n");
4233 /* get rx ring size */
4234 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4236 device_printf(sc->dev, "Cannot determine rx ring size\n");
4239 max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4241 /* tell it the size of the interrupt queues */
4242 cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4243 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4245 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4249 /* ask the maximum number of slices it supports */
4250 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4252 device_printf(sc->dev,
4253 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4256 sc->num_slices = cmd.data0;
4257 if (sc->num_slices > msix_cnt)
4258 sc->num_slices = msix_cnt;
4260 if (mxge_max_slices == -1) {
4261 /* cap to number of CPUs in system */
4262 if (sc->num_slices > mp_ncpus)
4263 sc->num_slices = mp_ncpus;
4265 if (sc->num_slices > mxge_max_slices)
4266 sc->num_slices = mxge_max_slices;
4268 /* make sure it is a power of two */
4269 while (sc->num_slices & (sc->num_slices - 1))
4273 device_printf(sc->dev, "using %d slices\n",
/* error path: restore the original (non-RSS) firmware */
4279 sc->fw_name = old_fw;
4280 (void) mxge_load_firmware(sc, 0);
/*
 * mxge_add_msix_irqs(): allocate and wire up one MSI-X vector per
 * slice: map the MSI-X table BAR (BAR 2), allocate num_slices message
 * vectors, one IRQ resource each, and attach mxge_intr with the slice
 * state as its argument.  Unwinds fully (teardown intr, release IRQs,
 * release MSI, release table BAR) on any failure via the abort_with_*
 * labels.
 */
4284 mxge_add_msix_irqs(mxge_softc_t *sc)
4287 int count, err, i, rid;
4290 sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4293 if (sc->msix_table_res == NULL) {
4294 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4298 count = sc->num_slices;
4299 err = pci_alloc_msix(sc->dev, &count);
4301 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4302 "err = %d \n", sc->num_slices, err);
4303 goto abort_with_msix_table;
4305 if (count < sc->num_slices) {
4306 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4307 count, sc->num_slices);
4308 device_printf(sc->dev,
4309 "Try setting hw.mxge.max_slices to %d\n",
4312 goto abort_with_msix;
4314 bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4315 sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4316 if (sc->msix_irq_res == NULL) {
4318 goto abort_with_msix;
4321 for (i = 0; i < sc->num_slices; i++) {
4323 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4326 if (sc->msix_irq_res[i] == NULL) {
4327 device_printf(sc->dev, "couldn't allocate IRQ res"
4328 " for message %d\n", i);
4330 goto abort_with_res;
4334 bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4335 sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4337 for (i = 0; i < sc->num_slices; i++) {
4338 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4339 INTR_TYPE_NET | INTR_MPSAFE,
4340 #if __FreeBSD_version > 700030
4343 mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4345 device_printf(sc->dev, "couldn't setup intr for "
4347 goto abort_with_intr;
4352 device_printf(sc->dev, "using %d msix IRQs:",
4354 for (i = 0; i < sc->num_slices; i++)
4355 printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
/* ---- error unwind labels below ---- */
4361 for (i = 0; i < sc->num_slices; i++) {
4362 if (sc->msix_ih[i] != NULL) {
4363 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4365 sc->msix_ih[i] = NULL;
4368 free(sc->msix_ih, M_DEVBUF);
4372 for (i = 0; i < sc->num_slices; i++) {
4374 if (sc->msix_irq_res[i] != NULL)
4375 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4376 sc->msix_irq_res[i]);
4377 sc->msix_irq_res[i] = NULL;
4379 free(sc->msix_irq_res, M_DEVBUF);
4383 pci_release_msi(sc->dev);
4385 abort_with_msix_table:
4386 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4387 sc->msix_table_res);
4393 mxge_add_single_irq(mxge_softc_t *sc)
4395 int count, err, rid;
4397 count = pci_msi_count(sc->dev);
4398 if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4404 sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4405 1, RF_SHAREABLE | RF_ACTIVE);
4406 if (sc->irq_res == NULL) {
4407 device_printf(sc->dev, "could not alloc interrupt\n");
4411 device_printf(sc->dev, "using %s irq %ld\n",
4412 sc->legacy_irq ? "INTx" : "MSI",
4413 rman_get_start(sc->irq_res));
4414 err = bus_setup_intr(sc->dev, sc->irq_res,
4415 INTR_TYPE_NET | INTR_MPSAFE,
4416 #if __FreeBSD_version > 700030
4419 mxge_intr, &sc->ss[0], &sc->ih);
4421 bus_release_resource(sc->dev, SYS_RES_IRQ,
4422 sc->legacy_irq ? 0 : 1, sc->irq_res);
4423 if (!sc->legacy_irq)
4424 pci_release_msi(sc->dev);
4430 mxge_rem_msix_irqs(mxge_softc_t *sc)
4434 for (i = 0; i < sc->num_slices; i++) {
4435 if (sc->msix_ih[i] != NULL) {
4436 bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4438 sc->msix_ih[i] = NULL;
4441 free(sc->msix_ih, M_DEVBUF);
4443 for (i = 0; i < sc->num_slices; i++) {
4445 if (sc->msix_irq_res[i] != NULL)
4446 bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4447 sc->msix_irq_res[i]);
4448 sc->msix_irq_res[i] = NULL;
4450 free(sc->msix_irq_res, M_DEVBUF);
4452 bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4453 sc->msix_table_res);
4455 pci_release_msi(sc->dev);
4460 mxge_rem_single_irq(mxge_softc_t *sc)
4462 bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4463 bus_release_resource(sc->dev, SYS_RES_IRQ,
4464 sc->legacy_irq ? 0 : 1, sc->irq_res);
4465 if (!sc->legacy_irq)
4466 pci_release_msi(sc->dev);
4470 mxge_rem_irq(mxge_softc_t *sc)
4472 if (sc->num_slices > 1)
4473 mxge_rem_msix_irqs(sc);
4475 mxge_rem_single_irq(sc);
4479 mxge_add_irq(mxge_softc_t *sc)
4483 if (sc->num_slices > 1)
4484 err = mxge_add_msix_irqs(sc);
4486 err = mxge_add_single_irq(sc);
4488 if (0 && err == 0 && sc->num_slices > 1) {
4489 mxge_rem_msix_irqs(sc);
4490 err = mxge_add_msix_irqs(sc);
4497 mxge_attach(device_t dev)
4499 mxge_softc_t *sc = device_get_softc(dev);
4504 mxge_fetch_tunables(sc);
4506 err = bus_dma_tag_create(NULL, /* parent */
4509 BUS_SPACE_MAXADDR, /* low */
4510 BUS_SPACE_MAXADDR, /* high */
4511 NULL, NULL, /* filter */
4512 65536 + 256, /* maxsize */
4513 MXGE_MAX_SEND_DESC, /* num segs */
4514 65536, /* maxsegsize */
4516 NULL, NULL, /* lock */
4517 &sc->parent_dmat); /* tag */
4520 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4522 goto abort_with_nothing;
4525 ifp = sc->ifp = if_alloc(IFT_ETHER);
4527 device_printf(dev, "can not if_alloc()\n");
4529 goto abort_with_parent_dmat;
4531 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4533 snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4534 device_get_nameunit(dev));
4535 mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4536 snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4537 "%s:drv", device_get_nameunit(dev));
4538 mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4539 MTX_NETWORK_LOCK, MTX_DEF);
4541 callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4543 mxge_setup_cfg_space(sc);
4545 /* Map the board into the kernel */
4547 sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4549 if (sc->mem_res == NULL) {
4550 device_printf(dev, "could not map memory\n");
4552 goto abort_with_lock;
4554 sc->sram = rman_get_virtual(sc->mem_res);
4555 sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4556 if (sc->sram_size > rman_get_size(sc->mem_res)) {
4557 device_printf(dev, "impossible memory region size %ld\n",
4558 rman_get_size(sc->mem_res));
4560 goto abort_with_mem_res;
4563 /* make NULL terminated copy of the EEPROM strings section of
4565 bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4566 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4567 rman_get_bushandle(sc->mem_res),
4568 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4570 MXGE_EEPROM_STRINGS_SIZE - 2);
4571 err = mxge_parse_strings(sc);
4573 goto abort_with_mem_res;
4575 /* Enable write combining for efficient use of PCIe bus */
4578 /* Allocate the out of band dma memory */
4579 err = mxge_dma_alloc(sc, &sc->cmd_dma,
4580 sizeof (mxge_cmd_t), 64);
4582 goto abort_with_mem_res;
4583 sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4584 err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4586 goto abort_with_cmd_dma;
4588 err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4590 goto abort_with_zeropad_dma;
4592 /* select & load the firmware */
4593 err = mxge_select_firmware(sc);
4595 goto abort_with_dmabench;
4596 sc->intr_coal_delay = mxge_intr_coal_delay;
4598 mxge_slice_probe(sc);
4599 err = mxge_alloc_slices(sc);
4601 goto abort_with_dmabench;
4603 err = mxge_reset(sc, 0);
4605 goto abort_with_slices;
4607 err = mxge_alloc_rings(sc);
4609 device_printf(sc->dev, "failed to allocate rings\n");
4610 goto abort_with_dmabench;
4613 err = mxge_add_irq(sc);
4615 device_printf(sc->dev, "failed to add irq\n");
4616 goto abort_with_rings;
4619 ifp->if_baudrate = IF_Gbps(10UL);
4620 ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4623 ifp->if_capabilities |= IFCAP_LRO;
4626 #ifdef MXGE_NEW_VLAN_API
4627 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4630 sc->max_mtu = mxge_max_mtu(sc);
4631 if (sc->max_mtu >= 9000)
4632 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4634 device_printf(dev, "MTU limited to %d. Install "
4635 "latest firmware for 9000 byte jumbo support\n",
4636 sc->max_mtu - ETHER_HDR_LEN);
4637 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4638 ifp->if_capenable = ifp->if_capabilities;
4639 if (sc->lro_cnt == 0)
4640 ifp->if_capenable &= ~IFCAP_LRO;
4642 ifp->if_init = mxge_init;
4644 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4645 ifp->if_ioctl = mxge_ioctl;
4646 ifp->if_start = mxge_start;
4647 /* Initialise the ifmedia structure */
4648 ifmedia_init(&sc->media, 0, mxge_media_change,
4650 mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4651 mxge_media_probe(sc);
4653 ether_ifattach(ifp, sc->mac_addr);
4654 /* ether_ifattach sets mtu to ETHERMTU */
4655 if (mxge_initial_mtu != ETHERMTU)
4656 mxge_change_mtu(sc, mxge_initial_mtu);
4658 mxge_add_sysctls(sc);
4659 #ifdef IFNET_BUF_RING
4660 ifp->if_transmit = mxge_transmit;
4661 ifp->if_qflush = mxge_qflush;
4666 mxge_free_rings(sc);
4668 mxge_free_slices(sc);
4669 abort_with_dmabench:
4670 mxge_dma_free(&sc->dmabench_dma);
4671 abort_with_zeropad_dma:
4672 mxge_dma_free(&sc->zeropad_dma);
4674 mxge_dma_free(&sc->cmd_dma);
4676 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4678 pci_disable_busmaster(dev);
4679 mtx_destroy(&sc->cmd_mtx);
4680 mtx_destroy(&sc->driver_mtx);
4682 abort_with_parent_dmat:
4683 bus_dma_tag_destroy(sc->parent_dmat);
4690 mxge_detach(device_t dev)
4692 mxge_softc_t *sc = device_get_softc(dev);
4694 if (mxge_vlans_active(sc)) {
4695 device_printf(sc->dev,
4696 "Detach vlans before removing module\n");
4699 mtx_lock(&sc->driver_mtx);
4701 if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4703 mtx_unlock(&sc->driver_mtx);
4704 ether_ifdetach(sc->ifp);
4705 callout_drain(&sc->co_hdl);
4706 ifmedia_removeall(&sc->media);
4707 mxge_dummy_rdma(sc, 0);
4708 mxge_rem_sysctls(sc);
4710 mxge_free_rings(sc);
4711 mxge_free_slices(sc);
4712 mxge_dma_free(&sc->dmabench_dma);
4713 mxge_dma_free(&sc->zeropad_dma);
4714 mxge_dma_free(&sc->cmd_dma);
4715 bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4716 pci_disable_busmaster(dev);
4717 mtx_destroy(&sc->cmd_mtx);
4718 mtx_destroy(&sc->driver_mtx);
4720 bus_dma_tag_destroy(sc->parent_dmat);
4725 mxge_shutdown(device_t dev)
/*
  This file uses Myri10GE driver indentation.

  Local Variables:
  c-file-style:"linux"
  tab-width:8
  End:
*/