]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/dev/mxge/if_mxge.c
gnu/dts: Update our copy of arm dts from Linux 4.16
[FreeBSD/FreeBSD.git] / sys / dev / mxge / if_mxge.c
1 /******************************************************************************
2 SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3
4 Copyright (c) 2006-2013, Myricom Inc.
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10  1. Redistributions of source code must retain the above copyright notice,
11     this list of conditions and the following disclaimer.
12
13  2. Neither the name of the Myricom Inc, nor the names of its
14     contributors may be used to endorse or promote products derived from
15     this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
28
29 ***************************************************************************/
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/linker.h>
37 #include <sys/firmware.h>
38 #include <sys/endian.h>
39 #include <sys/sockio.h>
40 #include <sys/mbuf.h>
41 #include <sys/malloc.h>
42 #include <sys/kdb.h>
43 #include <sys/kernel.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 #include <sys/taskqueue.h>
50 #include <sys/zlib.h>
51
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58
59 #include <net/bpf.h>
60
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/ip6.h>
68 #include <netinet/tcp.h>
69 #include <netinet/tcp_lro.h>
70 #include <netinet6/ip6_var.h>
71
72 #include <machine/bus.h>
73 #include <machine/in_cksum.h>
74 #include <machine/resource.h>
75 #include <sys/bus.h>
76 #include <sys/rman.h>
77 #include <sys/smp.h>
78
79 #include <dev/pci/pcireg.h>
80 #include <dev/pci/pcivar.h>
81 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
82
83 #include <vm/vm.h>              /* for pmap_mapdev() */
84 #include <vm/pmap.h>
85
86 #if defined(__i386) || defined(__amd64)
87 #include <machine/specialreg.h>
88 #endif
89
90 #include <dev/mxge/mxge_mcp.h>
91 #include <dev/mxge/mcp_gen_header.h>
92 /*#define MXGE_FAKE_IFP*/
93 #include <dev/mxge/if_mxge_var.h>
94 #ifdef IFNET_BUF_RING
95 #include <sys/buf_ring.h>
96 #endif
97
98 #include "opt_inet.h"
99 #include "opt_inet6.h"
100
/*
 * tunable params
 *
 * File-scope defaults.  Only the uses visible in the nearby code are
 * described; the remaining knobs are consumed elsewhere in the driver.
 */
/* non-zero lets mxge_enable_nvidia_ecrc() attempt to enable ECRC */
static int mxge_nvidia_ecrc_enable = 1;
/*
 * 0: probe for the right firmware; 1: assume aligned completions;
 * any other value: assume unaligned completions
 * (see mxge_select_firmware())
 */
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
/* non-zero enables additional informational device_printf() output */
static int mxge_verbose = 0;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names passed to firmware_get() via sc->fw_name */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
118
/*
 * Prototypes for the device methods listed in mxge_methods[] and for
 * the interrupt handler.
 */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);
124
/* Method table referenced by mxge_driver below. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};
135
/*
 * Driver description: name, method table, and the size of the
 * per-device softc (mxge_softc_t).
 */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};
142
static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/*
 * The firmware loader below uses firmware_get()/firmware_put() and
 * zlib's inflate(), hence the dependencies on those two modules.
 */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);
149
/* Forward declarations for helpers that are called before their definitions. */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
155
156 static int
157 mxge_probe(device_t dev)
158 {
159         int rev;
160
161
162         if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
163             ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
164              (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
165                 rev = pci_get_revid(dev);
166                 switch (rev) {
167                 case MXGE_PCI_REV_Z8E:
168                         device_set_desc(dev, "Myri10G-PCIE-8A");
169                         break;
170                 case MXGE_PCI_REV_Z8ES:
171                         device_set_desc(dev, "Myri10G-PCIE-8B");
172                         break;
173                 default:
174                         device_set_desc(dev, "Myri10G-PCIE-8??");
175                         device_printf(dev, "Unrecognized rev %d NIC\n",
176                                       rev);
177                         break;  
178                 }
179                 return 0;
180         }
181         return ENXIO;
182 }
183
184 static void
185 mxge_enable_wc(mxge_softc_t *sc)
186 {
187 #if defined(__i386) || defined(__amd64)
188         vm_offset_t len;
189         int err;
190
191         sc->wc = 1;
192         len = rman_get_size(sc->mem_res);
193         err = pmap_change_attr((vm_offset_t) sc->sram,
194                                len, PAT_WRITE_COMBINING);
195         if (err != 0) {
196                 device_printf(sc->dev, "pmap_change_attr failed, %d\n",
197                               err);
198                 sc->wc = 0;
199         }
200 #endif          
201 }
202
203
204 /* callback to get our DMA address */
205 static void
206 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
207                          int error)
208 {
209         if (error == 0) {
210                 *(bus_addr_t *) arg = segs->ds_addr;
211         }
212 }
213
214 static int
215 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
216                    bus_size_t alignment)
217 {
218         int err;
219         device_t dev = sc->dev;
220         bus_size_t boundary, maxsegsize;
221
222         if (bytes > 4096 && alignment == 4096) {
223                 boundary = 0;
224                 maxsegsize = bytes;
225         } else {
226                 boundary = 4096;
227                 maxsegsize = 4096;
228         }
229
230         /* allocate DMAable memory tags */
231         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
232                                  alignment,             /* alignment */
233                                  boundary,              /* boundary */
234                                  BUS_SPACE_MAXADDR,     /* low */
235                                  BUS_SPACE_MAXADDR,     /* high */
236                                  NULL, NULL,            /* filter */
237                                  bytes,                 /* maxsize */
238                                  1,                     /* num segs */
239                                  maxsegsize,            /* maxsegsize */
240                                  BUS_DMA_COHERENT,      /* flags */
241                                  NULL, NULL,            /* lock */
242                                  &dma->dmat);           /* tag */
243         if (err != 0) {
244                 device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
245                 return err;
246         }
247
248         /* allocate DMAable memory & map */
249         err = bus_dmamem_alloc(dma->dmat, &dma->addr,
250                                (BUS_DMA_WAITOK | BUS_DMA_COHERENT
251                                 | BUS_DMA_ZERO),  &dma->map);
252         if (err != 0) {
253                 device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
254                 goto abort_with_dmat;
255         }
256
257         /* load the memory */
258         err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
259                               mxge_dmamap_callback,
260                               (void *)&dma->bus_addr, 0);
261         if (err != 0) {
262                 device_printf(dev, "couldn't load map (err = %d)\n", err);
263                 goto abort_with_mem;
264         }
265         return 0;
266
267 abort_with_mem:
268         bus_dmamem_free(dma->dmat, dma->addr, dma->map);
269 abort_with_dmat:
270         (void)bus_dma_tag_destroy(dma->dmat);
271         return err;
272 }
273
274
/*
 * Release a region set up by mxge_dma_alloc(): unload the map, free
 * the memory, then destroy the tag (the reverse of the order in which
 * mxge_dma_alloc() acquired them).
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
        bus_dmamap_unload(dma->dmat, dma->map);
        bus_dmamem_free(dma->dmat, dma->addr, dma->map);
        /* the return value of the tag destroy is deliberately ignored */
        (void)bus_dma_tag_destroy(dma->dmat);
}
282
283 /*
284  * The eeprom strings on the lanaiX have the format
285  * SN=x\0
286  * MAC=x:x:x:x:x:x\0
287  * PC=text\0
288  */
289
290 static int
291 mxge_parse_strings(mxge_softc_t *sc)
292 {
293         char *ptr;
294         int i, found_mac, found_sn2;
295         char *endptr;
296
297         ptr = sc->eeprom_strings;
298         found_mac = 0;
299         found_sn2 = 0;
300         while (*ptr != '\0') {
301                 if (strncmp(ptr, "MAC=", 4) == 0) {
302                         ptr += 4;
303                         for (i = 0;;) {
304                                 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
305                                 if (endptr - ptr != 2)
306                                         goto abort;
307                                 ptr = endptr;
308                                 if (++i == 6)
309                                         break;
310                                 if (*ptr++ != ':')
311                                         goto abort;
312                         }
313                         found_mac = 1;
314                 } else if (strncmp(ptr, "PC=", 3) == 0) {
315                         ptr += 3;
316                         strlcpy(sc->product_code_string, ptr,
317                             sizeof(sc->product_code_string));
318                 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
319                         ptr += 3;
320                         strlcpy(sc->serial_number_string, ptr,
321                             sizeof(sc->serial_number_string));
322                 } else if (strncmp(ptr, "SN2=", 4) == 0) {
323                         /* SN2 takes precedence over SN */
324                         ptr += 4;
325                         found_sn2 = 1;
326                         strlcpy(sc->serial_number_string, ptr,
327                             sizeof(sc->serial_number_string));
328                 }
329                 while (*ptr++ != '\0') {}
330         }
331
332         if (found_mac)
333                 return 0;
334
335  abort:
336         device_printf(sc->dev, "failed to parse eeprom_strings\n");
337
338         return ENXIO;
339 }
340
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Try to set the ECRC enable bit on an upstream Nvidia bridge.
 *
 * The device two levels above us in the device tree is examined.  If
 * it is a recognised Nvidia (vendor 0x10de) part, its config space is
 * mapped directly with pmap_mapdev() and bit 0x40 of the 32-bit word
 * at offset 0x178 is set.  Every failure path returns silently or
 * with a diagnostic; nothing is reported to the caller.
 *
 * Does nothing when the mxge_nvidia_ecrc_enable tunable is zero.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
        uint32_t val;
        unsigned long base, off;
        char *va, *cfgptr;
        device_t pdev, mcp55;
        uint16_t vendor_id, device_id, word;
        uintptr_t bus, slot, func, ivend, idev;
        uint32_t *ptr32;


        if (!mxge_nvidia_ecrc_enable)
                return;

        /* grandparent of the NIC in the device tree */
        pdev = device_get_parent(device_get_parent(sc->dev));
        if (pdev == NULL) {
                device_printf(sc->dev, "could not find parent?\n");
                return;
        }
        vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
        device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

        /* only Nvidia bridges are handled here */
        if (vendor_id != 0x10de)
                return;

        /* base stays 0 unless a supported chipset is recognised below */
        base = 0;

        if (device_id == 0x005d) {
                /* ck804, base address is magic */
                base = 0xe0000000UL;
        } else if (device_id >= 0x0374 && device_id <= 0x378) {
                /* mcp55, base address stored in chipset */
                mcp55 = pci_find_bsf(0, 0, 0);
                if (mcp55 &&
                    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
                    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
                        word = pci_read_config(mcp55, 0x90, 2);
                        /*
                         * NOTE(review): with a 32-bit unsigned long
                         * this shift can discard high bits -- confirm
                         * whether that matters on i386.
                         */
                        base = ((unsigned long)word & 0x7ffeU) << 25;
                }
        }
        if (!base)
                return;

        /* XXXX
           Test below is commented because it is believed that doing
           config read/write beyond 0xff will access the config space
           for the next larger function.  Uncomment this and remove
           the hacky pmap_mapdev() way of accessing config space when
           FreeBSD grows support for extended pcie config space access
        */
#if 0   
        /* See if we can, by some miracle, access the extended
           config space */
        val = pci_read_config(pdev, 0x178, 4);
        if (val != 0xffffffff) {
                val |= 0x40;
                pci_write_config(pdev, 0x178, val, 4);
                return;
        }
#endif
        /* Rather than using normal pci config space writes, we must
         * map the Nvidia config space ourselves.  This is because on
         * opteron/nvidia class machine the 0xe0000000 mapping is
         * handled by the nvidia chipset, that means the internal PCI
         * device (the on-chip northbridge), or the amd-8131 bridge
         * and things behind them are not visible by this method.
         */

        /* fetch the bridge's location and ids from its parent bus */
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_BUS, &bus);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_SLOT, &slot);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_FUNCTION, &func);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_VENDOR, &ivend);
        BUS_READ_IVAR(device_get_parent(pdev), pdev,
                      PCI_IVAR_DEVICE, &idev);
                                        
        /*
         * Physical address of the bridge's config space: 1MB per bus,
         * 4KB per function, 8 functions per slot.
         */
        off =  base
                + 0x00100000UL * (unsigned long)bus
                + 0x00001000UL * (unsigned long)(func
                                                 + 8 * slot);

        /* map it into the kernel */
        va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
        

        if (va == NULL) {
                /* NOTE: the message names pmap_kenter_temporary, but the
                   call that failed is pmap_mapdev() */
                device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
                return;
        }
        /* get a pointer to the config space mapped into the kernel */
        cfgptr = va + (off & PAGE_MASK);

        /* make sure that we can really access it */
        vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
        device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
        if (! (vendor_id == ivend && device_id == idev)) {
                device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
                              vendor_id, device_id);
                pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
                return;
        }

        /* the word at 0x178 holds the bit we want to set */
        ptr32 = (uint32_t*)(cfgptr + 0x178);
        val = *ptr32;

        /* all ones means the extended region did not read back */
        if (val == 0xffffffff) {
                device_printf(sc->dev, "extended mapping failed\n");
                pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
                return;
        }
        *ptr32 = val | 0x40;
        pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
        if (mxge_verbose)
                device_printf(sc->dev,
                              "Enabled ECRC on upstream Nvidia bridge "
                              "at %d:%d:%d\n",
                              (int)bus, (int)slot, (int)func);
        return;
}
#else
/*
 * Variant for all other architectures: only logs that it was called.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
        device_printf(sc->dev,
                      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
        return;
}
#endif
474
475
476 static int
477 mxge_dma_test(mxge_softc_t *sc, int test_type)
478 {
479         mxge_cmd_t cmd;
480         bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
481         int status;
482         uint32_t len;
483         char *test = " ";
484
485
486         /* Run a small DMA test.
487          * The magic multipliers to the length tell the firmware
488          * to do DMA read, write, or read+write tests.  The
489          * results are returned in cmd.data0.  The upper 16
490          * bits of the return is the number of transfers completed.
491          * The lower 16 bits is the time in 0.5us ticks that the
492          * transfers took to complete.
493          */
494
495         len = sc->tx_boundary;
496
497         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
498         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
499         cmd.data2 = len * 0x10000;
500         status = mxge_send_cmd(sc, test_type, &cmd);
501         if (status != 0) {
502                 test = "read";
503                 goto abort;
504         }
505         sc->read_dma = ((cmd.data0>>16) * len * 2) /
506                 (cmd.data0 & 0xffff);
507         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
508         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
509         cmd.data2 = len * 0x1;
510         status = mxge_send_cmd(sc, test_type, &cmd);
511         if (status != 0) {
512                 test = "write";
513                 goto abort;
514         }
515         sc->write_dma = ((cmd.data0>>16) * len * 2) /
516                 (cmd.data0 & 0xffff);
517
518         cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
519         cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
520         cmd.data2 = len * 0x10001;
521         status = mxge_send_cmd(sc, test_type, &cmd);
522         if (status != 0) {
523                 test = "read/write";
524                 goto abort;
525         }
526         sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
527                 (cmd.data0 & 0xffff);
528
529 abort:
530         if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
531                 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
532                               test, status);
533
534         return status;
535 }
536
537 /*
538  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
539  * when the PCI-E Completion packets are aligned on an 8-byte
540  * boundary.  Some PCI-E chip sets always align Completion packets; on
541  * the ones that do not, the alignment can be enforced by enabling
542  * ECRC generation (if supported).
543  *
544  * When PCI-E Completion packets are not aligned, it is actually more
545  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
546  *
547  * If the driver can neither enable ECRC nor verify that it has
548  * already been enabled, then it must use a firmware image which works
549  * around unaligned completion packets (ethp_z8e.dat), and it should
550  * also ensure that it never gives the device a Read-DMA which is
551  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
552  * enabled, then the driver should use the aligned (eth_z8e.dat)
553  * firmware image, and set tx_boundary to 4KB.
554  */
555
556 static int
557 mxge_firmware_probe(mxge_softc_t *sc)
558 {
559         device_t dev = sc->dev;
560         int reg, status;
561         uint16_t pectl;
562
563         sc->tx_boundary = 4096;
564         /*
565          * Verify the max read request size was set to 4KB
566          * before trying the test with 4KB.
567          */
568         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
569                 pectl = pci_read_config(dev, reg + 0x8, 2);
570                 if ((pectl & (5 << 12)) != (5 << 12)) {
571                         device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
572                                       pectl);
573                         sc->tx_boundary = 2048;
574                 }
575         }
576
577         /*
578          * load the optimized firmware (which assumes aligned PCIe
579          * completions) in order to see if it works on this host.
580          */
581         sc->fw_name = mxge_fw_aligned;
582         status = mxge_load_firmware(sc, 1);
583         if (status != 0) {
584                 return status;
585         }
586
587         /*
588          * Enable ECRC if possible
589          */
590         mxge_enable_nvidia_ecrc(sc);
591
592         /*
593          * Run a DMA test which watches for unaligned completions and
594          * aborts on the first one seen.  Not required on Z8ES or newer.
595          */
596         if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
597                 return 0;
598         status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
599         if (status == 0)
600                 return 0; /* keep the aligned firmware */
601
602         if (status != E2BIG)
603                 device_printf(dev, "DMA test failed: %d\n", status);
604         if (status == ENOSYS)
605                 device_printf(dev, "Falling back to ethp! "
606                               "Please install up to date fw\n");
607         return status;
608 }
609
610 static int
611 mxge_select_firmware(mxge_softc_t *sc)
612 {
613         int aligned = 0;
614         int force_firmware = mxge_force_firmware;
615
616         if (sc->throttle)
617                 force_firmware = sc->throttle;
618
619         if (force_firmware != 0) {
620                 if (force_firmware == 1)
621                         aligned = 1;
622                 else
623                         aligned = 0;
624                 if (mxge_verbose)
625                         device_printf(sc->dev,
626                                       "Assuming %s completions (forced)\n",
627                                       aligned ? "aligned" : "unaligned");
628                 goto abort;
629         }
630
631         /* if the PCIe link width is 4 or less, we can use the aligned
632            firmware and skip any checks */
633         if (sc->link_width != 0 && sc->link_width <= 4) {
634                 device_printf(sc->dev,
635                               "PCIe x%d Link, expect reduced performance\n",
636                               sc->link_width);
637                 aligned = 1;
638                 goto abort;
639         }
640
641         if (0 == mxge_firmware_probe(sc))
642                 return 0;
643
644 abort:
645         if (aligned) {
646                 sc->fw_name = mxge_fw_aligned;
647                 sc->tx_boundary = 4096;
648         } else {
649                 sc->fw_name = mxge_fw_unaligned;
650                 sc->tx_boundary = 2048;
651         }
652         return (mxge_load_firmware(sc, 0));
653 }
654
655 static int
656 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
657 {
658
659
660         if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
661                 device_printf(sc->dev, "Bad firmware type: 0x%x\n",
662                               be32toh(hdr->mcp_type));
663                 return EIO;
664         }
665
666         /* save firmware version for sysctl */
667         strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
668         if (mxge_verbose)
669                 device_printf(sc->dev, "firmware id: %s\n", hdr->version);
670
671         sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
672                &sc->fw_ver_minor, &sc->fw_ver_tiny);
673
674         if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
675               && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
676                 device_printf(sc->dev, "Found firmware version %s\n",
677                               sc->fw_version);
678                 device_printf(sc->dev, "Driver needs %d.%d\n",
679                               MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
680                 return EINVAL;
681         }
682         return 0;
683
684 }
685
686 static void *
687 z_alloc(void *nil, u_int items, u_int size)
688 {
689         void *ptr;
690
691         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
692         return ptr;
693 }
694
695 static void
696 z_free(void *nil, void *ptr)
697 {
698         free(ptr, M_TEMP);
699 }
700
701
702 static int
703 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
704 {
705         z_stream zs;
706         char *inflate_buffer;
707         const struct firmware *fw;
708         const mcp_gen_header_t *hdr;
709         unsigned hdr_offset;
710         int status;
711         unsigned int i;
712         char dummy;
713         size_t fw_len;
714
715         fw = firmware_get(sc->fw_name);
716         if (fw == NULL) {
717                 device_printf(sc->dev, "Could not find firmware image %s\n",
718                               sc->fw_name);
719                 return ENOENT;
720         }
721
722
723
724         /* setup zlib and decompress f/w */
725         bzero(&zs, sizeof (zs));
726         zs.zalloc = z_alloc;
727         zs.zfree = z_free;
728         status = inflateInit(&zs);
729         if (status != Z_OK) {
730                 status = EIO;
731                 goto abort_with_fw;
732         }
733
734         /* the uncompressed size is stored as the firmware version,
735            which would otherwise go unused */
736         fw_len = (size_t) fw->version;
737         inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
738         if (inflate_buffer == NULL)
739                 goto abort_with_zs;
740         zs.avail_in = fw->datasize;
741         zs.next_in = __DECONST(char *, fw->data);
742         zs.avail_out = fw_len;
743         zs.next_out = inflate_buffer;
744         status = inflate(&zs, Z_FINISH);
745         if (status != Z_STREAM_END) {
746                 device_printf(sc->dev, "zlib %d\n", status);
747                 status = EIO;
748                 goto abort_with_buffer;
749         }
750
751         /* check id */
752         hdr_offset = htobe32(*(const uint32_t *)
753                              (inflate_buffer + MCP_HEADER_PTR_OFFSET));
754         if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
755                 device_printf(sc->dev, "Bad firmware file");
756                 status = EIO;
757                 goto abort_with_buffer;
758         }
759         hdr = (const void*)(inflate_buffer + hdr_offset);
760
761         status = mxge_validate_firmware(sc, hdr);
762         if (status != 0)
763                 goto abort_with_buffer;
764
765         /* Copy the inflated firmware to NIC SRAM. */
766         for (i = 0; i < fw_len; i += 256) {
767                 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
768                               inflate_buffer + i,
769                               min(256U, (unsigned)(fw_len - i)));
770                 wmb();
771                 dummy = *sc->sram;
772                 wmb();
773         }
774
775         *limit = fw_len;
776         status = 0;
777 abort_with_buffer:
778         free(inflate_buffer, M_TEMP);
779 abort_with_zs:
780         inflateEnd(&zs);
781 abort_with_fw:
782         firmware_put(fw, FIRMWARE_UNLOAD);
783         return status;
784 }
785
786 /*
787  * Enable or disable periodic RDMAs from the host to make certain
788  * chipsets resend dropped PCIe messages
789  */
790
791 static void
792 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
793 {
794         char buf_bytes[72];
795         volatile uint32_t *confirm;
796         volatile char *submit;
797         uint32_t *buf, dma_low, dma_high;
798         int i;
799
800         buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
801
802         /* clear confirmation addr */
803         confirm = (volatile uint32_t *)sc->cmd;
804         *confirm = 0;
805         wmb();
806
807         /* send an rdma command to the PCIe engine, and wait for the
808            response in the confirmation address.  The firmware should
809            write a -1 there to indicate it is alive and well
810         */
811
812         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
813         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
814         buf[0] = htobe32(dma_high);             /* confirm addr MSW */
815         buf[1] = htobe32(dma_low);              /* confirm addr LSW */
816         buf[2] = htobe32(0xffffffff);           /* confirm data */
817         dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
818         dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
819         buf[3] = htobe32(dma_high);             /* dummy addr MSW */
820         buf[4] = htobe32(dma_low);              /* dummy addr LSW */
821         buf[5] = htobe32(enable);                       /* enable? */
822
823
824         submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
825
826         mxge_pio_copy(submit, buf, 64);
827         wmb();
828         DELAY(1000);
829         wmb();
830         i = 0;
831         while (*confirm != 0xffffffff && i < 20) {
832                 DELAY(1000);
833                 i++;
834         }
835         if (*confirm != 0xffffffff) {
836                 device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
837                               (enable ? "enable" : "disable"), confirm,
838                               *confirm);
839         }
840         return;
841 }
842
843 static int
844 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
845 {
846         mcp_cmd_t *buf;
847         char buf_bytes[sizeof(*buf) + 8];
848         volatile mcp_cmd_response_t *response = sc->cmd;
849         volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
850         uint32_t dma_low, dma_high;
851         int err, sleep_total = 0;
852
853         /* ensure buf is aligned to 8 bytes */
854         buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
855
856         buf->data0 = htobe32(data->data0);
857         buf->data1 = htobe32(data->data1);
858         buf->data2 = htobe32(data->data2);
859         buf->cmd = htobe32(cmd);
860         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
861         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
862
863         buf->response_addr.low = htobe32(dma_low);
864         buf->response_addr.high = htobe32(dma_high);
865         mtx_lock(&sc->cmd_mtx);
866         response->result = 0xffffffff;
867         wmb();
868         mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
869
870         /* wait up to 20ms */
871         err = EAGAIN;
872         for (sleep_total = 0; sleep_total <  20; sleep_total++) {
873                 bus_dmamap_sync(sc->cmd_dma.dmat,
874                                 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
875                 wmb();
876                 switch (be32toh(response->result)) {
877                 case 0:
878                         data->data0 = be32toh(response->data);
879                         err = 0;
880                         break;
881                 case 0xffffffff:
882                         DELAY(1000);
883                         break;
884                 case MXGEFW_CMD_UNKNOWN:
885                         err = ENOSYS;
886                         break;
887                 case MXGEFW_CMD_ERROR_UNALIGNED:
888                         err = E2BIG;
889                         break;
890                 case MXGEFW_CMD_ERROR_BUSY:
891                         err = EBUSY;
892                         break;
893                 case MXGEFW_CMD_ERROR_I2C_ABSENT:
894                         err = ENXIO;
895                         break;
896                 default:
897                         device_printf(sc->dev,
898                                       "mxge: command %d "
899                                       "failed, result = %d\n",
900                                       cmd, be32toh(response->result));
901                         err = ENXIO;
902                         break;
903                 }
904                 if (err != EAGAIN)
905                         break;
906         }
907         if (err == EAGAIN)
908                 device_printf(sc->dev, "mxge: command %d timed out"
909                               "result = %d\n",
910                               cmd, be32toh(response->result));
911         mtx_unlock(&sc->cmd_mtx);
912         return err;
913 }
914
915 static int
916 mxge_adopt_running_firmware(mxge_softc_t *sc)
917 {
918         struct mcp_gen_header *hdr;
919         const size_t bytes = sizeof (struct mcp_gen_header);
920         size_t hdr_offset;
921         int status;
922
923         /* find running firmware header */
924         hdr_offset = htobe32(*(volatile uint32_t *)
925                              (sc->sram + MCP_HEADER_PTR_OFFSET));
926
927         if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
928                 device_printf(sc->dev,
929                               "Running firmware has bad header offset (%d)\n",
930                               (int)hdr_offset);
931                 return EIO;
932         }
933
934         /* copy header of running firmware from SRAM to host memory to
935          * validate firmware */
936         hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
937         if (hdr == NULL) {
938                 device_printf(sc->dev, "could not malloc firmware hdr\n");
939                 return ENOMEM;
940         }
941         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
942                                 rman_get_bushandle(sc->mem_res),
943                                 hdr_offset, (char *)hdr, bytes);
944         status = mxge_validate_firmware(sc, hdr);
945         free(hdr, M_DEVBUF);
946
947         /*
948          * check to see if adopted firmware has bug where adopting
949          * it will cause broadcasts to be filtered unless the NIC
950          * is kept in ALLMULTI mode
951          */
952         if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
953             sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
954                 sc->adopted_rx_filter_bug = 1;
955                 device_printf(sc->dev, "Adopting fw %d.%d.%d: "
956                               "working around rx filter bug\n",
957                               sc->fw_ver_major, sc->fw_ver_minor,
958                               sc->fw_ver_tiny);
959         }
960
961         return status;
962 }
963
964
965 static int
966 mxge_load_firmware(mxge_softc_t *sc, int adopt)
967 {
968         volatile uint32_t *confirm;
969         volatile char *submit;
970         char buf_bytes[72];
971         uint32_t *buf, size, dma_low, dma_high;
972         int status, i;
973
974         buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
975
976         size = sc->sram_size;
977         status = mxge_load_firmware_helper(sc, &size);
978         if (status) {
979                 if (!adopt)
980                         return status;
981                 /* Try to use the currently running firmware, if
982                    it is new enough */
983                 status = mxge_adopt_running_firmware(sc);
984                 if (status) {
985                         device_printf(sc->dev,
986                                       "failed to adopt running firmware\n");
987                         return status;
988                 }
989                 device_printf(sc->dev,
990                               "Successfully adopted running firmware\n");
991                 if (sc->tx_boundary == 4096) {
992                         device_printf(sc->dev,
993                                 "Using firmware currently running on NIC"
994                                  ".  For optimal\n");
995                         device_printf(sc->dev,
996                                  "performance consider loading optimized "
997                                  "firmware\n");
998                 }
999                 sc->fw_name = mxge_fw_unaligned;
1000                 sc->tx_boundary = 2048;
1001                 return 0;
1002         }
1003         /* clear confirmation addr */
1004         confirm = (volatile uint32_t *)sc->cmd;
1005         *confirm = 0;
1006         wmb();
1007         /* send a reload command to the bootstrap MCP, and wait for the
1008            response in the confirmation address.  The firmware should
1009            write a -1 there to indicate it is alive and well
1010         */
1011
1012         dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1013         dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1014
1015         buf[0] = htobe32(dma_high);     /* confirm addr MSW */
1016         buf[1] = htobe32(dma_low);      /* confirm addr LSW */
1017         buf[2] = htobe32(0xffffffff);   /* confirm data */
1018
1019         /* FIX: All newest firmware should un-protect the bottom of
1020            the sram before handoff. However, the very first interfaces
1021            do not. Therefore the handoff copy must skip the first 8 bytes
1022         */
1023                                         /* where the code starts*/
1024         buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1025         buf[4] = htobe32(size - 8);     /* length of code */
1026         buf[5] = htobe32(8);            /* where to copy to */
1027         buf[6] = htobe32(0);            /* where to jump to */
1028
1029         submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1030         mxge_pio_copy(submit, buf, 64);
1031         wmb();
1032         DELAY(1000);
1033         wmb();
1034         i = 0;
1035         while (*confirm != 0xffffffff && i < 20) {
1036                 DELAY(1000*10);
1037                 i++;
1038                 bus_dmamap_sync(sc->cmd_dma.dmat,
1039                                 sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1040         }
1041         if (*confirm != 0xffffffff) {
1042                 device_printf(sc->dev,"handoff failed (%p = 0x%x)",
1043                         confirm, *confirm);
1044                 
1045                 return ENXIO;
1046         }
1047         return 0;
1048 }
1049
1050 static int
1051 mxge_update_mac_address(mxge_softc_t *sc)
1052 {
1053         mxge_cmd_t cmd;
1054         uint8_t *addr = sc->mac_addr;
1055         int status;
1056
1057         
1058         cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1059                      | (addr[2] << 8) | addr[3]);
1060
1061         cmd.data1 = ((addr[4] << 8) | (addr[5]));
1062
1063         status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1064         return status;
1065 }
1066
1067 static int
1068 mxge_change_pause(mxge_softc_t *sc, int pause)
1069 {       
1070         mxge_cmd_t cmd;
1071         int status;
1072
1073         if (pause)
1074                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1075                                        &cmd);
1076         else
1077                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1078                                        &cmd);
1079
1080         if (status) {
1081                 device_printf(sc->dev, "Failed to set flow control mode\n");
1082                 return ENXIO;
1083         }
1084         sc->pause = pause;
1085         return 0;
1086 }
1087
1088 static void
1089 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1090 {       
1091         mxge_cmd_t cmd;
1092         int status;
1093
1094         if (mxge_always_promisc)
1095                 promisc = 1;
1096
1097         if (promisc)
1098                 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1099                                        &cmd);
1100         else
1101                 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1102                                        &cmd);
1103
1104         if (status) {
1105                 device_printf(sc->dev, "Failed to set promisc mode\n");
1106         }
1107 }
1108
1109 static void
1110 mxge_set_multicast_list(mxge_softc_t *sc)
1111 {
1112         mxge_cmd_t cmd;
1113         struct ifmultiaddr *ifma;
1114         struct ifnet *ifp = sc->ifp;
1115         int err;
1116
1117         /* This firmware is known to not support multicast */
1118         if (!sc->fw_multicast_support)
1119                 return;
1120
1121         /* Disable multicast filtering while we play with the lists*/
1122         err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1123         if (err != 0) {
1124                 device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1125                        " error status: %d\n", err);
1126                 return;
1127         }
1128         
1129         if (sc->adopted_rx_filter_bug)
1130                 return;
1131         
1132         if (ifp->if_flags & IFF_ALLMULTI)
1133                 /* request to disable multicast filtering, so quit here */
1134                 return;
1135
1136         /* Flush all the filters */
1137
1138         err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1139         if (err != 0) {
1140                 device_printf(sc->dev,
1141                               "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1142                               ", error status: %d\n", err);
1143                 return;
1144         }
1145
1146         /* Walk the multicast list, and add each address */
1147
1148         if_maddr_rlock(ifp);
1149         TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1150                 if (ifma->ifma_addr->sa_family != AF_LINK)
1151                         continue;
1152                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1153                       &cmd.data0, 4);
1154                 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1155                       &cmd.data1, 2);
1156                 cmd.data0 = htonl(cmd.data0);
1157                 cmd.data1 = htonl(cmd.data1);
1158                 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1159                 if (err != 0) {
1160                         device_printf(sc->dev, "Failed "
1161                                "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1162                                "%d\t", err);
1163                         /* abort, leaving multicast filtering off */
1164                         if_maddr_runlock(ifp);
1165                         return;
1166                 }
1167         }
1168         if_maddr_runlock(ifp);
1169         /* Enable multicast filtering */
1170         err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1171         if (err != 0) {
1172                 device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1173                        ", error status: %d\n", err);
1174         }
1175 }
1176
1177 static int
1178 mxge_max_mtu(mxge_softc_t *sc)
1179 {
1180         mxge_cmd_t cmd;
1181         int status;
1182
1183         if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1184                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1185
1186         /* try to set nbufs to see if it we can
1187            use virtually contiguous jumbos */
1188         cmd.data0 = 0;
1189         status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1190                                &cmd);
1191         if (status == 0)
1192                 return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1193
1194         /* otherwise, we're limited to MJUMPAGESIZE */
1195         return MJUMPAGESIZE - MXGEFW_PAD;
1196 }
1197
1198 static int
1199 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1200 {
1201         struct mxge_slice_state *ss;
1202         mxge_rx_done_t *rx_done;
1203         volatile uint32_t *irq_claim;
1204         mxge_cmd_t cmd;
1205         int slice, status;
1206
1207         /* try to send a reset command to the card to see if it
1208            is alive */
1209         memset(&cmd, 0, sizeof (cmd));
1210         status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1211         if (status != 0) {
1212                 device_printf(sc->dev, "failed reset\n");
1213                 return ENXIO;
1214         }
1215
1216         mxge_dummy_rdma(sc, 1);
1217
1218
1219         /* set the intrq size */
1220         cmd.data0 = sc->rx_ring_size;
1221         status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1222
1223         /*
1224          * Even though we already know how many slices are supported
1225          * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1226          * has magic side effects, and must be called after a reset.
1227          * It must be called prior to calling any RSS related cmds,
1228          * including assigning an interrupt queue for anything but
1229          * slice 0.  It must also be called *after*
1230          * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1231          * the firmware to compute offsets.
1232          */
1233         
1234         if (sc->num_slices > 1) {
1235                 /* ask the maximum number of slices it supports */
1236                 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1237                                            &cmd);
1238                 if (status != 0) {
1239                         device_printf(sc->dev,
1240                                       "failed to get number of slices\n");
1241                         return status;
1242                 }
1243                 /*
1244                  * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1245                  * to setting up the interrupt queue DMA
1246                  */
1247                 cmd.data0 = sc->num_slices;
1248                 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1249 #ifdef IFNET_BUF_RING
1250                 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1251 #endif
1252                 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1253                                            &cmd);
1254                 if (status != 0) {
1255                         device_printf(sc->dev,
1256                                       "failed to set number of slices\n");
1257                         return status;
1258                 }
1259         }
1260
1261
1262         if (interrupts_setup) {
1263                 /* Now exchange information about interrupts  */
1264                 for (slice = 0; slice < sc->num_slices; slice++) {
1265                         rx_done = &sc->ss[slice].rx_done;
1266                         memset(rx_done->entry, 0, sc->rx_ring_size);
1267                         cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1268                         cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1269                         cmd.data2 = slice;
1270                         status |= mxge_send_cmd(sc,
1271                                                 MXGEFW_CMD_SET_INTRQ_DMA,
1272                                                 &cmd);
1273                 }
1274         }
1275
1276         status |= mxge_send_cmd(sc,
1277                                 MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1278         
1279
1280         sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1281
1282         status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1283         irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1284
1285
1286         status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1287                                 &cmd);
1288         sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1289         if (status != 0) {
1290                 device_printf(sc->dev, "failed set interrupt parameters\n");
1291                 return status;
1292         }
1293         
1294
1295         *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1296
1297         
1298         /* run a DMA benchmark */
1299         (void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1300
1301         for (slice = 0; slice < sc->num_slices; slice++) {
1302                 ss = &sc->ss[slice];
1303
1304                 ss->irq_claim = irq_claim + (2 * slice);
1305                 /* reset mcp/driver shared state back to 0 */
1306                 ss->rx_done.idx = 0;
1307                 ss->rx_done.cnt = 0;
1308                 ss->tx.req = 0;
1309                 ss->tx.done = 0;
1310                 ss->tx.pkt_done = 0;
1311                 ss->tx.queue_active = 0;
1312                 ss->tx.activate = 0;
1313                 ss->tx.deactivate = 0;
1314                 ss->tx.wake = 0;
1315                 ss->tx.defrag = 0;
1316                 ss->tx.stall = 0;
1317                 ss->rx_big.cnt = 0;
1318                 ss->rx_small.cnt = 0;
1319                 ss->lc.lro_bad_csum = 0;
1320                 ss->lc.lro_queued = 0;
1321                 ss->lc.lro_flushed = 0;
1322                 if (ss->fw_stats != NULL) {
1323                         bzero(ss->fw_stats, sizeof *ss->fw_stats);
1324                 }
1325         }
1326         sc->rdma_tags_available = 15;
1327         status = mxge_update_mac_address(sc);
1328         mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1329         mxge_change_pause(sc, sc->pause);
1330         mxge_set_multicast_list(sc);
1331         if (sc->throttle) {
1332                 cmd.data0 = sc->throttle;
1333                 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1334                                   &cmd)) {
1335                         device_printf(sc->dev,
1336                                       "can't enable throttle\n");
1337                 }
1338         }
1339         return status;
1340 }
1341
1342 static int
1343 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1344 {
1345         mxge_cmd_t cmd;
1346         mxge_softc_t *sc;
1347         int err;
1348         unsigned int throttle;
1349
1350         sc = arg1;
1351         throttle = sc->throttle;
1352         err = sysctl_handle_int(oidp, &throttle, arg2, req);
1353         if (err != 0) {
1354                 return err;
1355         }
1356
1357         if (throttle == sc->throttle)
1358                 return 0;
1359
1360         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1361                 return EINVAL;
1362         
1363         mtx_lock(&sc->driver_mtx);
1364         cmd.data0 = throttle;
1365         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1366         if (err == 0)
1367                 sc->throttle = throttle;
1368         mtx_unlock(&sc->driver_mtx);    
1369         return err;
1370 }
1371
1372 static int
1373 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1374 {
1375         mxge_softc_t *sc;
1376         unsigned int intr_coal_delay;
1377         int err;
1378
1379         sc = arg1;
1380         intr_coal_delay = sc->intr_coal_delay;
1381         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1382         if (err != 0) {
1383                 return err;
1384         }
1385         if (intr_coal_delay == sc->intr_coal_delay)
1386                 return 0;
1387
1388         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1389                 return EINVAL;
1390
1391         mtx_lock(&sc->driver_mtx);
1392         *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1393         sc->intr_coal_delay = intr_coal_delay;
1394         
1395         mtx_unlock(&sc->driver_mtx);
1396         return err;
1397 }
1398
1399 static int
1400 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1401 {
1402         mxge_softc_t *sc;
1403         unsigned int enabled;
1404         int err;
1405
1406         sc = arg1;
1407         enabled = sc->pause;
1408         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1409         if (err != 0) {
1410                 return err;
1411         }
1412         if (enabled == sc->pause)
1413                 return 0;
1414
1415         mtx_lock(&sc->driver_mtx);
1416         err = mxge_change_pause(sc, enabled);
1417         mtx_unlock(&sc->driver_mtx);
1418         return err;
1419 }
1420
1421 static int
1422 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1423 {
1424         int err;
1425
1426         if (arg1 == NULL)
1427                 return EFAULT;
1428         arg2 = be32toh(*(int *)arg1);
1429         arg1 = NULL;
1430         err = sysctl_handle_int(oidp, arg1, arg2, req);
1431
1432         return err;
1433 }
1434
1435 static void
1436 mxge_rem_sysctls(mxge_softc_t *sc)
1437 {
1438         struct mxge_slice_state *ss;
1439         int slice;
1440
1441         if (sc->slice_sysctl_tree == NULL)
1442                 return;
1443
1444         for (slice = 0; slice < sc->num_slices; slice++) {
1445                 ss = &sc->ss[slice];
1446                 if (ss == NULL || ss->sysctl_tree == NULL)
1447                         continue;
1448                 sysctl_ctx_free(&ss->sysctl_ctx);
1449                 ss->sysctl_tree = NULL;
1450         }
1451         sysctl_ctx_free(&sc->slice_sysctl_ctx);
1452         sc->slice_sysctl_tree = NULL;
1453 }
1454
1455 static void
1456 mxge_add_sysctls(mxge_softc_t *sc)
1457 {
1458         struct sysctl_ctx_list *ctx;
1459         struct sysctl_oid_list *children;
1460         mcp_irq_data_t *fw;
1461         struct mxge_slice_state *ss;
1462         int slice;
1463         char slice_num[8];
1464
1465         ctx = device_get_sysctl_ctx(sc->dev);
1466         children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1467         fw = sc->ss[0].fw_stats;
1468
1469         /* random information */
1470         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1471                        "firmware_version",
1472                        CTLFLAG_RD, sc->fw_version,
1473                        0, "firmware version");
1474         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1475                        "serial_number",
1476                        CTLFLAG_RD, sc->serial_number_string,
1477                        0, "serial number");
1478         SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1479                        "product_code",
1480                        CTLFLAG_RD, sc->product_code_string,
1481                        0, "product_code");
1482         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1483                        "pcie_link_width",
1484                        CTLFLAG_RD, &sc->link_width,
1485                        0, "tx_boundary");
1486         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487                        "tx_boundary",
1488                        CTLFLAG_RD, &sc->tx_boundary,
1489                        0, "tx_boundary");
1490         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491                        "write_combine",
1492                        CTLFLAG_RD, &sc->wc,
1493                        0, "write combining PIO?");
1494         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495                        "read_dma_MBs",
1496                        CTLFLAG_RD, &sc->read_dma,
1497                        0, "DMA Read speed in MB/s");
1498         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499                        "write_dma_MBs",
1500                        CTLFLAG_RD, &sc->write_dma,
1501                        0, "DMA Write speed in MB/s");
1502         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503                        "read_write_dma_MBs",
1504                        CTLFLAG_RD, &sc->read_write_dma,
1505                        0, "DMA concurrent Read/Write speed in MB/s");
1506         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1507                        "watchdog_resets",
1508                        CTLFLAG_RD, &sc->watchdog_resets,
1509                        0, "Number of times NIC was reset");
1510
1511
1512         /* performance related tunables */
1513         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1514                         "intr_coal_delay",
1515                         CTLTYPE_INT|CTLFLAG_RW, sc,
1516                         0, mxge_change_intr_coal,
1517                         "I", "interrupt coalescing delay in usecs");
1518
1519         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1520                         "throttle",
1521                         CTLTYPE_INT|CTLFLAG_RW, sc,
1522                         0, mxge_change_throttle,
1523                         "I", "transmit throttling");
1524
1525         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526                         "flow_control_enabled",
1527                         CTLTYPE_INT|CTLFLAG_RW, sc,
1528                         0, mxge_change_flow_control,
1529                         "I", "interrupt coalescing delay in usecs");
1530
1531         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532                        "deassert_wait",
1533                        CTLFLAG_RW, &mxge_deassert_wait,
1534                        0, "Wait for IRQ line to go low in ihandler");
1535
1536         /* stats block from firmware is in network byte order.
1537            Need to swap it */
1538         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1539                         "link_up",
1540                         CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1541                         0, mxge_handle_be32,
1542                         "I", "link up");
1543         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1544                         "rdma_tags_available",
1545                         CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1546                         0, mxge_handle_be32,
1547                         "I", "rdma_tags_available");
1548         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549                         "dropped_bad_crc32",
1550                         CTLTYPE_INT|CTLFLAG_RD,
1551                         &fw->dropped_bad_crc32,
1552                         0, mxge_handle_be32,
1553                         "I", "dropped_bad_crc32");
1554         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555                         "dropped_bad_phy",
1556                         CTLTYPE_INT|CTLFLAG_RD,
1557                         &fw->dropped_bad_phy,
1558                         0, mxge_handle_be32,
1559                         "I", "dropped_bad_phy");
1560         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561                         "dropped_link_error_or_filtered",
1562                         CTLTYPE_INT|CTLFLAG_RD,
1563                         &fw->dropped_link_error_or_filtered,
1564                         0, mxge_handle_be32,
1565                         "I", "dropped_link_error_or_filtered");
1566         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1567                         "dropped_link_overflow",
1568                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1569                         0, mxge_handle_be32,
1570                         "I", "dropped_link_overflow");
1571         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1572                         "dropped_multicast_filtered",
1573                         CTLTYPE_INT|CTLFLAG_RD,
1574                         &fw->dropped_multicast_filtered,
1575                         0, mxge_handle_be32,
1576                         "I", "dropped_multicast_filtered");
1577         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1578                         "dropped_no_big_buffer",
1579                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1580                         0, mxge_handle_be32,
1581                         "I", "dropped_no_big_buffer");
1582         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1583                         "dropped_no_small_buffer",
1584                         CTLTYPE_INT|CTLFLAG_RD,
1585                         &fw->dropped_no_small_buffer,
1586                         0, mxge_handle_be32,
1587                         "I", "dropped_no_small_buffer");
1588         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1589                         "dropped_overrun",
1590                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1591                         0, mxge_handle_be32,
1592                         "I", "dropped_overrun");
1593         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1594                         "dropped_pause",
1595                         CTLTYPE_INT|CTLFLAG_RD,
1596                         &fw->dropped_pause,
1597                         0, mxge_handle_be32,
1598                         "I", "dropped_pause");
1599         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1600                         "dropped_runt",
1601                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1602                         0, mxge_handle_be32,
1603                         "I", "dropped_runt");
1604
1605         SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1606                         "dropped_unicast_filtered",
1607                         CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1608                         0, mxge_handle_be32,
1609                         "I", "dropped_unicast_filtered");
1610
1611         /* verbose printing? */
1612         SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1613                        "verbose",
1614                        CTLFLAG_RW, &mxge_verbose,
1615                        0, "verbose printing");
1616
1617         /* add counters exported for debugging from all slices */
1618         sysctl_ctx_init(&sc->slice_sysctl_ctx);
1619         sc->slice_sysctl_tree =
1620                 SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1621                                 "slice", CTLFLAG_RD, 0, "");
1622
1623         for (slice = 0; slice < sc->num_slices; slice++) {
1624                 ss = &sc->ss[slice];
1625                 sysctl_ctx_init(&ss->sysctl_ctx);
1626                 ctx = &ss->sysctl_ctx;
1627                 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1628                 sprintf(slice_num, "%d", slice);
1629                 ss->sysctl_tree =
1630                         SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1631                                         CTLFLAG_RD, 0, "");
1632                 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1633                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1634                                "rx_small_cnt",
1635                                CTLFLAG_RD, &ss->rx_small.cnt,
1636                                0, "rx_small_cnt");
1637                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1638                                "rx_big_cnt",
1639                                CTLFLAG_RD, &ss->rx_big.cnt,
1640                                0, "rx_small_cnt");
1641                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1642                                "lro_flushed", CTLFLAG_RD, &ss->lc.lro_flushed,
1643                                0, "number of lro merge queues flushed");
1644
1645                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1646                                "lro_bad_csum", CTLFLAG_RD, &ss->lc.lro_bad_csum,
1647                                0, "number of bad csums preventing LRO");
1648
1649                 SYSCTL_ADD_U64(ctx, children, OID_AUTO,
1650                                "lro_queued", CTLFLAG_RD, &ss->lc.lro_queued,
1651                                0, "number of frames appended to lro merge"
1652                                "queues");
1653
1654 #ifndef IFNET_BUF_RING
1655                 /* only transmit from slice 0 for now */
1656                 if (slice > 0)
1657                         continue;
1658 #endif
1659                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1660                                "tx_req",
1661                                CTLFLAG_RD, &ss->tx.req,
1662                                0, "tx_req");
1663
1664                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1665                                "tx_done",
1666                                CTLFLAG_RD, &ss->tx.done,
1667                                0, "tx_done");
1668                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1669                                "tx_pkt_done",
1670                                CTLFLAG_RD, &ss->tx.pkt_done,
1671                                0, "tx_done");
1672                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1673                                "tx_stall",
1674                                CTLFLAG_RD, &ss->tx.stall,
1675                                0, "tx_stall");
1676                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677                                "tx_wake",
1678                                CTLFLAG_RD, &ss->tx.wake,
1679                                0, "tx_wake");
1680                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681                                "tx_defrag",
1682                                CTLFLAG_RD, &ss->tx.defrag,
1683                                0, "tx_defrag");
1684                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685                                "tx_queue_active",
1686                                CTLFLAG_RD, &ss->tx.queue_active,
1687                                0, "tx_queue_active");
1688                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689                                "tx_activate",
1690                                CTLFLAG_RD, &ss->tx.activate,
1691                                0, "tx_activate");
1692                 SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1693                                "tx_deactivate",
1694                                CTLFLAG_RD, &ss->tx.deactivate,
1695                                0, "tx_deactivate");
1696         }
1697 }
1698
1699 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1700    backwards one at a time and handle ring wraps */
1701
1702 static inline void
1703 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1704                             mcp_kreq_ether_send_t *src, int cnt)
1705 {
1706         int idx, starting_slot;
1707         starting_slot = tx->req;
1708         while (cnt > 1) {
1709                 cnt--;
1710                 idx = (starting_slot + cnt) & tx->mask;
1711                 mxge_pio_copy(&tx->lanai[idx],
1712                               &src[cnt], sizeof(*src));
1713                 wmb();
1714         }
1715 }
1716
1717 /*
1718  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1719  * at most 32 bytes at a time, so as to avoid involving the software
1720  * pio handler in the nic.   We re-write the first segment's flags
1721  * to mark them valid only after writing the entire chain
1722  */
1723
1724 static inline void
1725 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1726                   int cnt)
1727 {
1728         int idx, i;
1729         uint32_t *src_ints;
1730         volatile uint32_t *dst_ints;
1731         mcp_kreq_ether_send_t *srcp;
1732         volatile mcp_kreq_ether_send_t *dstp, *dst;
1733         uint8_t last_flags;
1734         
1735         idx = tx->req & tx->mask;
1736
1737         last_flags = src->flags;
1738         src->flags = 0;
1739         wmb();
1740         dst = dstp = &tx->lanai[idx];
1741         srcp = src;
1742
1743         if ((idx + cnt) < tx->mask) {
1744                 for (i = 0; i < (cnt - 1); i += 2) {
1745                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1746                         wmb(); /* force write every 32 bytes */
1747                         srcp += 2;
1748                         dstp += 2;
1749                 }
1750         } else {
1751                 /* submit all but the first request, and ensure
1752                    that it is submitted below */
1753                 mxge_submit_req_backwards(tx, src, cnt);
1754                 i = 0;
1755         }
1756         if (i < cnt) {
1757                 /* submit the first request */
1758                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1759                 wmb(); /* barrier before setting valid flag */
1760         }
1761
1762         /* re-write the last 32-bits with the valid flags */
1763         src->flags = last_flags;
1764         src_ints = (uint32_t *)src;
1765         src_ints+=3;
1766         dst_ints = (volatile uint32_t *)dst;
1767         dst_ints+=3;
1768         *dst_ints =  *src_ints;
1769         tx->req += cnt;
1770         wmb();
1771 }
1772
1773 static int
1774 mxge_parse_tx(struct mxge_slice_state *ss, struct mbuf *m,
1775     struct mxge_pkt_info *pi)
1776 {
1777         struct ether_vlan_header *eh;
1778         uint16_t etype;
1779         int tso = m->m_pkthdr.csum_flags & (CSUM_TSO);
1780 #if IFCAP_TSO6 && defined(INET6)
1781         int nxt;
1782 #endif
1783
1784         eh = mtod(m, struct ether_vlan_header *);
1785         if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1786                 etype = ntohs(eh->evl_proto);
1787                 pi->ip_off = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1788         } else {
1789                 etype = ntohs(eh->evl_encap_proto);
1790                 pi->ip_off = ETHER_HDR_LEN;
1791         }
1792
1793         switch (etype) {
1794         case ETHERTYPE_IP:
1795                 /*
1796                  * ensure ip header is in first mbuf, copy it to a
1797                  * scratch buffer if not
1798                  */
1799                 pi->ip = (struct ip *)(m->m_data + pi->ip_off);
1800                 pi->ip6 = NULL;
1801                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip))) {
1802                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip),
1803                             ss->scratch);
1804                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1805                 }
1806                 pi->ip_hlen = pi->ip->ip_hl << 2;
1807                 if (!tso)
1808                         return 0;
1809
1810                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1811                     sizeof(struct tcphdr))) {
1812                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1813                             sizeof(struct tcphdr), ss->scratch);
1814                         pi->ip = (struct ip *)(ss->scratch + pi->ip_off);
1815                 }
1816                 pi->tcp = (struct tcphdr *)((char *)pi->ip + pi->ip_hlen);
1817                 break;
1818 #if IFCAP_TSO6 && defined(INET6)
1819         case ETHERTYPE_IPV6:
1820                 pi->ip6 = (struct ip6_hdr *)(m->m_data + pi->ip_off);
1821                 if (__predict_false(m->m_len < pi->ip_off + sizeof(*pi->ip6))) {
1822                         m_copydata(m, 0, pi->ip_off + sizeof(*pi->ip6),
1823                             ss->scratch);
1824                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1825                 }
1826                 nxt = 0;
1827                 pi->ip_hlen = ip6_lasthdr(m, pi->ip_off, IPPROTO_IPV6, &nxt);
1828                 pi->ip_hlen -= pi->ip_off;
1829                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
1830                         return EINVAL;
1831
1832                 if (!tso)
1833                         return 0;
1834
1835                 if (pi->ip_off + pi->ip_hlen > ss->sc->max_tso6_hlen)
1836                         return EINVAL;
1837
1838                 if (__predict_false(m->m_len < pi->ip_off + pi->ip_hlen +
1839                     sizeof(struct tcphdr))) {
1840                         m_copydata(m, 0, pi->ip_off + pi->ip_hlen +
1841                             sizeof(struct tcphdr), ss->scratch);
1842                         pi->ip6 = (struct ip6_hdr *)(ss->scratch + pi->ip_off);
1843                 }
1844                 pi->tcp = (struct tcphdr *)((char *)pi->ip6 + pi->ip_hlen);
1845                 break;
1846 #endif
1847         default:
1848                 return EINVAL;
1849         }
1850         return 0;
1851 }
1852
#if IFCAP_TSO4

/*
 * Build and submit the send-request chain for a TSO packet.
 *
 * ss              slice whose tx ring is used
 * m               the packet; mxge_encap() has already DMA-mapped it
 *                 into tx->seg_list and stored it in tx->info[] for the
 *                 current slot
 * busdma_seg_cnt  number of valid entries in tx->seg_list
 * pi              header offsets/pointers filled in by mxge_parse_tx()
 *
 * Each busdma segment is split wherever an MSS boundary (or the end of
 * the protocol headers) falls inside it.  If more than tx->max_desc
 * requests would be needed the packet is dropped: the DMA map is
 * unloaded, the mbuf freed and ss->oerrors incremented.
 */
static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, struct mxge_pkt_info *pi)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	int idx;
	uint16_t pseudo_hdr_offset, cksum_offset, mss, sum;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;
	/*
	 * Both assignments to "sum" below sit inside #if/#ifdef blocks;
	 * give it a defined value so it is never read uninitialized.
	 */
	sum = 0;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	cksum_offset = pi->ip_off + pi->ip_hlen;
	cum_len = -(cksum_offset + (pi->tcp->th_off << 2));

	/* TSO implies checksum offload on this hardware */
	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP|CSUM_TCP_IPV6)) == 0)) {
		/*
		 * If packet has full TCP csum, replace it with pseudo hdr
		 * sum that the NIC expects, otherwise the NIC will emit
		 * packets with bad TCP checksums.
		 */
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		if (pi->ip6) {
#if (CSUM_TCP_IPV6 != 0) && defined(INET6)
			m->m_pkthdr.csum_flags |= CSUM_TCP_IPV6;
			sum = in6_cksum_pseudo(pi->ip6,
			    m->m_pkthdr.len - cksum_offset,
			    IPPROTO_TCP, 0);
#endif
		} else {
#ifdef INET
			m->m_pkthdr.csum_flags |= CSUM_TCP;
			sum = in_pseudo(pi->ip->ip_src.s_addr,
			    pi->ip->ip_dst.s_addr,
			    htons(IPPROTO_TCP + (m->m_pkthdr.len -
				    cksum_offset)));
#endif
		}
		m_copyback(m, offsetof(struct tcphdr, th_sum) +
		    cksum_offset, sizeof(sum), (caddr_t)&sum);
	}
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	if (pi->ip6) {
		/*
		 * for IPv6 TSO, the "checksum offset" is re-purposed
		 * to store the TCP header len
		 */
		cksum_offset = (pi->tcp->th_off << 2);
	}

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped =	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* back-fill the count on the request that
			 * started the current run of RDMAs */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (cksum_offset != 0 && !pi->ip6) {
				if (__predict_false(cksum_offset > seglen))
					cksum_offset -= seglen;
				else
					cksum_offset = 0;
			}
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	/* flag every request of the final TSO segment as "last" */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	idx = tx->req & tx->mask;
	bus_dmamap_unload(tx->dmat, tx->info[idx].map);
	/*
	 * mxge_encap() recorded m in this slot before calling us.  The
	 * mbuf is freed right below, so clear the slot: a stale pointer
	 * left here could be freed a second time by any code that later
	 * releases the non-NULL mbufs of the ring.
	 */
	tx->info[idx].m = NULL;
	m_freem(m);
	ss->oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}

#endif /* IFCAP_TSO4 */
2040
#ifdef MXGE_NEW_VLAN_API
/*
 * Software 802.1Q tag insertion, mirroring what
 * net/if_vlan.c:vlan_start() does.  Doing it here lets the driver
 * advertise "hardware" vlan tagging, which is needed for the vlan
 * interface to respect our csum offload flags.
 *
 * Grows the front of the frame by ETHER_VLAN_ENCAP_LEN bytes, slides
 * the two MAC addresses forward into the new space, and fills in the
 * encapsulation ethertype and the tag taken from
 * m->m_pkthdr.ether_vtag.  M_VLANTAG is cleared on success.
 *
 * Returns the (possibly different) mbuf, or NULL if M_PREPEND() or
 * m_pullup() left us without one.
 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *hdr;
	char *front;

	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
	if (__predict_false(m == NULL))
		return NULL;

	/* the whole tagged header must be contiguous to be edited */
	if (m->m_len < sizeof(*hdr)) {
		m = m_pullup(m, sizeof(*hdr));
		if (__predict_false(m == NULL))
			return NULL;
	}

	hdr = mtod(m, struct ether_vlan_header *);
	front = (char *)hdr;
	/* move dst/src addresses to the new start of the frame */
	bcopy(front + ETHER_VLAN_ENCAP_LEN, front,
	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
	hdr->evl_encap_proto = htons(ETHERTYPE_VLAN);
	hdr->evl_tag = htons(m->m_pkthdr.ether_vtag);

	/* the tag now lives in the frame itself */
	m->m_flags &= ~M_VLANTAG;
	return m;
}
#endif /* MXGE_NEW_VLAN_API */
2074
2075 static void
2076 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2077 {
2078         struct mxge_pkt_info pi = {0,0,0,0};
2079         mxge_softc_t *sc;
2080         mcp_kreq_ether_send_t *req;
2081         bus_dma_segment_t *seg;
2082         struct mbuf *m_tmp;
2083         struct ifnet *ifp;
2084         mxge_tx_ring_t *tx;
2085         int cnt, cum_len, err, i, idx, odd_flag;
2086         uint16_t pseudo_hdr_offset;
2087         uint8_t flags, cksum_offset;
2088
2089
2090         sc = ss->sc;
2091         ifp = sc->ifp;
2092         tx = &ss->tx;
2093
2094 #ifdef MXGE_NEW_VLAN_API
2095         if (m->m_flags & M_VLANTAG) {
2096                 m = mxge_vlan_tag_insert(m);
2097                 if (__predict_false(m == NULL))
2098                         goto drop_without_m;
2099         }
2100 #endif
2101         if (m->m_pkthdr.csum_flags &
2102             (CSUM_TSO | CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2103                 if (mxge_parse_tx(ss, m, &pi))
2104                         goto drop;
2105         }
2106
2107         /* (try to) map the frame for DMA */
2108         idx = tx->req & tx->mask;
2109         err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2110                                       m, tx->seg_list, &cnt,
2111                                       BUS_DMA_NOWAIT);
2112         if (__predict_false(err == EFBIG)) {
2113                 /* Too many segments in the chain.  Try
2114                    to defrag */
2115                 m_tmp = m_defrag(m, M_NOWAIT);
2116                 if (m_tmp == NULL) {
2117                         goto drop;
2118                 }
2119                 ss->tx.defrag++;
2120                 m = m_tmp;
2121                 err = bus_dmamap_load_mbuf_sg(tx->dmat,
2122                                               tx->info[idx].map,
2123                                               m, tx->seg_list, &cnt,
2124                                               BUS_DMA_NOWAIT);
2125         }
2126         if (__predict_false(err != 0)) {
2127                 device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2128                               " packet len = %d\n", err, m->m_pkthdr.len);
2129                 goto drop;
2130         }
2131         bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2132                         BUS_DMASYNC_PREWRITE);
2133         tx->info[idx].m = m;
2134
2135 #if IFCAP_TSO4
2136         /* TSO is different enough, we handle it in another routine */
2137         if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2138                 mxge_encap_tso(ss, m, cnt, &pi);
2139                 return;
2140         }
2141 #endif
2142
2143         req = tx->req_list;
2144         cksum_offset = 0;
2145         pseudo_hdr_offset = 0;
2146         flags = MXGEFW_FLAGS_NO_TSO;
2147
2148         /* checksum offloading? */
2149         if (m->m_pkthdr.csum_flags &
2150             (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) {
2151                 /* ensure ip header is in first mbuf, copy
2152                    it to a scratch buffer if not */
2153                 cksum_offset = pi.ip_off + pi.ip_hlen;
2154                 pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2155                 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2156                 req->cksum_offset = cksum_offset;
2157                 flags |= MXGEFW_FLAGS_CKSUM;
2158                 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2159         } else {
2160                 odd_flag = 0;
2161         }
2162         if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2163                 flags |= MXGEFW_FLAGS_SMALL;
2164
2165         /* convert segments into a request list */
2166         cum_len = 0;
2167         seg = tx->seg_list;
2168         req->flags = MXGEFW_FLAGS_FIRST;
2169         for (i = 0; i < cnt; i++) {
2170                 req->addr_low =
2171                         htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2172                 req->addr_high =
2173                         htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2174                 req->length = htobe16(seg->ds_len);
2175                 req->cksum_offset = cksum_offset;
2176                 if (cksum_offset > seg->ds_len)
2177                         cksum_offset -= seg->ds_len;
2178                 else
2179                         cksum_offset = 0;
2180                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2181                 req->pad = 0; /* complete solid 16-byte block */
2182                 req->rdma_count = 1;
2183                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2184                 cum_len += seg->ds_len;
2185                 seg++;
2186                 req++;
2187                 req->flags = 0;
2188         }
2189         req--;
2190         /* pad runts to 60 bytes */
2191         if (cum_len < 60) {
2192                 req++;
2193                 req->addr_low =
2194                         htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2195                 req->addr_high =
2196                         htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2197                 req->length = htobe16(60 - cum_len);
2198                 req->cksum_offset = 0;
2199                 req->pseudo_hdr_offset = pseudo_hdr_offset;
2200                 req->pad = 0; /* complete solid 16-byte block */
2201                 req->rdma_count = 1;
2202                 req->flags |= flags | ((cum_len & 1) * odd_flag);
2203                 cnt++;
2204         }
2205
2206         tx->req_list[0].rdma_count = cnt;
2207 #if 0
2208         /* print what the firmware will see */
2209         for (i = 0; i < cnt; i++) {
2210                 printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2211                     "cso:%d, flags:0x%x, rdma:%d\n",
2212                     i, (int)ntohl(tx->req_list[i].addr_high),
2213                     (int)ntohl(tx->req_list[i].addr_low),
2214                     (int)ntohs(tx->req_list[i].length),
2215                     (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2216                     tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2217                     tx->req_list[i].rdma_count);
2218         }
2219         printf("--------------\n");
2220 #endif
2221         tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2222         mxge_submit_req(tx, tx->req_list, cnt);
2223 #ifdef IFNET_BUF_RING
2224         if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2225                 /* tell the NIC to start polling this slice */
2226                 *tx->send_go = 1;
2227                 tx->queue_active = 1;
2228                 tx->activate++;
2229                 wmb();
2230         }
2231 #endif
2232         return;
2233
2234 drop:
2235         m_freem(m);
2236 drop_without_m:
2237         ss->oerrors++;
2238         return;
2239 }
2240
2241 #ifdef IFNET_BUF_RING
2242 static void
2243 mxge_qflush(struct ifnet *ifp)
2244 {
2245         mxge_softc_t *sc = ifp->if_softc;
2246         mxge_tx_ring_t *tx;
2247         struct mbuf *m;
2248         int slice;
2249
2250         for (slice = 0; slice < sc->num_slices; slice++) {
2251                 tx = &sc->ss[slice].tx;
2252                 mtx_lock(&tx->mtx);
2253                 while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2254                         m_freem(m);
2255                 mtx_unlock(&tx->mtx);
2256         }
2257         if_qflush(ifp);
2258 }
2259
2260 static inline void
2261 mxge_start_locked(struct mxge_slice_state *ss)
2262 {
2263         mxge_softc_t *sc;
2264         struct mbuf *m;
2265         struct ifnet *ifp;
2266         mxge_tx_ring_t *tx;
2267
2268         sc = ss->sc;
2269         ifp = sc->ifp;
2270         tx = &ss->tx;
2271
2272         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2273                 m = drbr_dequeue(ifp, tx->br);
2274                 if (m == NULL) {
2275                         return;
2276                 }
2277                 /* let BPF see it */
2278                 BPF_MTAP(ifp, m);
2279
2280                 /* give it to the nic */
2281                 mxge_encap(ss, m);
2282         }
2283         /* ran out of transmit slots */
2284         if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2285             && (!drbr_empty(ifp, tx->br))) {
2286                 ss->if_drv_flags |= IFF_DRV_OACTIVE;
2287                 tx->stall++;
2288         }
2289 }
2290
2291 static int
2292 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2293 {
2294         mxge_softc_t *sc;
2295         struct ifnet *ifp;
2296         mxge_tx_ring_t *tx;
2297         int err;
2298
2299         sc = ss->sc;
2300         ifp = sc->ifp;
2301         tx = &ss->tx;
2302
2303         if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2304             IFF_DRV_RUNNING) {
2305                 err = drbr_enqueue(ifp, tx->br, m);
2306                 return (err);
2307         }
2308
2309         if (!drbr_needs_enqueue(ifp, tx->br) &&
2310             ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2311                 /* let BPF see it */
2312                 BPF_MTAP(ifp, m);
2313                 /* give it to the nic */
2314                 mxge_encap(ss, m);
2315         } else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2316                 return (err);
2317         }
2318         if (!drbr_empty(ifp, tx->br))
2319                 mxge_start_locked(ss);
2320         return (0);
2321 }
2322
2323 static int
2324 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2325 {
2326         mxge_softc_t *sc = ifp->if_softc;
2327         struct mxge_slice_state *ss;
2328         mxge_tx_ring_t *tx;
2329         int err = 0;
2330         int slice;
2331
2332         slice = m->m_pkthdr.flowid;
2333         slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2334
2335         ss = &sc->ss[slice];
2336         tx = &ss->tx;
2337
2338         if (mtx_trylock(&tx->mtx)) {
2339                 err = mxge_transmit_locked(ss, m);
2340                 mtx_unlock(&tx->mtx);
2341         } else {
2342                 err = drbr_enqueue(ifp, tx->br, m);
2343         }
2344
2345         return (err);
2346 }
2347
2348 #else
2349
2350 static inline void
2351 mxge_start_locked(struct mxge_slice_state *ss)
2352 {
2353         mxge_softc_t *sc;
2354         struct mbuf *m;
2355         struct ifnet *ifp;
2356         mxge_tx_ring_t *tx;
2357
2358         sc = ss->sc;
2359         ifp = sc->ifp;
2360         tx = &ss->tx;
2361         while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2362                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2363                 if (m == NULL) {
2364                         return;
2365                 }
2366                 /* let BPF see it */
2367                 BPF_MTAP(ifp, m);
2368
2369                 /* give it to the nic */
2370                 mxge_encap(ss, m);
2371         }
2372         /* ran out of transmit slots */
2373         if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2374                 sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2375                 tx->stall++;
2376         }
2377 }
2378 #endif
2379 static void
2380 mxge_start(struct ifnet *ifp)
2381 {
2382         mxge_softc_t *sc = ifp->if_softc;
2383         struct mxge_slice_state *ss;
2384
2385         /* only use the first slice for now */
2386         ss = &sc->ss[0];
2387         mtx_lock(&ss->tx.mtx);
2388         mxge_start_locked(ss);
2389         mtx_unlock(&ss->tx.mtx);                
2390 }
2391
2392 /*
2393  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2394  * at most 32 bytes at a time, so as to avoid involving the software
2395  * pio handler in the nic.   We re-write the first segment's low
2396  * DMA address to mark it valid only after we write the entire chunk
2397  * in a burst
2398  */
2399 static inline void
2400 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2401                 mcp_kreq_ether_recv_t *src)
2402 {
2403         uint32_t low;
2404
2405         low = src->addr_low;
2406         src->addr_low = 0xffffffff;
2407         mxge_pio_copy(dst, src, 4 * sizeof (*src));
2408         wmb();
2409         mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2410         wmb();
2411         src->addr_low = low;
2412         dst->addr_low = low;
2413         wmb();
2414 }
2415
2416 static int
2417 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2418 {
2419         bus_dma_segment_t seg;
2420         struct mbuf *m;
2421         mxge_rx_ring_t *rx = &ss->rx_small;
2422         int cnt, err;
2423
2424         m = m_gethdr(M_NOWAIT, MT_DATA);
2425         if (m == NULL) {
2426                 rx->alloc_fail++;
2427                 err = ENOBUFS;
2428                 goto done;
2429         }
2430         m->m_len = MHLEN;
2431         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2432                                       &seg, &cnt, BUS_DMA_NOWAIT);
2433         if (err != 0) {
2434                 m_free(m);
2435                 goto done;
2436         }
2437         rx->info[idx].m = m;
2438         rx->shadow[idx].addr_low =
2439                 htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2440         rx->shadow[idx].addr_high =
2441                 htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2442
2443 done:
2444         if ((idx & 7) == 7)
2445                 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2446         return err;
2447 }
2448
2449 static int
2450 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2451 {
2452         bus_dma_segment_t seg[3];
2453         struct mbuf *m;
2454         mxge_rx_ring_t *rx = &ss->rx_big;
2455         int cnt, err, i;
2456
2457         m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2458         if (m == NULL) {
2459                 rx->alloc_fail++;
2460                 err = ENOBUFS;
2461                 goto done;
2462         }
2463         m->m_len = rx->mlen;
2464         err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2465                                       seg, &cnt, BUS_DMA_NOWAIT);
2466         if (err != 0) {
2467                 m_free(m);
2468                 goto done;
2469         }
2470         rx->info[idx].m = m;
2471         rx->shadow[idx].addr_low =
2472                 htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2473         rx->shadow[idx].addr_high =
2474                 htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2475
2476 #if MXGE_VIRT_JUMBOS
2477         for (i = 1; i < cnt; i++) {
2478                 rx->shadow[idx + i].addr_low =
2479                         htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2480                 rx->shadow[idx + i].addr_high =
2481                         htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2482        }
2483 #endif
2484
2485 done:
2486        for (i = 0; i < rx->nbufs; i++) {
2487                 if ((idx & 7) == 7) {
2488                         mxge_submit_8rx(&rx->lanai[idx - 7],
2489                                         &rx->shadow[idx - 7]);
2490                 }
2491                 idx++;
2492         }
2493         return err;
2494 }
2495
2496 #ifdef INET6
2497
/*
 * Compute a folded 16-bit ones' complement sum over len bytes at raw.
 *
 * The data is summed as host-order 16-bit words and the carries are
 * folded back in twice, so the result fits in 16 bits.  The sum is not
 * inverted.  A len of zero (or less) yields 0.
 *
 * An odd len is handled by summing the final byte as if it were
 * followed by a zero byte; only len bytes are ever read.
 */
static uint16_t
mxge_csum_generic(uint16_t *raw, int len)
{
	uint32_t csum;

	csum = 0;
	while (len > 1) {
		csum += *raw;
		raw++;
		len -= 2;
	}
	if (len == 1) {
		uint16_t tail = 0;

		/*
		 * Place the last byte in the first memory position of a
		 * zeroed word so it lines up with the words summed above
		 * on either byte order.
		 */
		*(uint8_t *)&tail = *(const uint8_t *)raw;
		csum += tail;
	}
	csum = (csum >> 16) + (csum & 0xffff);
	csum = (csum >> 16) + (csum & 0xffff);
	return (uint16_t)csum;
}
2514
2515 static inline uint16_t
2516 mxge_rx_csum6(void *p, struct mbuf *m, uint32_t csum)
2517 {
2518         uint32_t partial;
2519         int nxt, cksum_offset;
2520         struct ip6_hdr *ip6 = p;
2521         uint16_t c;
2522
2523         nxt = ip6->ip6_nxt;
2524         cksum_offset = sizeof (*ip6) + ETHER_HDR_LEN;
2525         if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP) {
2526                 cksum_offset = ip6_lasthdr(m, ETHER_HDR_LEN,
2527                                            IPPROTO_IPV6, &nxt);
2528                 if (nxt != IPPROTO_TCP && nxt != IPPROTO_UDP)
2529                         return (1);
2530         }
2531
2532         /*
2533          * IPv6 headers do not contain a checksum, and hence
2534          * do not checksum to zero, so they don't "fall out"
2535          * of the partial checksum calculation like IPv4
2536          * headers do.  We need to fix the partial checksum by
2537          * subtracting the checksum of the IPv6 header.
2538          */
2539
2540         partial = mxge_csum_generic((uint16_t *)ip6, cksum_offset -
2541                                     ETHER_HDR_LEN);
2542         csum += ~partial;
2543         csum +=  (csum < ~partial);
2544         csum = (csum >> 16) + (csum & 0xFFFF);
2545         csum = (csum >> 16) + (csum & 0xFFFF);
2546         c = in6_cksum_pseudo(ip6, m->m_pkthdr.len - cksum_offset, nxt,
2547                              csum);
2548         c ^= 0xffff;
2549         return (c);
2550 }
2551 #endif /* INET6 */
2552 /*
2553  *  Myri10GE hardware checksums are not valid if the sender
2554  *  padded the frame with non-zero padding.  This is because
2555  *  the firmware just does a simple 16-bit 1s complement
2556  *  checksum across the entire frame, excluding the first 14
2557  *  bytes.  It is best to simply to check the checksum and
2558  *  tell the stack about it only if the checksum is good
2559  */
2560
2561 static inline uint16_t
2562 mxge_rx_csum(struct mbuf *m, int csum)
2563 {
2564         struct ether_header *eh;
2565 #ifdef INET
2566         struct ip *ip;
2567 #endif
2568 #if defined(INET) || defined(INET6)
2569         int cap = m->m_pkthdr.rcvif->if_capenable;
2570 #endif
2571         uint16_t c, etype;
2572
2573
2574         eh = mtod(m, struct ether_header *);
2575         etype = ntohs(eh->ether_type);
2576         switch (etype) {
2577 #ifdef INET
2578         case ETHERTYPE_IP:
2579                 if ((cap & IFCAP_RXCSUM) == 0)
2580                         return (1);
2581                 ip = (struct ip *)(eh + 1);
2582                 if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP)
2583                         return (1);
2584                 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2585                               htonl(ntohs(csum) + ntohs(ip->ip_len) -
2586                                     (ip->ip_hl << 2) + ip->ip_p));
2587                 c ^= 0xffff;
2588                 break;
2589 #endif
2590 #ifdef INET6
2591         case ETHERTYPE_IPV6:
2592                 if ((cap & IFCAP_RXCSUM_IPV6) == 0)
2593                         return (1);
2594                 c = mxge_rx_csum6((eh + 1), m, csum);
2595                 break;
2596 #endif
2597         default:
2598                 c = 1;
2599         }
2600         return (c);
2601 }
2602
2603 static void
2604 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2605 {
2606         struct ether_vlan_header *evl;
2607         struct ether_header *eh;
2608         uint32_t partial;
2609
2610         evl = mtod(m, struct ether_vlan_header *);
2611         eh = mtod(m, struct ether_header *);
2612
2613         /*
2614          * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2615          * after what the firmware thought was the end of the ethernet
2616          * header.
2617          */
2618
2619         /* put checksum into host byte order */
2620         *csum = ntohs(*csum);
2621         partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2622         (*csum) += ~partial;
2623         (*csum) +=  ((*csum) < ~partial);
2624         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2625         (*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2626
2627         /* restore checksum to network byte order;
2628            later consumers expect this */
2629         *csum = htons(*csum);
2630
2631         /* save the tag */
2632 #ifdef MXGE_NEW_VLAN_API        
2633         m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2634 #else
2635         {
2636                 struct m_tag *mtag;
2637                 mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2638                                    M_NOWAIT);
2639                 if (mtag == NULL)
2640                         return;
2641                 VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2642                 m_tag_prepend(m, mtag);
2643         }
2644
2645 #endif
2646         m->m_flags |= M_VLANTAG;
2647
2648         /*
2649          * Remove the 802.1q header by copying the Ethernet
2650          * addresses over it and adjusting the beginning of
2651          * the data in the mbuf.  The encapsulated Ethernet
2652          * type field is already in place.
2653          */
2654         bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2655               ETHER_HDR_LEN - ETHER_TYPE_LEN);
2656         m_adj(m, ETHER_VLAN_ENCAP_LEN);
2657 }
2658
2659
2660 static inline void
2661 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len,
2662                  uint32_t csum, int lro)
2663 {
2664         mxge_softc_t *sc;
2665         struct ifnet *ifp;
2666         struct mbuf *m;
2667         struct ether_header *eh;
2668         mxge_rx_ring_t *rx;
2669         bus_dmamap_t old_map;
2670         int idx;
2671
2672         sc = ss->sc;
2673         ifp = sc->ifp;
2674         rx = &ss->rx_big;
2675         idx = rx->cnt & rx->mask;
2676         rx->cnt += rx->nbufs;
2677         /* save a pointer to the received mbuf */
2678         m = rx->info[idx].m;
2679         /* try to replace the received mbuf */
2680         if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2681                 /* drop the frame -- the old mbuf is re-cycled */
2682                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2683                 return;
2684         }
2685
2686         /* unmap the received buffer */
2687         old_map = rx->info[idx].map;
2688         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2689         bus_dmamap_unload(rx->dmat, old_map);
2690
2691         /* swap the bus_dmamap_t's */
2692         rx->info[idx].map = rx->extra_map;
2693         rx->extra_map = old_map;
2694
2695         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2696          * aligned */
2697         m->m_data += MXGEFW_PAD;
2698
2699         m->m_pkthdr.rcvif = ifp;
2700         m->m_len = m->m_pkthdr.len = len;
2701         ss->ipackets++;
2702         eh = mtod(m, struct ether_header *);
2703         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2704                 mxge_vlan_tag_remove(m, &csum);
2705         }
2706         /* flowid only valid if RSS hashing is enabled */
2707         if (sc->num_slices > 1) {
2708                 m->m_pkthdr.flowid = (ss - sc->ss);
2709                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2710         }
2711         /* if the checksum is valid, mark it in the mbuf header */
2712         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2713             (0 == mxge_rx_csum(m, csum))) {
2714                 /* Tell the stack that the  checksum is good */
2715                 m->m_pkthdr.csum_data = 0xffff;
2716                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2717                         CSUM_DATA_VALID;
2718
2719 #if defined(INET) || defined (INET6)
2720                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, 0)))
2721                         return;
2722 #endif
2723         }
2724         /* pass the frame up the stack */
2725         (*ifp->if_input)(ifp, m);
2726 }
2727
2728 static inline void
2729 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len,
2730                    uint32_t csum, int lro)
2731 {
2732         mxge_softc_t *sc;
2733         struct ifnet *ifp;
2734         struct ether_header *eh;
2735         struct mbuf *m;
2736         mxge_rx_ring_t *rx;
2737         bus_dmamap_t old_map;
2738         int idx;
2739
2740         sc = ss->sc;
2741         ifp = sc->ifp;
2742         rx = &ss->rx_small;
2743         idx = rx->cnt & rx->mask;
2744         rx->cnt++;
2745         /* save a pointer to the received mbuf */
2746         m = rx->info[idx].m;
2747         /* try to replace the received mbuf */
2748         if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2749                 /* drop the frame -- the old mbuf is re-cycled */
2750                 if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
2751                 return;
2752         }
2753
2754         /* unmap the received buffer */
2755         old_map = rx->info[idx].map;
2756         bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2757         bus_dmamap_unload(rx->dmat, old_map);
2758
2759         /* swap the bus_dmamap_t's */
2760         rx->info[idx].map = rx->extra_map;
2761         rx->extra_map = old_map;
2762
2763         /* mcp implicitly skips 1st 2 bytes so that packet is properly
2764          * aligned */
2765         m->m_data += MXGEFW_PAD;
2766
2767         m->m_pkthdr.rcvif = ifp;
2768         m->m_len = m->m_pkthdr.len = len;
2769         ss->ipackets++;
2770         eh = mtod(m, struct ether_header *);
2771         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2772                 mxge_vlan_tag_remove(m, &csum);
2773         }
2774         /* flowid only valid if RSS hashing is enabled */
2775         if (sc->num_slices > 1) {
2776                 m->m_pkthdr.flowid = (ss - sc->ss);
2777                 M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2778         }
2779         /* if the checksum is valid, mark it in the mbuf header */
2780         if ((ifp->if_capenable & (IFCAP_RXCSUM_IPV6 | IFCAP_RXCSUM)) &&
2781             (0 == mxge_rx_csum(m, csum))) {
2782                 /* Tell the stack that the  checksum is good */
2783                 m->m_pkthdr.csum_data = 0xffff;
2784                 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2785                         CSUM_DATA_VALID;
2786
2787 #if defined(INET) || defined (INET6)
2788                 if (lro && (0 == tcp_lro_rx(&ss->lc, m, csum)))
2789                         return;
2790 #endif
2791         }
2792         /* pass the frame up the stack */
2793         (*ifp->if_input)(ifp, m);
2794 }
2795
2796 static inline void
2797 mxge_clean_rx_done(struct mxge_slice_state *ss)
2798 {
2799         mxge_rx_done_t *rx_done = &ss->rx_done;
2800         int limit = 0;
2801         uint16_t length;
2802         uint16_t checksum;
2803         int lro;
2804
2805         lro = ss->sc->ifp->if_capenable & IFCAP_LRO;
2806         while (rx_done->entry[rx_done->idx].length != 0) {
2807                 length = ntohs(rx_done->entry[rx_done->idx].length);
2808                 rx_done->entry[rx_done->idx].length = 0;
2809                 checksum = rx_done->entry[rx_done->idx].checksum;
2810                 if (length <= (MHLEN - MXGEFW_PAD))
2811                         mxge_rx_done_small(ss, length, checksum, lro);
2812                 else
2813                         mxge_rx_done_big(ss, length, checksum, lro);
2814                 rx_done->cnt++;
2815                 rx_done->idx = rx_done->cnt & rx_done->mask;
2816
2817                 /* limit potential for livelock */
2818                 if (__predict_false(++limit > rx_done->mask / 2))
2819                         break;
2820         }
2821 #if defined(INET)  || defined (INET6)
2822         tcp_lro_flush_all(&ss->lc);
2823 #endif
2824 }
2825
2826
2827 static inline void
2828 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2829 {
2830         struct ifnet *ifp;
2831         mxge_tx_ring_t *tx;
2832         struct mbuf *m;
2833         bus_dmamap_t map;
2834         int idx;
2835         int *flags;
2836
2837         tx = &ss->tx;
2838         ifp = ss->sc->ifp;
2839         while (tx->pkt_done != mcp_idx) {
2840                 idx = tx->done & tx->mask;
2841                 tx->done++;
2842                 m = tx->info[idx].m;
2843                 /* mbuf and DMA map only attached to the first
2844                    segment per-mbuf */
2845                 if (m != NULL) {
2846                         ss->obytes += m->m_pkthdr.len;
2847                         if (m->m_flags & M_MCAST)
2848                                 ss->omcasts++;
2849                         ss->opackets++;
2850                         tx->info[idx].m = NULL;
2851                         map = tx->info[idx].map;
2852                         bus_dmamap_unload(tx->dmat, map);
2853                         m_freem(m);
2854                 }
2855                 if (tx->info[idx].flag) {
2856                         tx->info[idx].flag = 0;
2857                         tx->pkt_done++;
2858                 }
2859         }
2860         
2861         /* If we have space, clear IFF_OACTIVE to tell the stack that
2862            its OK to send packets */
2863 #ifdef IFNET_BUF_RING
2864         flags = &ss->if_drv_flags;
2865 #else
2866         flags = &ifp->if_drv_flags;
2867 #endif
2868         mtx_lock(&ss->tx.mtx);
2869         if ((*flags) & IFF_DRV_OACTIVE &&
2870             tx->req - tx->done < (tx->mask + 1)/4) {
2871                 *(flags) &= ~IFF_DRV_OACTIVE;
2872                 ss->tx.wake++;
2873                 mxge_start_locked(ss);
2874         }
2875 #ifdef IFNET_BUF_RING
2876         if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2877                 /* let the NIC stop polling this queue, since there
2878                  * are no more transmits pending */
2879                 if (tx->req == tx->done) {
2880                         *tx->send_stop = 1;
2881                         tx->queue_active = 0;
2882                         tx->deactivate++;
2883                         wmb();
2884                 }
2885         }
2886 #endif
2887         mtx_unlock(&ss->tx.mtx);
2888
2889 }
2890
2891 static struct mxge_media_type mxge_xfp_media_types[] =
2892 {
2893         {IFM_10G_CX4,   0x7f,           "10GBASE-CX4 (module)"},
2894         {IFM_10G_SR,    (1 << 7),       "10GBASE-SR"},
2895         {IFM_10G_LR,    (1 << 6),       "10GBASE-LR"},
2896         {0,             (1 << 5),       "10GBASE-ER"},
2897         {IFM_10G_LRM,   (1 << 4),       "10GBASE-LRM"},
2898         {0,             (1 << 3),       "10GBASE-SW"},
2899         {0,             (1 << 2),       "10GBASE-LW"},
2900         {0,             (1 << 1),       "10GBASE-EW"},
2901         {0,             (1 << 0),       "Reserved"}
2902 };
2903 static struct mxge_media_type mxge_sfp_media_types[] =
2904 {
2905         {IFM_10G_TWINAX,      0,        "10GBASE-Twinax"},
2906         {0,             (1 << 7),       "Reserved"},
2907         {IFM_10G_LRM,   (1 << 6),       "10GBASE-LRM"},
2908         {IFM_10G_LR,    (1 << 5),       "10GBASE-LR"},
2909         {IFM_10G_SR,    (1 << 4),       "10GBASE-SR"},
2910         {IFM_10G_TWINAX,(1 << 0),       "10GBASE-Twinax"}
2911 };
2912
2913 static void
2914 mxge_media_set(mxge_softc_t *sc, int media_type)
2915 {
2916
2917         
2918         ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2919                     0, NULL);
2920         ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2921         sc->current_media = media_type;
2922         sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2923 }
2924
2925 static void
2926 mxge_media_init(mxge_softc_t *sc)
2927 {
2928         char *ptr;
2929         int i;
2930
2931         ifmedia_removeall(&sc->media);
2932         mxge_media_set(sc, IFM_AUTO);
2933
2934         /*
2935          * parse the product code to deterimine the interface type
2936          * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2937          * after the 3rd dash in the driver's cached copy of the
2938          * EEPROM's product code string.
2939          */
2940         ptr = sc->product_code_string;
2941         if (ptr == NULL) {
2942                 device_printf(sc->dev, "Missing product code\n");
2943                 return;
2944         }
2945
2946         for (i = 0; i < 3; i++, ptr++) {
2947                 ptr = strchr(ptr, '-');
2948                 if (ptr == NULL) {
2949                         device_printf(sc->dev,
2950                                       "only %d dashes in PC?!?\n", i);
2951                         return;
2952                 }
2953         }
2954         if (*ptr == 'C' || *(ptr +1) == 'C') {
2955                 /* -C is CX4 */
2956                 sc->connector = MXGE_CX4;
2957                 mxge_media_set(sc, IFM_10G_CX4);
2958         } else if (*ptr == 'Q') {
2959                 /* -Q is Quad Ribbon Fiber */
2960                 sc->connector = MXGE_QRF;
2961                 device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2962                 /* FreeBSD has no media type for Quad ribbon fiber */
2963         } else if (*ptr == 'R') {
2964                 /* -R is XFP */
2965                 sc->connector = MXGE_XFP;
2966         } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2967                 /* -S or -2S is SFP+ */
2968                 sc->connector = MXGE_SFP;
2969         } else {
2970                 device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2971         }
2972 }
2973
2974 /*
2975  * Determine the media type for a NIC.  Some XFPs will identify
2976  * themselves only when their link is up, so this is initiated via a
2977  * link up interrupt.  However, this can potentially take up to
2978  * several milliseconds, so it is run via the watchdog routine, rather
2979  * than in the interrupt handler itself.
2980  */
2981 static void
2982 mxge_media_probe(mxge_softc_t *sc)
2983 {
2984         mxge_cmd_t cmd;
2985         char *cage_type;
2986
2987         struct mxge_media_type *mxge_media_types = NULL;
2988         int i, err, ms, mxge_media_type_entries;
2989         uint32_t byte;
2990
2991         sc->need_media_probe = 0;
2992
2993         if (sc->connector == MXGE_XFP) {
2994                 /* -R is XFP */
2995                 mxge_media_types = mxge_xfp_media_types;
2996                 mxge_media_type_entries =
2997                         nitems(mxge_xfp_media_types);
2998                 byte = MXGE_XFP_COMPLIANCE_BYTE;
2999                 cage_type = "XFP";
3000         } else  if (sc->connector == MXGE_SFP) {
3001                 /* -S or -2S is SFP+ */
3002                 mxge_media_types = mxge_sfp_media_types;
3003                 mxge_media_type_entries =
3004                         nitems(mxge_sfp_media_types);
3005                 cage_type = "SFP+";
3006                 byte = 3;
3007         } else {
3008                 /* nothing to do; media type cannot change */
3009                 return;
3010         }
3011
3012         /*
3013          * At this point we know the NIC has an XFP cage, so now we
3014          * try to determine what is in the cage by using the
3015          * firmware's XFP I2C commands to read the XFP 10GbE compilance
3016          * register.  We read just one byte, which may take over
3017          * a millisecond
3018          */
3019
3020         cmd.data0 = 0;   /* just fetch 1 byte, not all 256 */
3021         cmd.data1 = byte;
3022         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
3023         if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
3024                 device_printf(sc->dev, "failed to read XFP\n");
3025         }
3026         if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
3027                 device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
3028         }
3029         if (err != MXGEFW_CMD_OK) {
3030                 return;
3031         }
3032
3033         /* now we wait for the data to be cached */
3034         cmd.data0 = byte;
3035         err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3036         for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
3037                 DELAY(1000);
3038                 cmd.data0 = byte;
3039                 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
3040         }
3041         if (err != MXGEFW_CMD_OK) {
3042                 device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
3043                               cage_type, err, ms);
3044                 return;
3045         }
3046                 
3047         if (cmd.data0 == mxge_media_types[0].bitmask) {
3048                 if (mxge_verbose)
3049                         device_printf(sc->dev, "%s:%s\n", cage_type,
3050                                       mxge_media_types[0].name);
3051                 if (sc->current_media != mxge_media_types[0].flag) {
3052                         mxge_media_init(sc);
3053                         mxge_media_set(sc, mxge_media_types[0].flag);
3054                 }
3055                 return;
3056         }
3057         for (i = 1; i < mxge_media_type_entries; i++) {
3058                 if (cmd.data0 & mxge_media_types[i].bitmask) {
3059                         if (mxge_verbose)
3060                                 device_printf(sc->dev, "%s:%s\n",
3061                                               cage_type,
3062                                               mxge_media_types[i].name);
3063
3064                         if (sc->current_media != mxge_media_types[i].flag) {
3065                                 mxge_media_init(sc);
3066                                 mxge_media_set(sc, mxge_media_types[i].flag);
3067                         }
3068                         return;
3069                 }
3070         }
3071         if (mxge_verbose)
3072                 device_printf(sc->dev, "%s media 0x%x unknown\n",
3073                               cage_type, cmd.data0);
3074
3075         return;
3076 }
3077
3078 static void
3079 mxge_intr(void *arg)
3080 {
3081         struct mxge_slice_state *ss = arg;
3082         mxge_softc_t *sc = ss->sc;
3083         mcp_irq_data_t *stats = ss->fw_stats;
3084         mxge_tx_ring_t *tx = &ss->tx;
3085         mxge_rx_done_t *rx_done = &ss->rx_done;
3086         uint32_t send_done_count;
3087         uint8_t valid;
3088
3089
3090 #ifndef IFNET_BUF_RING
3091         /* an interrupt on a non-zero slice is implicitly valid
3092            since MSI-X irqs are not shared */
3093         if (ss != sc->ss) {
3094                 mxge_clean_rx_done(ss);
3095                 *ss->irq_claim = be32toh(3);
3096                 return;
3097         }
3098 #endif
3099
3100         /* make sure the DMA has finished */
3101         if (!stats->valid) {
3102                 return;
3103         }
3104         valid = stats->valid;
3105
3106         if (sc->legacy_irq) {
3107                 /* lower legacy IRQ  */
3108                 *sc->irq_deassert = 0;
3109                 if (!mxge_deassert_wait)
3110                         /* don't wait for conf. that irq is low */
3111                         stats->valid = 0;
3112         } else {
3113                 stats->valid = 0;
3114         }
3115
3116         /* loop while waiting for legacy irq deassertion */
3117         do {
3118                 /* check for transmit completes and receives */
3119                 send_done_count = be32toh(stats->send_done_count);
3120                 while ((send_done_count != tx->pkt_done) ||
3121                        (rx_done->entry[rx_done->idx].length != 0)) {
3122                         if (send_done_count != tx->pkt_done)
3123                                 mxge_tx_done(ss, (int)send_done_count);
3124                         mxge_clean_rx_done(ss);
3125                         send_done_count = be32toh(stats->send_done_count);
3126                 }
3127                 if (sc->legacy_irq && mxge_deassert_wait)
3128                         wmb();
3129         } while (*((volatile uint8_t *) &stats->valid));
3130
3131         /* fw link & error stats meaningful only on the first slice */
3132         if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3133                 if (sc->link_state != stats->link_up) {
3134                         sc->link_state = stats->link_up;
3135                         if (sc->link_state) {
3136                                 if_link_state_change(sc->ifp, LINK_STATE_UP);
3137                                 if (mxge_verbose)
3138                                         device_printf(sc->dev, "link up\n");
3139                         } else {
3140                                 if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3141                                 if (mxge_verbose)
3142                                         device_printf(sc->dev, "link down\n");
3143                         }
3144                         sc->need_media_probe = 1;
3145                 }
3146                 if (sc->rdma_tags_available !=
3147                     be32toh(stats->rdma_tags_available)) {
3148                         sc->rdma_tags_available =
3149                                 be32toh(stats->rdma_tags_available);
3150                         device_printf(sc->dev, "RDMA timed out! %d tags "
3151                                       "left\n", sc->rdma_tags_available);
3152                 }
3153
3154                 if (stats->link_down) {
3155                         sc->down_cnt += stats->link_down;
3156                         sc->link_state = 0;
3157                         if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3158                 }
3159         }
3160
3161         /* check to see if we have rx token to pass back */
3162         if (valid & 0x1)
3163             *ss->irq_claim = be32toh(3);
3164         *(ss->irq_claim + 1) = be32toh(3);
3165 }
3166
3167 static void
3168 mxge_init(void *arg)
3169 {
3170         mxge_softc_t *sc = arg;
3171         struct ifnet *ifp = sc->ifp;
3172
3173
3174         mtx_lock(&sc->driver_mtx);
3175         if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3176                 (void) mxge_open(sc);
3177         mtx_unlock(&sc->driver_mtx);
3178 }
3179
3180
3181
3182 static void
3183 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3184 {
3185         int i;
3186
3187 #if defined(INET) || defined(INET6)
3188         tcp_lro_free(&ss->lc);
3189 #endif
3190         for (i = 0; i <= ss->rx_big.mask; i++) {
3191                 if (ss->rx_big.info[i].m == NULL)
3192                         continue;
3193                 bus_dmamap_unload(ss->rx_big.dmat,
3194                                   ss->rx_big.info[i].map);
3195                 m_freem(ss->rx_big.info[i].m);
3196                 ss->rx_big.info[i].m = NULL;
3197         }
3198
3199         for (i = 0; i <= ss->rx_small.mask; i++) {
3200                 if (ss->rx_small.info[i].m == NULL)
3201                         continue;
3202                 bus_dmamap_unload(ss->rx_small.dmat,
3203                                   ss->rx_small.info[i].map);
3204                 m_freem(ss->rx_small.info[i].m);
3205                 ss->rx_small.info[i].m = NULL;
3206         }
3207
3208         /* transmit ring used only on the first slice */
3209         if (ss->tx.info == NULL)
3210                 return;
3211
3212         for (i = 0; i <= ss->tx.mask; i++) {
3213                 ss->tx.info[i].flag = 0;
3214                 if (ss->tx.info[i].m == NULL)
3215                         continue;
3216                 bus_dmamap_unload(ss->tx.dmat,
3217                                   ss->tx.info[i].map);
3218                 m_freem(ss->tx.info[i].m);
3219                 ss->tx.info[i].m = NULL;
3220         }
3221 }
3222
3223 static void
3224 mxge_free_mbufs(mxge_softc_t *sc)
3225 {
3226         int slice;
3227
3228         for (slice = 0; slice < sc->num_slices; slice++)
3229                 mxge_free_slice_mbufs(&sc->ss[slice]);
3230 }
3231
3232 static void
3233 mxge_free_slice_rings(struct mxge_slice_state *ss)
3234 {
3235         int i;
3236
3237
3238         if (ss->rx_done.entry != NULL)
3239                 mxge_dma_free(&ss->rx_done.dma);
3240         ss->rx_done.entry = NULL;
3241
3242         if (ss->tx.req_bytes != NULL)
3243                 free(ss->tx.req_bytes, M_DEVBUF);
3244         ss->tx.req_bytes = NULL;
3245
3246         if (ss->tx.seg_list != NULL)
3247                 free(ss->tx.seg_list, M_DEVBUF);
3248         ss->tx.seg_list = NULL;
3249
3250         if (ss->rx_small.shadow != NULL)
3251                 free(ss->rx_small.shadow, M_DEVBUF);
3252         ss->rx_small.shadow = NULL;
3253
3254         if (ss->rx_big.shadow != NULL)
3255                 free(ss->rx_big.shadow, M_DEVBUF);
3256         ss->rx_big.shadow = NULL;
3257
3258         if (ss->tx.info != NULL) {
3259                 if (ss->tx.dmat != NULL) {
3260                         for (i = 0; i <= ss->tx.mask; i++) {
3261                                 bus_dmamap_destroy(ss->tx.dmat,
3262                                                    ss->tx.info[i].map);
3263                         }
3264                         bus_dma_tag_destroy(ss->tx.dmat);
3265                 }
3266                 free(ss->tx.info, M_DEVBUF);
3267         }
3268         ss->tx.info = NULL;
3269
3270         if (ss->rx_small.info != NULL) {
3271                 if (ss->rx_small.dmat != NULL) {
3272                         for (i = 0; i <= ss->rx_small.mask; i++) {
3273                                 bus_dmamap_destroy(ss->rx_small.dmat,
3274                                                    ss->rx_small.info[i].map);
3275                         }
3276                         bus_dmamap_destroy(ss->rx_small.dmat,
3277                                            ss->rx_small.extra_map);
3278                         bus_dma_tag_destroy(ss->rx_small.dmat);
3279                 }
3280                 free(ss->rx_small.info, M_DEVBUF);
3281         }
3282         ss->rx_small.info = NULL;
3283
3284         if (ss->rx_big.info != NULL) {
3285                 if (ss->rx_big.dmat != NULL) {
3286                         for (i = 0; i <= ss->rx_big.mask; i++) {
3287                                 bus_dmamap_destroy(ss->rx_big.dmat,
3288                                                    ss->rx_big.info[i].map);
3289                         }
3290                         bus_dmamap_destroy(ss->rx_big.dmat,
3291                                            ss->rx_big.extra_map);
3292                         bus_dma_tag_destroy(ss->rx_big.dmat);
3293                 }
3294                 free(ss->rx_big.info, M_DEVBUF);
3295         }
3296         ss->rx_big.info = NULL;
3297 }
3298
3299 static void
3300 mxge_free_rings(mxge_softc_t *sc)
3301 {
3302         int slice;
3303
3304         for (slice = 0; slice < sc->num_slices; slice++)
3305                 mxge_free_slice_rings(&sc->ss[slice]);
3306 }
3307
3308 static int
3309 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3310                        int tx_ring_entries)
3311 {
3312         mxge_softc_t *sc = ss->sc;
3313         size_t bytes;
3314         int err, i;
3315
3316         /* allocate per-slice receive resources */
3317
3318         ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3319         ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3320
3321         /* allocate the rx shadow rings */
3322         bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3323         ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3324
3325         bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3326         ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3327
3328         /* allocate the rx host info rings */
3329         bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3330         ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3331
3332         bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3333         ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3334
3335         /* allocate the rx busdma resources */
3336         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3337                                  1,                     /* alignment */
3338                                  4096,                  /* boundary */
3339                                  BUS_SPACE_MAXADDR,     /* low */
3340                                  BUS_SPACE_MAXADDR,     /* high */
3341                                  NULL, NULL,            /* filter */
3342                                  MHLEN,                 /* maxsize */
3343                                  1,                     /* num segs */
3344                                  MHLEN,                 /* maxsegsize */
3345                                  BUS_DMA_ALLOCNOW,      /* flags */
3346                                  NULL, NULL,            /* lock */
3347                                  &ss->rx_small.dmat);   /* tag */
3348         if (err != 0) {
3349                 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3350                               err);
3351                 return err;
3352         }
3353
3354         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3355                                  1,                     /* alignment */
3356 #if MXGE_VIRT_JUMBOS
3357                                  4096,                  /* boundary */
3358 #else
3359                                  0,                     /* boundary */
3360 #endif
3361                                  BUS_SPACE_MAXADDR,     /* low */
3362                                  BUS_SPACE_MAXADDR,     /* high */
3363                                  NULL, NULL,            /* filter */
3364                                  3*4096,                /* maxsize */
3365 #if MXGE_VIRT_JUMBOS
3366                                  3,                     /* num segs */
3367                                  4096,                  /* maxsegsize*/
3368 #else
3369                                  1,                     /* num segs */
3370                                  MJUM9BYTES,            /* maxsegsize*/
3371 #endif
3372                                  BUS_DMA_ALLOCNOW,      /* flags */
3373                                  NULL, NULL,            /* lock */
3374                                  &ss->rx_big.dmat);     /* tag */
3375         if (err != 0) {
3376                 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3377                               err);
3378                 return err;
3379         }
3380         for (i = 0; i <= ss->rx_small.mask; i++) {
3381                 err = bus_dmamap_create(ss->rx_small.dmat, 0,
3382                                         &ss->rx_small.info[i].map);
3383                 if (err != 0) {
3384                         device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3385                                       err);
3386                         return err;
3387                 }
3388         }
3389         err = bus_dmamap_create(ss->rx_small.dmat, 0,
3390                                 &ss->rx_small.extra_map);
3391         if (err != 0) {
3392                 device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3393                               err);
3394                 return err;
3395         }
3396
3397         for (i = 0; i <= ss->rx_big.mask; i++) {
3398                 err = bus_dmamap_create(ss->rx_big.dmat, 0,
3399                                         &ss->rx_big.info[i].map);
3400                 if (err != 0) {
3401                         device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3402                                       err);
3403                         return err;
3404                 }
3405         }
3406         err = bus_dmamap_create(ss->rx_big.dmat, 0,
3407                                 &ss->rx_big.extra_map);
3408         if (err != 0) {
3409                 device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3410                               err);
3411                 return err;
3412         }
3413
3414         /* now allocate TX resources */
3415
3416 #ifndef IFNET_BUF_RING
3417         /* only use a single TX ring for now */
3418         if (ss != ss->sc->ss)
3419                 return 0;
3420 #endif
3421
3422         ss->tx.mask = tx_ring_entries - 1;
3423         ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3424
3425         
3426         /* allocate the tx request copy block */
3427         bytes = 8 +
3428                 sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3429         ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3430         /* ensure req_list entries are aligned to 8 bytes */
3431         ss->tx.req_list = (mcp_kreq_ether_send_t *)
3432                 ((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3433
3434         /* allocate the tx busdma segment list */
3435         bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3436         ss->tx.seg_list = (bus_dma_segment_t *)
3437                 malloc(bytes, M_DEVBUF, M_WAITOK);
3438
3439         /* allocate the tx host info ring */
3440         bytes = tx_ring_entries * sizeof (*ss->tx.info);
3441         ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3442         
3443         /* allocate the tx busdma resources */
3444         err = bus_dma_tag_create(sc->parent_dmat,       /* parent */
3445                                  1,                     /* alignment */
3446                                  sc->tx_boundary,       /* boundary */
3447                                  BUS_SPACE_MAXADDR,     /* low */
3448                                  BUS_SPACE_MAXADDR,     /* high */
3449                                  NULL, NULL,            /* filter */
3450                                  65536 + 256,           /* maxsize */
3451                                  ss->tx.max_desc - 2,   /* num segs */
3452                                  sc->tx_boundary,       /* maxsegsz */
3453                                  BUS_DMA_ALLOCNOW,      /* flags */
3454                                  NULL, NULL,            /* lock */
3455                                  &ss->tx.dmat);         /* tag */
3456         
3457         if (err != 0) {
3458                 device_printf(sc->dev, "Err %d allocating tx dmat\n",
3459                               err);
3460                 return err;
3461         }
3462
3463         /* now use these tags to setup dmamaps for each slot
3464            in the ring */
3465         for (i = 0; i <= ss->tx.mask; i++) {
3466                 err = bus_dmamap_create(ss->tx.dmat, 0,
3467                                         &ss->tx.info[i].map);
3468                 if (err != 0) {
3469                         device_printf(sc->dev, "Err %d  tx dmamap\n",
3470                                       err);
3471                         return err;
3472                 }
3473         }
3474         return 0;
3475
3476 }
3477
3478 static int
3479 mxge_alloc_rings(mxge_softc_t *sc)
3480 {
3481         mxge_cmd_t cmd;
3482         int tx_ring_size;
3483         int tx_ring_entries, rx_ring_entries;
3484         int err, slice;
3485         
3486         /* get ring sizes */
3487         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3488         tx_ring_size = cmd.data0;
3489         if (err != 0) {
3490                 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3491                 goto abort;
3492         }
3493
3494         tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3495         rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3496         IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3497         sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3498         IFQ_SET_READY(&sc->ifp->if_snd);
3499
3500         for (slice = 0; slice < sc->num_slices; slice++) {
3501                 err = mxge_alloc_slice_rings(&sc->ss[slice],
3502                                              rx_ring_entries,
3503                                              tx_ring_entries);
3504                 if (err != 0)
3505                         goto abort;
3506         }
3507         return 0;
3508
3509 abort:
3510         mxge_free_rings(sc);
3511         return err;
3512
3513 }
3514
3515
3516 static void
3517 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3518 {
3519         int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3520
3521         if (bufsize < MCLBYTES) {
3522                 /* easy, everything fits in a single buffer */
3523                 *big_buf_size = MCLBYTES;
3524                 *cl_size = MCLBYTES;
3525                 *nbufs = 1;
3526                 return;
3527         }
3528
3529         if (bufsize < MJUMPAGESIZE) {
3530                 /* still easy, everything still fits in a single buffer */
3531                 *big_buf_size = MJUMPAGESIZE;
3532                 *cl_size = MJUMPAGESIZE;
3533                 *nbufs = 1;
3534                 return;
3535         }
3536 #if MXGE_VIRT_JUMBOS
3537         /* now we need to use virtually contiguous buffers */
3538         *cl_size = MJUM9BYTES;
3539         *big_buf_size = 4096;
3540         *nbufs = mtu / 4096 + 1;
3541         /* needs to be a power of two, so round up */
3542         if (*nbufs == 3)
3543                 *nbufs = 4;
3544 #else
3545         *cl_size = MJUM9BYTES;
3546         *big_buf_size = MJUM9BYTES;
3547         *nbufs = 1;
3548 #endif
3549 }
3550
3551 static int
3552 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3553 {
3554         mxge_softc_t *sc;
3555         mxge_cmd_t cmd;
3556         bus_dmamap_t map;
3557         int err, i, slice;
3558
3559
3560         sc = ss->sc;
3561         slice = ss - sc->ss;
3562
3563 #if defined(INET) || defined(INET6)
3564         (void)tcp_lro_init(&ss->lc);
3565 #endif
3566         ss->lc.ifp = sc->ifp;
3567         
3568         /* get the lanai pointers to the send and receive rings */
3569
3570         err = 0;
3571 #ifndef IFNET_BUF_RING
3572         /* We currently only send from the first slice */
3573         if (slice == 0) {
3574 #endif
3575                 cmd.data0 = slice;
3576                 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3577                 ss->tx.lanai =
3578                         (volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3579                 ss->tx.send_go = (volatile uint32_t *)
3580                         (sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3581                 ss->tx.send_stop = (volatile uint32_t *)
3582                 (sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3583 #ifndef IFNET_BUF_RING
3584         }
3585 #endif
3586         cmd.data0 = slice;
3587         err |= mxge_send_cmd(sc,
3588                              MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3589         ss->rx_small.lanai =
3590                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3591         cmd.data0 = slice;
3592         err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3593         ss->rx_big.lanai =
3594                 (volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3595
3596         if (err != 0) {
3597                 device_printf(sc->dev,
3598                               "failed to get ring sizes or locations\n");
3599                 return EIO;
3600         }
3601
3602         /* stock receive rings */
3603         for (i = 0; i <= ss->rx_small.mask; i++) {
3604                 map = ss->rx_small.info[i].map;
3605                 err = mxge_get_buf_small(ss, map, i);
3606                 if (err) {
3607                         device_printf(sc->dev, "alloced %d/%d smalls\n",
3608                                       i, ss->rx_small.mask + 1);
3609                         return ENOMEM;
3610                 }
3611         }
3612         for (i = 0; i <= ss->rx_big.mask; i++) {
3613                 ss->rx_big.shadow[i].addr_low = 0xffffffff;
3614                 ss->rx_big.shadow[i].addr_high = 0xffffffff;
3615         }
3616         ss->rx_big.nbufs = nbufs;
3617         ss->rx_big.cl_size = cl_size;
3618         ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3619                 ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3620         for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3621                 map = ss->rx_big.info[i].map;
3622                 err = mxge_get_buf_big(ss, map, i);
3623                 if (err) {
3624                         device_printf(sc->dev, "alloced %d/%d bigs\n",
3625                                       i, ss->rx_big.mask + 1);
3626                         return ENOMEM;
3627                 }
3628         }
3629         return 0;
3630 }
3631
3632 static int
3633 mxge_open(mxge_softc_t *sc)
3634 {
3635         mxge_cmd_t cmd;
3636         int err, big_bytes, nbufs, slice, cl_size, i;
3637         bus_addr_t bus;
3638         volatile uint8_t *itable;
3639         struct mxge_slice_state *ss;
3640
3641         /* Copy the MAC address in case it was overridden */
3642         bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3643
3644         err = mxge_reset(sc, 1);
3645         if (err != 0) {
3646                 device_printf(sc->dev, "failed to reset\n");
3647                 return EIO;
3648         }
3649
3650         if (sc->num_slices > 1) {
3651                 /* setup the indirection table */
3652                 cmd.data0 = sc->num_slices;
3653                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3654                                     &cmd);
3655
3656                 err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3657                                      &cmd);
3658                 if (err != 0) {
3659                         device_printf(sc->dev,
3660                                       "failed to setup rss tables\n");
3661                         return err;
3662                 }
3663
3664                 /* just enable an identity mapping */
3665                 itable = sc->sram + cmd.data0;
3666                 for (i = 0; i < sc->num_slices; i++)
3667                         itable[i] = (uint8_t)i;
3668
3669                 cmd.data0 = 1;
3670                 cmd.data1 = mxge_rss_hash_type;
3671                 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3672                 if (err != 0) {
3673                         device_printf(sc->dev, "failed to enable slices\n");
3674                         return err;
3675                 }
3676         }
3677
3678
3679         mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3680
3681         cmd.data0 = nbufs;
3682         err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3683                             &cmd);
3684         /* error is only meaningful if we're trying to set
3685            MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3686         if (err && nbufs > 1) {
3687                 device_printf(sc->dev,
3688                               "Failed to set alway-use-n to %d\n",
3689                               nbufs);
3690                 return EIO;
3691         }
3692         /* Give the firmware the mtu and the big and small buffer
3693            sizes.  The firmware wants the big buf size to be a power
3694            of two. Luckily, FreeBSD's clusters are powers of two */
3695         cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3696         err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3697         cmd.data0 = MHLEN - MXGEFW_PAD;
3698         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3699                              &cmd);
3700         cmd.data0 = big_bytes;
3701         err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3702
3703         if (err != 0) {
3704                 device_printf(sc->dev, "failed to setup params\n");
3705                 goto abort;
3706         }
3707
3708         /* Now give him the pointer to the stats block */
3709         for (slice = 0;
3710 #ifdef IFNET_BUF_RING
3711              slice < sc->num_slices;
3712 #else
3713              slice < 1;
3714 #endif
3715              slice++) {
3716                 ss = &sc->ss[slice];
3717                 cmd.data0 =
3718                         MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3719                 cmd.data1 =
3720                         MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3721                 cmd.data2 = sizeof(struct mcp_irq_data);
3722                 cmd.data2 |= (slice << 16);
3723                 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3724         }
3725
3726         if (err != 0) {
3727                 bus = sc->ss->fw_stats_dma.bus_addr;
3728                 bus += offsetof(struct mcp_irq_data, send_done_count);
3729                 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3730                 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3731                 err = mxge_send_cmd(sc,
3732                                     MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3733                                     &cmd);
3734                 /* Firmware cannot support multicast without STATS_DMA_V2 */
3735                 sc->fw_multicast_support = 0;
3736         } else {
3737                 sc->fw_multicast_support = 1;
3738         }
3739
3740         if (err != 0) {
3741                 device_printf(sc->dev, "failed to setup params\n");
3742                 goto abort;
3743         }
3744
3745         for (slice = 0; slice < sc->num_slices; slice++) {
3746                 err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3747                 if (err != 0) {
3748                         device_printf(sc->dev, "couldn't open slice %d\n",
3749                                       slice);
3750                         goto abort;
3751                 }
3752         }
3753
3754         /* Finally, start the firmware running */
3755         err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3756         if (err) {
3757                 device_printf(sc->dev, "Couldn't bring up link\n");
3758                 goto abort;
3759         }
3760 #ifdef IFNET_BUF_RING
3761         for (slice = 0; slice < sc->num_slices; slice++) {
3762                 ss = &sc->ss[slice];
3763                 ss->if_drv_flags |= IFF_DRV_RUNNING;
3764                 ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3765         }
3766 #endif
3767         sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3768         sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3769
3770         return 0;
3771
3772
3773 abort:
3774         mxge_free_mbufs(sc);
3775
3776         return err;
3777 }
3778
3779 static int
3780 mxge_close(mxge_softc_t *sc, int down)
3781 {
3782         mxge_cmd_t cmd;
3783         int err, old_down_cnt;
3784 #ifdef IFNET_BUF_RING
3785         struct mxge_slice_state *ss;    
3786         int slice;
3787 #endif
3788
3789 #ifdef IFNET_BUF_RING
3790         for (slice = 0; slice < sc->num_slices; slice++) {
3791                 ss = &sc->ss[slice];
3792                 ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3793         }
3794 #endif
3795         sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3796         if (!down) {
3797                 old_down_cnt = sc->down_cnt;
3798                 wmb();
3799                 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3800                 if (err) {
3801                         device_printf(sc->dev,
3802                                       "Couldn't bring down link\n");
3803                 }
3804                 if (old_down_cnt == sc->down_cnt) {
3805                         /* wait for down irq */
3806                         DELAY(10 * sc->intr_coal_delay);
3807                 }
3808                 wmb();
3809                 if (old_down_cnt == sc->down_cnt) {
3810                         device_printf(sc->dev, "never got down irq\n");
3811                 }
3812         }
3813         mxge_free_mbufs(sc);
3814
3815         return 0;
3816 }
3817
3818 static void
3819 mxge_setup_cfg_space(mxge_softc_t *sc)
3820 {
3821         device_t dev = sc->dev;
3822         int reg;
3823         uint16_t lnk, pectl;
3824
3825         /* find the PCIe link width and set max read request to 4KB*/
3826         if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3827                 lnk = pci_read_config(dev, reg + 0x12, 2);
3828                 sc->link_width = (lnk >> 4) & 0x3f;
3829
3830                 if (sc->pectl == 0) {
3831                         pectl = pci_read_config(dev, reg + 0x8, 2);
3832                         pectl = (pectl & ~0x7000) | (5 << 12);
3833                         pci_write_config(dev, reg + 0x8, pectl, 2);
3834                         sc->pectl = pectl;
3835                 } else {
3836                         /* restore saved pectl after watchdog reset */
3837                         pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3838                 }
3839         }
3840
3841         /* Enable DMA and Memory space access */
3842         pci_enable_busmaster(dev);
3843 }
3844
3845 static uint32_t
3846 mxge_read_reboot(mxge_softc_t *sc)
3847 {
3848         device_t dev = sc->dev;
3849         uint32_t vs;
3850
3851         /* find the vendor specific offset */
3852         if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3853                 device_printf(sc->dev,
3854                               "could not find vendor specific offset\n");
3855                 return (uint32_t)-1;
3856         }
3857         /* enable read32 mode */
3858         pci_write_config(dev, vs + 0x10, 0x3, 1);
3859         /* tell NIC which register to read */
3860         pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3861         return (pci_read_config(dev, vs + 0x14, 4));
3862 }
3863
3864 static void
3865 mxge_watchdog_reset(mxge_softc_t *sc)
3866 {
3867         struct pci_devinfo *dinfo;
3868         struct mxge_slice_state *ss;
3869         int err, running, s, num_tx_slices = 1;
3870         uint32_t reboot;
3871         uint16_t cmd;
3872
3873         err = ENXIO;
3874
3875         device_printf(sc->dev, "Watchdog reset!\n");
3876
3877         /*
3878          * check to see if the NIC rebooted.  If it did, then all of
3879          * PCI config space has been reset, and things like the
3880          * busmaster bit will be zero.  If this is the case, then we
3881          * must restore PCI config space before the NIC can be used
3882          * again
3883          */
3884         cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3885         if (cmd == 0xffff) {
3886                 /*
3887                  * maybe the watchdog caught the NIC rebooting; wait
3888                  * up to 100ms for it to finish.  If it does not come
3889                  * back, then give up
3890                  */
3891                 DELAY(1000*100);
3892                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3893                 if (cmd == 0xffff) {
3894                         device_printf(sc->dev, "NIC disappeared!\n");
3895                 }
3896         }
3897         if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3898                 /* print the reboot status */
3899                 reboot = mxge_read_reboot(sc);
3900                 device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3901                               reboot);
3902                 running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3903                 if (running) {
3904
3905                         /*
3906                          * quiesce NIC so that TX routines will not try to
3907                          * xmit after restoration of BAR
3908                          */
3909
3910                         /* Mark the link as down */
3911                         if (sc->link_state) {
3912                                 sc->link_state = 0;
3913                                 if_link_state_change(sc->ifp,
3914                                                      LINK_STATE_DOWN);
3915                         }
3916 #ifdef IFNET_BUF_RING
3917                         num_tx_slices = sc->num_slices;
3918 #endif
3919                         /* grab all TX locks to ensure no tx  */
3920                         for (s = 0; s < num_tx_slices; s++) {
3921                                 ss = &sc->ss[s];
3922                                 mtx_lock(&ss->tx.mtx);
3923                         }
3924                         mxge_close(sc, 1);
3925                 }
3926                 /* restore PCI configuration space */
3927                 dinfo = device_get_ivars(sc->dev);
3928                 pci_cfg_restore(sc->dev, dinfo);
3929
3930                 /* and redo any changes we made to our config space */
3931                 mxge_setup_cfg_space(sc);
3932
3933                 /* reload f/w */
3934                 err = mxge_load_firmware(sc, 0);
3935                 if (err) {
3936                         device_printf(sc->dev,
3937                                       "Unable to re-load f/w\n");
3938                 }
3939                 if (running) {
3940                         if (!err)
3941                                 err = mxge_open(sc);
3942                         /* release all TX locks */
3943                         for (s = 0; s < num_tx_slices; s++) {
3944                                 ss = &sc->ss[s];
3945 #ifdef IFNET_BUF_RING
3946                                 mxge_start_locked(ss);
3947 #endif
3948                                 mtx_unlock(&ss->tx.mtx);
3949                         }
3950                 }
3951                 sc->watchdog_resets++;
3952         } else {
3953                 device_printf(sc->dev,
3954                               "NIC did not reboot, not resetting\n");
3955                 err = 0;
3956         }
3957         if (err) {
3958                 device_printf(sc->dev, "watchdog reset failed\n");
3959         } else {
3960                 if (sc->dying == 2)
3961                         sc->dying = 0;
3962                 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3963         }
3964 }
3965
3966 static void
3967 mxge_watchdog_task(void *arg, int pending)
3968 {
3969         mxge_softc_t *sc = arg;
3970
3971
3972         mtx_lock(&sc->driver_mtx);
3973         mxge_watchdog_reset(sc);
3974         mtx_unlock(&sc->driver_mtx);
3975 }
3976
3977 static void
3978 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3979 {
3980         tx = &sc->ss[slice].tx;
3981         device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3982         device_printf(sc->dev,
3983                       "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3984                       tx->req, tx->done, tx->queue_active);
3985         device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3986                               tx->activate, tx->deactivate);
3987         device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3988                       tx->pkt_done,
3989                       be32toh(sc->ss->fw_stats->send_done_count));
3990 }
3991
3992 static int
3993 mxge_watchdog(mxge_softc_t *sc)
3994 {
3995         mxge_tx_ring_t *tx;
3996         uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3997         int i, err = 0;
3998
3999         /* see if we have outstanding transmits, which
4000            have been pending for more than mxge_ticks */
4001         for (i = 0;
4002 #ifdef IFNET_BUF_RING
4003              (i < sc->num_slices) && (err == 0);
4004 #else
4005              (i < 1) && (err == 0);
4006 #endif
4007              i++) {
4008                 tx = &sc->ss[i].tx;             
4009                 if (tx->req != tx->done &&
4010                     tx->watchdog_req != tx->watchdog_done &&
4011                     tx->done == tx->watchdog_done) {
4012                         /* check for pause blocking before resetting */
4013                         if (tx->watchdog_rx_pause == rx_pause) {
4014                                 mxge_warn_stuck(sc, tx, i);
4015                                 taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4016                                 return (ENXIO);
4017                         }
4018                         else
4019                                 device_printf(sc->dev, "Flow control blocking "
4020                                               "xmits, check link partner\n");
4021                 }
4022
4023                 tx->watchdog_req = tx->req;
4024                 tx->watchdog_done = tx->done;
4025                 tx->watchdog_rx_pause = rx_pause;
4026         }
4027
4028         if (sc->need_media_probe)
4029                 mxge_media_probe(sc);
4030         return (err);
4031 }
4032
4033 static uint64_t
4034 mxge_get_counter(struct ifnet *ifp, ift_counter cnt)
4035 {
4036         struct mxge_softc *sc;
4037         uint64_t rv;
4038
4039         sc = if_getsoftc(ifp);
4040         rv = 0;
4041
4042         switch (cnt) {
4043         case IFCOUNTER_IPACKETS:
4044                 for (int s = 0; s < sc->num_slices; s++)
4045                         rv += sc->ss[s].ipackets;
4046                 return (rv);
4047         case IFCOUNTER_OPACKETS:
4048                 for (int s = 0; s < sc->num_slices; s++)
4049                         rv += sc->ss[s].opackets;
4050                 return (rv);
4051         case IFCOUNTER_OERRORS:
4052                 for (int s = 0; s < sc->num_slices; s++)
4053                         rv += sc->ss[s].oerrors;
4054                 return (rv);
4055 #ifdef IFNET_BUF_RING
4056         case IFCOUNTER_OBYTES:
4057                 for (int s = 0; s < sc->num_slices; s++)
4058                         rv += sc->ss[s].obytes;
4059                 return (rv);
4060         case IFCOUNTER_OMCASTS:
4061                 for (int s = 0; s < sc->num_slices; s++)
4062                         rv += sc->ss[s].omcasts;
4063                 return (rv);
4064         case IFCOUNTER_OQDROPS:
4065                 for (int s = 0; s < sc->num_slices; s++)
4066                         rv += sc->ss[s].tx.br->br_drops;
4067                 return (rv);
4068 #endif
4069         default:
4070                 return (if_get_counter_default(ifp, cnt));
4071         }
4072 }
4073
4074 static void
4075 mxge_tick(void *arg)
4076 {
4077         mxge_softc_t *sc = arg;
4078         u_long pkts = 0;
4079         int err = 0;
4080         int running, ticks;
4081         uint16_t cmd;
4082
4083         ticks = mxge_ticks;
4084         running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4085         if (running) {
4086                 if (!sc->watchdog_countdown) {
4087                         err = mxge_watchdog(sc);
4088                         sc->watchdog_countdown = 4;
4089                 }
4090                 sc->watchdog_countdown--;
4091         }
4092         if (pkts == 0) {
4093                 /* ensure NIC did not suffer h/w fault while idle */
4094                 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);                
4095                 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4096                         sc->dying = 2;
4097                         taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4098                         err = ENXIO;
4099                 }
4100                 /* look less often if NIC is idle */
4101                 ticks *= 4;
4102         }
4103
4104         if (err == 0)
4105                 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4106
4107 }
4108
4109 static int
4110 mxge_media_change(struct ifnet *ifp)
4111 {
4112         return EINVAL;
4113 }
4114
4115 static int
4116 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4117 {
4118         struct ifnet *ifp = sc->ifp;
4119         int real_mtu, old_mtu;
4120         int err = 0;
4121
4122
4123         real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4124         if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4125                 return EINVAL;
4126         mtx_lock(&sc->driver_mtx);
4127         old_mtu = ifp->if_mtu;
4128         ifp->if_mtu = mtu;
4129         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4130                 mxge_close(sc, 0);
4131                 err = mxge_open(sc);
4132                 if (err != 0) {
4133                         ifp->if_mtu = old_mtu;
4134                         mxge_close(sc, 0);
4135                         (void) mxge_open(sc);
4136                 }
4137         }
4138         mtx_unlock(&sc->driver_mtx);
4139         return err;
4140 }       
4141
4142 static void
4143 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4144 {
4145         mxge_softc_t *sc = ifp->if_softc;
4146         
4147
4148         if (sc == NULL)
4149                 return;
4150         ifmr->ifm_status = IFM_AVALID;
4151         ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4152         ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4153         ifmr->ifm_active |= sc->current_media;
4154 }
4155
4156 static int
4157 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4158 {
4159         mxge_softc_t *sc = ifp->if_softc;
4160         struct ifreq *ifr = (struct ifreq *)data;
4161         int err, mask;
4162
4163         err = 0;
4164         switch (command) {
4165         case SIOCSIFMTU:
4166                 err = mxge_change_mtu(sc, ifr->ifr_mtu);
4167                 break;
4168
4169         case SIOCSIFFLAGS:
4170                 mtx_lock(&sc->driver_mtx);
4171                 if (sc->dying) {
4172                         mtx_unlock(&sc->driver_mtx);
4173                         return EINVAL;
4174                 }
4175                 if (ifp->if_flags & IFF_UP) {
4176                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4177                                 err = mxge_open(sc);
4178                         } else {
4179                                 /* take care of promis can allmulti
4180                                    flag chages */
4181                                 mxge_change_promisc(sc,
4182                                                     ifp->if_flags & IFF_PROMISC);
4183                                 mxge_set_multicast_list(sc);
4184                         }
4185                 } else {
4186                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4187                                 mxge_close(sc, 0);
4188                         }
4189                 }
4190                 mtx_unlock(&sc->driver_mtx);
4191                 break;
4192
4193         case SIOCADDMULTI:
4194         case SIOCDELMULTI:
4195                 mtx_lock(&sc->driver_mtx);
4196                 mxge_set_multicast_list(sc);
4197                 mtx_unlock(&sc->driver_mtx);
4198                 break;
4199
4200         case SIOCSIFCAP:
4201                 mtx_lock(&sc->driver_mtx);
4202                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4203                 if (mask & IFCAP_TXCSUM) {
4204                         if (IFCAP_TXCSUM & ifp->if_capenable) {
4205                                 ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4206                                 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
4207                         } else {
4208                                 ifp->if_capenable |= IFCAP_TXCSUM;
4209                                 ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4210                         }
4211                 } else if (mask & IFCAP_RXCSUM) {
4212                         if (IFCAP_RXCSUM & ifp->if_capenable) {
4213                                 ifp->if_capenable &= ~IFCAP_RXCSUM;
4214                         } else {
4215                                 ifp->if_capenable |= IFCAP_RXCSUM;
4216                         }
4217                 }
4218                 if (mask & IFCAP_TSO4) {
4219                         if (IFCAP_TSO4 & ifp->if_capenable) {
4220                                 ifp->if_capenable &= ~IFCAP_TSO4;
4221                         } else if (IFCAP_TXCSUM & ifp->if_capenable) {
4222                                 ifp->if_capenable |= IFCAP_TSO4;
4223                                 ifp->if_hwassist |= CSUM_TSO;
4224                         } else {
4225                                 printf("mxge requires tx checksum offload"
4226                                        " be enabled to use TSO\n");
4227                                 err = EINVAL;
4228                         }
4229                 }
4230 #if IFCAP_TSO6
4231                 if (mask & IFCAP_TXCSUM_IPV6) {
4232                         if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4233                                 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6
4234                                                        | IFCAP_TSO6);
4235                                 ifp->if_hwassist &= ~(CSUM_TCP_IPV6
4236                                                       | CSUM_UDP);
4237                         } else {
4238                                 ifp->if_capenable |= IFCAP_TXCSUM_IPV6;
4239                                 ifp->if_hwassist |= (CSUM_TCP_IPV6
4240                                                      | CSUM_UDP_IPV6);
4241                         }
4242                 } else if (mask & IFCAP_RXCSUM_IPV6) {
4243                         if (IFCAP_RXCSUM_IPV6 & ifp->if_capenable) {
4244                                 ifp->if_capenable &= ~IFCAP_RXCSUM_IPV6;
4245                         } else {
4246                                 ifp->if_capenable |= IFCAP_RXCSUM_IPV6;
4247                         }
4248                 }
4249                 if (mask & IFCAP_TSO6) {
4250                         if (IFCAP_TSO6 & ifp->if_capenable) {
4251                                 ifp->if_capenable &= ~IFCAP_TSO6;
4252                         } else if (IFCAP_TXCSUM_IPV6 & ifp->if_capenable) {
4253                                 ifp->if_capenable |= IFCAP_TSO6;
4254                                 ifp->if_hwassist |= CSUM_TSO;
4255                         } else {
4256                                 printf("mxge requires tx checksum offload"
4257                                        " be enabled to use TSO\n");
4258                                 err = EINVAL;
4259                         }
4260                 }
4261 #endif /*IFCAP_TSO6 */
4262
4263                 if (mask & IFCAP_LRO)
4264                         ifp->if_capenable ^= IFCAP_LRO;
4265                 if (mask & IFCAP_VLAN_HWTAGGING)
4266                         ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4267                 if (mask & IFCAP_VLAN_HWTSO)
4268                         ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4269
4270                 if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4271                     !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4272                         ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4273
4274                 mtx_unlock(&sc->driver_mtx);
4275                 VLAN_CAPABILITIES(ifp);
4276
4277                 break;
4278
4279         case SIOCGIFMEDIA:
4280                 mtx_lock(&sc->driver_mtx);
4281                 mxge_media_probe(sc);
4282                 mtx_unlock(&sc->driver_mtx);
4283                 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4284                                     &sc->media, command);
4285                 break;
4286
4287         default:
4288                 err = ether_ioctl(ifp, command, data);
4289                 break;
4290         }
4291         return err;
4292 }
4293
4294 static void
4295 mxge_fetch_tunables(mxge_softc_t *sc)
4296 {
4297
4298         TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4299         TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4300                           &mxge_flow_control);
4301         TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4302                           &mxge_intr_coal_delay);       
4303         TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4304                           &mxge_nvidia_ecrc_enable);    
4305         TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4306                           &mxge_force_firmware);        
4307         TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4308                           &mxge_deassert_wait); 
4309         TUNABLE_INT_FETCH("hw.mxge.verbose",
4310                           &mxge_verbose);       
4311         TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4312         TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4313         TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4314         TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4315         TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4316         TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4317
4318         if (bootverbose)
4319                 mxge_verbose = 1;
4320         if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4321                 mxge_intr_coal_delay = 30;
4322         if (mxge_ticks == 0)
4323                 mxge_ticks = hz / 2;
4324         sc->pause = mxge_flow_control;
4325         if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4326             || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4327                 mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4328         }
4329         if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4330             mxge_initial_mtu < ETHER_MIN_LEN)
4331                 mxge_initial_mtu = ETHERMTU_JUMBO;
4332
4333         if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4334                 mxge_throttle = MXGE_MAX_THROTTLE;
4335         if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4336                 mxge_throttle = MXGE_MIN_THROTTLE;
4337         sc->throttle = mxge_throttle;
4338 }
4339
4340
4341 static void
4342 mxge_free_slices(mxge_softc_t *sc)
4343 {
4344         struct mxge_slice_state *ss;
4345         int i;
4346
4347
4348         if (sc->ss == NULL)
4349                 return;
4350
4351         for (i = 0; i < sc->num_slices; i++) {
4352                 ss = &sc->ss[i];
4353                 if (ss->fw_stats != NULL) {
4354                         mxge_dma_free(&ss->fw_stats_dma);
4355                         ss->fw_stats = NULL;
4356 #ifdef IFNET_BUF_RING
4357                         if (ss->tx.br != NULL) {
4358                                 drbr_free(ss->tx.br, M_DEVBUF);
4359                                 ss->tx.br = NULL;
4360                         }
4361 #endif
4362                         mtx_destroy(&ss->tx.mtx);
4363                 }
4364                 if (ss->rx_done.entry != NULL) {
4365                         mxge_dma_free(&ss->rx_done.dma);
4366                         ss->rx_done.entry = NULL;
4367                 }
4368         }
4369         free(sc->ss, M_DEVBUF);
4370         sc->ss = NULL;
4371 }
4372
4373 static int
4374 mxge_alloc_slices(mxge_softc_t *sc)
4375 {
4376         mxge_cmd_t cmd;
4377         struct mxge_slice_state *ss;
4378         size_t bytes;
4379         int err, i, max_intr_slots;
4380
4381         err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4382         if (err != 0) {
4383                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4384                 return err;
4385         }
4386         sc->rx_ring_size = cmd.data0;
4387         max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4388         
4389         bytes = sizeof (*sc->ss) * sc->num_slices;
4390         sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4391         if (sc->ss == NULL)
4392                 return (ENOMEM);
4393         for (i = 0; i < sc->num_slices; i++) {
4394                 ss = &sc->ss[i];
4395
4396                 ss->sc = sc;
4397
4398                 /* allocate per-slice rx interrupt queues */
4399                 
4400                 bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4401                 err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4402                 if (err != 0)
4403                         goto abort;
4404                 ss->rx_done.entry = ss->rx_done.dma.addr;
4405                 bzero(ss->rx_done.entry, bytes);
4406
4407                 /*
4408                  * allocate the per-slice firmware stats; stats
4409                  * (including tx) are used used only on the first
4410                  * slice for now
4411                  */
4412 #ifndef IFNET_BUF_RING
4413                 if (i > 0)
4414                         continue;
4415 #endif
4416
4417                 bytes = sizeof (*ss->fw_stats);
4418                 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4419                                      sizeof (*ss->fw_stats), 64);
4420                 if (err != 0)
4421                         goto abort;
4422                 ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4423                 snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4424                          "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4425                 mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4426 #ifdef IFNET_BUF_RING
4427                 ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4428                                            &ss->tx.mtx);
4429 #endif
4430         }
4431
4432         return (0);
4433
4434 abort:
4435         mxge_free_slices(sc);
4436         return (ENOMEM);
4437 }
4438
4439 static void
4440 mxge_slice_probe(mxge_softc_t *sc)
4441 {
4442         mxge_cmd_t cmd;
4443         char *old_fw;
4444         int msix_cnt, status, max_intr_slots;
4445
4446         sc->num_slices = 1;
4447         /*
4448          *  don't enable multiple slices if they are not enabled,
4449          *  or if this is not an SMP system
4450          */
4451         
4452         if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4453                 return;
4454
4455         /* see how many MSI-X interrupts are available */
4456         msix_cnt = pci_msix_count(sc->dev);
4457         if (msix_cnt < 2)
4458                 return;
4459
4460         /* now load the slice aware firmware see what it supports */
4461         old_fw = sc->fw_name;
4462         if (old_fw == mxge_fw_aligned)
4463                 sc->fw_name = mxge_fw_rss_aligned;
4464         else
4465                 sc->fw_name = mxge_fw_rss_unaligned;
4466         status = mxge_load_firmware(sc, 0);
4467         if (status != 0) {
4468                 device_printf(sc->dev, "Falling back to a single slice\n");
4469                 return;
4470         }
4471         
4472         /* try to send a reset command to the card to see if it
4473            is alive */
4474         memset(&cmd, 0, sizeof (cmd));
4475         status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4476         if (status != 0) {
4477                 device_printf(sc->dev, "failed reset\n");
4478                 goto abort_with_fw;
4479         }
4480
4481         /* get rx ring size */
4482         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4483         if (status != 0) {
4484                 device_printf(sc->dev, "Cannot determine rx ring size\n");
4485                 goto abort_with_fw;
4486         }
4487         max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4488
4489         /* tell it the size of the interrupt queues */
4490         cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4491         status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4492         if (status != 0) {
4493                 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4494                 goto abort_with_fw;
4495         }
4496
4497         /* ask the maximum number of slices it supports */
4498         status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4499         if (status != 0) {
4500                 device_printf(sc->dev,
4501                               "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4502                 goto abort_with_fw;
4503         }
4504         sc->num_slices = cmd.data0;
4505         if (sc->num_slices > msix_cnt)
4506                 sc->num_slices = msix_cnt;
4507
4508         if (mxge_max_slices == -1) {
4509                 /* cap to number of CPUs in system */
4510                 if (sc->num_slices > mp_ncpus)
4511                         sc->num_slices = mp_ncpus;
4512         } else {
4513                 if (sc->num_slices > mxge_max_slices)
4514                         sc->num_slices = mxge_max_slices;
4515         }
4516         /* make sure it is a power of two */
4517         while (sc->num_slices & (sc->num_slices - 1))
4518                 sc->num_slices--;
4519
4520         if (mxge_verbose)
4521                 device_printf(sc->dev, "using %d slices\n",
4522                               sc->num_slices);
4523         
4524         return;
4525
4526 abort_with_fw:
4527         sc->fw_name = old_fw;
4528         (void) mxge_load_firmware(sc, 0);
4529 }
4530
4531 static int
4532 mxge_add_msix_irqs(mxge_softc_t *sc)
4533 {
4534         size_t bytes;
4535         int count, err, i, rid;
4536
4537         rid = PCIR_BAR(2);
4538         sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4539                                                     &rid, RF_ACTIVE);
4540
4541         if (sc->msix_table_res == NULL) {
4542                 device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4543                 return ENXIO;
4544         }
4545
4546         count = sc->num_slices;
4547         err = pci_alloc_msix(sc->dev, &count);
4548         if (err != 0) {
4549                 device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4550                               "err = %d \n", sc->num_slices, err);
4551                 goto abort_with_msix_table;
4552         }
4553         if (count < sc->num_slices) {
4554                 device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4555                               count, sc->num_slices);
4556                 device_printf(sc->dev,
4557                               "Try setting hw.mxge.max_slices to %d\n",
4558                               count);
4559                 err = ENOSPC;
4560                 goto abort_with_msix;
4561         }
4562         bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4563         sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4564         if (sc->msix_irq_res == NULL) {
4565                 err = ENOMEM;
4566                 goto abort_with_msix;
4567         }
4568
4569         for (i = 0; i < sc->num_slices; i++) {
4570                 rid = i + 1;
4571                 sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4572                                                           SYS_RES_IRQ,
4573                                                           &rid, RF_ACTIVE);
4574                 if (sc->msix_irq_res[i] == NULL) {
4575                         device_printf(sc->dev, "couldn't allocate IRQ res"
4576                                       " for message %d\n", i);
4577                         err = ENXIO;
4578                         goto abort_with_res;
4579                 }
4580         }
4581
4582         bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4583         sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4584
4585         for (i = 0; i < sc->num_slices; i++) {
4586                 err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4587                                      INTR_TYPE_NET | INTR_MPSAFE,
4588 #if __FreeBSD_version > 700030
4589                                      NULL,
4590 #endif
4591                                      mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4592                 if (err != 0) {
4593                         device_printf(sc->dev, "couldn't setup intr for "
4594                                       "message %d\n", i);
4595                         goto abort_with_intr;
4596                 }
4597                 bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4598                                   sc->msix_ih[i], "s%d", i);
4599         }
4600
4601         if (mxge_verbose) {
4602                 device_printf(sc->dev, "using %d msix IRQs:",
4603                               sc->num_slices);
4604                 for (i = 0; i < sc->num_slices; i++)
4605                         printf(" %jd", rman_get_start(sc->msix_irq_res[i]));
4606                 printf("\n");
4607         }
4608         return (0);
4609
4610 abort_with_intr:
4611         for (i = 0; i < sc->num_slices; i++) {
4612                 if (sc->msix_ih[i] != NULL) {
4613                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4614                                           sc->msix_ih[i]);
4615                         sc->msix_ih[i] = NULL;
4616                 }
4617         }
4618         free(sc->msix_ih, M_DEVBUF);
4619
4620
4621 abort_with_res:
4622         for (i = 0; i < sc->num_slices; i++) {
4623                 rid = i + 1;
4624                 if (sc->msix_irq_res[i] != NULL)
4625                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4626                                              sc->msix_irq_res[i]);
4627                 sc->msix_irq_res[i] = NULL;
4628         }
4629         free(sc->msix_irq_res, M_DEVBUF);
4630
4631
4632 abort_with_msix:
4633         pci_release_msi(sc->dev);
4634
4635 abort_with_msix_table:
4636         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4637                              sc->msix_table_res);
4638
4639         return err;
4640 }
4641
4642 static int
4643 mxge_add_single_irq(mxge_softc_t *sc)
4644 {
4645         int count, err, rid;
4646
4647         count = pci_msi_count(sc->dev);
4648         if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4649                 rid = 1;
4650         } else {
4651                 rid = 0;
4652                 sc->legacy_irq = 1;
4653         }
4654         sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4655                                              RF_SHAREABLE | RF_ACTIVE);
4656         if (sc->irq_res == NULL) {
4657                 device_printf(sc->dev, "could not alloc interrupt\n");
4658                 return ENXIO;
4659         }
4660         if (mxge_verbose)
4661                 device_printf(sc->dev, "using %s irq %jd\n",
4662                               sc->legacy_irq ? "INTx" : "MSI",
4663                               rman_get_start(sc->irq_res));
4664         err = bus_setup_intr(sc->dev, sc->irq_res,
4665                              INTR_TYPE_NET | INTR_MPSAFE,
4666 #if __FreeBSD_version > 700030
4667                              NULL,
4668 #endif
4669                              mxge_intr, &sc->ss[0], &sc->ih);
4670         if (err != 0) {
4671                 bus_release_resource(sc->dev, SYS_RES_IRQ,
4672                                      sc->legacy_irq ? 0 : 1, sc->irq_res);
4673                 if (!sc->legacy_irq)
4674                         pci_release_msi(sc->dev);
4675         }
4676         return err;
4677 }
4678
4679 static void
4680 mxge_rem_msix_irqs(mxge_softc_t *sc)
4681 {
4682         int i, rid;
4683
4684         for (i = 0; i < sc->num_slices; i++) {
4685                 if (sc->msix_ih[i] != NULL) {
4686                         bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4687                                           sc->msix_ih[i]);
4688                         sc->msix_ih[i] = NULL;
4689                 }
4690         }
4691         free(sc->msix_ih, M_DEVBUF);
4692
4693         for (i = 0; i < sc->num_slices; i++) {
4694                 rid = i + 1;
4695                 if (sc->msix_irq_res[i] != NULL)
4696                         bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4697                                              sc->msix_irq_res[i]);
4698                 sc->msix_irq_res[i] = NULL;
4699         }
4700         free(sc->msix_irq_res, M_DEVBUF);
4701
4702         bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4703                              sc->msix_table_res);
4704
4705         pci_release_msi(sc->dev);
4706         return;
4707 }
4708
4709 static void
4710 mxge_rem_single_irq(mxge_softc_t *sc)
4711 {
4712         bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4713         bus_release_resource(sc->dev, SYS_RES_IRQ,
4714                              sc->legacy_irq ? 0 : 1, sc->irq_res);
4715         if (!sc->legacy_irq)
4716                 pci_release_msi(sc->dev);
4717 }
4718
4719 static void
4720 mxge_rem_irq(mxge_softc_t *sc)
4721 {
4722         if (sc->num_slices > 1)
4723                 mxge_rem_msix_irqs(sc);
4724         else
4725                 mxge_rem_single_irq(sc);
4726 }
4727
4728 static int
4729 mxge_add_irq(mxge_softc_t *sc)
4730 {
4731         int err;
4732
4733         if (sc->num_slices > 1)
4734                 err = mxge_add_msix_irqs(sc);
4735         else
4736                 err = mxge_add_single_irq(sc);
4737         
4738         if (0 && err == 0 && sc->num_slices > 1) {
4739                 mxge_rem_msix_irqs(sc);
4740                 err = mxge_add_msix_irqs(sc);
4741         }
4742         return err;
4743 }
4744
4745
4746 static int
4747 mxge_attach(device_t dev)
4748 {
4749         mxge_cmd_t cmd;
4750         mxge_softc_t *sc = device_get_softc(dev);
4751         struct ifnet *ifp;
4752         int err, rid;
4753
4754         sc->dev = dev;
4755         mxge_fetch_tunables(sc);
4756
4757         TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4758         sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4759                                   taskqueue_thread_enqueue, &sc->tq);
4760         if (sc->tq == NULL) {
4761                 err = ENOMEM;
4762                 goto abort_with_nothing;
4763         }
4764
4765         err = bus_dma_tag_create(bus_get_dma_tag(dev),  /* parent */
4766                                  1,                     /* alignment */
4767                                  0,                     /* boundary */
4768                                  BUS_SPACE_MAXADDR,     /* low */
4769                                  BUS_SPACE_MAXADDR,     /* high */
4770                                  NULL, NULL,            /* filter */
4771                                  65536 + 256,           /* maxsize */
4772                                  MXGE_MAX_SEND_DESC,    /* num segs */
4773                                  65536,                 /* maxsegsize */
4774                                  0,                     /* flags */
4775                                  NULL, NULL,            /* lock */
4776                                  &sc->parent_dmat);     /* tag */
4777
4778         if (err != 0) {
4779                 device_printf(sc->dev, "Err %d allocating parent dmat\n",
4780                               err);
4781                 goto abort_with_tq;
4782         }
4783
4784         ifp = sc->ifp = if_alloc(IFT_ETHER);
4785         if (ifp == NULL) {
4786                 device_printf(dev, "can not if_alloc()\n");
4787                 err = ENOSPC;
4788                 goto abort_with_parent_dmat;
4789         }
4790         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4791
4792         snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4793                  device_get_nameunit(dev));
4794         mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4795         snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4796                  "%s:drv", device_get_nameunit(dev));
4797         mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4798                  MTX_NETWORK_LOCK, MTX_DEF);
4799
4800         callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4801
4802         mxge_setup_cfg_space(sc);
4803         
4804         /* Map the board into the kernel */
4805         rid = PCIR_BARS;
4806         sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4807                                              RF_ACTIVE);
4808         if (sc->mem_res == NULL) {
4809                 device_printf(dev, "could not map memory\n");
4810                 err = ENXIO;
4811                 goto abort_with_lock;
4812         }
4813         sc->sram = rman_get_virtual(sc->mem_res);
4814         sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4815         if (sc->sram_size > rman_get_size(sc->mem_res)) {
4816                 device_printf(dev, "impossible memory region size %jd\n",
4817                               rman_get_size(sc->mem_res));
4818                 err = ENXIO;
4819                 goto abort_with_mem_res;
4820         }
4821
4822         /* make NULL terminated copy of the EEPROM strings section of
4823            lanai SRAM */
4824         bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4825         bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4826                                 rman_get_bushandle(sc->mem_res),
4827                                 sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4828                                 sc->eeprom_strings,
4829                                 MXGE_EEPROM_STRINGS_SIZE - 2);
4830         err = mxge_parse_strings(sc);
4831         if (err != 0)
4832                 goto abort_with_mem_res;
4833
4834         /* Enable write combining for efficient use of PCIe bus */
4835         mxge_enable_wc(sc);
4836
4837         /* Allocate the out of band dma memory */
4838         err = mxge_dma_alloc(sc, &sc->cmd_dma,
4839                              sizeof (mxge_cmd_t), 64);
4840         if (err != 0)
4841                 goto abort_with_mem_res;
4842         sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4843         err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4844         if (err != 0)
4845                 goto abort_with_cmd_dma;
4846
4847         err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4848         if (err != 0)
4849                 goto abort_with_zeropad_dma;
4850
4851         /* select & load the firmware */
4852         err = mxge_select_firmware(sc);
4853         if (err != 0)
4854                 goto abort_with_dmabench;
4855         sc->intr_coal_delay = mxge_intr_coal_delay;
4856
4857         mxge_slice_probe(sc);
4858         err = mxge_alloc_slices(sc);
4859         if (err != 0)
4860                 goto abort_with_dmabench;
4861
4862         err = mxge_reset(sc, 0);
4863         if (err != 0)
4864                 goto abort_with_slices;
4865
4866         err = mxge_alloc_rings(sc);
4867         if (err != 0) {
4868                 device_printf(sc->dev, "failed to allocate rings\n");
4869                 goto abort_with_slices;
4870         }
4871
4872         err = mxge_add_irq(sc);
4873         if (err != 0) {
4874                 device_printf(sc->dev, "failed to add irq\n");
4875                 goto abort_with_rings;
4876         }
4877
4878         ifp->if_baudrate = IF_Gbps(10);
4879         ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4880                 IFCAP_VLAN_MTU | IFCAP_LINKSTATE | IFCAP_TXCSUM_IPV6 |
4881                 IFCAP_RXCSUM_IPV6;
4882 #if defined(INET) || defined(INET6)
4883         ifp->if_capabilities |= IFCAP_LRO;
4884 #endif
4885
4886 #ifdef MXGE_NEW_VLAN_API
4887         ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4888
4889         /* Only FW 1.4.32 and newer can do TSO over vlans */
4890         if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4891             sc->fw_ver_tiny >= 32)
4892                 ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4893 #endif
4894         sc->max_mtu = mxge_max_mtu(sc);
4895         if (sc->max_mtu >= 9000)
4896                 ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4897         else
4898                 device_printf(dev, "MTU limited to %d.  Install "
4899                               "latest firmware for 9000 byte jumbo support\n",
4900                               sc->max_mtu - ETHER_HDR_LEN);
4901         ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4902         ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6;
4903         /* check to see if f/w supports TSO for IPv6 */
4904         if (!mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_TSO6_HDR_SIZE, &cmd)) {
4905                 if (CSUM_TCP_IPV6)
4906                         ifp->if_capabilities |= IFCAP_TSO6;
4907                 sc->max_tso6_hlen = min(cmd.data0,
4908                                         sizeof (sc->ss[0].scratch));
4909         }
4910         ifp->if_capenable = ifp->if_capabilities;
4911         if (sc->lro_cnt == 0)
4912                 ifp->if_capenable &= ~IFCAP_LRO;
4913         ifp->if_init = mxge_init;
4914         ifp->if_softc = sc;
4915         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4916         ifp->if_ioctl = mxge_ioctl;
4917         ifp->if_start = mxge_start;
4918         ifp->if_get_counter = mxge_get_counter;
4919         /* Initialise the ifmedia structure */
4920         ifmedia_init(&sc->media, 0, mxge_media_change,
4921                      mxge_media_status);
4922         mxge_media_init(sc);
4923         mxge_media_probe(sc);
4924         sc->dying = 0;
4925         ether_ifattach(ifp, sc->mac_addr);
4926         /* ether_ifattach sets mtu to ETHERMTU */
4927         if (mxge_initial_mtu != ETHERMTU)
4928                 mxge_change_mtu(sc, mxge_initial_mtu);
4929
4930         mxge_add_sysctls(sc);
4931 #ifdef IFNET_BUF_RING
4932         ifp->if_transmit = mxge_transmit;
4933         ifp->if_qflush = mxge_qflush;
4934 #endif
4935         taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4936                                 device_get_nameunit(sc->dev));
4937         callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4938         return 0;
4939
4940 abort_with_rings:
4941         mxge_free_rings(sc);
4942 abort_with_slices:
4943         mxge_free_slices(sc);
4944 abort_with_dmabench:
4945         mxge_dma_free(&sc->dmabench_dma);
4946 abort_with_zeropad_dma:
4947         mxge_dma_free(&sc->zeropad_dma);
4948 abort_with_cmd_dma:
4949         mxge_dma_free(&sc->cmd_dma);
4950 abort_with_mem_res:
4951         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4952 abort_with_lock:
4953         pci_disable_busmaster(dev);
4954         mtx_destroy(&sc->cmd_mtx);
4955         mtx_destroy(&sc->driver_mtx);
4956         if_free(ifp);
4957 abort_with_parent_dmat:
4958         bus_dma_tag_destroy(sc->parent_dmat);
4959 abort_with_tq:
4960         if (sc->tq != NULL) {
4961                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4962                 taskqueue_free(sc->tq);
4963                 sc->tq = NULL;
4964         }
4965 abort_with_nothing:
4966         return err;
4967 }
4968
4969 static int
4970 mxge_detach(device_t dev)
4971 {
4972         mxge_softc_t *sc = device_get_softc(dev);
4973
4974         if (mxge_vlans_active(sc)) {
4975                 device_printf(sc->dev,
4976                               "Detach vlans before removing module\n");
4977                 return EBUSY;
4978         }
4979         mtx_lock(&sc->driver_mtx);
4980         sc->dying = 1;
4981         if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4982                 mxge_close(sc, 0);
4983         mtx_unlock(&sc->driver_mtx);
4984         ether_ifdetach(sc->ifp);
4985         if (sc->tq != NULL) {
4986                 taskqueue_drain(sc->tq, &sc->watchdog_task);
4987                 taskqueue_free(sc->tq);
4988                 sc->tq = NULL;
4989         }
4990         callout_drain(&sc->co_hdl);
4991         ifmedia_removeall(&sc->media);
4992         mxge_dummy_rdma(sc, 0);
4993         mxge_rem_sysctls(sc);
4994         mxge_rem_irq(sc);
4995         mxge_free_rings(sc);
4996         mxge_free_slices(sc);
4997         mxge_dma_free(&sc->dmabench_dma);
4998         mxge_dma_free(&sc->zeropad_dma);
4999         mxge_dma_free(&sc->cmd_dma);
5000         bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
5001         pci_disable_busmaster(dev);
5002         mtx_destroy(&sc->cmd_mtx);
5003         mtx_destroy(&sc->driver_mtx);
5004         if_free(sc->ifp);
5005         bus_dma_tag_destroy(sc->parent_dmat);
5006         return 0;
5007 }
5008
5009 static int
5010 mxge_shutdown(device_t dev)
5011 {
5012         return 0;
5013 }
5014
5015 /*
5016   This file uses Myri10GE driver indentation.
5017
5018   Local Variables:
5019   c-file-style:"linux"
5020   tab-width:8
5021   End:
5022 */